/*
     This file is part of libextractor.
     Copyright (C) 2002, 2003, 2009, 2012 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 3, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     Boston, MA 02110-1301, USA.
 */
/**
 * @file plugins/ps_extractor.c
 * @brief plugin to support PostScript files
 * @author Christian Grothoff
 */
#include "platform.h"
#include "extractor.h"


/**
 * Maximum length of a single line in the PostScript file we're
 * willing to look at.  While the body of the file can have longer
 * lines, this should be a sane limit for the lines in the header with
 * the meta data.
 *
 * This is the number of bytes readline() requests per call; if no
 * '\n' is found among the bytes it gets back, readline() gives up and
 * returns NULL.
 */
#define MAX_LINE (1024)

/**
 * Header of a PostScript file.  The first line of the input must
 * begin with exactly these bytes, otherwise the plugin returns
 * without reporting anything.
 */
#define PS_HEADER "%!PS-Adobe"


/**
 * Pair with prefix in the PS header and corresponding LE type.
 * One entry of the lookup table used to classify header comment lines.
 */
struct Matches
{
  /**
   * PS header prefix, compared against the beginning of a line.
   * NULL marks the end of the table.
   */
  const char *prefix;

  /**
   * Corresponding LE type, i.e. the meta type under which the text
   * following the prefix is reported.
   */
  enum EXTRACTOR_MetaType type;
};


/**
 * Map of PS prefixes to LE types.
 *
 * The table is read-only and terminated by an entry whose prefix is
 * NULL.  Entries are tried in order and the first prefix that a line
 * starts with wins, so the trailing space in each prefix is part of
 * the match.
 */
static const struct Matches tests[] = {
  { "%%Title: ", EXTRACTOR_METATYPE_TITLE },
  { "% Subject: ", EXTRACTOR_METATYPE_SUBJECT },
  { "%%Author: ", EXTRACTOR_METATYPE_AUTHOR_NAME },
  { "% From: ", EXTRACTOR_METATYPE_AUTHOR_NAME },
  { "%%Version: ", EXTRACTOR_METATYPE_REVISION_NUMBER },
  { "%%Creator: ", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
  { "%%CreationDate: ", EXTRACTOR_METATYPE_CREATION_DATE },
  { "% Date: ", EXTRACTOR_METATYPE_UNKNOWN_DATE },
  { "%%Pages: ", EXTRACTOR_METATYPE_PAGE_COUNT },
  { "%%Orientation: ", EXTRACTOR_METATYPE_PAGE_ORIENTATION },
  { "%%DocumentPaperSizes: ", EXTRACTOR_METATYPE_PAPER_SIZE },
  { "%%PageOrder: ", EXTRACTOR_METATYPE_PAGE_ORDER },
  { "%%LanguageLevel: ", EXTRACTOR_METATYPE_FORMAT_VERSION },
  { "%%Magnification: ", EXTRACTOR_METATYPE_MAGNIFICATION },

  /* Also widely used but not supported since they
     probably make no sense:
     "%%BoundingBox: ",
     "%%DocumentNeededResources: ",
     "%%DocumentSuppliedResources: ",
     "%%DocumentProcSets: ",
     "%%DocumentData: ", */

  /* sentinel: end of table */
  { NULL, 0 }
};


/**
 * Read a single ('\n'-terminated) line of input.
 *
 * At most #MAX_LINE bytes are requested from the read callback.  The
 * returned string contains the bytes before the '\n'; a '\r' directly
 * in front of the '\n' is dropped as well, so that files with CRLF
 * line endings yield the same text as files with LF line endings.
 * On success the read position is moved to the first byte after the
 * '\n'.
 *
 * NOTE(review): the error checks assume that the seek callback
 * signals failure with a negative return value (lseek-style); confirm
 * against the declaration in extractor.h.
 *
 * @param ec context for IO
 * @return NULL on end-of-file (or if next line exceeds limit), and
 *         also if seeking or the allocation fails; otherwise a
 *         0-terminated copy of the line that the caller must free()
 */
static char *
readline (struct EXTRACTOR_ExtractContext *ec)
{
  int64_t pos;
  ssize_t ret;
  size_t len;
  char *res;
  void *data;
  const char *cdata;
  const char *eol;

  pos = ec->seek (ec->cls, 0, SEEK_CUR);
  if (pos < 0)
    return NULL; /* position unknown, we could not re-position afterwards */
  if (0 >= (ret = ec->read (ec->cls, &data, MAX_LINE)))
    return NULL;
  cdata = data;
  if (NULL == (eol = memchr (cdata, '\n', ret)))
    return NULL; /* no end-of-line found */
  len = (size_t) (eol - cdata);
  if (NULL == (res = malloc (len + 1)))
    return NULL;
  memcpy (res, cdata, len);
  /* strip the CR of a CRLF line ending; it is not part of the line */
  if ( (len > 0) &&
       ('\r' == res[len - 1]) )
    len--;
  res[len] = '\0';
  /* the read consumed up to MAX_LINE bytes; rewind to just behind the '\n' */
  if (0 > ec->seek (ec->cls, pos + (eol - cdata) + 1, SEEK_SET))
    {
      /* without the rewind the next call would start at a wrong offset */
      free (res);
      return NULL;
    }
  return res;
}


/**
 * Main entry method for the 'application/postscript' extraction plugin.
 *
 * @param ec extraction context provided to the plugin
 */
void
EXTRACTOR_ps_extract_method (struct EXTRACTOR_ExtractContext *ec)
{
  unsigned int i;
  char *line;
  char *next;
  char *acc;
  const char *match;

  if (NULL == (line = readline (ec)))
    return;
  if ( (strlen (line) < strlen (PS_HEADER)) ||
       (0 != memcmp (PS_HEADER,
		     line,
		     strlen (PS_HEADER))) )
    {
      free (line);
      return;
    }
  free (line);
  if (0 != ec->proc (ec->cls,
		     "ps",
		     EXTRACTOR_METATYPE_MIMETYPE,
		     EXTRACTOR_METAFORMAT_UTF8,
		     "text/plain",
		     "application/postscript",
		     strlen ("application/postscript") + 1))
    return;

  line = NULL;
  next = readline (ec);
  while ( (NULL != next) &&
	  ('%' == next[0]) )
    {
      line = next;
      next = readline (ec);
      for (i = 0; NULL != tests[i].prefix; i++)
        {
	  match = tests[i].prefix;
	  if ( (strlen (line) < strlen (match)) ||
	       (0 != strncmp (line, match, strlen (match))) )
	    continue;
	  /* %%+ continues previous meta-data type... */
	  while ( (NULL != next) &&
		  (0 == strncmp (next, "%%+", strlen ("%%+"))) )
	    {
	      if (NULL == (acc = malloc (strlen (line) + strlen (next) - 1)))
		break;
	      strcpy (acc, line);
	      strcat (acc, " ");
	      strcat (acc, next + 3);
	      free (line);
	      line = acc;
	      free (next);
	      next = readline (ec);
	    }
	  if ( (line[strlen (line) - 1] == ')') &&
	       (line[strlen (match)] == '(') )
	    {
	      acc = &line[strlen (match) + 1];
	      acc[strlen (acc) - 1] = '\0'; /* remove ")" */
	    }
	  else
	    {
	      acc = &line[strlen (match)];
	    }
	  while (isspace ((unsigned char) acc[0]))
	    acc++;
	  if ( (strlen (acc) > 0) &&
	       (0 != ec->proc (ec->cls,
			       "ps",
			       tests[i].type,
			       EXTRACTOR_METAFORMAT_UTF8,
			       "text/plain",
			       acc,
			       strlen (acc) + 1)) )
	    {
	      free (line);
	      if (NULL != next)
		free (next);
	      return;
	    }
	  break;
	}
      free (line);
    }
  if (NULL != next)
    free (next);
}

/* end of ps_extractor.c */