libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

ps_extractor.c (5929B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file plugins/ps_extractor.c
     22  * @brief plugin to support PostScript files
     23  * @author Christian Grothoff
     24  */
     25 #include "platform.h"
     26 #include "extractor.h"
     27 
     28 
     29 /**
     30  * Maximum length of a single line in the PostScript file we're
     31  * willing to look at.  While the body of the file can have longer
     32  * lines, this should be a sane limit for the lines in the header with
     33  * the meta data.
     34  */
     35 #define MAX_LINE (1024)
     36 
     37 /**
     38  * Header of a PostScript file.
     39  */
     40 #define PS_HEADER "%!PS-Adobe"
     41 
     42 
     43 /**
     44  * Pair with prefix in the PS header and corresponding LE type.
     45  */
     46 struct Matches
     47 {
     48   /**
     49    * PS header prefix.
     50    */
     51   const char *prefix;
     52 
     53   /**
     54    * Corresponding LE type.
     55    */
     56   enum EXTRACTOR_MetaType type;
     57 };
     58 
     59 
     60 /**
     61  * Map of PS prefixes to LE types.
     62  */
     63 static struct Matches tests[] = {
     64   { "%%Title: ", EXTRACTOR_METATYPE_TITLE },
     65   { "% Subject: ", EXTRACTOR_METATYPE_SUBJECT },
     66   { "%%Author: ", EXTRACTOR_METATYPE_AUTHOR_NAME },
     67   { "% From: ", EXTRACTOR_METATYPE_AUTHOR_NAME },
     68   { "%%Version: ", EXTRACTOR_METATYPE_REVISION_NUMBER },
     69   { "%%Creator: ", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     70   { "%%CreationDate: ", EXTRACTOR_METATYPE_CREATION_DATE },
     71   { "% Date: ", EXTRACTOR_METATYPE_UNKNOWN_DATE },
     72   { "%%Pages: ", EXTRACTOR_METATYPE_PAGE_COUNT },
     73   { "%%Orientation: ", EXTRACTOR_METATYPE_PAGE_ORIENTATION },
     74   { "%%DocumentPaperSizes: ", EXTRACTOR_METATYPE_PAPER_SIZE },
     75   { "%%PageOrder: ", EXTRACTOR_METATYPE_PAGE_ORDER },
     76   { "%%LanguageLevel: ", EXTRACTOR_METATYPE_FORMAT_VERSION },
     77   { "%%Magnification: ", EXTRACTOR_METATYPE_MAGNIFICATION },
     78 
     79   /* Also widely used but not supported since they
     80      probably make no sense:
     81      "%%BoundingBox: ",
     82      "%%DocumentNeededResources: ",
     83      "%%DocumentSuppliedResources: ",
     84      "%%DocumentProcSets: ",
     85      "%%DocumentData: ", */
     86 
     87   { NULL, 0 }
     88 };
     89 
     90 
     91 /**
     92  * Read a single ('\n'-terminated) line of input.
     93  *
     94  * @param ec context for IO
     95  * @return NULL on end-of-file (or if next line exceeds limit)
     96  */
     97 static char *
     98 readline (struct EXTRACTOR_ExtractContext *ec)
     99 {
    100   int64_t pos;
    101   ssize_t ret;
    102   char *res;
    103   void *data;
    104   const char *cdata;
    105   const char *eol;
    106 
    107   pos = ec->seek (ec->cls, 0, SEEK_CUR);
    108   if (0 >= (ret = ec->read (ec->cls, &data, MAX_LINE)))
    109     return NULL;
    110   cdata = data;
    111   if (NULL == (eol = memchr (cdata, '\n', ret)))
    112     return NULL; /* no end-of-line found */
    113   if (NULL == (res = malloc (eol - cdata + 1)))
    114     return NULL;
    115   memcpy (res, cdata, eol - cdata);
    116   res[eol - cdata] = '\0';
    117   ec->seek (ec->cls, pos + eol - cdata + 1, SEEK_SET);
    118   return res;
    119 }
    120 
    121 
    122 /**
    123  * Main entry method for the 'application/postscript' extraction plugin.
    124  *
    125  * @param ec extraction context provided to the plugin
    126  */
    127 void
    128 EXTRACTOR_ps_extract_method (struct EXTRACTOR_ExtractContext *ec)
    129 {
    130   unsigned int i;
    131   char *line;
    132   char *next;
    133   char *acc;
    134   const char *match;
    135 
    136   if (NULL == (line = readline (ec)))
    137     return;
    138   if ( (strlen (line) < strlen (PS_HEADER)) ||
    139        (0 != memcmp (PS_HEADER,
    140                      line,
    141                      strlen (PS_HEADER))) )
    142   {
    143     free (line);
    144     return;
    145   }
    146   free (line);
    147   if (0 != ec->proc (ec->cls,
    148                      "ps",
    149                      EXTRACTOR_METATYPE_MIMETYPE,
    150                      EXTRACTOR_METAFORMAT_UTF8,
    151                      "text/plain",
    152                      "application/postscript",
    153                      strlen ("application/postscript") + 1))
    154     return;
    155 
    156   line = NULL;
    157   next = readline (ec);
    158   while ( (NULL != next) &&
    159           ('%' == next[0]) )
    160   {
    161     line = next;
    162     next = readline (ec);
    163     for (i = 0; NULL != tests[i].prefix; i++)
    164     {
    165       match = tests[i].prefix;
    166       if ( (strlen (line) < strlen (match)) ||
    167            (0 != strncmp (line, match, strlen (match))) )
    168         continue;
    169       /* %%+ continues previous meta-data type... */
    170       while ( (NULL != next) &&
    171               (0 == strncmp (next, "%%+", strlen ("%%+"))) )
    172       {
    173         if (NULL == (acc = malloc (strlen (line) + strlen (next) - 1)))
    174           break;
    175         strcpy (acc, line);
    176         strcat (acc, " ");
    177         strcat (acc, next + 3);
    178         free (line);
    179         line = acc;
    180         free (next);
    181         next = readline (ec);
    182       }
    183       if ( (line[strlen (line) - 1] == ')') &&
    184            (line[strlen (match)] == '(') )
    185       {
    186         acc = &line[strlen (match) + 1];
    187         acc[strlen (acc) - 1] = '\0'; /* remove ")" */
    188       }
    189       else
    190       {
    191         acc = &line[strlen (match)];
    192       }
    193       while (isspace ((unsigned char) acc[0]))
    194         acc++;
    195       if ( (strlen (acc) > 0) &&
    196            (0 != ec->proc (ec->cls,
    197                            "ps",
    198                            tests[i].type,
    199                            EXTRACTOR_METAFORMAT_UTF8,
    200                            "text/plain",
    201                            acc,
    202                            strlen (acc) + 1)) )
    203       {
    204         free (line);
    205         if (NULL != next)
    206           free (next);
    207         return;
    208       }
    209       break;
    210     }
    211     free (line);
    212   }
    213   if (NULL != next)
    214     free (next);
    215 }
    216 
    217 
    218 /* end of ps_extractor.c */