libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

dvi_extractor.c (8709B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2012, 2017, 2019 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file plugins/dvi_extractor.c
     22  * @brief plugin to support DVI files (from LaTeX)
     23  * @author Christian Grothoff
     24  */
     25 #include "platform.h"
     26 #include "extractor.h"
     27 
     28 
     29 /**
     30  * Pair of a PostScipt prefix and the corresponding LE type.
     31  */
     32 struct Matches
     33 {
     34   /**
     35    * Prefix in the PS map.
     36    */
     37   const char *text;
     38 
     39   /**
     40    * Corresponding LE type.
     41    */
     42   enum EXTRACTOR_MetaType type;
     43 };
     44 
     45 
     46 /**
     47  * Map from PS names to LE types.
     48  */
     49 static struct Matches tmap[] = {
     50   { "/Title (",    EXTRACTOR_METATYPE_TITLE },
     51   { "/Subject (",  EXTRACTOR_METATYPE_SUBJECT },
     52   { "/Author (",   EXTRACTOR_METATYPE_AUTHOR_NAME },
     53   { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS },
     54   { "/Creator (",  EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     55   { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
     56   { NULL, 0 }
     57 };
     58 
     59 
     60 /**
     61  * Parse a "ZZZ" tag.  Specifically, the data may contain a
     62  * postscript dictionary with metadata.
     63  *
     64  * @param data overall input stream
     65  * @param pos where in data is the zzz data
     66  * @param len how many bytes from 'pos' does the zzz data extend?
     67  * @param proc function to call with meta data found
     68  * @param proc_cls closure for proc
     69  * @return 0 to continue to extract, 1 to stop
     70  */
     71 static int
     72 parseZZZ (const char *data,
     73           size_t pos, size_t len,
     74           EXTRACTOR_MetaDataProcessor proc,
     75           void *proc_cls)
     76 {
     77   size_t slen;
     78   size_t end;
     79   unsigned int i;
     80 
     81   end = pos + len;
     82   slen = strlen ("ps:SDict begin [");
     83   if ( (len <= slen) ||
     84        (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) )
     85     return 0;
     86   pos += slen;
     87   while (pos < end)
     88   {
     89     for (i = 0; NULL != tmap[i].text; i++)
     90     {
     91       slen = strlen (tmap[i].text);
     92       if ( (pos + slen > end) ||
     93            (0 != strncmp (&data[pos], tmap[i].text, slen)) )
     94         continue;
     95       pos += slen;
     96       slen = pos;
     97       while ((slen < end) && (data[slen] != ')'))
     98         slen++;
     99       slen = slen - pos;
    100       {
    101         char value[slen + 1];
    102 
    103         value[slen] = '\0';
    104         memcpy (value, &data[pos], slen);
    105         if (0 != proc (proc_cls,
    106                        "dvi",
    107                        tmap[i].type,
    108                        EXTRACTOR_METAFORMAT_C_STRING,
    109                        "text/plain",
    110                        value,
    111                        slen + 1))
    112           return 1;
    113       }
    114       pos += slen + 1;
    115       break;
    116     }
    117     pos++;
    118   }
    119   return 0;
    120 }
    121 
    122 
    123 /**
    124  * Read 32-bit unsigned integer in big-endian format from 'data'.
    125  *
    126  * @param data pointer to integer (possibly unaligned)
    127  * @return 32-bit integer in host byte order
    128  */
    129 static uint32_t
    130 getIntAt (const void *data)
    131 {
    132   uint32_t p;
    133 
    134   memcpy (&p, data, 4);          /* ensure alignment! */
    135   return ntohl (p);
    136 }
    137 
    138 
    139 /**
    140  * Read 16-bit unsigned integer in big-endian format from 'data'.
    141  *
    142  * @param data pointer to integer (possibly unaligned)
    143  * @return 16-bit integer in host byte order
    144  */
    145 static uint16_t
    146 getShortAt (const void *data)
    147 {
    148   uint16_t p;
    149 
    150   memcpy (&p, data, sizeof (uint16_t));          /* ensure alignment! */
    151   return ntohs (p);
    152 }
    153 
    154 
    155 /**
    156  * Main entry method for the 'application/x-dvi' extraction plugin.
    157  *
    158  * @param ec extraction context provided to the plugin
    159  */
    160 void
    161 EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec)
    162 {
    163   unsigned int klen;
    164   uint32_t pos;
    165   uint32_t opos;
    166   unsigned int len;
    167   unsigned int pageCount;
    168   char pages[16];
    169   void *buf;
    170   unsigned char *data;
    171   uint64_t size;
    172   uint64_t off;
    173   ssize_t iret;
    174 
    175   if (40 >= (iret = ec->read (ec->cls, &buf, 1024)))
    176     return;
    177   data = buf;
    178   if ( (data[0] != 247) ||
    179        (data[1] != 2) )
    180     return;                /* cannot be DVI or unsupported version */
    181   klen = data[14];
    182   size = ec->get_size (ec->cls);
    183   if (size > 16 * 1024 * 1024)
    184     return; /* too large */
    185   if (klen + 15 > size)
    186     return; /* malformed klen */
    187   if (NULL == (data = malloc ((size_t) size)))
    188     return; /* out of memory */
    189   memcpy (data, buf, iret);
    190   off = iret;
    191   while (off < size)
    192   {
    193     if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024)))
    194     {
    195       free (data);
    196       return;
    197     }
    198     memcpy (&data[off], buf, iret);
    199     off += iret;
    200   }
    201   pos = size - 1;
    202   while ( (223 == data[pos]) &&
    203           (pos > 0) )
    204     pos--;
    205   if ( (2 != data[pos]) ||
    206        (pos < 40) )
    207     goto CLEANUP;
    208   pos--;
    209   pos -= 4;
    210   /* assert pos at 'post_post tag' */
    211   if (data[pos] != 249)
    212     goto CLEANUP;
    213   opos = pos;
    214   pos = getIntAt (&data[opos + 1]);
    215   if ( (pos + 25 > size) ||
    216        (pos + 25 < pos) )
    217     goto CLEANUP;
    218   /* assert pos at 'post' command */
    219   if (data[pos] != 248)
    220     goto CLEANUP;
    221   pageCount = 0;
    222   opos = pos;
    223   pos = getIntAt (&data[opos + 1]);
    224   while (1)
    225   {
    226     if (UINT32_MAX == pos)
    227       break;
    228     if ( (pos + 45 > size) ||
    229          (pos + 45 < pos) )
    230       goto CLEANUP;
    231     if (data[pos] != 139)       /* expect 'bop' */
    232       goto CLEANUP;
    233     pageCount++;
    234     opos = pos;
    235     pos = getIntAt (&data[opos + 41]);
    236     if (UINT32_MAX == pos)
    237       break;
    238     if (pos >= opos)
    239       goto CLEANUP;       /* invalid! */
    240   }
    241   /* ok, now we believe it's a dvi... */
    242   snprintf (pages,
    243             sizeof (pages),
    244             "%u",
    245             pageCount);
    246   if (0 != ec->proc (ec->cls,
    247                      "dvi",
    248                      EXTRACTOR_METATYPE_PAGE_COUNT,
    249                      EXTRACTOR_METAFORMAT_UTF8,
    250                      "text/plain",
    251                      pages,
    252                      strlen (pages) + 1))
    253     goto CLEANUP;
    254   if (0 != ec->proc (ec->cls,
    255                      "dvi",
    256                      EXTRACTOR_METATYPE_MIMETYPE,
    257                      EXTRACTOR_METAFORMAT_UTF8,
    258                      "text/plain",
    259                      "application/x-dvi",
    260                      strlen ("application/x-dvi") + 1))
    261     goto CLEANUP;
    262   {
    263     char comment[klen + 1];
    264 
    265     comment[klen] = '\0';
    266     memcpy (comment, &data[15], klen);
    267     if (0 != ec->proc (ec->cls,
    268                        "dvi",
    269                        EXTRACTOR_METATYPE_COMMENT,
    270                        EXTRACTOR_METAFORMAT_C_STRING,
    271                        "text/plain",
    272                        comment,
    273                        klen + 1))
    274       goto CLEANUP;
    275   }
    276   /* try to find PDF/ps special */
    277   pos = opos;
    278   while ( (size >= 100) &&
    279           (pos < size - 100) )
    280   {
    281     switch (data[pos])
    282     {
    283     case 139:                  /* begin page 'bop', we typically have to skip that one to
    284                                    find the zzz's */
    285       pos += 45;                /* skip bop */
    286       break;
    287     case 239:                  /* zzz1 */
    288       len = data[pos + 1];
    289       if ( (pos + 2 + len < size) &&
    290            (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc,
    291                            ec->cls)) )
    292         goto CLEANUP;
    293       pos += len + 2;
    294       break;
    295     case 240:                  /* zzz2 */
    296       len = getShortAt (&data[pos + 1]);
    297       if ( (pos + 3 + len < size) &&
    298            (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc,
    299                            ec->cls)) )
    300         goto CLEANUP;
    301       pos += len + 3;
    302       break;
    303     case 241:                  /* zzz3, who uses that? */
    304       len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3];
    305       if ( (pos + 4 + len < size) &&
    306            (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc,
    307                            ec->cls)) )
    308         goto CLEANUP;
    309       pos += len + 4;
    310       break;
    311     case 242:                  /* zzz4, hurray! */
    312       len = getIntAt (&data[pos + 1]);
    313       if ( (pos + 1 + len < size) &&
    314            (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc,
    315                            ec->cls)) )
    316         goto CLEANUP;
    317       pos += len + 5;
    318       break;
    319     default:                   /* unsupported opcode, abort scan */
    320       goto CLEANUP;
    321     }
    322   }
    323 CLEANUP:
    324   free (data);
    325 }
    326 
    327 
    328 /* end of dvi_extractor.c */