libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

pdf_extractor.c (5530B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2016 Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file plugins/pdf_extractor.c
     22  * @brief plugin to support PDF files
     23  * @author Christian Grothoff
     24  *
     25  * PDF libraries today are a nightmare (TM).  So instead of doing the
     26  * fast thing and calling some library functions to parse the PDF,
     27  * we execute 'pdfinfo' and parse the output. Because that's 21st
     28  * century plumbing: nobody writes reasonable code anymore.
     29  */
     30 #include "platform.h"
     31 #include <extractor.h>
     32 #include <sys/types.h>
     33 #include <sys/wait.h>
     34 #include <signal.h>
     35 #include <unistd.h>
     36 
     37 /**
     38  * Entry in the mapping from control data to LE types.
     39  */
     40 struct Matches
     41 {
     42   /**
     43    * Key in the Pdfian control file.
     44    */
     45   const char *text;
     46 
     47   /**
     48    * Corresponding type in LE.
     49    */
     50   enum EXTRACTOR_MetaType type;
     51 };
     52 
     53 
     54 /**
     55  * Map from pdf-control entries to LE types.
     56  *
     57  * See output of 'pdfinfo'.
     58  */
     59 static struct Matches tmap[] = {
     60   {"Title",        EXTRACTOR_METATYPE_TITLE},
     61   {"Subject",      EXTRACTOR_METATYPE_SUBJECT},
     62   {"Keywords",     EXTRACTOR_METATYPE_KEYWORDS},
     63   {"Author",       EXTRACTOR_METATYPE_AUTHOR_NAME},
     64   {"Creator",      EXTRACTOR_METATYPE_CREATOR},
     65   {"Producer",     EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
     66   {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
     67   {"ModDate",      EXTRACTOR_METATYPE_MODIFICATION_DATE},
     68   {"PDF version",  EXTRACTOR_METATYPE_ENCODER_VERSION},
     69   {"Pages",        EXTRACTOR_METATYPE_PAGE_COUNT},
     70   {NULL, 0}
     71 };
     72 
     73 
     74 /**
     75  * Process the "stdout" file from pdfinfo.
     76  *
     77  * @param fout stdout of pdfinfo
     78  * @param proc function to call with meta data
     79  * @param proc_cls closure for @e proc
     80  */
     81 static void
     82 process_stdout (FILE *fout,
     83                 EXTRACTOR_MetaDataProcessor proc,
     84                 void *proc_cls)
     85 {
     86   unsigned int i;
     87   char line[1025];
     88   const char *psuffix;
     89   const char *colon;
     90 
     91   while (! feof (fout))
     92   {
     93     if (NULL == fgets (line, sizeof (line) - 1, fout))
     94       break;
     95     if (0 == strlen (line))
     96       continue;
     97     if ('\n' == line[strlen (line) - 1])
     98       line[strlen (line) - 1] = '\0';
     99     colon = strchr (line, (int) ':');
    100     if (NULL == colon)
    101       break;
    102     psuffix = colon + 1;
    103     while (isblank ((unsigned char) psuffix[0]))
    104       psuffix++;
    105     if (0 == strlen (psuffix))
    106       continue;
    107     for (i = 0; NULL != tmap[i].text; i++)
    108     {
    109       if (0 != strncasecmp (line,
    110                             tmap[i].text,
    111                             colon - line))
    112         continue;
    113       if (0 != proc (proc_cls,
    114                      "pdf",
    115                      tmap[i].type,
    116                      EXTRACTOR_METAFORMAT_UTF8,
    117                      "text/plain",
    118                      psuffix,
    119                      strlen (psuffix) + 1))
    120         return;
    121       break;
    122     }
    123   }
    124 }
    125 
    126 
    127 /**
    128  * Main entry method for the PDF extraction plugin.
    129  *
    130  * @param ec extraction context provided to the plugin
    131  */
    132 void
    133 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
    134 {
    135   uint64_t fsize;
    136   void *data;
    137   pid_t pid;
    138   int in[2];
    139   int out[2];
    140   FILE *fout;
    141   uint64_t pos;
    142 
    143   fsize = ec->get_size (ec->cls);
    144   if (fsize < 128)
    145     return;
    146   if (4 !=
    147       ec->read (ec->cls, &data, 4))
    148     return;
    149   if (0 != strncmp ("%PDF", data, 4))
    150     return;
    151   if (0 !=
    152       ec->seek (ec->cls, 0, SEEK_SET))
    153     return;
    154   if (0 != pipe (in))
    155     return;
    156   if (0 != pipe (out))
    157   {
    158     close (in[0]);
    159     close (in[1]);
    160     return;
    161   }
    162   pid = fork ();
    163   if (-1 == pid)
    164   {
    165     close (in[0]);
    166     close (in[1]);
    167     close (out[0]);
    168     close (out[1]);
    169     return;
    170   }
    171   if (0 == pid)
    172   {
    173     char *const args[] = {
    174       "pdfinfo",
    175       "-",
    176       NULL
    177     };
    178     /* am child, exec 'pdfinfo' */
    179     close (0);
    180     close (1);
    181     if ( (-1 == dup2 (in[0], 0)) ||
    182          (-1 == dup2 (out[1], 1)) )
    183       exit (1);
    184     close (in[0]);
    185     close (in[1]);
    186     close (out[0]);
    187     close (out[1]);
    188     execvp ("pdfinfo", args);
    189     exit (1);
    190   }
    191   /* am parent, send file */
    192   close (in[0]);
    193   close (out[1]);
    194   fout = fdopen (out[0], "r");
    195   if (NULL == fout)
    196   {
    197     close (in[1]);
    198     close (out[0]);
    199     kill (pid, SIGKILL);
    200     waitpid (pid, NULL, 0);
    201     return;
    202   }
    203   pos = 0;
    204   while (pos < fsize)
    205   {
    206     ssize_t got;
    207     size_t wpos;
    208 
    209     data = NULL;
    210     got = ec->read (ec->cls,
    211                     &data,
    212                     fsize - pos);
    213     if ( (-1 == got) ||
    214          (NULL == data) )
    215       break;
    216     wpos = 0;
    217     while (wpos < got)
    218     {
    219       ssize_t out;
    220 
    221       out = write (in[1], data + wpos, got - wpos);
    222       if (out <= 0)
    223         break;
    224       wpos += out;
    225     }
    226     if (wpos < got)
    227       break;
    228     pos += got;
    229   }
    230   close (in[1]);
    231   process_stdout (fout, ec->proc, ec->cls);
    232   fclose (fout);
    233   kill (pid, SIGKILL);
    234   waitpid (pid, NULL, 0);
    235 }
    236 
    237 
    238 /* end of pdf_extractor.c */