libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

pdf_extractor.cc (8024B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2009, 2026 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file plugins/pdf_extractor.cc
     22  * @brief plugin to support PDF files
     23  * @author Vidyut Samanta
     24  * @author Christian Grothoff
     25  */
#include "platform.h"
#include "extractor.h"
#include <poppler/cpp/poppler-document.h>
#include <poppler/cpp/poppler-global.h>
#include <memory>
#include <string>
#include <vector>
     32 
     33 
     34 /**
     35  * Sanity bound on the size of a PDF we are willing to buffer
     36  * in memory (1 GB).  libpoppler needs the whole document, and
     37  * its raw-data loader takes an `int` length.
     38  */
     39 #define MAX_PDF_SIZE (1024LL * 1024LL * 1024LL)
     40 
     41 
     42 /**
     43  * Entry in the mapping from poppler accessors to LE types.
     44  */
     45 struct Matches
     46 {
     47   /**
     48    * Accessor on the poppler document returning the value.
     49    */
     50   poppler::ustring (poppler::document::*get) () const;
     51 
     52   /**
     53    * Corresponding meta data type in LE.
     54    */
     55   enum EXTRACTOR_MetaType type;
     56 };
     57 
     58 
     59 /**
     60  * Map from poppler document info accessors to LE types.
     61  *
     62  * Note that we deliberately map "Creator" to
     63  * #EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE: we believe that
     64  * Adobe's "Creator" is not a person nor an organisation, but
     65  * just a piece of software.
     66  */
     67 static const struct Matches tmap[] = {
     68   { &poppler::document::get_title,    EXTRACTOR_METATYPE_TITLE },
     69   { &poppler::document::get_subject,  EXTRACTOR_METATYPE_SUBJECT },
     70   { &poppler::document::get_keywords, EXTRACTOR_METATYPE_KEYWORDS },
     71   { &poppler::document::get_author,   EXTRACTOR_METATYPE_AUTHOR_NAME },
     72   { &poppler::document::get_creator,  EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     73   { &poppler::document::get_producer, EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
     74   { NULL, EXTRACTOR_METATYPE_RESERVED }
     75 };
     76 
     77 
     78 /**
     79  * Silence libpoppler: we do not want parsing diagnostics on
     80  * stderr of the plugin child process.
     81  *
     82  * @param msg the message (ignored)
     83  * @param cls closure (ignored)
     84  */
     85 static void
     86 quiet_error (const std::string &msg,
     87              void *cls)
     88 {
     89   (void) msg;
     90   (void) cls;
     91 }
     92 
     93 
     94 /**
     95  * Hand a UTF-8 string to the meta data processor, after
     96  * stripping trailing whitespace.  Empty values are skipped.
     97  *
     98  * @param ec extraction context
     99  * @param type meta data type to use
    100  * @param val UTF-8 bytes (need not be 0-terminated)
    101  * @return 0 to continue extracting, 1 if @a ec asked us to stop
    102  */
    103 static int
    104 add_utf8 (struct EXTRACTOR_ExtractContext *ec,
    105           enum EXTRACTOR_MetaType type,
    106           std::vector<char> val)
    107 {
    108   size_t len = val.size ();
    109 
    110   /*
    111    * Avoid outputting trailing whitespace.  Note that ISO-8859
    112    * NBSP (0xA0) becomes 0xC2 0xA0 in UTF-8 and is intentionally
    113    * not stripped here.
    114    */
    115   while ((0 < len) &&
    116          ((' ' == val[len - 1]) ||
    117           ('\r' == val[len - 1]) ||
    118           ('\n' == val[len - 1]) ||
    119           ('\t' == val[len - 1]) ||
    120           ('\v' == val[len - 1]) ||
    121           ('\f' == val[len - 1])))
    122     len--;
    123   if (0 == len)
    124     return 0;
    125   std::string s (val.data (), len);
    126   if (0 != ec->proc (ec->cls,
    127                      "pdf",
    128                      type,
    129                      EXTRACTOR_METAFORMAT_UTF8,
    130                      "text/plain",
    131                      s.c_str (),
    132                      s.size () + 1))
    133     return 1;
    134   return 0;
    135 }
    136 
    137 
    138 /**
    139  * Hand a 0-terminated C string to the meta data processor.
    140  *
    141  * @param ec extraction context
    142  * @param type meta data type to use
    143  * @param s the string
    144  * @return 0 to continue extracting, 1 if @a ec asked us to stop
    145  */
    146 static int
    147 add_str (struct EXTRACTOR_ExtractContext *ec,
    148          enum EXTRACTOR_MetaType type,
    149          const char *s)
    150 {
    151   if ((NULL == s) || ('\0' == s[0]))
    152     return 0;
    153   if (0 != ec->proc (ec->cls,
    154                      "pdf",
    155                      type,
    156                      EXTRACTOR_METAFORMAT_UTF8,
    157                      "text/plain",
    158                      s,
    159                      strlen (s) + 1))
    160     return 1;
    161   return 0;
    162 }
    163 
    164 
    165 /**
    166  * Report a date (given as a `time_t`) in ISO-8601 / UTC.
    167  *
    168  * @param ec extraction context
    169  * @param type meta data type to use
    170  * @param t the time, `(time_t) -1` or 0 if absent
    171  * @return 0 to continue extracting, 1 if @a ec asked us to stop
    172  */
    173 static int
    174 add_date (struct EXTRACTOR_ExtractContext *ec,
    175           enum EXTRACTOR_MetaType type,
    176           time_t t)
    177 {
    178   char buf[32];
    179   struct tm tv;
    180 
    181   if (((time_t) -1 == t) || (0 == t))
    182     return 0;
    183   if (NULL == gmtime_r (&t, &tv))
    184     return 0;
    185   if (0 == strftime (buf, sizeof (buf), "%Y-%m-%d %H:%M:%S", &tv))
    186     return 0;
    187   return add_str (ec, type, buf);
    188 }
    189 
    190 
    191 /**
    192  * Read the entire input into @a buf.
    193  *
    194  * @param ec extraction context
    195  * @param[out] buf buffer to fill with the file contents
    196  * @return 0 on success, -1 on error
    197  */
    198 static int
    199 read_all (struct EXTRACTOR_ExtractContext *ec,
    200           std::vector<char> &buf)
    201 {
    202   uint64_t size;
    203 
    204   size = ec->get_size (ec->cls);
    205   if ((UINT64_MAX == size) ||
    206       (0 == size) ||
    207       (size > MAX_PDF_SIZE))
    208     return -1;
    209   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
    210     return -1;
    211   buf.reserve ((size_t) size);
    212   while (buf.size () < size)
    213   {
    214     void *data;
    215     ssize_t got;
    216 
    217     got = ec->read (ec->cls,
    218                     &data,
    219                     (size_t) (size - buf.size ()));
    220     if ((got <= 0) || (NULL == data))
    221       break;
    222     buf.insert (buf.end (),
    223                 static_cast<char *> (data),
    224                 static_cast<char *> (data) + got);
    225   }
    226   if (buf.empty ())
    227     return -1;
    228   return 0;
    229 }
    230 
    231 
    232 /**
    233  * Main entry method for the PDF extraction plugin.
    234  *
    235  * @param ec extraction context provided to the plugin
    236  */
    237 extern "C" void
    238 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
    239 {
    240   void *hdr;
    241   std::vector<char> buf;
    242   poppler::document *doc;
    243   int major;
    244   int minor;
    245   char ver[32];
    246   char pages[16];
    247 
    248   if (4 != ec->read (ec->cls, &hdr, 4))
    249     return;
    250   if (0 != memcmp ("%PDF", hdr, 4))
    251     return;
    252   if (0 != read_all (ec, buf))
    253     return;
    254 
    255   poppler::set_debug_error_function (&quiet_error, NULL);
    256   doc = poppler::document::load_from_raw_data (buf.data (),
    257                                                (int) buf.size ());
    258   if (NULL == doc)
    259     return;
    260   /* An encrypted document we cannot open exposes no usable meta data. */
    261   if (doc->is_locked ())
    262   {
    263     delete doc;
    264     return;
    265   }
    266 
    267   if (0 != add_str (ec,
    268                     EXTRACTOR_METATYPE_MIMETYPE,
    269                     "application/pdf"))
    270     goto CLEANUP;
    271   for (unsigned int i = 0; NULL != tmap[i].get; i++)
    272     if (0 != add_utf8 (ec,
    273                        tmap[i].type,
    274                        (doc->*tmap[i].get)().to_utf8 ()))
    275       goto CLEANUP;
    276   doc->get_pdf_version (&major, &minor);
    277   snprintf (ver, sizeof (ver), "PDF %d.%d", major, minor);
    278   if (0 != add_str (ec, EXTRACTOR_METATYPE_FORMAT, ver))
    279     goto CLEANUP;
    280   snprintf (pages, sizeof (pages), "%d", doc->pages ());
    281   if (0 != add_str (ec, EXTRACTOR_METATYPE_PAGE_COUNT, pages))
    282     goto CLEANUP;
    283   if (0 != add_date (ec,
    284                      EXTRACTOR_METATYPE_CREATION_DATE,
    285                      doc->get_creation_date_t ()))
    286     goto CLEANUP;
    287   if (0 != add_date (ec,
    288                      EXTRACTOR_METATYPE_MODIFICATION_DATE,
    289                      doc->get_modification_date_t ()))
    290     goto CLEANUP;
    291 CLEANUP:
    292   delete doc;
    293 }
    294 
    295 
    296 /* end of pdf_extractor.cc */