libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

pdf_extractor.cc (6377B)


      1 /*
      2      This file is part of libextractor.
      3      (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 2, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 
     20      This code was inspired by pdfinfo and depends heavily
     21      on the xpdf code that pdfinfo is a part of. See also
     22      the INFO file in this directory.
     23  */
     24 
     25 #include "platform.h"
     26 #include "extractor.h"
     27 #include "convert.h"
     28 #include <math.h>
     29 
     30 #include <poppler/goo/gmem.h>
     31 #include <poppler/Object.h>
     32 #include <poppler/Stream.h>
     33 #include <poppler/Array.h>
     34 #include <poppler/Dict.h>
     35 #include <poppler/XRef.h>
     36 #include <poppler/Catalog.h>
     37 #include <poppler/Page.h>
     38 #include <poppler/PDFDoc.h>
     39 #include <poppler/Error.h>
     40 #include <poppler/GlobalParams.h>
     41 #include <poppler/goo/GooString.h>
     42 
     43 #define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
     44 
     45 static int 
     46 printInfoString(Dict *infoDict,
     47 		const char *key,
     48 		enum EXTRACTOR_MetaType type,
     49 		EXTRACTOR_MetaDataProcessor proc,
     50 		void *proc_cls)
     51 {
     52   Object obj;
     53   GooString *s1;
     54   const char * s;
     55   char *ckey = strdup (key);
     56   int err = 0;
     57   char * result;
     58       
     59   if (ckey == NULL)
     60     return 0;
     61   result = NULL;
     62   if (infoDict->lookup(ckey, &obj)->isString()) {
     63     s1 = obj.getString();
     64     s = s1->getCString();
     65     if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
     66 	(((unsigned char)s[1]) & 0xff) == 0xff) {
     67       result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE");
     68       if (result != NULL)
     69 	ADD (result, type);
     70     } else {
     71       size_t len = strlen(s);
     72       
     73       while(0 < len) 
     74 	{
     75 	  /*
     76 	   * Avoid outputting trailing spaces.
     77 	   *
     78 	   * The following expression might be rewritten as
     79 	   * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]).
     80 	   * There seem to exist isspace() implementations
     81 	   * which do return non-zero from NBSP (maybe locale-dependent).
     82 	   * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from
     83 	   * the expression if it looks suspicious (locale issues for instance).
     84 	   *
     85 	   * Squeezing out all non-printable characters might also be useful.
     86 	   */
     87   	  if ( (' '  != s[len - 1]) && (((char)0xA0) != s[len - 1]) &&
     88                ('\r' != s[len - 1]) && ('\n' != s[len - 1]) &&
     89                ('\t' != s[len - 1]) && ('\v' != s[len - 1]) &&
     90                ('\f' != s[len - 1]) )
     91 	    break;	  
     92           else
     93             len --;
     94         }
     95 
     96         /* there should be a check to truncate preposterously long values. */
     97       
     98       if (0 < len) {
     99 	result = EXTRACTOR_common_convert_to_utf8(s, len,
    100 						  "ISO-8859-1");
    101 	if (result != NULL)
    102 	  ADD (result, type);
    103       }
    104     }
    105   }
    106  EXIT:
    107   obj.free();
    108   if (result != NULL)
    109     free (result);
    110   free (ckey);
    111   return err;
    112 }
    113 
    114 static int 
    115 printInfoDate(Dict *infoDict,
    116 	      const char *key,
    117 	      enum EXTRACTOR_MetaType type,
    118 	      EXTRACTOR_MetaDataProcessor proc,
    119 	      void *proc_cls)
    120 {
    121   Object obj;
    122   const char *s;
    123   GooString *s1;  
    124   char *gkey;
    125   char * result;
    126   int err;
    127   
    128   err = 0;
    129   result = NULL;
    130   gkey = strdup (key);
    131   if (gkey == NULL)
    132     return 0;
    133   if (infoDict->lookup(gkey, &obj)->isString()) {
    134     s1 = obj.getString();
    135     s = s1->getCString();
    136     
    137     if ((s1->getChar(0) & 0xff) == 0xfe &&
    138 	(s1->getChar(1) & 0xff) == 0xff) {
    139       /* isUnicode */
    140       
    141       result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
    142       if (result != NULL)
    143 	ADD (result, type);
    144     } else {
    145       if (s[0] == 'D' && s[1] == ':') 
    146 	s += 2;
    147       
    148       ADD (s, type);
    149     }
    150     /* printf(fmt, s);*/
    151   }
    152  EXIT:
    153   obj.free();
    154   if (result != NULL)
    155     free (result);
    156   free (gkey);
    157   return err;
    158 }
    159 
    160 #define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
    161 
    162 #define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
    163 
    164 extern "C" {
    165  
    166 
    167   int 
    168   EXTRACTOR_pdf_extract (const char *data,
    169 			 size_t size,
    170 			 EXTRACTOR_MetaDataProcessor proc,
    171 			 void *proc_cls,
    172 			 const char *options)
    173   {
    174     PDFDoc * doc;
    175     Object info;
    176     Object obj;
    177     BaseStream * stream;
    178     int err;
    179 
    180     if (globalParams == NULL)
    181       {
    182 	globalParams = new GlobalParams();
    183 	globalParams->setErrQuiet (gTrue);
    184       }
    185     obj.initNull();
    186     err = 0;
    187     stream = new MemStream( (char*) data, 0, size, &obj);
    188     doc = new PDFDoc(stream, NULL, NULL);
    189     if (! doc->isOk()) {
    190       delete doc;
    191       return 0;
    192     }
    193 
    194     ADD ("application/pdf",
    195 	 EXTRACTOR_METATYPE_MIMETYPE);
    196     if ( (NULL != doc->getDocInfo(&info)) &&
    197 	 (info.isDict()) ) {
    198       PIS ("Title", EXTRACTOR_METATYPE_TITLE);
    199       PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT);
    200       PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS);
    201       PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME);
    202       /*
    203        * we now believe that Adobe's Creator is not a person nor an
    204        * organisation, but just a piece of software.
    205        */
    206       PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE);
    207       PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE);
    208       {
    209 	char pcnt[20];
    210 	sprintf(pcnt, "%d", doc->getNumPages());
    211 	ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT);
    212       }
    213       {
    214 	char pcnt[64];
    215 #if HAVE_POPPLER_GETPDFMAJORVERSION
    216 	sprintf(pcnt, "PDF %d.%d", 
    217 		doc->getPDFMajorVersion(),
    218 		doc->getPDFMinorVersion());
    219 #else
    220 	sprintf(pcnt, "PDF %.1f", 
    221 		doc->getPDFVersion());
    222 #endif
    223 	ADD (pcnt, EXTRACTOR_METATYPE_FORMAT);
    224       }
    225       PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE);
    226       PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE);
    227     }
    228   EXIT:
    229     info.free();
    230     delete doc;
    231 
    232     return err;
    233   }
    234 }
    235