pdf_extractor.cc (6377B)
1 /* 2 This file is part of libextractor. 3 (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 2, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 20 This code was inspired by pdfinfo and depends heavily 21 on the xpdf code that pdfinfo is a part of. See also 22 the INFO file in this directory. 23 */ 24 25 #include "platform.h" 26 #include "extractor.h" 27 #include "convert.h" 28 #include <math.h> 29 30 #include <poppler/goo/gmem.h> 31 #include <poppler/Object.h> 32 #include <poppler/Stream.h> 33 #include <poppler/Array.h> 34 #include <poppler/Dict.h> 35 #include <poppler/XRef.h> 36 #include <poppler/Catalog.h> 37 #include <poppler/Page.h> 38 #include <poppler/PDFDoc.h> 39 #include <poppler/Error.h> 40 #include <poppler/GlobalParams.h> 41 #include <poppler/goo/GooString.h> 42 43 #define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0) 44 45 static int 46 printInfoString(Dict *infoDict, 47 const char *key, 48 enum EXTRACTOR_MetaType type, 49 EXTRACTOR_MetaDataProcessor proc, 50 void *proc_cls) 51 { 52 Object obj; 53 GooString *s1; 54 const char * s; 55 char *ckey = strdup (key); 56 int err = 0; 57 char * result; 58 59 if (ckey == NULL) 60 return 0; 61 result = NULL; 62 if (infoDict->lookup(ckey, &obj)->isString()) { 63 s1 = obj.getString(); 64 s = s1->getCString(); 65 if ((((unsigned char)s[0]) & 0xff) == 0xfe && 66 (((unsigned char)s[1]) & 0xff) == 0xff) { 67 result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE"); 68 if (result != NULL) 69 ADD (result, type); 70 } else { 71 size_t len = strlen(s); 72 73 while(0 < len) 74 { 75 /* 76 * Avoid outputting trailing spaces. 77 * 78 * The following expression might be rewritten as 79 * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]). 80 * There seem to exist isspace() implementations 81 * which do return non-zero from NBSP (maybe locale-dependent). 82 * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from 83 * the expression if it looks suspicious (locale issues for instance). 84 * 85 * Squeezing out all non-printable characters might also be useful. 86 */ 87 if ( (' ' != s[len - 1]) && (((char)0xA0) != s[len - 1]) && 88 ('\r' != s[len - 1]) && ('\n' != s[len - 1]) && 89 ('\t' != s[len - 1]) && ('\v' != s[len - 1]) && 90 ('\f' != s[len - 1]) ) 91 break; 92 else 93 len --; 94 } 95 96 /* there should be a check to truncate preposterously long values. */ 97 98 if (0 < len) { 99 result = EXTRACTOR_common_convert_to_utf8(s, len, 100 "ISO-8859-1"); 101 if (result != NULL) 102 ADD (result, type); 103 } 104 } 105 } 106 EXIT: 107 obj.free(); 108 if (result != NULL) 109 free (result); 110 free (ckey); 111 return err; 112 } 113 114 static int 115 printInfoDate(Dict *infoDict, 116 const char *key, 117 enum EXTRACTOR_MetaType type, 118 EXTRACTOR_MetaDataProcessor proc, 119 void *proc_cls) 120 { 121 Object obj; 122 const char *s; 123 GooString *s1; 124 char *gkey; 125 char * result; 126 int err; 127 128 err = 0; 129 result = NULL; 130 gkey = strdup (key); 131 if (gkey == NULL) 132 return 0; 133 if (infoDict->lookup(gkey, &obj)->isString()) { 134 s1 = obj.getString(); 135 s = s1->getCString(); 136 137 if ((s1->getChar(0) & 0xff) == 0xfe && 138 (s1->getChar(1) & 0xff) == 0xff) { 139 /* isUnicode */ 140 141 result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE"); 142 if (result != NULL) 143 ADD (result, type); 144 } else { 145 if (s[0] == 'D' && s[1] == ':') 146 s += 2; 147 148 ADD (s, type); 149 } 150 /* printf(fmt, s);*/ 151 } 152 EXIT: 153 obj.free(); 154 if (result != NULL) 155 free (result); 156 free (gkey); 157 return err; 158 } 159 160 #define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0) 161 162 #define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0) 163 164 extern "C" { 165 166 167 int 168 EXTRACTOR_pdf_extract (const char *data, 169 size_t size, 170 EXTRACTOR_MetaDataProcessor proc, 171 void *proc_cls, 172 const char *options) 173 { 174 PDFDoc * doc; 175 Object info; 176 Object obj; 177 BaseStream * stream; 178 int err; 179 180 if (globalParams == NULL) 181 { 182 globalParams = new GlobalParams(); 183 globalParams->setErrQuiet (gTrue); 184 } 185 obj.initNull(); 186 err = 0; 187 stream = new MemStream( (char*) data, 0, size, &obj); 188 doc = new PDFDoc(stream, NULL, NULL); 189 if (! doc->isOk()) { 190 delete doc; 191 return 0; 192 } 193 194 ADD ("application/pdf", 195 EXTRACTOR_METATYPE_MIMETYPE); 196 if ( (NULL != doc->getDocInfo(&info)) && 197 (info.isDict()) ) { 198 PIS ("Title", EXTRACTOR_METATYPE_TITLE); 199 PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT); 200 PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS); 201 PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME); 202 /* 203 * we now believe that Adobe's Creator is not a person nor an 204 * organisation, but just a piece of software. 205 */ 206 PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE); 207 PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE); 208 { 209 char pcnt[20]; 210 sprintf(pcnt, "%d", doc->getNumPages()); 211 ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT); 212 } 213 { 214 char pcnt[64]; 215 #if HAVE_POPPLER_GETPDFMAJORVERSION 216 sprintf(pcnt, "PDF %d.%d", 217 doc->getPDFMajorVersion(), 218 doc->getPDFMinorVersion()); 219 #else 220 sprintf(pcnt, "PDF %.1f", 221 doc->getPDFVersion()); 222 #endif 223 ADD (pcnt, EXTRACTOR_METATYPE_FORMAT); 224 } 225 PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE); 226 PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE); 227 } 228 EXIT: 229 info.free(); 230 delete doc; 231 232 return err; 233 } 234 } 235