pdf_extractor.cc (8024B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2009, 2026 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 /** 21 * @file plugins/pdf_extractor.cc 22 * @brief plugin to support PDF files 23 * @author Vidyut Samanta 24 * @author Christian Grothoff 25 */ 26 #include "platform.h" 27 #include "extractor.h" 28 #include <poppler/cpp/poppler-document.h> 29 #include <poppler/cpp/poppler-global.h> 30 #include <string> 31 #include <vector> 32 33 34 /** 35 * Sanity bound on the size of a PDF we are willing to buffer 36 * in memory (1 GB). libpoppler needs the whole document, and 37 * its raw-data loader takes an `int` length. 38 */ 39 #define MAX_PDF_SIZE (1024LL * 1024LL * 1024LL) 40 41 42 /** 43 * Entry in the mapping from poppler accessors to LE types. 44 */ 45 struct Matches 46 { 47 /** 48 * Accessor on the poppler document returning the value. 49 */ 50 poppler::ustring (poppler::document::*get) () const; 51 52 /** 53 * Corresponding meta data type in LE. 54 */ 55 enum EXTRACTOR_MetaType type; 56 }; 57 58 59 /** 60 * Map from poppler document info accessors to LE types. 61 * 62 * Note that we deliberately map "Creator" to 63 * #EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE: we believe that 64 * Adobe's "Creator" is not a person nor an organisation, but 65 * just a piece of software. 66 */ 67 static const struct Matches tmap[] = { 68 { &poppler::document::get_title, EXTRACTOR_METATYPE_TITLE }, 69 { &poppler::document::get_subject, EXTRACTOR_METATYPE_SUBJECT }, 70 { &poppler::document::get_keywords, EXTRACTOR_METATYPE_KEYWORDS }, 71 { &poppler::document::get_author, EXTRACTOR_METATYPE_AUTHOR_NAME }, 72 { &poppler::document::get_creator, EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 73 { &poppler::document::get_producer, EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, 74 { NULL, EXTRACTOR_METATYPE_RESERVED } 75 }; 76 77 78 /** 79 * Silence libpoppler: we do not want parsing diagnostics on 80 * stderr of the plugin child process. 81 * 82 * @param msg the message (ignored) 83 * @param cls closure (ignored) 84 */ 85 static void 86 quiet_error (const std::string &msg, 87 void *cls) 88 { 89 (void) msg; 90 (void) cls; 91 } 92 93 94 /** 95 * Hand a UTF-8 string to the meta data processor, after 96 * stripping trailing whitespace. Empty values are skipped. 97 * 98 * @param ec extraction context 99 * @param type meta data type to use 100 * @param val UTF-8 bytes (need not be 0-terminated) 101 * @return 0 to continue extracting, 1 if @a ec asked us to stop 102 */ 103 static int 104 add_utf8 (struct EXTRACTOR_ExtractContext *ec, 105 enum EXTRACTOR_MetaType type, 106 std::vector<char> val) 107 { 108 size_t len = val.size (); 109 110 /* 111 * Avoid outputting trailing whitespace. Note that ISO-8859 112 * NBSP (0xA0) becomes 0xC2 0xA0 in UTF-8 and is intentionally 113 * not stripped here. 114 */ 115 while ((0 < len) && 116 ((' ' == val[len - 1]) || 117 ('\r' == val[len - 1]) || 118 ('\n' == val[len - 1]) || 119 ('\t' == val[len - 1]) || 120 ('\v' == val[len - 1]) || 121 ('\f' == val[len - 1]))) 122 len--; 123 if (0 == len) 124 return 0; 125 std::string s (val.data (), len); 126 if (0 != ec->proc (ec->cls, 127 "pdf", 128 type, 129 EXTRACTOR_METAFORMAT_UTF8, 130 "text/plain", 131 s.c_str (), 132 s.size () + 1)) 133 return 1; 134 return 0; 135 } 136 137 138 /** 139 * Hand a 0-terminated C string to the meta data processor. 140 * 141 * @param ec extraction context 142 * @param type meta data type to use 143 * @param s the string 144 * @return 0 to continue extracting, 1 if @a ec asked us to stop 145 */ 146 static int 147 add_str (struct EXTRACTOR_ExtractContext *ec, 148 enum EXTRACTOR_MetaType type, 149 const char *s) 150 { 151 if ((NULL == s) || ('\0' == s[0])) 152 return 0; 153 if (0 != ec->proc (ec->cls, 154 "pdf", 155 type, 156 EXTRACTOR_METAFORMAT_UTF8, 157 "text/plain", 158 s, 159 strlen (s) + 1)) 160 return 1; 161 return 0; 162 } 163 164 165 /** 166 * Report a date (given as a `time_t`) in ISO-8601 / UTC. 167 * 168 * @param ec extraction context 169 * @param type meta data type to use 170 * @param t the time, `(time_t) -1` or 0 if absent 171 * @return 0 to continue extracting, 1 if @a ec asked us to stop 172 */ 173 static int 174 add_date (struct EXTRACTOR_ExtractContext *ec, 175 enum EXTRACTOR_MetaType type, 176 time_t t) 177 { 178 char buf[32]; 179 struct tm tv; 180 181 if (((time_t) -1 == t) || (0 == t)) 182 return 0; 183 if (NULL == gmtime_r (&t, &tv)) 184 return 0; 185 if (0 == strftime (buf, sizeof (buf), "%Y-%m-%d %H:%M:%S", &tv)) 186 return 0; 187 return add_str (ec, type, buf); 188 } 189 190 191 /** 192 * Read the entire input into @a buf. 193 * 194 * @param ec extraction context 195 * @param[out] buf buffer to fill with the file contents 196 * @return 0 on success, -1 on error 197 */ 198 static int 199 read_all (struct EXTRACTOR_ExtractContext *ec, 200 std::vector<char> &buf) 201 { 202 uint64_t size; 203 204 size = ec->get_size (ec->cls); 205 if ((UINT64_MAX == size) || 206 (0 == size) || 207 (size > MAX_PDF_SIZE)) 208 return -1; 209 if (0 != ec->seek (ec->cls, 0, SEEK_SET)) 210 return -1; 211 buf.reserve ((size_t) size); 212 while (buf.size () < size) 213 { 214 void *data; 215 ssize_t got; 216 217 got = ec->read (ec->cls, 218 &data, 219 (size_t) (size - buf.size ())); 220 if ((got <= 0) || (NULL == data)) 221 break; 222 buf.insert (buf.end (), 223 static_cast<char *> (data), 224 static_cast<char *> (data) + got); 225 } 226 if (buf.empty ()) 227 return -1; 228 return 0; 229 } 230 231 232 /** 233 * Main entry method for the PDF extraction plugin. 234 * 235 * @param ec extraction context provided to the plugin 236 */ 237 extern "C" void 238 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) 239 { 240 void *hdr; 241 std::vector<char> buf; 242 poppler::document *doc; 243 int major; 244 int minor; 245 char ver[32]; 246 char pages[16]; 247 248 if (4 != ec->read (ec->cls, &hdr, 4)) 249 return; 250 if (0 != memcmp ("%PDF", hdr, 4)) 251 return; 252 if (0 != read_all (ec, buf)) 253 return; 254 255 poppler::set_debug_error_function (&quiet_error, NULL); 256 doc = poppler::document::load_from_raw_data (buf.data (), 257 (int) buf.size ()); 258 if (NULL == doc) 259 return; 260 /* An encrypted document we cannot open exposes no usable meta data. */ 261 if (doc->is_locked ()) 262 { 263 delete doc; 264 return; 265 } 266 267 if (0 != add_str (ec, 268 EXTRACTOR_METATYPE_MIMETYPE, 269 "application/pdf")) 270 goto CLEANUP; 271 for (unsigned int i = 0; NULL != tmap[i].get; i++) 272 if (0 != add_utf8 (ec, 273 tmap[i].type, 274 (doc->*tmap[i].get)().to_utf8 ())) 275 goto CLEANUP; 276 doc->get_pdf_version (&major, &minor); 277 snprintf (ver, sizeof (ver), "PDF %d.%d", major, minor); 278 if (0 != add_str (ec, EXTRACTOR_METATYPE_FORMAT, ver)) 279 goto CLEANUP; 280 snprintf (pages, sizeof (pages), "%d", doc->pages ()); 281 if (0 != add_str (ec, EXTRACTOR_METATYPE_PAGE_COUNT, pages)) 282 goto CLEANUP; 283 if (0 != add_date (ec, 284 EXTRACTOR_METATYPE_CREATION_DATE, 285 doc->get_creation_date_t ())) 286 goto CLEANUP; 287 if (0 != add_date (ec, 288 EXTRACTOR_METATYPE_MODIFICATION_DATE, 289 doc->get_modification_date_t ())) 290 goto CLEANUP; 291 CLEANUP: 292 delete doc; 293 } 294 295 296 /* end of pdf_extractor.cc */