libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 7b14c4528fa1b109240eba13678aa0e671951f26
parent 96894ba21cd0b10fb32b826e6e9c5a3107212d31
Author: Christian Grothoff <christian@grothoff.org>
Date:   Tue, 19 May 2026 21:58:57 +0200

modernize PDF plugin

Diffstat:
D src/plugins/old/pdf_extractor.cc | 235 -------------------------------------------------------------------------------
D src/plugins/pdf_extractor.c | 238 -------------------------------------------------------------------------------
A src/plugins/pdf_extractor.cc | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/test_pdf.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/testdata/pdf_extract.pdf | 26 ++++++++++++++++++++++++++
5 files changed, 469 insertions(+), 473 deletions(-)

diff --git a/src/plugins/old/pdf_extractor.cc b/src/plugins/old/pdf_extractor.cc @@ -1,235 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. - - This code was inspired by pdfinfo and depends heavily - on the xpdf code that pdfinfo is a part of. See also - the INFO file in this directory. 
- */ - -#include "platform.h" -#include "extractor.h" -#include "convert.h" -#include <math.h> - -#include <poppler/goo/gmem.h> -#include <poppler/Object.h> -#include <poppler/Stream.h> -#include <poppler/Array.h> -#include <poppler/Dict.h> -#include <poppler/XRef.h> -#include <poppler/Catalog.h> -#include <poppler/Page.h> -#include <poppler/PDFDoc.h> -#include <poppler/Error.h> -#include <poppler/GlobalParams.h> -#include <poppler/goo/GooString.h> - -#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0) - -static int -printInfoString(Dict *infoDict, - const char *key, - enum EXTRACTOR_MetaType type, - EXTRACTOR_MetaDataProcessor proc, - void *proc_cls) -{ - Object obj; - GooString *s1; - const char * s; - char *ckey = strdup (key); - int err = 0; - char * result; - - if (ckey == NULL) - return 0; - result = NULL; - if (infoDict->lookup(ckey, &obj)->isString()) { - s1 = obj.getString(); - s = s1->getCString(); - if ((((unsigned char)s[0]) & 0xff) == 0xfe && - (((unsigned char)s[1]) & 0xff) == 0xff) { - result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE"); - if (result != NULL) - ADD (result, type); - } else { - size_t len = strlen(s); - - while(0 < len) - { - /* - * Avoid outputting trailing spaces. - * - * The following expression might be rewritten as - * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]). - * There seem to exist isspace() implementations - * which do return non-zero from NBSP (maybe locale-dependent). - * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from - * the expression if it looks suspicious (locale issues for instance). - * - * Squeezing out all non-printable characters might also be useful. 
- */ - if ( (' ' != s[len - 1]) && (((char)0xA0) != s[len - 1]) && - ('\r' != s[len - 1]) && ('\n' != s[len - 1]) && - ('\t' != s[len - 1]) && ('\v' != s[len - 1]) && - ('\f' != s[len - 1]) ) - break; - else - len --; - } - - /* there should be a check to truncate preposterously long values. */ - - if (0 < len) { - result = EXTRACTOR_common_convert_to_utf8(s, len, - "ISO-8859-1"); - if (result != NULL) - ADD (result, type); - } - } - } - EXIT: - obj.free(); - if (result != NULL) - free (result); - free (ckey); - return err; -} - -static int -printInfoDate(Dict *infoDict, - const char *key, - enum EXTRACTOR_MetaType type, - EXTRACTOR_MetaDataProcessor proc, - void *proc_cls) -{ - Object obj; - const char *s; - GooString *s1; - char *gkey; - char * result; - int err; - - err = 0; - result = NULL; - gkey = strdup (key); - if (gkey == NULL) - return 0; - if (infoDict->lookup(gkey, &obj)->isString()) { - s1 = obj.getString(); - s = s1->getCString(); - - if ((s1->getChar(0) & 0xff) == 0xfe && - (s1->getChar(1) & 0xff) == 0xff) { - /* isUnicode */ - - result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE"); - if (result != NULL) - ADD (result, type); - } else { - if (s[0] == 'D' && s[1] == ':') - s += 2; - - ADD (s, type); - } - /* printf(fmt, s);*/ - } - EXIT: - obj.free(); - if (result != NULL) - free (result); - free (gkey); - return err; -} - -#define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0) - -#define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0) - -extern "C" { - - - int - EXTRACTOR_pdf_extract (const char *data, - size_t size, - EXTRACTOR_MetaDataProcessor proc, - void *proc_cls, - const char *options) - { - PDFDoc * doc; - Object info; - Object obj; - BaseStream * stream; - int err; - - if (globalParams == NULL) - { - globalParams = new GlobalParams(); - globalParams->setErrQuiet (gTrue); - } - 
obj.initNull(); - err = 0; - stream = new MemStream( (char*) data, 0, size, &obj); - doc = new PDFDoc(stream, NULL, NULL); - if (! doc->isOk()) { - delete doc; - return 0; - } - - ADD ("application/pdf", - EXTRACTOR_METATYPE_MIMETYPE); - if ( (NULL != doc->getDocInfo(&info)) && - (info.isDict()) ) { - PIS ("Title", EXTRACTOR_METATYPE_TITLE); - PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT); - PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS); - PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME); - /* - * we now believe that Adobe's Creator is not a person nor an - * organisation, but just a piece of software. - */ - PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE); - PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE); - { - char pcnt[20]; - sprintf(pcnt, "%d", doc->getNumPages()); - ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT); - } - { - char pcnt[64]; -#if HAVE_POPPLER_GETPDFMAJORVERSION - sprintf(pcnt, "PDF %d.%d", - doc->getPDFMajorVersion(), - doc->getPDFMinorVersion()); -#else - sprintf(pcnt, "PDF %.1f", - doc->getPDFVersion()); -#endif - ADD (pcnt, EXTRACTOR_METATYPE_FORMAT); - } - PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE); - PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE); - } - EXIT: - info.free(); - delete doc; - - return err; - } -} - diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c @@ -1,238 +0,0 @@ -/* - This file is part of libextractor. - Copyright (C) 2016 Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. - */ -/** - * @file plugins/pdf_extractor.c - * @brief plugin to support PDF files - * @author Christian Grothoff - * - * PDF libraries today are a nightmare (TM). So instead of doing the - * fast thing and calling some library functions to parse the PDF, - * we execute 'pdfinfo' and parse the output. Because that's 21st - * century plumbing: nobody writes reasonable code anymore. - */ -#include "platform.h" -#include <extractor.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <signal.h> -#include <unistd.h> - -/** - * Entry in the mapping from control data to LE types. - */ -struct Matches -{ - /** - * Key in the Pdfian control file. - */ - const char *text; - - /** - * Corresponding type in LE. - */ - enum EXTRACTOR_MetaType type; -}; - - -/** - * Map from pdf-control entries to LE types. - * - * See output of 'pdfinfo'. - */ -static struct Matches tmap[] = { - {"Title", EXTRACTOR_METATYPE_TITLE}, - {"Subject", EXTRACTOR_METATYPE_SUBJECT}, - {"Keywords", EXTRACTOR_METATYPE_KEYWORDS}, - {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME}, - {"Creator", EXTRACTOR_METATYPE_CREATOR}, - {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE}, - {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE}, - {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE}, - {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION}, - {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT}, - {NULL, 0} -}; - - -/** - * Process the "stdout" file from pdfinfo. - * - * @param fout stdout of pdfinfo - * @param proc function to call with meta data - * @param proc_cls closure for @e proc - */ -static void -process_stdout (FILE *fout, - EXTRACTOR_MetaDataProcessor proc, - void *proc_cls) -{ - unsigned int i; - char line[1025]; - const char *psuffix; - const char *colon; - - while (! 
feof (fout)) - { - if (NULL == fgets (line, sizeof (line) - 1, fout)) - break; - if (0 == strlen (line)) - continue; - if ('\n' == line[strlen (line) - 1]) - line[strlen (line) - 1] = '\0'; - colon = strchr (line, (int) ':'); - if (NULL == colon) - break; - psuffix = colon + 1; - while (isblank ((unsigned char) psuffix[0])) - psuffix++; - if (0 == strlen (psuffix)) - continue; - for (i = 0; NULL != tmap[i].text; i++) - { - if (0 != strncasecmp (line, - tmap[i].text, - colon - line)) - continue; - if (0 != proc (proc_cls, - "pdf", - tmap[i].type, - EXTRACTOR_METAFORMAT_UTF8, - "text/plain", - psuffix, - strlen (psuffix) + 1)) - return; - break; - } - } -} - - -/** - * Main entry method for the PDF extraction plugin. - * - * @param ec extraction context provided to the plugin - */ -void -EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) -{ - uint64_t fsize; - void *data; - pid_t pid; - int in[2]; - int out[2]; - FILE *fout; - uint64_t pos; - - fsize = ec->get_size (ec->cls); - if (fsize < 128) - return; - if (4 != - ec->read (ec->cls, &data, 4)) - return; - if (0 != strncmp ("%PDF", data, 4)) - return; - if (0 != - ec->seek (ec->cls, 0, SEEK_SET)) - return; - if (0 != pipe (in)) - return; - if (0 != pipe (out)) - { - close (in[0]); - close (in[1]); - return; - } - pid = fork (); - if (-1 == pid) - { - close (in[0]); - close (in[1]); - close (out[0]); - close (out[1]); - return; - } - if (0 == pid) - { - char *const args[] = { - "pdfinfo", - "-", - NULL - }; - /* am child, exec 'pdfinfo' */ - close (0); - close (1); - if ( (-1 == dup2 (in[0], 0)) || - (-1 == dup2 (out[1], 1)) ) - exit (1); - close (in[0]); - close (in[1]); - close (out[0]); - close (out[1]); - execvp ("pdfinfo", args); - exit (1); - } - /* am parent, send file */ - close (in[0]); - close (out[1]); - fout = fdopen (out[0], "r"); - if (NULL == fout) - { - close (in[1]); - close (out[0]); - kill (pid, SIGKILL); - waitpid (pid, NULL, 0); - return; - } - pos = 0; - while (pos < fsize) - { 
- ssize_t got; - size_t wpos; - - data = NULL; - got = ec->read (ec->cls, - &data, - fsize - pos); - if ( (-1 == got) || - (NULL == data) ) - break; - wpos = 0; - while (wpos < got) - { - ssize_t out; - - out = write (in[1], data + wpos, got - wpos); - if (out <= 0) - break; - wpos += out; - } - if (wpos < got) - break; - pos += got; - } - close (in[1]); - process_stdout (fout, ec->proc, ec->cls); - fclose (fout); - kill (pid, SIGKILL); - waitpid (pid, NULL, 0); -} - - -/* end of pdf_extractor.c */ diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc @@ -0,0 +1,304 @@ +/* + This file is part of libextractor. + Copyright (C) 2002, 2003, 2009, 2026 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. + */ +/** + * @file plugins/pdf_extractor.cc + * @brief plugin to support PDF files + * @author Vidyut Samanta + * @author Christian Grothoff + * + * This plugin uses the stable C++ binding of libpoppler + * (`poppler-cpp`). Earlier versions of this plugin linked + * against poppler's internal headers (`PDFDoc`, `GooString`, + * ...), which carry no API or ABI stability guarantees and + * broke with virtually every poppler release. The poppler-cpp + * interface is the supported public API and is what we use + * here. 
+ */ +#include "platform.h" +#include "extractor.h" +#include <poppler/cpp/poppler-document.h> +#include <poppler/cpp/poppler-global.h> +#include <string> +#include <vector> + + +/** + * Sanity bound on the size of a PDF we are willing to buffer + * in memory (1 GB). libpoppler needs the whole document, and + * its raw-data loader takes an `int` length. + */ +#define MAX_PDF_SIZE (1024LL * 1024LL * 1024LL) + + +/** + * Entry in the mapping from poppler accessors to LE types. + */ +struct Matches +{ + /** + * Accessor on the poppler document returning the value. + */ + poppler::ustring (poppler::document::*get) () const; + + /** + * Corresponding meta data type in LE. + */ + enum EXTRACTOR_MetaType type; +}; + + +/** + * Map from poppler document info accessors to LE types. + * + * Note that we deliberately map "Creator" to + * #EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE: we believe that + * Adobe's "Creator" is not a person nor an organisation, but + * just a piece of software. + */ +static const struct Matches tmap[] = { + { &poppler::document::get_title, EXTRACTOR_METATYPE_TITLE }, + { &poppler::document::get_subject, EXTRACTOR_METATYPE_SUBJECT }, + { &poppler::document::get_keywords, EXTRACTOR_METATYPE_KEYWORDS }, + { &poppler::document::get_author, EXTRACTOR_METATYPE_AUTHOR_NAME }, + { &poppler::document::get_creator, EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, + { &poppler::document::get_producer, EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, + { NULL, EXTRACTOR_METATYPE_RESERVED } +}; + + +/** + * Silence libpoppler: we do not want parsing diagnostics on + * stderr of the plugin child process. + * + * @param msg the message (ignored) + * @param cls closure (ignored) + */ +static void +quiet_error (const std::string &msg, + void *cls) +{ + (void) msg; + (void) cls; +} + + +/** + * Hand a UTF-8 string to the meta data processor, after + * stripping trailing whitespace. Empty values are skipped. 
+ * + * @param ec extraction context + * @param type meta data type to use + * @param val UTF-8 bytes (need not be 0-terminated) + * @return 0 to continue extracting, 1 if @a ec asked us to stop + */ +static int +add_utf8 (struct EXTRACTOR_ExtractContext *ec, + enum EXTRACTOR_MetaType type, + std::vector<char> val) +{ + size_t len = val.size (); + + /* + * Avoid outputting trailing whitespace. Note that ISO-8859 + * NBSP (0xA0) becomes 0xC2 0xA0 in UTF-8 and is intentionally + * not stripped here. + */ + while ((0 < len) && + ((' ' == val[len - 1]) || + ('\r' == val[len - 1]) || + ('\n' == val[len - 1]) || + ('\t' == val[len - 1]) || + ('\v' == val[len - 1]) || + ('\f' == val[len - 1]))) + len--; + if (0 == len) + return 0; + std::string s (val.data (), len); + if (0 != ec->proc (ec->cls, + "pdf", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + s.c_str (), + s.size () + 1)) + return 1; + return 0; +} + + +/** + * Hand a 0-terminated C string to the meta data processor. + * + * @param ec extraction context + * @param type meta data type to use + * @param s the string + * @return 0 to continue extracting, 1 if @a ec asked us to stop + */ +static int +add_str (struct EXTRACTOR_ExtractContext *ec, + enum EXTRACTOR_MetaType type, + const char *s) +{ + if ((NULL == s) || ('\0' == s[0])) + return 0; + if (0 != ec->proc (ec->cls, + "pdf", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + s, + strlen (s) + 1)) + return 1; + return 0; +} + + +/** + * Report a date (given as a `time_t`) in ISO-8601 / UTC. 
+ * + * @param ec extraction context + * @param type meta data type to use + * @param t the time, `(time_t) -1` or 0 if absent + * @return 0 to continue extracting, 1 if @a ec asked us to stop + */ +static int +add_date (struct EXTRACTOR_ExtractContext *ec, + enum EXTRACTOR_MetaType type, + time_t t) +{ + char buf[32]; + struct tm tv; + + if (((time_t) -1 == t) || (0 == t)) + return 0; + if (NULL == gmtime_r (&t, &tv)) + return 0; + if (0 == strftime (buf, sizeof (buf), "%Y-%m-%d %H:%M:%S", &tv)) + return 0; + return add_str (ec, type, buf); +} + + +/** + * Read the entire input into @a buf. + * + * @param ec extraction context + * @param[out] buf buffer to fill with the file contents + * @return 0 on success, -1 on error + */ +static int +read_all (struct EXTRACTOR_ExtractContext *ec, + std::vector<char> &buf) +{ + uint64_t size; + + size = ec->get_size (ec->cls); + if ((UINT64_MAX == size) || + (0 == size) || + (size > MAX_PDF_SIZE)) + return -1; + if (0 != ec->seek (ec->cls, 0, SEEK_SET)) + return -1; + buf.reserve ((size_t) size); + while (buf.size () < size) + { + void *data; + ssize_t got; + + got = ec->read (ec->cls, + &data, + (size_t) (size - buf.size ())); + if ((got <= 0) || (NULL == data)) + break; + buf.insert (buf.end (), + static_cast<char *> (data), + static_cast<char *> (data) + got); + } + if (buf.empty ()) + return -1; + return 0; +} + + +/** + * Main entry method for the PDF extraction plugin. 
+ * + * @param ec extraction context provided to the plugin + */ +extern "C" void +EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) +{ + void *hdr; + std::vector<char> buf; + poppler::document *doc; + int major; + int minor; + char ver[32]; + char pages[16]; + + if (4 != ec->read (ec->cls, &hdr, 4)) + return; + if (0 != memcmp ("%PDF", hdr, 4)) + return; + if (0 != read_all (ec, buf)) + return; + + poppler::set_debug_error_function (&quiet_error, NULL); + doc = poppler::document::load_from_raw_data (buf.data (), + (int) buf.size ()); + if (NULL == doc) + return; + /* An encrypted document we cannot open exposes no usable meta data. */ + if (doc->is_locked ()) + { + delete doc; + return; + } + + if (0 != add_str (ec, + EXTRACTOR_METATYPE_MIMETYPE, + "application/pdf")) + goto CLEANUP; + for (unsigned int i = 0; NULL != tmap[i].get; i++) + if (0 != add_utf8 (ec, + tmap[i].type, + (doc->*tmap[i].get)().to_utf8 ())) + goto CLEANUP; + doc->get_pdf_version (&major, &minor); + snprintf (ver, sizeof (ver), "PDF %d.%d", major, minor); + if (0 != add_str (ec, EXTRACTOR_METATYPE_FORMAT, ver)) + goto CLEANUP; + snprintf (pages, sizeof (pages), "%d", doc->pages ()); + if (0 != add_str (ec, EXTRACTOR_METATYPE_PAGE_COUNT, pages)) + goto CLEANUP; + if (0 != add_date (ec, + EXTRACTOR_METATYPE_CREATION_DATE, + doc->get_creation_date_t ())) + goto CLEANUP; + if (0 != add_date (ec, + EXTRACTOR_METATYPE_MODIFICATION_DATE, + doc->get_modification_date_t ())) + goto CLEANUP; +CLEANUP: + delete doc; +} + + +/* end of pdf_extractor.cc */ diff --git a/src/plugins/test_pdf.c b/src/plugins/test_pdf.c @@ -0,0 +1,139 @@ +/* + This file is part of libextractor. + Copyright (C) 2026 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. 
+ + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +/** + * @file plugins/test_pdf.c + * @brief testcase for pdf plugin + * @author Christian Grothoff + */ +#include "platform.h" +#include "test_lib.h" + + +/** + * Main function for the PDF testcase. + * + * @param argc number of arguments (ignored) + * @param argv arguments (ignored) + * @return 0 on success + */ +int +main (int argc, char *argv[]) +{ + struct SolutionData pdf_extract_sol[] = { + { + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "application/pdf", + strlen ("application/pdf") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "GNU libextractor PDF Test", + strlen ("GNU libextractor PDF Test") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_SUBJECT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Metadata extraction", + strlen ("Metadata extraction") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_KEYWORDS, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "PDF, libextractor, test", + strlen ("PDF, libextractor, test") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_AUTHOR_NAME, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Vidyut Samanta", + strlen ("Vidyut Samanta") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "WritePDF", + strlen ("WritePDF") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "libextractor testsuite", + strlen ("libextractor testsuite") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_FORMAT, + 
EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "PDF 1.4", + strlen ("PDF 1.4") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_PAGE_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "1", + strlen ("1") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2020-01-15 12:30:00", + strlen ("2020-01-15 12:30:00") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_MODIFICATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2020-01-16 08:00:00", + strlen ("2020-01-16 08:00:00") + 1, + 0 + }, + { 0, 0, NULL, NULL, 0, -1 } + }; + struct ProblemSet ps[] = { + { "testdata/pdf_extract.pdf", + pdf_extract_sol }, + { NULL, NULL } + }; + return ET_main ("pdf", ps); +} + + +/* end of test_pdf.c */ diff --git a/src/plugins/testdata/pdf_extract.pdf b/src/plugins/testdata/pdf_extract.pdf @@ -0,0 +1,26 @@ +%PDF-1.4 +%âãÏÓ +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >> +endobj +4 0 obj +<< /Title (GNU libextractor PDF Test) /Author (Vidyut Samanta) /Subject (Metadata extraction) /Keywords (PDF, libextractor, test) /Creator (WritePDF) /Producer (libextractor testsuite) /CreationDate (D:20200115123000Z) /ModDate (D:20200116080000Z) >> +endobj +xref +0 5 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000121 00000 n +0000000209 00000 n +trailer +<< /Size 5 /Root 1 0 R /Info 4 0 R >> +startxref +475 +%%EOF