commit 7b14c4528fa1b109240eba13678aa0e671951f26
parent 96894ba21cd0b10fb32b826e6e9c5a3107212d31
Author: Christian Grothoff <christian@grothoff.org>
Date: Tue, 19 May 2026 21:58:57 +0200
modernize PDF plugin
Diffstat:
5 files changed, 469 insertions(+), 473 deletions(-)
diff --git a/src/plugins/old/pdf_extractor.cc b/src/plugins/old/pdf_extractor.cc
@@ -1,235 +0,0 @@
-/*
- This file is part of libextractor.
- (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- Boston, MA 02110-1301, USA.
-
- This code was inspired by pdfinfo and depends heavily
- on the xpdf code that pdfinfo is a part of. See also
- the INFO file in this directory.
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-#include <math.h>
-
-#include <poppler/goo/gmem.h>
-#include <poppler/Object.h>
-#include <poppler/Stream.h>
-#include <poppler/Array.h>
-#include <poppler/Dict.h>
-#include <poppler/XRef.h>
-#include <poppler/Catalog.h>
-#include <poppler/Page.h>
-#include <poppler/PDFDoc.h>
-#include <poppler/Error.h>
-#include <poppler/GlobalParams.h>
-#include <poppler/goo/GooString.h>
-
-#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
-
-static int
-printInfoString(Dict *infoDict,
- const char *key,
- enum EXTRACTOR_MetaType type,
- EXTRACTOR_MetaDataProcessor proc,
- void *proc_cls)
-{
- Object obj;
- GooString *s1;
- const char * s;
- char *ckey = strdup (key);
- int err = 0;
- char * result;
-
- if (ckey == NULL)
- return 0;
- result = NULL;
- if (infoDict->lookup(ckey, &obj)->isString()) {
- s1 = obj.getString();
- s = s1->getCString();
- if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
- (((unsigned char)s[1]) & 0xff) == 0xff) {
- result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE");
- if (result != NULL)
- ADD (result, type);
- } else {
- size_t len = strlen(s);
-
- while(0 < len)
- {
- /*
- * Avoid outputting trailing spaces.
- *
- * The following expression might be rewritten as
- * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]).
- * There seem to exist isspace() implementations
- * which do return non-zero from NBSP (maybe locale-dependent).
- * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from
- * the expression if it looks suspicious (locale issues for instance).
- *
- * Squeezing out all non-printable characters might also be useful.
- */
- if ( (' ' != s[len - 1]) && (((char)0xA0) != s[len - 1]) &&
- ('\r' != s[len - 1]) && ('\n' != s[len - 1]) &&
- ('\t' != s[len - 1]) && ('\v' != s[len - 1]) &&
- ('\f' != s[len - 1]) )
- break;
- else
- len --;
- }
-
- /* there should be a check to truncate preposterously long values. */
-
- if (0 < len) {
- result = EXTRACTOR_common_convert_to_utf8(s, len,
- "ISO-8859-1");
- if (result != NULL)
- ADD (result, type);
- }
- }
- }
- EXIT:
- obj.free();
- if (result != NULL)
- free (result);
- free (ckey);
- return err;
-}
-
-static int
-printInfoDate(Dict *infoDict,
- const char *key,
- enum EXTRACTOR_MetaType type,
- EXTRACTOR_MetaDataProcessor proc,
- void *proc_cls)
-{
- Object obj;
- const char *s;
- GooString *s1;
- char *gkey;
- char * result;
- int err;
-
- err = 0;
- result = NULL;
- gkey = strdup (key);
- if (gkey == NULL)
- return 0;
- if (infoDict->lookup(gkey, &obj)->isString()) {
- s1 = obj.getString();
- s = s1->getCString();
-
- if ((s1->getChar(0) & 0xff) == 0xfe &&
- (s1->getChar(1) & 0xff) == 0xff) {
- /* isUnicode */
-
- result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
- if (result != NULL)
- ADD (result, type);
- } else {
- if (s[0] == 'D' && s[1] == ':')
- s += 2;
-
- ADD (s, type);
- }
- /* printf(fmt, s);*/
- }
- EXIT:
- obj.free();
- if (result != NULL)
- free (result);
- free (gkey);
- return err;
-}
-
-#define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
-
-#define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
-
-extern "C" {
-
-
- int
- EXTRACTOR_pdf_extract (const char *data,
- size_t size,
- EXTRACTOR_MetaDataProcessor proc,
- void *proc_cls,
- const char *options)
- {
- PDFDoc * doc;
- Object info;
- Object obj;
- BaseStream * stream;
- int err;
-
- if (globalParams == NULL)
- {
- globalParams = new GlobalParams();
- globalParams->setErrQuiet (gTrue);
- }
- obj.initNull();
- err = 0;
- stream = new MemStream( (char*) data, 0, size, &obj);
- doc = new PDFDoc(stream, NULL, NULL);
- if (! doc->isOk()) {
- delete doc;
- return 0;
- }
-
- ADD ("application/pdf",
- EXTRACTOR_METATYPE_MIMETYPE);
- if ( (NULL != doc->getDocInfo(&info)) &&
- (info.isDict()) ) {
- PIS ("Title", EXTRACTOR_METATYPE_TITLE);
- PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT);
- PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS);
- PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME);
- /*
- * we now believe that Adobe's Creator is not a person nor an
- * organisation, but just a piece of software.
- */
- PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE);
- PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE);
- {
- char pcnt[20];
- sprintf(pcnt, "%d", doc->getNumPages());
- ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT);
- }
- {
- char pcnt[64];
-#if HAVE_POPPLER_GETPDFMAJORVERSION
- sprintf(pcnt, "PDF %d.%d",
- doc->getPDFMajorVersion(),
- doc->getPDFMinorVersion());
-#else
- sprintf(pcnt, "PDF %.1f",
- doc->getPDFVersion());
-#endif
- ADD (pcnt, EXTRACTOR_METATYPE_FORMAT);
- }
- PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE);
- PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE);
- }
- EXIT:
- info.free();
- delete doc;
-
- return err;
- }
-}
-
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c
@@ -1,238 +0,0 @@
-/*
- This file is part of libextractor.
- Copyright (C) 2016 Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- Boston, MA 02110-1301, USA.
- */
-/**
- * @file plugins/pdf_extractor.c
- * @brief plugin to support PDF files
- * @author Christian Grothoff
- *
- * PDF libraries today are a nightmare (TM). So instead of doing the
- * fast thing and calling some library functions to parse the PDF,
- * we execute 'pdfinfo' and parse the output. Because that's 21st
- * century plumbing: nobody writes reasonable code anymore.
- */
-#include "platform.h"
-#include <extractor.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <signal.h>
-#include <unistd.h>
-
-/**
- * Entry in the mapping from control data to LE types.
- */
-struct Matches
-{
- /**
- * Key in the Pdfian control file.
- */
- const char *text;
-
- /**
- * Corresponding type in LE.
- */
- enum EXTRACTOR_MetaType type;
-};
-
-
-/**
- * Map from pdf-control entries to LE types.
- *
- * See output of 'pdfinfo'.
- */
-static struct Matches tmap[] = {
- {"Title", EXTRACTOR_METATYPE_TITLE},
- {"Subject", EXTRACTOR_METATYPE_SUBJECT},
- {"Keywords", EXTRACTOR_METATYPE_KEYWORDS},
- {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME},
- {"Creator", EXTRACTOR_METATYPE_CREATOR},
- {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
- {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
- {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE},
- {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION},
- {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT},
- {NULL, 0}
-};
-
-
-/**
- * Process the "stdout" file from pdfinfo.
- *
- * @param fout stdout of pdfinfo
- * @param proc function to call with meta data
- * @param proc_cls closure for @e proc
- */
-static void
-process_stdout (FILE *fout,
- EXTRACTOR_MetaDataProcessor proc,
- void *proc_cls)
-{
- unsigned int i;
- char line[1025];
- const char *psuffix;
- const char *colon;
-
- while (! feof (fout))
- {
- if (NULL == fgets (line, sizeof (line) - 1, fout))
- break;
- if (0 == strlen (line))
- continue;
- if ('\n' == line[strlen (line) - 1])
- line[strlen (line) - 1] = '\0';
- colon = strchr (line, (int) ':');
- if (NULL == colon)
- break;
- psuffix = colon + 1;
- while (isblank ((unsigned char) psuffix[0]))
- psuffix++;
- if (0 == strlen (psuffix))
- continue;
- for (i = 0; NULL != tmap[i].text; i++)
- {
- if (0 != strncasecmp (line,
- tmap[i].text,
- colon - line))
- continue;
- if (0 != proc (proc_cls,
- "pdf",
- tmap[i].type,
- EXTRACTOR_METAFORMAT_UTF8,
- "text/plain",
- psuffix,
- strlen (psuffix) + 1))
- return;
- break;
- }
- }
-}
-
-
-/**
- * Main entry method for the PDF extraction plugin.
- *
- * @param ec extraction context provided to the plugin
- */
-void
-EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
-{
- uint64_t fsize;
- void *data;
- pid_t pid;
- int in[2];
- int out[2];
- FILE *fout;
- uint64_t pos;
-
- fsize = ec->get_size (ec->cls);
- if (fsize < 128)
- return;
- if (4 !=
- ec->read (ec->cls, &data, 4))
- return;
- if (0 != strncmp ("%PDF", data, 4))
- return;
- if (0 !=
- ec->seek (ec->cls, 0, SEEK_SET))
- return;
- if (0 != pipe (in))
- return;
- if (0 != pipe (out))
- {
- close (in[0]);
- close (in[1]);
- return;
- }
- pid = fork ();
- if (-1 == pid)
- {
- close (in[0]);
- close (in[1]);
- close (out[0]);
- close (out[1]);
- return;
- }
- if (0 == pid)
- {
- char *const args[] = {
- "pdfinfo",
- "-",
- NULL
- };
- /* am child, exec 'pdfinfo' */
- close (0);
- close (1);
- if ( (-1 == dup2 (in[0], 0)) ||
- (-1 == dup2 (out[1], 1)) )
- exit (1);
- close (in[0]);
- close (in[1]);
- close (out[0]);
- close (out[1]);
- execvp ("pdfinfo", args);
- exit (1);
- }
- /* am parent, send file */
- close (in[0]);
- close (out[1]);
- fout = fdopen (out[0], "r");
- if (NULL == fout)
- {
- close (in[1]);
- close (out[0]);
- kill (pid, SIGKILL);
- waitpid (pid, NULL, 0);
- return;
- }
- pos = 0;
- while (pos < fsize)
- {
- ssize_t got;
- size_t wpos;
-
- data = NULL;
- got = ec->read (ec->cls,
- &data,
- fsize - pos);
- if ( (-1 == got) ||
- (NULL == data) )
- break;
- wpos = 0;
- while (wpos < got)
- {
- ssize_t out;
-
- out = write (in[1], data + wpos, got - wpos);
- if (out <= 0)
- break;
- wpos += out;
- }
- if (wpos < got)
- break;
- pos += got;
- }
- close (in[1]);
- process_stdout (fout, ec->proc, ec->cls);
- fclose (fout);
- kill (pid, SIGKILL);
- waitpid (pid, NULL, 0);
-}
-
-
-/* end of pdf_extractor.c */
diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc
@@ -0,0 +1,304 @@
+/*
+ This file is part of libextractor.
+ Copyright (C) 2002, 2003, 2009, 2026 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+ */
+/**
+ * @file plugins/pdf_extractor.cc
+ * @brief plugin to support PDF files
+ * @author Vidyut Samanta
+ * @author Christian Grothoff
+ *
+ * This plugin uses the stable C++ binding of libpoppler
+ * (`poppler-cpp`). Earlier versions of this plugin linked
+ * against poppler's internal headers (`PDFDoc`, `GooString`,
+ * ...), which carry no API or ABI stability guarantees and
+ * broke with virtually every poppler release. The poppler-cpp
+ * interface is the supported public API and is what we use
+ * here.
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <poppler/cpp/poppler-document.h>
+#include <poppler/cpp/poppler-global.h>
+#include <string>
+#include <vector>
+
+
+/**
+ * Upper bound (1 GiB) on the size of a PDF we are willing to buffer
+ * in memory. libpoppler needs the whole document at once, and the
+ * length is later cast to `int`; 2^30 fits into a 32-bit `int`.
+ */
+#define MAX_PDF_SIZE (1024LL * 1024LL * 1024LL)
+
+
+/**
+ * Row of the table mapping poppler accessors to LE meta data types.
+ */
+struct Matches
+{
+ /**
+ * Const member function of `poppler::document` returning the value.
+ */
+ poppler::ustring (poppler::document::*get) () const;
+
+ /**
+ * LE meta data type under which the value is reported.
+ */
+ enum EXTRACTOR_MetaType type;
+};
+
+
+/**
+ * Table of the textual document-info fields we report, terminated
+ * by an entry whose accessor is NULL.
+ *
+ * "Creator" is deliberately mapped to
+ * #EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE: we believe Adobe's "Creator"
+ * names a piece of software, not a person or an organisation.
+ */
+static const struct Matches tmap[] = {
+ { &poppler::document::get_title, EXTRACTOR_METATYPE_TITLE },
+ { &poppler::document::get_subject, EXTRACTOR_METATYPE_SUBJECT },
+ { &poppler::document::get_keywords, EXTRACTOR_METATYPE_KEYWORDS },
+ { &poppler::document::get_author, EXTRACTOR_METATYPE_AUTHOR_NAME },
+ { &poppler::document::get_creator, EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+ { &poppler::document::get_producer, EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
+ { NULL, EXTRACTOR_METATYPE_RESERVED }
+};
+
+
+/**
+ * Error callback for libpoppler that discards every message, so
+ * that its parsing diagnostics do not end up on our stderr.
+ *
+ * @param msg the diagnostic text (unused)
+ * @param cls closure (unused)
+ */
+static void
+quiet_error (const std::string &msg, void *cls)
+{
+  /* Intentionally empty apart from marking the arguments as unused. */
+  static_cast<void> (msg);
+  static_cast<void> (cls);
+}
+
+
+/**
+ * Pass a UTF-8 value to the meta data processor after dropping
+ * trailing ASCII whitespace. Values that end up empty are skipped.
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param val UTF-8 bytes (need not be 0-terminated)
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_utf8 (struct EXTRACTOR_ExtractContext *ec,
+          enum EXTRACTOR_MetaType type,
+          std::vector<char> val)
+{
+  size_t keep;
+  bool trailing_ws;
+
+  /*
+   * Trim ASCII whitespace only; a trailing UTF-8 NBSP (0xC2 0xA0) is kept.
+   */
+  for (keep = val.size (); 0 < keep; keep--)
+  {
+    switch (val[keep - 1])
+    {
+    case ' ': case '\r': case '\n': case '\t': case '\v': case '\f':
+      trailing_ws = true;
+      break;
+    default:
+      trailing_ws = false;
+      break;
+    }
+    if (! trailing_ws)
+      break;
+  }
+  if (0 == keep)
+    return 0;
+  /* We own our copy of the bytes: cut it and 0-terminate it in place. */
+  val.resize (keep);
+  val.push_back ('\0');
+  return (0 != ec->proc (ec->cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8,
+                         "text/plain", val.data (), val.size ())) ? 1 : 0;
+}
+
+
+/**
+ * Pass a 0-terminated C string to the meta data processor.
+ * NULL and empty strings are silently skipped.
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param s the string, may be NULL
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_str (struct EXTRACTOR_ExtractContext *ec,
+         enum EXTRACTOR_MetaType type,
+         const char *s)
+{
+  size_t slen;
+
+  if (NULL == s)
+    return 0;
+  slen = strlen (s);
+  if (0 == slen)
+    return 0;
+  /* The length reported to the processor includes the 0-terminator. */
+  return (0 == ec->proc (ec->cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8,
+                         "text/plain", s, slen + 1)) ? 0 : 1;
+}
+
+
+/**
+ * Report a `time_t` in UTC, formatted as "YYYY-MM-DD HH:MM:SS".
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param t the time; `(time_t) -1` and 0 are treated as absent
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_date (struct EXTRACTOR_ExtractContext *ec,
+          enum EXTRACTOR_MetaType type,
+          time_t t)
+{
+  struct tm utc;
+  char stamp[32];
+  const bool absent = ((time_t) -1 == t) || (0 == t);
+
+  /* Nothing is reported when the time is absent or cannot be formatted. */
+  if (absent ||
+      (NULL == gmtime_r (&t, &utc)) ||
+      (0 == strftime (stamp, sizeof (stamp), "%Y-%m-%d %H:%M:%S", &utc)))
+    return 0;
+  return add_str (ec, type, stamp);
+}
+
+
+/**
+ * Read the entire input into @a buf.
+ *
+ * @param ec extraction context
+ * @param[out] buf buffer to fill with the file contents
+ * @return 0 if the complete input was read, -1 on error or short read
+ */
+static int
+read_all (struct EXTRACTOR_ExtractContext *ec,
+          std::vector<char> &buf)
+{
+  const uint64_t size = ec->get_size (ec->cls);
+
+  if ((UINT64_MAX == size) ||
+      (0 == size) ||
+      (size > (uint64_t) MAX_PDF_SIZE))
+    return -1;
+  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
+    return -1;
+  buf.reserve ((size_t) size);
+  while (buf.size () < size)
+  {
+    void *data = NULL; /* so the NULL check below is well-defined */
+    ssize_t got;
+
+    got = ec->read (ec->cls,
+                    &data,
+                    (size_t) (size - buf.size ()));
+    if ((got <= 0) || (NULL == data))
+      break;
+    buf.insert (buf.end (),
+                static_cast<char *> (data),
+                static_cast<char *> (data) + got);
+  }
+  /* Never hand a truncated document to libpoppler. */
+  if (buf.size () != size)
+    return -1;
+  return 0;
+}
+
+
+/**
+ * Main entry method for the PDF extraction plugin.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+extern "C" void
+EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  void *magic;
+  std::vector<char> raw;
+  poppler::document *doc;
+  int stop;
+
+  /* Only inputs starting with the "%PDF" signature are buffered. */
+  if ((4 != ec->read (ec->cls, &magic, 4)) ||
+      (0 != memcmp ("%PDF", magic, 4)))
+    return;
+  if (0 != read_all (ec, raw))
+    return;
+
+  poppler::set_debug_error_function (&quiet_error, NULL);
+  doc = poppler::document::load_from_raw_data (raw.data (),
+                                               (int) raw.size ());
+  if (NULL == doc)
+    return;
+  /* An encrypted document we cannot open exposes no usable meta data. */
+  if (doc->is_locked ())
+  {
+    delete doc;
+    return;
+  }
+
+  /* Each step runs only while the processor has not asked us to stop. */
+  stop = add_str (ec, EXTRACTOR_METATYPE_MIMETYPE, "application/pdf");
+  for (unsigned int i = 0; (0 == stop) && (NULL != tmap[i].get); i++)
+    stop = add_utf8 (ec, tmap[i].type,
+                     (doc->*tmap[i].get)().to_utf8 ());
+  if (0 == stop)
+  {
+    int major;
+    int minor;
+    char version[32];
+
+    doc->get_pdf_version (&major, &minor);
+    snprintf (version, sizeof (version), "PDF %d.%d", major, minor);
+    stop = add_str (ec, EXTRACTOR_METATYPE_FORMAT, version);
+  }
+  if (0 == stop)
+  {
+    char count[16];
+
+    snprintf (count, sizeof (count), "%d", doc->pages ());
+    stop = add_str (ec, EXTRACTOR_METATYPE_PAGE_COUNT, count);
+  }
+  if (0 == stop)
+    stop = add_date (ec, EXTRACTOR_METATYPE_CREATION_DATE,
+                     doc->get_creation_date_t ());
+  if (0 == stop)
+    (void) add_date (ec, EXTRACTOR_METATYPE_MODIFICATION_DATE,
+                     doc->get_modification_date_t ());
+  delete doc;
+}
+
+
+/* end of pdf_extractor.cc */
diff --git a/src/plugins/test_pdf.c b/src/plugins/test_pdf.c
@@ -0,0 +1,139 @@
+/*
+ This file is part of libextractor.
+ Copyright (C) 2026 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+*/
+/**
+ * @file plugins/test_pdf.c
+ * @brief testcase for pdf plugin
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "test_lib.h"
+
+
+/**
+ * Entry point of the PDF testcase.
+ *
+ * @param argc number of arguments (ignored)
+ * @param argv arguments (ignored)
+ * @return 0 on success
+ */
+int
+main (int argc, char *argv[])
+{
+  /* Expected meta data for testdata/pdf_extract.pdf. */
+  struct SolutionData expected[] = {
+    {
+      EXTRACTOR_METATYPE_MIMETYPE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "application/pdf",
+      sizeof ("application/pdf"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_TITLE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "GNU libextractor PDF Test",
+      sizeof ("GNU libextractor PDF Test"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_SUBJECT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "Metadata extraction",
+      sizeof ("Metadata extraction"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_KEYWORDS,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "PDF, libextractor, test",
+      sizeof ("PDF, libextractor, test"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_AUTHOR_NAME,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "Vidyut Samanta",
+      sizeof ("Vidyut Samanta"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "WritePDF",
+      sizeof ("WritePDF"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "libextractor testsuite",
+      sizeof ("libextractor testsuite"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_FORMAT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "PDF 1.4",
+      sizeof ("PDF 1.4"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_PAGE_COUNT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "1",
+      sizeof ("1"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_CREATION_DATE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "2020-01-15 12:30:00",
+      sizeof ("2020-01-15 12:30:00"),
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_MODIFICATION_DATE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "2020-01-16 08:00:00",
+      sizeof ("2020-01-16 08:00:00"),
+      0
+    },
+    { 0, 0, NULL, NULL, 0, -1 }
+  };
+  struct ProblemSet problems[] = {
+    { "testdata/pdf_extract.pdf", expected },
+    { NULL, NULL }
+  };
+  return ET_main ("pdf", problems);
+}
+
+
+/* end of test_pdf.c */
diff --git a/src/plugins/testdata/pdf_extract.pdf b/src/plugins/testdata/pdf_extract.pdf
@@ -0,0 +1,26 @@
+%PDF-1.4
+%âãÏÓ
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>
+endobj
+4 0 obj
+<< /Title (GNU libextractor PDF Test) /Author (Vidyut Samanta) /Subject (Metadata extraction) /Keywords (PDF, libextractor, test) /Creator (WritePDF) /Producer (libextractor testsuite) /CreationDate (D:20200115123000Z) /ModDate (D:20200116080000Z) >>
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000015 00000 n
+0000000064 00000 n
+0000000121 00000 n
+0000000209 00000 n
+trailer
+<< /Size 5 /Root 1 0 R /Info 4 0 R >>
+startxref
+475
+%%EOF