modernize PDF plugin - libextractor

commit 7b14c4528fa1b109240eba13678aa0e671951f26
parent 96894ba21cd0b10fb32b826e6e9c5a3107212d31
Author: Christian Grothoff <christian@grothoff.org>
Date:   Tue, 19 May 2026 21:58:57 +0200

modernize PDF plugin

Diffstat:
D src/plugins/old/pdf_extractor.cc  | 235 -------------------------------------------------------------------------------
D src/plugins/pdf_extractor.c  | 238 -------------------------------------------------------------------------------
A src/plugins/pdf_extractor.cc  | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/test_pdf.c  | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/testdata/pdf_extract.pdf  | 26 ++++++++++++++++++++++++++

5 files changed, 469 insertions(+), 473 deletions(-)
diff --git a/src/plugins/old/pdf_extractor.cc b/src/plugins/old/pdf_extractor.cc
@@ -1,235 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-     Boston, MA 02110-1301, USA.
-
-     This code was inspired by pdfinfo and depends heavily
-     on the xpdf code that pdfinfo is a part of. See also
-     the INFO file in this directory.
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-#include <math.h>
-
-#include <poppler/goo/gmem.h>
-#include <poppler/Object.h>
-#include <poppler/Stream.h>
-#include <poppler/Array.h>
-#include <poppler/Dict.h>
-#include <poppler/XRef.h>
-#include <poppler/Catalog.h>
-#include <poppler/Page.h>
-#include <poppler/PDFDoc.h>
-#include <poppler/Error.h>
-#include <poppler/GlobalParams.h>
-#include <poppler/goo/GooString.h>
-
-#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
-
-static int 
-printInfoString(Dict *infoDict,
-		const char *key,
-		enum EXTRACTOR_MetaType type,
-		EXTRACTOR_MetaDataProcessor proc,
-		void *proc_cls)
-{
-  Object obj;
-  GooString *s1;
-  const char * s;
-  char *ckey = strdup (key);
-  int err = 0;
-  char * result;
-      
-  if (ckey == NULL)
-    return 0;
-  result = NULL;
-  if (infoDict->lookup(ckey, &obj)->isString()) {
-    s1 = obj.getString();
-    s = s1->getCString();
-    if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
-	(((unsigned char)s[1]) & 0xff) == 0xff) {
-      result = EXTRACTOR_common_convert_to_utf8(&s[2], s1->getLength() - 2, "UTF-16BE");
-      if (result != NULL)
-	ADD (result, type);
-    } else {
-      size_t len = strlen(s);
-      
-      while(0 < len) 
-	{
-	  /*
-	   * Avoid outputting trailing spaces.
-	   *
-	   * The following expression might be rewritten as
-	   * (! isspace(s[len - 1]) && 0xA0 != s[len - 1]).
-	   * There seem to exist isspace() implementations
-	   * which do return non-zero from NBSP (maybe locale-dependent).
-	   * Remove ISO-8859 non-breaking space (NBSP, hex value 0xA0) from
-	   * the expression if it looks suspicious (locale issues for instance).
-	   *
-	   * Squeezing out all non-printable characters might also be useful.
-	   */
-  	  if ( (' '  != s[len - 1]) && (((char)0xA0) != s[len - 1]) &&
-               ('\r' != s[len - 1]) && ('\n' != s[len - 1]) &&
-               ('\t' != s[len - 1]) && ('\v' != s[len - 1]) &&
-               ('\f' != s[len - 1]) )
-	    break;	  
-          else
-            len --;
-        }
-
-        /* there should be a check to truncate preposterously long values. */
-      
-      if (0 < len) {
-	result = EXTRACTOR_common_convert_to_utf8(s, len,
-						  "ISO-8859-1");
-	if (result != NULL)
-	  ADD (result, type);
-      }
-    }
-  }
- EXIT:
-  obj.free();
-  if (result != NULL)
-    free (result);
-  free (ckey);
-  return err;
-}
-
-static int 
-printInfoDate(Dict *infoDict,
-	      const char *key,
-	      enum EXTRACTOR_MetaType type,
-	      EXTRACTOR_MetaDataProcessor proc,
-	      void *proc_cls)
-{
-  Object obj;
-  const char *s;
-  GooString *s1;  
-  char *gkey;
-  char * result;
-  int err;
-  
-  err = 0;
-  result = NULL;
-  gkey = strdup (key);
-  if (gkey == NULL)
-    return 0;
-  if (infoDict->lookup(gkey, &obj)->isString()) {
-    s1 = obj.getString();
-    s = s1->getCString();
-    
-    if ((s1->getChar(0) & 0xff) == 0xfe &&
-	(s1->getChar(1) & 0xff) == 0xff) {
-      /* isUnicode */
-      
-      result = EXTRACTOR_common_convert_to_utf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
-      if (result != NULL)
-	ADD (result, type);
-    } else {
-      if (s[0] == 'D' && s[1] == ':') 
-	s += 2;
-      
-      ADD (s, type);
-    }
-    /* printf(fmt, s);*/
-  }
- EXIT:
-  obj.free();
-  if (result != NULL)
-    free (result);
-  free (gkey);
-  return err;
-}
-
-#define PIS(s,t) do { if (0 != (err = printInfoString (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
-
-#define PID(s,t) do { if (0 != (err = printInfoDate (info.getDict(), s, t, proc, proc_cls))) goto EXIT; } while (0)
-
-extern "C" {
- 
-
-  int 
-  EXTRACTOR_pdf_extract (const char *data,
-			 size_t size,
-			 EXTRACTOR_MetaDataProcessor proc,
-			 void *proc_cls,
-			 const char *options)
-  {
-    PDFDoc * doc;
-    Object info;
-    Object obj;
-    BaseStream * stream;
-    int err;
-
-    if (globalParams == NULL)
-      {
-	globalParams = new GlobalParams();
-	globalParams->setErrQuiet (gTrue);
-      }
-    obj.initNull();
-    err = 0;
-    stream = new MemStream( (char*) data, 0, size, &obj);
-    doc = new PDFDoc(stream, NULL, NULL);
-    if (! doc->isOk()) {
-      delete doc;
-      return 0;
-    }
-
-    ADD ("application/pdf",
-	 EXTRACTOR_METATYPE_MIMETYPE);
-    if ( (NULL != doc->getDocInfo(&info)) &&
-	 (info.isDict()) ) {
-      PIS ("Title", EXTRACTOR_METATYPE_TITLE);
-      PIS ("Subject", EXTRACTOR_METATYPE_SUBJECT);
-      PIS ("Keywords", EXTRACTOR_METATYPE_KEYWORDS);
-      PIS ("Author", EXTRACTOR_METATYPE_AUTHOR_NAME);
-      /*
-       * we now believe that Adobe's Creator is not a person nor an
-       * organisation, but just a piece of software.
-       */
-      PIS ("Creator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE);
-      PIS ("Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE);
-      {
-	char pcnt[20];
-	sprintf(pcnt, "%d", doc->getNumPages());
-	ADD (pcnt, EXTRACTOR_METATYPE_PAGE_COUNT);
-      }
-      {
-	char pcnt[64];
-#if HAVE_POPPLER_GETPDFMAJORVERSION
-	sprintf(pcnt, "PDF %d.%d", 
-		doc->getPDFMajorVersion(),
-		doc->getPDFMinorVersion());
-#else
-	sprintf(pcnt, "PDF %.1f", 
-		doc->getPDFVersion());
-#endif
-	ADD (pcnt, EXTRACTOR_METATYPE_FORMAT);
-      }
-      PID ("CreationDate", EXTRACTOR_METATYPE_CREATION_DATE);
-      PID ("ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE);
-    }
-  EXIT:
-    info.free();
-    delete doc;
-
-    return err;
-  }
-}
-
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c
@@ -1,238 +0,0 @@
-/*
-     This file is part of libextractor.
-     Copyright (C) 2016 Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 3, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-     Boston, MA 02110-1301, USA.
- */
-/**
- * @file plugins/pdf_extractor.c
- * @brief plugin to support PDF files
- * @author Christian Grothoff
- *
- * PDF libraries today are a nightmare (TM).  So instead of doing the
- * fast thing and calling some library functions to parse the PDF,
- * we execute 'pdfinfo' and parse the output. Because that's 21st
- * century plumbing: nobody writes reasonable code anymore.
- */
-#include "platform.h"
-#include <extractor.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <signal.h>
-#include <unistd.h>
-
-/**
- * Entry in the mapping from control data to LE types.
- */
-struct Matches
-{
-  /**
-   * Key in the Pdfian control file.
-   */
-  const char *text;
-
-  /**
-   * Corresponding type in LE.
-   */
-  enum EXTRACTOR_MetaType type;
-};
-
-
-/**
- * Map from pdf-control entries to LE types.
- *
- * See output of 'pdfinfo'.
- */
-static struct Matches tmap[] = {
-  {"Title",        EXTRACTOR_METATYPE_TITLE},
-  {"Subject",      EXTRACTOR_METATYPE_SUBJECT},
-  {"Keywords",     EXTRACTOR_METATYPE_KEYWORDS},
-  {"Author",       EXTRACTOR_METATYPE_AUTHOR_NAME},
-  {"Creator",      EXTRACTOR_METATYPE_CREATOR},
-  {"Producer",     EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
-  {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
-  {"ModDate",      EXTRACTOR_METATYPE_MODIFICATION_DATE},
-  {"PDF version",  EXTRACTOR_METATYPE_ENCODER_VERSION},
-  {"Pages",        EXTRACTOR_METATYPE_PAGE_COUNT},
-  {NULL, 0}
-};
-
-
-/**
- * Process the "stdout" file from pdfinfo.
- *
- * @param fout stdout of pdfinfo
- * @param proc function to call with meta data
- * @param proc_cls closure for @e proc
- */
-static void
-process_stdout (FILE *fout,
-                EXTRACTOR_MetaDataProcessor proc,
-                void *proc_cls)
-{
-  unsigned int i;
-  char line[1025];
-  const char *psuffix;
-  const char *colon;
-
-  while (! feof (fout))
-  {
-    if (NULL == fgets (line, sizeof (line) - 1, fout))
-      break;
-    if (0 == strlen (line))
-      continue;
-    if ('\n' == line[strlen (line) - 1])
-      line[strlen (line) - 1] = '\0';
-    colon = strchr (line, (int) ':');
-    if (NULL == colon)
-      break;
-    psuffix = colon + 1;
-    while (isblank ((unsigned char) psuffix[0]))
-      psuffix++;
-    if (0 == strlen (psuffix))
-      continue;
-    for (i = 0; NULL != tmap[i].text; i++)
-    {
-      if (0 != strncasecmp (line,
-                            tmap[i].text,
-                            colon - line))
-        continue;
-      if (0 != proc (proc_cls,
-                     "pdf",
-                     tmap[i].type,
-                     EXTRACTOR_METAFORMAT_UTF8,
-                     "text/plain",
-                     psuffix,
-                     strlen (psuffix) + 1))
-        return;
-      break;
-    }
-  }
-}
-
-
-/**
- * Main entry method for the PDF extraction plugin.
- *
- * @param ec extraction context provided to the plugin
- */
-void
-EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
-{
-  uint64_t fsize;
-  void *data;
-  pid_t pid;
-  int in[2];
-  int out[2];
-  FILE *fout;
-  uint64_t pos;
-
-  fsize = ec->get_size (ec->cls);
-  if (fsize < 128)
-    return;
-  if (4 !=
-      ec->read (ec->cls, &data, 4))
-    return;
-  if (0 != strncmp ("%PDF", data, 4))
-    return;
-  if (0 !=
-      ec->seek (ec->cls, 0, SEEK_SET))
-    return;
-  if (0 != pipe (in))
-    return;
-  if (0 != pipe (out))
-  {
-    close (in[0]);
-    close (in[1]);
-    return;
-  }
-  pid = fork ();
-  if (-1 == pid)
-  {
-    close (in[0]);
-    close (in[1]);
-    close (out[0]);
-    close (out[1]);
-    return;
-  }
-  if (0 == pid)
-  {
-    char *const args[] = {
-      "pdfinfo",
-      "-",
-      NULL
-    };
-    /* am child, exec 'pdfinfo' */
-    close (0);
-    close (1);
-    if ( (-1 == dup2 (in[0], 0)) ||
-         (-1 == dup2 (out[1], 1)) )
-      exit (1);
-    close (in[0]);
-    close (in[1]);
-    close (out[0]);
-    close (out[1]);
-    execvp ("pdfinfo", args);
-    exit (1);
-  }
-  /* am parent, send file */
-  close (in[0]);
-  close (out[1]);
-  fout = fdopen (out[0], "r");
-  if (NULL == fout)
-  {
-    close (in[1]);
-    close (out[0]);
-    kill (pid, SIGKILL);
-    waitpid (pid, NULL, 0);
-    return;
-  }
-  pos = 0;
-  while (pos < fsize)
-  {
-    ssize_t got;
-    size_t wpos;
-
-    data = NULL;
-    got = ec->read (ec->cls,
-                    &data,
-                    fsize - pos);
-    if ( (-1 == got) ||
-         (NULL == data) )
-      break;
-    wpos = 0;
-    while (wpos < got)
-    {
-      ssize_t out;
-
-      out = write (in[1], data + wpos, got - wpos);
-      if (out <= 0)
-        break;
-      wpos += out;
-    }
-    if (wpos < got)
-      break;
-    pos += got;
-  }
-  close (in[1]);
-  process_stdout (fout, ec->proc, ec->cls);
-  fclose (fout);
-  kill (pid, SIGKILL);
-  waitpid (pid, NULL, 0);
-}
-
-
-/* end of pdf_extractor.c */
diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc
@@ -0,0 +1,304 @@
+/*
+     This file is part of libextractor.
+     Copyright (C) 2002, 2003, 2009, 2026 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+     Boston, MA 02110-1301, USA.
+ */
+/**
+ * @file plugins/pdf_extractor.cc
+ * @brief plugin to support PDF files
+ * @author Vidyut Samanta
+ * @author Christian Grothoff
+ *
+ * This plugin uses the stable C++ binding of libpoppler
+ * (`poppler-cpp`).  Earlier versions of this plugin linked
+ * against poppler's internal headers (`PDFDoc`, `GooString`,
+ * ...), which carry no API or ABI stability guarantees and
+ * broke with virtually every poppler release.  The poppler-cpp
+ * interface is the supported public API and is what we use
+ * here.
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <poppler/cpp/poppler-document.h>
+#include <poppler/cpp/poppler-global.h>
+#include <string>
+#include <vector>
+
+
+/**
+ * Sanity bound on the size of a PDF we are willing to buffer
+ * in memory (1 GB).  libpoppler needs the whole document, and
+ * its raw-data loader takes an `int` length.
+ */
+#define MAX_PDF_SIZE (1024LL * 1024LL * 1024LL)
+
+
+/**
+ * One row of the table that associates a poppler document
+ * accessor with the LE meta data type its value is reported as.
+ */
+struct Matches
+{
+  /**
+   * Pointer to a const member function of poppler::document
+   * that yields the value as a poppler::ustring.
+   */
+  poppler::ustring (poppler::document::*get) () const;
+
+  /**
+   * LE meta data type under which the value is reported.
+   */
+  EXTRACTOR_MetaType type;
+};
+
+
+/**
+ * Table of poppler document info accessors and the LE types
+ * their values are reported under.  The table is terminated
+ * by an entry whose accessor is NULL.
+ *
+ * "Creator" is deliberately reported as
+ * #EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE: we believe that
+ * Adobe's "Creator" is neither a person nor an organisation,
+ * but just a piece of software.
+ */
+static const Matches tmap[] = {
+  { &poppler::document::get_title,
+    EXTRACTOR_METATYPE_TITLE },
+  { &poppler::document::get_subject,
+    EXTRACTOR_METATYPE_SUBJECT },
+  { &poppler::document::get_keywords,
+    EXTRACTOR_METATYPE_KEYWORDS },
+  { &poppler::document::get_author,
+    EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { &poppler::document::get_creator,
+    EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { &poppler::document::get_producer,
+    EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
+  { NULL,
+    EXTRACTOR_METATYPE_RESERVED }
+};
+
+
+/**
+ * Error callback installed into libpoppler that discards
+ * every message, so that parsing diagnostics do not end up
+ * on stderr of the plugin child process.
+ *
+ * Both arguments (the message and the closure) are ignored,
+ * which is why the parameters are left unnamed.
+ */
+static void
+quiet_error (const std::string &,
+             void *)
+{
+  /* intentionally empty: the diagnostic is dropped */
+}
+
+
+/**
+ * Hand a UTF-8 string to the meta data processor, after
+ * cutting it at the first embedded NUL byte (if any) and
+ * stripping trailing whitespace.  Empty values are skipped.
+ *
+ * The value is reported as #EXTRACTOR_METAFORMAT_UTF8 with a
+ * length of "string length plus terminator", so the bytes
+ * handed to the processor must not contain a NUL before the
+ * final one; hence the truncation.
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param val UTF-8 bytes (need not be 0-terminated); taken by
+ *        value so that it can be trimmed and terminated in place
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_utf8 (struct EXTRACTOR_ExtractContext *ec,
+          enum EXTRACTOR_MetaType type,
+          std::vector<char> val)
+{
+  size_t len = val.size ();
+
+  /* Check first: data() of an empty vector must not be passed to memchr. */
+  if (0 == len)
+    return 0;
+  {
+    const void *nul = memchr (val.data (), '\0', len);
+
+    /* Keep only the part before the first embedded NUL. */
+    if (NULL != nul)
+      len = (size_t) (static_cast<const char *> (nul) - val.data ());
+  }
+  /*
+   * Avoid outputting trailing whitespace.  Note that ISO-8859
+   * NBSP (0xA0) becomes 0xC2 0xA0 in UTF-8 and is intentionally
+   * not stripped here.
+   */
+  while (0 < len)
+  {
+    const char c = val[len - 1];
+
+    if ((' ' != c) &&
+        ('\r' != c) &&
+        ('\n' != c) &&
+        ('\t' != c) &&
+        ('\v' != c) &&
+        ('\f' != c))
+      break;
+    len--;
+  }
+  if (0 == len)
+    return 0;
+  /* Terminate our private copy in place instead of copying once more. */
+  val.resize (len);
+  val.push_back ('\0');
+  if (0 != ec->proc (ec->cls,
+                     "pdf",
+                     type,
+                     EXTRACTOR_METAFORMAT_UTF8,
+                     "text/plain",
+                     val.data (),
+                     len + 1))
+    return 1;
+  return 0;
+}
+
+
+/**
+ * Report a 0-terminated C string through the meta data
+ * processor of @a ec.  NULL and empty strings are silently
+ * skipped.
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param s the string, may be NULL
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_str (struct EXTRACTOR_ExtractContext *ec,
+         enum EXTRACTOR_MetaType type,
+         const char *s)
+{
+  int rc;
+
+  if (NULL == s)
+    return 0;
+  if ('\0' == *s)
+    return 0;
+  rc = ec->proc (ec->cls,
+                 "pdf",
+                 type,
+                 EXTRACTOR_METAFORMAT_UTF8,
+                 "text/plain",
+                 s,
+                 strlen (s) + 1);
+  return (0 != rc) ? 1 : 0;
+}
+
+
+/**
+ * Report a date (given as a `time_t`) as a UTC timestamp of
+ * the form "YYYY-MM-DD HH:MM:SS".  Nothing is reported if the
+ * time is absent or cannot be converted or formatted.
+ *
+ * @param ec extraction context
+ * @param type meta data type to use
+ * @param t the time, `(time_t) -1` or 0 if absent
+ * @return 0 to continue extracting, 1 if @a ec asked us to stop
+ */
+static int
+add_date (struct EXTRACTOR_ExtractContext *ec,
+          enum EXTRACTOR_MetaType type,
+          time_t t)
+{
+  struct tm parts;
+  char text[32];
+  size_t written;
+
+  if ((0 == t) || ((time_t) -1 == t))
+    return 0;
+  if (NULL == gmtime_r (&t, &parts))
+    return 0;
+  written = strftime (text, sizeof (text), "%Y-%m-%d %H:%M:%S", &parts);
+  return (0 == written) ? 0 : add_str (ec, type, text);
+}
+
+
+/**
+ * Read the entire input into @a buf.
+ *
+ * The input is rejected if its size is unknown (`UINT64_MAX`),
+ * zero, or larger than #MAX_PDF_SIZE.  Reading starts at
+ * offset 0 and only counts as successful if exactly the
+ * number of bytes announced by `get_size` was obtained; a
+ * short read is an error, so that the caller never parses a
+ * truncated document.
+ *
+ * @param ec extraction context
+ * @param[out] buf buffer to fill with the file contents; any
+ *             previous contents are discarded
+ * @return 0 on success, -1 on error
+ */
+static int
+read_all (struct EXTRACTOR_ExtractContext *ec,
+          std::vector<char> &buf)
+{
+  uint64_t size;
+
+  size = ec->get_size (ec->cls);
+  if ((UINT64_MAX == size) ||
+      (0 == size) ||
+      (size > MAX_PDF_SIZE))
+    return -1;
+  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
+    return -1;
+  buf.clear ();
+  buf.reserve ((size_t) size);
+  while (buf.size () < size)
+  {
+    /* Initialised so the NULL check below is meaningful even
+       if read() fails to store a pointer. */
+    void *data = NULL;
+    ssize_t got;
+
+    got = ec->read (ec->cls,
+                    &data,
+                    (size_t) (size - buf.size ()));
+    if ((got <= 0) || (NULL == data))
+      break;
+    buf.insert (buf.end (),
+                static_cast<char *> (data),
+                static_cast<char *> (data) + got);
+  }
+  /* Anything short of the announced size is a truncated input. */
+  if (buf.size () != size)
+    return -1;
+  return 0;
+}
+
+
+/**
+ * Report the meta data of an opened, unlocked document:
+ * MIME type, the text fields listed in #tmap, PDF version,
+ * page count, creation date and modification date, in that
+ * order.  Reporting stops as soon as the processor asks us
+ * to stop.
+ *
+ * @param ec extraction context
+ * @param doc the document to report on
+ */
+static void
+report_document (struct EXTRACTOR_ExtractContext *ec,
+                 poppler::document *doc)
+{
+  int major;
+  int minor;
+  char ver[32];
+  char pages[16];
+
+  if (0 != add_str (ec,
+                    EXTRACTOR_METATYPE_MIMETYPE,
+                    "application/pdf"))
+    return;
+  for (const struct Matches *m = tmap; NULL != m->get; m++)
+  {
+    if (0 != add_utf8 (ec,
+                       m->type,
+                       (doc->*(m->get))().to_utf8 ()))
+      return;
+  }
+  doc->get_pdf_version (&major, &minor);
+  snprintf (ver, sizeof (ver), "PDF %d.%d", major, minor);
+  if (0 != add_str (ec, EXTRACTOR_METATYPE_FORMAT, ver))
+    return;
+  snprintf (pages, sizeof (pages), "%d", doc->pages ());
+  if (0 != add_str (ec, EXTRACTOR_METATYPE_PAGE_COUNT, pages))
+    return;
+  if (0 != add_date (ec,
+                     EXTRACTOR_METATYPE_CREATION_DATE,
+                     doc->get_creation_date_t ()))
+    return;
+  /* last item: nothing left to skip, so the result is irrelevant */
+  (void) add_date (ec,
+                   EXTRACTOR_METATYPE_MODIFICATION_DATE,
+                   doc->get_modification_date_t ());
+}
+
+
+/**
+ * Main entry method for the PDF extraction plugin.
+ *
+ * Checks for the "%PDF" magic, buffers the whole input, lets
+ * libpoppler parse it and reports the document's meta data.
+ * Inputs that libpoppler cannot load, and documents that are
+ * still locked after loading, yield no meta data at all.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+extern "C" void
+EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  void *magic;
+  std::vector<char> contents;
+  poppler::document *doc;
+
+  if (4 != ec->read (ec->cls, &magic, 4))
+    return;
+  if (0 != memcmp ("%PDF", magic, 4))
+    return;
+  if (0 != read_all (ec, contents))
+    return;
+
+  poppler::set_debug_error_function (&quiet_error, NULL);
+  /* contents is declared before doc and outlives it: doc is
+     deleted below, before this function returns. */
+  doc = poppler::document::load_from_raw_data (contents.data (),
+                                               (int) contents.size ());
+  if (NULL == doc)
+    return;
+  /* An encrypted document we cannot open exposes no usable meta data. */
+  if (! doc->is_locked ())
+    report_document (ec, doc);
+  delete doc;
+}
+
+
+/* end of pdf_extractor.cc */
diff --git a/src/plugins/test_pdf.c b/src/plugins/test_pdf.c
@@ -0,0 +1,139 @@
+/*
+     This file is part of libextractor.
+     Copyright (C) 2026 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 3, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+     Boston, MA 02110-1301, USA.
+*/
+/**
+ * @file plugins/test_pdf.c
+ * @brief testcase for pdf plugin
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "test_lib.h"
+
+
+/**
+ * Main function for the PDF testcase.
+ *
+ * @param argc number of arguments (ignored)
+ * @param argv arguments (ignored)
+ * @return 0 on success
+ */
+int
+main (int argc, char *argv[])
+{
+  struct SolutionData pdf_extract_sol[] = {
+    {
+      EXTRACTOR_METATYPE_MIMETYPE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "application/pdf",
+      strlen ("application/pdf") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_TITLE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "GNU libextractor PDF Test",
+      strlen ("GNU libextractor PDF Test") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_SUBJECT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "Metadata extraction",
+      strlen ("Metadata extraction") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_KEYWORDS,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "PDF, libextractor, test",
+      strlen ("PDF, libextractor, test") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_AUTHOR_NAME,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "Vidyut Samanta",
+      strlen ("Vidyut Samanta") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "WritePDF",
+      strlen ("WritePDF") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "libextractor testsuite",
+      strlen ("libextractor testsuite") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_FORMAT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "PDF 1.4",
+      strlen ("PDF 1.4") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_PAGE_COUNT,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "1",
+      strlen ("1") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_CREATION_DATE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "2020-01-15 12:30:00",
+      strlen ("2020-01-15 12:30:00") + 1,
+      0
+    },
+    {
+      EXTRACTOR_METATYPE_MODIFICATION_DATE,
+      EXTRACTOR_METAFORMAT_UTF8,
+      "text/plain",
+      "2020-01-16 08:00:00",
+      strlen ("2020-01-16 08:00:00") + 1,
+      0
+    },
+    { 0, 0, NULL, NULL, 0, -1 }
+  };
+  struct ProblemSet ps[] = {
+    { "testdata/pdf_extract.pdf",
+      pdf_extract_sol },
+    { NULL, NULL }
+  };
+  return ET_main ("pdf", ps);
+}
+
+
+/* end of test_pdf.c */
diff --git a/src/plugins/testdata/pdf_extract.pdf b/src/plugins/testdata/pdf_extract.pdf
@@ -0,0 +1,26 @@
+%PDF-1.4
+%����
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>
+endobj
+4 0 obj
+<< /Title (GNU libextractor PDF Test) /Author (Vidyut Samanta) /Subject (Metadata extraction) /Keywords (PDF, libextractor, test) /Creator (WritePDF) /Producer (libextractor testsuite) /CreationDate (D:20200115123000Z) /ModDate (D:20200116080000Z) >>
+endobj
+xref
+0 5
+0000000000 65535 f 
+0000000015 00000 n 
+0000000064 00000 n 
+0000000121 00000 n 
+0000000209 00000 n 
+trailer
+<< /Size 5 /Root 1 0 R /Info 4 0 R >>
+startxref
+475
+%%EOF

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

D	src/plugins/old/pdf_extractor.cc	\|	235	-------------------------------------------------------------------------------
D	src/plugins/pdf_extractor.c	\|	238	-------------------------------------------------------------------------------
A	src/plugins/pdf_extractor.cc	\|	304	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/plugins/test_pdf.c	\|	139	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/plugins/testdata/pdf_extract.pdf	\|	26	++++++++++++++++++++++++++