libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit ac125f1b1949603e7e11fe09a0af73e8418a7463
parent 34ddbd35d1efff0d3761a85fa39adfe9d1eaf3fb
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 26 Mar 2016 15:26:31 +0000

simple hack for PDF support

Diffstat:
M ChangeLog | 4 ++++
M src/include/extractor.h | 22 ++++++++++++----------
M src/plugins/Makefile.am | 12 ++++++++++++
A src/plugins/pdf_extractor.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 257 insertions(+), 10 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,7 @@ +Sat Mar 26 16:23:56 CET 2016 + Adding PDF support using pdfinfo. + Likely conflicts with Apparmor. -CG + Mon Aug 31 19:19:17 CEST 2015 Adding apparmor support. -jmorvan/CG diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -35,7 +35,7 @@ extern "C" { * 0.2.6-1 => 0x00020601 * 4.5.2-0 => 0x04050200 */ -#define EXTRACTOR_VERSION 0x01030001 +#define EXTRACTOR_VERSION 0x01030002 #include <stdio.h> @@ -383,7 +383,7 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_AUDIO_DURATION = 226, EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227, - EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228, + EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228, EXTRACTOR_METATYPE_LAST = 229 }; @@ -443,13 +443,14 @@ EXTRACTOR_metatype_get_max (void); * @param data_len number of bytes in @a data * @return 0 to continue extracting, 1 to abort */ -typedef int (*EXTRACTOR_MetaDataProcessor) (void *cls, - const char *plugin_name, - enum EXTRACTOR_MetaType type, - enum EXTRACTOR_MetaFormat format, - const char *data_mime_type, - const char *data, - size_t data_len); +typedef int +(*EXTRACTOR_MetaDataProcessor) (void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len); /** @@ -519,7 +520,8 @@ struct EXTRACTOR_ExtractContext * * @param ec extraction context provided to the plugin */ -typedef void (*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec); +typedef void +(*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec); /** diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -160,6 +160,9 @@ PLUGIN_OGG=libextractor_ogg.la TEST_OGG=test_ogg endif +if ! 
WINDOWS +PLUGIN_PDF=libextractor_pdf.la +endif if HAVE_ZLIB PLUGIN_ZLIB= \ @@ -198,6 +201,7 @@ plugin_LTLIBRARIES = \ $(PLUGIN_MP4) \ $(PLUGIN_MPEG) \ $(PLUGIN_OGG) \ + $(PLUGIN_PDF) \ $(PLUGIN_PREVIEWOPUS) \ $(PLUGIN_RPM) \ $(PLUGIN_TIFF) \ @@ -524,6 +528,14 @@ test_ogg_LDADD = \ $(top_builddir)/src/plugins/libtest.la +libextractor_pdf_la_SOURCES = \ + pdf_extractor.c +libextractor_pdf_la_LDFLAGS = \ + $(PLUGINFLAGS) +libextractor_pdf_la_LIBADD = \ + $(top_builddir)/src/common/libextractor_common.la $(XLIB) $(SOCKET_LIBS) + + libextractor_png_la_SOURCES = \ png_extractor.c libextractor_png_la_LDFLAGS = \ diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c @@ -0,0 +1,229 @@ +/* + This file is part of libextractor. + Copyright (C) 2016 Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. + */ +/** + * @file plugins/pdf_extractor.c + * @brief plugin to support PDF files + * @author Christian Grothoff + * + * PDF libraries today are a nightmare (TM). So instead of doing the + * fast thing and calling some library functions to parse the PDF, + * we execute 'pdfinfo' and parse the output. Because that's 21st + * century plumbing: nobody writes reasonable code anymore. 
+ */ +#include "platform.h" +#include <extractor.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <signal.h> +#include <unistd.h> + +/** + * Entry in the mapping from control data to LE types. + */ +struct Matches +{ + /** + * Key in the Pdfian control file. + */ + const char *text; + + /** + * Corresponding type in LE. + */ + enum EXTRACTOR_MetaType type; +}; + + +/** + * Map from pdf-control entries to LE types. + * + * See output of 'pdfinfo'. + */ +static struct Matches tmap[] = { + {"Title", EXTRACTOR_METATYPE_TITLE}, + {"Subject", EXTRACTOR_METATYPE_SUBJECT}, + {"Keywords", EXTRACTOR_METATYPE_KEYWORDS}, + {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME}, + {"Creator", EXTRACTOR_METATYPE_CREATOR}, + {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE}, + {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE}, + {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE}, + {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION}, + {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT}, + {NULL, 0} +}; + + +/** + * Process the "stdout" file from pdfinfo. + * + * @param fout stdout of pdfinfo + * @param proc function to call with meta data + * @param proc_cls closure for @e proc + */ +static void +process_stdout (FILE *fout, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + unsigned int i; + char line[1025]; + const char *psuffix; + const char *colon; + + while (! 
feof (fout)) + { + if (NULL == fgets (line, sizeof (line) - 1, fout)) + break; + if (0 == strlen (line)) + continue; + if ('\n' == line[strlen(line)-1]) + line[strlen(line)-1] = '\0'; + colon = strchr (line, (int) ':'); + if (NULL == colon) + break; + psuffix = colon + 1; + while (isblank ((int) psuffix[0])) + psuffix++; + if (0 == strlen (psuffix)) + continue; + for (i = 0; NULL != tmap[i].text; i++) + { + if (0 != strncasecmp (line, + tmap[i].text, + colon - line)) + continue; + if (0 != proc (proc_cls, + "pdf", + tmap[i].type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + psuffix, + strlen(psuffix) + 1)) + return; + break; + } + } +} + + +/** + * Main entry method for the PDF extraction plugin. + * + * @param ec extraction context provided to the plugin + */ +void +EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) +{ + uint64_t fsize; + void *data; + pid_t pid; + int in[2]; + int out[2]; + FILE *fout; + uint64_t pos; + + fsize = ec->get_size (ec->cls); + if (fsize < 128) + return; + if (4 != + ec->read (ec->cls, &data, 4)) + return; + if (0 != strncmp ("%PDF", data, 4)) + return; + if (0 != + ec->seek (ec->cls, 0, SEEK_SET)) + return; + if (0 != pipe (in)) + return; + if (0 != pipe (out)) + { + close (in[0]); + close (in[1]); + return; + } + pid = fork (); + if (-1 == pid) + { + close (in[0]); + close (in[1]); + close (out[0]); + close (out[1]); + return; + } + if (0 == pid) + { + char *const args[] = { + "pdfinfo", + "-", + NULL + }; + /* am child, exec 'pdfinfo' */ + close (0); + close (1); + dup2 (in[0], 0); + dup2 (out[1], 1); + close (in[0]); + close (in[1]); + close (out[0]); + close (out[1]); + execvp ("pdfinfo", args); + exit (1); + } + /* am parent, send file */ + close (in[0]); + close (out[1]); + fout = fdopen (out[0], "r"); + + pos = 0; + while (pos < fsize) + { + ssize_t got; + size_t wpos; + + data = NULL; + got = ec->read (ec->cls, + &data, + fsize - pos); + if ( (-1 == got) || + (NULL == data) ) + break; + wpos = 0; + while 
(wpos < got) + { + ssize_t out; + + out = write (in[1], data + wpos, got - wpos); + if (out <= 0) + break; + wpos += out; + } + if (wpos < got) + break; + pos += got; + } + close (in[1]); + process_stdout (fout, ec->proc, ec->cls); + fclose (fout); + kill (pid, SIGKILL); + waitpid (pid, NULL, 0); +} + +/* end of pdf_extractor.c */