summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2016-03-26 15:26:31 +0000
committerChristian Grothoff <christian@grothoff.org>2016-03-26 15:26:31 +0000
commitac125f1b1949603e7e11fe09a0af73e8418a7463 (patch)
tree47c653615baad7812d6a3a00ad9f4afcf0cf1f86
parent34ddbd35d1efff0d3761a85fa39adfe9d1eaf3fb (diff)
downloadlibextractor-ac125f1b1949603e7e11fe09a0af73e8418a7463.tar.gz
libextractor-ac125f1b1949603e7e11fe09a0af73e8418a7463.zip
simple hack for PDF support
-rw-r--r--ChangeLog4
-rw-r--r--src/include/extractor.h22
-rw-r--r--src/plugins/Makefile.am12
-rw-r--r--src/plugins/pdf_extractor.c229
4 files changed, 257 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index 4dcb2ec..b5b7271 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
1Sat Mar 26 16:23:56 CET 2016
2 Adding PDF support using pdfinfo.
3 Likely conflicts with Apparmor. -CG
4
1Mon Aug 31 19:19:17 CEST 2015 5Mon Aug 31 19:19:17 CEST 2015
2 Adding apparmor support. -jmorvan/CG 6 Adding apparmor support. -jmorvan/CG
3 7
diff --git a/src/include/extractor.h b/src/include/extractor.h
index f59cabe..9bce88c 100644
--- a/src/include/extractor.h
+++ b/src/include/extractor.h
@@ -35,7 +35,7 @@ extern "C" {
35 * 0.2.6-1 => 0x00020601 35 * 0.2.6-1 => 0x00020601
36 * 4.5.2-0 => 0x04050200 36 * 4.5.2-0 => 0x04050200
37 */ 37 */
38#define EXTRACTOR_VERSION 0x01030001 38#define EXTRACTOR_VERSION 0x01030002
39 39
40#include <stdio.h> 40#include <stdio.h>
41 41
@@ -383,7 +383,7 @@ enum EXTRACTOR_MetaType
383 EXTRACTOR_METATYPE_AUDIO_DURATION = 226, 383 EXTRACTOR_METATYPE_AUDIO_DURATION = 226,
384 EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227, 384 EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227,
385 385
386 EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228, 386 EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228,
387 387
388 EXTRACTOR_METATYPE_LAST = 229 388 EXTRACTOR_METATYPE_LAST = 229
389 }; 389 };
@@ -443,13 +443,14 @@ EXTRACTOR_metatype_get_max (void);
443 * @param data_len number of bytes in @a data 443 * @param data_len number of bytes in @a data
444 * @return 0 to continue extracting, 1 to abort 444 * @return 0 to continue extracting, 1 to abort
445 */ 445 */
446typedef int (*EXTRACTOR_MetaDataProcessor) (void *cls, 446typedef int
447 const char *plugin_name, 447(*EXTRACTOR_MetaDataProcessor) (void *cls,
448 enum EXTRACTOR_MetaType type, 448 const char *plugin_name,
449 enum EXTRACTOR_MetaFormat format, 449 enum EXTRACTOR_MetaType type,
450 const char *data_mime_type, 450 enum EXTRACTOR_MetaFormat format,
451 const char *data, 451 const char *data_mime_type,
452 size_t data_len); 452 const char *data,
453 size_t data_len);
453 454
454 455
455/** 456/**
@@ -519,7 +520,8 @@ struct EXTRACTOR_ExtractContext
519 * 520 *
520 * @param ec extraction context provided to the plugin 521 * @param ec extraction context provided to the plugin
521 */ 522 */
522typedef void (*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec); 523typedef void
524(*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec);
523 525
524 526
525/** 527/**
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
index 85c3998..8cdd905 100644
--- a/src/plugins/Makefile.am
+++ b/src/plugins/Makefile.am
@@ -160,6 +160,9 @@ PLUGIN_OGG=libextractor_ogg.la
160TEST_OGG=test_ogg 160TEST_OGG=test_ogg
161endif 161endif
162 162
163if ! WINDOWS
164PLUGIN_PDF=libextractor_pdf.la
165endif
163 166
164if HAVE_ZLIB 167if HAVE_ZLIB
165PLUGIN_ZLIB= \ 168PLUGIN_ZLIB= \
@@ -198,6 +201,7 @@ plugin_LTLIBRARIES = \
198 $(PLUGIN_MP4) \ 201 $(PLUGIN_MP4) \
199 $(PLUGIN_MPEG) \ 202 $(PLUGIN_MPEG) \
200 $(PLUGIN_OGG) \ 203 $(PLUGIN_OGG) \
204 $(PLUGIN_PDF) \
201 $(PLUGIN_PREVIEWOPUS) \ 205 $(PLUGIN_PREVIEWOPUS) \
202 $(PLUGIN_RPM) \ 206 $(PLUGIN_RPM) \
203 $(PLUGIN_TIFF) \ 207 $(PLUGIN_TIFF) \
@@ -524,6 +528,14 @@ test_ogg_LDADD = \
524 $(top_builddir)/src/plugins/libtest.la 528 $(top_builddir)/src/plugins/libtest.la
525 529
526 530
531libextractor_pdf_la_SOURCES = \
532 pdf_extractor.c
533libextractor_pdf_la_LDFLAGS = \
534 $(PLUGINFLAGS)
535libextractor_pdf_la_LIBADD = \
536 $(top_builddir)/src/common/libextractor_common.la $(XLIB) $(SOCKET_LIBS)
537
538
527libextractor_png_la_SOURCES = \ 539libextractor_png_la_SOURCES = \
528 png_extractor.c 540 png_extractor.c
529libextractor_png_la_LDFLAGS = \ 541libextractor_png_la_LDFLAGS = \
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c
new file mode 100644
index 0000000..b84981f
--- /dev/null
+++ b/src/plugins/pdf_extractor.c
@@ -0,0 +1,229 @@
1/*
2 This file is part of libextractor.
3 Copyright (C) 2016 Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20/**
21 * @file plugins/pdf_extractor.c
22 * @brief plugin to support PDF files
23 * @author Christian Grothoff
24 *
25 * PDF libraries today are a nightmare (TM). So instead of doing the
26 * fast thing and calling some library functions to parse the PDF,
27 * we execute 'pdfinfo' and parse the output. Because that's 21st
28 * century plumbing: nobody writes reasonable code anymore.
29 */
30#include "platform.h"
31#include <extractor.h>
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <signal.h>
35#include <unistd.h>
36
37/**
38 * Entry in the mapping from control data to LE types.
39 */
40struct Matches
41{
42 /**
43 * Key in the Pdfian control file.
44 */
45 const char *text;
46
47 /**
48 * Corresponding type in LE.
49 */
50 enum EXTRACTOR_MetaType type;
51};
52
53
54/**
55 * Map from pdf-control entries to LE types.
56 *
57 * See output of 'pdfinfo'.
58 */
59static struct Matches tmap[] = {
60 {"Title", EXTRACTOR_METATYPE_TITLE},
61 {"Subject", EXTRACTOR_METATYPE_SUBJECT},
62 {"Keywords", EXTRACTOR_METATYPE_KEYWORDS},
63 {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME},
64 {"Creator", EXTRACTOR_METATYPE_CREATOR},
65 {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
66 {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
67 {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE},
68 {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION},
69 {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT},
70 {NULL, 0}
71};
72
73
74/**
75 * Process the "stdout" file from pdfinfo.
76 *
77 * @param fout stdout of pdfinfo
78 * @param proc function to call with meta data
79 * @param proc_cls closure for @e proc
80 */
81static void
82process_stdout (FILE *fout,
83 EXTRACTOR_MetaDataProcessor proc,
84 void *proc_cls)
85{
86 unsigned int i;
87 char line[1025];
88 const char *psuffix;
89 const char *colon;
90
91 while (! feof (fout))
92 {
93 if (NULL == fgets (line, sizeof (line) - 1, fout))
94 break;
95 if (0 == strlen (line))
96 continue;
97 if ('\n' == line[strlen(line)-1])
98 line[strlen(line)-1] = '\0';
99 colon = strchr (line, (int) ':');
100 if (NULL == colon)
101 break;
102 psuffix = colon + 1;
103 while (isblank ((int) psuffix[0]))
104 psuffix++;
105 if (0 == strlen (psuffix))
106 continue;
107 for (i = 0; NULL != tmap[i].text; i++)
108 {
109 if (0 != strncasecmp (line,
110 tmap[i].text,
111 colon - line))
112 continue;
113 if (0 != proc (proc_cls,
114 "pdf",
115 tmap[i].type,
116 EXTRACTOR_METAFORMAT_UTF8,
117 "text/plain",
118 psuffix,
119 strlen(psuffix) + 1))
120 return;
121 break;
122 }
123 }
124}
125
126
127/**
128 * Main entry method for the PDF extraction plugin.
129 *
130 * @param ec extraction context provided to the plugin
131 */
132void
133EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
134{
135 uint64_t fsize;
136 void *data;
137 pid_t pid;
138 int in[2];
139 int out[2];
140 FILE *fout;
141 uint64_t pos;
142
143 fsize = ec->get_size (ec->cls);
144 if (fsize < 128)
145 return;
146 if (4 !=
147 ec->read (ec->cls, &data, 4))
148 return;
149 if (0 != strncmp ("%PDF", data, 4))
150 return;
151 if (0 !=
152 ec->seek (ec->cls, 0, SEEK_SET))
153 return;
154 if (0 != pipe (in))
155 return;
156 if (0 != pipe (out))
157 {
158 close (in[0]);
159 close (in[1]);
160 return;
161 }
162 pid = fork ();
163 if (-1 == pid)
164 {
165 close (in[0]);
166 close (in[1]);
167 close (out[0]);
168 close (out[1]);
169 return;
170 }
171 if (0 == pid)
172 {
173 char *const args[] = {
174 "pdfinfo",
175 "-",
176 NULL
177 };
178 /* am child, exec 'pdfinfo' */
179 close (0);
180 close (1);
181 dup2 (in[0], 0);
182 dup2 (out[1], 1);
183 close (in[0]);
184 close (in[1]);
185 close (out[0]);
186 close (out[1]);
187 execvp ("pdfinfo", args);
188 exit (1);
189 }
190 /* am parent, send file */
191 close (in[0]);
192 close (out[1]);
193 fout = fdopen (out[0], "r");
194
195 pos = 0;
196 while (pos < fsize)
197 {
198 ssize_t got;
199 size_t wpos;
200
201 data = NULL;
202 got = ec->read (ec->cls,
203 &data,
204 fsize - pos);
205 if ( (-1 == got) ||
206 (NULL == data) )
207 break;
208 wpos = 0;
209 while (wpos < got)
210 {
211 ssize_t out;
212
213 out = write (in[1], data + wpos, got - wpos);
214 if (out <= 0)
215 break;
216 wpos += out;
217 }
218 if (wpos < got)
219 break;
220 pos += got;
221 }
222 close (in[1]);
223 process_stdout (fout, ec->proc, ec->cls);
224 fclose (fout);
225 kill (pid, SIGKILL);
226 waitpid (pid, NULL, 0);
227}
228
229/* end of pdf_extractor.c */