diff options
author | Christian Grothoff <christian@grothoff.org> | 2016-03-26 15:26:31 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2016-03-26 15:26:31 +0000 |
commit | ac125f1b1949603e7e11fe09a0af73e8418a7463 (patch) | |
tree | 47c653615baad7812d6a3a00ad9f4afcf0cf1f86 | |
parent | 34ddbd35d1efff0d3761a85fa39adfe9d1eaf3fb (diff) | |
download | libextractor-ac125f1b1949603e7e11fe09a0af73e8418a7463.tar.gz libextractor-ac125f1b1949603e7e11fe09a0af73e8418a7463.zip |
simple hack for PDF support
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | src/include/extractor.h | 22 | ||||
-rw-r--r-- | src/plugins/Makefile.am | 12 | ||||
-rw-r--r-- | src/plugins/pdf_extractor.c | 229 |
4 files changed, 257 insertions, 10 deletions
@@ -1,3 +1,7 @@ | |||
1 | Sat Mar 26 16:23:56 CET 2016 | ||
2 | Adding PDF support using pdfinfo. | ||
3 | Likely conflicts with Apparmor. -CG | ||
4 | |||
1 | Mon Aug 31 19:19:17 CEST 2015 | 5 | Mon Aug 31 19:19:17 CEST 2015 |
2 | Adding apparmor support. -jmorvan/CG | 6 | Adding apparmor support. -jmorvan/CG |
3 | 7 | ||
diff --git a/src/include/extractor.h b/src/include/extractor.h index f59cabe..9bce88c 100644 --- a/src/include/extractor.h +++ b/src/include/extractor.h | |||
@@ -35,7 +35,7 @@ extern "C" { | |||
35 | * 0.2.6-1 => 0x00020601 | 35 | * 0.2.6-1 => 0x00020601 |
36 | * 4.5.2-0 => 0x04050200 | 36 | * 4.5.2-0 => 0x04050200 |
37 | */ | 37 | */ |
38 | #define EXTRACTOR_VERSION 0x01030001 | 38 | #define EXTRACTOR_VERSION 0x01030002 |
39 | 39 | ||
40 | #include <stdio.h> | 40 | #include <stdio.h> |
41 | 41 | ||
@@ -383,7 +383,7 @@ enum EXTRACTOR_MetaType | |||
383 | EXTRACTOR_METATYPE_AUDIO_DURATION = 226, | 383 | EXTRACTOR_METATYPE_AUDIO_DURATION = 226, |
384 | EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227, | 384 | EXTRACTOR_METATYPE_SUBTITLE_DURATION = 227, |
385 | 385 | ||
386 | EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228, | 386 | EXTRACTOR_METATYPE_AUDIO_PREVIEW = 228, |
387 | 387 | ||
388 | EXTRACTOR_METATYPE_LAST = 229 | 388 | EXTRACTOR_METATYPE_LAST = 229 |
389 | }; | 389 | }; |
@@ -443,13 +443,14 @@ EXTRACTOR_metatype_get_max (void); | |||
443 | * @param data_len number of bytes in @a data | 443 | * @param data_len number of bytes in @a data |
444 | * @return 0 to continue extracting, 1 to abort | 444 | * @return 0 to continue extracting, 1 to abort |
445 | */ | 445 | */ |
446 | typedef int (*EXTRACTOR_MetaDataProcessor) (void *cls, | 446 | typedef int |
447 | const char *plugin_name, | 447 | (*EXTRACTOR_MetaDataProcessor) (void *cls, |
448 | enum EXTRACTOR_MetaType type, | 448 | const char *plugin_name, |
449 | enum EXTRACTOR_MetaFormat format, | 449 | enum EXTRACTOR_MetaType type, |
450 | const char *data_mime_type, | 450 | enum EXTRACTOR_MetaFormat format, |
451 | const char *data, | 451 | const char *data_mime_type, |
452 | size_t data_len); | 452 | const char *data, |
453 | size_t data_len); | ||
453 | 454 | ||
454 | 455 | ||
455 | /** | 456 | /** |
@@ -519,7 +520,8 @@ struct EXTRACTOR_ExtractContext | |||
519 | * | 520 | * |
520 | * @param ec extraction context provided to the plugin | 521 | * @param ec extraction context provided to the plugin |
521 | */ | 522 | */ |
522 | typedef void (*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec); | 523 | typedef void |
524 | (*EXTRACTOR_extract_method) (struct EXTRACTOR_ExtractContext *ec); | ||
523 | 525 | ||
524 | 526 | ||
525 | /** | 527 | /** |
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am index 85c3998..8cdd905 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am | |||
@@ -160,6 +160,9 @@ PLUGIN_OGG=libextractor_ogg.la | |||
160 | TEST_OGG=test_ogg | 160 | TEST_OGG=test_ogg |
161 | endif | 161 | endif |
162 | 162 | ||
163 | if ! WINDOWS | ||
164 | PLUGIN_PDF=libextractor_pdf.la | ||
165 | endif | ||
163 | 166 | ||
164 | if HAVE_ZLIB | 167 | if HAVE_ZLIB |
165 | PLUGIN_ZLIB= \ | 168 | PLUGIN_ZLIB= \ |
@@ -198,6 +201,7 @@ plugin_LTLIBRARIES = \ | |||
198 | $(PLUGIN_MP4) \ | 201 | $(PLUGIN_MP4) \ |
199 | $(PLUGIN_MPEG) \ | 202 | $(PLUGIN_MPEG) \ |
200 | $(PLUGIN_OGG) \ | 203 | $(PLUGIN_OGG) \ |
204 | $(PLUGIN_PDF) \ | ||
201 | $(PLUGIN_PREVIEWOPUS) \ | 205 | $(PLUGIN_PREVIEWOPUS) \ |
202 | $(PLUGIN_RPM) \ | 206 | $(PLUGIN_RPM) \ |
203 | $(PLUGIN_TIFF) \ | 207 | $(PLUGIN_TIFF) \ |
@@ -524,6 +528,14 @@ test_ogg_LDADD = \ | |||
524 | $(top_builddir)/src/plugins/libtest.la | 528 | $(top_builddir)/src/plugins/libtest.la |
525 | 529 | ||
526 | 530 | ||
531 | libextractor_pdf_la_SOURCES = \ | ||
532 | pdf_extractor.c | ||
533 | libextractor_pdf_la_LDFLAGS = \ | ||
534 | $(PLUGINFLAGS) | ||
535 | libextractor_pdf_la_LIBADD = \ | ||
536 | $(top_builddir)/src/common/libextractor_common.la $(XLIB) $(SOCKET_LIBS) | ||
537 | |||
538 | |||
527 | libextractor_png_la_SOURCES = \ | 539 | libextractor_png_la_SOURCES = \ |
528 | png_extractor.c | 540 | png_extractor.c |
529 | libextractor_png_la_LDFLAGS = \ | 541 | libextractor_png_la_LDFLAGS = \ |
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c new file mode 100644 index 0000000..b84981f --- /dev/null +++ b/src/plugins/pdf_extractor.c | |||
@@ -0,0 +1,229 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | Copyright (C) 2016 Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 3, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
18 | Boston, MA 02110-1301, USA. | ||
19 | */ | ||
20 | /** | ||
21 | * @file plugins/pdf_extractor.c | ||
22 | * @brief plugin to support PDF files | ||
23 | * @author Christian Grothoff | ||
24 | * | ||
25 | * PDF libraries today are a nightmare (TM). So instead of doing the | ||
26 | * fast thing and calling some library functions to parse the PDF, | ||
27 | * we execute 'pdfinfo' and parse the output. Because that's 21st | ||
28 | * century plumbing: nobody writes reasonable code anymore. | ||
29 | */ | ||
30 | #include "platform.h" | ||
31 | #include <extractor.h> | ||
32 | #include <sys/types.h> | ||
33 | #include <sys/wait.h> | ||
34 | #include <signal.h> | ||
35 | #include <unistd.h> | ||
36 | |||
37 | /** | ||
38 | * Entry in the mapping from control data to LE types. | ||
39 | */ | ||
40 | struct Matches | ||
41 | { | ||
42 | /** | ||
43 | * Key in the Pdfian control file. | ||
44 | */ | ||
45 | const char *text; | ||
46 | |||
47 | /** | ||
48 | * Corresponding type in LE. | ||
49 | */ | ||
50 | enum EXTRACTOR_MetaType type; | ||
51 | }; | ||
52 | |||
53 | |||
54 | /** | ||
55 | * Map from pdf-control entries to LE types. | ||
56 | * | ||
57 | * See output of 'pdfinfo'. | ||
58 | */ | ||
59 | static struct Matches tmap[] = { | ||
60 | {"Title", EXTRACTOR_METATYPE_TITLE}, | ||
61 | {"Subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
62 | {"Keywords", EXTRACTOR_METATYPE_KEYWORDS}, | ||
63 | {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME}, | ||
64 | {"Creator", EXTRACTOR_METATYPE_CREATOR}, | ||
65 | {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE}, | ||
66 | {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE}, | ||
67 | {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE}, | ||
68 | {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION}, | ||
69 | {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT}, | ||
70 | {NULL, 0} | ||
71 | }; | ||
72 | |||
73 | |||
74 | /** | ||
75 | * Process the "stdout" file from pdfinfo. | ||
76 | * | ||
77 | * @param fout stdout of pdfinfo | ||
78 | * @param proc function to call with meta data | ||
79 | * @param proc_cls closure for @e proc | ||
80 | */ | ||
81 | static void | ||
82 | process_stdout (FILE *fout, | ||
83 | EXTRACTOR_MetaDataProcessor proc, | ||
84 | void *proc_cls) | ||
85 | { | ||
86 | unsigned int i; | ||
87 | char line[1025]; | ||
88 | const char *psuffix; | ||
89 | const char *colon; | ||
90 | |||
91 | while (! feof (fout)) | ||
92 | { | ||
93 | if (NULL == fgets (line, sizeof (line) - 1, fout)) | ||
94 | break; | ||
95 | if (0 == strlen (line)) | ||
96 | continue; | ||
97 | if ('\n' == line[strlen(line)-1]) | ||
98 | line[strlen(line)-1] = '\0'; | ||
99 | colon = strchr (line, (int) ':'); | ||
100 | if (NULL == colon) | ||
101 | break; | ||
102 | psuffix = colon + 1; | ||
103 | while (isblank ((int) psuffix[0])) | ||
104 | psuffix++; | ||
105 | if (0 == strlen (psuffix)) | ||
106 | continue; | ||
107 | for (i = 0; NULL != tmap[i].text; i++) | ||
108 | { | ||
109 | if (0 != strncasecmp (line, | ||
110 | tmap[i].text, | ||
111 | colon - line)) | ||
112 | continue; | ||
113 | if (0 != proc (proc_cls, | ||
114 | "pdf", | ||
115 | tmap[i].type, | ||
116 | EXTRACTOR_METAFORMAT_UTF8, | ||
117 | "text/plain", | ||
118 | psuffix, | ||
119 | strlen(psuffix) + 1)) | ||
120 | return; | ||
121 | break; | ||
122 | } | ||
123 | } | ||
124 | } | ||
125 | |||
126 | |||
127 | /** | ||
128 | * Main entry method for the PDF extraction plugin. | ||
129 | * | ||
130 | * @param ec extraction context provided to the plugin | ||
131 | */ | ||
132 | void | ||
133 | EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
134 | { | ||
135 | uint64_t fsize; | ||
136 | void *data; | ||
137 | pid_t pid; | ||
138 | int in[2]; | ||
139 | int out[2]; | ||
140 | FILE *fout; | ||
141 | uint64_t pos; | ||
142 | |||
143 | fsize = ec->get_size (ec->cls); | ||
144 | if (fsize < 128) | ||
145 | return; | ||
146 | if (4 != | ||
147 | ec->read (ec->cls, &data, 4)) | ||
148 | return; | ||
149 | if (0 != strncmp ("%PDF", data, 4)) | ||
150 | return; | ||
151 | if (0 != | ||
152 | ec->seek (ec->cls, 0, SEEK_SET)) | ||
153 | return; | ||
154 | if (0 != pipe (in)) | ||
155 | return; | ||
156 | if (0 != pipe (out)) | ||
157 | { | ||
158 | close (in[0]); | ||
159 | close (in[1]); | ||
160 | return; | ||
161 | } | ||
162 | pid = fork (); | ||
163 | if (-1 == pid) | ||
164 | { | ||
165 | close (in[0]); | ||
166 | close (in[1]); | ||
167 | close (out[0]); | ||
168 | close (out[1]); | ||
169 | return; | ||
170 | } | ||
171 | if (0 == pid) | ||
172 | { | ||
173 | char *const args[] = { | ||
174 | "pdfinfo", | ||
175 | "-", | ||
176 | NULL | ||
177 | }; | ||
178 | /* am child, exec 'pdfinfo' */ | ||
179 | close (0); | ||
180 | close (1); | ||
181 | dup2 (in[0], 0); | ||
182 | dup2 (out[1], 1); | ||
183 | close (in[0]); | ||
184 | close (in[1]); | ||
185 | close (out[0]); | ||
186 | close (out[1]); | ||
187 | execvp ("pdfinfo", args); | ||
188 | exit (1); | ||
189 | } | ||
190 | /* am parent, send file */ | ||
191 | close (in[0]); | ||
192 | close (out[1]); | ||
193 | fout = fdopen (out[0], "r"); | ||
194 | |||
195 | pos = 0; | ||
196 | while (pos < fsize) | ||
197 | { | ||
198 | ssize_t got; | ||
199 | size_t wpos; | ||
200 | |||
201 | data = NULL; | ||
202 | got = ec->read (ec->cls, | ||
203 | &data, | ||
204 | fsize - pos); | ||
205 | if ( (-1 == got) || | ||
206 | (NULL == data) ) | ||
207 | break; | ||
208 | wpos = 0; | ||
209 | while (wpos < got) | ||
210 | { | ||
211 | ssize_t out; | ||
212 | |||
213 | out = write (in[1], data + wpos, got - wpos); | ||
214 | if (out <= 0) | ||
215 | break; | ||
216 | wpos += out; | ||
217 | } | ||
218 | if (wpos < got) | ||
219 | break; | ||
220 | pos += got; | ||
221 | } | ||
222 | close (in[1]); | ||
223 | process_stdout (fout, ec->proc, ec->cls); | ||
224 | fclose (fout); | ||
225 | kill (pid, SIGKILL); | ||
226 | waitpid (pid, NULL, 0); | ||
227 | } | ||
228 | |||
229 | /* end of pdf_extractor.c */ | ||