diff options
Diffstat (limited to 'src/plugins/pdf_extractor.c')
-rw-r--r-- | src/plugins/pdf_extractor.c | 229 |
1 files changed, 229 insertions, 0 deletions
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c new file mode 100644 index 0000000..b84981f --- /dev/null +++ b/src/plugins/pdf_extractor.c | |||
@@ -0,0 +1,229 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | Copyright (C) 2016 Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 3, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
18 | Boston, MA 02110-1301, USA. | ||
19 | */ | ||
20 | /** | ||
21 | * @file plugins/pdf_extractor.c | ||
22 | * @brief plugin to support PDF files | ||
23 | * @author Christian Grothoff | ||
24 | * | ||
25 | * PDF libraries today are a nightmare (TM). So instead of doing the | ||
26 | * fast thing and calling some library functions to parse the PDF, | ||
27 | * we execute 'pdfinfo' and parse the output. Because that's 21st | ||
28 | * century plumbing: nobody writes reasonable code anymore. | ||
29 | */ | ||
30 | #include "platform.h" | ||
31 | #include <extractor.h> | ||
32 | #include <sys/types.h> | ||
33 | #include <sys/wait.h> | ||
34 | #include <signal.h> | ||
35 | #include <unistd.h> | ||
36 | |||
37 | /** | ||
38 | * Entry in the mapping from control data to LE types. | ||
39 | */ | ||
40 | struct Matches | ||
41 | { | ||
42 | /** | ||
43 | * Key in the Pdfian control file. | ||
44 | */ | ||
45 | const char *text; | ||
46 | |||
47 | /** | ||
48 | * Corresponding type in LE. | ||
49 | */ | ||
50 | enum EXTRACTOR_MetaType type; | ||
51 | }; | ||
52 | |||
53 | |||
54 | /** | ||
55 | * Map from pdf-control entries to LE types. | ||
56 | * | ||
57 | * See output of 'pdfinfo'. | ||
58 | */ | ||
59 | static struct Matches tmap[] = { | ||
60 | {"Title", EXTRACTOR_METATYPE_TITLE}, | ||
61 | {"Subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
62 | {"Keywords", EXTRACTOR_METATYPE_KEYWORDS}, | ||
63 | {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME}, | ||
64 | {"Creator", EXTRACTOR_METATYPE_CREATOR}, | ||
65 | {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE}, | ||
66 | {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE}, | ||
67 | {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE}, | ||
68 | {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION}, | ||
69 | {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT}, | ||
70 | {NULL, 0} | ||
71 | }; | ||
72 | |||
73 | |||
74 | /** | ||
75 | * Process the "stdout" file from pdfinfo. | ||
76 | * | ||
77 | * @param fout stdout of pdfinfo | ||
78 | * @param proc function to call with meta data | ||
79 | * @param proc_cls closure for @e proc | ||
80 | */ | ||
81 | static void | ||
82 | process_stdout (FILE *fout, | ||
83 | EXTRACTOR_MetaDataProcessor proc, | ||
84 | void *proc_cls) | ||
85 | { | ||
86 | unsigned int i; | ||
87 | char line[1025]; | ||
88 | const char *psuffix; | ||
89 | const char *colon; | ||
90 | |||
91 | while (! feof (fout)) | ||
92 | { | ||
93 | if (NULL == fgets (line, sizeof (line) - 1, fout)) | ||
94 | break; | ||
95 | if (0 == strlen (line)) | ||
96 | continue; | ||
97 | if ('\n' == line[strlen(line)-1]) | ||
98 | line[strlen(line)-1] = '\0'; | ||
99 | colon = strchr (line, (int) ':'); | ||
100 | if (NULL == colon) | ||
101 | break; | ||
102 | psuffix = colon + 1; | ||
103 | while (isblank ((int) psuffix[0])) | ||
104 | psuffix++; | ||
105 | if (0 == strlen (psuffix)) | ||
106 | continue; | ||
107 | for (i = 0; NULL != tmap[i].text; i++) | ||
108 | { | ||
109 | if (0 != strncasecmp (line, | ||
110 | tmap[i].text, | ||
111 | colon - line)) | ||
112 | continue; | ||
113 | if (0 != proc (proc_cls, | ||
114 | "pdf", | ||
115 | tmap[i].type, | ||
116 | EXTRACTOR_METAFORMAT_UTF8, | ||
117 | "text/plain", | ||
118 | psuffix, | ||
119 | strlen(psuffix) + 1)) | ||
120 | return; | ||
121 | break; | ||
122 | } | ||
123 | } | ||
124 | } | ||
125 | |||
126 | |||
127 | /** | ||
128 | * Main entry method for the PDF extraction plugin. | ||
129 | * | ||
130 | * @param ec extraction context provided to the plugin | ||
131 | */ | ||
132 | void | ||
133 | EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
134 | { | ||
135 | uint64_t fsize; | ||
136 | void *data; | ||
137 | pid_t pid; | ||
138 | int in[2]; | ||
139 | int out[2]; | ||
140 | FILE *fout; | ||
141 | uint64_t pos; | ||
142 | |||
143 | fsize = ec->get_size (ec->cls); | ||
144 | if (fsize < 128) | ||
145 | return; | ||
146 | if (4 != | ||
147 | ec->read (ec->cls, &data, 4)) | ||
148 | return; | ||
149 | if (0 != strncmp ("%PDF", data, 4)) | ||
150 | return; | ||
151 | if (0 != | ||
152 | ec->seek (ec->cls, 0, SEEK_SET)) | ||
153 | return; | ||
154 | if (0 != pipe (in)) | ||
155 | return; | ||
156 | if (0 != pipe (out)) | ||
157 | { | ||
158 | close (in[0]); | ||
159 | close (in[1]); | ||
160 | return; | ||
161 | } | ||
162 | pid = fork (); | ||
163 | if (-1 == pid) | ||
164 | { | ||
165 | close (in[0]); | ||
166 | close (in[1]); | ||
167 | close (out[0]); | ||
168 | close (out[1]); | ||
169 | return; | ||
170 | } | ||
171 | if (0 == pid) | ||
172 | { | ||
173 | char *const args[] = { | ||
174 | "pdfinfo", | ||
175 | "-", | ||
176 | NULL | ||
177 | }; | ||
178 | /* am child, exec 'pdfinfo' */ | ||
179 | close (0); | ||
180 | close (1); | ||
181 | dup2 (in[0], 0); | ||
182 | dup2 (out[1], 1); | ||
183 | close (in[0]); | ||
184 | close (in[1]); | ||
185 | close (out[0]); | ||
186 | close (out[1]); | ||
187 | execvp ("pdfinfo", args); | ||
188 | exit (1); | ||
189 | } | ||
190 | /* am parent, send file */ | ||
191 | close (in[0]); | ||
192 | close (out[1]); | ||
193 | fout = fdopen (out[0], "r"); | ||
194 | |||
195 | pos = 0; | ||
196 | while (pos < fsize) | ||
197 | { | ||
198 | ssize_t got; | ||
199 | size_t wpos; | ||
200 | |||
201 | data = NULL; | ||
202 | got = ec->read (ec->cls, | ||
203 | &data, | ||
204 | fsize - pos); | ||
205 | if ( (-1 == got) || | ||
206 | (NULL == data) ) | ||
207 | break; | ||
208 | wpos = 0; | ||
209 | while (wpos < got) | ||
210 | { | ||
211 | ssize_t out; | ||
212 | |||
213 | out = write (in[1], data + wpos, got - wpos); | ||
214 | if (out <= 0) | ||
215 | break; | ||
216 | wpos += out; | ||
217 | } | ||
218 | if (wpos < got) | ||
219 | break; | ||
220 | pos += got; | ||
221 | } | ||
222 | close (in[1]); | ||
223 | process_stdout (fout, ec->proc, ec->cls); | ||
224 | fclose (fout); | ||
225 | kill (pid, SIGKILL); | ||
226 | waitpid (pid, NULL, 0); | ||
227 | } | ||
228 | |||
229 | /* end of pdf_extractor.c */ | ||