diff options
Diffstat (limited to 'src/plugins/dvi_extractor.c')
-rw-r--r-- | src/plugins/dvi_extractor.c | 315 |
1 files changed, 315 insertions, 0 deletions
diff --git a/src/plugins/dvi_extractor.c b/src/plugins/dvi_extractor.c new file mode 100644 index 0000000..f13e695 --- /dev/null +++ b/src/plugins/dvi_extractor.c | |||
@@ -0,0 +1,315 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | (C) 2002, 2003, 2004, 2012 Vidyut Samanta and Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 3, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | */ | ||
20 | /** | ||
21 | * @file plugins/dvi_extractor.c | ||
22 | * @brief plugin to support DVI files (from LaTeX) | ||
23 | * @author Christian Grothoff | ||
24 | */ | ||
25 | #include "platform.h" | ||
26 | #include "extractor.h" | ||
27 | |||
28 | |||
29 | /** | ||
30 | * Pair of a PostScipt prefix and the corresponding LE type. | ||
31 | */ | ||
32 | struct Matches | ||
33 | { | ||
34 | /** | ||
35 | * Prefix in the PS map. | ||
36 | */ | ||
37 | const char *text; | ||
38 | |||
39 | /** | ||
40 | * Corresponding LE type. | ||
41 | */ | ||
42 | enum EXTRACTOR_MetaType type; | ||
43 | }; | ||
44 | |||
45 | |||
46 | /** | ||
47 | * Map from PS names to LE types. | ||
48 | */ | ||
49 | static struct Matches tmap[] = { | ||
50 | { "/Title (", EXTRACTOR_METATYPE_TITLE }, | ||
51 | { "/Subject (", EXTRACTOR_METATYPE_SUBJECT }, | ||
52 | { "/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
53 | { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS }, | ||
54 | { "/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | ||
55 | { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, | ||
56 | { NULL, 0 } | ||
57 | }; | ||
58 | |||
59 | |||
60 | /** | ||
61 | * Parse a "ZZZ" tag. Specifically, the data may contain a | ||
62 | * postscript dictionary with metadata. | ||
63 | * | ||
64 | * @param data overall input stream | ||
65 | * @param pos where in data is the zzz data | ||
66 | * @param len how many bytes from 'pos' does the zzz data extend? | ||
67 | * @param proc function to call with meta data found | ||
68 | * @param proc_cls closure for proc | ||
69 | * @return 0 to continue to extract, 1 to stop | ||
70 | */ | ||
71 | static int | ||
72 | parseZZZ (const char *data, | ||
73 | size_t pos, size_t len, | ||
74 | EXTRACTOR_MetaDataProcessor proc, | ||
75 | void *proc_cls) | ||
76 | { | ||
77 | size_t slen; | ||
78 | size_t end; | ||
79 | unsigned int i; | ||
80 | |||
81 | end = pos + len; | ||
82 | slen = strlen ("ps:SDict begin ["); | ||
83 | if ( (len <= slen) || | ||
84 | (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) ) | ||
85 | return 0; | ||
86 | pos += slen; | ||
87 | while (pos < end) | ||
88 | { | ||
89 | for (i = 0; NULL != tmap[i].text; i++) | ||
90 | { | ||
91 | slen = strlen (tmap[i].text); | ||
92 | if ( (pos + slen > end) || | ||
93 | (0 != strncmp (&data[pos], tmap[i].text, slen)) ) | ||
94 | continue; | ||
95 | pos += slen; | ||
96 | slen = pos; | ||
97 | while ((slen < end) && (data[slen] != ')')) | ||
98 | slen++; | ||
99 | slen = slen - pos; | ||
100 | { | ||
101 | char value[slen + 1]; | ||
102 | |||
103 | value[slen] = '\0'; | ||
104 | memcpy (value, &data[pos], slen); | ||
105 | if (0 != proc (proc_cls, | ||
106 | "dvi", | ||
107 | tmap[i].type, | ||
108 | EXTRACTOR_METAFORMAT_C_STRING, | ||
109 | "text/plain", | ||
110 | value, | ||
111 | slen + 1)) | ||
112 | return 1; | ||
113 | } | ||
114 | pos += slen + 1; | ||
115 | break; | ||
116 | } | ||
117 | pos++; | ||
118 | } | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | |||
/**
 * Decode the big-endian 32-bit unsigned integer stored at 'data'.
 *
 * @param data address of the four bytes to decode (no alignment needed)
 * @return the decoded value in host byte order
 */
static uint32_t
getIntAt (const void *data)
{
  const unsigned char *bytes = data;

  /* assemble byte by byte, most significant first */
  return ((uint32_t) bytes[0] << 24)
         | ((uint32_t) bytes[1] << 16)
         | ((uint32_t) bytes[2] << 8)
         | (uint32_t) bytes[3];
}
137 | |||
138 | |||
/**
 * Decode the big-endian 16-bit unsigned integer stored at 'data'.
 *
 * @param data address of the two bytes to decode (no alignment needed)
 * @return the decoded value in host byte order
 */
static uint16_t
getShortAt (const void *data)
{
  const unsigned char *bytes = data;

  /* high byte first, then low byte */
  return (uint16_t) (((unsigned int) bytes[0] << 8) | bytes[1]);
}
153 | |||
154 | |||
155 | /** | ||
156 | * Main entry method for the 'application/x-dvi' extraction plugin. | ||
157 | * | ||
158 | * @param ec extraction context provided to the plugin | ||
159 | */ | ||
160 | void | ||
161 | EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
162 | { | ||
163 | unsigned int klen; | ||
164 | uint32_t pos; | ||
165 | uint32_t opos; | ||
166 | unsigned int len; | ||
167 | unsigned int pageCount; | ||
168 | char pages[16]; | ||
169 | void *buf; | ||
170 | unsigned char *data; | ||
171 | uint64_t size; | ||
172 | uint64_t off; | ||
173 | ssize_t iret; | ||
174 | |||
175 | if (40 >= (iret = ec->read (ec->cls, &buf, 1024))) | ||
176 | return; | ||
177 | data = buf; | ||
178 | if ((data[0] != 247) || (data[1] != 2)) | ||
179 | return; /* cannot be DVI or unsupported version */ | ||
180 | klen = data[14]; | ||
181 | size = ec->get_size (ec->cls); | ||
182 | if (size > 16 * 1024 * 1024) | ||
183 | return; /* too large */ | ||
184 | if (NULL == (data = malloc ((size_t) size))) | ||
185 | return; /* out of memory */ | ||
186 | memcpy (data, buf, iret); | ||
187 | off = iret; | ||
188 | while (off < size) | ||
189 | { | ||
190 | if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024))) | ||
191 | { | ||
192 | free (data); | ||
193 | return; | ||
194 | } | ||
195 | memcpy (&data[off], buf, iret); | ||
196 | off += iret; | ||
197 | } | ||
198 | pos = size - 1; | ||
199 | while ((223 == data[pos]) && (pos > 0)) | ||
200 | pos--; | ||
201 | if ((2 != data[pos]) || (pos < 40)) | ||
202 | goto CLEANUP; | ||
203 | pos--; | ||
204 | pos -= 4; | ||
205 | /* assert pos at 'post_post tag' */ | ||
206 | if (data[pos] != 249) | ||
207 | goto CLEANUP; | ||
208 | opos = pos; | ||
209 | pos = getIntAt (&data[opos + 1]); | ||
210 | if (pos + 25 > size) | ||
211 | goto CLEANUP; | ||
212 | /* assert pos at 'post' command */ | ||
213 | if (data[pos] != 248) | ||
214 | goto CLEANUP; | ||
215 | pageCount = 0; | ||
216 | opos = pos; | ||
217 | pos = getIntAt (&data[opos + 1]); | ||
218 | while (1) | ||
219 | { | ||
220 | if (UINT32_MAX == pos) | ||
221 | break; | ||
222 | if (pos + 45 > size) | ||
223 | goto CLEANUP; | ||
224 | if (data[pos] != 139) /* expect 'bop' */ | ||
225 | goto CLEANUP; | ||
226 | pageCount++; | ||
227 | opos = pos; | ||
228 | pos = getIntAt (&data[opos + 41]); | ||
229 | if (UINT32_MAX == pos) | ||
230 | break; | ||
231 | if (pos >= opos) | ||
232 | goto CLEANUP; /* invalid! */ | ||
233 | } | ||
234 | /* ok, now we believe it's a dvi... */ | ||
235 | snprintf (pages, | ||
236 | sizeof (pages), | ||
237 | "%u", | ||
238 | pageCount); | ||
239 | if (0 != ec->proc (ec->cls, | ||
240 | "dvi", | ||
241 | EXTRACTOR_METATYPE_PAGE_COUNT, | ||
242 | EXTRACTOR_METAFORMAT_UTF8, | ||
243 | "text/plain", | ||
244 | pages, | ||
245 | strlen (pages) + 1)) | ||
246 | goto CLEANUP; | ||
247 | if (0 != ec->proc (ec->cls, | ||
248 | "dvi", | ||
249 | EXTRACTOR_METATYPE_MIMETYPE, | ||
250 | EXTRACTOR_METAFORMAT_UTF8, | ||
251 | "text/plain", | ||
252 | "application/x-dvi", | ||
253 | strlen ("application/x-dvi") + 1)) | ||
254 | goto CLEANUP; | ||
255 | { | ||
256 | char comment[klen + 1]; | ||
257 | |||
258 | comment[klen] = '\0'; | ||
259 | memcpy (comment, &data[15], klen); | ||
260 | if (0 != ec->proc (ec->cls, | ||
261 | "dvi", | ||
262 | EXTRACTOR_METATYPE_COMMENT, | ||
263 | EXTRACTOR_METAFORMAT_C_STRING, | ||
264 | "text/plain", | ||
265 | comment, | ||
266 | klen + 1)) | ||
267 | goto CLEANUP; | ||
268 | } | ||
269 | /* try to find PDF/ps special */ | ||
270 | pos = opos; | ||
271 | while (pos < size - 100) | ||
272 | { | ||
273 | switch (data[pos]) | ||
274 | { | ||
275 | case 139: /* begin page 'bop', we typically have to skip that one to | ||
276 | find the zzz's */ | ||
277 | pos += 45; /* skip bop */ | ||
278 | break; | ||
279 | case 239: /* zzz1 */ | ||
280 | len = data[pos + 1]; | ||
281 | if (pos + 2 + len < size) | ||
282 | if (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc, ec->cls)) | ||
283 | goto CLEANUP; | ||
284 | pos += len + 2; | ||
285 | break; | ||
286 | case 240: /* zzz2 */ | ||
287 | len = getShortAt (&data[pos + 1]); | ||
288 | if (pos + 3 + len < size) | ||
289 | if (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc, ec->cls)) | ||
290 | goto CLEANUP; | ||
291 | pos += len + 3; | ||
292 | break; | ||
293 | case 241: /* zzz3, who uses that? */ | ||
294 | len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3]; | ||
295 | if (pos + 4 + len < size) | ||
296 | if (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc, ec->cls)) | ||
297 | goto CLEANUP; | ||
298 | pos += len + 4; | ||
299 | break; | ||
300 | case 242: /* zzz4, hurray! */ | ||
301 | len = getIntAt (&data[pos + 1]); | ||
302 | if (pos + 1 + len < size) | ||
303 | if (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc, ec->cls)) | ||
304 | goto CLEANUP; | ||
305 | pos += len + 5; | ||
306 | break; | ||
307 | default: /* unsupported opcode, abort scan */ | ||
308 | goto CLEANUP; | ||
309 | } | ||
310 | } | ||
311 | CLEANUP: | ||
312 | free (data); | ||
313 | } | ||
314 | |||
315 | /* end of dvi_extractor.c */ | ||