aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/dvi_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/dvi_extractor.c')
-rw-r--r--src/plugins/dvi_extractor.c315
1 files changed, 315 insertions, 0 deletions
diff --git a/src/plugins/dvi_extractor.c b/src/plugins/dvi_extractor.c
new file mode 100644
index 0000000..f13e695
--- /dev/null
+++ b/src/plugins/dvi_extractor.c
@@ -0,0 +1,315 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19 */
20/**
21 * @file plugins/dvi_extractor.c
22 * @brief plugin to support DVI files (from LaTeX)
23 * @author Christian Grothoff
24 */
25#include "platform.h"
26#include "extractor.h"
27
28
29/**
30 * Pair of a PostScipt prefix and the corresponding LE type.
31 */
32struct Matches
33{
34 /**
35 * Prefix in the PS map.
36 */
37 const char *text;
38
39 /**
40 * Corresponding LE type.
41 */
42 enum EXTRACTOR_MetaType type;
43};
44
45
46/**
47 * Map from PS names to LE types.
48 */
49static struct Matches tmap[] = {
50 { "/Title (", EXTRACTOR_METATYPE_TITLE },
51 { "/Subject (", EXTRACTOR_METATYPE_SUBJECT },
52 { "/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME },
53 { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS },
54 { "/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
55 { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
56 { NULL, 0 }
57};
58
59
60/**
61 * Parse a "ZZZ" tag. Specifically, the data may contain a
62 * postscript dictionary with metadata.
63 *
64 * @param data overall input stream
65 * @param pos where in data is the zzz data
66 * @param len how many bytes from 'pos' does the zzz data extend?
67 * @param proc function to call with meta data found
68 * @param proc_cls closure for proc
69 * @return 0 to continue to extract, 1 to stop
70 */
71static int
72parseZZZ (const char *data,
73 size_t pos, size_t len,
74 EXTRACTOR_MetaDataProcessor proc,
75 void *proc_cls)
76{
77 size_t slen;
78 size_t end;
79 unsigned int i;
80
81 end = pos + len;
82 slen = strlen ("ps:SDict begin [");
83 if ( (len <= slen) ||
84 (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) )
85 return 0;
86 pos += slen;
87 while (pos < end)
88 {
89 for (i = 0; NULL != tmap[i].text; i++)
90 {
91 slen = strlen (tmap[i].text);
92 if ( (pos + slen > end) ||
93 (0 != strncmp (&data[pos], tmap[i].text, slen)) )
94 continue;
95 pos += slen;
96 slen = pos;
97 while ((slen < end) && (data[slen] != ')'))
98 slen++;
99 slen = slen - pos;
100 {
101 char value[slen + 1];
102
103 value[slen] = '\0';
104 memcpy (value, &data[pos], slen);
105 if (0 != proc (proc_cls,
106 "dvi",
107 tmap[i].type,
108 EXTRACTOR_METAFORMAT_C_STRING,
109 "text/plain",
110 value,
111 slen + 1))
112 return 1;
113 }
114 pos += slen + 1;
115 break;
116 }
117 pos++;
118 }
119 return 0;
120}
121
122
/**
 * Decode the 4 bytes at 'data' as an unsigned big-endian integer.
 * The bytes are combined one at a time, so 'data' needs no
 * particular alignment.
 *
 * @param data pointer to the first of the 4 bytes
 * @return the decoded 32-bit value in host byte order
 */
static uint32_t
getIntAt (const void *data)
{
  const unsigned char *bytes = data;

  return ((uint32_t) bytes[0] << 24)
    | ((uint32_t) bytes[1] << 16)
    | ((uint32_t) bytes[2] << 8)
    | (uint32_t) bytes[3];
}
137
138
/**
 * Decode the 2 bytes at 'data' as an unsigned big-endian integer.
 * The bytes are combined one at a time, so 'data' needs no
 * particular alignment.
 *
 * @param data pointer to the first of the 2 bytes
 * @return the decoded 16-bit value in host byte order
 */
static uint16_t
getShortAt (const void *data)
{
  const unsigned char *bytes = data;
  unsigned int high = bytes[0];
  unsigned int low = bytes[1];

  return (uint16_t) ((high << 8) | low);
}
153
154
155/**
156 * Main entry method for the 'application/x-dvi' extraction plugin.
157 *
158 * @param ec extraction context provided to the plugin
159 */
160void
161EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec)
162{
163 unsigned int klen;
164 uint32_t pos;
165 uint32_t opos;
166 unsigned int len;
167 unsigned int pageCount;
168 char pages[16];
169 void *buf;
170 unsigned char *data;
171 uint64_t size;
172 uint64_t off;
173 ssize_t iret;
174
175 if (40 >= (iret = ec->read (ec->cls, &buf, 1024)))
176 return;
177 data = buf;
178 if ((data[0] != 247) || (data[1] != 2))
179 return; /* cannot be DVI or unsupported version */
180 klen = data[14];
181 size = ec->get_size (ec->cls);
182 if (size > 16 * 1024 * 1024)
183 return; /* too large */
184 if (NULL == (data = malloc ((size_t) size)))
185 return; /* out of memory */
186 memcpy (data, buf, iret);
187 off = iret;
188 while (off < size)
189 {
190 if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024)))
191 {
192 free (data);
193 return;
194 }
195 memcpy (&data[off], buf, iret);
196 off += iret;
197 }
198 pos = size - 1;
199 while ((223 == data[pos]) && (pos > 0))
200 pos--;
201 if ((2 != data[pos]) || (pos < 40))
202 goto CLEANUP;
203 pos--;
204 pos -= 4;
205 /* assert pos at 'post_post tag' */
206 if (data[pos] != 249)
207 goto CLEANUP;
208 opos = pos;
209 pos = getIntAt (&data[opos + 1]);
210 if (pos + 25 > size)
211 goto CLEANUP;
212 /* assert pos at 'post' command */
213 if (data[pos] != 248)
214 goto CLEANUP;
215 pageCount = 0;
216 opos = pos;
217 pos = getIntAt (&data[opos + 1]);
218 while (1)
219 {
220 if (UINT32_MAX == pos)
221 break;
222 if (pos + 45 > size)
223 goto CLEANUP;
224 if (data[pos] != 139) /* expect 'bop' */
225 goto CLEANUP;
226 pageCount++;
227 opos = pos;
228 pos = getIntAt (&data[opos + 41]);
229 if (UINT32_MAX == pos)
230 break;
231 if (pos >= opos)
232 goto CLEANUP; /* invalid! */
233 }
234 /* ok, now we believe it's a dvi... */
235 snprintf (pages,
236 sizeof (pages),
237 "%u",
238 pageCount);
239 if (0 != ec->proc (ec->cls,
240 "dvi",
241 EXTRACTOR_METATYPE_PAGE_COUNT,
242 EXTRACTOR_METAFORMAT_UTF8,
243 "text/plain",
244 pages,
245 strlen (pages) + 1))
246 goto CLEANUP;
247 if (0 != ec->proc (ec->cls,
248 "dvi",
249 EXTRACTOR_METATYPE_MIMETYPE,
250 EXTRACTOR_METAFORMAT_UTF8,
251 "text/plain",
252 "application/x-dvi",
253 strlen ("application/x-dvi") + 1))
254 goto CLEANUP;
255 {
256 char comment[klen + 1];
257
258 comment[klen] = '\0';
259 memcpy (comment, &data[15], klen);
260 if (0 != ec->proc (ec->cls,
261 "dvi",
262 EXTRACTOR_METATYPE_COMMENT,
263 EXTRACTOR_METAFORMAT_C_STRING,
264 "text/plain",
265 comment,
266 klen + 1))
267 goto CLEANUP;
268 }
269 /* try to find PDF/ps special */
270 pos = opos;
271 while (pos < size - 100)
272 {
273 switch (data[pos])
274 {
275 case 139: /* begin page 'bop', we typically have to skip that one to
276 find the zzz's */
277 pos += 45; /* skip bop */
278 break;
279 case 239: /* zzz1 */
280 len = data[pos + 1];
281 if (pos + 2 + len < size)
282 if (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc, ec->cls))
283 goto CLEANUP;
284 pos += len + 2;
285 break;
286 case 240: /* zzz2 */
287 len = getShortAt (&data[pos + 1]);
288 if (pos + 3 + len < size)
289 if (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc, ec->cls))
290 goto CLEANUP;
291 pos += len + 3;
292 break;
293 case 241: /* zzz3, who uses that? */
294 len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3];
295 if (pos + 4 + len < size)
296 if (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc, ec->cls))
297 goto CLEANUP;
298 pos += len + 4;
299 break;
300 case 242: /* zzz4, hurray! */
301 len = getIntAt (&data[pos + 1]);
302 if (pos + 1 + len < size)
303 if (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc, ec->cls))
304 goto CLEANUP;
305 pos += len + 5;
306 break;
307 default: /* unsupported opcode, abort scan */
308 goto CLEANUP;
309 }
310 }
311 CLEANUP:
312 free (data);
313}
314
315/* end of dvi_extractor.c */