diff options
Diffstat (limited to 'src/plugins/zip_extractor.c')
-rw-r--r-- | src/plugins/zip_extractor.c | 410 |
1 files changed, 410 insertions, 0 deletions
diff --git a/src/plugins/zip_extractor.c b/src/plugins/zip_extractor.c new file mode 100644 index 0000000..c7fef95 --- /dev/null +++ b/src/plugins/zip_extractor.c | |||
@@ -0,0 +1,410 @@ | |||
1 | /** | ||
2 | zipextractor.c version 0.0.2 | ||
3 | |||
4 | Changes from 0.0.1 to 0.0.2 | ||
5 | -> Searches for central dir struct from end of file if this is a self-extracting executable | ||
6 | |||
7 | |||
8 | This file was based on mp3extractor.c (0.1.2) | ||
9 | |||
10 | Currently, this only returns a list of the filenames within a zipfile | ||
11 | and any comments on each file or the whole file itself. File sizes, | ||
12 | modification times, and crc's are currently ignored. | ||
13 | |||
14 | TODO: Break the comments up into small, atomic, searchable chunks (keywords) | ||
15 | - might need some knowledge of English? | ||
16 | |||
17 | It returns: | ||
18 | |||
19 | one EXTRACTOR_MIMETYPE | ||
20 | multiple EXTRACTOR_FILENAME | ||
21 | multiple EXTRACTOR_COMMENT | ||
22 | |||
23 | ... from a .ZIP file | ||
24 | |||
25 | TODO: EXTRACTOR_DATE, EXTRACTOR_DESCRIPTION, EXTRACTOR_KEYWORDS, others? | ||
26 | |||
27 | Does NOT test data integrity (CRCs etc.) | ||
28 | |||
29 | This version is not recursive (i.e. doesn't look inside zip | ||
30 | files within zip files) | ||
31 | |||
32 | TODO: Run extract on files inside of archive (?) (i.e. gif, mp3, etc.) | ||
33 | |||
34 | The current .ZIP format description: | ||
35 | ftp://ftp.pkware.com/appnote.zip | ||
36 | |||
37 | No Copyright 2003 Julia Wolf | ||
38 | |||
39 | */ | ||
40 | |||
41 | /* | ||
42 | * This file is part of libextractor. | ||
43 | * (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff | ||
44 | * | ||
45 | * libextractor is free software; you can redistribute it and/or modify | ||
46 | * it under the terms of the GNU General Public License as published | ||
47 | * by the Free Software Foundation; either version 2, or (at your | ||
48 | * option) any later version. | ||
49 | * | ||
50 | * libextractor is distributed in the hope that it will be useful, but | ||
51 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
52 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
53 | * General Public License for more details. | ||
54 | * | ||
55 | * You should have received a copy of the GNU General Public License | ||
56 | * along with libextractor; see the file COPYING. If not, write to the | ||
57 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
58 | * Boston, MA 02111-1307, USA. | ||
59 | */ | ||
60 | |||
61 | #include "platform.h" | ||
62 | #include "extractor.h" | ||
63 | |||
64 | #define DEBUG_EXTRACT_ZIP 0 | ||
65 | |||
66 | /* In a zipfile there are two kinds of comments. One is a big one for the | ||
67 | entire .zip, it's usually a BBS ad. The other is a small comment on each | ||
68 | individual file; most people don't use this. | ||
69 | */ | ||
70 | |||
/* One entry of the central directory, kept in a singly linked list.
 *
 * filename and comment are heap-allocated, NUL-terminated copies of the
 * (unterminated) strings stored in the archive; next is NULL on the last
 * element.
 *
 * TODO: zip_entry linked list is handled kinda messily, should clean up (maybe)
 */
typedef struct zip_entry
{
  char *filename;               /* name of the archived file */
  char *comment;                /* per-file comment, "" if there is none */
  struct zip_entry *next;       /* following entry, or NULL */
} zip_entry;
78 | |||
79 | /* mimetype = application/zip */ | ||
80 | int | ||
81 | EXTRACTOR_zip_extract (const unsigned char *data, | ||
82 | size_t size, | ||
83 | EXTRACTOR_MetaDataProcessor proc, | ||
84 | void *proc_cls, | ||
85 | const char *options) | ||
86 | { | ||
87 | int ret; | ||
88 | void *tmp; | ||
89 | zip_entry * info; | ||
90 | zip_entry * start; | ||
91 | char *filecomment = NULL; | ||
92 | const unsigned char *pos; | ||
93 | unsigned int offset, stop; | ||
94 | unsigned int name_length, extra_length, comment_length; | ||
95 | unsigned int filecomment_length; | ||
96 | unsigned int entry_total, entry_count; | ||
97 | |||
98 | /* I think the smallest zipfile you can have is about 120 bytes */ | ||
99 | if ((NULL == data) || (size < 100)) | ||
100 | return 0; | ||
101 | if (! (('P' == data[0]) && ('K' == data[1]) && (0x03 == data[2]) | ||
102 | && (0x04 == data[3]))) | ||
103 | return 0; | ||
104 | |||
105 | /* The filenames for each file in a zipfile are stored in two locations. | ||
106 | * There is one at the start of each entry, just before the compressed data, | ||
107 | * and another at the end in a 'central directory structure'. | ||
108 | * | ||
109 | * In order to catch self-extracting executables, we scan backwards from the end | ||
110 | * of the file looking for the central directory structure. The previous version | ||
111 | * of this went forewards through the local headers, but that only works for plain | ||
112 | * vanilla zip's and I don't feel like writing a special case for each of the dozen | ||
113 | * self-extracting executable stubs. | ||
114 | * | ||
115 | * This assumes that the zip file is considered to be non-corrupt/non-truncated. | ||
116 | * If it is truncated then it's not considered to be a zip and skipped. | ||
117 | * | ||
118 | */ | ||
119 | |||
120 | /* From appnote.iz and appnote.txt (more or less) | ||
121 | * | ||
122 | * (this is why you always need to put in the last floppy if you span disks) | ||
123 | * | ||
124 | * 0- 3 end of central dir signature 4 bytes (0x06054b50) P K ^E ^F | ||
125 | * 4- 5 number of this disk 2 bytes | ||
126 | * 6- 7 number of the disk with the | ||
127 | * start of the central directory 2 bytes | ||
128 | * 8- 9 total number of entries in | ||
129 | * the central dir on this disk 2 bytes | ||
130 | * 10-11 total number of entries in | ||
131 | * the central dir 2 bytes | ||
132 | * 12-15 size of the central directory 4 bytes | ||
133 | * 16-19 offset of start of central | ||
134 | * directory with respect to | ||
135 | * the starting disk number 4 bytes | ||
136 | * 20-21 zipfile comment length 2 bytes | ||
137 | * 22-?? zipfile comment (variable size) max length 65536 bytes | ||
138 | */ | ||
139 | |||
140 | /* the signature can't be more than 22 bytes from the end */ | ||
141 | offset = size - 22; | ||
142 | pos = &data[offset]; | ||
143 | stop = 0; | ||
144 | if (((signed int) size - 65556) > 0) | ||
145 | stop = size - 65556; | ||
146 | |||
147 | /* not using int 0x06054b50 so that we don't have to deal with endianess issues. | ||
148 | break out if we go more than 64K backwards and havn't found it, or if we hit the | ||
149 | begining of the file. */ | ||
150 | while ((!(('P' == pos[0]) && ('K' == pos[1]) && (0x05 == pos[2]) | ||
151 | && (0x06 == pos[3]))) && (offset > stop)) | ||
152 | pos = &data[offset--]; | ||
153 | if (offset == stop) | ||
154 | { | ||
155 | |||
156 | #if DEBUG_EXTRACT_ZIP | ||
157 | fprintf (stderr, | ||
158 | "Did not find end of central directory structure signature. offset: %i\n", | ||
159 | offset); | ||
160 | |||
161 | #endif /* */ | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | /* offset should now point to the start of the end-of-central directory structure */ | ||
166 | /* and pos[0] should be pointing there too */ | ||
167 | /* so slurp down filecomment while here... */ | ||
168 | filecomment_length = pos[20] + (pos[21] << 8); | ||
169 | if (filecomment_length + offset + 22 > size) | ||
170 | { | ||
171 | return 0; /* invalid zip file format! */ | ||
172 | } | ||
173 | filecomment = NULL; | ||
174 | if (filecomment_length > 0) | ||
175 | { | ||
176 | filecomment = malloc (filecomment_length + 1); | ||
177 | memcpy (filecomment, &pos[22], filecomment_length); | ||
178 | filecomment[filecomment_length] = '\0'; | ||
179 | } | ||
180 | if ((0 != pos[4]) && (0 != pos[5])) | ||
181 | { | ||
182 | |||
183 | #if DEBUG_EXTRACT_ZIP | ||
184 | fprintf (stderr, | ||
185 | "WARNING: This seems to be the last disk in a multi-volume" | ||
186 | " ZIP archive, and so this might not work.\n"); | ||
187 | |||
188 | #endif /* */ | ||
189 | } | ||
190 | if ((pos[8] != pos[10]) && (pos[9] != pos[11])) | ||
191 | { | ||
192 | |||
193 | #if DEBUG_EXTRACT_ZIP | ||
194 | fprintf (stderr, | ||
195 | "WARNING: May not be able to find all the files in this" | ||
196 | " ZIP archive (no multi-volume support right now).\n"); | ||
197 | |||
198 | #endif /* */ | ||
199 | } | ||
200 | entry_total = pos[10] + (pos[11] << 8); | ||
201 | entry_count = 0; | ||
202 | |||
203 | /* jump to start of central directory, ASSUMING that the starting disk that it's on is disk 0 */ | ||
204 | /* starting disk would otherwise be pos[6]+pos[7]<<8 */ | ||
205 | offset = pos[16] + (pos[17] << 8) + (pos[18] << 16) + (pos[19] << 24); /* offset of cent-dir from start of disk 0 */ | ||
206 | |||
207 | /* stop = pos[12] + (pos[13]<<8) + (pos[14]<<16) + (pos[15]<<24); *//* length of central dir */ | ||
208 | if (offset + 46 > size) | ||
209 | { | ||
210 | |||
211 | /* not a zip */ | ||
212 | if (filecomment != NULL) | ||
213 | free (filecomment); | ||
214 | return 0; | ||
215 | } | ||
216 | pos = &data[offset]; /* jump */ | ||
217 | |||
218 | /* we should now be at the begining of the central directory structure */ | ||
219 | |||
220 | /* from appnote.txt and appnote.iz (mostly) | ||
221 | * | ||
222 | * 0- 3 central file header signature 4 bytes (0x02014b50) | ||
223 | * 4- 5 version made by 2 bytes | ||
224 | * 6- 7 version needed to extract 2 bytes | ||
225 | * 8- 9 general purpose bit flag 2 bytes | ||
226 | * 10-11 compression method 2 bytes | ||
227 | * 12-13 last mod file time 2 bytes | ||
228 | * 14-15 last mod file date 2 bytes | ||
229 | * 16-19 crc-32 4 bytes | ||
230 | * 20-23 compressed size 4 bytes | ||
231 | * 24-27 uncompressed size 4 bytes | ||
232 | * 28-29 filename length 2 bytes | ||
233 | * 30-31 extra field length 2 bytes | ||
234 | * 32-33 file comment length 2 bytes | ||
235 | * 34-35 disk number start 2 bytes | ||
236 | * 36-37 internal file attributes 2 bytes | ||
237 | * 38-41 external file attributes 4 bytes | ||
238 | * 42-45 relative offset of local header 4 bytes | ||
239 | * | ||
240 | * 46-?? filename (variable size) | ||
241 | * ?- ? extra field (variable size) | ||
242 | * ?- ? file comment (variable size) | ||
243 | */ | ||
244 | if (!(('P' == pos[0]) && ('K' == pos[1]) && (0x01 == pos[2]) | ||
245 | && (0x02 == pos[3]))) | ||
246 | { | ||
247 | |||
248 | #if DEBUG_EXTRACT_ZIP | ||
249 | fprintf (stderr, | ||
250 | "Did not find central directory structure signature. offset: %i\n", | ||
251 | offset); | ||
252 | |||
253 | #endif | ||
254 | if (filecomment != NULL) | ||
255 | free (filecomment); | ||
256 | return 0; | ||
257 | } | ||
258 | start = NULL; | ||
259 | info = NULL; | ||
260 | |||
261 | do | ||
262 | { /* while ( (0x01==pos[2])&&(0x02==pos[3]) ) */ | ||
263 | entry_count++; /* check to make sure we found everything at the end */ | ||
264 | name_length = pos[28] + (pos[29] << 8); | ||
265 | extra_length = pos[30] + (pos[31] << 8); | ||
266 | comment_length = pos[32] + (pos[33] << 8); | ||
267 | if (name_length + extra_length + comment_length + offset + 46 > size) | ||
268 | { | ||
269 | |||
270 | /* ok, invalid, abort! */ | ||
271 | break; | ||
272 | } | ||
273 | |||
274 | #if DEBUG_EXTRACT_ZIP | ||
275 | fprintf (stderr, "Found filename length %i Comment length: %i\n", | ||
276 | name_length, comment_length); | ||
277 | |||
278 | #endif /* */ | ||
279 | |||
280 | /* yay, finally get filenames */ | ||
281 | if (start == NULL) | ||
282 | { | ||
283 | start = malloc (sizeof (zip_entry)); | ||
284 | start->next = NULL; | ||
285 | info = start; | ||
286 | } | ||
287 | else | ||
288 | { | ||
289 | info->next = malloc (sizeof (zip_entry)); | ||
290 | info = info->next; | ||
291 | info->next = NULL; | ||
292 | } | ||
293 | info->filename = malloc (name_length + 1); | ||
294 | info->comment = malloc (comment_length + 1); | ||
295 | |||
296 | /* (strings in zip files are not null terminated) */ | ||
297 | memcpy (info->filename, &pos[46], name_length); | ||
298 | info->filename[name_length] = '\0'; | ||
299 | memcpy (info->comment, &pos[46 + name_length + extra_length], | ||
300 | comment_length); | ||
301 | info->comment[comment_length] = '\0'; | ||
302 | |||
303 | #if DEBUG_EXTRACT_ZIP | ||
304 | fprintf (stderr, "Found file %s, Comment: %s\n", info->filename, | ||
305 | info->comment); | ||
306 | |||
307 | #endif | ||
308 | offset += 46 + name_length + extra_length + comment_length; | ||
309 | pos = &data[offset]; | ||
310 | |||
311 | /* check for next header entry (0x02014b50) or (0x06054b50) if at end */ | ||
312 | if (('P' != pos[0]) && ('K' != pos[1])) | ||
313 | { | ||
314 | |||
315 | #if DEBUG_EXTRACT_ZIP | ||
316 | fprintf (stderr, | ||
317 | "Did not find next header in central directory.\n"); | ||
318 | |||
319 | #endif | ||
320 | info = start; | ||
321 | while (info != NULL) | ||
322 | { | ||
323 | start = info->next; | ||
324 | free (info->filename); | ||
325 | free (info->comment); | ||
326 | free (info); | ||
327 | info = start; | ||
328 | } | ||
329 | if (filecomment != NULL) | ||
330 | free (filecomment); | ||
331 | return 0; | ||
332 | } | ||
333 | } | ||
334 | while ((0x01 == pos[2]) && (0x02 == pos[3])); | ||
335 | |||
336 | /* end list */ | ||
337 | |||
338 | /* TODO: should this return an error? indicates corrupt zipfile (or | ||
339 | disk missing in middle of multi-disk)? */ | ||
340 | if (entry_count != entry_total) | ||
341 | { | ||
342 | |||
343 | #if DEBUG_EXTRACT_ZIP | ||
344 | fprintf (stderr, | ||
345 | "WARNING: Did not find all of the zipfile entries that we should have.\n"); | ||
346 | |||
347 | #endif /* */ | ||
348 | } | ||
349 | |||
350 | ret = proc (proc_cls, | ||
351 | "zip", | ||
352 | EXTRACTOR_METATYPE_MIMETYPE, | ||
353 | EXTRACTOR_METAFORMAT_UTF8, | ||
354 | "text/plain", | ||
355 | "application/zip", | ||
356 | strlen ("application/zip")+1); | ||
357 | if ( (filecomment != NULL) && (ret != 0) ) | ||
358 | { | ||
359 | ret = proc (proc_cls, | ||
360 | "zip", | ||
361 | EXTRACTOR_METATYPE_MIMETYPE, | ||
362 | EXTRACTOR_METAFORMAT_UTF8, | ||
363 | "text/plain", | ||
364 | filecomment, | ||
365 | strlen (filecomment)+1); | ||
366 | } | ||
367 | free (filecomment); | ||
368 | |||
369 | |||
370 | /* if we've gotten to here then there is at least one zip entry (see get_zipinfo call above) */ | ||
371 | /* note: this free()'s the info list as it goes */ | ||
372 | info = start; | ||
373 | while (NULL != info) | ||
374 | { | ||
375 | if (info->filename != NULL) | ||
376 | { | ||
377 | if ( (ret == 0) && (strlen (info->filename)) ) | ||
378 | { | ||
379 | ret = proc (proc_cls, | ||
380 | "zip", | ||
381 | EXTRACTOR_METATYPE_FILENAME, | ||
382 | EXTRACTOR_METAFORMAT_UTF8, | ||
383 | "text/plain", | ||
384 | info->filename, | ||
385 | strlen (info->filename)+1); | ||
386 | } | ||
387 | } | ||
388 | if (info->comment != NULL) | ||
389 | { | ||
390 | if ( (ret == 0) && (strlen (info->comment) > 0) ) | ||
391 | { | ||
392 | ret = proc (proc_cls, | ||
393 | "zip", | ||
394 | EXTRACTOR_METATYPE_FILENAME, | ||
395 | EXTRACTOR_METAFORMAT_UTF8, | ||
396 | "text/plain", | ||
397 | info->comment, | ||
398 | strlen (info->comment)+1); | ||
399 | } | ||
400 | } | ||
401 | free (info->filename); | ||
402 | free (info->comment); | ||
403 | tmp = info; | ||
404 | info = info->next; | ||
405 | free (tmp); | ||
406 | } | ||
407 | return ret; | ||
408 | } | ||
409 | |||
410 | |||