aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/zip_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/zip_extractor.c')
-rw-r--r--src/plugins/zip_extractor.c410
1 files changed, 410 insertions, 0 deletions
diff --git a/src/plugins/zip_extractor.c b/src/plugins/zip_extractor.c
new file mode 100644
index 0000000..c7fef95
--- /dev/null
+++ b/src/plugins/zip_extractor.c
@@ -0,0 +1,410 @@
1/**
2 zipextractor.c version 0.0.2
3
4 Changes from 0.0.1 to 0.0.2
5 -> Searches for central dir struct from end of file if this is a self-extracting executable
6
7
8 This file was based on mp3extractor.c (0.1.2)
9
10 Currently, this only returns a list of the filenames within a zipfile
11 and any comments on each file or the whole file itself. File sizes,
12 modification times, and crc's are currently ignored.
13
14 TODO: Break the comments up into small, atomic, searchable chunks (keywords)
15 - might need some knowledge of English?
16
17 It returns:
18
19 one EXTRACTOR_MIMETYPE
20 multiple EXTRACTOR_FILENAME
21 multiple EXTRACTOR_COMMENT
22
23 ... from a .ZIP file
24
25 TODO: EXTRACTOR_DATE, EXTRACTOR_DESCRIPTION, EXTRACTOR_KEYWORDS, others?
26
27 Does NOT test data integrity (CRCs etc.)
28
29 This version is not recursive (i.e. doesn't look inside zip
30 files within zip files)
31
32 TODO: Run extract on files inside of archive (?) (i.e. gif, mp3, etc.)
33
34 The current .ZIP format description:
35 ftp://ftp.pkware.com/appnote.zip
36
37 No Copyright 2003 Julia Wolf
38
39 */
40
41/*
42 * This file is part of libextractor.
43 * (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff
44 *
45 * libextractor is free software; you can redistribute it and/or modify
46 * it under the terms of the GNU General Public License as published
47 * by the Free Software Foundation; either version 2, or (at your
48 * option) any later version.
49 *
50 * libextractor is distributed in the hope that it will be useful, but
51 * WITHOUT ANY WARRANTY; without even the implied warranty of
52 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
53 * General Public License for more details.
54 *
55 * You should have received a copy of the GNU General Public License
56 * along with libextractor; see the file COPYING. If not, write to the
57 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
58 * Boston, MA 02111-1307, USA.
59 */
60
61#include "platform.h"
62#include "extractor.h"
63
64#define DEBUG_EXTRACT_ZIP 0
65
66/* In a zipfile there are two kinds of comments. One is a big one for the
67 entire .zip, it's usually a BBS ad. The other is a small comment on each
68 individual file; most people don't use this.
69 */
70
/**
 * One node in the singly linked list of archive members collected from
 * the central directory.  'filename' and 'comment' are heap-allocated,
 * NUL-terminated copies owned by the node; 'next' is NULL on the last node.
 *
 * TODO: zip_entry linked list is handled kinda messily, should clean up (maybe)
 */
typedef struct zip_entry
{
  char *filename;
  char *comment;
  struct zip_entry *next;
} zip_entry;
78
79/* mimetype = application/zip */
80int
81EXTRACTOR_zip_extract (const unsigned char *data,
82 size_t size,
83 EXTRACTOR_MetaDataProcessor proc,
84 void *proc_cls,
85 const char *options)
86{
87 int ret;
88 void *tmp;
89 zip_entry * info;
90 zip_entry * start;
91 char *filecomment = NULL;
92 const unsigned char *pos;
93 unsigned int offset, stop;
94 unsigned int name_length, extra_length, comment_length;
95 unsigned int filecomment_length;
96 unsigned int entry_total, entry_count;
97
98 /* I think the smallest zipfile you can have is about 120 bytes */
99 if ((NULL == data) || (size < 100))
100 return 0;
101 if (! (('P' == data[0]) && ('K' == data[1]) && (0x03 == data[2])
102 && (0x04 == data[3])))
103 return 0;
104
105 /* The filenames for each file in a zipfile are stored in two locations.
106 * There is one at the start of each entry, just before the compressed data,
107 * and another at the end in a 'central directory structure'.
108 *
109 * In order to catch self-extracting executables, we scan backwards from the end
110 * of the file looking for the central directory structure. The previous version
111 * of this went forewards through the local headers, but that only works for plain
112 * vanilla zip's and I don't feel like writing a special case for each of the dozen
113 * self-extracting executable stubs.
114 *
115 * This assumes that the zip file is considered to be non-corrupt/non-truncated.
116 * If it is truncated then it's not considered to be a zip and skipped.
117 *
118 */
119
120 /* From appnote.iz and appnote.txt (more or less)
121 *
122 * (this is why you always need to put in the last floppy if you span disks)
123 *
124 * 0- 3 end of central dir signature 4 bytes (0x06054b50) P K ^E ^F
125 * 4- 5 number of this disk 2 bytes
126 * 6- 7 number of the disk with the
127 * start of the central directory 2 bytes
128 * 8- 9 total number of entries in
129 * the central dir on this disk 2 bytes
130 * 10-11 total number of entries in
131 * the central dir 2 bytes
132 * 12-15 size of the central directory 4 bytes
133 * 16-19 offset of start of central
134 * directory with respect to
135 * the starting disk number 4 bytes
136 * 20-21 zipfile comment length 2 bytes
137 * 22-?? zipfile comment (variable size) max length 65536 bytes
138 */
139
140 /* the signature can't be more than 22 bytes from the end */
141 offset = size - 22;
142 pos = &data[offset];
143 stop = 0;
144 if (((signed int) size - 65556) > 0)
145 stop = size - 65556;
146
147 /* not using int 0x06054b50 so that we don't have to deal with endianess issues.
148 break out if we go more than 64K backwards and havn't found it, or if we hit the
149 begining of the file. */
150 while ((!(('P' == pos[0]) && ('K' == pos[1]) && (0x05 == pos[2])
151 && (0x06 == pos[3]))) && (offset > stop))
152 pos = &data[offset--];
153 if (offset == stop)
154 {
155
156#if DEBUG_EXTRACT_ZIP
157 fprintf (stderr,
158 "Did not find end of central directory structure signature. offset: %i\n",
159 offset);
160
161#endif /* */
162 return 0;
163 }
164
165 /* offset should now point to the start of the end-of-central directory structure */
166 /* and pos[0] should be pointing there too */
167 /* so slurp down filecomment while here... */
168 filecomment_length = pos[20] + (pos[21] << 8);
169 if (filecomment_length + offset + 22 > size)
170 {
171 return 0; /* invalid zip file format! */
172 }
173 filecomment = NULL;
174 if (filecomment_length > 0)
175 {
176 filecomment = malloc (filecomment_length + 1);
177 memcpy (filecomment, &pos[22], filecomment_length);
178 filecomment[filecomment_length] = '\0';
179 }
180 if ((0 != pos[4]) && (0 != pos[5]))
181 {
182
183#if DEBUG_EXTRACT_ZIP
184 fprintf (stderr,
185 "WARNING: This seems to be the last disk in a multi-volume"
186 " ZIP archive, and so this might not work.\n");
187
188#endif /* */
189 }
190 if ((pos[8] != pos[10]) && (pos[9] != pos[11]))
191 {
192
193#if DEBUG_EXTRACT_ZIP
194 fprintf (stderr,
195 "WARNING: May not be able to find all the files in this"
196 " ZIP archive (no multi-volume support right now).\n");
197
198#endif /* */
199 }
200 entry_total = pos[10] + (pos[11] << 8);
201 entry_count = 0;
202
203 /* jump to start of central directory, ASSUMING that the starting disk that it's on is disk 0 */
204 /* starting disk would otherwise be pos[6]+pos[7]<<8 */
205 offset = pos[16] + (pos[17] << 8) + (pos[18] << 16) + (pos[19] << 24); /* offset of cent-dir from start of disk 0 */
206
207 /* stop = pos[12] + (pos[13]<<8) + (pos[14]<<16) + (pos[15]<<24); *//* length of central dir */
208 if (offset + 46 > size)
209 {
210
211 /* not a zip */
212 if (filecomment != NULL)
213 free (filecomment);
214 return 0;
215 }
216 pos = &data[offset]; /* jump */
217
218 /* we should now be at the begining of the central directory structure */
219
220 /* from appnote.txt and appnote.iz (mostly)
221 *
222 * 0- 3 central file header signature 4 bytes (0x02014b50)
223 * 4- 5 version made by 2 bytes
224 * 6- 7 version needed to extract 2 bytes
225 * 8- 9 general purpose bit flag 2 bytes
226 * 10-11 compression method 2 bytes
227 * 12-13 last mod file time 2 bytes
228 * 14-15 last mod file date 2 bytes
229 * 16-19 crc-32 4 bytes
230 * 20-23 compressed size 4 bytes
231 * 24-27 uncompressed size 4 bytes
232 * 28-29 filename length 2 bytes
233 * 30-31 extra field length 2 bytes
234 * 32-33 file comment length 2 bytes
235 * 34-35 disk number start 2 bytes
236 * 36-37 internal file attributes 2 bytes
237 * 38-41 external file attributes 4 bytes
238 * 42-45 relative offset of local header 4 bytes
239 *
240 * 46-?? filename (variable size)
241 * ?- ? extra field (variable size)
242 * ?- ? file comment (variable size)
243 */
244 if (!(('P' == pos[0]) && ('K' == pos[1]) && (0x01 == pos[2])
245 && (0x02 == pos[3])))
246 {
247
248#if DEBUG_EXTRACT_ZIP
249 fprintf (stderr,
250 "Did not find central directory structure signature. offset: %i\n",
251 offset);
252
253#endif
254 if (filecomment != NULL)
255 free (filecomment);
256 return 0;
257 }
258 start = NULL;
259 info = NULL;
260
261 do
262 { /* while ( (0x01==pos[2])&&(0x02==pos[3]) ) */
263 entry_count++; /* check to make sure we found everything at the end */
264 name_length = pos[28] + (pos[29] << 8);
265 extra_length = pos[30] + (pos[31] << 8);
266 comment_length = pos[32] + (pos[33] << 8);
267 if (name_length + extra_length + comment_length + offset + 46 > size)
268 {
269
270 /* ok, invalid, abort! */
271 break;
272 }
273
274#if DEBUG_EXTRACT_ZIP
275 fprintf (stderr, "Found filename length %i Comment length: %i\n",
276 name_length, comment_length);
277
278#endif /* */
279
280 /* yay, finally get filenames */
281 if (start == NULL)
282 {
283 start = malloc (sizeof (zip_entry));
284 start->next = NULL;
285 info = start;
286 }
287 else
288 {
289 info->next = malloc (sizeof (zip_entry));
290 info = info->next;
291 info->next = NULL;
292 }
293 info->filename = malloc (name_length + 1);
294 info->comment = malloc (comment_length + 1);
295
296 /* (strings in zip files are not null terminated) */
297 memcpy (info->filename, &pos[46], name_length);
298 info->filename[name_length] = '\0';
299 memcpy (info->comment, &pos[46 + name_length + extra_length],
300 comment_length);
301 info->comment[comment_length] = '\0';
302
303#if DEBUG_EXTRACT_ZIP
304 fprintf (stderr, "Found file %s, Comment: %s\n", info->filename,
305 info->comment);
306
307#endif
308 offset += 46 + name_length + extra_length + comment_length;
309 pos = &data[offset];
310
311 /* check for next header entry (0x02014b50) or (0x06054b50) if at end */
312 if (('P' != pos[0]) && ('K' != pos[1]))
313 {
314
315#if DEBUG_EXTRACT_ZIP
316 fprintf (stderr,
317 "Did not find next header in central directory.\n");
318
319#endif
320 info = start;
321 while (info != NULL)
322 {
323 start = info->next;
324 free (info->filename);
325 free (info->comment);
326 free (info);
327 info = start;
328 }
329 if (filecomment != NULL)
330 free (filecomment);
331 return 0;
332 }
333 }
334 while ((0x01 == pos[2]) && (0x02 == pos[3]));
335
336 /* end list */
337
338 /* TODO: should this return an error? indicates corrupt zipfile (or
339 disk missing in middle of multi-disk)? */
340 if (entry_count != entry_total)
341 {
342
343#if DEBUG_EXTRACT_ZIP
344 fprintf (stderr,
345 "WARNING: Did not find all of the zipfile entries that we should have.\n");
346
347#endif /* */
348 }
349
350 ret = proc (proc_cls,
351 "zip",
352 EXTRACTOR_METATYPE_MIMETYPE,
353 EXTRACTOR_METAFORMAT_UTF8,
354 "text/plain",
355 "application/zip",
356 strlen ("application/zip")+1);
357 if ( (filecomment != NULL) && (ret != 0) )
358 {
359 ret = proc (proc_cls,
360 "zip",
361 EXTRACTOR_METATYPE_MIMETYPE,
362 EXTRACTOR_METAFORMAT_UTF8,
363 "text/plain",
364 filecomment,
365 strlen (filecomment)+1);
366 }
367 free (filecomment);
368
369
370 /* if we've gotten to here then there is at least one zip entry (see get_zipinfo call above) */
371 /* note: this free()'s the info list as it goes */
372 info = start;
373 while (NULL != info)
374 {
375 if (info->filename != NULL)
376 {
377 if ( (ret == 0) && (strlen (info->filename)) )
378 {
379 ret = proc (proc_cls,
380 "zip",
381 EXTRACTOR_METATYPE_FILENAME,
382 EXTRACTOR_METAFORMAT_UTF8,
383 "text/plain",
384 info->filename,
385 strlen (info->filename)+1);
386 }
387 }
388 if (info->comment != NULL)
389 {
390 if ( (ret == 0) && (strlen (info->comment) > 0) )
391 {
392 ret = proc (proc_cls,
393 "zip",
394 EXTRACTOR_METATYPE_FILENAME,
395 EXTRACTOR_METAFORMAT_UTF8,
396 "text/plain",
397 info->comment,
398 strlen (info->comment)+1);
399 }
400 }
401 free (info->filename);
402 free (info->comment);
403 tmp = info;
404 info = info->next;
405 free (tmp);
406 }
407 return ret;
408}
409
410