aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/zip_extractor.c
blob: 518533da12a93b186746c412e2f05baa15504f41 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
/** 
     zipextractor.c  version 0.0.2

     Changes from 0.0.1 to 0.0.2 
     -> Searches for central dir struct from end of file if this is a self-extracting executable


     This file was based on mp3extractor.c  (0.1.2)

     Currently, this only returns a list of the filenames within a zipfile
     and any comments on each file or the whole file itself. File sizes, 
     modification times, and crc's are currently ignored.

     TODO: Break the comments up into small, atomically searchable chunks (keywords)
         - might need some knowledge of English?

     It returns:

     one      EXTRACTOR_MIMETYPE
     multiple EXTRACTOR_FILENAME
     multiple EXTRACTOR_COMMENT
     
     ... from a .ZIP file

     TODO: EXTRACTOR_DATE, EXTRACTOR_DESCRIPTION, EXTRACTOR_KEYWORDS, others?

     Does NOT test data integrity (CRCs etc.)

     This version is not recursive (i.e. doesn't look inside zip 
     files within zip files)
     
     TODO: Run extract on files inside of archive (?) (i.e. gif, mp3, etc.)
     
     The current .ZIP format description:
     ftp://ftp.pkware.com/appnote.zip

     No Copyright 2003 Julia Wolf

 */  
  
/*
 * This file is part of libextractor.
 * (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff
 *
 * libextractor is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2, or (at your
 * option) any later version.
 * 
 * libextractor is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with libextractor; see the file COPYING.  If not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */ 
  
#include "platform.h"
#include "extractor.h"
  
#define DEBUG_EXTRACT_ZIP 0
  
/* In a zipfile there are two kinds of comments. One is a big one for the
   entire .zip, it's usually a BBS ad. The other is a small comment on each
   individual file; most people don't use this.
 */ 
  
/**
 * One central-directory entry of a ZIP archive, kept as a node of a
 * singly linked list.
 *
 * 'filename' and 'comment' are heap-allocated, NUL-terminated copies of
 * the (unterminated) strings stored in the archive; either may be NULL
 * if its allocation failed.  'next' is NULL on the last node.
 */
typedef struct zip_entry
{
  char *filename;               /* entry name as stored in the archive */
  char *comment;                /* per-entry comment, possibly empty */
  struct zip_entry *next;       /* following entry, or NULL at the end */
} zip_entry;

/* mimetype = application/zip */ 
int 
EXTRACTOR_zip_extract (const unsigned char *data,
		       size_t size,
		       EXTRACTOR_MetaDataProcessor proc,
		       void *proc_cls,
		       const char *options)
{
  int ret;
  void *tmp;
  zip_entry * info;
  zip_entry * start;
  char *filecomment;
  const unsigned char *pos;
  unsigned int offset, stop;
  unsigned int name_length, extra_length, comment_length;
  unsigned int filecomment_length;
  unsigned int entry_total, entry_count;

  /* I think the smallest zipfile you can have is about 120 bytes */ 
  if ((NULL == data) || (size < 100))
    return 0;
  if (! (('P' == data[0]) && ('K' == data[1]) && (0x03 == data[2])
         && (0x04 == data[3])))
    return 0;
  
    /* The filenames for each file in a zipfile are stored in two locations.
     * There is one at the start of each entry, just before the compressed data,
     * and another at the end in a 'central directory structure'.
     *
     * In order to catch self-extracting executables, we scan backwards from the end
     * of the file looking for the central directory structure. The previous version
     * of this went forewards through the local headers, but that only works for plain
     * vanilla zip's and I don't feel like writing a special case for each of the dozen
     * self-extracting executable stubs.
     *
     * This assumes that the zip file is considered to be non-corrupt/non-truncated.
     * If it is truncated then it's not considered to be a zip and skipped.
     *
     */ 
    
    /* From appnote.iz and appnote.txt (more or less)
     *
     *   (this is why you always need to put in the last floppy if you span disks)
     *
     *   0- 3  end of central dir signature    4 bytes  (0x06054b50) P K ^E ^F
     *   4- 5  number of this disk             2 bytes
     *   6- 7  number of the disk with the
     *         start of the central directory  2 bytes
     *   8- 9  total number of entries in
     *         the central dir on this disk    2 bytes
     *  10-11  total number of entries in
     *         the central dir                 2 bytes
     *  12-15  size of the central directory   4 bytes
     *  16-19  offset of start of central
     *         directory with respect to
     *         the starting disk number        4 bytes
     *  20-21  zipfile comment length          2 bytes
     *  22-??  zipfile comment (variable size) max length 65536 bytes
     */ 
    
    /*  the signature can't be more than 22 bytes from the end */ 
  offset = size - 22;
  pos = &data[offset];
  stop = 0;
  if (((signed int) size - 65556) > 0)
    stop = size - 65556;
  
    /* not using int 0x06054b50 so that we don't have to deal with endianess issues.
       break out if we go more than 64K backwards and havn't found it, or if we hit the
       begining of the file. */ 
    while ((!(('P' == pos[0]) && ('K' == pos[1]) && (0x05 == pos[2])
              && (0x06 == pos[3]))) && (offset > stop))
    pos = &data[offset--];
  if (offset == stop)
    {
#if DEBUG_EXTRACT_ZIP
      fprintf (stderr,
                 "Did not find end of central directory structure signature. offset: %i\n",
                 offset);
      
#endif
      return 0;
    }
  
    /* offset should now point to the start of the end-of-central directory structure */ 
    /* and pos[0] should be pointing there too */ 
    /* so slurp down filecomment while here... */ 
    filecomment_length = pos[20] + (pos[21] << 8);
  if (filecomment_length + offset + 22 > size)
    {
      return 0;             /* invalid zip file format! */
    }
  filecomment = NULL;
  if (filecomment_length > 0)
    {
      filecomment = malloc (filecomment_length + 1);
      if (filecomment != NULL)
	{
	  memcpy (filecomment, &pos[22], filecomment_length);
	  filecomment[filecomment_length] = '\0';
	}
    }

#if DEBUG_EXTRACT_ZIP
  if ((0 != pos[4]) && (0 != pos[5]))
    fprintf (stderr,
	     "WARNING: This seems to be the last disk in a multi-volume"
	     " ZIP archive, and so this might not work.\n");          
#endif
      
#if DEBUG_EXTRACT_ZIP
  if ((pos[8] != pos[10]) && (pos[9] != pos[11]))
    fprintf (stderr,
	     "WARNING: May not be able to find all the files in this" 
	     " ZIP archive (no multi-volume support right now).\n");  
#endif
  entry_total = pos[10] + (pos[11] << 8);
  entry_count = 0;
  
    /* jump to start of central directory, ASSUMING that the starting disk that it's on is disk 0 */ 
    /* starting disk would otherwise be pos[6]+pos[7]<<8 */ 
    offset = pos[16] + (pos[17] << 8) + (pos[18] << 16) + (pos[19] << 24);     /* offset of cent-dir from start of disk 0 */
  
    /* stop   = pos[12] + (pos[13]<<8) + (pos[14]<<16) + (pos[15]<<24); *//* length of central dir */ 
    if (offset + 46 > size)
    {
      
        /* not a zip */ 
      if (filecomment != NULL)
        free (filecomment);
      return 0;
    }
  pos = &data[offset];         /* jump */
  
    /* we should now be at the begining of the central directory structure */ 
    
    /* from appnote.txt and appnote.iz (mostly)
     *
     *   0- 3  central file header signature   4 bytes  (0x02014b50)
     *   4- 5  version made by                 2 bytes
     *   6- 7  version needed to extract       2 bytes
     *   8- 9  general purpose bit flag        2 bytes
     *  10-11  compression method              2 bytes
     *  12-13  last mod file time              2 bytes
     *  14-15  last mod file date              2 bytes
     *  16-19  crc-32                          4 bytes
     *  20-23  compressed size                 4 bytes
     *  24-27  uncompressed size               4 bytes
     *  28-29  filename length                 2 bytes
     *  30-31  extra field length              2 bytes
     *  32-33  file comment length             2 bytes
     *  34-35  disk number start               2 bytes
     *  36-37  internal file attributes        2 bytes
     *  38-41  external file attributes        4 bytes
     *  42-45  relative offset of local header 4 bytes
     *
     *  46-??  filename (variable size)
     *   ?- ?  extra field (variable size)
     *   ?- ?  file comment (variable size)
     */ 
    if (!(('P' == pos[0]) && ('K' == pos[1]) && (0x01 == pos[2])
          && (0x02 == pos[3])))
      {      
#if DEBUG_EXTRACT_ZIP
        fprintf (stderr,
                 "Did not find central directory structure signature. offset: %i\n",
                 offset);
	
#endif
        if (filecomment != NULL)
	  free (filecomment);
	return 0;
      }
  start = NULL;
  info = NULL;
  
  do
    {                           /* while ( (0x01==pos[2])&&(0x02==pos[3]) ) */
      entry_count++;           /* check to make sure we found everything at the end */
      name_length = pos[28] + (pos[29] << 8);
      extra_length = pos[30] + (pos[31] << 8);
      comment_length = pos[32] + (pos[33] << 8);
      if (name_length + extra_length + comment_length + offset + 46 > size)
        {
          
            /* ok, invalid, abort! */ 
            break;
        }
      
#if DEBUG_EXTRACT_ZIP
        fprintf (stderr, "Found filename length %i  Comment length: %i\n",
                 name_length, comment_length);
      
#endif        
        /* yay, finally get filenames */ 
        if (start == NULL)
        {
          start = malloc (sizeof (zip_entry));
	  if (start == NULL)
	    break;
          start->next = NULL;
          info = start;
        }
      else
        {
          info->next = malloc (sizeof (zip_entry));
	  if (info->next == NULL)
	    break;
          info = info->next;
          info->next = NULL;
        }
      info->filename = malloc (name_length + 1);
      info->comment = malloc (comment_length + 1);
      
        /* (strings in zip files are not null terminated) */ 
      if (info->filename != NULL)
	{
	  memcpy (info->filename, &pos[46], name_length);
	  info->filename[name_length] = '\0';
	}
      if (info->comment != NULL)
	{
	  memcpy (info->comment, &pos[46 + name_length + extra_length],
		  comment_length);
	  info->comment[comment_length] = '\0';
	}
      offset += 46 + name_length + extra_length + comment_length;
      pos = &data[offset];      
      /* check for next header entry (0x02014b50) or (0x06054b50) if at end */ 
      if (('P' != pos[0]) && ('K' != pos[1]))
        {         
#if DEBUG_EXTRACT_ZIP
	  fprintf (stderr,
		   "Did not find next header in central directory.\n");
          
#endif
	  info = start;
          while (info != NULL)
            {
              start = info->next;
	      if (info->filename != NULL)
		free (info->filename);
	      if (info->comment != NULL)
		free (info->comment);
              free (info);
              info = start;
            }
          if (filecomment != NULL)
            free (filecomment);
          return 0;
        }
    }
  while ((0x01 == pos[2]) && (0x02 == pos[3]));
  
    /* end list */ 
    
    /* TODO: should this return an error? indicates corrupt zipfile (or
       disk missing in middle of multi-disk)? */ 
#if DEBUG_EXTRACT_ZIP
  if (entry_count != entry_total)
    fprintf (stderr,
	     "WARNING: Did not find all of the zipfile entries that we should have.\n");    
#endif
  
  ret = proc (proc_cls,
	      "zip",
	      EXTRACTOR_METATYPE_MIMETYPE,
	      EXTRACTOR_METAFORMAT_UTF8,
	      "text/plain",
	      "application/zip",
	      strlen ("application/zip")+1);
  if ( (filecomment != NULL) && (ret != 0) )
    {
      ret = proc (proc_cls,
		  "zip",
		  EXTRACTOR_METATYPE_MIMETYPE,
		  EXTRACTOR_METAFORMAT_UTF8,
		  "text/plain",
		  filecomment,
		  strlen (filecomment)+1);
    }
  if (filecomment != NULL)
    free (filecomment);

  
  /* if we've gotten to here then there is at least one zip entry (see get_zipinfo call above) */ 
  /* note: this free()'s the info list as it goes */ 
  info = start;
  while (NULL != info)
    {
      if (info->filename != NULL)
        {
          if ( (ret == 0) && (strlen (info->filename)) )
            {
	      ret = proc (proc_cls,
			  "zip",
			  EXTRACTOR_METATYPE_FILENAME,
			  EXTRACTOR_METAFORMAT_UTF8,
			  "text/plain",
			  info->filename,
			  strlen (info->filename)+1);
            }
	   free (info->filename);
        }
      if (info->comment != NULL)
	{
	  if ( (ret == 0) && (strlen (info->comment) > 0) )
	    {
	      ret = proc (proc_cls,
			  "zip",
			  EXTRACTOR_METATYPE_FILENAME,
			  EXTRACTOR_METAFORMAT_UTF8,
			  "text/plain",
			  info->comment,
			  strlen (info->comment)+1);
	    }
	  free (info->comment);
	}
      tmp = info;
      info = info->next;
      free (tmp);
    }
  return ret;
}