aboutsummaryrefslogtreecommitdiff
path: root/src/main/extractor_datasource.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/extractor_datasource.c')
-rw-r--r-- src/main/extractor_datasource.c 912
1 files changed, 497 insertions, 415 deletions
diff --git a/src/main/extractor_datasource.c b/src/main/extractor_datasource.c
index 2adbdfc..90dd077 100644
--- a/src/main/extractor_datasource.c
+++ b/src/main/extractor_datasource.c
@@ -22,10 +22,22 @@
22 22
23#if HAVE_LIBBZ2 23#if HAVE_LIBBZ2
24#include <bzlib.h> 24#include <bzlib.h>
25#define MIN_BZ2_HEADER 4
26#ifndef MIN_COMPRESSED_HEADER
27#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
28#endif
25#endif 29#endif
26 30
27#if HAVE_ZLIB 31#if HAVE_ZLIB
28#include <zlib.h> 32#include <zlib.h>
33#define MIN_ZLIB_HEADER 12
34#ifndef MIN_COMPRESSED_HEADER
35#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
36#endif
37#endif
38
39#ifndef MIN_COMPRESSED_HEADER
40#define MIN_COMPRESSED_HEADER -1
29#endif 41#endif
30 42
31#ifndef O_LARGEFILE 43#ifndef O_LARGEFILE
@@ -37,24 +49,12 @@
37 */ 49 */
38#define MAX_READ (4 * 1024 * 1024) 50#define MAX_READ (4 * 1024 * 1024)
39 51
52/**
53 * Data is read from the source and shoved into decompressor
54 * in chunks this big.
55 */
56#define COM_CHUNK_SIZE (10 * 1024)
40 57
41#if HAVE_ZLIB
42#define MIN_ZLIB_HEADER 12
43#endif
44#if HAVE_LIBBZ2
45#define MIN_BZ2_HEADER 4
46#endif
47#if !defined (MIN_COMPRESSED_HEADER) && HAVE_ZLIB
48#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
49#endif
50#if !defined (MIN_COMPRESSED_HEADER) && HAVE_LIBBZ2
51#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
52#endif
53#if !defined (MIN_COMPRESSED_HEADER)
54#define MIN_COMPRESSED_HEADER -1
55#endif
56
57#define COMPRESSED_DATA_PROBE_SIZE 3
58 58
59/** 59/**
60 * Enum with the various possible types of compression supported. 60 * Enum with the various possible types of compression supported.
@@ -106,17 +106,17 @@ struct BufferedFileDataSource
106 uint64_t fsize; 106 uint64_t fsize;
107 107
108 /** 108 /**
109 * Position within the file or the data buffer 109 * Position of the buffer in the file.
110 */ 110 */
111 uint64_t fpos; 111 uint64_t fpos;
112 112
113 /** 113 /**
114 * Position within the buffer. 114 * Position within the buffer.
115 */ 115 */
116 uint64_t buffer_pos; 116 uint64_t buffer_pos;
117 117
118 /** 118 /**
119 * Number of bytes in the buffer (<= buffer_size) 119 * Number of valid bytes in the buffer (<= buffer_size)
120 */ 120 */
121 uint64_t buffer_bytes; 121 uint64_t buffer_bytes;
122 122
@@ -144,12 +144,22 @@ struct CompressedFileSource
144 struct BufferedFileDataSource *bfds; 144 struct BufferedFileDataSource *bfds;
145 145
146 /** 146 /**
147 * Decompression target buffer.
148 */
149 char result[COM_CHUNK_SIZE];
150
151 /**
152 * At which offset in 'result' is 'fpos'?
153 */
154 size_t result_pos;
155
156 /**
147 * Size of the source (same as bfds->fsize) 157 * Size of the source (same as bfds->fsize)
148 */ 158 */
149 int64_t fsize; 159 int64_t fsize;
150 160
151 /** 161 /**
152 * Position within the source 162 * Position within the (decompressed) source
153 */ 163 */
154 int64_t fpos; 164 int64_t fpos;
155 165
@@ -206,7 +216,7 @@ bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds,
206 return -1; /* invalid */ 216 return -1; /* invalid */
207 if (NULL == bfds->buffer) 217 if (NULL == bfds->buffer)
208 { 218 {
209 bfds->buffer_bytes = bfds->fsize; 219 bfds->buffer_pos = pos;
210 return 0; 220 return 0;
211 } 221 }
212#if WINDOWS 222#if WINDOWS
@@ -219,6 +229,7 @@ bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds,
219 if (position < 0) 229 if (position < 0)
220 return -1; 230 return -1;
221 bfds->fpos = position; 231 bfds->fpos = position;
232 bfds->buffer_pos = 0;
222 rd = read (bfds->fd, bfds->buffer, bfds->buffer_size); 233 rd = read (bfds->fd, bfds->buffer, bfds->buffer_size);
223 if (rd < 0) 234 if (rd < 0)
224 return -1; 235 return -1;
@@ -258,7 +269,7 @@ bfds_new (const void *data,
258 memset (result, 0, sizeof (struct BufferedFileDataSource)); 269 memset (result, 0, sizeof (struct BufferedFileDataSource));
259 result->data = (NULL != data) ? data : &result[1]; 270 result->data = (NULL != data) ? data : &result[1];
260 result->buffer = (NULL != data) ? NULL : &result[1]; 271 result->buffer = (NULL != data) ? NULL : &result[1];
261 result->buffer_size = (NULL != data) ? fsize : xtra; 272 result->buffer_size = (NULL != data) ? fsize : xtra;
262 result->fsize = fsize; 273 result->fsize = fsize;
263 result->fd = fd; 274 result->fd = fd;
264 bfds_pick_next_buffer_at (result, 0); 275 bfds_pick_next_buffer_at (result, 0);
@@ -297,86 +308,95 @@ bfds_seek (struct BufferedFileDataSource *bfds,
297 switch (whence) 308 switch (whence)
298 { 309 {
299 case SEEK_CUR: 310 case SEEK_CUR:
300 if (NULL != bfds->buffer) 311 if (bfds->fpos + bfds->buffer_pos + pos < 0)
312 return -1;
313 if (bfds->fpos + bfds->buffer_pos + pos > bfds->fsize)
314 return -1;
315 if ( (NULL == bfds->buffer) ||
316 ( (bfds->buffer_pos + pos < pos->buffer_bytes) &&
317 (bfds->buffer_pos + pos >= 0) ) )
301 { 318 {
302 if (0 != bfds_pick_next_buffer_at (bfds, 319 bfds->buffer_pos += pos;
303 bfds->fpos + bfds->buffer_pos + pos)) 320 return bfds->buffer_pos;
304 return -1;
305 bfds->buffer_pos = 0;
306 return bfds->fpos;
307 } 321 }
308 bfds->buffer_pos += pos; 322 if (0 != bfds_pick_next_buffer_at (bfds,
309 return bfds->buffer_pos; 323 bfds->fpos + bfds->buffer_pos + pos))
324 return -1;
325 return bfds->fpos;
326 case SEEK_END:
327 if (pos > 0)
328 return -1;
329 if (bfds->fsize < - pos)
330 return -1;
331 pos = bfds->fsize + pos;
332 /* fall-through! */
310 case SEEK_SET: 333 case SEEK_SET:
311 if (pos < 0) 334 if (pos < 0)
312 return -1; 335 return -1;
313 if (NULL != bfds->buffer) 336 if (pos > bfds->fsize)
314 { 337 return -1;
315 if (0 != bfds_pick_next_buffer_at (bfds, pos)) 338 if ( (NULL == bfds->buffer) ||
316 return -1; 339 ( (bfds->buffer_pos <= pos) &&
317 bfds->buffer_pos = 0; 340 (bfds->buffer_pos + pos->buffer_bytes > pos) ) )
318 return bfds->fpos;
319 }
320 bfds->buffer_pos = pos;
321 return bfds->buffer_pos;
322 case SEEK_END:
323 if (NULL != bfds->buffer)
324 { 341 {
325 if (0 != bfds_pick_next_buffer_at (bfds, bfds->fsize + pos)) 342 bfds->buffer_pos = pos;
326 return -1; 343 return bfds->buffer_pos;
327 bfds->buffer_pos = 0;
328 return bfds->fpos;
329 } 344 }
330 bfds->buffer_pos = bfds->fsize + pos; 345 if (0 != bfds_pick_next_buffer_at (bfds, pos))
331 return bfds->buffer_pos; 346 return -1;
347 return bfds->fpos;
332 } 348 }
333 return -1; 349 return -1;
334} 350}
335 351
336 352
337/** 353/**
338 * Fills 'buf_ptr' with a chunk of data. 354 * Fills 'buf_ptr' with a chunk of data. Will
339 * Will seek if necessary. Will fail if 'count' exceeds buffer size. 355 * fail if 'count' exceeds buffer size.
340 * 356 *
341 * @param bfds bfds 357 * @param bfds bfds
342 * @param buf_ptr location to store data 358 * @param buf_ptr location to store data
343 * @param count number of bytes to read 359 * @param count number of bytes to read
344 * @return number of bytes (<= count) available at location pointed by buf_ptr 360 * @return number of bytes (<= count) available at location pointed by buf_ptr,
361 * 0 for end of stream, -1 on error
345 */ 362 */
346static ssize_t 363static ssize_t
347bfds_read (struct BufferedFileDataSource *bfds, 364bfds_read (struct BufferedFileDataSource *bfds,
348 void *buf_ptr, 365 void *buf_ptr,
349 size_t count) 366 size_t count)
350{ 367{
351 if (count > MAX_READ) 368 char *cbuf = buf_ptr;
352 return -1; 369 uint64_t old_off;
353 if (count > bfds->buffer_bytes - bfds->buffer_pos) 370 size_t avail;
371 size_t ret;
372
373 old_off = bfds->fpos + bfds->buffer_pos + bfds->buffer_bytes;
374 if (old_off == bfds->fsize)
375 return 0; /* end of stream */
376 ret = 0;
377 while (count > 0)
354 { 378 {
355 if (bfds->fpos + bfds->buffer_pos != bfds_seek (bfds, bfds->fpos + bfds->buffer_pos, SEEK_SET)) 379 if ( (bfds->buffer_bytes == bfds->buffer_pos) &&
356 return -1; 380 (0 != bfds_pick_next_buffer_at (bfds,
357 if (NULL != bfds->buffer) 381 bfds->fpos + bfds->buffer_pos + bfds->buffer_bytes)) )
358 { 382 {
359 *buf_ptr = &bfds->buffer[bfds->buffer_pos]; 383 /* revert to original position, invalidate buffer */
360 bfds->buffer_pos += count < bfds->buffer_bytes ? count : bfds->buffer_bytes; 384 bfds->fpos = old_off;
361 return (count < bfds->buffer_bytes ? count : bfds->buffer_bytes); 385 bfds->buffer_bytes = 0;
362 } 386 bfds->buffer_pos = 0;
363 else 387 return -1; /* getting more failed */
364 {
365 int64_t ret = count < (bfds->buffer_bytes - bfds->buffer_pos) ? count : (bfds->buffer_bytes - bfds->buffer_pos);
366 *buf_ptr = (unsigned char*) &bfds->data[bfds->buffer_pos];
367 bfds->buffer_pos += ret;
368 return ret;
369 } 388 }
389 avail = bfds->buffer_bytes - bfds->buffer_pos;
390 if (avail > count)
391 avail = count;
392 if (0 == avail)
393 abort (); /* must not happen */
394 memcpy (&cbuf[ret], &bfds->data[bfds->buffer_pos], avail);
395 bfds->buffer_pos += avail;
396 count -= avail;
397 ret += avail;
370 } 398 }
371 else 399 return ret;
372 {
373 if (NULL != bfds->buffer)
374 *buf_ptr = &bfds->buffer[bfds->buffer_pos];
375 else
376 *buf_ptr = (unsigned char*) &bfds->data[bfds->buffer_pos];
377 bfds->buffer_pos += count;
378 return count;
379 }
380} 400}
381 401
382 402
@@ -395,7 +415,7 @@ cfs_delete (struct CompressedFileSource *cfs)
395/** 415/**
396 * Reset gz-compressed data stream to the beginning. 416 * Reset gz-compressed data stream to the beginning.
397 * 417 *
398 * @return 1 on success, 0 if we failed to seek, 418 * @return 1 on success, 0 to terminate extraction,
399 * -1 on decompressor initialization failure 419 * -1 on decompressor initialization failure
400 */ 420 */
401static int 421static int
@@ -422,13 +442,13 @@ cfs_reset_stream_zlib (struct CompressedFileSource *cfs)
422#ifdef ZLIB_VERNUM 442#ifdef ZLIB_VERNUM
423 15 + 32 443 15 + 32
424#else 444#else
425 -MAX_WBITS 445 - MAX_WBITS
426#endif 446#endif
427 )) 447 ))
428 { 448 {
429 return -1; 449 return -1;
430 } 450 }
431 cfs->fpos = cfs->gzip_header_length; 451 cfs->fpos = 0;
432 cfs->shm_pos = 0; 452 cfs->shm_pos = 0;
433 cfs->shm_buf_size = 0; 453 cfs->shm_buf_size = 0;
434 return 1; 454 return 1;
@@ -438,7 +458,7 @@ cfs_reset_stream_zlib (struct CompressedFileSource *cfs)
438/** 458/**
439 * Reset bz2-compressed data stream to the beginning. 459 * Reset bz2-compressed data stream to the beginning.
440 * 460 *
441 * @return 1 on success, 0 if we failed to seek, 461 * @return 1 on success, 0 to terminate extraction,
442 * -1 on decompressor initialization failure 462 * -1 on decompressor initialization failure
443 */ 463 */
444static int 464static int
@@ -455,21 +475,21 @@ cfs_reset_stream_bz2 (struct CompressedFileSource *cfs)
455 * seeking backward. 475 * seeking backward.
456 * 476 *
457 * @param cfs cfs to reset 477 * @param cfs cfs to reset
458 * @return 1 on success, , 0 if we failed to seek, 478 * @return 1 on success, 0 to terminate extraction,
459 * -1 on error 479 * -1 on error
460 */ 480 */
461static int 481static int
462cfs_reset_stream (struct CompressedFileSource *cfs) 482cfs_reset_stream (struct CompressedFileSource *cfs)
463{ 483{
464 switch (cfs->compression_type) 484 switch (cfs->compression_type)
465 { 485 {
466 case COMP_TYPE_ZLIB: 486 case COMP_TYPE_ZLIB:
467 return cfs_reset_stream_zlib (cfs); 487 return cfs_reset_stream_zlib (cfs);
468 case COMP_TYPE_BZ2: 488 case COMP_TYPE_BZ2:
469 return cfs_reset_stream_bz2 (cfs); 489 return cfs_reset_stream_bz2 (cfs);
470 default: 490 default:
471 return -1; 491 return -1;
472 } 492 }
473} 493}
474 494
475 495
@@ -480,98 +500,75 @@ cfs_reset_stream (struct CompressedFileSource *cfs)
480 * @param cfs cfs to initialize 500 * @param cfs cfs to initialize
481 * @param proc callback for metadata 501 * @param proc callback for metadata
482 * @param proc_cls callback cls 502 * @param proc_cls callback cls
483 * @return 1 on success, -1 on error 503 * @return 1 on success, 0 to terminate extraction, -1 on error
484 */ 504 */
485static int 505static int
486cfs_init_decompressor_zlib (struct CompressedFileSource *cfs, 506cfs_init_decompressor_zlib (struct CompressedFileSource *cfs,
487 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 507 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
488{ 508{
489 /* Process gzip header */
490 unsigned int gzip_header_length = 10; 509 unsigned int gzip_header_length = 10;
491 unsigned char data[12]; 510 unsigned char hdata[12];
492 int64_t buf_bytes;
493 int len;
494 unsigned char *buf;
495 unsigned char *cptr;
496
497 if (sizeof (data) > bfds_read (cfs->bfds, data, sizeof (data)))
498 return -1;
499
500 if (0 != (data[3] & 0x4)) /* FEXTRA set */
501 gzip_header_length += 2 + (unsigned) (data[10] & 0xff) +
502 (((unsigned) (data[11] & 0xff)) * 256);
503 511
504 if (0 != (data[3] & 0x8)) /* FNAME set */ 512 /* Process gzip header */
505 { 513 if (sizeof (hdata) > bfds_read (cfs->bfds, hdata, sizeof (hdata)))
506 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) 514 return -1;
507 return -1; 515 if (0 != (hdata[3] & 0x4)) /* FEXTRA set */
508 buf_bytes = bfds_read (cfs->bfds, &buf, 1024); 516 gzip_header_length += 2 + (unsigned) (hdata[10] & 0xff) +
509 if (buf_bytes <= 0) 517 (((unsigned) (hdata[11] & 0xff)) * 256);
510 return -1;
511 cptr = buf;
512 518
513 len = 0; 519 if (0 != (hdata[3] & 0x8))
514 /* stored file name is here */
515 while (len < buf_bytes)
516 { 520 {
517 if ('\0' == *cptr) 521 /* FNAME set */
518 break; 522 char fname[1024];
519 cptr++; 523 char *cptr;
520 len++; 524 size_t len;
525 ssize_t buf_bytes;
526
527 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET))
528 return -1;
529 buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname));
530 if (buf_bytes <= 0)
531 return -1;
532 if (NULL == (cptr = memchr (fname, 0, buf_bytes)))
533 return -1;
534 len = cptr - fname;
535 if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME,
536 EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
537 fname,
538 len))
539 return 0; /* done */
540 gzip_header_length += len + 1;
521 } 541 }
522 542
523 if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, 543 if (0 != (hdata[3] & 0x16))
524 EXTRACTOR_METAFORMAT_C_STRING, "text/plain", 544 {
525 (const char *) buf, 545 /* FCOMMENT set */
526 len)) 546 char fcomment[1024];
527 return 0; /* done */ 547 char *cptr;
528 548 ssize_t buf_bytes;
529 /* FIXME: check for correctness */ 549 size_t len;
530 //gzip_header_length = (cptr - data) + 1; 550
531 gzip_header_length += len + 1; 551 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET))
532 } 552 return -1;
533 553 buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment));
534 if (0 != (data[3] & 0x16)) /* FCOMMENT set */ 554 if (buf_bytes <= 0)
535 { 555 return -1;
536 int64_t buf_bytes; 556 if (NULL == (cptr = memchr (fcomment, 0, buf_bytes)))
537 int len; 557 return -1;
538 unsigned char *buf; 558 len = cptr - fcomment;
539 unsigned char *cptr; 559 if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
540 560 EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
541 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) 561 (const char *) fcomment,
542 return -1; 562 len))
543 buf_bytes = bfds_read (cfs->bfds, &buf, 1024); 563 return 0; /* done */
544 if (buf_bytes <= 0) 564 gzip_header_length += len + 1;
545 return -1; 565 }
546 cptr = buf; 566 if (0 != (hdata[3] & 0x2)) /* FCHRC set */
547
548 len = 0;
549 /* stored file name is here */
550 while (len < buf_bytes)
551 {
552 if ('\0' == *cptr)
553 break;
554 cptr++;
555 len++;
556 }
557
558 if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
559 EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
560 (const char *) buf,
561 len))
562 return 0; /* done */
563
564 /* FIXME: check for correctness */
565 //gzip_header_length = (cptr - data) + 1;
566 gzip_header_length += len + 1;
567 }
568
569 if (data[3] & 0x2) /* FCHRC set */
570 gzip_header_length += 2; 567 gzip_header_length += 2;
571
572 memset (&cfs->strm, 0, sizeof (z_stream)); 568 memset (&cfs->strm, 0, sizeof (z_stream));
573 569
574#ifdef ZLIB_VERNUM 570#ifdef ZLIB_VERNUM
571 /* zlib will take care of its header */
575 gzip_header_length = 0; 572 gzip_header_length = 0;
576#endif 573#endif
577 574
@@ -604,7 +601,7 @@ cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs,
604 * @param cfs cfs to initialize 601 * @param cfs cfs to initialize
605 * @param proc callback for metadata 602 * @param proc callback for metadata
606 * @param proc_cls callback cls 603 * @param proc_cls callback cls
607 * @return 1 on success, -1 on error 604 * @return 1 on success, 0 to terminate extraction, -1 on error
608 */ 605 */
609static int 606static int
610cfs_init_decompressor (struct CompressedFileSource *cfs, 607cfs_init_decompressor (struct CompressedFileSource *cfs,
@@ -671,12 +668,25 @@ cfs_deinit_decompressor (struct CompressedFileSource *cfs)
671 668
672 669
673/** 670/**
671 * Destroy compressed file source.
672 *
673 * @param cfs source to destroy
674 */
675static void
676cfs_destroy (struct CompressedFileSource *cfs)
677{
678 cfs_deinit_decompressor (cfs);
679 free (cfs);
680}
681
682
683/**
674 * Allocates and initializes new cfs object. 684 * Allocates and initializes new cfs object.
675 * 685 *
676 * @param bfds data source to use 686 * @param bfds data source to use
677 * @param fsize size of the source 687 * @param fsize size of the source
678 * @param compression_type type of compression used 688 * @param compression_type type of compression used
679 * @param proc metadata callback 689 * @param proc metadata callback to call with meta data found upon opening
680 * @param proc_cls callback cls 690 * @param proc_cls callback cls
681 * @return newly allocated cfs on success, NULL on error 691 * @return newly allocated cfs on success, NULL on error
682 */ 692 */
@@ -686,7 +696,6 @@ cfs_new (struct BufferedFileDataSource *bfds,
686 enum ExtractorCompressionType compression_type, 696 enum ExtractorCompressionType compression_type,
687 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 697 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
688{ 698{
689 int shm_result;
690 struct CompressedFileSource *cfs; 699 struct CompressedFileSource *cfs;
691 700
692 if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource)))) 701 if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource))))
@@ -696,93 +705,118 @@ cfs_new (struct BufferedFileDataSource *bfds,
696 cfs->bfds = bfds; 705 cfs->bfds = bfds;
697 cfs->fsize = fsize; 706 cfs->fsize = fsize;
698 cfs->uncompressed_size = -1; 707 cfs->uncompressed_size = -1;
708 if (1 != cfs_init_decompressor (cfs,
709 proc, proc_cls))
710 {
711 free (cfs);
712 return NULL;
713 }
699 return cfs; 714 return cfs;
700} 715}
701 716
702 717
703/** 718/**
704 * Data is read from the source and shoved into decompressor 719 * Fills 'data' with new uncompressed data. Does the actual
705 * in chunks this big. 720 * decompression. Will set uncompressed_size on the end of compressed
706 */ 721 * stream.
707#define COM_CHUNK_SIZE (10*1024)
708
709
710/**
711 * Re-fills shm with new uncompressed data, preserving the last
712 * 'preserve' bytes of existing data as the first 'preserve' bytes
713 * of the new data.
714 * Does the actual decompression. Will set uncompressed_size on
715 * the end of compressed stream.
716 * 722 *
717 * @param cfs cfs to read from 723 * @param cfs cfs to read from
718 * @param preserve number of bytes to preserve (0 to discard all old data) 724 * @param data where to copy the data
719 * @return number of bytes in shm. 0 if no more data can be uncompressed, -1 on error 725 * @param size number of bytes available in data
726 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
720 */ 727 */
721static int 728static ssize_t
722cfs_read_zlib (struct CompressedFileSource *cfs, int64_t preserve) 729cfs_read_zlib (struct CompressedFileSource *cfs,
730 void *data,
731 size_t size)
723{ 732{
733 char *dst = data;
724 int ret; 734 int ret;
725 int64_t rc = preserve; 735 size_t rc;
726 int64_t total = cfs->strm.total_out; 736 ssize_t in;
737 char buf[COM_CHUNK_SIZE];
727 738
728 if (preserve > 0) 739 if (cfs->fpos == cfs->uncompressed_size)
729 memmove (cfs->shm_ptr, &((unsigned char *)cfs->shm_ptr)[0], preserve); 740 return 0;
730 741 rc = 0;
731 while (rc < cfs->shm_size && ret != Z_STREAM_END) 742 if (strm.avail_out > 0)
732 { 743 {
733 if (cfs->strm.avail_in == 0) 744 /* got left-over decompressed data from previous round! */
745 in = strm.avail_out;
746 if (in > size)
747 in = size;
748 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
749 cfs->fpos += in;
750 cfs->result_pos += in;
751 rc += in;
752 }
753 ret = Z_OK;
754 while ( (rc < size) && (Z_STREAM_END != ret) )
734 { 755 {
735 int64_t count = bfds_read (cfs->bfds, &cfs->strm.next_in, COM_CHUNK_SIZE); 756 /* read block from original data source */
736 if (count <= 0) 757 in = bfds_read (cfs->bfds,
737 return 0; 758 buf, sizeof (buf));
759 if (in <= 0)
760 return -1; /* unexpected EOF */
761 cfs->strm.next_in = buf;
738 cfs->strm.avail_in = (uInt) count; 762 cfs->strm.avail_in = (uInt) count;
763 cfs->strm.next_out = cfs->result;
764 cfs->strm.avail_out = COM_CHUNK_SIZE;
765 cfs->result_pos = 0;
766 ret = inflate (&cfs->strm, Z_SYNC_FLUSH);
767 if ( (Z_OK != ret) && (Z_STREAM_END != ret) )
768 return -1; /* unexpected error */
769 /* go backwards by the number of bytes left in the buffer */
770 if (-1 == bfds_seek (cfs->bfds, - cfs->strm.avail_in, SEEK_CUR))
771 return -1;
772 /* copy decompressed bytes to target buffer */
773 in = cfs->strm.total_out;
774 if (in > size - rc)
775 in = size - rc;
776 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
777 cfs->fpos += in;
778 cfs->result_pos += in;
779 rc += in;
739 } 780 }
740 cfs->strm.next_out = &((unsigned char *)cfs->shm_ptr)[rc]; 781 if (Z_STREAM_END == ret)
741 cfs->strm.avail_out = cfs->shm_size - rc; 782 cfs->uncompressed_size = cfs->fpos;
742 ret = inflate (&cfs->strm, Z_SYNC_FLUSH); 783 return rc;
743 if (ret != Z_OK && ret != Z_STREAM_END)
744 return 0;
745 rc = cfs->strm.total_out - total;
746 }
747 if (ret == Z_STREAM_END)
748 cfs->uncompressed_size = cfs->strm.total_out;
749 cfs->shm_pos = preserve;
750 cfs->shm_buf_size = rc + preserve;
751 return 1;
752} 784}
753 785
754 786
755/** 787/**
756 * Re-fills shm with new uncompressed data, preserving the last 788 * Fills 'data' with new uncompressed data. Does the actual
757 * 'preserve' bytes of existing data as the first 'preserve' bytes 789 * decompression. Will set uncompressed_size on the end of compressed
758 * of the new data. 790 * stream.
759 * Does the actual decompression. Will set uncompressed_size on
760 * the end of compressed stream.
761 * 791 *
762 * @param cfs cfs to read from 792 * @param cfs cfs to read from
763 * @param preserve number of bytes to preserve (0 to discard all old data) 793 * @param data where to copy the data
764 * @return number of bytes in shm. 0 if no more data can be uncompressed, -1 on error 794 * @param size number of bytes available in data
795 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
765 */ 796 */
766static int 797static ssize_t
767cfs_read_bz2 (struct CompressedFileSource *cfs, int64_t preserve) 798cfs_read_bz2 (struct CompressedFileSource *cfs,
799 void *data,
800 size_t size)
768{ 801{
769 return -1; 802 return -1;
770} 803}
771 804
772 805
773/** 806/**
774 * Re-fills shm with new uncompressed data, preserving the last 807 * Fills 'data' with new uncompressed data. Does the actual
775 * 'preserve' bytes of existing data as the first 'preserve' bytes 808 * decompression. Will set uncompressed_size on the end of compressed
776 * of the new data. 809 * stream.
777 * Does the actual decompression. Will set uncompressed_size on
778 * the end of compressed stream.
779 * 810 *
780 * @param cfs cfs to read from 811 * @param cfs cfs to read from
781 * @param preserve number of bytes to preserve (0 to discard all old data) 812 * @param data where to copy the data
782 * @return number of bytes in shm. 0 if no more data can be uncompressed, -1 on error 813 * @param size number of bytes available in data
814 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
783 */ 815 */
784static int64_t 816static ssize_t
785cfs_read (struct CompressedFileSource *cfs, int64_t preserve) 817cfs_read (struct CompressedFileSource *cfs,
818 void *data,
819 size_t size)
786{ 820{
787 switch (cfs->compression_type) 821 switch (cfs->compression_type)
788 { 822 {
@@ -801,72 +835,44 @@ cfs_read (struct CompressedFileSource *cfs, int64_t preserve)
801 * requires seeking backwards beyond the boundaries of the buffer, resets the 835 * requires seeking backwards beyond the boundaries of the buffer, resets the
802 * stream and repeats decompression from the beginning to 'position'. 836 * stream and repeats decompression from the beginning to 'position'.
803 * 837 *
804 * @param cfds cfs to seek on 838 * @param cfs cfs to seek on
805 * @param position new starting point for the buffer
806 * @return new absolute buffer position, -1 on error or EOS
807 */
808static int64_t
809cfs_seek_zlib (struct CompressedFileSource *cfs, int64_t position)
810{
811 int64_t ret;
812
813 if (position > cfs->strm.total_out - cfs->shm_buf_size && position < cfs->strm.total_out)
814 {
815 ret = cfs_read (cfs, cfs->strm.total_out - position);
816 if (ret < 0)
817 return ret;
818 return position;
819 }
820 while (position >= cfs->strm.total_out)
821 {
822 if (0 > (ret = cfs_read (cfs, 0)))
823 return ret;
824 if (ret == 0)
825 return position;
826 }
827 if (position < cfs->strm.total_out && position > cfs->strm.total_out - cfs->shm_buf_size)
828 return cfs->strm.total_out - cfs->shm_buf_size;
829 return -1;
830}
831
832
833/**
834 * Moves the buffer to 'position' in uncompressed stream. If position
835 * requires seeking backwards beyond the boundaries of the buffer, resets the
836 * stream and repeats decompression from the beginning to 'position'.
837 *
838 * @param cfds cfs to seek on
839 * @param position new starting point for the buffer 839 * @param position new starting point for the buffer
840 * @return new absolute buffer position, -1 on error or EOS 840 * @return new absolute buffer position, -1 on error or EOS
841 */ 841 */
842static int64_t 842static int64_t
843cfs_seek_bz2 (struct CompressedFileSource *cfs, int64_t position) 843cfs_seek (struct CompressedFileSource *cfs,
844 uint64_t position)
844{ 845{
845 return -1; 846 int64_t delta;
846} 847
847 848 delta = position - cfs->fpos;
848 849 if (delta < 0)
849/**
850 * Moves the buffer to 'position' in uncompressed steam. If position
851 * requires seeking backwards beyond the boundaries of the buffer, resets the
852 * stream and repeats decompression from the beginning to 'position'.
853 *
854 * @param cfds cfs to seek on
855 * @param position new starting point for the buffer
856 * @return new absolute buffer position, -1 on error or EOS
857 */
858static int64_t
859cfs_seek (struct CompressedFileSource *cfs, int64_t position)
860{
861 switch (cfs->compression_type)
862 { 850 {
863 case COMP_TYPE_ZLIB: 851 if (result_pos >= - delta)
864 return cfs_seek_zlib (cfs, position); 852 {
865 case COMP_TYPE_BZ2: 853 result_pos += delta;
866 return cfs_seek_bz2 (cfs, position); 854 delta = 0;
867 default: 855 }
868 return -1; 856 else
857 {
858 if (-1 == cfs_reset_stream (cfs))
859 return -1;
860 delta = position;
861 }
862 }
863 while (delta > 0)
864 {
865 char buf[COM_CHUNK_SIZE];
866 size_t max;
867 int64_t ret;
868
869 max = (sizeof (buf) > delta) ? delta : sizeof (buf);
870 ret = cfs_read (cfs, buf, max);
871 if (-1 == ret)
872 return -1;
873 delta -= ret;
869 } 874 }
875 return cfs->fpos;
870} 876}
871 877
872 878
@@ -879,163 +885,239 @@ cfs_seek (struct CompressedFileSource *cfs, int64_t position)
879 * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression 885 * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression
880 */ 886 */
881static enum ExtractorCompressionType 887static enum ExtractorCompressionType
882get_compression_type (const unsigned char *data, 888get_compression_type (struct BufferedFileDataSource *bfds)
883 int fd,
884 int64_t fsize)
885{ 889{
886 void *read_data = NULL; 890 unsigned char read_data[3];
887 size_t read_data_size = 0;
888 ssize_t read_result;
889 enum ExtractorCompressionType result = COMP_TYPE_INVALID;
890 891
891 if ((MIN_COMPRESSED_HEADER < 0) || (fsize < MIN_COMPRESSED_HEADER)) 892 if (0 != bfds_seek (bfds, 0, SEEK_SET))
892 {
893 return COMP_TYPE_INVALID; 893 return COMP_TYPE_INVALID;
894 } 894 if (sizeof (read_data) !=
895 if (data == NULL) 895 bfds_read (bfds, read_data, sizeof (read_data)))
896 { 896 return COMP_TYPE_UNDEFINED;
897 int64_t position; 897
898 read_data_size = COMPRESSED_DATA_PROBE_SIZE;
899 read_data = malloc (read_data_size);
900 if (read_data == NULL)
901 return -1;
902#if WINDOWS
903 position = _lseeki64 (fd, 0, SEEK_CUR);
904#elif HAVE_LSEEK64
905 position = lseek64 (fd, 0, SEEK_CUR);
906#else
907 position = (int64_t) lseek (fd, 0, SEEK_CUR);
908#endif
909 read_result = READ (fd, read_data, read_data_size);
910#if WINDOWS
911 position = _lseeki64 (fd, position, SEEK_SET);
912#elif HAVE_LSEEK64
913 position = lseek64 (fd, position, SEEK_SET);
914#else
915 position = lseek (fd, (off_t) position, SEEK_SET);
916#endif
917 if (read_result != read_data_size)
918 {
919 free (read_data);
920 return COMP_TYPE_UNDEFINED;
921 }
922 data = (const void *) read_data;
923 }
924#if HAVE_ZLIB 898#if HAVE_ZLIB
925 if ((fsize >= MIN_ZLIB_HEADER) && (data[0] == 0x1f) && (data[1] == 0x8b) && (data[2] == 0x08)) 899 if ( (bdfs->fsize >= MIN_ZLIB_HEADER) &&
926 result = COMP_TYPE_ZLIB; 900 (data[0] == 0x1f) &&
901 (data[1] == 0x8b) &&
902 (data[2] == 0x08) )
903 return COMP_TYPE_ZLIB;
927#endif 904#endif
928#if HAVE_LIBBZ2 905#if HAVE_LIBBZ2
929 if ((fsize >= MIN_BZ2_HEADER) && (data[0] == 'B') && (data[1] == 'Z') && (data[2] == 'h')) 906 if ( (bdfs->fsize >= MIN_BZ2_HEADER) &&
930 result = COMP_TYPE_BZ2; 907 (data[0] == 'B') &&
908 (data[1] == 'Z') &&
909 (data[2] == 'h'))
910 return COMP_TYPE_BZ2;
931#endif 911#endif
932 if (read_data != NULL) 912 return COMP_TYPE_INVALID;
933 free (read_data);
934 return result;
935} 913}
936 914
937 915
938#if 0 916/**
939 917 * Handle to a datasource we can use for the plugins.
940 enum ExtractorCompressionType compression_type = -1; 918 */
941 struct CompressedFileSource *cfs = NULL; 919struct EXTRACTOR_Datasource
942 int fd = -1; 920{
943 struct stat64 fstatbuf; 921
944 int64_t fsize = 0; 922 /**
945 923 * Underlying buffered data source.
946 /* If data is not given, then we need to read it from the file. Try opening it */ 924 */
947 if ((data == NULL) && 925 struct BufferedFileDataSource *bfds;
948 (filename != NULL) &&
949 (0 == STAT64(filename, &fstatbuf)) &&
950 (!S_ISDIR(fstatbuf.st_mode)) &&
951 (-1 != (fd = file_open (filename,
952 O_RDONLY | O_LARGEFILE))))
953 {
954 /* Empty files are of no interest */
955 fsize = fstatbuf.st_size;
956 if (fsize == 0)
957 {
958 close(fd);
959 return;
960 }
961 }
962
963 /* Data is not given, and we've failed to open the file with data -> exit */
964 if ((fsize == 0) && (data == NULL))
965 return;
966 /* fsize is now size of the data OR size of the file */
967 if (data != NULL)
968 fsize = size;
969
970 errno = 0;
971
972 /* Peek at first few bytes of the file (or of the data), and see if it's compressed. */
973 compression_type = get_compression_type (data, fd, fsize);
974 if (compression_type < 0)
975 {
976 /* errno is set by get_compression_type () */
977 if (fd != -1)
978 close (fd);
979 return;
980 }
981 926
927 /**
928 * Compressed file source (NULL if not applicable).
929 */
930 struct CompressedFileSource *cfs;
931
932 /**
933 * Underlying file descriptor, -1 for none.
934 */
935 int fd;
936};
937
938
939/**
940 * Create a datasource from a file on disk.
941 *
942 * @param filename name of the file on disk
943 * @param proc metadata callback to call with meta data found upon opening
944 * @param proc_cls callback cls
945 * @return handle to the datasource, NULL on error
946 */
947struct EXTRACTOR_Datasource *
948EXTRACTOR_datasource_create_from_file_ (const char *filename,
949 EXTRACTOR_MetaDataProcessor proc,
950 void *proc_cls)
951{
982 struct BufferedFileDataSource *bfds; 952 struct BufferedFileDataSource *bfds;
983 bfds = bfds_new (data, fd, fsize); 953 struct EXTRACTOR_Datasource *ds;
984 if (bfds == NULL) 954 enum ExtractorCompressionType ct;
985 return; 955 int fd;
986 956 struct stat sb;
987 if (compression_type > 0) 957 int64_t fsize;
988 { 958
989 int icr = 0; 959 if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE)))
990 /* Set up a decompressor. 960 return NULL;
991 * Will also report compression-related metadata to the caller. 961 if ( (0 != fstat (fd, &sb)) ||
992 */ 962 (S_ISDIR (fstatbuf.st_mode)) )
993 cfs = cfs_new (bfds, fsize, compression_type, proc, proc_cls); 963 {
994 if (cfs == NULL) 964 (void) close (fd);
965 return NULL;
966 }
967 fsize = (int64_t) sb.st_size;
968 if (0 == fsize)
969 {
970 (void) close (fd);
971 return NULL;
972 }
973 bfds = bfds_new (NULL, fd, fsize);
974 if (NULL == bfds)
995 { 975 {
996 if (fd != -1) 976 (void) close (fd);
997 close (fd); 977 return NULL;
998 errno = EILSEQ;
999 return;
1000 } 978 }
1001 icr = cfs_init_decompressor (cfs, proc, proc_cls); 979 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1002 if (icr < 0)
1003 { 980 {
1004 if (fd != -1) 981 bfds_delete (bfds);
1005 close (fd); 982 return NULL;
1006 errno = EILSEQ;
1007 return;
1008 } 983 }
1009 else if (icr == 0) 984 ds->bfds = bfds;
985 ds->fd;
986 ct = get_compression_type (bfds);
987 if ( (COMP_TYPE_ZLIB == ct) ||
988 (COMP_TYPE_BZ2 == ct) )
989 ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
990 if (NULL == ds->cfs)
1010 { 991 {
1011 if (fd != -1) 992 bfds_delete (bfds);
1012 close (fd); 993 free (ds);
1013 errno = 0; 994 (void) close (fd);
1014 return; 995 return NULL;
1015 } 996 }
1016 } 997 return ds;
998}
1017 999
1018 1000
1019#endif 1001/**
1002 * Create a datasource from a buffer in memory.
1003 *
1004 * @param buf data in memory
1005 * @param size number of bytes in 'buf'
1006 * @param proc metadata callback to call with meta data found upon opening
1007 * @param proc_cls callback cls
1008 * @return handle to the datasource
1009 */
1010struct EXTRACTOR_Datasource *
1011EXTRACTOR_datasource_create_from_buffer_ (const char *buf,
1012 size_t size,
1013 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
1014{
1015 struct BufferedFileDataSource *bfds;
1016 struct EXTRACTOR_Datasource *ds;
1017 enum ExtractorCompressionType ct;
1020 1018
1019 if (0 == size)
1020 return NULL;
1021 if (NULL == (bfds = bfds_new (buf, -1, size)))
1022 return NULL;
1023 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
1024 {
1025 bfds_delete (bfds);
1026 return NULL;
1027 }
1028 ds->bfds = bfds;
1029 ds->fd;
1030 ct = get_compression_type (bfds);
1031 if ( (COMP_TYPE_ZLIB == ct) ||
1032 (COMP_TYPE_BZ2 == ct) )
1033 ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
1034 if (NULL == ds->cfs)
1035 {
1036 bfds_delete (bfds);
1037 free (ds);
1038 return NULL;
1039 }
1040 return ds;
1041}
1021 1042
1022 1043
1023/** 1044/**
1024 * Destroy a data source. 1045 * Destroy a data source.
1025 * 1046 *
1026 * @param datasource source to destroy 1047 * @param ds source to destroy
1027 */ 1048 */
1028void 1049void
1029EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *datasource) 1050EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds)
1051{
1052 if (NULL != ds->cfs)
1053 cfs_destroy (ds->cfs);
1054 bfds_delete (ds->bfds);
1055 if (-1 != ds->fd)
1056 (void) close (ds->fd);
1057 free (ds);
1058}
1059
1060
1061/**
1062 * Make 'size' bytes of data from the data source available at 'data'.
1063 *
1064 * @param cls must be a 'struct EXTRACTOR_Datasource'
1065 * @param data where the data should be copied to
1066 * @param size maximum number of bytes requested
1067 * @return number of bytes now available in data (can be smaller than 'size'),
1068 * -1 on error
1069 */
1070ssize_t
1071EXTRACTOR_datasource_read_ (void *cls,
1072 void *data,
1073 size_t size)
1074{
1075 struct EXTRACTOR_Datasource *ds = cls;
1076
1077 if (NULL != ds->cfs)
1078 return cfs_read (ds->cfs, data, size);
1079 return bdfs_read (ds->bdfs, data, size);
1080}
1081
1082
1083/**
1084 * Seek in the datasource. Use 'SEEK_CUR' for whence and 'pos' of 0 to
1085 * obtain the current position in the file.
1086 *
1087 * @param cls must be a 'struct EXTRACTOR_Datasource'
1088 * @param pos position to seek (see 'man lseek')
1089 * @param whence how to seek (absolute to start, relative, absolute to end)
1090 * @return new absolute position, UINT64_MAX on error (i.e. desired position
1091 * does not exist)
1092 */
1093int64_t
1094EXTRACTOR_datasource_seek_ (void *cls,
1095 uint64_t pos,
1096 int whence)
1030{ 1097{
1031 if (cfs != NULL) 1098 struct EXTRACTOR_Datasource *ds = cls;
1032 { 1099
1033 cfs_deinit_decompressor (cfs); 1100 if (NULL != ds->cfs)
1034 cfs_delete (cfs); 1101 return cfs_seek (ds->cfs, pos, whence);
1035 } 1102 return bdfs_seek (ds->bdfs, pos, whence);
1036 bfds_delete (bfds);
1037 if (-1 != fd)
1038 close(fd);
1039} 1103}
1040 1104
1105
1106/**
1107 * Determine the overall size of the data source (after compression).
1108 *
1109 * @param cls must be a 'struct EXTRACTOR_Datasource'
1110 * @return overall file size, UINT64_MAX on error or unknown
1111 */
1112int64_t
1113EXTRACTOR_datasource_get_size_ (void *cls)
1114{
1115 struct EXTRACTOR_Datasource *ds = cls;
1116
1117 if (NULL != ds->cfs)
1118 return cfs_seek (ds->cfs, pos, whence);
1119 return bdfs_seek (ds->bdfs, pos, whence);
1120}
1121
1122
1041/* end of extractor_datasource.c */ 1123/* end of extractor_datasource.c */