extractor_datasource.c (34206B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 /** 21 * @file main/extractor_datasource.c 22 * @brief random access and possibly decompression of data from buffer in memory or file on disk 23 * @author Christian Grothoff 24 */ 25 #include "platform.h" 26 #include "extractor_logging.h" 27 #include "extractor_datasource.h" 28 29 #if HAVE_LIBBZ2 30 #include <bzlib.h> 31 #define MIN_BZ2_HEADER 4 32 #ifndef MIN_COMPRESSED_HEADER 33 #define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER 34 #endif 35 #endif 36 37 #if HAVE_ZLIB 38 #include <zlib.h> 39 #define MIN_ZLIB_HEADER 12 40 #ifndef MIN_COMPRESSED_HEADER 41 #define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER 42 #endif 43 #endif 44 45 #ifndef MIN_COMPRESSED_HEADER 46 #define MIN_COMPRESSED_HEADER -1 47 #endif 48 49 #ifndef O_LARGEFILE 50 #define O_LARGEFILE 0 51 #endif 52 53 /** 54 * Maximum size of an IO buffer. 55 */ 56 #define MAX_READ (4 * 1024 * 1024) 57 58 /** 59 * Data is read from the source and shoved into decompressor 60 * in chunks this big. 61 */ 62 #define COM_CHUNK_SIZE (16 * 1024) 63 64 65 /** 66 * Enum with the various possible types of compression supported. 67 */ 68 enum ExtractorCompressionType 69 { 70 /** 71 * We cannot tell from the data (header incomplete). 72 */ 73 COMP_TYPE_UNDEFINED = -1, 74 75 /** 76 * Invalid header (likely uncompressed) 77 */ 78 COMP_TYPE_INVALID = 0, 79 80 /** 81 * libz / gzip compression. 82 */ 83 COMP_TYPE_ZLIB = 1, 84 85 /** 86 * bz2 compression 87 */ 88 COMP_TYPE_BZ2 = 2 89 }; 90 91 92 /** 93 * Abstraction of the data source (file or a memory buffer) 94 * for the decompressor. 95 */ 96 struct BufferedFileDataSource 97 { 98 /** 99 * Pointer to the buffer to read from (may be NULL) 100 */ 101 const void *data; 102 103 /** 104 * A buffer to read into. For fd != -1: when data != NULL, 105 * data is used directly. 106 */ 107 void *buffer; 108 109 /** 110 * Size of the file (or the data buffer) 111 */ 112 uint64_t fsize; 113 114 /** 115 * Position of the buffer in the file. 116 */ 117 uint64_t fpos; 118 119 /** 120 * Position within the buffer. Our absolute offset in the file 121 * is thus 'fpos + buffer_pos'. 122 */ 123 size_t buffer_pos; 124 125 /** 126 * Number of valid bytes in the buffer (<= buffer_size) 127 */ 128 size_t buffer_bytes; 129 130 /** 131 * Allocated size of the buffer 132 */ 133 size_t buffer_size; 134 135 /** 136 * Descriptor of the file to read data from (may be -1) 137 */ 138 int fd; 139 140 }; 141 142 143 /** 144 * An object from which uncompressed data can be read 145 */ 146 struct CompressedFileSource 147 { 148 /** 149 * The source of data 150 */ 151 struct BufferedFileDataSource *bfds; 152 153 /** 154 * Decompression target buffer. 155 */ 156 char result[COM_CHUNK_SIZE]; 157 158 /** 159 * At which offset in 'result' is 'fpos'? 160 */ 161 size_t result_pos; 162 163 /** 164 * Size of the source (same as bfds->fsize) 165 */ 166 int64_t fsize; 167 168 /** 169 * Position within the (decompressed) source 170 */ 171 int64_t fpos; 172 173 /** 174 * Total size of the uncompressed data. Remains -1 until 175 * decompression is finished. 176 */ 177 int64_t uncompressed_size; 178 179 #if HAVE_LIBBZ2 180 /** 181 * BZ2 stream object 182 */ 183 bz_stream bstrm; 184 #endif 185 186 #if HAVE_ZLIB 187 /** 188 * ZLIB stream object 189 */ 190 z_stream strm; 191 192 /** 193 * Length of gzip header (may be 0, in that case ZLIB parses the header) 194 */ 195 int gzip_header_length; 196 #endif 197 198 /** 199 * The type of compression used in the source 200 */ 201 enum ExtractorCompressionType compression_type; 202 203 }; 204 205 206 /** 207 * Makes bfds seek to 'pos' and read a chunk of bytes there. 208 * Changes bfds->fpos, bfds->buffer_bytes and bfds->buffer_pos. 209 * Does almost nothing for memory-backed bfds. 210 * 211 * @param bfds bfds 212 * @param pos position 213 * @return 0 on success, -1 on error 214 */ 215 static int 216 bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds, 217 uint64_t pos) 218 { 219 int64_t position; 220 ssize_t rd; 221 222 if (pos > bfds->fsize) 223 { 224 LOG ("Invalid seek operation\n"); 225 return -1; /* invalid */ 226 } 227 if (NULL == bfds->buffer) 228 { 229 bfds->buffer_pos = pos; 230 return 0; 231 } 232 position = (int64_t) lseek (bfds->fd, pos, SEEK_SET); 233 if (position < 0) 234 { 235 LOG_STRERROR ("lseek"); 236 return -1; 237 } 238 bfds->fpos = position; 239 bfds->buffer_pos = 0; 240 rd = read (bfds->fd, bfds->buffer, bfds->buffer_size); 241 if (rd < 0) 242 { 243 LOG_STRERROR ("read"); 244 return -1; 245 } 246 bfds->buffer_bytes = rd; 247 return 0; 248 } 249 250 251 /** 252 * Creates a bfds 253 * 254 * @param data data buffer to use as a source (NULL if fd != -1) 255 * @param fd file descriptor to use as a source (-1 if data != NULL) 256 * @param fsize size of the file (or the buffer) 257 * @return newly allocated bfds 258 */ 259 static struct BufferedFileDataSource * 260 bfds_new (const void *data, 261 int fd, 262 int64_t fsize) 263 { 264 struct BufferedFileDataSource *result; 265 size_t xtra; 266 267 if (fsize > MAX_READ) 268 xtra = MAX_READ; 269 else 270 xtra = (size_t) fsize; 271 if ( (-1 == fd) && (NULL == data) ) 272 { 273 LOG ("Invalid arguments\n"); 274 return NULL; 275 } 276 if ( (-1 != fd) && (NULL != data) ) 277 fd = -1; /* don't need fd */ 278 if (NULL != data) 279 xtra = 0; 280 if (NULL == (result = malloc (sizeof (struct BufferedFileDataSource) + xtra))) 281 { 282 LOG_STRERROR ("malloc"); 283 return NULL; 284 } 285 memset (result, 0, sizeof (struct BufferedFileDataSource)); 286 result->data = (NULL != data) ? data : &result[1]; 287 result->buffer = (NULL != data) ? NULL : &result[1]; 288 result->buffer_size = (NULL != data) ? fsize : xtra; 289 result->buffer_bytes = (NULL != data) ? fsize : 0; 290 result->fsize = fsize; 291 result->fd = fd; 292 bfds_pick_next_buffer_at (result, 0); 293 return result; 294 } 295 296 297 /** 298 * Unallocates bfds 299 * 300 * @param bfds bfds to deallocate 301 */ 302 static void 303 bfds_delete (struct BufferedFileDataSource *bfds) 304 { 305 free (bfds); 306 } 307 308 309 /** 310 * Makes bfds seek to 'pos' in 'whence' mode. 311 * Will try to seek within the buffer, will move the buffer location if 312 * the seek request falls outside of the buffer range. 313 * 314 * @param bfds bfds 315 * @param pos position to seek to 316 * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END) 317 * @return new absolute position, -1 on error 318 */ 319 static int64_t 320 bfds_seek (struct BufferedFileDataSource *bfds, 321 int64_t pos, int whence) 322 { 323 uint64_t npos; 324 size_t nbpos; 325 326 switch (whence) 327 { 328 case SEEK_CUR: 329 npos = bfds->fpos + bfds->buffer_pos + pos; 330 if (npos > bfds->fsize) 331 { 332 LOG ("Invalid seek operation to %lld from %llu (max is %llu)\n", 333 (long long) pos, 334 bfds->fpos + bfds->buffer_pos, 335 (unsigned long long) bfds->fsize); 336 return -1; 337 } 338 nbpos = bfds->buffer_pos + pos; 339 if ( (NULL == bfds->buffer) || 340 (nbpos < bfds->buffer_bytes) ) 341 { 342 bfds->buffer_pos = nbpos; 343 return npos; 344 } 345 if (0 != bfds_pick_next_buffer_at (bfds, 346 npos)) 347 { 348 LOG ("seek operation failed\n"); 349 return -1; 350 } 351 return npos; 352 case SEEK_END: 353 if (pos > 0) 354 { 355 LOG ("Invalid seek operation\n"); 356 return -1; 357 } 358 if (bfds->fsize < -pos) 359 { 360 LOG ("Invalid seek operation\n"); 361 return -1; 362 } 363 pos = bfds->fsize + pos; 364 /* fall-through! */ 365 case SEEK_SET: 366 if (pos < 0) 367 { 368 LOG ("Invalid seek operation\n"); 369 return -1; 370 } 371 if (pos > bfds->fsize) 372 { 373 LOG ("Invalid seek operation (%lld > %llu) %d\n", 374 (long long) pos, 375 (unsigned long long) bfds->fsize, 376 SEEK_SET == whence); 377 return -1; 378 } 379 if ( (NULL == bfds->buffer) || 380 ( (bfds->fpos <= pos) && 381 (bfds->fpos + bfds->buffer_bytes > pos) ) ) 382 { 383 bfds->buffer_pos = pos - bfds->fpos; 384 return pos; 385 } 386 if (0 != bfds_pick_next_buffer_at (bfds, pos)) 387 { 388 LOG ("seek operation failed\n"); 389 return -1; 390 } 391 ASSERT (pos == bfds->fpos + bfds->buffer_pos); 392 return pos; 393 } 394 return -1; 395 } 396 397 398 /** 399 * Fills 'buf_ptr' with a chunk of data. Will 400 * fail if 'count' exceeds buffer size. 401 * 402 * @param bfds bfds 403 * @param buf_ptr location to store data 404 * @param count number of bytes to read 405 * @return number of bytes (<= count) available at location pointed by buf_ptr, 406 * 0 for end of stream, -1 on error 407 */ 408 static ssize_t 409 bfds_read (struct BufferedFileDataSource *bfds, 410 void *buf_ptr, 411 size_t count) 412 { 413 char *cbuf = buf_ptr; 414 uint64_t old_off; 415 size_t avail; 416 size_t ret; 417 418 old_off = bfds->fpos + bfds->buffer_pos; 419 if (old_off == bfds->fsize) 420 return 0; /* end of stream */ 421 ret = 0; 422 while (count > 0) 423 { 424 if ( (bfds->buffer_bytes == bfds->buffer_pos) && 425 (0 != bfds_pick_next_buffer_at (bfds, 426 bfds->fpos + bfds->buffer_bytes)) ) 427 { 428 /* revert to original position, invalidate buffer */ 429 bfds->fpos = old_off; 430 bfds->buffer_bytes = 0; 431 bfds->buffer_pos = 0; 432 LOG ("read operation failed\n"); 433 return -1; /* getting more failed */ 434 } 435 avail = bfds->buffer_bytes - bfds->buffer_pos; 436 if (avail > count) 437 avail = count; 438 if (0 == avail) 439 break; 440 memcpy (&cbuf[ret], bfds->data + bfds->buffer_pos, avail); 441 bfds->buffer_pos += avail; 442 count -= avail; 443 ret += avail; 444 } 445 return ret; 446 } 447 448 449 #if HAVE_ZLIB 450 /** 451 * Initializes gz-decompression object. Might report metadata about 452 * compresse stream, if available. Resets the stream to the beginning. 453 * 454 * @param cfs cfs to initialize 455 * @param proc callback for metadata 456 * @param proc_cls callback cls 457 * @return 1 on success, 0 to terminate extraction, -1 on error 458 */ 459 static int 460 cfs_init_decompressor_zlib (struct CompressedFileSource *cfs, 461 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 462 { 463 unsigned int gzip_header_length = 10; 464 unsigned char hdata[12]; 465 ssize_t rsize; 466 467 if (0 != bfds_seek (cfs->bfds, 0, SEEK_SET)) 468 { 469 LOG ("Failed to seek to offset 0!\n"); 470 return -1; 471 } 472 /* Process gzip header */ 473 rsize = bfds_read (cfs->bfds, hdata, sizeof (hdata)); 474 if ( (-1 == rsize) || 475 (sizeof (hdata) > (size_t) rsize) ) 476 return -1; 477 if (0 != (hdata[3] & 0x4)) /* FEXTRA set */ 478 gzip_header_length += 2 + (hdata[10] & 0xff) + ((hdata[11] & 0xff) * 256); 479 480 if (0 != (hdata[3] & 0x8)) 481 { 482 /* FNAME set */ 483 char fname[1024]; 484 char *cptr; 485 size_t len; 486 ssize_t buf_bytes; 487 488 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, 489 SEEK_SET)) 490 { 491 LOG ("Corrupt gzip, failed to seek to end of header\n"); 492 return -1; 493 } 494 buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname)); 495 if (buf_bytes <= 0) 496 { 497 LOG ("Corrupt gzip, failed to read filename\n"); 498 return -1; 499 } 500 if (NULL == (cptr = memchr (fname, 0, buf_bytes))) 501 { 502 LOG ("Corrupt gzip, failed to read filename terminator\n"); 503 return -1; 504 } 505 len = cptr - fname; 506 if ( (NULL != proc) && 507 (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, 508 EXTRACTOR_METAFORMAT_C_STRING, "text/plain", 509 fname, 510 len)) ) 511 return 0; /* done */ 512 gzip_header_length += len + 1; 513 } 514 515 if (0 != (hdata[3] & 0x16)) 516 { 517 /* FCOMMENT set */ 518 char fcomment[1024]; 519 char *cptr; 520 ssize_t buf_bytes; 521 size_t len; 522 523 if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, 524 SEEK_SET)) 525 { 526 LOG ("Corrupt gzip, failed to seek to end of header\n"); 527 return -1; 528 } 529 buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment)); 530 if (buf_bytes <= 0) 531 { 532 LOG ("Corrupt gzip, failed to read comment\n"); 533 return -1; 534 } 535 if (NULL == (cptr = memchr (fcomment, 0, buf_bytes))) 536 { 537 LOG ("Corrupt gzip, failed to read comment terminator\n"); 538 return -1; 539 } 540 len = cptr - fcomment; 541 if ( (NULL != proc) && 542 (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT, 543 EXTRACTOR_METAFORMAT_C_STRING, "text/plain", 544 (const char *) fcomment, 545 len)) ) 546 return 0; /* done */ 547 gzip_header_length += len + 1; 548 } 549 if (0 != (hdata[3] & 0x2)) /* FCHRC set */ 550 gzip_header_length += 2; 551 memset (&cfs->strm, 0, sizeof (z_stream)); 552 553 #ifdef ZLIB_VERNUM 554 /* zlib will take care of its header */ 555 gzip_header_length = 0; 556 #endif 557 cfs->gzip_header_length = gzip_header_length; 558 559 if (cfs->gzip_header_length != 560 bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET)) 561 { 562 LOG ("Failed to seek to start to initialize gzip decompressor\n"); 563 return -1; 564 } 565 cfs->strm.avail_out = COM_CHUNK_SIZE; 566 /* 567 * note: maybe plain inflateInit(&strm) is adequate, 568 * it looks more backward-compatible also ; 569 * 570 * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ; 571 * there might be a better check. 572 */if (Z_OK != inflateInit2 (&cfs->strm, 573 #ifdef ZLIB_VERNUM 574 15 + 32 575 #else 576 -MAX_WBITS 577 #endif 578 )) 579 { 580 LOG ("Failed to initialize zlib decompression\n"); 581 return -1; 582 } 583 return 1; 584 } 585 586 587 #endif 588 589 590 #if HAVE_LIBBZ2 591 /** 592 * Initializes bz2-decompression object. Might report metadata about 593 * compresse stream, if available. Resets the stream to the beginning. 594 * 595 * @param cfs cfs to initialize 596 * @param proc callback for metadata 597 * @param proc_cls callback cls 598 * @return 1 on success, -1 on error 599 */ 600 static int 601 cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs, 602 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 603 { 604 if (0 != 605 bfds_seek (cfs->bfds, 0, SEEK_SET)) 606 { 607 LOG ("Failed to seek to start to initialize BZ2 decompressor\n"); 608 return -1; 609 } 610 memset (&cfs->bstrm, 0, sizeof (bz_stream)); 611 if (BZ_OK != 612 BZ2_bzDecompressInit (&cfs->bstrm, 0, 0)) 613 { 614 LOG ("Failed to initialize BZ2 decompressor\n"); 615 return -1; 616 } 617 cfs->bstrm.avail_out = COM_CHUNK_SIZE; 618 return 1; 619 } 620 621 622 #endif 623 624 625 /** 626 * Initializes decompression object. Might report metadata about 627 * compresse stream, if available. Resets the stream to the beginning. 628 * 629 * @param cfs cfs to initialize 630 * @param proc callback for metadata 631 * @param proc_cls callback cls 632 * @return 1 on success, 0 to terminate extraction, -1 on error 633 */ 634 static int 635 cfs_init_decompressor (struct CompressedFileSource *cfs, 636 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 637 { 638 cfs->result_pos = 0; 639 cfs->fpos = 0; 640 switch (cfs->compression_type) 641 { 642 #if HAVE_ZLIB 643 case COMP_TYPE_ZLIB: 644 return cfs_init_decompressor_zlib (cfs, proc, proc_cls); 645 #endif 646 #if HAVE_LIBBZ2 647 case COMP_TYPE_BZ2: 648 return cfs_init_decompressor_bz2 (cfs, proc, proc_cls); 649 #endif 650 default: 651 LOG ("invalid compression type selected\n"); 652 return -1; 653 } 654 } 655 656 657 #if HAVE_ZLIB 658 /** 659 * Deinitializes gz-decompression object. 660 * 661 * @param cfs cfs to deinitialize 662 * @return 1 on success, -1 on error 663 */ 664 static int 665 cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs) 666 { 667 inflateEnd (&cfs->strm); 668 return 1; 669 } 670 671 672 #endif 673 674 675 #if HAVE_LIBBZ2 676 /** 677 * Deinitializes bz2-decompression object. 678 * 679 * @param cfs cfs to deinitialize 680 * @return 1 on success, -1 on error 681 */ 682 static int 683 cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs) 684 { 685 BZ2_bzDecompressEnd (&cfs->bstrm); 686 return 1; 687 } 688 689 690 #endif 691 692 693 /** 694 * Deinitializes decompression object. 695 * 696 * @param cfs cfs to deinitialize 697 * @return 1 on success, -1 on error 698 */ 699 static int 700 cfs_deinit_decompressor (struct CompressedFileSource *cfs) 701 { 702 switch (cfs->compression_type) 703 { 704 #if HAVE_ZLIB 705 case COMP_TYPE_ZLIB: 706 return cfs_deinit_decompressor_zlib (cfs); 707 #endif 708 #if HAVE_LIBBZ2 709 case COMP_TYPE_BZ2: 710 return cfs_deinit_decompressor_bz2 (cfs); 711 #endif 712 default: 713 LOG ("invalid compression type selected\n"); 714 return -1; 715 } 716 } 717 718 719 /** 720 * Resets the compression stream to begin uncompressing 721 * from the beginning. Used at initialization time, and when 722 * seeking backward. 723 * 724 * @param cfs cfs to reset 725 * @return 1 on success, 0 to terminate extraction, 726 * -1 on error 727 */ 728 static int 729 cfs_reset_stream (struct CompressedFileSource *cfs) 730 { 731 if (-1 == cfs_deinit_decompressor (cfs)) 732 return -1; 733 return cfs_init_decompressor (cfs, NULL, NULL); 734 } 735 736 737 /** 738 * Destroy compressed file source. 739 * 740 * @param cfs source to destroy 741 */ 742 static void 743 cfs_destroy (struct CompressedFileSource *cfs) 744 { 745 cfs_deinit_decompressor (cfs); 746 free (cfs); 747 } 748 749 750 /** 751 * Allocates and initializes new cfs object. 752 * 753 * @param bfds data source to use 754 * @param fsize size of the source 755 * @param compression_type type of compression used 756 * @param proc metadata callback to call with meta data found upon opening 757 * @param proc_cls callback cls 758 * @return newly allocated cfs on success, NULL on error 759 */ 760 struct CompressedFileSource * 761 cfs_new (struct BufferedFileDataSource *bfds, 762 int64_t fsize, 763 enum ExtractorCompressionType compression_type, 764 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 765 { 766 struct CompressedFileSource *cfs; 767 768 if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource)))) 769 { 770 LOG_STRERROR ("malloc"); 771 return NULL; 772 } 773 memset (cfs, 0, sizeof (struct CompressedFileSource)); 774 cfs->compression_type = compression_type; 775 cfs->bfds = bfds; 776 cfs->fsize = fsize; 777 cfs->uncompressed_size = -1; 778 if (1 != cfs_init_decompressor (cfs, 779 proc, proc_cls)) 780 { 781 free (cfs); 782 return NULL; 783 } 784 return cfs; 785 } 786 787 788 #if HAVE_ZLIB 789 /** 790 * Fills 'data' with new uncompressed data. Does the actual 791 * decompression. Will set uncompressed_size on the end of compressed 792 * stream. 793 * 794 * @param cfds cfs to read from 795 * @param data where to copy the data 796 * @param size number of bytes available in data 797 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error 798 */ 799 static ssize_t 800 cfs_read_zlib (struct CompressedFileSource *cfs, 801 void *data, 802 size_t size) 803 { 804 char *dst = data; 805 int ret; 806 size_t rc; 807 ssize_t in; 808 unsigned char buf[COM_CHUNK_SIZE]; 809 810 if (cfs->fpos == cfs->uncompressed_size) 811 { 812 /* end of file */ 813 return 0; 814 } 815 rc = 0; 816 if (COM_CHUNK_SIZE > cfs->strm.avail_out + cfs->result_pos) 817 { 818 /* got left-over decompressed data from previous round! */ 819 in = COM_CHUNK_SIZE - (cfs->strm.avail_out + cfs->result_pos); 820 if (in > size) 821 in = size; 822 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); 823 cfs->fpos += in; 824 cfs->result_pos += in; 825 rc += in; 826 } 827 ret = Z_OK; 828 while ( (rc < size) && (Z_STREAM_END != ret) ) 829 { 830 /* read block from original data source */ 831 in = bfds_read (cfs->bfds, 832 buf, sizeof (buf)); 833 if (in < 0) 834 { 835 LOG ("unexpected EOF\n"); 836 return -1; /* unexpected EOF */ 837 } 838 if (0 == in) 839 { 840 cfs->uncompressed_size = cfs->fpos; 841 return rc; 842 } 843 cfs->strm.next_in = buf; 844 cfs->strm.avail_in = (uInt) in; 845 cfs->strm.next_out = (unsigned char *) cfs->result; 846 cfs->strm.avail_out = COM_CHUNK_SIZE; 847 cfs->result_pos = 0; 848 ret = inflate (&cfs->strm, Z_SYNC_FLUSH); 849 if ( (Z_OK != ret) && (Z_STREAM_END != ret) ) 850 { 851 LOG ("unexpected gzip inflate error: %d\n", ret); 852 return -1; /* unexpected error */ 853 } 854 /* go backwards by the number of bytes left in the buffer */ 855 if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->strm.avail_in, SEEK_CUR)) 856 { 857 LOG ("seek failed\n"); 858 return -1; 859 } 860 /* copy decompressed bytes to target buffer */ 861 in = COM_CHUNK_SIZE - cfs->strm.avail_out; 862 if (in > size - rc) 863 { 864 if (Z_STREAM_END == ret) 865 { 866 cfs->uncompressed_size = cfs->fpos + in; 867 ret = Z_OK; 868 } 869 in = size - rc; 870 } 871 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); 872 cfs->fpos += in; 873 cfs->result_pos += in; 874 rc += in; 875 } 876 if (Z_STREAM_END == ret) 877 { 878 cfs->uncompressed_size = cfs->fpos; 879 } 880 return rc; 881 } 882 883 884 #endif 885 886 887 #if HAVE_LIBBZ2 888 /** 889 * Fills 'data' with new uncompressed data. Does the actual 890 * decompression. Will set uncompressed_size on the end of compressed 891 * stream. 892 * 893 * @param cfds cfs to read from 894 * @param data where to copy the data 895 * @param size number of bytes available in data 896 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error 897 */ 898 static ssize_t 899 cfs_read_bz2 (struct CompressedFileSource *cfs, 900 void *data, 901 size_t size) 902 { 903 char *dst = data; 904 int ret; 905 size_t rc; 906 ssize_t in; 907 char buf[COM_CHUNK_SIZE]; 908 909 if (cfs->fpos == cfs->uncompressed_size) 910 { 911 /* end of file */ 912 return 0; 913 } 914 rc = 0; 915 if (COM_CHUNK_SIZE > cfs->bstrm.avail_out + cfs->result_pos) 916 { 917 /* got left-over decompressed data from previous round! */ 918 in = COM_CHUNK_SIZE - (cfs->bstrm.avail_out + cfs->result_pos); 919 if (in > size) 920 in = size; 921 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); 922 cfs->fpos += in; 923 cfs->result_pos += in; 924 rc += in; 925 } 926 ret = BZ_OK; 927 while ( (rc < size) && (BZ_STREAM_END != ret) ) 928 { 929 /* read block from original data source */ 930 in = bfds_read (cfs->bfds, 931 buf, sizeof (buf)); 932 if (in < 0) 933 { 934 LOG ("unexpected EOF\n"); 935 return -1; /* unexpected EOF */ 936 } 937 if (0 == in) 938 { 939 cfs->uncompressed_size = cfs->fpos; 940 return rc; 941 } 942 cfs->bstrm.next_in = buf; 943 cfs->bstrm.avail_in = (unsigned int) in; 944 cfs->bstrm.next_out = cfs->result; 945 cfs->bstrm.avail_out = COM_CHUNK_SIZE; 946 cfs->result_pos = 0; 947 ret = BZ2_bzDecompress (&cfs->bstrm); 948 if ( (BZ_OK != ret) && (BZ_STREAM_END != ret) ) 949 { 950 LOG ("unexpected bzip2 decompress error: %d\n", ret); 951 return -1; /* unexpected error */ 952 } 953 /* go backwards by the number of bytes left in the buffer */ 954 if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->bstrm.avail_in, SEEK_CUR)) 955 { 956 LOG ("seek failed\n"); 957 return -1; 958 } 959 /* copy decompressed bytes to target buffer */ 960 in = COM_CHUNK_SIZE - cfs->bstrm.avail_out; 961 if (in > size - rc) 962 { 963 if (BZ_STREAM_END == ret) 964 { 965 cfs->uncompressed_size = cfs->fpos + in; 966 ret = BZ_OK; 967 } 968 in = size - rc; 969 } 970 memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); 971 cfs->fpos += in; 972 cfs->result_pos += in; 973 rc += in; 974 } 975 if (BZ_STREAM_END == ret) 976 { 977 cfs->uncompressed_size = cfs->fpos; 978 } 979 return rc; 980 } 981 982 983 #endif 984 985 986 /** 987 * Fills 'data' with new uncompressed data. Does the actual 988 * decompression. Will set uncompressed_size on the end of compressed 989 * stream. 990 * 991 * @param cfds cfs to read from 992 * @param data where to copy the data 993 * @param size number of bytes available in data 994 * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error 995 */ 996 static ssize_t 997 cfs_read (struct CompressedFileSource *cfs, 998 void *data, 999 size_t size) 1000 { 1001 switch (cfs->compression_type) 1002 { 1003 #if HAVE_ZLIB 1004 case COMP_TYPE_ZLIB: 1005 return cfs_read_zlib (cfs, data, size); 1006 #endif 1007 #if HAVE_LIBBZ2 1008 case COMP_TYPE_BZ2: 1009 return cfs_read_bz2 (cfs, data, size); 1010 #endif 1011 default: 1012 LOG ("invalid compression type selected\n"); 1013 return -1; 1014 } 1015 } 1016 1017 1018 /** 1019 * Moves the buffer to 'position' in uncompressed steam. If position 1020 * requires seeking backwards beyond the boundaries of the buffer, resets the 1021 * stream and repeats decompression from the beginning to 'position'. 1022 * 1023 * @param cfs cfs to seek on 1024 * @param position new starting point for the buffer 1025 * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END) 1026 * @return new absolute buffer position, -1 on error or EOS 1027 */ 1028 static int64_t 1029 cfs_seek (struct CompressedFileSource *cfs, 1030 int64_t position, 1031 int whence) 1032 { 1033 uint64_t nposition; 1034 int64_t delta; 1035 1036 switch (whence) 1037 { 1038 case SEEK_CUR: 1039 if (cfs->fpos + position < 0) 1040 { 1041 /* underflow */ 1042 LOG ("Invalid seek operation\n"); 1043 return -1; 1044 } 1045 if ( (-1 != cfs->uncompressed_size) && 1046 (cfs->fpos + position > cfs->uncompressed_size) ) 1047 { 1048 LOG ("Invalid seek operation\n"); 1049 return -1; 1050 } 1051 nposition = cfs->fpos + position; 1052 break; 1053 case SEEK_END: 1054 ASSERT (-1 != cfs->uncompressed_size); 1055 if (position > 0) 1056 { 1057 LOG ("Invalid seek operation\n"); 1058 return -1; 1059 } 1060 if (cfs->uncompressed_size < -position) 1061 { 1062 LOG ("Invalid seek operation\n"); 1063 return -1; 1064 } 1065 nposition = cfs->uncompressed_size + position; 1066 break; 1067 case SEEK_SET: 1068 if (position < 0) 1069 { 1070 LOG ("Invalid seek operation\n"); 1071 return -1; 1072 } 1073 if ( (-1 != cfs->uncompressed_size) && 1074 (cfs->uncompressed_size < position) ) 1075 { 1076 LOG ("Invalid seek operation\n"); 1077 return -1; 1078 } 1079 nposition = (uint64_t) position; 1080 break; 1081 default: 1082 LOG ("Invalid seek operation\n"); 1083 return -1; 1084 } 1085 delta = nposition - cfs->fpos; 1086 if (delta < 0) 1087 { 1088 if (cfs->result_pos >= -delta) 1089 { 1090 cfs->result_pos += delta; 1091 cfs->fpos += delta; 1092 delta = 0; 1093 } 1094 else 1095 { 1096 if (-1 == cfs_reset_stream (cfs)) 1097 { 1098 LOG ("Failed to restart compressed stream for seek operation\n"); 1099 return -1; 1100 } 1101 delta = nposition; 1102 } 1103 } 1104 while (delta > 0) 1105 { 1106 char buf[COM_CHUNK_SIZE]; 1107 size_t max; 1108 int64_t ret; 1109 1110 max = (sizeof (buf) > delta) ? delta : sizeof (buf); 1111 ret = cfs_read (cfs, buf, max); 1112 if (-1 == ret) 1113 { 1114 LOG ("Failed to read decompressed stream for seek operation\n"); 1115 return -1; 1116 } 1117 if (0 == ret) 1118 { 1119 LOG ( 1120 "Reached unexpected end of stream at %llu during seek operation to %llu (%d left)\n", 1121 (unsigned long long) cfs->fpos, 1122 (unsigned long long) nposition, 1123 delta); 1124 return -1; 1125 } 1126 ASSERT (ret <= delta); 1127 delta -= ret; 1128 } 1129 return cfs->fpos; 1130 } 1131 1132 1133 /** 1134 * Detect if we have compressed data on our hands. 1135 * 1136 * @param data pointer to a data buffer or NULL (in case fd is not -1) 1137 * @param fd a file to read data from, or -1 (if data is not NULL) 1138 * @param fsize size of data (if data is not NULL) or of file (if fd is not -1) 1139 * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression 1140 */ 1141 static enum ExtractorCompressionType 1142 get_compression_type (struct BufferedFileDataSource *bfds) 1143 { 1144 unsigned char read_data[3]; 1145 1146 if (0 != bfds_seek (bfds, 0, SEEK_SET)) 1147 return COMP_TYPE_INVALID; 1148 if (sizeof (read_data) != 1149 bfds_read (bfds, read_data, sizeof (read_data))) 1150 return COMP_TYPE_UNDEFINED; 1151 1152 #if HAVE_ZLIB 1153 if ( (bfds->fsize >= MIN_ZLIB_HEADER) && 1154 (read_data[0] == 0x1f) && 1155 (read_data[1] == 0x8b) && 1156 (read_data[2] == 0x08) ) 1157 return COMP_TYPE_ZLIB; 1158 #endif 1159 #if HAVE_LIBBZ2 1160 if ( (bfds->fsize >= MIN_BZ2_HEADER) && 1161 (read_data[0] == 'B') && 1162 (read_data[1] == 'Z') && 1163 (read_data[2] == 'h')) 1164 return COMP_TYPE_BZ2; 1165 #endif 1166 return COMP_TYPE_INVALID; 1167 } 1168 1169 1170 /** 1171 * Handle to a datasource we can use for the plugins. 1172 */ 1173 struct EXTRACTOR_Datasource 1174 { 1175 1176 /** 1177 * Underlying buffered data source. 1178 */ 1179 struct BufferedFileDataSource *bfds; 1180 1181 /** 1182 * Compressed file source (NULL if not applicable). 1183 */ 1184 struct CompressedFileSource *cfs; 1185 1186 /** 1187 * Underlying file descriptor, -1 for none. 1188 */ 1189 int fd; 1190 }; 1191 1192 1193 /** 1194 * Create a datasource from a file on disk. 1195 * 1196 * @param filename name of the file on disk 1197 * @param proc metadata callback to call with meta data found upon opening 1198 * @param proc_cls callback cls 1199 * @return handle to the datasource, NULL on error 1200 */ 1201 struct EXTRACTOR_Datasource * 1202 EXTRACTOR_datasource_create_from_file_ (const char *filename, 1203 EXTRACTOR_MetaDataProcessor proc, 1204 void *proc_cls) 1205 { 1206 struct BufferedFileDataSource *bfds; 1207 struct EXTRACTOR_Datasource *ds; 1208 enum ExtractorCompressionType ct; 1209 int fd; 1210 struct stat sb; 1211 int64_t fsize; 1212 int winmode = 0; 1213 #if WINDOWS 1214 winmode = O_BINARY; 1215 #endif 1216 1217 if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE | winmode))) 1218 { 1219 LOG_STRERROR_FILE ("open", filename); 1220 return NULL; 1221 } 1222 if ( (0 != fstat (fd, &sb)) || 1223 (S_ISDIR (sb.st_mode)) ) 1224 { 1225 if (! S_ISDIR (sb.st_mode)) 1226 LOG_STRERROR_FILE ("fstat", filename); 1227 else 1228 LOG ("Skipping directory `%s'\n", filename); 1229 (void) close (fd); 1230 return NULL; 1231 } 1232 fsize = (int64_t) sb.st_size; 1233 if (0 == fsize) 1234 { 1235 (void) close (fd); 1236 return NULL; 1237 } 1238 bfds = bfds_new (NULL, fd, fsize); 1239 if (NULL == bfds) 1240 { 1241 (void) close (fd); 1242 return NULL; 1243 } 1244 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource)))) 1245 { 1246 LOG_STRERROR ("malloc"); 1247 bfds_delete (bfds); 1248 (void) close (fd); 1249 return NULL; 1250 } 1251 ds->bfds = bfds; 1252 ds->fd = fd; 1253 ds->cfs = NULL; 1254 ct = get_compression_type (bfds); 1255 if ( (COMP_TYPE_ZLIB == ct) || 1256 (COMP_TYPE_BZ2 == ct) ) 1257 { 1258 ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls); 1259 if (NULL == ds->cfs) 1260 { 1261 LOG ("Failed to initialize decompressor\n"); 1262 bfds_delete (bfds); 1263 free (ds); 1264 (void) close (fd); 1265 return NULL; 1266 } 1267 } 1268 return ds; 1269 } 1270 1271 1272 /** 1273 * Create a datasource from a buffer in memory. 1274 * 1275 * @param buf data in memory 1276 * @param size number of bytes in 'buf' 1277 * @param proc metadata callback to call with meta data found upon opening 1278 * @param proc_cls callback cls 1279 * @return handle to the datasource 1280 */ 1281 struct EXTRACTOR_Datasource * 1282 EXTRACTOR_datasource_create_from_buffer_ (const char *buf, 1283 size_t size, 1284 EXTRACTOR_MetaDataProcessor proc, 1285 void *proc_cls) 1286 { 1287 struct BufferedFileDataSource *bfds; 1288 struct EXTRACTOR_Datasource *ds; 1289 enum ExtractorCompressionType ct; 1290 1291 if (0 == size) 1292 return NULL; 1293 if (NULL == (bfds = bfds_new (buf, -1, size))) 1294 { 1295 LOG ("Failed to initialize buffer data source\n"); 1296 return NULL; 1297 } 1298 if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource)))) 1299 { 1300 LOG_STRERROR ("malloc"); 1301 bfds_delete (bfds); 1302 return NULL; 1303 } 1304 ds->bfds = bfds; 1305 ds->fd = -1; 1306 ds->cfs = NULL; 1307 ct = get_compression_type (bfds); 1308 if ( (COMP_TYPE_ZLIB == ct) || 1309 (COMP_TYPE_BZ2 == ct) ) 1310 { 1311 ds->cfs = cfs_new (bfds, size, ct, proc, proc_cls); 1312 if (NULL == ds->cfs) 1313 { 1314 LOG ("Failed to initialize decompressor\n"); 1315 bfds_delete (bfds); 1316 free (ds); 1317 return NULL; 1318 } 1319 } 1320 return ds; 1321 } 1322 1323 1324 /** 1325 * Destroy a data source. 1326 * 1327 * @param ds source to destroy 1328 */ 1329 void 1330 EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds) 1331 { 1332 if (NULL != ds->cfs) 1333 cfs_destroy (ds->cfs); 1334 bfds_delete (ds->bfds); 1335 if (-1 != ds->fd) 1336 (void) close (ds->fd); 1337 free (ds); 1338 } 1339 1340 1341 /** 1342 * Make 'size' bytes of data from the data source available at 'data'. 1343 * 1344 * @param cls must be a 'struct EXTRACTOR_Datasource' 1345 * @param data where the data should be copied to 1346 * @param size maximum number of bytes requested 1347 * @return number of bytes now available in data (can be smaller than 'size'), 1348 * -1 on error 1349 */ 1350 ssize_t 1351 EXTRACTOR_datasource_read_ (void *cls, 1352 void *data, 1353 size_t size) 1354 { 1355 struct EXTRACTOR_Datasource *ds = cls; 1356 1357 if (NULL != ds->cfs) 1358 return cfs_read (ds->cfs, data, size); 1359 return bfds_read (ds->bfds, data, size); 1360 } 1361 1362 1363 /** 1364 * Seek in the datasource. Use 'SEEK_CUR' for whence and 'pos' of 0 to 1365 * obtain the current position in the file. 1366 * 1367 * @param cls must be a 'struct EXTRACTOR_Datasource' 1368 * @param pos position to seek (see 'man lseek') 1369 * @param whence how to see (absolute to start, relative, absolute to end) 1370 * @return new absolute position, UINT64_MAX on error (i.e. desired position 1371 * does not exist) 1372 */ 1373 int64_t 1374 EXTRACTOR_datasource_seek_ (void *cls, 1375 int64_t pos, 1376 int whence) 1377 { 1378 struct EXTRACTOR_Datasource *ds = cls; 1379 1380 if (NULL != ds->cfs) 1381 { 1382 if ( (SEEK_END == whence) && 1383 (-1 == ds->cfs->uncompressed_size) ) 1384 { 1385 /* need to obtain uncompressed size */ 1386 (void) EXTRACTOR_datasource_get_size_ (ds, 1); 1387 if (-1 == ds->cfs->uncompressed_size) 1388 return -1; 1389 } 1390 return cfs_seek (ds->cfs, pos, whence); 1391 } 1392 return bfds_seek (ds->bfds, pos, whence); 1393 } 1394 1395 1396 /** 1397 * Determine the overall size of the data source (after compression). 1398 * 1399 * @param cls must be a 'struct EXTRACTOR_Datasource' 1400 * @param force force computing the size if it is unavailable 1401 * @return overall file size, UINT64_MAX on error or unknown 1402 */ 1403 int64_t 1404 EXTRACTOR_datasource_get_size_ (void *cls, 1405 int force) 1406 { 1407 struct EXTRACTOR_Datasource *ds = cls; 1408 char buf[32 * 1024]; 1409 uint64_t pos; 1410 1411 if (NULL != ds->cfs) 1412 { 1413 if ( (force) && 1414 (-1 == ds->cfs->uncompressed_size) ) 1415 { 1416 pos = ds->cfs->fpos; 1417 while ( (-1 == ds->cfs->uncompressed_size) && 1418 (-1 != cfs_read (ds->cfs, buf, sizeof (buf))) ) 1419 ; 1420 if (-1 == cfs_seek (ds->cfs, pos, SEEK_SET)) 1421 { 1422 LOG ( 1423 "Serious problem, I moved the buffer to determine the file size but could not restore it...\n"); 1424 return -1; 1425 } 1426 if (-1 == ds->cfs->uncompressed_size) 1427 return -1; 1428 } 1429 return ds->cfs->uncompressed_size; 1430 } 1431 return ds->bfds->fsize; 1432 } 1433 1434 1435 /* end of extractor_datasource.c */