libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 38b0cdd4c4d94c644eca757f5736ee8f3f03cc84
parent 479d302fc73af96bda241e6a64eba48cc18ab65e
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 22 Jul 2012 09:38:54 +0000

datasource hacking


Diffstat:
M src/main/extractor_datasource.c | 912 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/main/extractor_datasource.h | 30 ++++++++++++++++++------------
2 files changed, 515 insertions(+), 427 deletions(-)

diff --git a/src/main/extractor_datasource.c b/src/main/extractor_datasource.c @@ -22,10 +22,22 @@ #if HAVE_LIBBZ2 #include <bzlib.h> +#define MIN_BZ2_HEADER 4 +#ifndef MIN_COMPRESSED_HEADER +#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER +#endif #endif #if HAVE_ZLIB #include <zlib.h> +#define MIN_ZLIB_HEADER 12 +#ifndef MIN_COMPRESSED_HEADER +#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER +#endif +#endif + +#ifndef MIN_COMPRESSED_HEADER +#define MIN_COMPRESSED_HEADER -1 #endif #ifndef O_LARGEFILE @@ -37,24 +49,12 @@ */ #define MAX_READ (4 * 1024 * 1024) +/** + * Data is read from the source and shoved into decompressor + * in chunks this big. + */ +#define COM_CHUNK_SIZE (10 * 1024) -#if HAVE_ZLIB -#define MIN_ZLIB_HEADER 12 -#endif -#if HAVE_LIBBZ2 -#define MIN_BZ2_HEADER 4 -#endif -#if !defined (MIN_COMPRESSED_HEADER) && HAVE_ZLIB -#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER -#endif -#if !defined (MIN_COMPRESSED_HEADER) && HAVE_LIBBZ2 -#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER -#endif -#if !defined (MIN_COMPRESSED_HEADER) -#define MIN_COMPRESSED_HEADER -1 -#endif - -#define COMPRESSED_DATA_PROBE_SIZE 3 /** * Enum with the various possible types of compression supported. @@ -106,17 +106,17 @@ struct BufferedFileDataSource uint64_t fsize; /** - * Position within the file or the data buffer + * Position of the buffer in the file. */ uint64_t fpos; /** - * Position within the buffer. + * Position within the buffer. */ uint64_t buffer_pos; /** - * Number of bytes in the buffer (<= buffer_size) + * Number of valid bytes in the buffer (<= buffer_size) */ uint64_t buffer_bytes; @@ -144,12 +144,22 @@ struct CompressedFileSource struct BufferedFileDataSource *bfds; /** + * Decompression target buffer. + */ + char result[COM_CHUNK_SIZE]; + + /** + * At which offset in 'result' is 'fpos'? 
+ */ + size_t result_pos; + + /** * Size of the source (same as bfds->fsize) */ int64_t fsize; /** - * Position within the source + * Position within the (decompressed) source */ int64_t fpos; @@ -206,7 +216,7 @@ bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds, return -1; /* invalid */ if (NULL == bfds->buffer) { - bfds->buffer_bytes = bfds->fsize; + bfds->buffer_pos = pos; return 0; } #if WINDOWS @@ -219,6 +229,7 @@ bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds, if (position < 0) return -1; bfds->fpos = position; + bfds->buffer_pos = 0; rd = read (bfds->fd, bfds->buffer, bfds->buffer_size); if (rd < 0) return -1; @@ -258,7 +269,7 @@ bfds_new (const void *data, memset (result, 0, sizeof (struct BufferedFileDataSource)); result->data = (NULL != data) ? data : &result[1]; result->buffer = (NULL != data) ? NULL : &result[1]; - result->buffer_size = (NULL != data) ? fsize : xtra; + result->buffer_size = (NULL != data) ? fsize : xtra; result->fsize = fsize; result->fd = fd; bfds_pick_next_buffer_at (result, 0); @@ -297,86 +308,95 @@ bfds_seek (struct BufferedFileDataSource *bfds, switch (whence) { case SEEK_CUR: - if (NULL != bfds->buffer) + if (bfds->fpos + bfds->buffer_pos + pos < 0) + return -1; + if (bfds->fpos + bfds->buffer_pos + pos > bfds->fsize) + return -1; + if ( (NULL == bfds->buffer) || + ( (bfds->buffer_pos + pos < pos->buffer_bytes) && + (bfds->buffer_pos + pos >= 0) ) ) { - if (0 != bfds_pick_next_buffer_at (bfds, - bfds->fpos + bfds->buffer_pos + pos)) - return -1; - bfds->buffer_pos = 0; - return bfds->fpos; + bfds->buffer_pos += pos; + return bfds->buffer_pos; } - bfds->buffer_pos += pos; - return bfds->buffer_pos; + if (0 != bfds_pick_next_buffer_at (bfds, + bfds->fpos + bfds->buffer_pos + pos)) + return -1; + return bfds->fpos; + case SEEK_END: + if (pos > 0) + return -1; + if (bfds->fsize < - pos) + return -1; + pos = bfds->fsize + pos; + /* fall-through! 
*/ case SEEK_SET: if (pos < 0) return -1; - if (NULL != bfds->buffer) - { - if (0 != bfds_pick_next_buffer_at (bfds, pos)) - return -1; - bfds->buffer_pos = 0; - return bfds->fpos; - } - bfds->buffer_pos = pos; - return bfds->buffer_pos; - case SEEK_END: - if (NULL != bfds->buffer) + if (pos > bfds->fsize) + return -1; + if ( (NULL == bfds->buffer) || + ( (bfds->buffer_pos <= pos) && + (bfds->buffer_pos + pos->buffer_bytes > pos) ) ) { - if (0 != bfds_pick_next_buffer_at (bfds, bfds->fsize + pos)) - return -1; - bfds->buffer_pos = 0; - return bfds->fpos; + bfds->buffer_pos = pos; + return bfds->buffer_pos; } - bfds->buffer_pos = bfds->fsize + pos; - return bfds->buffer_pos; + if (0 != bfds_pick_next_buffer_at (bfds, pos)) + return -1; + return bfds->fpos; } return -1; } /** - * Fills 'buf_ptr' with a chunk of data. - * Will seek if necessary. Will fail if 'count' exceeds buffer size. + * Fills 'buf_ptr' with a chunk of data. Will + * fail if 'count' exceeds buffer size. * * @param bfds bfds * @param buf_ptr location to store data * @param count number of bytes to read - * @return number of bytes (<= count) available at location pointed by buf_ptr + * @return number of bytes (<= count) available at location pointed by buf_ptr, + * 0 for end of stream, -1 on error */ static ssize_t bfds_read (struct BufferedFileDataSource *bfds, void *buf_ptr, size_t count) { - if (count > MAX_READ) - return -1; - if (count > bfds->buffer_bytes - bfds->buffer_pos) + char *cbuf = buf_ptr; + uint64_t old_off; + size_t avail; + size_t ret; + + old_off = bfds->fpos + bfds->buffer_pos + bfds->buffer_bytes; + if (old_off == bfds->fsize) + return 0; /* end of stream */ + ret = 0; + while (count > 0) { - if (bfds->fpos + bfds->buffer_pos != bfds_seek (bfds, bfds->fpos + bfds->buffer_pos, SEEK_SET)) - return -1; - if (NULL != bfds->buffer) + if ( (bfds->buffer_bytes == bfds->buffer_pos) && + (0 != bfds_pick_next_buffer_at (bfds, + bfds->fpos + bfds->buffer_pos + bfds->buffer_bytes)) ) { - 
*buf_ptr = &bfds->buffer[bfds->buffer_pos]; - bfds->buffer_pos += count < bfds->buffer_bytes ? count : bfds->buffer_bytes; - return (count < bfds->buffer_bytes ? count : bfds->buffer_bytes); - } - else - { - int64_t ret = count < (bfds->buffer_bytes - bfds->buffer_pos) ? count : (bfds->buffer_bytes - bfds->buffer_pos); - *buf_ptr = (unsigned char*) &bfds->data[bfds->buffer_pos]; - bfds->buffer_pos += ret; - return ret; + /* revert to original position, invalidate buffer */ + bfds->fpos = old_off; + bfds->buffer_bytes = 0; + bfds->buffer_pos = 0; + return -1; /* getting more failed */ } + avail = bfds->buffer_bytes - bfds->buffer_pos; + if (avail > count) + avail = count; + if (0 == avail) + abort (); /* must not happen */ + memcpy (&cbuf[ret], &bfds->data[bfds->buffer_pos], avail); + bfds->buffer_pos += avail; + count -= avail; + ret += avail; } - else - { - if (NULL != bfds->buffer) - *buf_ptr = &bfds->buffer[bfds->buffer_pos]; - else - *buf_ptr = (unsigned char*) &bfds->data[bfds->buffer_pos]; - bfds->buffer_pos += count; - return count; - } + return ret; } @@ -395,7 +415,7 @@ cfs_delete (struct CompressedFileSource *cfs) /** * Reset gz-compressed data stream to the beginning. * - * @return 1 on success, 0 if we failed to seek, + * @return 1 on success, 0 to terminate extraction, * -1 on decompressor initialization failure */ static int @@ -422,13 +442,13 @@ cfs_reset_stream_zlib (struct CompressedFileSource *cfs) #ifdef ZLIB_VERNUM 15 + 32 #else - -MAX_WBITS + - MAX_WBITS #endif )) { return -1; } - cfs->fpos = cfs->gzip_header_length; + cfs->fpos = 0; cfs->shm_pos = 0; cfs->shm_buf_size = 0; return 1; @@ -438,7 +458,7 @@ cfs_reset_stream_zlib (struct CompressedFileSource *cfs) /** * Reset bz2-compressed data stream to the beginning. 
* - * @return 1 on success, 0 if we failed to seek, + * @return 1 on success, 0 to terminate extraction, * -1 on decompressor initialization failure */ static int @@ -455,21 +475,21 @@ cfs_reset_stream_bz2 (struct CompressedFileSource *cfs) * seeking backward. * * @param cfs cfs to reset - * @return 1 on success, , 0 if we failed to seek, + * @return 1 on success, 0 to terminate extraction, * -1 on error */ static int cfs_reset_stream (struct CompressedFileSource *cfs) { switch (cfs->compression_type) - { - case COMP_TYPE_ZLIB: - return cfs_reset_stream_zlib (cfs); - case COMP_TYPE_BZ2: - return cfs_reset_stream_bz2 (cfs); - default: - return -1; - } + { + case COMP_TYPE_ZLIB: + return cfs_reset_stream_zlib (cfs); + case COMP_TYPE_BZ2: + return cfs_reset_stream_bz2 (cfs); + default: + return -1; + } } @@ -480,98 +500,75 @@ cfs_reset_stream (struct CompressedFileSource *cfs) * @param cfs cfs to initialize * @param proc callback for metadata * @param proc_cls callback cls - * @return 1 on success, -1 on error + * @return 1 on success, 0 to terminate extraction, -1 on error */ static int cfs_init_decompressor_zlib (struct CompressedFileSource *cfs, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { - /* Process gzip header */ unsigned int gzip_header_length = 10; - unsigned char data[12]; - int64_t buf_bytes; - int len; - unsigned char *buf; - unsigned char *cptr; - - if (sizeof (data) > bfds_read (cfs->bfds, data, sizeof (data))) - return -1; - - if (0 != (data[3] & 0x4)) /* FEXTRA set */ - gzip_header_length += 2 + (unsigned) (data[10] & 0xff) + - (((unsigned) (data[11] & 0xff)) * 256); + unsigned char hdata[12]; - if (0 != (data[3] & 0x8)) /* FNAME set */ - { - if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) - return -1; - buf_bytes = bfds_read (cfs->bfds, &buf, 1024); - if (buf_bytes <= 0) - return -1; - cptr = buf; + /* Process gzip header */ + if (sizeof (hdata) > bfds_read (cfs->bfds, hdata, sizeof (hdata))) + return -1; + if (0 
!= (hdata[3] & 0x4)) /* FEXTRA set */ + gzip_header_length += 2 + (unsigned) (hdata[10] & 0xff) + + (((unsigned) (hdata[11] & 0xff)) * 256); - len = 0; - /* stored file name is here */ - while (len < buf_bytes) + if (0 != (hdata[3] & 0x8)) { - if ('\0' == *cptr) - break; - cptr++; - len++; + /* FNAME set */ + char fname[1024]; + char *cptr; + size_t len; + ssize_t buf_bytes; + + if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) + return -1; + buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname)); + if (buf_bytes <= 0) + return -1; + if (NULL == (cptr = memchr (fname, 0, buf_bytes))) + return -1; + len = cptr - fname; + if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, + EXTRACTOR_METAFORMAT_C_STRING, "text/plain", + fname, + len)) + return 0; /* done */ + gzip_header_length += len + 1; } - - if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, - EXTRACTOR_METAFORMAT_C_STRING, "text/plain", - (const char *) buf, - len)) - return 0; /* done */ - - /* FIXME: check for correctness */ - //gzip_header_length = (cptr - data) + 1; - gzip_header_length += len + 1; - } - - if (0 != (data[3] & 0x16)) /* FCOMMENT set */ - { - int64_t buf_bytes; - int len; - unsigned char *buf; - unsigned char *cptr; - - if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) - return -1; - buf_bytes = bfds_read (cfs->bfds, &buf, 1024); - if (buf_bytes <= 0) - return -1; - cptr = buf; - - len = 0; - /* stored file name is here */ - while (len < buf_bytes) - { - if ('\0' == *cptr) - break; - cptr++; - len++; - } - - if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT, - EXTRACTOR_METAFORMAT_C_STRING, "text/plain", - (const char *) buf, - len)) - return 0; /* done */ - - /* FIXME: check for correctness */ - //gzip_header_length = (cptr - data) + 1; - gzip_header_length += len + 1; - } - - if (data[3] & 0x2) /* FCHRC set */ + + if (0 != (hdata[3] & 0x16)) + { + /* FCOMMENT set */ + char fcomment[1024]; + char 
*cptr; + ssize_t buf_bytes; + size_t len; + + if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) + return -1; + buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment)); + if (buf_bytes <= 0) + return -1; + if (NULL == (cptr = memchr (fcomment, 0, buf_bytes))) + return -1; + len = cptr - fcomment; + if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT, + EXTRACTOR_METAFORMAT_C_STRING, "text/plain", + (const char *) fcomment, + len)) + return 0; /* done */ + gzip_header_length += len + 1; + } + if (0 != (hdata[3] & 0x2)) /* FCHRC set */ gzip_header_length += 2; - memset (&cfs->strm, 0, sizeof (z_stream)); - + #ifdef ZLIB_VERNUM + /* zlib will take care of its header */ gzip_header_length = 0; #endif @@ -604,7 +601,7 @@ cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs, * @param cfs cfs to initialize * @param proc callback for metadata * @param proc_cls callback cls - * @return 1 on success, -1 on error + * @return 1 on success, 0 to terminate extraction, -1 on error */ static int cfs_init_decompressor (struct CompressedFileSource *cfs, @@ -671,12 +668,25 @@ cfs_deinit_decompressor (struct CompressedFileSource *cfs) /** + * Destroy compressed file source. + * + * @param cfs source to destroy + */ +static void +cfs_destroy (struct CompressedFileSource *cfs) +{ + cfs_deinit_decompressor (cfs); + free (cfs); +} + + +/** * Allocates and initializes new cfs object. 
* * @param bfds data source to use * @param fsize size of the source * @param compression_type type of compression used - * @param proc metadata callback + * @param proc metadata callback to call with meta data found upon opening * @param proc_cls callback cls * @return newly allocated cfs on success, NULL on error */ @@ -686,7 +696,6 @@ cfs_new (struct BufferedFileDataSource *bfds, enum ExtractorCompressionType compression_type, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { - int shm_result; struct CompressedFileSource *cfs; if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource)))) @@ -696,93 +705,118 @@ cfs_new (struct BufferedFileDataSource *bfds, cfs->bfds = bfds; cfs->fsize = fsize; cfs->uncompressed_size = -1; + if (1 != cfs_init_decompressor (cfs, + proc, proc_cls)) + { + free (cfs); + return NULL; + } return cfs; } /** - * Data is read from the source and shoved into decompressor - * in chunks this big. - */ -#define COM_CHUNK_SIZE (10*1024) - - -/** - * Re-fills shm with new uncompressed data, preserving the last - * 'preserve' bytes of existing data as the first 'preserve' bytes - * of the new data. - * Does the actual decompression. Will set uncompressed_size on - * the end of compressed stream. + * Fills 'data' with new uncompressed data. Does the actual + * decompression. Will set uncompressed_size on the end of compressed + * stream. * * @param cfds cfs to read from - * @param preserve number of bytes to preserve (0 to discard all old data) - * @return number of bytes in shm. 0 if no more data can be uncompressed, -1 on error + * @param data where to copy the data + * @param size number of bytes available in data + * @return number of bytes in data. 
0 if no more data can be uncompressed, -1 on error */ -static int -cfs_read_zlib (struct CompressedFileSource *cfs, int64_t preserve) +static ssize_t +cfs_read_zlib (struct CompressedFileSource *cfs, + void *data, + size_t size) { + char *dst = data; int ret; - int64_t rc = preserve; - int64_t total = cfs->strm.total_out; + size_t rc; + ssize_t in; + char buf[COM_CHUNK_SIZE]; - if (preserve > 0) - memmove (cfs->shm_ptr, &((unsigned char *)cfs->shm_ptr)[0], preserve); - - while (rc < cfs->shm_size && ret != Z_STREAM_END) - { - if (cfs->strm.avail_in == 0) + if (cfs->fpos == cfs->uncompressed_size) + return 0; + rc = 0; + if (strm.avail_out > 0) + { + /* got left-over decompressed data from previous round! */ + in = strm.avail_out; + if (in > size) + in = size; + memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); + cfs->fpos += in; + cfs->result_pos += in; + rc += in; + } + ret = Z_OK; + while ( (rc < size) && (Z_STREAM_END != ret) ) { - int64_t count = bfds_read (cfs->bfds, &cfs->strm.next_in, COM_CHUNK_SIZE); - if (count <= 0) - return 0; + /* read block from original data source */ + in = bfds_read (cfs->bfds, + buf, sizeof (buf)); + if (in <= 0) + return -1; /* unexpected EOF */ + cfs->strm.next_in = buf; cfs->strm.avail_in = (uInt) count; + cfs->strm.next_out = cfs->result; + cfs->strm.avail_out = COM_CHUNK_SIZE; + cfs->result_pos = 0; + ret = inflate (&cfs->strm, Z_SYNC_FLUSH); + if ( (Z_OK != ret) && (Z_STREAM_END != ret) ) + return -1; /* unexpected error */ + /* go backwards by the number of bytes left in the buffer */ + if (-1 == bfds_seek (cfs->bfds, - cfs->strm.avail_in, SEEK_CUR)) + return -1; + /* copy decompressed bytes to target buffer */ + in = cfs->strm.total_out; + if (in > size - rc) + in = size - rc; + memcpy (&dst[rc], &cfs->result[cfs->result_pos], in); + cfs->fpos += in; + cfs->result_pos += in; + rc += in; } - cfs->strm.next_out = &((unsigned char *)cfs->shm_ptr)[rc]; - cfs->strm.avail_out = cfs->shm_size - rc; - ret = inflate (&cfs->strm, 
Z_SYNC_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - return 0; - rc = cfs->strm.total_out - total; - } - if (ret == Z_STREAM_END) - cfs->uncompressed_size = cfs->strm.total_out; - cfs->shm_pos = preserve; - cfs->shm_buf_size = rc + preserve; - return 1; + if (Z_STREAM_END == ret) + cfs->uncompressed_size = cfs->fpos; + return rc; } /** - * Re-fills shm with new uncompressed data, preserving the last - * 'preserve' bytes of existing data as the first 'preserve' bytes - * of the new data. - * Does the actual decompression. Will set uncompressed_size on - * the end of compressed stream. + * Fills 'data' with new uncompressed data. Does the actual + * decompression. Will set uncompressed_size on the end of compressed + * stream. * * @param cfds cfs to read from - * @param preserve number of bytes to preserve (0 to discard all old data) - * @return number of bytes in shm. 0 if no more data can be uncompressed, -1 on error + * @param data where to copy the data + * @param size number of bytes available in data + * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error */ -static int -cfs_read_bz2 (struct CompressedFileSource *cfs, int64_t preserve) +static ssize_t +cfs_read_bz2 (struct CompressedFileSource *cfs, + void *data, + size_t size) { return -1; } /** - * Re-fills shm with new uncompressed data, preserving the last - * 'preserve' bytes of existing data as the first 'preserve' bytes - * of the new data. - * Does the actual decompression. Will set uncompressed_size on - * the end of compressed stream. + * Fills 'data' with new uncompressed data. Does the actual + * decompression. Will set uncompressed_size on the end of compressed + * stream. * * @param cfds cfs to read from - * @param preserve number of bytes to preserve (0 to discard all old data) - * @return number of bytes in shm. 
0 if no more data can be uncompressed, -1 on error + * @param data where to copy the data + * @param size number of bytes available in data + * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error */ -static int64_t -cfs_read (struct CompressedFileSource *cfs, int64_t preserve) +static ssize_t +cfs_read (struct CompressedFileSource *cfs, + void *data, + size_t size) { switch (cfs->compression_type) { @@ -801,72 +835,44 @@ cfs_read (struct CompressedFileSource *cfs, int64_t preserve) * requires seeking backwards beyond the boundaries of the buffer, resets the * stream and repeats decompression from the beginning to 'position'. * - * @param cfds cfs to seek on - * @param position new starting point for the buffer - * @return new absolute buffer position, -1 on error or EOS - */ -static int64_t -cfs_seek_zlib (struct CompressedFileSource *cfs, int64_t position) -{ - int64_t ret; - - if (position > cfs->strm.total_out - cfs->shm_buf_size && position < cfs->strm.total_out) - { - ret = cfs_read (cfs, cfs->strm.total_out - position); - if (ret < 0) - return ret; - return position; - } - while (position >= cfs->strm.total_out) - { - if (0 > (ret = cfs_read (cfs, 0))) - return ret; - if (ret == 0) - return position; - } - if (position < cfs->strm.total_out && position > cfs->strm.total_out - cfs->shm_buf_size) - return cfs->strm.total_out - cfs->shm_buf_size; - return -1; -} - - -/** - * Moves the buffer to 'position' in uncompressed steam. If position - * requires seeking backwards beyond the boundaries of the buffer, resets the - * stream and repeats decompression from the beginning to 'position'. 
- * - * @param cfds cfs to seek on + * @param cfs cfs to seek on * @param position new starting point for the buffer * @return new absolute buffer position, -1 on error or EOS */ static int64_t -cfs_seek_bz2 (struct CompressedFileSource *cfs, int64_t position) +cfs_seek (struct CompressedFileSource *cfs, + uint64_t position) { - return -1; -} - - -/** - * Moves the buffer to 'position' in uncompressed steam. If position - * requires seeking backwards beyond the boundaries of the buffer, resets the - * stream and repeats decompression from the beginning to 'position'. - * - * @param cfds cfs to seek on - * @param position new starting point for the buffer - * @return new absolute buffer position, -1 on error or EOS - */ -static int64_t -cfs_seek (struct CompressedFileSource *cfs, int64_t position) -{ - switch (cfs->compression_type) + int64_t delta; + + delta = position - cfs->fpos; + if (delta < 0) { - case COMP_TYPE_ZLIB: - return cfs_seek_zlib (cfs, position); - case COMP_TYPE_BZ2: - return cfs_seek_bz2 (cfs, position); - default: - return -1; + if (result_pos >= - delta) + { + result_pos += delta; + delta = 0; + } + else + { + if (-1 == cfs_reset_stream (cfs)) + return -1; + delta = position; + } + } + while (delta > 0) + { + char buf[COM_CHUNK_SIZE]; + size_t max; + int64_t ret; + + max = (sizeof (buf) > delta) ? 
delta : sizeof (buf); + ret = cfs_read (cfs, buf, max); + if (-1 == ret) + return -1; + delta -= ret; } + return cfs->fpos; } @@ -879,163 +885,239 @@ cfs_seek (struct CompressedFileSource *cfs, int64_t position) * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression */ static enum ExtractorCompressionType -get_compression_type (const unsigned char *data, - int fd, - int64_t fsize) +get_compression_type (struct BufferedFileDataSource *bfds) { - void *read_data = NULL; - size_t read_data_size = 0; - ssize_t read_result; - enum ExtractorCompressionType result = COMP_TYPE_INVALID; + unsigned char read_data[3]; - if ((MIN_COMPRESSED_HEADER < 0) || (fsize < MIN_COMPRESSED_HEADER)) - { + if (0 != bfds_seek (bfds, 0, SEEK_SET)) return COMP_TYPE_INVALID; - } - if (data == NULL) - { - int64_t position; - read_data_size = COMPRESSED_DATA_PROBE_SIZE; - read_data = malloc (read_data_size); - if (read_data == NULL) - return -1; -#if WINDOWS - position = _lseeki64 (fd, 0, SEEK_CUR); -#elif HAVE_LSEEK64 - position = lseek64 (fd, 0, SEEK_CUR); -#else - position = (int64_t) lseek (fd, 0, SEEK_CUR); -#endif - read_result = READ (fd, read_data, read_data_size); -#if WINDOWS - position = _lseeki64 (fd, position, SEEK_SET); -#elif HAVE_LSEEK64 - position = lseek64 (fd, position, SEEK_SET); -#else - position = lseek (fd, (off_t) position, SEEK_SET); -#endif - if (read_result != read_data_size) - { - free (read_data); - return COMP_TYPE_UNDEFINED; - } - data = (const void *) read_data; - } + if (sizeof (read_data) != + bfds_read (bfds, read_data, sizeof (read_data))) + return COMP_TYPE_UNDEFINED; + #if HAVE_ZLIB - if ((fsize >= MIN_ZLIB_HEADER) && (data[0] == 0x1f) && (data[1] == 0x8b) && (data[2] == 0x08)) - result = COMP_TYPE_ZLIB; + if ( (bdfs->fsize >= MIN_ZLIB_HEADER) && + (data[0] == 0x1f) && + (data[1] == 0x8b) && + (data[2] == 0x08) ) + return COMP_TYPE_ZLIB; #endif #if HAVE_LIBBZ2 - if ((fsize >= MIN_BZ2_HEADER) && (data[0] == 'B') && 
(data[1] == 'Z') && (data[2] == 'h')) - result = COMP_TYPE_BZ2; + if ( (bdfs->fsize >= MIN_BZ2_HEADER) && + (data[0] == 'B') && + (data[1] == 'Z') && + (data[2] == 'h')) + return COMP_TYPE_BZ2; #endif - if (read_data != NULL) - free (read_data); - return result; + return COMP_TYPE_INVALID; } -#if 0 - - enum ExtractorCompressionType compression_type = -1; - struct CompressedFileSource *cfs = NULL; - int fd = -1; - struct stat64 fstatbuf; - int64_t fsize = 0; - - /* If data is not given, then we need to read it from the file. Try opening it */ - if ((data == NULL) && - (filename != NULL) && - (0 == STAT64(filename, &fstatbuf)) && - (!S_ISDIR(fstatbuf.st_mode)) && - (-1 != (fd = file_open (filename, - O_RDONLY | O_LARGEFILE)))) - { - /* Empty files are of no interest */ - fsize = fstatbuf.st_size; - if (fsize == 0) - { - close(fd); - return; - } - } - - /* Data is not given, and we've failed to open the file with data -> exit */ - if ((fsize == 0) && (data == NULL)) - return; - /* fsize is now size of the data OR size of the file */ - if (data != NULL) - fsize = size; - - errno = 0; - - /* Peek at first few bytes of the file (or of the data), and see if it's compressed. */ - compression_type = get_compression_type (data, fd, fsize); - if (compression_type < 0) - { - /* errno is set by get_compression_type () */ - if (fd != -1) - close (fd); - return; - } +/** + * Handle to a datasource we can use for the plugins. + */ +struct EXTRACTOR_Datasource +{ + + /** + * Underlying buffered data source. + */ + struct BufferedFileDataSource *bfds; + /** + * Compressed file source (NULL if not applicable). + */ + struct CompressedFileSource *cfs; + + /** + * Underlying file descriptor, -1 for none. + */ + int fd; +}; + + +/** + * Create a datasource from a file on disk. 
+ * + * @param filename name of the file on disk + * @param proc metadata callback to call with meta data found upon opening + * @param proc_cls callback cls + * @return handle to the datasource, NULL on error + */ +struct EXTRACTOR_Datasource * +EXTRACTOR_datasource_create_from_file_ (const char *filename, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ struct BufferedFileDataSource *bfds; - bfds = bfds_new (data, fd, fsize); - if (bfds == NULL) - return; - - if (compression_type > 0) - { - int icr = 0; - /* Set up a decompressor. - * Will also report compression-related metadata to the caller. - */ - cfs = cfs_new (bfds, fsize, compression_type, proc, proc_cls); - if (cfs == NULL) + struct EXTRACTOR_Datasource *ds; + enum ExtractorCompressionType ct; + int fd; + struct stat sb; + int64_t fsize; + + if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE))) + return NULL; + if ( (0 != fstat (fd, &sb)) || + (S_ISDIR (fstatbuf.st_mode)) ) + { + (void) close (fd); + return NULL; + } + fsize = (int64_t) sb.st_size; + if (0 == fsize) + { + (void) close (fd); + return NULL; + } + bfds = bfds_new (NULL, fd, fsize); + if (NULL == bfds) { - if (fd != -1) - close (fd); - errno = EILSEQ; - return; + (void) close (fd); + return NULL; } - icr = cfs_init_decompressor (cfs, proc, proc_cls); - if (icr < 0) + if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource)))) { - if (fd != -1) - close (fd); - errno = EILSEQ; - return; + bfds_delete (bfds); + return NULL; } - else if (icr == 0) + ds->bfds = bfds; + ds->fd; + ct = get_compression_type (bfds); + if ( (COMP_TYPE_ZLIB == ct) || + (COMP_TYPE_BZ2 == ct) ) + ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls); + if (NULL == ds->cfs) { - if (fd != -1) - close (fd); - errno = 0; - return; + bfds_delete (bfds); + free (ds); + (void) close (fd); + return NULL; } - } + return ds; +} -#endif +/** + * Create a datasource from a buffer in memory. 
+ * + * @param buf data in memory + * @param size number of bytes in 'buf' + * @param proc metadata callback to call with meta data found upon opening + * @param proc_cls callback cls + * @return handle to the datasource + */ +struct EXTRACTOR_Datasource * +EXTRACTOR_datasource_create_from_buffer_ (const char *buf, + size_t size, + EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +{ + struct BufferedFileDataSource *bfds; + struct EXTRACTOR_Datasource *ds; + enum ExtractorCompressionType ct; + if (0 == size) + return NULL; + if (NULL == (bfds = bfds_new (buf, -1, size))) + return NULL; + if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource)))) + { + bfds_delete (bfds); + return NULL; + } + ds->bfds = bfds; + ds->fd; + ct = get_compression_type (bfds); + if ( (COMP_TYPE_ZLIB == ct) || + (COMP_TYPE_BZ2 == ct) ) + ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls); + if (NULL == ds->cfs) + { + bfds_delete (bfds); + free (ds); + return NULL; + } + return ds; +} /** * Destroy a data source. * - * @param datasource source to destroy + * @param ds source to destroy */ void -EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *datasource) +EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds) +{ + if (NULL != ds->cfs) + cfs_destroy (ds->cfs); + bfds_delete (ds->bfds); + if (-1 != ds->fd) + (void) close (ds->fd); + free (ds); +} + + +/** + * Make 'size' bytes of data from the data source available at 'data'. + * + * @param cls must be a 'struct EXTRACTOR_Datasource' + * @param data where the data should be copied to + * @param size maximum number of bytes requested + * @return number of bytes now available in data (can be smaller than 'size'), + * -1 on error + */ +ssize_t +EXTRACTOR_datasource_read_ (void *cls, + void *data, + size_t size) +{ + struct EXTRACTOR_Datasource *ds = cls; + + if (NULL != ds->cfs) + return cfs_read (ds->cfs, data, size); + return bdfs_read (ds->bdfs, data, size); +} + + +/** + * Seek in the datasource. 
Use 'SEEK_CUR' for whence and 'pos' of 0 to + * obtain the current position in the file. + * + * @param cls must be a 'struct EXTRACTOR_Datasource' + * @param pos position to seek (see 'man lseek') + * @param whence how to see (absolute to start, relative, absolute to end) + * @return new absolute position, UINT64_MAX on error (i.e. desired position + * does not exist) + */ +int64_t +EXTRACTOR_datasource_seek_ (void *cls, + uint64_t pos, + int whence) { - if (cfs != NULL) - { - cfs_deinit_decompressor (cfs); - cfs_delete (cfs); - } - bfds_delete (bfds); - if (-1 != fd) - close(fd); + struct EXTRACTOR_Datasource *ds = cls; + + if (NULL != ds->cfs) + return cfs_seek (ds->cfs, pos, whence); + return bdfs_seek (ds->bdfs, pos, whence); } + +/** + * Determine the overall size of the data source (after compression). + * + * @param cls must be a 'struct EXTRACTOR_Datasource' + * @return overall file size, UINT64_MAX on error or unknown + */ +int64_t +EXTRACTOR_datasource_get_size_ (void *cls) +{ + struct EXTRACTOR_Datasource *ds = cls; + + if (NULL != ds->cfs) + return cfs_seek (ds->cfs, pos, whence); + return bdfs_seek (ds->bdfs, pos, whence); +} + + /* end of extractor_datasource.c */ diff --git a/src/main/extractor_datasource.h b/src/main/extractor_datasource.h @@ -30,10 +30,13 @@ struct EXTRACTOR_Datasource; * Create a datasource from a file on disk. 
* * @param filename name of the file on disk - * @return handle to the datasource + * @param proc metadata callback to call with meta data found upon opening + * @param proc_cls callback cls + * @return handle to the datasource, NULL on error */ struct EXTRACTOR_Datasource * -EXTRACTOR_datasource_create_from_file_ (const char *filename); +EXTRACTOR_datasource_create_from_file_ (const char *filename, + EXTRACTOR_MetaDataProcessor proc, void *proc_cls); /** @@ -41,24 +44,27 @@ EXTRACTOR_datasource_create_from_file_ (const char *filename); * * @param buf data in memory * @param size number of bytes in 'buf' - * @return handle to the datasource + * @param proc metadata callback to call with meta data found upon opening + * @param proc_cls callback cls + * @return handle to the datasource, NULL on error */ struct EXTRACTOR_Datasource * EXTRACTOR_datasource_create_from_buffer_ (const char *buf, - size_t size); + size_t size, + EXTRACTOR_MetaDataProcessor proc, void *proc_cls); /** * Destroy a data source. * - * @param datasource source to destroy + * @param ds source to destroy */ void -EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *datasource); +EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds); /** - * Make 'size' bytes of data from the data source available at '*data'. + * Make 'size' bytes of data from the data source available at 'data'. * * @param cls must be a 'struct EXTRACTOR_Datasource' * @param data where the data should be copied to @@ -77,12 +83,12 @@ EXTRACTOR_datasource_read_ (void *cls, * obtain the current position in the file. * * @param cls must be a 'struct EXTRACTOR_Datasource' - * @param pos position to seek (see 'man lseek') + * @param pos position to seek (see 'man lseek')o * @param whence how to see (absolute to start, relative, absolute to end) - * @return new absolute position, UINT64_MAX on error (i.e. desired position + * @return new absolute position, -1 on error (i.e. 
desired position * does not exist) */ -uint64_t +int64_t EXTRACTOR_datasource_seek_ (void *cls, uint64_t pos, int whence); @@ -92,9 +98,9 @@ EXTRACTOR_datasource_seek_ (void *cls, * Determine the overall size of the data source (after compression). * * @param cls must be a 'struct EXTRACTOR_Datasource' - * @return overall file size, UINT64_MAX on error (i.e. IPC failure) + * @return overall file size, -1 on error or unknown */ -uint64_t +int64_t EXTRACTOR_datasource_get_size_ (void *cls);