libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit a4da15ff4a1303ea65ca00545e39a5fcfc02e84e
parent a9f53c38cd4bba696bdb51a4687d71a46ad885c0
Author: Christian Grothoff <christian@grothoff.org>
Date:   Thu, 12 Apr 2012 16:43:56 +0000

-LRN: improved LE API

Diffstat:
M src/include/extractor.h | 3 ---
M src/main/extract.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M src/main/extractor.c | 1957 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M src/main/extractor_plugins.c | 27 ++++++++++++++++-----------
M src/main/extractor_plugins.h | 30 ++++++++++++++++++++++++++----
M src/plugins/Makefile.am | 5 +++++
M src/plugins/id3_extractor.c | 126 ++++++++++++++++++-------------------------------------------------------------
M src/plugins/id3v2_extractor.c | 303 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M src/plugins/mp3_extractor.c | 89 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
9 files changed, 1651 insertions(+), 959 deletions(-)

diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -404,9 +404,6 @@ struct EXTRACTOR_PluginList; typedef int (*EXTRACTOR_extract_method) (struct EXTRACTOR_PluginList *plugin, EXTRACTOR_MetaDataProcessor proc, void *proc_cls); -typedef void (*EXTRACTOR_discard_state_method) (struct EXTRACTOR_PluginList *plugin); -typedef void (*EXTRACTOR_init_state_method) (struct EXTRACTOR_PluginList *plugin); - /** diff --git a/src/main/extract.c b/src/main/extract.c @@ -43,6 +43,11 @@ static int verbose; */ static int in_process; +/** + * Read file contents into memory, then feed them to extractor. + */ +static int from_memory; + static void catcher (int sig) @@ -175,6 +180,8 @@ printHelp () gettext_noop("print this help") }, { 'i', "in-process", NULL, gettext_noop("run plugins in-process (simplifies debugging)") }, + { 'm', "from-memory", NULL, + gettext_noop("read data from file into memory and extract from memory") }, { 'l', "library", "LIBRARY", gettext_noop("load an extractor plugin named LIBRARY") }, { 'L', "list", NULL, @@ -573,6 +580,7 @@ main (int argc, char *argv[]) {"grep-friendly", 0, 0, 'g'}, {"help", 0, 0, 'h'}, {"in-process", 0, 0, 'i'}, + {"from-memory", 0, 0, 'm'}, {"list", 0, 0, 'L'}, {"library", 1, 0, 'l'}, {"nodefault", 0, 0, 'n'}, @@ -585,7 +593,7 @@ main (int argc, char *argv[]) option_index = 0; c = getopt_long (argc, argv, - "abghil:Lnp:vVx:", + "abghiml:Lnp:vVx:", long_options, &option_index); @@ -619,6 +627,9 @@ main (int argc, char *argv[]) case 'i': in_process = 1; break; + case 'm': + from_memory = 1; + break; case 'l': libraries = optarg; break; @@ -749,11 +760,58 @@ main (int argc, char *argv[]) argv[i]); else start_bibtex (); - EXTRACTOR_extract (plugins, - argv[i], - NULL, 0, - processor, - NULL); + if (!from_memory) + EXTRACTOR_extract (plugins, + argv[i], + NULL, 0, + processor, + NULL); + else + { + int f = open (argv[i], _O_RDONLY | _O_BINARY); + if (f != -1) + { + int64_t k = 0; +#if WINDOWS + k = _lseeki64 (f, 0, SEEK_END); 
+#elif HAVE_LSEEK64 + k = lseek64 (f, 0, SEEK_END); +#else + k = (int64_t) lseek (f, 0, SEEK_END); +#endif + if (k > 0) + { + int64_t j; + int rd; + unsigned char *data = malloc (k); + close (f); + f = open (argv[i], _O_RDONLY | _O_BINARY); + for (j = 0; j < k; j += rd) + { + void *ptr = (void *) &data[j]; + int to_read = 64*1024; + if (to_read > k - j) + to_read = k - j; + rd = read (f, ptr, to_read); + if (rd < 0) + { + fprintf (stderr, "Failed to read file `%s': %d %s\n", argv[i], errno, strerror (errno)); + break; + } + if (rd == 0) + break; + } + if (j > 0) + EXTRACTOR_extract (plugins, + NULL, + data, j, + processor, + NULL); + free (data); + } + close (f); + } + } if (0 != errno) { if (verbose > 0) { fprintf(stderr, diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -74,6 +74,10 @@ #define MESSAGE_META 0x05 #define MESSAGE_DISCARD_STATE 0x06 +#define OPMODE_MEMORY 1 +#define OPMODE_DECOMPRESS 2 +#define OPMODE_FILE 3 + /** * Header used for our IPC replies. A header * with all fields being zero is used to indicate @@ -89,22 +93,35 @@ struct IpcHeader #if !WINDOWS int -plugin_open_shm (struct EXTRACTOR_PluginList *plugin, char *shm_name) +plugin_open_shm (struct EXTRACTOR_PluginList *plugin, const char *shm_name) { if (plugin->shm_id != -1) close (plugin->shm_id); plugin->shm_id = shm_open (shm_name, O_RDONLY, 0); return plugin->shm_id; } +int +plugin_open_file (struct EXTRACTOR_PluginList *plugin, const char *shm_name) +{ + if (plugin->shm_id != -1) + close (plugin->shm_id); + plugin->shm_id = open (shm_name, O_RDONLY, 0); + return plugin->shm_id; +} #else HANDLE -plugin_open_shm (struct EXTRACTOR_PluginList *plugin, char *shm_name) +plugin_open_shm (struct EXTRACTOR_PluginList *plugin, const char *shm_name) { if (plugin->map_handle != 0) CloseHandle (plugin->map_handle); plugin->map_handle = OpenFileMapping (FILE_MAP_READ, FALSE, shm_name); return plugin->map_handle; } +HANDLE +plugin_open_file (struct EXTRACTOR_PluginList *plugin, const char 
*shm_name) +{ + return plugin_open_shm (plugin, shm_name); +} #endif static int @@ -177,24 +194,62 @@ transmit_reply (void *cls, return 0; } -/** - * 'main' function of the child process. Reads shm-filenames from - * 'in' (line-by-line) and writes meta data blocks to 'out'. The meta - * data stream is terminated by an empty entry. - * - * @param plugin extractor plugin to use - * @param in stream to read from - * @param out stream to write to - */ +/* init the read/seek wrappers */ +static int +init_state_method (struct EXTRACTOR_PluginList *plugin, uint8_t operation_mode, int64_t fsize, const char *shm_name) +{ + plugin->seek_request = 0; +#if !WINDOWS + if (plugin->shm_ptr != NULL) + munmap (plugin->shm_ptr, plugin->map_size); + plugin->shm_ptr = NULL; + if (operation_mode == OPMODE_FILE) + { + if (-1 == plugin_open_file (plugin, shm_name)) + return 1; + } + else if (-1 == plugin_open_shm (plugin, shm_name)) + return 1; +#else + if (plugin->shm_ptr != NULL) + UnmapViewOfFile (plugin->shm_ptr); + plugin->shm_ptr = NULL; + if (INVALID_HANDLE_VALUE == plugin_open_shm (plugin, shm_name)) + return 1; +#endif + plugin->fsize = fsize; + plugin->shm_pos = 0; + plugin->fpos = 0; + return 0; +} + static void -process_requests (struct EXTRACTOR_PluginList *plugin, int in, int out) +discard_state_method (struct EXTRACTOR_PluginList *plugin) +{ +#if !WINDOWS + if (plugin->shm_ptr != NULL && plugin->map_size > 0) + munmap (plugin->shm_ptr, plugin->map_size); + if (plugin->shm_id != -1) + close (plugin->shm_id); + plugin->shm_id = -1; +#else + if (plugin->shm_ptr != NULL) + UnmapViewOfFile (plugin->shm_ptr); + if (plugin->map_handle != 0) + CloseHandle (plugin->map_handle); + plugin->map_handle = 0; +#endif + plugin->map_size = 0; + plugin->shm_ptr = NULL; +} + +static int +process_requests (struct EXTRACTOR_PluginList *plugin) { - int read_result1, read_result2, read_result3; + int in, out; + int read_result1, read_result2, read_result3, read_result4; unsigned char code; - 
int64_t fsize = -1; - int64_t position = 0; void *shm_ptr = NULL; - size_t shm_size = 0; char *shm_name = NULL; size_t shm_name_len; @@ -207,27 +262,17 @@ process_requests (struct EXTRACTOR_PluginList *plugin, int in, int out) MEMORY_BASIC_INFORMATION mi; #endif - if (plugin == NULL) + in = plugin->pipe_in; + out = plugin->cpipe_out; + + if (plugin->waiting_for_update == 1) { - close (in); - close (out); - return; + unsigned char seek_byte = MESSAGE_SEEK; + if (write (out, &seek_byte, 1) != 1) + return -1; + if (write (out, &plugin->seek_request, sizeof (int64_t)) != sizeof (int64_t)) + return -1; } - if (0 != plugin_load (plugin)) - { - close (in); - close (out); -#if DEBUG - fprintf (stderr, "Plugin `%s' failed to load!\n", plugin->short_libname); -#endif - return; - } - if ((plugin->specials != NULL) && - (NULL != strstr (plugin->specials, "close-stderr"))) - close (2); - if ((plugin->specials != NULL) && - (NULL != strstr (plugin->specials, "close-stdout"))) - close (1); memset (&hdr, 0, sizeof (hdr)); do_break = 0; @@ -239,114 +284,55 @@ process_requests (struct EXTRACTOR_PluginList *plugin, int in, int out) switch (code) { case MESSAGE_INIT_STATE: - read_result2 = read (in, &fsize, sizeof (int64_t)); - read_result3 = read (in, &shm_name_len, sizeof (size_t)); - if ((read_result2 < sizeof (int64_t)) || (read_result3 < sizeof (size_t)) || - shm_name_len > MAX_SHM_NAME || fsize <= 0) - { - do_break = 1; - break; - } - if (shm_name != NULL) - free (shm_name); - shm_name = malloc (shm_name_len); - if (shm_name == NULL) - { - do_break = 1; - break; - } - read_result2 = read (in, shm_name, shm_name_len); - if (read_result2 < shm_name_len) + read_result2 = read (in, &plugin->operation_mode, sizeof (uint8_t)); + read_result3 = read (in, &plugin->fsize, sizeof (int64_t)); + read_result4 = read (in, &shm_name_len, sizeof (size_t)); + if ((read_result2 < sizeof (uint8_t)) || + (read_result3 < sizeof (int64_t)) || + (read_result4 < sizeof (size_t))) { do_break = 1; break; 
} - shm_name[shm_name_len - 1] = '\0'; -#if !WINDOWS - if (shm_ptr != NULL) - munmap (shm_ptr, shm_size); - if (-1 == plugin_open_shm (plugin, shm_name)) + if (plugin->operation_mode != OPMODE_MEMORY && + plugin->operation_mode != OPMODE_DECOMPRESS && + plugin->operation_mode != OPMODE_FILE) { do_break = 1; break; } -#else - if (shm_ptr != NULL) - UnmapViewOfFile (shm_ptr); - if (INVALID_HANDLE_VALUE == plugin_open_shm (plugin, shm_name)) + if ((plugin->operation_mode == OPMODE_MEMORY || + plugin->operation_mode == OPMODE_DECOMPRESS) && + shm_name_len > MAX_SHM_NAME) { do_break = 1; break; } -#endif - plugin->fsize = fsize; - plugin->init_state_method (plugin); - break; - case MESSAGE_DISCARD_STATE: - plugin->discard_state_method (plugin); -#if !WINDOWS - if (shm_ptr != NULL && shm_size > 0) - munmap (shm_ptr, shm_size); - if (plugin->shm_id != -1) - close (plugin->shm_id); - plugin->shm_id = -1; - shm_size = 0; -#else - if (shm_ptr != NULL) - UnmapViewOfFile (shm_ptr); - if (plugin->map_handle != 0) - CloseHandle (plugin->map_handle); - plugin->map_handle = 0; -#endif - shm_ptr = NULL; - break; - case MESSAGE_UPDATED_SHM: - read_result2 = read (in, &position, sizeof (int64_t)); - read_result3 = read (in, &shm_size, sizeof (size_t)); - if ((read_result2 < sizeof (int64_t)) || (read_result3 < sizeof (size_t)) || - position < 0 || fsize <= 0 || position >= fsize) + if (plugin->operation_mode != OPMODE_DECOMPRESS && plugin->fsize <= 0) { do_break = 1; break; } - /* FIXME: also check mapped region size (lseek for *nix, VirtualQuery for W32) */ -#if !WINDOWS - if ((-1 == plugin->shm_id) || - (NULL == (shm_ptr = mmap (NULL, shm_size, PROT_READ, MAP_SHARED, plugin->shm_id, 0))) || - (shm_ptr == (void *) -1)) + if (shm_name != NULL) + free (shm_name); + shm_name = malloc (shm_name_len); + if (shm_name == NULL) { do_break = 1; break; } -#else - if ((plugin->map_handle == 0) || - (NULL == (shm_ptr = MapViewOfFile (plugin->map_handle, FILE_MAP_READ, 0, 0, 0)))) + read_result2 
= read (in, shm_name, shm_name_len); + if (read_result2 < shm_name_len) { do_break = 1; break; } -#endif - plugin->position = position; - plugin->shm_ptr = shm_ptr; - plugin->map_size = shm_size; - /* Now, ideally a plugin would do reads and seeks on a virtual "plugin" object - * completely transparently, and the underlying code would return bytes from - * the memory map, or would block and wait for a seek to happen. - * That, however, requires somewhat different architecture, and even more wrapping - * and hand-helding. It's easier to make plugins aware of the fact that they work - * with discrete in-memory buffers with expensive seeking, not continuous files. - */ - extract_reply = plugin->extract_method (plugin, transmit_reply, &out); -#if !WINDOWS - if ((shm_ptr != NULL) && - (shm_ptr != (void*) -1) ) - munmap (shm_ptr, shm_size); -#else - if (shm_ptr != NULL) - UnmapViewOfFile (shm_ptr); -#endif - if (extract_reply == 1) + shm_name[shm_name_len - 1] = '\0'; + do_break = init_state_method (plugin, plugin->operation_mode, plugin->fsize, shm_name); + if (!do_break && (plugin->operation_mode == OPMODE_MEMORY || + plugin->operation_mode == OPMODE_FILE)) { + extract_reply = plugin->extract_method (plugin, transmit_reply, &out); unsigned char done_byte = MESSAGE_DONE; if (write (out, &done_byte, 1) != 1) { @@ -366,23 +352,143 @@ process_requests (struct EXTRACTOR_PluginList *plugin, int in, int out) _exit (0); } } - else + break; + case MESSAGE_DISCARD_STATE: + discard_state_method (plugin); + break; + case MESSAGE_UPDATED_SHM: + if (plugin->operation_mode == OPMODE_DECOMPRESS) { - unsigned char seek_byte = MESSAGE_SEEK; - if (write (out, &seek_byte, 1) != 1) + read_result2 = read (in, &plugin->fpos, sizeof (int64_t)); + read_result3 = read (in, &plugin->map_size, sizeof (size_t)); + read_result4 = read (in, &plugin->fsize, sizeof (int64_t)); + if ((read_result2 < sizeof (int64_t)) || (read_result3 < sizeof (size_t)) || + plugin->fpos < 0 || (plugin->operation_mode 
!= OPMODE_DECOMPRESS && (plugin->fsize <= 0 || plugin->fpos >= plugin->fsize))) { do_break = 1; break; } - if (write (out, &plugin->seek_request, sizeof (int64_t)) != sizeof (int64_t)) + /* FIXME: also check mapped region size (lseek for *nix, VirtualQuery for W32) */ +#if !WINDOWS + if ((-1 == plugin->shm_id) || + (NULL == (plugin->shm_ptr = mmap (NULL, plugin->map_size, PROT_READ, MAP_SHARED, plugin->shm_id, 0))) || + (plugin->shm_ptr == (void *) -1)) + { + do_break = 1; + break; + } +#else + if ((plugin->map_handle == 0) || + (NULL == (plugin->shm_ptr = MapViewOfFile (plugin->map_handle, FILE_MAP_READ, 0, 0, 0)))) + { + do_break = 1; + break; + } +#endif + if (plugin->waiting_for_update == 1) { do_break = 1; + plugin->waiting_for_update = 2; break; } + extract_reply = plugin->extract_method (plugin, transmit_reply, &out); +#if !WINDOWS + if ((plugin->shm_ptr != NULL) && + (plugin->shm_ptr != (void*) -1) ) + munmap (plugin->shm_ptr, plugin->map_size); +#else + if (plugin->shm_ptr != NULL) + UnmapViewOfFile (plugin->shm_ptr); +#endif + plugin->shm_ptr = NULL; + if (extract_reply == 1) + { + unsigned char done_byte = MESSAGE_DONE; + if (write (out, &done_byte, 1) != 1) + { + do_break = 1; + break; + } + if ((plugin->specials != NULL) && + (NULL != strstr (plugin->specials, "force-kill"))) + { + /* we're required to die after each file since this + plugin only supports a single file at a time */ +#if !WINDOWS + fsync (out); +#else + _commit (out); +#endif + _exit (0); + } + } + else + { + unsigned char seek_byte = MESSAGE_SEEK; + if (write (out, &seek_byte, 1) != 1) + { + do_break = 1; + break; + } + if (write (out, &plugin->seek_request, sizeof (int64_t)) != sizeof (int64_t)) + { + do_break = 1; + break; + } + } + } + else + { + int64_t t; + size_t t2; + read_result2 = read (in, &t, sizeof (int64_t)); + read_result3 = read (in, &t2, sizeof (size_t)); + read_result4 = read (in, &t, sizeof (int64_t)); } break; } } + return 0; +} + +/** + * 'main' function of the 
child process. Reads shm-filenames from + * 'in' (line-by-line) and writes meta data blocks to 'out'. The meta + * data stream is terminated by an empty entry. + * + * @param plugin extractor plugin to use + * @param in stream to read from + * @param out stream to write to + */ +static void +plugin_main (struct EXTRACTOR_PluginList *plugin, int in, int out) +{ + if (plugin == NULL) + { + close (in); + close (out); + return; + } + if (0 != plugin_load (plugin)) + { + close (in); + close (out); +#if DEBUG + fprintf (stderr, "Plugin `%s' failed to load!\n", plugin->short_libname); +#endif + return; + } + if ((plugin->specials != NULL) && + (NULL != strstr (plugin->specials, "close-stderr"))) + close (2); + if ((plugin->specials != NULL) && + (NULL != strstr (plugin->specials, "close-stdout"))) + close (1); + + plugin->pipe_in = in; + plugin->cpipe_out = out; + process_requests (plugin); + close (in); close (out); } @@ -446,7 +552,7 @@ start_process (struct EXTRACTOR_PluginList *plugin) { close (p1[1]); close (p2[0]); - process_requests (plugin, p1[0], p2[1]); + plugin_main (plugin, p1[0], p2[1]); _exit (0); } close (p1[0]); @@ -806,6 +912,15 @@ read_plugin_data (int fd) read (fd, ret->plugin_options, i); ret->plugin_options[i - 1] = '\0'; } +#if WINDOWS + { + SYSTEM_INFO si; + GetSystemInfo (&si); + ret->allocation_granularity = si.dwAllocationGranularity; + } +#else + ret->allocation_granularity = sysconf (_SC_PAGE_SIZE); +#endif return ret; } @@ -1045,393 +1160,33 @@ static int file_open(const char *filename, int oflag, ...) 
return OPEN(fn, oflag, mode); } -#ifndef O_LARGEFILE -#define O_LARGEFILE 0 -#endif - -#if HAVE_ZLIB -#define MIN_ZLIB_HEADER 12 -#endif -#if HAVE_LIBBZ2 -#define MIN_BZ2_HEADER 4 -#endif -#if !defined (MIN_COMPRESSED_HEADER) && HAVE_ZLIB -#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER -#endif -#if !defined (MIN_COMPRESSED_HEADER) && HAVE_LIBBZ2 -#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER -#endif -#if !defined (MIN_COMPRESSED_HEADER) -#define MIN_COMPRESSED_HEADER -1 -#endif - -#define COMPRESSED_DATA_PROBE_SIZE 3 +#if WINDOWS /** - * Try to decompress compressed data + * Setup a shared memory segment. * - * @param data data to decompress, or NULL (if fd is not -1) - * @param fd file to read data from, or -1 (if data is not NULL) - * @param fsize size of data (if data is not NULL) or size of fd file (if fd is not -1) - * @param compression_type type of compression, as returned by get_compression_type () - * @param buffer a pointer to a buffer pointer, buffer pointer is NEVER a NULL and already has some data (usually - COMPRESSED_DATA_PROBE_SIZE bytes) in it. 
- * @param buffer_size a pointer to buffer size - * @param proc callback for metadata - * @param proc_cls cls for proc - * @return 0 on success, anything else on error + * @param ptr set to the location of the map segment + * @param map where to store the map handle + * @param fn name of the mapping + * @param fn_size size available in fn + * @param size number of bytes to allocated for the mapping + * @return 0 on success */ static int -try_to_decompress (const unsigned char *data, int fd, int64_t fsize, int compression_type, void **buffer, size_t *buffer_size, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +make_shm_w32 (void **ptr, HANDLE *map, char *fn, size_t fn_size, size_t size) { - unsigned char *new_buffer; - ssize_t read_result; - - unsigned char *buf; - unsigned char *rbuf; - size_t dsize; -#if HAVE_ZLIB - z_stream strm; - int ret; - size_t pos; -#endif -#if HAVE_LIBBZ2 - bz_stream bstrm; - int bret; - size_t bpos; -#endif - - if (fd != -1) + const char *tpath = "Local\\"; + snprintf (fn, fn_size, "%slibextractor-shm-%u-%u", tpath, getpid(), + (unsigned int) RANDOM()); + *map = CreateFileMapping (INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, size, fn); + *ptr = MapViewOfFile (*map, FILE_MAP_WRITE, 0, 0, size); + if (*ptr == NULL) { - if (fsize > *buffer_size) - { - /* Read the rest of the file. Can't de-compress it partially anyway */ - /* Memory mapping is not useful here, because memory mapping ALSO takes up - * memory (even more than a buffer, since it might be aligned), and - * because we need to read every byte anyway (lazy on-demand reads into - * memory provided by memory mapping won't help). 
- */ - new_buffer = realloc (*buffer, fsize); - if (new_buffer == NULL) - { - free (*buffer); - return -1; - } - read_result = READ (fd, &new_buffer[*buffer_size], fsize - *buffer_size); - if (read_result != fsize - *buffer_size) - { - free (*buffer); - return -1; - } - *buffer = new_buffer; - *buffer_size = fsize; - } - data = (const unsigned char *) new_buffer; + CloseHandle (*map); + return 1; } - -#if HAVE_ZLIB - if (compression_type == 1) - { - /* Process gzip header */ - unsigned int gzip_header_length = 10; - - if (data[3] & 0x4) /* FEXTRA set */ - gzip_header_length += 2 + (unsigned) (data[10] & 0xff) + - (((unsigned) (data[11] & 0xff)) * 256); - - if (data[3] & 0x8) /* FNAME set */ - { - const unsigned char *cptr = data + gzip_header_length; - - /* stored file name is here */ - while ((cptr - data) < fsize) - { - if ('\0' == *cptr) - break; - cptr++; - } - - if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, - EXTRACTOR_METAFORMAT_C_STRING, "text/plain", - (const char *) (data + gzip_header_length), - cptr - (data + gzip_header_length))) - return 0; /* done */ - - gzip_header_length = (cptr - data) + 1; - } - - if (data[3] & 0x16) /* FCOMMENT set */ - { - const unsigned char * cptr = data + gzip_header_length; - - /* stored comment is here */ - while (cptr < data + fsize) - { - if ('\0' == *cptr) - break; - cptr ++; - } - - if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT, - EXTRACTOR_METAFORMAT_C_STRING, "text/plain", - (const char *) (data + gzip_header_length), - cptr - (data + gzip_header_length))) - return 0; /* done */ - - gzip_header_length = (cptr - data) + 1; - } - - if (data[3] & 0x2) /* FCHRC set */ - gzip_header_length += 2; - - memset (&strm, 0, sizeof (z_stream)); - -#ifdef ZLIB_VERNUM - gzip_header_length = 0; -#endif - - if (fsize > gzip_header_length) - { - strm.next_in = (Bytef *) data + gzip_header_length; - strm.avail_in = fsize - gzip_header_length; - } - else - { - strm.next_in = (Bytef *) data; - 
strm.avail_in = 0; - } - strm.total_in = 0; - strm.zalloc = NULL; - strm.zfree = NULL; - strm.opaque = NULL; - - /* - * note: maybe plain inflateInit(&strm) is adequate, - * it looks more backward-compatible also ; - * - * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ; - * there might be a better check. - */ - if (Z_OK == inflateInit2 (&strm, -#ifdef ZLIB_VERNUM - 15 + 32 -#else - -MAX_WBITS -#endif - )) - { - pos = 0; - dsize = 2 * fsize; - if ( (dsize > MAX_DECOMPRESS) || - (dsize < fsize) ) - dsize = MAX_DECOMPRESS; - buf = malloc (dsize); - - if (buf != NULL) - { - strm.next_out = (Bytef *) buf; - strm.avail_out = dsize; - - do - { - ret = inflate (&strm, Z_SYNC_FLUSH); - if (ret == Z_OK) - { - if (dsize == MAX_DECOMPRESS) - break; - - pos += strm.total_out; - strm.total_out = 0; - dsize *= 2; - - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - - rbuf = realloc (buf, dsize); - if (rbuf == NULL) - { - free (buf); - buf = NULL; - break; - } - - buf = rbuf; - strm.next_out = (Bytef *) &buf[pos]; - strm.avail_out = dsize - pos; - } - else if (ret != Z_STREAM_END) - { - /* error */ - free (buf); - buf = NULL; - } - } while ((buf != NULL) && (ret != Z_STREAM_END)); - - dsize = pos + strm.total_out; - if ((dsize == 0) && (buf != NULL)) - { - free (buf); - buf = NULL; - } - } - - inflateEnd (&strm); - - if (fd != -1) - if (*buffer != NULL) - free (*buffer); - - if (buf == NULL) - { - return -1; - } - else - { - *buffer = buf; - *buffer_size = dsize; - return 0; - } - } - } -#endif - -#if HAVE_LIBBZ2 - if (compression_type == 2) - { - memset(&bstrm, 0, sizeof (bz_stream)); - bstrm.next_in = (char *) data; - bstrm.avail_in = fsize; - bstrm.total_in_lo32 = 0; - bstrm.total_in_hi32 = 0; - bstrm.bzalloc = NULL; - bstrm.bzfree = NULL; - bstrm.opaque = NULL; - if (BZ_OK == BZ2_bzDecompressInit(&bstrm, 0,0)) - { - bpos = 0; - dsize = 2 * fsize; - if ( (dsize > MAX_DECOMPRESS) || (dsize < fsize) ) - dsize = MAX_DECOMPRESS; - buf = malloc (dsize); - - if (buf != NULL) 
- { - bstrm.next_out = (char *) buf; - bstrm.avail_out = dsize; - - do - { - bret = BZ2_bzDecompress (&bstrm); - if (bret == Z_OK) - { - if (dsize == MAX_DECOMPRESS) - break; - bpos += bstrm.total_out_lo32; - bstrm.total_out_lo32 = 0; - - dsize *= 2; - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - - rbuf = realloc(buf, dsize); - if (rbuf == NULL) - { - free (buf); - buf = NULL; - break; - } - - buf = rbuf; - bstrm.next_out = (char*) &buf[bpos]; - bstrm.avail_out = dsize - bpos; - } - else if (bret != BZ_STREAM_END) - { - /* error */ - free (buf); - buf = NULL; - } - } while ((buf != NULL) && (bret != BZ_STREAM_END)); - - dsize = bpos + bstrm.total_out_lo32; - if ((dsize == 0) && (buf != NULL)) - { - free (buf); - buf = NULL; - } - } - - BZ2_bzDecompressEnd (&bstrm); - - if (fd != -1) - if (*buffer != NULL) - free (*buffer); - - if (buf == NULL) - { - return -1; - } - else - { - *buffer = buf; - *buffer_size = dsize; - return 0; - } - } - } -#endif - return -1; -} - -/** - * Detect if we have compressed data on our hands. 
- * - * @param data pointer to a data buffer or NULL (in case fd is not -1) - * @param fd a file to read data from, or -1 (if data is not NULL) - * @param fsize size of data (if data is not NULL) or of file (if fd is not -1) - * @param buffer will receive a pointer to the data that this function read - * @param buffer_size will receive size of the buffer - * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression - */ -static int -get_compression_type (const unsigned char *data, int fd, int64_t fsize, void **buffer, size_t *buffer_size) -{ - void *read_data = NULL; - size_t read_data_size = 0; - ssize_t read_result; - - if ((MIN_COMPRESSED_HEADER < 0) || (fsize < MIN_COMPRESSED_HEADER)) - { - *buffer = NULL; - return 0; - } - if (data == NULL) - { - read_data_size = COMPRESSED_DATA_PROBE_SIZE; - read_data = malloc (read_data_size); - if (read_data == NULL) - return -1; - read_result = READ (fd, read_data, read_data_size); - if (read_result != read_data_size) - { - free (read_data); - return -1; - } - *buffer = read_data; - *buffer_size = read_data_size; - data = (const void *) read_data; - } -#if HAVE_ZLIB - if ((fsize >= MIN_ZLIB_HEADER) && (data[0] == 0x1f) && (data[1] == 0x8b) && (data[2] == 0x08)) - return 1; -#endif -#if HAVE_LIBBZ2 - if ((fsize >= MIN_BZ2_HEADER) && (data[0] == 'B') && (data[1] == 'Z') && (data[2] == 'h')) - return 2; -#endif - return 0; -} - -#if WINDOWS + return 0; +} /** * Setup a shared memory segment. 
@@ -1444,16 +1199,15 @@ get_compression_type (const unsigned char *data, int fd, int64_t fsize, void **b * @return 0 on success */ static int -make_shm_w32 (void **ptr, HANDLE *map, char *fn, size_t fn_size, size_t size) +make_file_backed_shm_w32 (HANDLE *map, HANDLE file, char *fn, size_t fn_size) { const char *tpath = "Local\\"; snprintf (fn, fn_size, "%slibextractor-shm-%u-%u", tpath, getpid(), (unsigned int) RANDOM()); - *map = CreateFileMapping (INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, size, fn); - *ptr = MapViewOfFile (*map, FILE_MAP_WRITE, 0, 0, size); - if (*ptr == NULL) + *map = CreateFileMapping (file, NULL, PAGE_READONLY, 0, 0, fn); + if (*map == NULL) { - CloseHandle (*map); + DWORD err = GetLastError (); return 1; } return 0; @@ -1466,6 +1220,12 @@ destroy_shm_w32 (void *ptr, HANDLE map) CloseHandle (map); } +static void +destroy_file_backed_shm_w32 (HANDLE map) +{ + CloseHandle (map); +} + #else /** @@ -1505,30 +1265,671 @@ make_shm_posix (void **ptr, int *shmid, char *fn, size_t fn_size, size_t size) shm_unlink (fn); return 1; } - return 0; + return 0; +} + +static void +destroy_shm_posix (void *ptr, int shm_id, size_t size, char *shm_name) +{ + if (NULL != ptr) + munmap (ptr, size); + if (shm_id != -1) + close (shm_id); + shm_unlink (shm_name); +} +#endif + +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +struct BufferedFileDataSource +{ + int fd; + const unsigned char *data; + + int64_t fsize; + int64_t fpos; + + unsigned char *buffer; + int64_t buffer_pos; + int64_t buffer_bytes; + int64_t buffer_size; +}; + +struct BufferedFileDataSource * +bfds_new (const unsigned char *data, int fd, int64_t fsize); + +void +bfds_delete (struct BufferedFileDataSource *bfds); + +int +bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds, int64_t pos); + +int64_t +bfds_seek (struct BufferedFileDataSource *bfds, int64_t pos, int whence); + +int64_t +bfds_read (struct BufferedFileDataSource *bfds, unsigned char **buf_ptr, int64_t count); + 
+struct BufferedFileDataSource * +bfds_new (const unsigned char *data, int fd, int64_t fsize) +{ + struct BufferedFileDataSource *result; + result = malloc (sizeof (struct BufferedFileDataSource)); + if (result == NULL) + return NULL; + memset (result, 0, sizeof (struct BufferedFileDataSource)); + result->data = data; + result->fsize = fsize; + result->fd = fd; + result->buffer_size = fsize; + if (result->data == NULL) + { + if (result->buffer_size > MAX_READ) + result->buffer_size = MAX_READ; + result->buffer = malloc (result->buffer_size); + if (result->buffer == NULL) + { + free (result); + return NULL; + } + } + bfds_pick_next_buffer_at (result, 0); + return result; +} + +void +bfds_delete (struct BufferedFileDataSource *bfds) +{ + if (bfds->buffer) + free (bfds->buffer); + free (bfds); +} + +int +bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds, int64_t pos) +{ + int64_t position, rd; + if (bfds->data != NULL) + { + bfds->buffer_bytes = bfds->fsize; + return 0; + } +#if WINDOWS + position = _lseeki64 (bfds->fd, pos, SEEK_SET); +#elif HAVE_LSEEK64 + position = lseek64 (bfds->fd, pos, SEEK_SET); +#else + position = (int64_t) lseek (bfds->fd, pos, SEEK_SET); +#endif + if (position < 0) + return -1; + bfds->fpos = position; + rd = read (bfds->fd, bfds->buffer, bfds->buffer_size); + if (rd < 0) + return -1; + bfds->buffer_bytes = rd; + return 0; +} + +int64_t +bfds_seek (struct BufferedFileDataSource *bfds, int64_t pos, int whence) +{ + switch (whence) + { + case SEEK_CUR: + if (bfds->data == NULL) + { + if (0 != bfds_pick_next_buffer_at (bfds, bfds->fpos + bfds->buffer_pos + pos)) + return -1; + bfds->buffer_pos = 0; + return bfds->fpos; + } + bfds->buffer_pos += pos; + return bfds->buffer_pos; + break; + case SEEK_SET: + if (pos < 0) + return -1; + if (bfds->data == NULL) + { + if (0 != bfds_pick_next_buffer_at (bfds, pos)) + return -1; + bfds->buffer_pos = 0; + return bfds->fpos; + } + bfds->buffer_pos = pos; + return bfds->buffer_pos; + break; + 
case SEEK_END: + if (bfds->data == NULL) + { + if (0 != bfds_pick_next_buffer_at (bfds, bfds->fsize + pos)) + return -1; + bfds->buffer_pos = 0; + return bfds->fpos; + } + bfds->buffer_pos = bfds->fsize + pos; + return bfds->buffer_pos; + break; + } + return -1; +} + +int64_t +bfds_read (struct BufferedFileDataSource *bfds, unsigned char **buf_ptr, int64_t count) +{ + if (count > MAX_READ) + return -1; + if (count > bfds->buffer_bytes - bfds->buffer_pos) + { + if (bfds->fpos + bfds->buffer_pos != bfds_seek (bfds, bfds->fpos + bfds->buffer_pos, SEEK_SET)) + return -1; + if (bfds->data == NULL) + { + *buf_ptr = &bfds->buffer[bfds->buffer_pos]; + bfds->buffer_pos += count < bfds->buffer_bytes ? count : bfds->buffer_bytes; + return (count < bfds->buffer_bytes ? count : bfds->buffer_bytes); + } + else + { + int64_t ret = count < (bfds->buffer_bytes - bfds->buffer_pos) ? count : (bfds->buffer_bytes - bfds->buffer_pos); + *buf_ptr = &bfds->data[bfds->buffer_pos]; + bfds->buffer_pos += ret; + return ret; + } + } + else + { + if (bfds->data == NULL) + *buf_ptr = &bfds->buffer[bfds->buffer_pos]; + else + *buf_ptr = &bfds->data[bfds->buffer_pos]; + bfds->buffer_pos += count; + return count; + } +} + +#if HAVE_ZLIB +#define MIN_ZLIB_HEADER 12 +#endif +#if HAVE_LIBBZ2 +#define MIN_BZ2_HEADER 4 +#endif +#if !defined (MIN_COMPRESSED_HEADER) && HAVE_ZLIB +#define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER +#endif +#if !defined (MIN_COMPRESSED_HEADER) && HAVE_LIBBZ2 +#define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER +#endif +#if !defined (MIN_COMPRESSED_HEADER) +#define MIN_COMPRESSED_HEADER -1 +#endif + +#define COMPRESSED_DATA_PROBE_SIZE 3 + +enum ExtractorCompressionType +{ + COMP_TYPE_UNDEFINED = -1, + COMP_TYPE_INVALID = 0, + COMP_TYPE_ZLIB = 1, + COMP_TYPE_BZ2 = 2 +}; + +struct CompressedFileSource +{ + enum ExtractorCompressionType compression_type; + struct BufferedFileDataSource *bfds; + int64_t fsize; + int64_t fpos; + + int64_t uncompressed_size; + + unsigned char *buffer; + 
int64_t buffer_bytes; + int64_t buffer_len; + +#if WINDOWS + HANDLE shm; +#else + int shm; +#endif + char shm_name[MAX_SHM_NAME + 1]; + void *shm_ptr; + int64_t shm_pos; + size_t shm_buf_pos; + int64_t shm_size; + size_t shm_buf_size; + +#if HAVE_ZLIB + z_stream strm; + int ret; + size_t pos; + int gzip_header_length; +#endif +#if HAVE_LIBBZ2 + bz_stream bstrm; + int bret; + size_t bpos; +#endif +}; + +int +cfs_delete (struct CompressedFileSource *cfs) +{ +#if WINDOWS + destroy_shm_w32 (cfs->shm_ptr, cfs->shm); +#else + destroy_shm_posix (cfs->shm_ptr, cfs->shm, cfs->shm_size, cfs->shm_name); +#endif + free (cfs); +} + +int +cfs_reset_stream_zlib (struct CompressedFileSource *cfs) +{ + if (cfs->gzip_header_length != bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET)) + return 0; + cfs->strm.next_in = NULL; + cfs->strm.avail_in = 0; + cfs->strm.total_in = 0; + cfs->strm.zalloc = NULL; + cfs->strm.zfree = NULL; + cfs->strm.opaque = NULL; + + /* + * note: maybe plain inflateInit(&strm) is adequate, + * it looks more backward-compatible also ; + * + * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ; + * there might be a better check. 
+ */ + if (Z_OK != inflateInit2 (&cfs->strm, +#ifdef ZLIB_VERNUM + 15 + 32 +#else + -MAX_WBITS +#endif + )) + { + return -1; + } + + cfs->fpos = cfs->gzip_header_length; + cfs->shm_pos = 0; + cfs->shm_buf_pos = 0; + cfs->shm_buf_size = 0; + +#if HAVE_ZLIB + z_stream strm; + cfs->ret = 0; + cfs->pos = 0; +#endif + return 1; +} + +static int +cfs_reset_stream_bz2 (struct CompressedFileSource *cfs) +{ + return -1; +} + +int +cfs_reset_stream (struct CompressedFileSource *cfs) +{ + switch (cfs->compression_type) + { + case COMP_TYPE_ZLIB: + return cfs_reset_stream_zlib (cfs); + case COMP_TYPE_BZ2: + return cfs_reset_stream_bz2 (cfs); + default: + return -1; + } +} + + +static int +cfs_init_decompressor_zlib (struct CompressedFileSource *cfs, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +{ + /* Process gzip header */ + unsigned int gzip_header_length = 10; + unsigned char *pdata; + unsigned char data[12]; + + if (12 > bfds_read (cfs->bfds, &pdata, 12)) + return -1; + memcpy (data, pdata, 12); + + if (data[3] & 0x4) /* FEXTRA set */ + gzip_header_length += 2 + (unsigned) (data[10] & 0xff) + + (((unsigned) (data[11] & 0xff)) * 256); + + if (data[3] & 0x8) /* FNAME set */ + { + int64_t fp = cfs->fpos; + int64_t buf_bytes; + int len; + unsigned char *buf, *cptr; + if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) + return -1; + buf_bytes = bfds_read (cfs->bfds, &buf, 1024); + if (buf_bytes <= 0) + return -1; + cptr = buf; + + len = 0; + /* stored file name is here */ + while (len < buf_bytes) + { + if ('\0' == *cptr) + break; + cptr++; + len++; + } + + if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME, + EXTRACTOR_METAFORMAT_C_STRING, "text/plain", + (const char *) buf, + len)) + return 0; /* done */ + + /* FIXME: check for correctness */ + //gzip_header_length = (cptr - data) + 1; + gzip_header_length += len + 1; + } + + if (data[3] & 0x16) /* FCOMMENT set */ + { + int64_t fp = cfs->fpos; + int64_t buf_bytes; + int len; + 
unsigned char *buf, *cptr; + if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length, SEEK_SET)) + return -1; + buf_bytes = bfds_read (cfs->bfds, &buf, 1024); + if (buf_bytes <= 0) + return -1; + cptr = buf; + + len = 0; + /* stored file name is here */ + while (len < buf_bytes) + { + if ('\0' == *cptr) + break; + cptr++; + len++; + } + + if (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT, + EXTRACTOR_METAFORMAT_C_STRING, "text/plain", + (const char *) buf, + len)) + return 0; /* done */ + + /* FIXME: check for correctness */ + //gzip_header_length = (cptr - data) + 1; + gzip_header_length += len + 1; + } + + if (data[3] & 0x2) /* FCHRC set */ + gzip_header_length += 2; + + memset (&cfs->strm, 0, sizeof (z_stream)); + +#ifdef ZLIB_VERNUM + gzip_header_length = 0; +#endif + + cfs->gzip_header_length = gzip_header_length; + return cfs_reset_stream_zlib (cfs); +} + +int +cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs) +{ + inflateEnd (&cfs->strm); +} + +static int +cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +{ + return -1; +} + +static int +cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs) +{ + return -1; +} + +static int +cfs_init_decompressor (struct CompressedFileSource *cfs, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +{ + switch (cfs->compression_type) + { + case COMP_TYPE_ZLIB: + return cfs_init_decompressor_zlib (cfs, proc, proc_cls); + case COMP_TYPE_BZ2: + return cfs_init_decompressor_bz2 (cfs, proc, proc_cls); + default: + return -1; + } +} + +static int +cfs_deinit_decompressor (struct CompressedFileSource *cfs) +{ + switch (cfs->compression_type) + { + case COMP_TYPE_ZLIB: + return cfs_deinit_decompressor_zlib (cfs); + case COMP_TYPE_BZ2: + return cfs_deinit_decompressor_bz2 (cfs); + default: + return -1; + } +} + +struct CompressedFileSource * +cfs_new (struct BufferedFileDataSource *bfds, int64_t fsize, enum ExtractorCompressionType 
compression_type, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +{ + int shm_result; + size_t map_size; + struct CompressedFileSource *cfs; + cfs = malloc (sizeof (struct CompressedFileSource)); + if (cfs == NULL) + return NULL; + memset (cfs, 0, sizeof (struct CompressedFileSource)); + cfs->compression_type = compression_type; + cfs->bfds = bfds; + cfs->fsize = fsize; + cfs->uncompressed_size = -1; + cfs->shm_size = MAX_READ; +#if !WINDOWS + shm_result = make_shm_posix ((void **) &cfs->shm_ptr, &cfs->shm, cfs->shm_name, MAX_SHM_NAME, cfs->shm_size); +#else + shm_result = make_shm_w32 ((void **) &cfs->shm_ptr, &cfs->shm, cfs->shm_name, MAX_SHM_NAME, cfs->shm_size); +#endif + if (shm_result != 0) + { + cfs_delete (cfs); + return NULL; + } + return cfs; +} + +#define COM_CHUNK_SIZE (10*1024) + +int +cfs_read_zlib (struct CompressedFileSource *cfs, int64_t preserve) +{ + int ret; + int64_t rc = preserve; + int64_t total = cfs->strm.total_out; + if (preserve > 0) + memmove (cfs->shm_ptr, &((unsigned char *)cfs->shm_ptr)[0], preserve); + + while (rc < cfs->shm_size && ret != Z_STREAM_END) + { + if (cfs->strm.avail_in == 0) + { + int64_t count = bfds_read (cfs->bfds, &cfs->strm.next_in, COM_CHUNK_SIZE); + if (count <= 0) + return 0; + cfs->strm.avail_in = (uInt) count; + } + cfs->strm.next_out = &((unsigned char *)cfs->shm_ptr)[rc]; + cfs->strm.avail_out = cfs->shm_size - rc; + ret = inflate (&cfs->strm, Z_SYNC_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + return 0; + rc = cfs->strm.total_out - total; + } + if (ret == Z_STREAM_END) + cfs->uncompressed_size = cfs->strm.total_out; + cfs->shm_pos = preserve; + cfs->shm_buf_size = rc + preserve; + return 1; +} + +int +cfs_read_bz2 (struct CompressedFileSource *cfs, int64_t preserve) +{ + return -1; +} + +int64_t +cfs_read (struct CompressedFileSource *cfs, int64_t preserve) +{ + switch (cfs->compression_type) + { + case COMP_TYPE_ZLIB: + return cfs_read_zlib (cfs, preserve); + case COMP_TYPE_BZ2: + return 
cfs_read_bz2 (cfs, preserve); + default: + return -1; + } +} + +int64_t +cfs_seek_zlib (struct CompressedFileSource *cfs, int64_t position) +{ + int64_t ret; + if (position > cfs->strm.total_out - cfs->shm_buf_size && position < cfs->strm.total_out) + { + ret = cfs_read (cfs, cfs->strm.total_out - position); + if (ret < 0) + return ret; + return position; + } + while (position >= cfs->strm.total_out) + { + if (0 > (ret = cfs_read (cfs, 0))) + return ret; + if (ret == 0) + return position; + } + if (position < cfs->strm.total_out && position > cfs->strm.total_out - cfs->shm_buf_size) + return cfs->strm.total_out - cfs->shm_buf_size; + return -1; } -static void -destroy_shm_posix (void *ptr, int shm_id, size_t size, char *shm_name) +int64_t +cfs_seek_bz2 (struct CompressedFileSource *cfs, int64_t position) { - if (NULL != ptr) - munmap (ptr, size); - if (shm_id != -1) - close (shm_id); - shm_unlink (shm_name); + return -1; } -#endif +int64_t +cfs_seek (struct CompressedFileSource *cfs, int64_t position) +{ + switch (cfs->compression_type) + { + case COMP_TYPE_ZLIB: + return cfs_seek_zlib (cfs, position); + case COMP_TYPE_BZ2: + return cfs_seek_bz2 (cfs, position); + default: + return -1; + } +} + +/** + * Detect if we have compressed data on our hands. 
+ * + * @param data pointer to a data buffer or NULL (in case fd is not -1) + * @param fd a file to read data from, or -1 (if data is not NULL) + * @param fsize size of data (if data is not NULL) or of file (if fd is not -1) + * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression + */ +static enum ExtractorCompressionType +get_compression_type (const unsigned char *data, int fd, int64_t fsize) +{ + void *read_data = NULL; + size_t read_data_size = 0; + ssize_t read_result; + enum ExtractorCompressionType result = COMP_TYPE_INVALID; + + if ((MIN_COMPRESSED_HEADER < 0) || (fsize < MIN_COMPRESSED_HEADER)) + { + return COMP_TYPE_INVALID; + } + if (data == NULL) + { + int64_t position; + read_data_size = COMPRESSED_DATA_PROBE_SIZE; + read_data = malloc (read_data_size); + if (read_data == NULL) + return -1; +#if WINDOWS + position = _lseeki64 (fd, 0, SEEK_CUR); +#elif HAVE_LSEEK64 + position = lseek64 (fd, 0, SEEK_CUR); +#else + position = (int64_t) lseek (fd, 0, SEEK_CUR); +#endif + read_result = READ (fd, read_data, read_data_size); +#if WINDOWS + position = _lseeki64 (fd, position, SEEK_SET); +#elif HAVE_LSEEK64 + position = lseek64 (fd, position, SEEK_SET); +#else + position = lseek (fd, (off_t) position, SEEK_SET); +#endif + if (read_result != read_data_size) + { + free (read_data); + return COMP_TYPE_UNDEFINED; + } + data = (const void *) read_data; + } +#if HAVE_ZLIB + if ((fsize >= MIN_ZLIB_HEADER) && (data[0] == 0x1f) && (data[1] == 0x8b) && (data[2] == 0x08)) + result = COMP_TYPE_ZLIB; +#endif +#if HAVE_LIBBZ2 + if ((fsize >= MIN_BZ2_HEADER) && (data[0] == 'B') && (data[1] == 'Z') && (data[2] == 'h')) + result = COMP_TYPE_BZ2; +#endif + if (read_data != NULL) + free (read_data); + return result; +} static void -init_plugin_state (struct EXTRACTOR_PluginList *plugin, char *shm_name, int64_t fsize) +init_plugin_state (struct EXTRACTOR_PluginList *plugin, uint8_t operation_mode, int fd, const char *shm_name, int64_t 
fsize) { int write_result; int init_state_size; unsigned char *init_state; int t; size_t shm_name_len = strlen (shm_name) + 1; - init_state_size = 1 + sizeof (size_t) + shm_name_len + sizeof (int64_t); + init_state_size = 1 + sizeof (size_t) + shm_name_len + sizeof (uint8_t) + sizeof (int64_t); + plugin->operation_mode = operation_mode; switch (plugin->flags) { case EXTRACTOR_OPTION_DEFAULT_POLICY: @@ -1542,6 +1943,8 @@ init_plugin_state (struct EXTRACTOR_PluginList *plugin, char *shm_name, int64_t t = 0; init_state[t] = MESSAGE_INIT_STATE; t += 1; + memcpy (&init_state[t], &operation_mode, sizeof (uint8_t)); + t += sizeof (uint8_t); memcpy (&init_state[t], &fsize, sizeof (int64_t)); t += sizeof (int64_t); memcpy (&init_state[t], &shm_name_len, sizeof (size_t)); @@ -1558,10 +1961,7 @@ init_plugin_state (struct EXTRACTOR_PluginList *plugin, char *shm_name, int64_t plugin->seek_request = 0; break; case EXTRACTOR_OPTION_IN_PROCESS: - plugin_open_shm (plugin, shm_name); - plugin->fsize = fsize; - plugin->init_state_method (plugin); - plugin->seek_request = 0; + init_state_method (plugin, operation_mode, fsize, shm_name); return; break; case EXTRACTOR_OPTION_DISABLED: @@ -1593,7 +1993,7 @@ discard_plugin_state (struct EXTRACTOR_PluginList *plugin) } break; case EXTRACTOR_OPTION_IN_PROCESS: - plugin->discard_state_method (plugin); + discard_state_method (plugin); return; break; case EXTRACTOR_OPTION_DISABLED: @@ -1603,10 +2003,234 @@ discard_plugin_state (struct EXTRACTOR_PluginList *plugin) } static int -give_shm_to_plugin (struct EXTRACTOR_PluginList *plugin, int64_t position, size_t map_size) +pl_pick_next_buffer_at (struct EXTRACTOR_PluginList *plugin, int64_t pos, uint8_t want_start) +{ + if (plugin->operation_mode == OPMODE_MEMORY) + { + int64_t old_pos; + int64_t gran_fix; +#if !WINDOWS + if (plugin->shm_ptr != NULL) + munmap (plugin->shm_ptr, plugin->map_size); +#else + if (plugin->shm_ptr != NULL) + UnmapViewOfFile (plugin->shm_ptr); +#endif + plugin->shm_ptr = 
NULL; + old_pos = plugin->fpos + plugin->shm_pos; + if (pos < 0) + pos = 0; + if (pos > plugin->fsize) + pos = plugin->fsize - 1; + plugin->fpos = pos; + plugin->map_size = MAX_READ; + plugin->shm_pos = old_pos - plugin->fpos; + if (want_start) + gran_fix = -1 * (plugin->fpos % plugin->allocation_granularity); + else + { + gran_fix = plugin->fpos % plugin->allocation_granularity; + if (gran_fix > 0) + gran_fix = plugin->allocation_granularity - gran_fix; + } + if (plugin->fpos + gran_fix + plugin->map_size > plugin->fsize) + plugin->map_size = plugin->fsize - plugin->fpos - gran_fix; + plugin->fpos += gran_fix; +#if !WINDOWS + if ((-1 == plugin->shm_id) || + (NULL == (plugin->shm_ptr = mmap (NULL, plugin->map_size, PROT_READ, MAP_SHARED, plugin->shm_id, plugin->fpos))) || + (plugin->shm_ptr == (void *) -1)) + { + return -1; + } +#else + LARGE_INTEGER off; + off.QuadPart = plugin->fpos; + if ((plugin->map_handle == 0) || + (NULL == (plugin->shm_ptr = MapViewOfFile (plugin->map_handle, FILE_MAP_READ, off.HighPart, off.LowPart, plugin->map_size)))) + { + DWORD err = GetLastError (); + return -1; + } +#endif + plugin->shm_pos -= gran_fix; + return 0; + } + if (plugin->operation_mode == OPMODE_FILE) + { + int64_t old_pos; + int64_t gran_fix; +#if !WINDOWS + if (plugin->shm_ptr != NULL) + munmap (plugin->shm_ptr, plugin->map_size); +#else + if (plugin->shm_ptr != NULL) + UnmapViewOfFile (plugin->shm_ptr); +#endif + plugin->shm_ptr = NULL; + old_pos = plugin->fpos + plugin->shm_pos; + if (pos < 0) + pos = 0; + if (pos > plugin->fsize) + pos = plugin->fsize - 1; + plugin->fpos = pos; + plugin->map_size = MAX_READ; + plugin->shm_pos = old_pos - plugin->fpos; + if (want_start) + gran_fix = -1 * (plugin->fpos % plugin->allocation_granularity); + else + { + gran_fix = plugin->fpos % plugin->allocation_granularity; + if (gran_fix > 0) + gran_fix = plugin->allocation_granularity - gran_fix; + } + if (plugin->fpos + gran_fix + plugin->map_size > plugin->fsize) + plugin->map_size 
= plugin->fsize - plugin->fpos - gran_fix; + plugin->fpos += gran_fix; +#if !WINDOWS + if ((-1 == plugin->shm_id) || + (NULL == (plugin->shm_ptr = mmap (NULL, plugin->map_size, PROT_READ, MAP_SHARED, plugin->shm_id, plugin->fpos))) || + (plugin->shm_ptr == (void *) -1)) + { + return -1; + } +#else + LARGE_INTEGER off; + off.QuadPart = plugin->fpos; + if ((plugin->map_handle == 0) || + (NULL == (plugin->shm_ptr = MapViewOfFile (plugin->map_handle, FILE_MAP_READ, off.HighPart, off.LowPart, plugin->map_size)))) + { + DWORD err = GetLastError (); + return -1; + } +#endif + plugin->shm_pos -= gran_fix; + return 0; + } + if (plugin->operation_mode == OPMODE_DECOMPRESS) + { + if (plugin->pipe_in != 0) + { + int64_t old_pos; + old_pos = plugin->fpos + plugin->shm_pos; + plugin->seek_request = pos; + while (plugin->fpos != pos) + { + plugin->waiting_for_update = 1; + if (process_requests (plugin) < 0) + return -1; + plugin->waiting_for_update = 0; + } + plugin->shm_pos = old_pos - plugin->fpos; + } + else + { + if (pos < plugin->fpos) + { + if (1 != cfs_reset_stream (plugin->state)) + return -1; + } + while (plugin->fpos < pos && plugin->fpos >= 0) + plugin->fpos = cfs_seek (plugin->state, pos); + plugin->fsize = ((struct CompressedFileSource *)plugin->state)->uncompressed_size; + plugin->shm_pos = pos - plugin->fpos; + } + return 0; + } +} + +int64_t +pl_seek (struct EXTRACTOR_PluginList *plugin, int64_t pos, int whence) +{ + switch (whence) + { + case SEEK_CUR: + if (plugin->shm_pos + pos < plugin->map_size && plugin->shm_pos + pos >= 0) + { + plugin->shm_pos += pos; + return plugin->fpos + plugin->shm_pos; + } + if (0 != pl_pick_next_buffer_at (plugin, plugin->fpos + plugin->shm_pos + pos, 1)) + return -1; + plugin->shm_pos += pos; + return plugin->fpos + plugin->shm_pos; + break; + case SEEK_SET: + if (pos < 0) + return -1; + if (pos >= plugin->fpos && pos < plugin->fpos + plugin->map_size) + { + plugin->shm_pos = pos - plugin->fpos; + return pos; + } + if (0 != 
pl_pick_next_buffer_at (plugin, pos, 1)) + return -1; + if (pos >= plugin->fpos && pos < plugin->fpos + plugin->map_size) + { + plugin->shm_pos = pos - plugin->fpos; + return pos; + } + return -1; + break; + case SEEK_END: + while (plugin->fsize == -1) + { + pl_pick_next_buffer_at (plugin, plugin->fpos + plugin->map_size + pos, 0); + } + if (plugin->fsize + pos - 1 >= plugin->fpos && plugin->fsize + pos - 1 <= plugin->fpos + plugin->map_size) + { + plugin->shm_pos = plugin->fsize + pos - plugin->fpos; + return plugin->fpos + plugin->shm_pos - 1; + } + if (0 != pl_pick_next_buffer_at (plugin, plugin->fsize - MAX_READ, 0)) + return -1; + plugin->shm_pos = plugin->fsize + pos - plugin->fpos; + return plugin->fsize + pos - 1; + break; + } + return -1; +} + +int64_t +pl_get_fsize (struct EXTRACTOR_PluginList *plugin) +{ + return plugin->fsize; +} + +int64_t +pl_get_pos (struct EXTRACTOR_PluginList *plugin) +{ + return plugin->fpos + plugin->shm_pos; +} + +int64_t +pl_read (struct EXTRACTOR_PluginList *plugin, unsigned char **data, size_t count) +{ + if (count > MAX_READ) + return -1; + if (count > plugin->map_size - plugin->shm_pos) + { + int64_t actual_count; + if (plugin->fpos + plugin->shm_pos != pl_seek (plugin, plugin->fpos + plugin->shm_pos, SEEK_SET)) + return -1; + *data = &plugin->shm_ptr[plugin->shm_pos]; + actual_count = (count < plugin->map_size - plugin->shm_pos ? 
count : plugin->map_size - plugin->shm_pos); + plugin->shm_pos += actual_count; + return actual_count; + } + else + { + *data = &plugin->shm_ptr[plugin->shm_pos]; + plugin->shm_pos += count; + return count; + } +} + +static int +give_shm_to_plugin (struct EXTRACTOR_PluginList *plugin, int64_t position, size_t map_size, int64_t fsize, uint8_t operation_mode) { int write_result; - int updated_shm_size = 1 + sizeof (int64_t) + sizeof (size_t); + int updated_shm_size = 1 + sizeof (int64_t) + sizeof (size_t) + sizeof (int64_t); unsigned char updated_shm[updated_shm_size]; int t = 0; updated_shm[t] = MESSAGE_UPDATED_SHM; @@ -1615,22 +2239,31 @@ give_shm_to_plugin (struct EXTRACTOR_PluginList *plugin, int64_t position, size_ t += sizeof (int64_t); memcpy (&updated_shm[t], &map_size, sizeof (size_t)); t += sizeof (size_t); + memcpy (&updated_shm[t], &fsize, sizeof (int64_t)); + t += sizeof (int64_t); switch (plugin->flags) { case EXTRACTOR_OPTION_DEFAULT_POLICY: case EXTRACTOR_OPTION_OUT_OF_PROCESS_NO_RESTART: - if (plugin->seek_request < 0) - return 0; - write_result = plugin_write (plugin, updated_shm, updated_shm_size); - if (write_result < updated_shm_size) + if (operation_mode == OPMODE_DECOMPRESS) { - stop_process (plugin); - return 0; + if (plugin->seek_request < 0) + return 0; + write_result = plugin_write (plugin, updated_shm, updated_shm_size); + if (write_result < updated_shm_size) + { + stop_process (plugin); + return 0; + } } return 1; case EXTRACTOR_OPTION_IN_PROCESS: - plugin->position = position; - plugin->map_size = map_size; + if (operation_mode == OPMODE_DECOMPRESS) + { + plugin->fpos = position; + plugin->map_size = map_size; + plugin->fsize = fsize; + } return 0; case EXTRACTOR_OPTION_DISABLED: return 0; @@ -1640,7 +2273,7 @@ give_shm_to_plugin (struct EXTRACTOR_PluginList *plugin, int64_t position, size_ } static void -ask_in_process_plugin (struct EXTRACTOR_PluginList *plugin, int64_t position, void *shm_ptr, EXTRACTOR_MetaDataProcessor proc, void 
*proc_cls) +ask_in_process_plugin (struct EXTRACTOR_PluginList *plugin, void *shm_ptr, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { int extract_reply; switch (plugin->flags) @@ -1931,9 +2564,10 @@ wait_for_reply (struct EXTRACTOR_PluginList *plugins, EXTRACTOR_MetaDataProcesso #endif static int64_t -seek_to_new_position (struct EXTRACTOR_PluginList *plugins, int fd, int64_t fsize, int64_t current_position) +seek_to_new_position (struct EXTRACTOR_PluginList *plugins, struct CompressedFileSource *cfs, int64_t current_position, int64_t map_size) { - int64_t min_pos = fsize; + int64_t min_pos = current_position + map_size; + int64_t min_plugin_pos = 0x7FFFFFFFFFFFFFF; struct EXTRACTOR_PluginList *ppos; for (ppos = plugins; NULL != ppos; ppos = ppos->next) { @@ -1942,26 +2576,24 @@ seek_to_new_position (struct EXTRACTOR_PluginList *plugins, int fd, int64_t fsiz case EXTRACTOR_OPTION_DEFAULT_POLICY: case EXTRACTOR_OPTION_OUT_OF_PROCESS_NO_RESTART: case EXTRACTOR_OPTION_IN_PROCESS: - if (ppos->seek_request > 0 && ppos->seek_request >= current_position && - ppos->seek_request <= min_pos) - min_pos = ppos->seek_request; + if (ppos->seek_request >= 0 && ppos->seek_request <= min_pos) + min_pos = ppos->seek_request; + if (ppos->seek_request >= 0 && ppos->seek_request <= min_plugin_pos) + min_plugin_pos = ppos->seek_request; break; case EXTRACTOR_OPTION_DISABLED: break; } } - if (min_pos >= fsize) + if (min_plugin_pos == 0x7FFFFFFFFFFFFFF) return -1; -#if WINDOWS - _lseeki64 (fd, min_pos, SEEK_SET); -#elif !HAVE_SEEK64 - lseek64 (fd, min_pos, SEEK_SET); -#else - if (min_pos >= INT_MAX) - return -1; - lseek (fd, (ssize_t) min_pos, SEEK_SET); -#endif - return min_pos; + if (min_pos < current_position - map_size) + { + if (1 != cfs_reset_stream (cfs)) + return -1; + return 0; + } + return cfs_seek (cfs, min_pos); } static void @@ -1992,8 +2624,10 @@ load_in_process_plugin (struct EXTRACTOR_PluginList *plugin) * @param proc_cls cls argument to proc */ static void 
-do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, int64_t fsize, void *buffer, size_t buffer_size, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) +do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, const char *filename, struct CompressedFileSource *cfs, int64_t fsize, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { + int operation_mode; + int plugin_count = 0; int shm_result; unsigned char *shm_ptr; #if !WINDOWS @@ -2006,26 +2640,56 @@ do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, int6 struct EXTRACTOR_PluginList *ppos; int64_t position = 0; + int64_t preserve = 0; size_t map_size; ssize_t read_result; int kill_plugins = 0; + if (cfs != NULL) + operation_mode = OPMODE_DECOMPRESS; + else if (data != NULL) + operation_mode = OPMODE_MEMORY; + else if (fd != -1) + operation_mode = OPMODE_FILE; + else + return; + map_size = (fd == -1) ? fsize : MAX_READ; - /* Make a shared memory object. Even if we're running in-process. Simpler that way */ + /* Make a shared memory object. Even if we're running in-process. Simpler that way. + * This is only for reading-from-memory case. For reading-from-file we will use + * the file itself; for uncompressing-on-the-fly the decompressor will make its own + * shared memory object and uncompress into it directly. 
+ */ + if (operation_mode == OPMODE_MEMORY) + { + operation_mode = OPMODE_MEMORY; #if !WINDOWS - shm_result = make_shm_posix ((void **) &shm_ptr, &shm_id, shm_name, MAX_SHM_NAME, - map_size); + shm_result = make_shm_posix ((void **) &shm_ptr, &shm_id, shm_name, MAX_SHM_NAME, + fsize); #else - shm_result = make_shm_w32 ((void **) &shm_ptr, &map_handle, shm_name, MAX_SHM_NAME, - map_size); + shm_result = make_shm_w32 ((void **) &shm_ptr, &map_handle, shm_name, MAX_SHM_NAME, + fsize); #endif - if (shm_result != 0) - return; + if (shm_result != 0) + return; + memcpy (shm_ptr, data, fsize); + } + else if (operation_mode == OPMODE_FILE) + { +#if WINDOWS + shm_result = make_file_backed_shm_w32 (&map_handle, (HANDLE) _get_osfhandle (fd), shm_name, MAX_SHM_NAME); + if (shm_result != 0) + return; +#endif + } - /* This three-loops-instead-of-one construction is intended to increase parallelism */ + /* This four-loops-instead-of-one construction is intended to increase parallelism */ for (ppos = plugins; NULL != ppos; ppos = ppos->next) + { start_process (ppos); + plugin_count += 1; + } for (ppos = plugins; NULL != ppos; ppos = ppos->next) load_in_process_plugin (ppos); @@ -2033,29 +2697,33 @@ do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, int6 for (ppos = plugins; NULL != ppos; ppos = ppos->next) write_plugin_data (ppos); - for (ppos = plugins; NULL != ppos; ppos = ppos->next) - init_plugin_state (ppos, shm_name, fsize); + if (operation_mode == OPMODE_DECOMPRESS) + { + for (ppos = plugins; NULL != ppos; ppos = ppos->next) + init_plugin_state (ppos, operation_mode, -1, cfs->shm_name, -1); + } + else if (operation_mode == OPMODE_FILE) + { + for (ppos = plugins; NULL != ppos; ppos = ppos->next) +#if !WINDOWS + init_plugin_state (ppos, operation_mode, fd, filename, fsize); +#else + init_plugin_state (ppos, operation_mode, fd, shm_name, fsize); +#endif + } + else + { + for (ppos = plugins; NULL != ppos; ppos = ppos->next) + init_plugin_state (ppos, 
operation_mode, -1, shm_name, fsize); + } - while (1) + if (operation_mode == OPMODE_FILE || operation_mode == OPMODE_MEMORY) { int plugins_not_ready = 0; - if (fd != -1) - { - /* fill the share buffer with data from the file */ - if (buffer_size > 0) - memcpy (shm_ptr, buffer, buffer_size); - read_result = READ (fd, &shm_ptr[buffer_size], MAX_READ - buffer_size); - if (read_result <= 0) - break; - else - map_size = read_result + buffer_size; - if (buffer_size > 0) - buffer_size = 0; - } for (ppos = plugins; NULL != ppos; ppos = ppos->next) - plugins_not_ready += give_shm_to_plugin (ppos, position, map_size); + plugins_not_ready += give_shm_to_plugin (ppos, position, map_size, fsize, operation_mode); for (ppos = plugins; NULL != ppos; ppos = ppos->next) - ask_in_process_plugin (ppos, position, shm_ptr, proc, proc_cls); + ask_in_process_plugin (ppos, shm_ptr, proc, proc_cls); while (plugins_not_ready > 0 && !kill_plugins) { int ready = wait_for_reply (plugins, proc, proc_cls); @@ -2063,17 +2731,40 @@ do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, int6 kill_plugins = 1; plugins_not_ready -= ready; } - if (kill_plugins) - break; - if (fd != -1) + } + else + { + read_result = cfs_read (cfs, preserve); + if (read_result > 0) + while (1) { - position += map_size; - position = seek_to_new_position (plugins, fd, fsize, position); - if (position < 0) + int plugins_not_ready = 0; + + map_size = cfs->shm_buf_size; + for (ppos = plugins; NULL != ppos; ppos = ppos->next) + plugins_not_ready += give_shm_to_plugin (ppos, position, map_size, cfs->uncompressed_size, operation_mode); + /* Can't block in in-process plugins, unless we ONLY have one plugin */ + if (plugin_count == 1) + for (ppos = plugins; NULL != ppos; ppos = ppos->next) + { + /* Pass this way. 
we'll need it to call cfs functions later on */ + /* This is a special case */ + ppos->state = cfs; + ask_in_process_plugin (ppos, cfs->shm_ptr, proc, proc_cls); + } + while (plugins_not_ready > 0 && !kill_plugins) + { + int ready = wait_for_reply (plugins, proc, proc_cls); + if (ready <= 0) + kill_plugins = 1; + plugins_not_ready -= ready; + } + if (kill_plugins) + break; + position = seek_to_new_position (plugins, cfs, position, map_size); + if (position < 0 || position == cfs->uncompressed_size) break; } - else - break; } if (kill_plugins) @@ -2082,11 +2773,20 @@ do_extract (struct EXTRACTOR_PluginList *plugins, const char *data, int fd, int6 for (ppos = plugins; NULL != ppos; ppos = ppos->next) discard_plugin_state (ppos); + if (operation_mode == OPMODE_MEMORY) + { #if WINDOWS - destroy_shm_w32 (shm_ptr, map_handle); + destroy_shm_w32 (shm_ptr, map_handle); #else - destroy_shm_posix (shm_ptr, shm_id, (fd == -1) ? fsize : MAX_READ, shm_name); + destroy_shm_posix (shm_ptr, shm_id, (fd == -1) ? fsize : MAX_READ, shm_name); +#endif + } + else if (operation_mode == OPMODE_FILE) + { +#if WINDOWS + destroy_file_backed_shm_w32 (map_handle); #endif + } } @@ -2115,11 +2815,11 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, int fd = -1; struct stat64 fstatbuf; int64_t fsize = 0; - int memory_only = 1; - int compression_type = -1; + enum ExtractorCompressionType compression_type = -1; void *buffer = NULL; size_t buffer_size; int decompression_result; + struct CompressedFileSource *cfs = NULL; /* If data is not given, then we need to read it from the file. 
Try opening it */ if ((data == NULL) && @@ -2136,9 +2836,6 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, close(fd); return; } - /* File is too big -> can't read it into memory */ - if (fsize > MAX_READ) - memory_only = 0; } /* Data is not given, and we've failed to open the file with data -> exit */ @@ -2149,11 +2846,8 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, fsize = size; errno = 0; - /* Peek at first few bytes of the file (or of the data), and see if it's compressed. - * If data is NULL, buffer is allocated by the function and holds the first few bytes - * of the file, buffer_size is set too. - */ - compression_type = get_compression_type (data, fd, fsize, &buffer, &buffer_size); + /* Peek at first few bytes of the file (or of the data), and see if it's compressed. */ + compression_type = get_compression_type (data, fd, fsize); if (compression_type < 0) { /* errno is set by get_compression_type () */ @@ -2161,62 +2855,53 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, close (fd); return; } + + struct BufferedFileDataSource *bfds; + bfds = bfds_new (data, fd, fsize); + if (bfds == NULL) + return; + if (compression_type > 0) { - /* Don't assume that MAX_DECOMPRESS < MAX_READ */ - if ((fsize > MAX_DECOMPRESS) || (fsize > MAX_READ)) + int icr = 0; + /* Set up a decompressor. + * Will also report compression-related metadata to the caller. + */ + cfs = cfs_new (bfds, fsize, compression_type, proc, proc_cls); + if (cfs == NULL) { - /* File or data is to big to be decompressed in-memory (the only kind of decompression we do) */ - errno = EFBIG; if (fd != -1) close (fd); - if (buffer != NULL) - free (buffer); + errno = EILSEQ; return; } - /* Decompress data (or file contents + what we've read so far. Either way it writes a new - * pointer to buffer, sets buffer_size, and frees the old buffer (if it wasn't NULL). - * In case of failure it cleans up the buffer after itself. 
- * Will also report compression-related metadata to the caller. - */ - decompression_result = try_to_decompress (data, fd, fsize, compression_type, &buffer, &buffer_size, proc, proc_cls); - if (decompression_result != 0) + icr = cfs_init_decompressor (cfs, proc, proc_cls); + if (icr < 0) { - /* Buffer is taken care of already */ - close (fd); + if (fd != -1) + close (fd); errno = EILSEQ; return; } - else + else if (icr == 0) { - close (fd); - fd = -1; + if (fd != -1) + close (fd); + errno = 0; + return; } } - /* Now we either have a non-NULL data of fsize bytes - * OR a valid fd to read from and a small buffer of buffer_size bytes - * OR an invalid fd and a big buffer of buffer_size bytes - * Simplify this situation a bit: - */ - if ((data == NULL) && (fd == -1) && (buffer_size > 0)) - { - data = (const void *) buffer; - fsize = buffer_size; - } - - /* Now we either have a non-NULL data of fsize bytes - * OR a valid fd to read from and a small buffer of buffer_size bytes - * and we might need to free the buffer later in either case - */ - /* do_extract () might set errno itself, but from our point of view everything is OK */ errno = 0; - do_extract (plugins, data, fd, fsize, buffer, buffer_size, proc, proc_cls); - - if (buffer != NULL) - free (buffer); + do_extract (plugins, data, fd, filename, cfs, fsize, proc, proc_cls); + if (cfs != NULL) + { + cfs_deinit_decompressor (cfs); + cfs_delete (cfs); + } + bfds_delete (bfds); if (-1 != fd) close(fd); } @@ -2238,7 +2923,7 @@ RundllEntryPoint (HWND hwnd, out = _open_osfhandle (out_h, 0); setmode (in, _O_BINARY); setmode (out, _O_BINARY); - process_requests (read_plugin_data (in), + plugin_main (read_plugin_data (in), in, out); } diff --git a/src/main/extractor_plugins.c b/src/main/extractor_plugins.c @@ -208,20 +208,11 @@ plugin_load (struct EXTRACTOR_PluginList *plugin) "_EXTRACTOR_%s_extract_method", plugin->libname, &plugin->specials); - plugin->init_state_method = get_symbol_with_prefix (plugin->libraryHandle, - 
"_EXTRACTOR_%s_init_state_method", - plugin->libname, - &plugin->specials); - plugin->discard_state_method = get_symbol_with_prefix (plugin->libraryHandle, - "_EXTRACTOR_%s_discard_state_method", - plugin->libname, - &plugin->specials); - if (plugin->extract_method == NULL || plugin->init_state_method == NULL || - plugin->discard_state_method == NULL) + if (plugin->extract_method == NULL) { #if DEBUG fprintf (stderr, - "Resolving `extract', 'init_state' or 'discard_state' method(s) of plugin `%s' failed: %s\n", + "Resolving `extract' method of plugin `%s' failed: %s\n", plugin->short_libname, lt_dlerror ()); #endif @@ -285,6 +276,20 @@ EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev, result->plugin_options = strdup (options); else result->plugin_options = NULL; + /* This is kinda weird, but it allows us to not to call GetSystemInfo() + * or sysconf() every time we need allocation granularity - just once + * for each plugin. + * The only alternative is to keep it in a global variable... + */ +#if WINDOWS + { + SYSTEM_INFO si; + GetSystemInfo (&si); + result->allocation_granularity = si.dwAllocationGranularity; + } +#else + result->allocation_granularity = sysconf (_SC_PAGE_SIZE); +#endif return result; } diff --git a/src/main/extractor_plugins.h b/src/main/extractor_plugins.h @@ -65,8 +65,6 @@ struct EXTRACTOR_PluginList * Pointer to the function used for meta data extraction. */ EXTRACTOR_extract_method extract_method; - EXTRACTOR_init_state_method init_state_method; - EXTRACTOR_discard_state_method discard_state_method; /** * Options for the plugin. @@ -103,6 +101,7 @@ struct EXTRACTOR_PluginList #else HANDLE cpipe_in; #endif + int pipe_in; /** * A position this plugin wants us to seek to. -1 if it's finished. 
@@ -120,11 +119,13 @@ struct EXTRACTOR_PluginList int64_t fsize; - int64_t position; + int64_t fpos; unsigned char *shm_ptr; - size_t map_size; + int64_t map_size; + + int64_t shm_pos; /** * Pipe used to read information about extracted meta data from @@ -136,6 +137,12 @@ struct EXTRACTOR_PluginList HANDLE cpipe_out; #endif +#if !WINDOWS + long allocation_granularity; +#else + DWORD allocation_granularity; +#endif + #if WINDOWS /** * A structure for overlapped reads on W32. @@ -152,6 +159,9 @@ struct EXTRACTOR_PluginList */ unsigned char *ov_write_buffer; #endif + + uint8_t operation_mode; + int waiting_for_update; }; /** @@ -163,4 +173,16 @@ struct EXTRACTOR_PluginList int plugin_load (struct EXTRACTOR_PluginList *plugin); +int64_t +pl_read (struct EXTRACTOR_PluginList *plugin, unsigned char **data, size_t count); + +int64_t +pl_seek (struct EXTRACTOR_PluginList *plugin, int64_t pos, int whence); + +int64_t +pl_get_fsize (struct EXTRACTOR_PluginList *plugin); + +int64_t +pl_get_pos (struct EXTRACTOR_PluginList *plugin); + #endif /* EXTRACTOR_PLUGINS_H */ diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -14,6 +14,7 @@ SUBDIRS = . 
plugin_LTLIBRARIES = \ libextractor_id3.la \ libextractor_id3v2.la \ + libextractor_ebml.la \ libextractor_mp3.la libextractor_mp3_la_SOURCES = \ @@ -22,11 +23,13 @@ libextractor_mp3_la_LDFLAGS = \ $(PLUGINFLAGS) libextractor_mp3_la_LIBADD = \ $(top_builddir)/src/common/libextractor_common.la \ + $(top_builddir)/src/main/libextractor.la \ $(LE_LIBINTL) libextractor_ebml_la_SOURCES = \ ebml_extractor.c libextractor_ebml_la_LDFLAGS = \ + $(top_builddir)/src/main/libextractor.la \ $(PLUGINFLAGS) libextractor_id3_la_SOURCES = \ @@ -35,6 +38,7 @@ libextractor_id3_la_LDFLAGS = \ $(PLUGINFLAGS) libextractor_id3_la_LIBADD = \ $(top_builddir)/src/common/libextractor_common.la \ + $(top_builddir)/src/main/libextractor.la \ $(LE_LIBINTL) libextractor_id3v2_la_SOURCES = \ @@ -42,6 +46,7 @@ libextractor_id3v2_la_SOURCES = \ libextractor_id3v2_la_LDFLAGS = \ $(PLUGINFLAGS) libextractor_id3v2_la_LIBADD = \ + $(top_builddir)/src/main/libextractor.la \ $(top_builddir)/src/common/libextractor_common.la EXTRA_DIST = template_extractor.c diff --git a/src/plugins/id3_extractor.c b/src/plugins/id3_extractor.c @@ -201,46 +201,6 @@ static const char *const genre_names[] = { #define OK 0 #define INVALID_ID3 1 -struct id3_state -{ - int state; - id3tag info; -}; - -enum ID3State -{ - ID3_INVALID = -1, - ID3_SEEKING_TO_TAIL = 0, - ID3_READING_TAIL = 1 -}; - -void -EXTRACTOR_id3_init_state_method (struct EXTRACTOR_PluginList *plugin) -{ - struct id3_state *state; - state = plugin->state = malloc (sizeof (struct id3_state)); - if (state == NULL) - return; - memset (state, 0, sizeof (struct id3_state)); - state->state = ID3_SEEKING_TO_TAIL; -} - -void -EXTRACTOR_id3_discard_state_method (struct EXTRACTOR_PluginList *plugin) -{ - struct id3_state *state = plugin->state; - if (state != NULL) - { - if (state->info.title != NULL) free (state->info.title); - if (state->info.year != NULL) free (state->info.year); - if (state->info.album != NULL) free (state->info.album); - if (state->info.artist != 
NULL) free (state->info.artist); - if (state->info.comment != NULL) free (state->info.comment); - free (state); - } - plugin->state = NULL; -} - static void trim (char *k) { @@ -302,74 +262,44 @@ int EXTRACTOR_id3_extract_method (struct EXTRACTOR_PluginList *plugin, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { - int64_t file_position; - int64_t file_size; - int64_t offset = 0; - int64_t size; - struct id3_state *state; + id3tag info; + int64_t fsize; char *data; - char track[16]; - if (plugin == NULL || plugin->state == NULL) + if (plugin == NULL) return 1; - state = plugin->state; - file_position = plugin->position; - file_size = plugin->fsize; - size = plugin->map_size; - data = (char *) plugin->shm_ptr; - - if (plugin->seek_request < 0) + pl_seek (plugin, -128, SEEK_END); + fsize = pl_get_fsize (plugin); + if (fsize <= 0) return 1; - if (file_position - plugin->seek_request > 0) - { - plugin->seek_request = -1; + + if (128 != pl_read (plugin, &data, 128)) return 1; - } - if (plugin->seek_request - file_position < size) - offset = plugin->seek_request - file_position; - while (1) + memset (&info, 0, sizeof (info)); + + if (OK != get_id3 (data, 0, 128, &info)) + return 1; + ADD (info.title, EXTRACTOR_METATYPE_TITLE); + ADD (info.artist, EXTRACTOR_METATYPE_ARTIST); + ADD (info.album, EXTRACTOR_METATYPE_ALBUM); + ADD (info.year, EXTRACTOR_METATYPE_PUBLICATION_YEAR); + ADD (info.genre, EXTRACTOR_METATYPE_GENRE); + ADD (info.comment, EXTRACTOR_METATYPE_COMMENT); + if (info.track_number != 0) { - switch (state->state) - { - case ID3_INVALID: - plugin->seek_request = -1; - return 1; - case ID3_SEEKING_TO_TAIL: - offset = file_size - 128 - file_position; - if (offset > size) - { - state->state = ID3_READING_TAIL; - plugin->seek_request = file_position + offset; - return 0; - } - else if (offset < 0) - { - state->state = ID3_INVALID; - break; - } - state->state = ID3_READING_TAIL; - break; - case ID3_READING_TAIL: - if (OK != get_id3 (data, offset, size - offset, 
&state->info)) - return 1; - ADD (state->info.title, EXTRACTOR_METATYPE_TITLE); - ADD (state->info.artist, EXTRACTOR_METATYPE_ARTIST); - ADD (state->info.album, EXTRACTOR_METATYPE_ALBUM); - ADD (state->info.year, EXTRACTOR_METATYPE_PUBLICATION_YEAR); - ADD (state->info.genre, EXTRACTOR_METATYPE_GENRE); - ADD (state->info.comment, EXTRACTOR_METATYPE_COMMENT); - if (state->info.track_number != 0) - { - snprintf(track, - sizeof(track), "%u", state->info.track_number); - ADD (track, EXTRACTOR_METATYPE_TRACK_NUMBER); - } - state->state = ID3_INVALID; - } + snprintf (track, sizeof(track), "%u", info.track_number); + ADD (track, EXTRACTOR_METATYPE_TRACK_NUMBER); } + + if (info.title != NULL) free (info.title); + if (info.year != NULL) free (info.year); + if (info.album != NULL) free (info.album); + if (info.artist != NULL) free (info.artist); + if (info.comment != NULL) free (info.comment); + return 1; } diff --git a/src/plugins/id3v2_extractor.c b/src/plugins/id3v2_extractor.c @@ -215,30 +215,30 @@ enum ID3v2State ID3V2_READING_FRAME }; -void -EXTRACTOR_id3v2_init_state_method (struct EXTRACTOR_PluginList *plugin) +struct id3v2_state * +EXTRACTOR_id3v2_init_state_method () { struct id3v2_state *state; - state = plugin->state = malloc (sizeof (struct id3v2_state)); + state = malloc (sizeof (struct id3v2_state)); if (state == NULL) - return; + return NULL; memset (state, 0, sizeof (struct id3v2_state)); state->state = ID3V2_READING_HEADER; state->ti = -1; state->mime = NULL; + return state; } -void -EXTRACTOR_id3v2_discard_state_method (struct EXTRACTOR_PluginList *plugin) +static int +EXTRACTOR_id3v2_discard_state_method (struct id3v2_state *state) { - struct id3v2_state *state = plugin->state; if (state != NULL) { if (state->mime != NULL) free (state->mime); free (state); } - plugin->state = NULL; + return 1; } static int @@ -266,24 +266,12 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, enum EXTRACTOR_MetaType type; unsigned char picture_type; - 
if (plugin == NULL || plugin->state == NULL) + if (plugin == NULL) return 1; - state = plugin->state; - file_position = plugin->position; - file_size = plugin->fsize; - size = plugin->map_size; - data = plugin->shm_ptr; - - if (plugin->seek_request < 0) - return 1; - if (file_position - plugin->seek_request > 0) - { - plugin->seek_request = -1; + state = EXTRACTOR_id3v2_init_state_method (); + if (state == NULL) return 1; - } - if (plugin->seek_request - file_position < size) - offset = plugin->seek_request - file_position; while (1) { @@ -291,7 +279,7 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, { case ID3V2_INVALID: plugin->seek_request = -1; - return 1; + return EXTRACTOR_id3v2_discard_state_method (state); case ID3V2_READING_HEADER: /* TODO: support id3v24 tags at the end of file. Here's a quote from id3 faq: * Q: Where is an ID3v2 tag located in an MP3 file? @@ -303,7 +291,8 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, * in the actual MPEG stream, on an MPEG frame boundry. Almost nobody does * this. * Parsing of such tags will not be completely correct, because we can't - * seek backwards. We will have to seek to file_size - chunk_size instead + * seek backwards. (OK, now we CAN seek backwards, but we still need to mind the + * chunk size). We will have to seek to file_size - chunk_size instead * (by the way, chunk size is theoretically unknown, LE is free to use any chunk * size, even though plugins often make assumptions about chunk size being large * enough to make one atomic read without seeking, if offset == 0) and search @@ -326,7 +315,12 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, * flag is not set, id3v2 parser must discard id3v1 data). * At the moment id3v1 and id3v2 are parsed separately, and update flag is ignored. 
*/ - if (file_position != 0 || size < 10 || (data[0] != 0x49) || (data[1] != 0x44) || (data[2] != 0x33) || ((data[3] != 0x02) && (data[3] != 0x03) && (data[3] != 0x04))/* || (data[4] != 0x00) minor verisons are backward-compatible*/) + if (10 != pl_read (plugin, &data, 10)) + { + state->state = ID3V2_INVALID; + break; + } + if ((data[0] != 0x49) || (data[1] != 0x44) || (data[2] != 0x33) || ((data[3] != 0x02) && (data[3] != 0x03) && (data[3] != 0x04))/* || (data[4] != 0x00) minor verisons are backward-compatible*/) { state->state = ID3V2_INVALID; break; @@ -353,12 +347,6 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } } state->tsize = (((data[6] & 0x7F) << 21) | ((data[7] & 0x7F) << 14) | ((data[8] & 0x7F) << 07) | ((data[9] & 0x7F) << 00)); - if (state->tsize + 10 > file_size) - { - state->state = ID3V2_INVALID; - break; - } - offset = 10; if (state->ver == 0x03 && state->extended_header) state->state = ID3V23_READING_EXTENDED_HEADER; else if (state->ver == 0x04 && state->extended_header) @@ -367,28 +355,17 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, state->state = ID3V2_READING_FRAME_HEADER; break; case ID3V23_READING_EXTENDED_HEADER: - if (offset + 9 >= size) - { - if (offset == 0) - { - state->state = ID3V2_INVALID; - break; - } - plugin->seek_request = file_position + offset; - return 0; + if (10 != pl_read (plugin, &data, 10)) + { + state->state = ID3V2_INVALID; + break; } if (state->ver == 0x03 && state->extended_header) { uint32_t padding, extended_header_size; - extended_header_size = (((data[offset]) << 24) | ((data[offset + 1]) << 16) | ((data[offset + 2]) << 8) | ((data[offset + 3]) << 0)); - padding = (((data[offset + 6]) << 24) | ((data[offset + 7]) << 16) | ((data[offset + 8]) << 8) | ((data[offset + 9]) << 0)); - if (data[offset + 4] == 0 && data[offset + 5] == 0) - /* Skip the CRC32 byte after extended header */ - offset += 1; - offset += 4 + extended_header_size; - if (padding < state->tsize) - 
state->tsize -= padding; - else + extended_header_size = (((data[0]) << 24) | ((data[1]) << 16) | ((data[2]) << 8) | ((data[3]) << 0)); + padding = (((data[6]) << 24) | ((data[7]) << 16) | ((data[8]) << 8) | ((data[9]) << 0)); + if (extended_header_size - 6 != pl_read (plugin, &data, extended_header_size - 6)) { state->state = ID3V2_INVALID; break; @@ -396,73 +373,75 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } break; case ID3V24_READING_EXTENDED_HEADER: - if (offset + 6 >= size) - { - if (offset == 0) - { - state->state = ID3V2_INVALID; - break; - } - plugin->seek_request = file_position + offset; - return 0; + if (4 != pl_read (plugin, &data, 4)) + { + state->state = ID3V2_INVALID; + break; } - if ( (state->ver == 0x04) && (state->extended_header)) + if ((state->ver == 0x04) && (state->extended_header)) { uint32_t extended_header_size; - extended_header_size = (((data[offset]) << 24) | - ((data[offset + 1]) << 16) | - ((data[offset + 2]) << 8) | - ((data[offset + 3]) << 0)); - offset += 4 + extended_header_size; + extended_header_size = (((data[0]) << 24) | + ((data[1]) << 16) | + ((data[2]) << 8) | + ((data[3]) << 0)); + if (extended_header_size != pl_read (plugin, &data, extended_header_size)) + { + state->state = ID3V2_INVALID; + break; + } } break; case ID3V2_READING_FRAME_HEADER: - if (file_position + offset > state->tsize || - ((state->ver == 0x02) && file_position + offset + 6 >= state->tsize) || - (((state->ver == 0x03) || (state->ver == 0x04))&& file_position + offset + 10 >= state->tsize)) + if (state->ver == 0x02) { - state->state = ID3V2_INVALID; - break; + if (6 != pl_read (plugin, &data, 6)) + { + state->state = ID3V2_INVALID; + break; + } } - if (((state->ver == 0x02) && (offset + 6 >= size)) || - (((state->ver == 0x03) || (state->ver == 0x04)) && (offset + 10 >= size))) + else if ((state->ver == 0x03) || (state->ver == 0x04)) { - plugin->seek_request = file_position + offset; - return 0; + if (10 != pl_read (plugin, 
&data, 10)) + { + state->state = ID3V2_INVALID; + break; + } } if (state->ver == 0x02) { - memcpy (state->id, &data[offset], 3); - state->csize = (data[offset + 3] << 16) + (data[offset + 4] << 8) + data[offset + 5]; - if ((file_position + offset + 6 + state->csize > file_size) || (state->csize > file_size) || (state->csize == 0)) + memcpy (state->id, &data[0], 3); + state->csize = (data[3] << 16) + (data[4] << 8) + data[5]; + if (state->csize == 0) { state->state = ID3V2_INVALID; break; } - offset += 6; state->frame_flags = 0; } else if ((state->ver == 0x03) || (state->ver == 0x04)) { - memcpy (state->id, &data[offset], 4); + memcpy (state->id, &data[0], 4); if (state->ver == 0x03) - state->csize = (data[offset + 4] << 24) + (data[offset + 5] << 16) + (data[offset + 6] << 8) + data[offset + 7]; + state->csize = (data[4] << 24) + (data[5] << 16) + (data[6] << 8) + data[7]; else if (state->ver == 0x04) - state->csize = ((data[offset + 4] & 0x7F) << 21) | ((data[offset + 5] & 0x7F) << 14) | ((data[offset + 6] & 0x7F) << 07) | ((data[offset + 7] & 0x7F) << 00); - if ((file_position + offset + 10 + state->csize > file_size) || (state->csize > file_size) || (state->csize == 0)) + state->csize = ((data[4] & 0x7F) << 21) | ((data[5] & 0x7F) << 14) | ((data[6] & 0x7F) << 07) | ((data[7] & 0x7F) << 00); + if (state->csize == 0) { state->state = ID3V2_INVALID; break; } - state->frame_flags = (data[offset + 8] << 8) + data[offset + 9]; + state->frame_flags = (data[8] << 8) + data[9]; if (state->ver == 0x03) { if (((state->frame_flags & 0x80) > 0) /* compressed, not yet supported */ || ((state->frame_flags & 0x40) > 0) /* encrypted, not supported */) { /* Skip to next frame header */ - offset += 10 + state->csize; + if (state->csize != pl_read (plugin, &data, state->csize)) + state->state = ID3V2_INVALID; break; } } @@ -473,70 +452,77 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, ((state->frame_flags & 0x02) > 0) /* unsynchronization, not supported 
*/) { /* Skip to next frame header */ - offset += 10 + state->csize; + if (state->csize != pl_read (plugin, &data, state->csize)) + state->state = ID3V2_INVALID; break; } if ((state->frame_flags & 0x01) > 0) { /* Skip data length indicator */ state->csize -= 4; - offset += 4; + if (4 != pl_read (plugin, &data, 4)) + { + state->state = ID3V2_INVALID; + break; + } } } - offset += 10; } state->ti = find_type ((const char *) state->id, (state->ver == 0x02) ? 3 : (((state->ver == 0x03) || (state->ver == 0x04)) ? 4 : 0)); if (state->ti == -1) { - offset += state->csize; + if (state->csize != pl_read (plugin, &data, state->csize)) + state->state = ID3V2_INVALID; break; } state->state = ID3V2_READING_FRAME; break; case ID3V2_READING_FRAME: - if (offset == 0 && state->csize > size) + if (0 > (offset = pl_get_pos (plugin))) { - /* frame size is larger than the size of one data chunk we get at a time */ - offset += state->csize; - state->state = ID3V2_READING_FRAME_HEADER; + state->state = ID3V2_INVALID; break; } - if (offset + state->csize > size) - { - plugin->seek_request = file_position + offset; - return 0; - } word = NULL; if (((state->ver == 0x03) && ((state->frame_flags & 0x20) > 0)) || ((state->ver == 0x04) && ((state->frame_flags & 0x40) > 0))) { /* "group" identifier, skip a byte */ - offset++; + if (1 != pl_read (plugin, &data, 1)) + { + state->state = ID3V2_INVALID; + break; + } state->csize--; } + if (state->csize != pl_read (plugin, &data, state->csize)) + { + state->state = ID3V2_INVALID; + break; + } switch (tmap[state->ti].fmt) { case T: - if (data[offset] == 0x00) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 1], + if (data[0] == 0x00) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[1], state->csize - 1, "ISO-8859-1"); - else if (data[offset] == 0x01) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 1], + else if (data[0] == 0x01) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) 
&data[1], state->csize - 1, "UCS-2"); - else if ((state->ver == 0x04) && (data[offset] == 0x02)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 1], + else if ((state->ver == 0x04) && (data[0] == 0x02)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[1], state->csize - 1, "UTF-16BE"); - else if ((state->ver == 0x04) && (data[offset] == 0x03)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 1], + else if ((state->ver == 0x04) && (data[0] == 0x03)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[1], state->csize - 1, "UTF-8"); else /* bad encoding byte, try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 1], + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[1], state->csize - 1, "ISO-8859-1"); break; case U: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset], + word = EXTRACTOR_common_convert_to_utf8 ((const char *) data, state->csize, "ISO-8859-1"); break; case UL: @@ -548,30 +534,30 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } /* find end of description */ off = 4; - while ((off < size) && (off < offset + state->csize) && (data[offset + off] != '\0')) + while ((off < size) && (off < state->csize) && (data[off] != '\0')) off++; - if ((off >= state->csize) || (data[offset + off] != '\0')) + if ((off >= state->csize) || (data[off] != '\0')) { /* malformed */ state->state = ID3V2_INVALID; break; } off++; - if (data[offset] == 0x00) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + if (data[0] == 0x00) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "ISO-8859-1"); - else if (data[offset] == 0x01) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if (data[0] == 0x01) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UCS-2"); - 
else if ((state->ver == 0x04) && (data[offset] == 0x02)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if ((state->ver == 0x04) && (data[0] == 0x02)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UTF-16BE"); - else if ((state->ver == 0x04) && (data[offset] == 0x03)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if ((state->ver == 0x04) && (data[0] == 0x03)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UTF-8"); else /* bad encoding byte, try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "ISO-8859-1"); break; case SL: @@ -581,21 +567,21 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, state->state = ID3V2_INVALID; break; } - if (data[offset] == 0x00) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 6], + if (data[0] == 0x00) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[6], state->csize - 6, "ISO-8859-1"); - else if (data[offset] == 0x01) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 6], + else if (data[0] == 0x01) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[6], state->csize - 6, "UCS-2"); - else if ((state->ver == 0x04) && (data[offset] == 0x02)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 6], + else if ((state->ver == 0x04) && (data[0] == 0x02)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[6], state->csize - 6, "UTF-16BE"); - else if ((state->ver == 0x04) && (data[offset] == 0x03)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 6], + else if ((state->ver == 0x04) && (data[0] == 0x03)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[6], state->csize 
- 6, "UTF-8"); else /* bad encoding byte, try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + 6], + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[6], state->csize - 6, "ISO-8859-1"); break; case L: @@ -607,9 +593,9 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } /* find end of description */ off = 4; - while ((off < size) && (off < offset + state->csize) && (data[offset + off] != '\0')) + while ((off < size) && (off < state->csize) && (data[off] != '\0')) off++; - if ((off >= state->csize) || (data[offset + off] != '\0')) + if ((off >= state->csize) || (data[off] != '\0')) { /* malformed */ state->state = ID3V2_INVALID; @@ -617,21 +603,21 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } off++; - if (data[offset] == 0x00) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + if (data[0] == 0x00) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "ISO-8859-1"); - else if (data[offset] == 0x01) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if (data[0] == 0x01) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UCS-2"); - else if ((state->ver == 0x04) && (data[offset] == 0x02)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if ((state->ver == 0x04) && (data[0] == 0x02)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UTF-1offBE"); - else if ((state->ver == 0x04) && (data[offset] == 0x03)) - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[offset + off], + else if ((state->ver == 0x04) && (data[0] == 0x03)) + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "UTF-8"); else /* bad encoding byte, try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const 
char *) &data[offset + off], + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[off], state->csize - off, "ISO-8859-1"); break; case I: @@ -650,38 +636,38 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, if (state->ver == 0x02) { off = 5; - picture_type = data[offset + 5]; + picture_type = data[4]; } else if ((state->ver == 0x03) || (state->ver == 0x04)) { off = 1; - while ((off < size) && (off < offset + state->csize) && (data[offset + off] != '\0') ) + while ((off < state->csize) && (data[off] != '\0')) off++; - if ((off >= state->csize) || (data[offset + off] != '\0')) + if ((off >= state->csize) || (data[off] != '\0')) { /* malformed */ state->state = ID3V2_INVALID; break; } state->mime = malloc (off); - memcpy (state->mime, &data[offset + 1], off - 1); + memcpy (state->mime, &data[1], off - 1); state->mime[off - 1] = '\0'; off += 1; - picture_type = data[offset]; + picture_type = data[off]; off += 1; - } - /* find end of description */ - while ((off < size) && (off < offset + state->csize) && (data[offset + off] != '\0')) + /* find end of mime type*/ + while ((off < state->csize) && (data[off] != '\0')) + off++; + if ((off >= state->csize) || (data[off] != '\0')) + { + free (state->mime); + state->mime = NULL; + /* malformed */ + state->state = ID3V2_INVALID; + break; + } off++; - if ((off >= state->csize) || (data[offset + off] != '\0')) - { - free (state->mime); - state->mime = NULL; - /* malformed */ - state->state = ID3V2_INVALID; - break; } - off++; switch (picture_type) { case 0x03: @@ -711,9 +697,9 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } if (state->ver == 0x02) { - if (0 == strncasecmp ("PNG", (const char *) &data[offset + 1], 3)) + if (0 == strncasecmp ("PNG", (const char *) &data[1], 3)) state->mime = strdup ("image/png"); - else if (0 == strncasecmp ("JPG", (const char *) &data[offset + 1], 3)) + else if (0 == strncasecmp ("JPG", (const char *) &data[1], 3)) state->mime = strdup 
("image/jpeg"); else state->mime = NULL; @@ -734,7 +720,7 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } else { - if (0 != proc (proc_cls, "id3v2", type, EXTRACTOR_METAFORMAT_BINARY, state->mime, (const char*) &data[offset + off], state->csize - off)) + if (0 != proc (proc_cls, "id3v2", type, EXTRACTOR_METAFORMAT_BINARY, state->mime, (const char*) &data[off], state->csize - off)) { if (state->mime != NULL) free (state->mime); @@ -760,7 +746,6 @@ EXTRACTOR_id3v2_extract_method (struct EXTRACTOR_PluginList *plugin, } if (word != NULL) free (word); - offset = offset + state->csize; state->state = ID3V2_READING_FRAME_HEADER; break; } diff --git a/src/plugins/mp3_extractor.c b/src/plugins/mp3_extractor.c @@ -169,13 +169,13 @@ enum MP3State MP3_READING_FRAME = 1, }; -void -EXTRACTOR_mp3_init_state_method (struct EXTRACTOR_PluginList *plugin) +static struct mp3_state * +EXTRACTOR_mp3_init_state_method () { struct mp3_state *state; - state = plugin->state = malloc (sizeof (struct mp3_state)); + state = malloc (sizeof (struct mp3_state)); if (state == NULL) - return; + return NULL; state->header = 0; state->sample_rate = 0; state->number_of_frames = 0; @@ -189,16 +189,17 @@ EXTRACTOR_mp3_init_state_method (struct EXTRACTOR_PluginList *plugin) state->avg_bps = 0; state->bitrate = 0; state->state = 0; + return state; } -void -EXTRACTOR_mp3_discard_state_method (struct EXTRACTOR_PluginList *plugin) +static int +EXTRACTOR_mp3_discard_state_method (struct mp3_state *state) { - if (plugin->state != NULL) + if (state != NULL) { - free (plugin->state); + free (state); } - plugin->state = NULL; + return 1; } static int @@ -247,14 +248,13 @@ EXTRACTOR_mp3_extract_method (struct EXTRACTOR_PluginList *plugin, EXTRACTOR_MetaDataProcessor proc, void *proc_cls) { - int64_t file_position; - int64_t file_size; - size_t offset = 0; - size_t size; + int64_t offset = 0; + int64_t round_offset; + int64_t read_result; + int64_t i; unsigned char *data; struct mp3_state 
*state; - size_t frames_found_in_this_round = 0; int start_anew = 0; char mpeg_ver = 0; @@ -267,24 +267,12 @@ EXTRACTOR_mp3_extract_method (struct EXTRACTOR_PluginList *plugin, int ch = 0; int frame_size; - if (plugin == NULL || plugin->state == NULL) + if (plugin == NULL) return 1; - state = plugin->state; - file_position = plugin->position; - file_size = plugin->fsize; - size = plugin->map_size; - data = plugin->shm_ptr; - - if (plugin->seek_request < 0) - return 1; - if (file_position - plugin->seek_request > 0) - { - plugin->seek_request = -1; + state = EXTRACTOR_mp3_init_state_method (); + if (state == NULL) return 1; - } - if (plugin->seek_request - file_position < size) - offset = plugin->seek_request - file_position; while (1) { @@ -292,22 +280,40 @@ EXTRACTOR_mp3_extract_method (struct EXTRACTOR_PluginList *plugin, { case MP3_LOOKING_FOR_FRAME: /* Look for a frame header */ - while (offset + sizeof (state->header) < size && (((*((uint32_t *) &data[offset])) & MPA_SYNC_MASK_MEM) != MPA_SYNC_MASK_MEM)) - offset += 1; - if (offset + sizeof (state->header) >= size) + round_offset = offset = pl_get_pos (plugin); + while (1) { - /* Alternative: (frames_found_in_this_round < (size / LARGEST_FRAME_SIZE / 2)) is to generous */ - if ((file_position == 0 && (state->number_of_valid_frames > 2) && ((double) state->number_of_valid_frames / (double) state->number_of_frames) < 0.8) || - file_position + offset + sizeof (state->header) >= file_size) + pl_seek (plugin, offset, SEEK_SET); + read_result = pl_read (plugin, &data, 1024*1024); + if (read_result < 4) { calculate_frame_statistics_and_maybe_report_it (plugin, state, proc, proc_cls); - return 1; + return EXTRACTOR_mp3_discard_state_method (state); + } + for (i = 0; i + 3 < read_result; i++) + if (((*((uint32_t *) &data[i])) & MPA_SYNC_MASK_MEM) == MPA_SYNC_MASK_MEM) + break; + if (i + 3 >= 1024*1024) + offset += read_result - 3; + else + break; + if (offset > round_offset + 31*1024*1024) + { + if 
(((state->number_of_valid_frames > 2) && ((double) state->number_of_valid_frames / (double) state->number_of_frames) < 0.8)) + { + calculate_frame_statistics_and_maybe_report_it (plugin, state, proc, proc_cls); + } + return EXTRACTOR_mp3_discard_state_method (state); } - plugin->seek_request = file_position + offset; - return 0; } - state->header = (data[offset] << 24) | (data[offset + 1] << 16) | - (data[offset + 2] << 8) | data[offset + 3]; + pl_seek (plugin, offset + i, SEEK_SET); + if (4 != pl_read (plugin, &data, 4)) + { + calculate_frame_statistics_and_maybe_report_it (plugin, state, proc, proc_cls); + return EXTRACTOR_mp3_discard_state_method (state); + } + state->header = (data[0] << 24) | (data[1] << 16) | + (data[2] << 8) | data[3]; if ((state->header & MPA_SYNC_MASK) == MPA_SYNC_MASK) { state->state = MP3_READING_FRAME; @@ -402,11 +408,10 @@ EXTRACTOR_mp3_extract_method (struct EXTRACTOR_PluginList *plugin, state->original_flag = original_flag; state->bitrate = bitrate; - frames_found_in_this_round += 1; state->number_of_valid_frames += 1; if (state->avg_bps / state->number_of_valid_frames != bitrate / 1000) state->vbr_flag = 1; - offset += frame_size; + pl_seek (plugin, frame_size - 4, SEEK_CUR); state->state = MP3_LOOKING_FOR_FRAME; break; }