libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 929bfb08deeab0db8345f144d03e388891695995
parent 375ead169d33210a0e3088f410f93bdcece8c99d
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 22 Jul 2012 22:01:56 +0000

-stuff

Diffstat:
M src/main/extractor.c | 24 ------------------------
M src/main/extractor_ipc.c | 38 ++++++++++++++++++++------------------
M src/main/extractor_ipc.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/main/extractor_ipc_gnu.c | 21 +++++++++++++++++++++
M src/main/extractor_plugin_main.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
M src/main/extractor_plugins.c | 18 ++----------------
M src/main/extractor_plugins.h | 36 +++++++-----------------------------
7 files changed, 281 insertions(+), 126 deletions(-)

diff --git a/src/main/extractor.c b/src/main/extractor.c
@@ -44,30 +44,6 @@
 
 
 /**
- * Client provided a memory buffer, analyze it. Creates a shm, copies
- * buffer contents into it. Does not support seeking (all data comes
- * in one [big] chunk.
- */
-#define OPMODE_MEMORY 1
-
-/**
- * Client provided a memory buffer or a file, which contains compressed data.
- * Creates a shm of limited size and repeatedly fills it with uncompressed
- * data. Never skips data (has to uncompress every byte, discards unwanted bytes),
- * can't efficiently seek backwards. Uses MESSAGE_UPDATED_SHM and MESSAGE_SEEK.
- */
-#define OPMODE_DECOMPRESS 2
-
-/**
- * Client provided a filename. Creates a file-backed shm (on W32) or just
- * communicates the file name to each plugin, and plugin opens its own file
- * descriptor of the file (POSIX). Each plugin maps different parts of the
- * file into its memory independently.
- */
-#define OPMODE_FILE 3
-
-
-/**
  * Writes 'size' bytes from 'buf' to 'fd', returns only when
  * writing is not possible, or when all 'size' bytes were written
  * (never does partial writes).
diff --git a/src/main/extractor_ipc.c b/src/main/extractor_ipc.c @@ -46,8 +46,8 @@ EXTRACTOR_IPC_process_reply_ (struct EXTRACTOR_PluginList *plugin, { const char *cdata = data; unsigned char code; - int64_t seek_position; - struct IpcHeader hdr; + struct SeekRequestMessage seek; + struct MetaMessage meta; const char *mime_type; const char *value; @@ -60,49 +60,51 @@ EXTRACTOR_IPC_process_reply_ (struct EXTRACTOR_PluginList *plugin, plugin->seek_request = -1; plugin->round_finished = 1; return 1; - case MESSAGE_SEEK: /* Seek */ - if (size < 1 + sizeof (int64_t)) + case MESSAGE_SEEK: /* Seek */ + if (size < sizeof (struct SeekRequestMessage)) { plugin->seek_request = -1; return 0; } - memcpy (&seek_position, &cdata[1], sizeof (int64_t)); - plugin->seek_request = seek_position; - return 1 + sizeof (int64_t); + memcpy (&seek, cdata, sizeof (seek)); + plugin->seek_request = seek.file_offset; + return sizeof (struct SeekRequestMessage); case MESSAGE_META: /* Meta */ - if (size < 1 + sizeof (hdr) ) + if (size < sizeof (struct MetaMessage)) { plugin->seek_request = -1; return 0; } - memcpy (&hdr, &cdata[1], sizeof (hdr)); + memcpy (&meta, cdata, sizeof (meta)); /* check hdr for sanity */ - if (hdr.data_len > MAX_META_DATA) + if (meta.value_size > MAX_META_DATA) return -1; /* not allowing more than MAX_META_DATA meta data */ - if (size < 1 + sizeof (hdr) + hdr.mime_len + hdr.data_len) + if (size < sizeof (meta) + meta.mime_length + meta.value_size) { plugin->seek_request = -1; return 0; } - if (0 == hdr.mime_len) + if (0 == meta.mime_length) { mime_type = NULL; } else { - mime_type = &cdata[1 + sizeof (hdr)]; - if ('\0' != mime_type[hdr.mime_len-1]) + mime_type = &cdata[sizeof (struct MetaMessage)]; + if ('\0' != mime_type[meta.mime_length - 1]) return -1; } - if (0 == hdr.data_len) + if (0 == meta.value_size) value = NULL; else - value = &cdata[1 + sizeof (hdr) + hdr.mime_len]; + value = &cdata[sizeof (struct MetaMessage) + meta.mime_length]; proc (proc_cls, plugin, - 
&hdr, + (enum EXTRACTOR_MetaType) meta.meta_type, + (enum EXTRACTOR_MetaFormat) meta.meta_format, + meta.value_size, mime_type, value); - return 1 + sizeof (hdr) + hdr.mime_len + hdr.data_len; + return sizeof (struct MetaMessage) + meta.mime_length + meta.value_size; default: return -1; } diff --git a/src/main/extractor_ipc.h b/src/main/extractor_ipc.h @@ -41,20 +41,120 @@ #define MAX_SHM_NAME 255 /** - * Sent from LE to a plugin to initialize it (open shm, - * reset position counters etc). + * Sent from LE to a plugin to initialize it (opens shm). */ -#define MESSAGE_INIT_STATE 0x01 +#define MESSAGE_INIT_STATE 0x00 + +/** + * IPC message send to plugin to initialize SHM. + */ +struct InitMessage +{ + /** + * Set to MESSAGE_INIT_STATE. + */ + unsigned char opcode; + + /** + * Always zero. + */ + unsigned char reserved; + + /** + * Name of the shared-memory name. + */ + uint32_t shm_name_length; + + /** + * Maximum size of the shm map. + */ + uint32_t shm_map_size; + + /* followed by name of the SHM */ +}; + + +/** + * Sent from LE to a plugin to tell it extracting + * can now start. The SHM will point to offset 0 + * of the file. + */ +#define MESSAGE_EXTRACT_START 0x01 + +/** + * IPC message send to plugin to start extracting. + */ +struct StartMessage +{ + /** + * Set to MESSAGE_EXTRACT_START. + */ + unsigned char opcode; + + /** + * Always zero. + */ + unsigned char reserved; + + /** + * Always zero. + */ + uint16_t reserved2; + + /** + * Number of bytes ready in SHM. + */ + uint32_t shm_ready_bytes; + + /** + * Overall size of the file. + */ + uint64_t file_size; + +}; /** * Sent from LE to a plugin to tell it that shm contents - * were updated. Only used for OPMODE_COMPRESS. + * were updated. */ #define MESSAGE_UPDATED_SHM 0x02 /** + * IPC message send to plugin to notify it about a change in the SHM. + */ +struct UpdateMessage +{ + /** + * Set to MESSAGE_UPDATED_SHM. + */ + unsigned char opcode; + + /** + * Always zero. 
+ */ + unsigned char reserved; + + /** + * Always zero. + */ + uint16_t reserved2; + + /** + * Number of bytes ready in SHM. + */ + uint32_t shm_ready_bytes; + + /** + * Overall size of the file. + */ + uint64_t file_size; + +}; + +/** * Sent from plugin to LE to tell LE that plugin is done * analyzing current file and will send no more data. + * No message format as this is only one byte. */ #define MESSAGE_DONE 0x03 @@ -65,58 +165,103 @@ #define MESSAGE_SEEK 0x04 /** - * Sent from plugin to LE to tell LE about metadata discovered. + * IPC message send to plugin to start extracting. */ -#define MESSAGE_META 0x05 +struct SeekRequestMessage +{ + /** + * Set to MESSAGE_SEEK. + */ + unsigned char opcode; -/** - * Sent from LE to plugin to make plugin discard its state (unmap - * and close shm). - */ -#define MESSAGE_DISCARD_STATE 0x06 + /** + * Always zero. + */ + unsigned char reserved; + /** + * Always zero. + */ + uint16_t reserved2; -/** - * Definition of an IPC communication channel with - * some plugin. - */ -struct EXTRACTOR_Channel; + /** + * Number of bytes requested for SHM. + */ + uint32_t requested_bytes; + + /** + * Requested offset. + */ + uint64_t file_offset; + +}; /** - * Definition of a shared memory area. + * Sent from plugin to LE to tell LE about metadata discovered. */ -struct EXTRACTOR_SharedMemory; - +#define MESSAGE_META 0x05 /** - * Header used for our IPC replies. A header - * with all fields being zero is used to indicate - * the end of the stream. + * Plugin to parent: metadata discovered */ -struct IpcHeader +struct MetaMessage { /** - * Type of the meta data. + * Set to MESSAGE_META. */ - enum EXTRACTOR_MetaType meta_type; + unsigned char opcode; /** - * Format of the meta data. + * Always zero. */ - enum EXTRACTOR_MetaFormat meta_format; + unsigned char reserved; /** - * Number of bytes of meta data (value) + * An 'enum EXTRACTOR_MetaFormat' in 16 bits. 
*/ - size_t data_len; - + uint16_t meta_format; + + /** + * An 'enum EXTRACTOR_MetaType' in 16 bits. + */ + uint16_t meta_type; + /** - * Length of the mime type string describing the meta data value's mime type, - * including 0-terminator, 0 for mime type of "NULL". + * Length of the mime type string. */ - size_t mime_len; + uint16_t mime_length; + + /** + * Size of the value. + */ + uint32_t value_size; + + /* followed by mime_length bytes of 0-terminated + mime-type (unless mime_length is 0) */ + + /* followed by value_size bytes of value */ + }; +/** + * Sent from LE to plugin to make plugin discard its state + * (extraction aborted by application). Only one byte. + * Plugin should get ready for next 'StartMessage' after this. + */ +#define MESSAGE_DISCARD_STATE 0x06 + + +/** + * Definition of an IPC communication channel with + * some plugin. + */ +struct EXTRACTOR_Channel; + +/** + * Definition of a shared memory area. + */ +struct EXTRACTOR_SharedMemory; + /** * Create a shared memory area. @@ -196,17 +341,22 @@ EXTRACTOR_IPC_channel_send_ (struct EXTRACTOR_Channel *channel, * Handler for a message from one of the plugins. 
* * @param cls closure - * @param short_libname library name of the channel sending the message - * @param msg header of the message from the plugin + * @param plugin plugin of the channel sending the message + * @param meta_type type of the meta data + * @param meta_format format of the meta data + * @param value_len number of bytes in 'value' * @param value 'data' send from the plugin * @param mime mime string send from the plugin */ typedef void (*EXTRACTOR_ChannelMessageProcessor) (void *cls, struct EXTRACTOR_PluginList *plugin, - const struct IpcHeader *msg, + enum EXTRACTOR_MetaType meta_type, + enum EXTRACTOR_MetaFormat meta_format, + size_t value_len, const void *value, const char *mime); + /** * Process a reply from channel (seek request, metadata and done message) * diff --git a/src/main/extractor_ipc_gnu.c b/src/main/extractor_ipc_gnu.c @@ -225,6 +225,8 @@ EXTRACTOR_IPC_channel_create_ (struct EXTRACTOR_PluginList *plugin, int p1[2]; int p2[2]; pid_t pid; + struct InitMessage *init; + size_t slen; if (NULL == (channel = malloc (sizeof (struct EXTRACTOR_Channel)))) return NULL; @@ -264,6 +266,25 @@ EXTRACTOR_IPC_channel_create_ (struct EXTRACTOR_PluginList *plugin, channel->cpipe_in = p1[1]; channel->cpipe_out = p2[0]; channel->cpid = pid; + slen = strlen (shm->shm_name) + 1; + if (NULL == (init = malloc (sizeof (struct InitMessage) + slen))) + { + EXTRACTOR_IPC_channel_destroy_ (channel); + return NULL; + } + init->opcode = MESSAGE_INIT_STATE; + init->reserved = 0; + init->shm_name_length = slen; + init->shm_map_size = shm->shm_size; + memcpy (&init[1], shm->shm_name, slen); + if (sizeof (init) != + EXTRACTOR_IPC_channel_send_ (channel, + init, + sizeof (init) + slen) ) + { + EXTRACTOR_IPC_channel_destroy_ (channel); + return NULL; + } return channel; } diff --git a/src/main/extractor_plugin_main.c b/src/main/extractor_plugin_main.c @@ -27,6 +27,7 @@ #include "plibc.h" #include "extractor.h" #include "extractor_datasource.h" +#include "extractor_ipc.h" 
#include "extractor_plugin_main.h" #include <dirent.h> #include <sys/types.h> @@ -493,6 +494,36 @@ process_requests (struct EXTRACTOR_PluginList *plugin, } +#ifndef WINDOWS +/** + * Open '/dev/null' and make the result the given + * file descriptor. + * + * @param target_fd desired FD to point to /dev/null + * @param flags open flags (O_RDONLY, O_WRONLY) + */ +static void +open_dev_null (int target_fd, + int flags) +{ + int fd; + + fd = open ("/dev/null", flags); + if (-1 == fd) + return; /* good luck */ + if (fd == target_fd) + return; /* already done */ + if (-1 == dup2 (fd, target_fd)) + { + (void) close (fd); + return; /* good luck */ + } + /* close original result from 'open' */ + (void) close (fd); +} +#endif + + /** * 'main' function of the child process. Loads the plugin, * sets up its in and out pipes, then runs the request serving function. @@ -508,16 +539,27 @@ EXTRACTOR_plugin_main_ (struct EXTRACTOR_PluginList *plugin, if (0 != EXTRACTOR_plugin_load_ (plugin)) { #if DEBUG - fprintf (stderr, "Plugin `%s' failed to load!\n", plugin->short_libname); + fprintf (stderr, "Plugin `%s' failed to load!\n", + plugin->short_libname); #endif return; } if ( (NULL != plugin->specials) && (NULL != strstr (plugin->specials, "close-stderr"))) - close (2); + { + (void) close (2); +#ifndef WINDOWS + open_dev_null (2, O_WRONLY); +#endif + } if ( (NULL != plugin->specials) && (NULL != strstr (plugin->specials, "close-stdout"))) - close (1); + { + (void) close (1); +#ifndef WINDOWS + open_dev_null (1, O_WRONLY); +#endif + } process_requests (plugin, in, out); } diff --git a/src/main/extractor_plugins.c b/src/main/extractor_plugins.c @@ -89,7 +89,7 @@ get_symbol_with_prefix (void *lib_handle, lt_dlerror()); } if (NULL != first_error) - free(first_error); + free (first_error); #endif } @@ -158,11 +158,11 @@ EXTRACTOR_plugin_load_ (struct EXTRACTOR_PluginList *plugin) "Loading `%s' plugin failed: %s\n", plugin->short_libname, "can't convert plugin name to local encoding"); 
+#endif free (plugin->libname); plugin->libname = NULL; plugin->flags = EXTRACTOR_OPTION_DISABLED; return -1; -#endif } plugin->libraryHandle = lt_dlopenadvise (llibname, advise); @@ -249,20 +249,6 @@ EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList *prev, result->plugin_options = strdup (options); else result->plugin_options = NULL; - /* This is kinda weird, but it allows us to not to call GetSystemInfo() - * or sysconf() every time we need allocation granularity - just once - * for each plugin. - * The only alternative is to keep it in a global variable... - */ -#if WINDOWS - { - SYSTEM_INFO si; - GetSystemInfo (&si); - result->allocation_granularity = si.dwAllocationGranularity; - } -#else - result->allocation_granularity = sysconf (_SC_PAGE_SIZE); -#endif return result; } diff --git a/src/main/extractor_plugins.h b/src/main/extractor_plugins.h @@ -34,8 +34,8 @@ /** * Linked list of extractor plugins. An application builds this list - * by telling libextractor to load various keyword-extraction - * plugins. Libraries can also be unloaded (removed from this list, + * by telling libextractor to load various meta data extraction + * plugins. Plugins can also be unloaded (removed from this list, * see EXTRACTOR_plugin_remove). */ struct EXTRACTOR_PluginList @@ -83,44 +83,22 @@ struct EXTRACTOR_PluginList struct EXTRACTOR_Channel *channel; /** - * Flags to control how the plugin is executed. - */ - enum EXTRACTOR_Options flags; - -#if WINDOWS - /** - * Page size. Mmap offset is a multiple of this number. - */ - DWORD allocation_granularity; -#else - /** - * Page size. Mmap offset is a multiple of this number. + * A position this plugin wants us to seek to. -1 if it's finished. + * Starts at 0. */ - long allocation_granularity; -#endif + int64_t seek_request; /** - * A position this plugin wants us to seek to. -1 if it's finished. - * Starts at 0; + * Flags to control how the plugin is executed. 
*/ - int64_t seek_request; + enum EXTRACTOR_Options flags; /** * Is this plugin finished extracting for this round? * 0: no, 1: yes */ int round_finished; - - /** - * Mode of operation. One of the OPMODE_* constants - */ - uint8_t operation_mode; - /** - * 1 if plugin is currently in a recursive process_requests() call, - * 0 otherwise - */ - int waiting_for_update; };