libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 66a6e66acc2ab995d0b50f98fe6bd2f9b3aa46ee
parent 8b969da6d45e3a9245320f676b00d87e3768b1a6
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 22 Jul 2012 21:21:54 +0000

-more hxing on new LE APIs

Diffstat:
M src/main/Makefile.am | 8 ++++----
M src/main/extractor.c | 47 -----------------------------------------------
M src/main/extractor_datasource.h | 2 ++
M src/main/extractor_ipc.c | 106 +++++++++++++++++++++++++++++++------------------------------------------------
M src/main/extractor_ipc.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/main/extractor_ipc_gnu.c | 45 ++++++++++++++++++++++++---------------------
M src/main/extractor_plugin_main.c | 1 +
M src/main/extractor_plugins.c | 7 ++++---
M src/main/extractor_plugins.h | 24 ++++++++++++++++++++++++
M src/main/extractor_plugpath.c | 21 ++++++++++++++-------
M src/main/extractor_plugpath.h | 22 ----------------------
11 files changed, 167 insertions(+), 173 deletions(-)

diff --git a/src/main/Makefile.am b/src/main/Makefile.am @@ -40,13 +40,13 @@ EXTRA_DIST = \ libextractor_la_CPPFLAGS = -DPLUGINDIR=\"@RPLUGINDIR@\" -DPLUGININSTDIR=\"${plugindir}\" $(AM_CPPFLAGS) libextractor_la_SOURCES = \ - extractor.c \ - $(EXTRACTOR_IPC) extractor_ipc.c extractor_ipc.h \ + extractor_metatypes.c \ + extractor_print.c \ extractor_plugpath.c extractor_plugpath.h \ extractor_plugins.c extractor_plugins.h \ + $(EXTRACTOR_IPC) extractor_ipc.c extractor_ipc.h \ extractor_plugin_main.c extractor_plugin_main.h \ - extractor_metatypes.c \ - extractor_print.c + extractor.c extract_SOURCES = \ extract.c \ diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -31,64 +31,17 @@ #include "extractor_plugpath.h" #include "extractor_plugins.h" - -/** - * How long do we allow an individual meta data object to be? - * Used to guard against (broken) plugns causing us to use - * excessive amounts of memory. - */ -#define MAX_META_DATA 32 * 1024 * 1024 - /** * Maximum length of a Mime-Type string. */ #define MAX_MIME_LEN 256 /** - * Maximum length of a shared memory object name - */ -#define MAX_SHM_NAME 255 - -/** * Set to 1 to get failure info, * 2 for actual debug info. */ #define DEBUG 1 -/** - * Sent from LE to a plugin to initialize it (open shm, - * reset position counters etc). - */ -#define MESSAGE_INIT_STATE 0x01 - -/** - * Sent from LE to a plugin to tell it that shm contents - * were updated. Only used for OPMODE_COMPRESS. - */ -#define MESSAGE_UPDATED_SHM 0x02 - -/** - * Sent from plugin to LE to tell LE that plugin is done - * analyzing current file and will send no more data. - */ -#define MESSAGE_DONE 0x03 - -/** - * Sent from plugin to LE to tell LE that plugin needs - * to read a different part of the source file. - */ -#define MESSAGE_SEEK 0x04 - -/** - * Sent from plugin to LE to tell LE about metadata discovered. - */ -#define MESSAGE_META 0x05 - -/** - * Sent from LE to plugin to make plugin discard its state (unmap - * and close shm). 
- */ -#define MESSAGE_DISCARD_STATE 0x06 /** * Client provided a memory buffer, analyze it. Creates a shm, copies diff --git a/src/main/extractor_datasource.h b/src/main/extractor_datasource.h @@ -25,6 +25,8 @@ #ifndef EXTRACTOR_DATASOURCE_H #define EXTRACTOR_DATASOURCE_H +#include "extractor.h" + /** * Handle to a datasource we can use for the plugins. */ diff --git a/src/main/extractor_ipc.c b/src/main/extractor_ipc.c @@ -23,11 +23,14 @@ * @author Christian Grothoff */ #include "platform.h" +#include "extractor_ipc.h" +#include "extractor_plugins.h" /** * Process a reply from channel (seek request, metadata and done message) * + * @param plugin plugin this communication is about * @param buf buffer with data from IPC channel * @param size number of bytes in buffer * @param proc metadata callback @@ -35,98 +38,71 @@ * @return number of bytes processed, -1 on error */ ssize_t -EXTRACTOR_IPC_process_reply_ (const void *data, +EXTRACTOR_IPC_process_reply_ (struct EXTRACTOR_PluginList *plugin, + const void *data, size_t size, EXTRACTOR_ChannelMessageProcessor proc, void *proc_cls) { - int read_result; + const char *cdata = data; unsigned char code; int64_t seek_position; struct IpcHeader hdr; - char *mime_type; - char *data; - int must_read = 1; + const char *mime_type; + const char *value; - while (must_read) + while (size > 0) { - read_result = plugin_read (plugin, &code, 1); - if (read_result < 1) - return -1; + code = (unsigned char) cdata[0]; switch (code) { case MESSAGE_DONE: /* Done */ plugin->seek_request = -1; - must_read = 0; - break; + plugin->round_finished = 1; + return 1; case MESSAGE_SEEK: /* Seek */ - read_result = plugin_read (plugin, - &seek_position, sizeof (int64_t)); - if (read_result < sizeof (int64_t)) - return -1; + if (size < 1 + sizeof (int64_t)) + { + plugin->seek_request = -1; + return 0; + } + memcpy (&seek_position, &cdata[1], sizeof (int64_t)); plugin->seek_request = seek_position; - must_read = 0; - break; + return 1 + sizeof (int64_t); 
case MESSAGE_META: /* Meta */ - read_result = plugin_read (plugin, - &hdr, sizeof (hdr)); - if (read_result < sizeof (hdr)) - return -1; - /* FIXME: check hdr for sanity */ + if (size < 1 + sizeof (hdr) ) + { + plugin->seek_request = -1; + return 0; + } + memcpy (&hdr, &cdata[1], sizeof (hdr)); + /* check hdr for sanity */ if (hdr.data_len > MAX_META_DATA) return -1; /* not allowing more than MAX_META_DATA meta data */ + if (size < 1 + sizeof (hdr) + hdr.mime_len + hdr.data_len) + { + plugin->seek_request = -1; + return 0; + } if (0 == hdr.mime_len) { mime_type = NULL; } else { - if (NULL == (mime_type = malloc (hdr.mime_len))) - return -1; - read_result = plugin_read (plugin, - mime_type, - hdr.mime_len); - if ( (read_result < hdr.mime_len) || - ('\0' != mime_type[hdr.mime_len-1]) ) - { - if (NULL != mime_type) - free (mime_type); - return -1; - } + mime_type = &cdata[1 + sizeof (hdr)]; + if ('\0' != mime_type[hdr.mime_len-1]) + return -1; } if (0 == hdr.data_len) - { - data = NULL; - } + value = NULL; else - { - if (NULL == (data = malloc (hdr.data_len))) - { - if (NULL != mime_type) - free (mime_type); - return -1; - } - read_result = plugin_read (plugin, - data, hdr.data_len); - if (read_result < hdr.data_len) - { - if (NULL != mime_type) - free (mime_type); - free (data); - return -1; - } - } - read_result = proc (proc_cls, - plugin->short_libname, - hdr.meta_type, hdr.meta_format, - mime_type, data, hdr.data_len); - if (NULL != mime_type) - free (mime_type); - if (NULL != data) - free (data); - if (0 != read_result) - return 1; - break; + value = &cdata[1 + sizeof (hdr) + hdr.mime_len]; + proc (proc_cls, + plugin, + &hdr, + mime_type, value); + return 1 + sizeof (hdr) + hdr.mime_len + hdr.data_len; default: return -1; } diff --git a/src/main/extractor_ipc.h b/src/main/extractor_ipc.h @@ -25,6 +25,53 @@ #ifndef EXTRACTOR_IPC_H #define EXTRACTOR_IPC_H +#include "extractor_datasource.h" + + +/** + * How long do we allow an individual meta data object to be? 
+ * Used to guard against (broken) plugns causing us to use + * excessive amounts of memory. + */ +#define MAX_META_DATA 32 * 1024 * 1024 + + +/** + * Sent from LE to a plugin to initialize it (open shm, + * reset position counters etc). + */ +#define MESSAGE_INIT_STATE 0x01 + +/** + * Sent from LE to a plugin to tell it that shm contents + * were updated. Only used for OPMODE_COMPRESS. + */ +#define MESSAGE_UPDATED_SHM 0x02 + +/** + * Sent from plugin to LE to tell LE that plugin is done + * analyzing current file and will send no more data. + */ +#define MESSAGE_DONE 0x03 + +/** + * Sent from plugin to LE to tell LE that plugin needs + * to read a different part of the source file. + */ +#define MESSAGE_SEEK 0x04 + +/** + * Sent from plugin to LE to tell LE about metadata discovered. + */ +#define MESSAGE_META 0x05 + +/** + * Sent from LE to plugin to make plugin discard its state (unmap + * and close shm). + */ +#define MESSAGE_DISCARD_STATE 0x06 + + /** * Definition of an IPC communication channel with * some plugin. @@ -107,12 +154,12 @@ EXTRACTOR_IPC_shared_memory_set_ (struct EXTRACTOR_SharedMemory *shm, * Create a channel to communicate with a process wrapping * the plugin of the given name. Starts the process as well. 
* - * @param short_libname name of the plugin + * @param plugin the plugin * @param shm memory to share with the process * @return NULL on error, otherwise IPC channel */ struct EXTRACTOR_Channel * -EXTRACTOR_IPC_channel_create_ (const char *short_libname, +EXTRACTOR_IPC_channel_create_ (struct EXTRACTOR_PluginList *plugin, struct EXTRACTOR_SharedMemory *shm); @@ -151,7 +198,7 @@ EXTRACTOR_IPC_channel_send_ (struct EXTRACTOR_Channel *channel, * @param mime mime string send from the plugin */ typedef void (*EXTRACTOR_ChannelMessageProcessor) (void *cls, - const char *short_libname, + struct EXTRACTOR_PluginList *plugin, const struct IpcHeader *msg, const void *value, const char *mime); @@ -159,6 +206,7 @@ typedef void (*EXTRACTOR_ChannelMessageProcessor) (void *cls, /** * Process a reply from channel (seek request, metadata and done message) * + * @param plugin plugin this communication is about * @param buf buffer with data from IPC channel * @param size number of bytes in buffer * @param proc metadata callback @@ -166,7 +214,8 @@ typedef void (*EXTRACTOR_ChannelMessageProcessor) (void *cls, * @return number of bytes processed, -1 on error */ ssize_t -EXTRACTOR_IPC_process_reply_ (const void *data, +EXTRACTOR_IPC_process_reply_ (struct EXTRACTOR_PluginList *plugin, + const void *data, size_t size, EXTRACTOR_ChannelMessageProcessor proc, void *proc_cls); diff --git a/src/main/extractor_ipc_gnu.c b/src/main/extractor_ipc_gnu.c @@ -26,6 +26,7 @@ #include "plibc.h" #include "extractor.h" #include "extractor_datasource.h" +#include "extractor_plugin_main.h" #include "extractor_ipc.h" #include <dirent.h> #include <sys/types.h> @@ -33,12 +34,12 @@ #include <sys/shm.h> #include <signal.h> + /** - * Size of the channel buffer; determines largest IPC message that - * is going to be allowed. FIXME: we might want to grow this - * buffer dynamically instead... 
+ * Maximum length of a shared memory object name */ -#define CHANNEL_BUFFER_SIZE (1024 * 256) +#define MAX_SHM_NAME 255 + /** * A shared memory resource (often shared with several @@ -64,7 +65,7 @@ struct EXTRACTOR_SharedMemory /** * POSIX id of the shm into which data is uncompressed */ - int shm; + int shm_id; /** * Name of the shm @@ -83,8 +84,10 @@ struct EXTRACTOR_Channel /** * Buffer for reading data from the plugin. + * FIXME: we might want to grow this + * buffer dynamically instead of always using 32 MB! */ - char data[CHANNEL_BUFFER_SIZE]; + char data[MAX_META_DATA]; /** * Memory segment shared with this process. @@ -92,9 +95,9 @@ struct EXTRACTOR_Channel struct EXTRACTOR_SharedMemory *shm; /** - * Name of the plugin to use for this channel. + * The plugin this channel is to communicate with. */ - const char *short_libname; + struct EXTRACTOR_PluginList *plugin; /** * Pipe used to communicate information to the plugin child process. @@ -179,8 +182,8 @@ EXTRACTOR_IPC_shared_memory_create_ (size_t size) void EXTRACTOR_IPC_shared_memory_destroy_ (struct EXTRACTOR_SharedMemory *shm) { - munmap (shm->shm_ptr, shm->map_size); - (void) close (plugin->shm_id); + munmap (shm->shm_ptr, shm->shm_size); + (void) close (shm->shm_id); (void) shm_unlink (shm->shm_name); free (shm); } @@ -204,8 +207,8 @@ EXTRACTOR_IPC_shared_memory_set_ (struct EXTRACTOR_SharedMemory *shm, if (-1 == EXTRACTOR_datasource_seek_ (ds, off, SEEK_SET)) return -1; - if (size > shm->map_size) - size = shm->map_size; + if (size > shm->shm_size) + size = shm->shm_size; return EXTRACTOR_datasource_read_ (ds, shm->shm_ptr, size); @@ -216,24 +219,23 @@ EXTRACTOR_IPC_shared_memory_set_ (struct EXTRACTOR_SharedMemory *shm, * Create a channel to communicate with a process wrapping * the plugin of the given name. Starts the process as well. 
* - * @param short_libname name of the plugin + * @param plugin the plugin * @param shm memory to share with the process * @return NULL on error, otherwise IPC channel */ struct EXTRACTOR_Channel * -EXTRACTOR_IPC_channel_create_ (const char *short_libname, +EXTRACTOR_IPC_channel_create_ (struct EXTRACTOR_PluginList *plugin, struct EXTRACTOR_SharedMemory *shm) { struct EXTRACTOR_Channel *channel; int p1[2]; int p2[2]; pid_t pid; - int status; if (NULL == (channel = malloc (sizeof (struct EXTRACTOR_Channel)))) return NULL; channel->shm = shm; - channel->short_libname = short_libname; + channel->plugin = plugin; if (0 != pipe (p1)) { free (channel); @@ -244,7 +246,7 @@ EXTRACTOR_IPC_channel_create_ (const char *short_libname, (void) close (p1[0]); (void) close (p1[1]); free (channel); - return; + return NULL; } pid = fork (); if (pid == -1) @@ -260,7 +262,7 @@ EXTRACTOR_IPC_channel_create_ (const char *short_libname, { (void) close (p1[1]); (void) close (p2[0]); - EXTRACTOR_plugin_main_ (short_libname, p1[0], p2[1]); + EXTRACTOR_plugin_main_ (plugin, p1[0], p2[1]); _exit (0); } (void) close (p1[0]); @@ -376,12 +378,13 @@ EXTRACTOR_IPC_channel_recv_ (struct EXTRACTOR_Channel **channels, continue; if ( (-1 == (iret = read (channel->cpipe_out, &channel->data[channel->size], - CHANNEL_BUFFER_SIZE - channel->size)) ) || - (ret = EXTRACTOR_IPC_process_reply_ (channel->data, + MAX_META_DATA - channel->size)) ) || + (ret = EXTRACTOR_IPC_process_reply_ (channel->plugin, + channel->data, channel->size + iret, proc, proc_cls)) ) { - EXTRACTOR_IPC_channel_destroy (channel); + EXTRACTOR_IPC_channel_destroy_ (channel); channels[i] = NULL; } else diff --git a/src/main/extractor_plugin_main.c b/src/main/extractor_plugin_main.c @@ -27,6 +27,7 @@ #include "plibc.h" #include "extractor.h" #include "extractor_datasource.h" +#include "extractor_plugin_main.h" #include <dirent.h> #include <sys/types.h> #include <sys/wait.h> diff --git a/src/main/extractor_plugins.c 
b/src/main/extractor_plugins.c @@ -24,6 +24,7 @@ */ #include "extractor_plugins.h" #include "extractor_plugpath.h" +#include "extractor_ipc.h" /** @@ -227,7 +228,7 @@ EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList *prev, for (pos = prev; NULL != pos; pos = pos->next) if (0 == strcmp (pos->short_libname, library)) return prev; /* no change, library already loaded */ - if (NULL == (libname = find_plugin (library))) + if (NULL == (libname = EXTRACTOR_find_plugin_ (library))) { fprintf (stderr, "Could not load `%s'\n", @@ -391,8 +392,8 @@ EXTRACTOR_plugin_remove (struct EXTRACTOR_PluginList * prev, first = pos->next; else prev->next = pos->next; - /* found */ - stop_process (pos); + if (NULL != pos->channel) + EXTRACTOR_IPC_channel_destroy_ (pos->channel); free (pos->short_libname); free (pos->libname); free (pos->plugin_options); diff --git a/src/main/extractor_plugins.h b/src/main/extractor_plugins.h @@ -31,6 +31,7 @@ #include <signal.h> #include <ltdl.h> + /** * Linked list of extractor plugins. An application builds this list * by telling libextractor to load various keyword-extraction @@ -77,10 +78,27 @@ struct EXTRACTOR_PluginList const char *specials; /** + * Channel to communicate with out-of-process plugin. + */ + struct EXTRACTOR_Channel *channel; + + /** * Flags to control how the plugin is executed. */ enum EXTRACTOR_Options flags; +#if WINDOWS + /** + * Page size. Mmap offset is a multiple of this number. + */ + DWORD allocation_granularity; +#else + /** + * Page size. Mmap offset is a multiple of this number. + */ + long allocation_granularity; +#endif + /** * A position this plugin wants us to seek to. -1 if it's finished. * Starts at 0; @@ -88,6 +106,12 @@ struct EXTRACTOR_PluginList int64_t seek_request; /** + * Is this plugin finished extracting for this round? + * 0: no, 1: yes + */ + int round_finished; + + /** * Mode of operation. 
One of the OPMODE_* constants */ uint8_t operation_mode; diff --git a/src/main/extractor_plugpath.c b/src/main/extractor_plugpath.c @@ -28,15 +28,22 @@ #include "extractor.h" #include <dirent.h> #include <sys/types.h> -#ifndef WINDOWS -#include <sys/wait.h> -#include <sys/shm.h> -#endif #include <signal.h> #include <ltdl.h> #include "extractor_plugpath.h" + +/** + * Function to call on paths. + * + * @param cls closure + * @param path a directory path + */ +typedef void (*EXTRACTOR_PathProcessor) (void *cls, + const char *path); + + /** * Remove a trailing '/bin/' from 'in' (if present). * @@ -392,9 +399,9 @@ append_to_dir (const char *path, * @param pp function to call for each path * @param pp_cls cls argument for pp. */ -void -EXTRACTOR_get_installation_paths_ (EXTRACTOR_PathProcessor pp, - void *pp_cls) +static void +get_installation_paths (EXTRACTOR_PathProcessor pp, + void *pp_cls) { const char *p; char *path; diff --git a/src/main/extractor_plugpath.h b/src/main/extractor_plugpath.h @@ -26,28 +26,6 @@ #define EXTRACTOR_PLUGPATH_H /** - * Function to call on paths. - * - * @param cls closure - * @param path a directory path - */ -typedef void (*EXTRACTOR_PathProcessor) (void *cls, - const char *path); - - -/** - * Iterate over all paths where we expect to find GNU libextractor - * plugins. - * - * @param pp function to call for each path - * @param pp_cls cls argument for pp. - */ -void -EXTRACTOR_get_installation_paths_ (EXTRACTOR_PathProcessor pp, - void *pp_cls); - - -/** * Given a short name of a library (i.e. "mime"), find * the full path of the respective plugin. */