libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit e005be81997d31fdc7e18a079e97ebe3fffe370d
parent 34b4812d8902ae2bb8ca2340ad51955a50e40ec0
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon, 26 Mar 2012 16:20:17 +0000

-LRN: Divide-extractor.c.patch was the first thing I did, once I
realized that extractor.c is just too long.



Diffstat:
M src/main/Makefile.am | 2 ++
M src/main/extractor.c | 908 +------------------------------------------------------------------------------
2 files changed, 4 insertions(+), 906 deletions(-)

diff --git a/src/main/Makefile.am b/src/main/Makefile.am @@ -36,6 +36,8 @@ libextractor_la_CPPFLAGS = -DPLUGINDIR=\"@RPLUGINDIR@\" -DPLUGININSTDIR=\"${plug libextractor_la_SOURCES = \ extractor.c \ + extractor_plugpath.c \ + extractor_plugins.c \ extractor_metatypes.c \ extractor_print.c diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -38,6 +38,8 @@ #include <zlib.h> #endif +#include "extractor_plugpath.h" +#include "extractor_plugins.h" /** @@ -65,912 +67,6 @@ /** - * Linked list of extractor plugins. An application builds this list - * by telling libextractor to load various keyword-extraction - * plugins. Libraries can also be unloaded (removed from this list, - * see EXTRACTOR_plugin_remove). - */ -struct EXTRACTOR_PluginList -{ - /** - * This is a linked list. - */ - struct EXTRACTOR_PluginList *next; - - /** - * Pointer to the plugin (as returned by lt_dlopen). - */ - void * libraryHandle; - - /** - * Name of the library (i.e., 'libextractor_foo.so') - */ - char *libname; - - /** - * Name of the library (i.e., 'libextractor_foo.so') - */ - char *short_libname; - - /** - * Pointer to the function used for meta data extraction. - */ - EXTRACTOR_ExtractMethod extractMethod; - - /** - * Options for the plugin. - */ - char * plugin_options; - - /** - * Special options for the plugin - * (as returned by the plugin's "options" method; - * typically NULL). - */ - const char *specials; - - /** - * Flags to control how the plugin is executed. - */ - enum EXTRACTOR_Options flags; - - /** - * Process ID of the child process for this plugin. 0 for - * none. - */ -#ifndef WINDOWS - int cpid; -#else - HANDLE hProcess; -#endif - - /** - * Pipe used to send information about shared memory segments to - * the child process. NULL if not initialized. - */ - FILE *cpipe_in; - - /** - * Pipe used to read information about extracted meta data from - * the child process. -1 if not initialized. 
- */ - int cpipe_out; -}; - - -/** - * Remove a trailing '/bin' from in (if present). - */ -static char * -cut_bin(char * in) { - size_t p; - - if (in == NULL) - return NULL; - p = strlen(in); - if (p > 4) { - if ( (in[p-1] == '/') || - (in[p-1] == '\\') ) - in[--p] = '\0'; - if (0 == strcmp(&in[p-3], - "bin")) { - in[p-3] = '\0'; - p -= 3; - } - } - return in; -} - -#if LINUX -/** - * Try to determine path by reading /proc/PID/exe or - * /proc/PID/maps. - * - * Note that this may fail if LE is installed in one directory - * and the binary linking against it sits elsewhere. - */ -static char * -get_path_from_proc_exe() { - char fn[64]; - char line[1024]; - char dir[1024]; - char * lnk; - char * ret; - char * lestr; - ssize_t size; - FILE * f; - - snprintf(fn, - sizeof (fn), - "/proc/%u/maps", - getpid()); - f = FOPEN(fn, "r"); - if (f != NULL) { - while (NULL != fgets(line, 1024, f)) { - if ( (1 == sscanf(line, - "%*x-%*x %*c%*c%*c%*c %*x %*2x:%*2x %*u%*[ ]%s", - dir)) && - (NULL != (lestr = strstr(dir, - "libextractor")) ) ) { - lestr[0] = '\0'; - fclose(f); - return strdup(dir); - } - } - fclose(f); - } - snprintf(fn, - sizeof (fn), - "/proc/%u/exe", - getpid()); - lnk = malloc(1029); /* 1024 + 5 for "lib/" catenation */ - if (lnk == NULL) - return NULL; - size = readlink(fn, lnk, 1023); - if ( (size <= 0) || (size >= 1024) ) { - free(lnk); - return NULL; - } - lnk[size] = '\0'; - while ( (lnk[size] != '/') && - (size > 0) ) - size--; - if ( (size < 4) || - (lnk[size-4] != '/') ) { - /* not installed in "/bin/" -- binary path probably useless */ - free(lnk); - return NULL; - } - lnk[size] = '\0'; - lnk = cut_bin(lnk); - ret = realloc(lnk, strlen(lnk) + 5); - if (ret == NULL) - { - free (lnk); - return NULL; - } - strcat(ret, "lib/"); /* guess "lib/" as the library dir */ - return ret; -} -#endif - -#if WINDOWS -/** - * Try to determine path with win32-specific function - */ -static char * -get_path_from_module_filename() { - char * path; - char * ret; - char * 
idx; - - path = malloc(4103); /* 4096+nil+6 for "/lib/" catenation */ - if (path == NULL) - return NULL; - GetModuleFileName(NULL, path, 4096); - idx = path + strlen(path); - while ( (idx > path) && - (*idx != '\\') && - (*idx != '/') ) - idx--; - *idx = '\0'; - path = cut_bin(path); - ret = realloc(path, strlen(path) + 6); - if (ret == NULL) - { - free (path); - return NULL; - } - strcat(ret, "/lib/"); /* guess "lib/" as the library dir */ - return ret; -} -#endif - -#if DARWIN -static char * get_path_from_dyld_image() { - const char * path; - char * p, * s; - int i; - int c; - - p = NULL; - c = _dyld_image_count(); - for (i = 0; i < c; i++) { - if (_dyld_get_image_header(i) == &_mh_dylib_header) { - path = _dyld_get_image_name(i); - if (path != NULL && strlen(path) > 0) { - p = strdup(path); - if (p == NULL) - return NULL; - s = p + strlen(p); - while ( (s > p) && (*s != '/') ) - s--; - s++; - *s = '\0'; - } - break; - } - } - return p; -} -#endif - -/** - * This may also fail -- for example, if extract - * is not also installed. 
- */ -static char * -get_path_from_PATH() { - struct stat sbuf; - char * path; - char * pos; - char * end; - char * buf; - char * ret; - const char * p; - - p = getenv("PATH"); - if (p == NULL) - return NULL; - path = strdup(p); /* because we write on it */ - if (path == NULL) - return NULL; - buf = malloc(strlen(path) + 20); - if (buf == NULL) - { - free (path); - return NULL; - } - pos = path; - - while (NULL != (end = strchr(pos, ':'))) { - *end = '\0'; - sprintf(buf, "%s/%s", pos, "extract"); - if (0 == stat(buf, &sbuf)) { - pos = strdup(pos); - free(buf); - free(path); - if (pos == NULL) - return NULL; - pos = cut_bin(pos); - ret = realloc(pos, strlen(pos) + 5); - if (ret == NULL) - { - free (pos); - return NULL; - } - strcat(ret, "lib/"); - return ret; - } - pos = end + 1; - } - sprintf(buf, "%s/%s", pos, "extract"); - if (0 == stat(buf, &sbuf)) { - pos = strdup(pos); - free(buf); - free(path); - if (pos == NULL) - return NULL; - pos = cut_bin(pos); - ret = realloc(pos, strlen(pos) + 5); - if (ret == NULL) - { - free (pos); - return NULL; - } - strcat(ret, "lib/"); - return ret; - } - free(buf); - free(path); - return NULL; -} - - -/** - * Function to call on paths. - * - * @param cls closure - * @param path a directory path - */ -typedef void (*PathProcessor)(void *cls, - const char *path); - - -/** - * Create a filename by appending 'fname' to 'path'. 
- * - * @param path the base path - * @param fname the filename to append - * @return '$path/$fname' - */ -static char * -append_to_dir (const char *path, - const char *fname) -{ - char *ret; - size_t slen; - - slen = strlen (path); - if (slen == 0) - return NULL; - if (fname[0] == DIR_SEPARATOR) - fname++; - ret = malloc (slen + strlen(fname) + 2); - if (ret == NULL) - return NULL; -#ifdef MINGW - if (path[slen-1] == '\\') - sprintf (ret, - "%s%s", - path, - fname); - else - sprintf (ret, - "%s\\%s", - path, - fname); -#else - if (path[slen-1] == '/') - sprintf (ret, - "%s%s", - path, - fname); - else - sprintf (ret, - "%s/%s", - path, - fname); -#endif - return ret; -} - - -/** - * Iterate over all paths where we expect to find GNU libextractor - * plugins. - * - * @param pp function to call for each path - * @param pp_cls cls argument for pp. - */ -static void -get_installation_paths (PathProcessor pp, - void *pp_cls) -{ - const char *p; - char * path; - char * prefix; - char * d; - - prefix = NULL; - p = getenv("LIBEXTRACTOR_PREFIX"); - if (p != NULL) - { - d = strdup (p); - if (d == NULL) - return; - prefix = strtok (d, PATH_SEPARATOR_STR); - while (NULL != prefix) - { - pp (pp_cls, prefix); - prefix = strtok (NULL, PATH_SEPARATOR_STR); - } - free (d); - return; - } -#if LINUX - if (prefix == NULL) - prefix = get_path_from_proc_exe(); -#endif -#if WINDOWS - if (prefix == NULL) - prefix = get_path_from_module_filename(); -#endif -#if DARWIN - if (prefix == NULL) - prefix = get_path_from_dyld_image(); -#endif - if (prefix == NULL) - prefix = get_path_from_PATH(); - pp (pp_cls, PLUGININSTDIR); - if (prefix == NULL) - return; - path = append_to_dir (prefix, PLUGINDIR); - if (path != NULL) - { - if (0 != strcmp (path, - PLUGININSTDIR)) - pp (pp_cls, path); - free (path); - } - free (prefix); -} - - -struct SearchContext -{ - const char *short_name; - char *path; -}; - - -/** - * Load all plugins from the given directory. 
- * - * @param cls pointer to the "struct EXTRACTOR_PluginList*" to extend - * @param path path to a directory with plugins - */ -static void -find_plugin_in_path (void *cls, - const char *path) -{ - struct SearchContext *sc = cls; - DIR *dir; - struct dirent *ent; - const char *la; - const char *sym_name; - char *sym; - char *dot; - - if (sc->path != NULL) - return; - dir = OPENDIR (path); - if (NULL == dir) - return; - while (NULL != (ent = READDIR (dir))) - { - if (ent->d_name[0] == '.') - continue; - if ( (NULL != (la = strstr (ent->d_name, ".la"))) && - (la[3] == '\0') ) - continue; /* only load '.so' and '.dll' */ - sym_name = strstr (ent->d_name, "_"); - if (sym_name == NULL) - continue; - sym_name++; - sym = strdup (sym_name); - if (sym == NULL) - { - CLOSEDIR (dir); - return; - } - dot = strstr (sym, "."); - if (dot != NULL) - *dot = '\0'; - if (0 == strcmp (sym, sc->short_name)) - { - sc->path = append_to_dir (path, ent->d_name); - free (sym); - break; - } - free (sym); - } -#if DEBUG - if (sc->path == NULL) - fprintf (stderr, - "Failed to find plugin `%s' in `%s'\n", - sc->short_name, - path); -#endif - CLOSEDIR (dir); -} - - - -/** - * Given a short name of a library (i.e. "mime"), find - * the full path of the respective plugin. - */ -static char * -find_plugin (const char *short_name) -{ - struct SearchContext sc; - - sc.path = NULL; - sc.short_name = short_name; - get_installation_paths (&find_plugin_in_path, - &sc); - return sc.path; -} - - - -struct DefaultLoaderContext -{ - struct EXTRACTOR_PluginList *res; - enum EXTRACTOR_Options flags; -}; - - -/** - * Load all plugins from the given directory. 
- * - * @param cls pointer to the "struct EXTRACTOR_PluginList*" to extend - * @param path path to a directory with plugins - */ -static void -load_plugins_from_dir (void *cls, - const char *path) -{ - struct DefaultLoaderContext *dlc = cls; - DIR *dir; - struct dirent *ent; - const char *la; - const char *sym_name; - char *sym; - char *dot; - - dir = opendir (path); - if (NULL == dir) - return; - while (NULL != (ent = readdir (dir))) - { - if (ent->d_name[0] == '.') - continue; - if ( ( (NULL != (la = strstr (ent->d_name, ".la"))) && - (la[3] == '\0') ) || - ( (NULL != (la = strstr (ent->d_name, ".a"))) && - (la[2] == '\0')) ) - continue; /* only load '.so' and '.dll' */ - - sym_name = strstr (ent->d_name, "_"); - if (sym_name == NULL) - continue; - sym_name++; - sym = strdup (sym_name); - if (NULL == sym) - { - closedir (dir); - return; - } - dot = strstr (sym, "."); - if (dot != NULL) - *dot = '\0'; -#if DEBUG > 1 - fprintf (stderr, - "Adding default plugin `%s'\n", - sym); -#endif - dlc->res = EXTRACTOR_plugin_add (dlc->res, - sym, - NULL, - dlc->flags); - free (sym); - } - closedir (dir); -} - - -/** - * Load the default set of plugins. The default can be changed - * by setting the LIBEXTRACTOR_LIBRARIES environment variable. - * If it is set to "env", then this function will return - * EXTRACTOR_plugin_add_config (NULL, env, flags). Otherwise, - * it will load all of the installed plugins and return them. - * - * @param flags options for all of the plugins loaded - * @return the default set of plugins, NULL if no plugins were found - */ -struct EXTRACTOR_PluginList * -EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags) -{ - struct DefaultLoaderContext dlc; - char *env; - - env = getenv ("LIBEXTRACTOR_LIBRARIES"); - if (env != NULL) - return EXTRACTOR_plugin_add_config (NULL, env, flags); - dlc.res = NULL; - dlc.flags = flags; - get_installation_paths (&load_plugins_from_dir, - &dlc); - return dlc.res; -} - - -/** - * Try to resolve a plugin function. 
- * - * @param lib_handle library to search for the symbol - * @param prefix prefix to add - * @param sym_name base name for the symbol - * @param options set to special options requested by the plugin - * @return NULL on error, otherwise pointer to the symbol - */ -static void * -get_symbol_with_prefix(void *lib_handle, - const char *template, - const char *prefix, - const char **options) -{ - char *name; - void *symbol; - const char *sym_name; - char *sym; - char *dot; - const char *(*opt_fun)(void); - - if (NULL != options) *options = NULL; - sym_name = strstr (prefix, "_"); - if (sym_name == NULL) - return NULL; - sym_name++; - sym = strdup (sym_name); - if (sym == NULL) - return NULL; - dot = strstr (sym, "."); - if (dot != NULL) - *dot = '\0'; - name = malloc(strlen(sym) + strlen(template) + 1); - if (name == NULL) - { - free (sym); - return NULL; - } - sprintf(name, - template, - sym); - /* try without '_' first */ - symbol = lt_dlsym(lib_handle, name + 1); - if (symbol==NULL) - { - /* now try with the '_' */ -#if DEBUG - char *first_error = strdup (lt_dlerror()); -#endif - symbol = lt_dlsym(lib_handle, name); -#if DEBUG - if (NULL == symbol) - { - fprintf(stderr, - "Resolving symbol `%s' failed, " - "so I tried `%s', but that failed also. Errors are: " - "`%s' and `%s'.\n", - name+1, - name, - first_error == NULL ? "out of memory" : first_error, - lt_dlerror()); - } - if (first_error != NULL) - free(first_error); -#endif - } - - if ( (symbol != NULL) && - (NULL != options) ) - { - /* get special options */ - sprintf(name, - "_EXTRACTOR_%s_options", - sym); - /* try without '_' first */ - opt_fun = lt_dlsym(lib_handle, name + 1); - if (opt_fun == NULL) - opt_fun = lt_dlsym(lib_handle, name); - if (opt_fun != NULL) - *options = opt_fun (); - } - free (sym); - free(name); - - return symbol; -} - - -/** - * Load a plugin. 
- * - * @param plugin plugin to load - * @return 0 on success, -1 on error - */ -static int -plugin_load (struct EXTRACTOR_PluginList *plugin) -{ -#if WINDOWS - wchar_t wlibname[4097]; - char llibname[4097]; -#endif - lt_dladvise advise; - - if (plugin->libname == NULL) - plugin->libname = find_plugin (plugin->short_libname); - if (plugin->libname == NULL) - { -#if DEBUG - fprintf (stderr, - "Failed to find plugin `%s'\n", - plugin->short_libname); -#endif - plugin->flags = EXTRACTOR_OPTION_DISABLED; - return -1; - } - lt_dladvise_init (&advise); - lt_dladvise_ext (&advise); - lt_dladvise_local (&advise); -#if WINDOWS - wlibname[0] = L'\0'; - llibname[0] = '\0'; - if (MultiByteToWideChar (CP_UTF8, 0, plugin->libname, -1, wlibname, 4097) <= 0 - || WideCharToMultiByte (CP_ACP, 0, wlibname, -1, llibname, 4097, NULL, NULL) < 0) - { -#if DEBUG - fprintf (stderr, - "Loading `%s' plugin failed: %s\n", - plugin->short_libname, - "can't convert plugin name to local encoding"); - free (plugin->libname); - plugin->libname = NULL; - plugin->flags = EXTRACTOR_OPTION_DISABLED; - return -1; -#endif - } - plugin->libraryHandle = lt_dlopenadvise (llibname, - advise); -#else - plugin->libraryHandle = lt_dlopenadvise (plugin->libname, - advise); -#endif - lt_dladvise_destroy(&advise); - if (plugin->libraryHandle == NULL) - { -#if DEBUG - fprintf (stderr, - "Loading `%s' plugin failed: %s\n", - plugin->short_libname, - lt_dlerror ()); -#endif - free (plugin->libname); - plugin->libname = NULL; - plugin->flags = EXTRACTOR_OPTION_DISABLED; - return -1; - } - plugin->extractMethod = get_symbol_with_prefix (plugin->libraryHandle, - "_EXTRACTOR_%s_extract", - plugin->libname, - &plugin->specials); - if (plugin->extractMethod == NULL) - { -#if DEBUG - fprintf (stderr, - "Resolving `extract' method of plugin `%s' failed: %s\n", - plugin->short_libname, - lt_dlerror ()); -#endif - lt_dlclose (plugin->libraryHandle); - free (plugin->libname); - plugin->libname = NULL; - plugin->flags = 
EXTRACTOR_OPTION_DISABLED; - return -1; - } - return 0; -} - - - - -/** - * Add a library for keyword extraction. - * - * @param prev the previous list of libraries, may be NULL - * @param library the name of the library - * @param flags options to use - * @return the new list of libraries, equal to prev iff an error occured - */ -struct EXTRACTOR_PluginList * -EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev, - const char *library, - const char *options, - enum EXTRACTOR_Options flags) -{ - struct EXTRACTOR_PluginList *result; - char *libname; - - libname = find_plugin (library); - if (libname == NULL) - { - fprintf (stderr, - "Could not load `%s'\n", - library); - return prev; - } - result = calloc (1, sizeof (struct EXTRACTOR_PluginList)); - if (result == NULL) - return prev; - result->next = prev; - result->short_libname = strdup (library); - if (result->short_libname == NULL) - { - free (result); - return NULL; - } - result->libname = libname; - result->flags = flags; - if (NULL != options) - result->plugin_options = strdup (options); - else - result->plugin_options = NULL; - return result; -} - - -/** - * Load multiple libraries as specified by the user. - * - * @param config a string given by the user that defines which - * libraries should be loaded. Has the format - * "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*". - * For example, 'mp3:ogg.so' loads the - * mp3 and the ogg library. The '-' before the LIBRARYNAME - * indicates that the library should be removed from - * the library list. - * @param prev the previous list of libraries, may be NULL - * @param flags options to use - * @return the new list of libraries, equal to prev iff an error occured - * or if config was empty (or NULL). 
- */ -struct EXTRACTOR_PluginList * -EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * prev, - const char *config, - enum EXTRACTOR_Options flags) -{ - char *cpy; - size_t pos; - size_t last; - ssize_t lastconf; - size_t len; - - if (config == NULL) - return prev; - len = strlen(config); - cpy = strdup(config); - if (cpy == NULL) - return prev; - pos = 0; - last = 0; - lastconf = 0; - while (pos < len) - { - while ((cpy[pos] != ':') && (cpy[pos] != '\0') && - (cpy[pos] != '(')) - pos++; - if( cpy[pos] == '(' ) { - cpy[pos++] = '\0'; /* replace '(' by termination */ - lastconf = pos; /* start config from here, after (. */ - while ((cpy[pos] != '\0') && (cpy[pos] != ')')) - pos++; /* config until ) or EOS. */ - if( cpy[pos] == ')' ) { - cpy[pos++] = '\0'; /* write end of config here. */ - while ((cpy[pos] != ':') && (cpy[pos] != '\0')) - pos++; /* forward until real end of string found. */ - cpy[pos++] = '\0'; - } else { - cpy[pos++] = '\0'; /* end of string. */ - } - } else { - lastconf = -1; /* NULL config when no (). */ - cpy[pos++] = '\0'; /* replace ':' by termination */ - } - if (cpy[last] == '-') - { - last++; - prev = EXTRACTOR_plugin_remove (prev, - &cpy[last]); - } - else - { - prev = EXTRACTOR_plugin_add (prev, - &cpy[last], - (lastconf != -1) ? &cpy[lastconf] : NULL, - flags); - } - last = pos; - } - free (cpy); - return prev; -} - - -/** * Stop the child process of this plugin. */ static void