libextractor-python

GNU libextractor
Log | Files | Refs | README | LICENSE

commit 219c90a9959ece246d316a950ab13268a4512e0f
parent 6a35a271ece0ef7c03affb1d1857c55485f161df
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon,  4 Jul 2005 16:01:35 +0000

py

Diffstat:
Mbootstrap | 3---
Mconfigure.ac | 9++++++---
Aextractor.h | 347+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlibextractor_python.c | 31++++++++++++++++++++-----------
Mlibextractor_python_setup.py | 6+++---
Am4/ac_python_devel.m4 | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 430 insertions(+), 20 deletions(-)

diff --git a/bootstrap b/bootstrap @@ -1,5 +1,2 @@ #!/bin/sh autoreconf -f -i -cd libltdl -autoreconf -f -i -cd .. diff --git a/configure.ac b/configure.ac @@ -20,7 +20,7 @@ fi # test for libextractor extractor=0 -AC_MSG_CHECKING(for libextractor) +AC_MSG_CHECKING([for libextractor]) AC_ARG_WITH(extractor, [ --with-extractor=PFX Base of libextractor installation], [AC_MSG_RESULT([$with_extractor]) @@ -33,8 +33,8 @@ AC_ARG_WITH(extractor, extractor=1)) ;; *) - LDFLAGS="-L$with_extractor/lib $LDFLAGS" - CPPFLAGS="-I$with_extractor/include $CPPFLAGS" + LIBDIR="-L$with_extractor/lib $LDFLAGS" + INCLUDEDIR="$with_extractor/include $CPPFLAGS" AC_CHECK_HEADERS(extractor.h, AC_CHECK_LIB([extractor], [EXTRACTOR_loadDefaultLibraries], EXT_LIB_PATH="-L$with_extractor/lib $EXT_LIB_PATH" @@ -51,6 +51,9 @@ then AC_MSG_ERROR([libextractor-python requires libextractor]) fi +AC_SUBST(INCLUDEDIR) +AC_SUBST(LDFLAGS) +AC_SUBST(LIBDIR) AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/extractor.h b/extractor.h @@ -0,0 +1,347 @@ +/* + This file is part of libextractor. + (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +#ifndef EXTRACTOR_H +#define EXTRACTOR_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * 0.2.6-1 => 0x00020601 + * 4.5.2-0 => 0x04050200 + */ +#define EXTRACTOR_VERSION 0x00050002 + +#include <stdio.h> + +/* ignore the 'type' of the keyword when eliminating duplicates */ +#define EXTRACTOR_DUPLICATES_TYPELESS 1 +/* remove type 'UNKNOWN' if there is a duplicate keyword of + known type, even if usually different types should be + preserved */ +#define EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN 2 + +#define EXTRACTOR_DEFAULT_LIBRARIES EXTRACTOR_getDefaultLibraries() + +const char * EXTRACTOR_getDefaultLibraries(void); + +/** + * Enumeration defining various sources of keywords. + * See also + * http://dublincore.org/documents/1998/09/dces/ + */ +typedef enum { + EXTRACTOR_UNKNOWN = 0, + EXTRACTOR_FILENAME = 1, + EXTRACTOR_MIMETYPE = 2, + EXTRACTOR_TITLE = 3, + EXTRACTOR_AUTHOR = 4, + EXTRACTOR_ARTIST = 5, + EXTRACTOR_DESCRIPTION = 6, + EXTRACTOR_COMMENT = 7, + EXTRACTOR_DATE = 8, + EXTRACTOR_PUBLISHER = 9, + EXTRACTOR_LANGUAGE = 10, + EXTRACTOR_ALBUM = 11, + EXTRACTOR_GENRE = 12, + EXTRACTOR_LOCATION = 13, + EXTRACTOR_VERSIONNUMBER = 14, + EXTRACTOR_ORGANIZATION = 15, + EXTRACTOR_COPYRIGHT = 16, + EXTRACTOR_SUBJECT = 17, + EXTRACTOR_KEYWORDS = 18, + EXTRACTOR_CONTRIBUTOR = 19, + EXTRACTOR_RESOURCE_TYPE = 20, + EXTRACTOR_FORMAT = 21, + EXTRACTOR_RESOURCE_IDENTIFIER = 22, + EXTRACTOR_SOURCE = 23, + EXTRACTOR_RELATION = 24, + EXTRACTOR_COVERAGE = 25, + EXTRACTOR_SOFTWARE = 26, + EXTRACTOR_DISCLAIMER = 27, + EXTRACTOR_WARNING = 28, + EXTRACTOR_TRANSLATED = 29, + EXTRACTOR_CREATION_DATE = 30, + EXTRACTOR_MODIFICATION_DATE = 31, + EXTRACTOR_CREATOR = 32, + EXTRACTOR_PRODUCER = 33, + EXTRACTOR_PAGE_COUNT = 34, + EXTRACTOR_PAGE_ORIENTATION = 35, + EXTRACTOR_PAPER_SIZE = 36, + EXTRACTOR_USED_FONTS = 37, + EXTRACTOR_PAGE_ORDER = 38, + EXTRACTOR_CREATED_FOR = 39, + EXTRACTOR_MAGNIFICATION = 40, + EXTRACTOR_RELEASE = 41, + EXTRACTOR_GROUP = 42, + EXTRACTOR_SIZE = 43, + EXTRACTOR_SUMMARY = 44, + EXTRACTOR_PACKAGER = 45, + EXTRACTOR_VENDOR = 46, + EXTRACTOR_LICENSE = 47, + EXTRACTOR_DISTRIBUTION = 48, + EXTRACTOR_BUILDHOST = 49, + EXTRACTOR_OS = 50, + EXTRACTOR_DEPENDENCY = 51, + EXTRACTOR_HASH_MD4 = 52, + EXTRACTOR_HASH_MD5 = 53, + EXTRACTOR_HASH_SHA0 = 54, + EXTRACTOR_HASH_SHA1 = 55, + EXTRACTOR_HASH_RMD160 = 56, + EXTRACTOR_RESOLUTION = 57, + EXTRACTOR_CATEGORY = 58, + EXTRACTOR_BOOKTITLE = 59, + EXTRACTOR_PRIORITY = 60, + EXTRACTOR_CONFLICTS = 61, + EXTRACTOR_REPLACES = 62, + EXTRACTOR_PROVIDES = 63, + EXTRACTOR_CONDUCTOR = 64, + EXTRACTOR_INTERPRET = 65, + EXTRACTOR_OWNER = 66, + EXTRACTOR_LYRICS = 67, + EXTRACTOR_MEDIA_TYPE = 68, + EXTRACTOR_CONTACT = 69, + EXTRACTOR_THUMBNAIL_DATA = 70, + EXTRACTOR_PUBLICATION_DATE = 71, + EXTRACTOR_CAMERA_MAKE = 72, + EXTRACTOR_CAMERA_MODEL = 73, + EXTRACTOR_EXPOSURE = 74, + EXTRACTOR_APERTURE = 75, + EXTRACTOR_EXPOSURE_BIAS = 76, + EXTRACTOR_FLASH = 77, + EXTRACTOR_FLASH_BIAS = 78, + EXTRACTOR_FOCAL_LENGTH = 79, + EXTRACTOR_FOCAL_LENGTH_35MM = 80, + EXTRACTOR_ISO_SPEED = 81, + EXTRACTOR_EXPOSURE_MODE = 82, + EXTRACTOR_METERING_MODE = 83, + EXTRACTOR_MACRO_MODE = 84, + EXTRACTOR_IMAGE_QUALITY = 85, + EXTRACTOR_WHITE_BALANCE = 86, + EXTRACTOR_FILESIZE = 87, + EXTRACTOR_ORIENTATION = 88, +} EXTRACTOR_KeywordType; + +/** + * A linked list of keywords. This structure is passed around + * in libExtractor and is typically the result of any keyword + * extraction operation. + * <p> + * Each entry in the keyword list consists of a string (the + * keyword) and the keyword type (of type KeywordType) + * describing how/from where the keyword was obtained. + */ +typedef struct EXTRACTOR_Keywords { + /* the keyword that was found */ + char * keyword; + /* the type of the keyword (classification) */ + EXTRACTOR_KeywordType keywordType; + /* the next entry in the list */ + struct EXTRACTOR_Keywords * next; +} EXTRACTOR_KeywordList; + +/** + * Signature of the extract method that each plugin + * must provide. + */ +typedef EXTRACTOR_KeywordList * +(*ExtractMethod)(const char * filename, + char * data, + size_t filesize, + EXTRACTOR_KeywordList * next, + const char * options); + +/** + * Linked list of extractor helper-libraries. An application + * builds this list by telling libextractor to load various + * keyword-extraction libraries. Libraries can also be unloaded + * (removed from this list, see removeLibrary). + * <p> + * Client code should never be concerned with the internals of + * this struct. + */ +typedef struct EXTRACTOR_Extractor { + void * libraryHandle; + char * libname; + ExtractMethod extractMethod; + struct EXTRACTOR_Extractor * next; + char * options; +} EXTRACTOR_ExtractorList; + +/** + * Load the default set of libraries. + * @return the default set of libraries. + */ +EXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries(void); + +/** + * Get the textual name of the keyword. + * @return NULL if the type is not known + */ +const char * +EXTRACTOR_getKeywordTypeAsString(const EXTRACTOR_KeywordType type); + +/** + * Return the highest type number, exclusive as in [0,highest). + */ +EXTRACTOR_KeywordType +EXTRACTOR_getHighestKeywordTypeNumber(void); + +/** + * Load multiple libraries as specified by the user. + * @param config a string given by the user that defines which + * libraries should be loaded. Has the format + * "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*". + * For example, + * libextractor_mp3.so:libextractor_ogg.so loads the + * mp3 and the ogg library. The '-' before the LIBRARYNAME + * indicates that the library should be added to the end + * of the library list (addLibraryLast). + * @param prev the previous list of libraries, may be NULL + * @return the new list of libraries, equal to prev iff an error occured + * or if config was empty (or NULL). + */ +EXTRACTOR_ExtractorList * +EXTRACTOR_loadConfigLibraries(EXTRACTOR_ExtractorList * prev, + const char * config); + +/** + * Add a library for keyword extraction. + * @param prev the previous list of libraries, may be NULL + * @param library the name of the library + * @return the new list of libraries, equal to prev iff an error occured + */ +EXTRACTOR_ExtractorList * +EXTRACTOR_addLibrary(EXTRACTOR_ExtractorList * prev, + const char * library); + +/** + * Add a library for keyword extraction at the END of the list. + * @param prev the previous list of libraries, may be NULL + * @param library the name of the library + * @return the new list of libraries, always equal to prev + * except if prev was NULL and no error occurs + */ +EXTRACTOR_ExtractorList * +EXTRACTOR_addLibraryLast(EXTRACTOR_ExtractorList * prev, + const char * library); + +/** + * Remove a library for keyword extraction. + * @param prev the current list of libraries + * @param library the name of the library to remove + * @return the reduced list, unchanged if the library was not loaded + */ +EXTRACTOR_ExtractorList * +EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev, + const char * library); + +/** + * Remove all extractors. + * @param libraries the list of extractors + */ +void EXTRACTOR_removeAll(EXTRACTOR_ExtractorList * libraries); + +/** + * Extract keywords from a file using the available extractors. + * @param extractor the list of extractor libraries + * @param filename the name of the file + * @return the list of keywords found in the file, NULL if none + * were found (or other errors) + */ +EXTRACTOR_KeywordList * +EXTRACTOR_getKeywords(EXTRACTOR_ExtractorList * extractor, + const char * filename); + + +/** + * Remove duplicate keywords from the list. + * @param list the original keyword list (destroyed in the process!) + * @param options a set of options (DUPLICATES_XXXX) + * @return a list of keywords without duplicates + */ +EXTRACTOR_KeywordList * +EXTRACTOR_removeDuplicateKeywords(EXTRACTOR_KeywordList * list, + const unsigned int options); + + +/** + * Remove empty (all-whitespace) keywords from the list. + * @param list the original keyword list (destroyed in the process!) + * @return a list of keywords without duplicates + */ +EXTRACTOR_KeywordList * +EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list); + +/** + * Print a keyword list to a file. + * For debugging. + * @param handle the file to write to (stdout, stderr), must NOT be NULL + * @param keywords the list of keywords to print, may be NULL + */ +void EXTRACTOR_printKeywords(FILE * handle, + EXTRACTOR_KeywordList * keywords); + +/** + * Free the memory occupied by the keyword list (and the + * keyword strings in it!) + * @param keywords the list to free + */ +void EXTRACTOR_freeKeywords(EXTRACTOR_KeywordList * keywords); + +/** + * Extract the last keyword that of the given type from the keyword list. + * @param type the type of the keyword + * @param keywords the keyword list + * @return the last matching keyword, or NULL if none matches; + * the string returned is aliased in the keywords list and must + * not be freed or manipulated by the client. It will become + * invalid once the keyword list is freed. + */ +const char * EXTRACTOR_extractLast(const EXTRACTOR_KeywordType type, + EXTRACTOR_KeywordList * keywords); + +/** + * Extract the last keyword of the given string from the keyword list. + * @param type the string describing the type of the keyword + * @param keywords the keyword list + * @return the last matching keyword, or NULL if none matches; + * the string returned is aliased in the keywords list and must + * not be freed or manipulated by the client. It will become + * invalid once the keyword list is freed. + */ +const char * EXTRACTOR_extractLastByString(const char * type, + EXTRACTOR_KeywordList * keywords); + +/** + * Count the number of keywords in the keyword list. + * @param keywords the keyword list + * @return the number of keywords in the list + */ +unsigned int EXTRACTOR_countKeywords(EXTRACTOR_KeywordList * keywords); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libextractor_python.c b/libextractor_python.c @@ -297,16 +297,19 @@ static PyTypeObject ModuleListType = { /* Module type. */ -static EXTRACTOR_KeywordList *Module_extractMethod(const char *filename, - char *data, size_t filesize, - EXTRACTOR_KeywordList *next, - const char *options) -{ +static EXTRACTOR_KeywordList * +Module_extractMethod(const char *filename, + char *data, + size_t filesize, + EXTRACTOR_KeywordList * next, + const char *options) { Module *self = NULL; self = (Module*)atoi(options); /* convert back from string repr of self. */ - - printf("In the extractor with object %i.",(int)self); +#if 0 + printf("In the extractor with object %p.", + self); +#endif return next; } @@ -337,8 +340,8 @@ static PyObject *Module_new(PyTypeObject *type, PyObject *args, self->module->libraryHandle = NULL; self->module->extractMethod = (ExtractMethod)&Module_extractMethod; self->module->libname = strdup(name); - self->module->options = malloc(12); /* store self as string in options. */ - sprintf(self->module->options,"%i",(int)self); + self->module->options = malloc(24); /* store self as string in options. */ + snprintf(self->module->options, 24, "%p", self); self->module->next = NULL; goto finish; @@ -439,7 +442,10 @@ static int Module_traverse(Module *self, visitproc visit, void *arg) static int Module_clear(Module *self) { - printf("Removing module in clear: %s.\n",self->module->libname); +#if 0 + printf("Removing module in clear: %s.\n", + self->module->libname); +#endif #ifdef Py_CLEAR Py_CLEAR(self->mlist); #endif @@ -449,7 +455,10 @@ static int Module_clear(Module *self) static void Module_dealloc(Module *self) { Module_clear(self); - printf("Removing module: %s.\n",self->module->libname); +#if 0 + printf("Removing module: %s.\n", + self->module->libname); +#endif self->module->next = NULL; EXTRACTOR_removeAll(self->module); self->ob_type->tp_free((PyObject*)self); diff --git a/libextractor_python_setup.py b/libextractor_python_setup.py @@ -3,14 +3,14 @@ import sys path=sys.argv[0] sys.argv = sys.argv[1:] - + cmod = Extension("extractor",["libextractor_python.c"], libraries=["extractor"], - include_dirs=["../include"], + include_dirs=["."], library_dirs=[path]) setup(name="Extractor", - version="0.5.0", + version="0.5.1", ext_modules=[cmod], author="Christian Grothoff, Heiko Wundram", author_email="libextractor@gnu.org") diff --git a/m4/ac_python_devel.m4 b/m4/ac_python_devel.m4 @@ -0,0 +1,54 @@ +dnl Available from the GNU Autoconf Macro Archive at: +dnl http://www.gnu.org/software/ac-archive/htmldoc/ac_python_devel.html +dnl +AC_DEFUN([AC_PYTHON_DEVEL],[ + # + # should allow for checking of python version here... + # + AC_REQUIRE([AM_PATH_PYTHON]) + + # Check for Python include path + AC_MSG_CHECKING([for Python include path]) + python_path=`echo $PYTHON | sed "s,/bin.*$,,"` + for i in "$python_path/include/python$PYTHON_VERSION/" "$python_path/include/python/" "$python_path/" ; do + python_path=`find $i -type f -name Python.h -print | sed "1q"` + if test -n "$python_path" ; then + break + fi + done + python_path=`echo $python_path | sed "s,/Python.h$,,"` + AC_MSG_RESULT([$python_path]) + if test -z "$python_path" ; then + AC_MSG_WARN([cannot find Python include path]) + else + AC_SUBST([PYTHON_CPPFLAGS],[-I$python_path]) + + # Check for Python library path + AC_MSG_CHECKING([for Python library path]) + python_path=`echo $PYTHON | sed "s,/bin.*$,,"` + for i in "$python_path/lib/python$PYTHON_VERSION/config/" "$python_path/lib/python$PYTHON_VERSION/" "$python_path/lib/python/config/" "$python_path/lib/python/" "$python_path/" ; do + python_path=`find $i -type f -name libpython$PYTHON_VERSION.* -print | sed "1q"` + if test -n "$python_path" ; then + break + fi + done + python_path=`echo $python_path | sed "s,/libpython.*$,,"` + AC_MSG_RESULT([$python_path]) + if test -z "$python_path" ; then + AC_MSG_ERROR([cannot find Python library path]) + fi + AC_SUBST([PYTHON_LDFLAGS],["-L$python_path -lpython$PYTHON_VERSION"]) + # + python_site=`echo $python_path | sed "s/config/site-packages/"` + AC_SUBST([PYTHON_SITE_PKG],[$python_site]) + # + # libraries which must be linked in when embedding + # + AC_MSG_CHECKING(python extra libraries) + PYTHON_EXTRA_LIBS=`$PYTHON -c "import distutils.sysconfig; \ + conf = distutils.sysconfig.get_config_var; \ + print conf('LOCALMODLIBS')+' '+conf('LIBS')" + AC_MSG_RESULT($PYTHON_EXTRA_LIBS)` + AC_SUBST(PYTHON_EXTRA_LIBS) + fi +])