libextractor-python

GNU libextractor
Log | Files | Refs | README | LICENSE

commit 6a35a271ece0ef7c03affb1d1857c55485f161df
parent 764f6d675c085bb87cf8178db65af67b5be3cd03
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon,  4 Jul 2005 15:17:20 +0000

update

Diffstat:
A AUTHORS | 2 ++
A ChangeLog | 5 +++++
A Extractor.py | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A NEWS | 2 ++
A README | 1 +
A bootstrap | 5 +++++
A configure.ac | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A extract.py | 32 ++++++++++++++++++++++++++++++++
A libextractor_python.c | 527 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libextractor_python_setup.py | 17 +++++++++++++++++
10 files changed, 759 insertions(+), 0 deletions(-)

diff --git a/AUTHORS b/AUTHORS @@ -0,0 +1,2 @@ +Heiko Wundram <modelnine@ceosg.de> +Christian Grothoff <christian@grothoff.org> diff --git a/ChangeLog b/ChangeLog @@ -0,0 +1,4 @@ +Mon Jul 4 17:19:33 CEST 2005 + Moved python binding into seperate package. + + +\ No newline at end of file diff --git a/Extractor.py b/Extractor.py @@ -0,0 +1,112 @@ +"""Extractor.py + + This file is part of libextractor. + (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + +libextractor is a simple library for keyword extraction. libextractor +does not support all formats but supports a simple plugging mechanism +such that you can quickly add extractors for additional formats, even +without recompiling libextractor. libextractor typically ships with a +dozen helper-libraries that can be used to obtain keywords from common +file-types. + +libextractor is a part of the GNU project (http://www.gnu.org/). +""" + +import _extractor + +__all__ = ["Extractor","Keyword"] +__author__ = "Christian Grothoff, Heiko Wundram" +__version__ = "0.5.0" +__license__ = "GPL" +__date__ = "5/5/2005" + +class Extractor(object): + """ + Main class for extracting meta-data with GNU libextractor. + + You may create multiple instances of Extractor to use + different sets of plugins. 
Initially each Extractor + will start with the default set of plugins. + + Use the extract method to obtain keywords from a file. + + Use the load and unload methods to change the list of + plugins that should be used. + """ + + def __init__(self): + self.__plugins = _extractor.loadDefaultLibraries() + def __del__(self): + _extractor.removeAll(self.__plugins) + def load(self,plugs): + """ + Load certain plugins. Invoke with a string with the names + of the plugins that should be loaded. For example, + + 'libextractor_filename:-libextractor_split' + + will prepend the extractor that just adds the filename as a + keyword and append (runs last) the extractor that splits + keywords at whitespaces and punctuations. + + No errors are reported if any of the listed plugins are not + found. + """ + self.__plugins = _extractor.load(self.__plugins, plugs) + return None + def unload(self,plugs): + """ + Unload a plugin. Pass the name of the plugin that is to + be unloaded. Only one plugin can be unloaded at a time. + For example, + + 'libextractor_pdf' + + unloads the PDF extractor (if loaded). No errors are + reported if no matching plugin is found. + """ + self.__plugins = _extractor.unload(self.__plugins, plugs) + return None + def extract(self,filename): + """Pass a filename to extract keywords. + + This function returns a list of Keyword objects. + If the file cannot be opened or cannot be found, + the list will be empty. The list can also be empty + if no metadata was found for the file. 
+ """ + return _extractor.extract(self.__plugins, filename, Keyword) + +class Keyword(object): + def __init__(self,type,value): + self.__type = type + self.__value = value.decode("utf-8") + def __repr__(self): + return u"%s(%i,%r)" % (self.__class__.__name__,self.__type,self.__value) + def __str__(self): + return u"%s: %s" % (self.__getType(), self.__getValue()) + def __getType(self): + return _extractor.getKeywordTypeAsString(self.__type).decode("utf-8") + def __getValue(self): + return self.__value + def __hash__(self): + return hash(self.__value)+self.__type + + type = property(__getType,None,None,"Type of the Keyword (i.e. author, title)") + value = property(__getValue,None,None,"Value of the Keyword (i.e. 'The GNU GPL')") diff --git a/NEWS b/NEWS @@ -0,0 +1 @@ +See ChangeLog. +\ No newline at end of file diff --git a/README b/README @@ -0,0 +1 @@ +This is the python binding for libextractor. diff --git a/bootstrap b/bootstrap @@ -0,0 +1,5 @@ +#!/bin/sh +autoreconf -f -i +cd libltdl +autoreconf -f -i +cd .. diff --git a/configure.ac b/configure.ac @@ -0,0 +1,56 @@ +# Process this file with autoconf to produce a configure script. 
+AC_PREREQ(2.57) +AC_INIT([libextractor-python], [0.5.1], [bug-libextractor@gnu.org]) +AC_REVISION($Revision: 1.67 $) +AM_INIT_AUTOMAKE([libextractor-python], [0.5.1]) + +AC_PROG_AWK +AC_PROG_CC + +AC_PYTHON_DEVEL +AC_MSG_CHECKING(for python) +if test -z "$python_path"; +then + AC_MSG_RESULT(yes) + AM_CONDITIONAL(HAVE_PYTHON,false) +else + AC_MSG_RESULT(no) + AM_CONDITIONAL(HAVE_PYTHON,true) +fi + +# test for libextractor +extractor=0 +AC_MSG_CHECKING(for libextractor) +AC_ARG_WITH(extractor, + [ --with-extractor=PFX Base of libextractor installation], + [AC_MSG_RESULT([$with_extractor]) + case $with_extractor in + no) + ;; + yes) + AC_CHECK_HEADERS(extractor.h, + AC_CHECK_LIB([extractor], [EXTRACTOR_loadDefaultLibraries], + extractor=1)) + ;; + *) + LDFLAGS="-L$with_extractor/lib $LDFLAGS" + CPPFLAGS="-I$with_extractor/include $CPPFLAGS" + AC_CHECK_HEADERS(extractor.h, + AC_CHECK_LIB([extractor], [EXTRACTOR_loadDefaultLibraries], + EXT_LIB_PATH="-L$with_extractor/lib $EXT_LIB_PATH" + extractor=1)) + ;; + esac + ], + [AC_MSG_RESULT([--with-extractor not specified]) + AC_CHECK_HEADERS(extractor.h, + AC_CHECK_LIB([extractor], [EXTRACTOR_loadDefaultLibraries], + extractor=1))]) +if test "$extractor" != 1 +then + AC_MSG_ERROR([libextractor-python requires libextractor]) +fi + + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/extract.py b/extract.py @@ -0,0 +1,32 @@ +"""extract.py + + This file is part of libextractor. + (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + +Little demo how to use the libextractor Python binding. + +""" +import Extractor +import sys + +xtract = Extractor.Extractor() +for arg in sys.argv[1:]: + print "Keywords from " + arg + keys = xtract.extract(arg); + for i in keys: + print i diff --git a/libextractor_python.c b/libextractor_python.c @@ -0,0 +1,527 @@ +/* + This file is part of libextractor. + (C) 2004, 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + + +/* libextractor_python.c + --------------------- + + Implements the Python wrapper for libextractor. The wrapper builds on the + Python type module, which wraps a single module, over extractor, which + implements the extractor from modules, up to keyword(list), which implements + keyword handling. */ + +/* Includes. */ + +#include <Python.h> +#include <extractor.h> + +/* Typedefs. */ + +typedef struct { + PyObject_HEAD + PyObject *mlist; + int locks; +} ModuleList; + +typedef struct { + PyObject_HEAD + EXTRACTOR_ExtractorList *module; + ModuleList *mlist; +} Module; + +/* Type objects. 
*/ + +static PyTypeObject ModuleListType; +static PyTypeObject ModuleType; + +/* Module list type. */ + +static inline int ModuleList_checkModule(Module *arg) +{ + if( !PyObject_IsInstance((PyObject*)arg,(PyObject*)&ModuleType) ) { + PyErr_SetString(PyExc_TypeError,"append only accepts a Module."); + return -1; + } + + if( arg->mlist ) { + PyErr_SetString(PyExc_TypeError,"cannot take ownership of module."); + return -1; + } + + return 0; +} + +static PyObject *ModuleList_prepend(ModuleList *self, Module *arg) +{ + PyObject *rv = NULL; + Module *first = NULL; + int mlistlen = 0; + + if( ModuleList_checkModule(arg) ) + goto error; + + mlistlen = PyList_GET_SIZE(self->mlist); + if( mlistlen ) { + first = (Module*)PyList_GET_ITEM(self->mlist,0); + arg->module->next = first->module; + } + + if( PyList_Insert(self->mlist,0,(PyObject*)arg) ) + goto error; + arg->mlist = self; + Py_INCREF(self); + + rv = (PyObject*)arg; + Py_INCREF(rv); + + goto finish; + + error: + Py_XDECREF(rv); + rv = NULL; + + finish: + return (PyObject*)rv; +} + +static PyObject *ModuleList_append(ModuleList *self, Module *arg) +{ + PyObject *rv = NULL; + Module *last = NULL; + int mlistlen = 0; + + if( ModuleList_checkModule(arg) ) + goto error; + + mlistlen = PyList_GET_SIZE(self->mlist); + if( mlistlen ) { + last = (Module*)PyList_GET_ITEM(self->mlist,mlistlen-1); + last->module->next = arg->module; + } + + if( PyList_Append(self->mlist,(PyObject*)arg) ) + goto error; + arg->mlist = self; + Py_INCREF(self); + + rv = (PyObject*)arg; + Py_INCREF(rv); + + goto finish; + + error: + Py_XDECREF(rv); + rv = NULL; + + finish: + return (PyObject*)rv; +} + +static PyObject *ModuleList_new(PyTypeObject *type, PyObject *args, + PyObject *kwargs) +{ + ModuleList *self = NULL; + + if( !( self = (ModuleList*)type->tp_alloc(type,0) ) ) + goto error; + self->locks = 0; + + if( !( self->mlist = PyList_New(0) ) ) + goto error; + + goto finish; + + error: + Py_XDECREF(self); + self = NULL; + + finish: + return 
(PyObject*)self; +} + +static int ModuleList_init(ModuleList *self, PyObject *args, PyObject *kwargs) +{ + PyObject *mod = NULL, *mod_iter = NULL, *mod_item = NULL; + EXTRACTOR_ExtractorList *elist = NULL, *ecur = NULL; + char *kwargs_list[] = {"modules",NULL}; + int rv = 0; + + if( !PyArg_ParseTupleAndKeywords(args,kwargs,"|O:__init__",kwargs_list, + &mod) ) + goto error; + + if( !mod || mod == Py_None || PyString_Check(mod) ) { + if( !mod || mod == Py_None ) + elist = EXTRACTOR_loadDefaultLibraries(); + else + elist = EXTRACTOR_loadConfigLibraries(NULL,PyString_AsString(mod)); + + ecur = elist; + while( ecur ) { + if( !( mod_item = (PyObject*)PyObject_GC_New(Module,&ModuleType) ) ) + goto error; + + elist = ecur; + ecur = elist->next; + elist->next = NULL; + + ((Module*)mod_item)->module = elist; + ((Module*)mod_item)->mlist = NULL; + + if( !ModuleList_append(self,(Module*)mod_item) ) + goto error; + Py_DECREF(mod_item); + mod_item = NULL; + } + } else if( PyObject_IsInstance(mod,(PyObject*)&ModuleType) ) { + if( !ModuleList_append(self,(Module*)mod) ) + goto error; + } else { + if( !( mod_iter = PyObject_GetIter(mod) ) ) + goto error; + + while( ( mod_item = PyIter_Next(mod_iter) ) ) { + if( !ModuleList_append(self,(Module*)mod_item) ) + goto error; + Py_DECREF(mod_item); + mod_item = NULL; + } + } + + goto finish; + + error: + EXTRACTOR_removeAll(ecur); + Py_XDECREF(mod_item); + rv = -1; + + finish: + Py_XDECREF(mod_iter); + return rv; +} + +static PyObject *ModuleList_repr(ModuleList *self) +{ + return PyString_FromFormat("<ModuleList: %i modules>", + PyList_GET_SIZE(self->mlist)); +} + +static int ModuleList_traverse(ModuleList *self, visitproc visit, void *arg) +{ +#ifdef Py_VISIT + Py_VISIT(self->mlist); +#endif + return 0; +} + +static int ModuleList_clear(ModuleList *self) +{ +#ifdef Py_CLEAR + Py_CLEAR(self->mlist); +#endif + return 0; +} + +static void ModuleList_dealloc(ModuleList *self) +{ + ModuleList_clear(self); + 
self->ob_type->tp_free((PyObject*)self); +} + +static PyMethodDef ModuleList_methods[] = { + {"prepend",(PyCFunction)ModuleList_prepend,METH_O, + "Prepend a single module to the structure."}, + {"append",(PyCFunction)ModuleList_append,METH_O, + "Append a single module to the structure."}, + {NULL} /* Sentinel */ +}; + +static PyTypeObject ModuleListType = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "extractor.ModuleList", /*tp_name*/ + sizeof(ModuleList), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)ModuleList_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + (reprfunc)ModuleList_repr, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + "ModuleList objects", /* tp_doc */ + (traverseproc)ModuleList_traverse, /* tp_traverse */ + (inquiry)ModuleList_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + ModuleList_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)ModuleList_init, /* tp_init */ + 0, /* tp_alloc */ + ModuleList_new, /* tp_new */ +}; + +/* Module type. */ + +static EXTRACTOR_KeywordList *Module_extractMethod(const char *filename, + char *data, size_t filesize, + EXTRACTOR_KeywordList *next, + const char *options) +{ + Module *self = NULL; + + self = (Module*)atoi(options); /* convert back from string repr of self. 
*/ + + printf("In the extractor with object %i.",(int)self); + return next; +} + +static PyObject *Module_new(PyTypeObject *type, PyObject *args, + PyObject *kwargs) +{ + Module *self = NULL; + char *name = NULL, *options = NULL; + char *kwargs_list[] = {"name","options",NULL}; + int namelen = 0, i; + + if( !PyArg_ParseTupleAndKeywords(args,kwargs,"s#|z:__new__",kwargs_list, + &name,&namelen,&options) ) + goto error; + + i = 0; + while( name[i] ) + if( name[i++] == '(' ) { + PyErr_SetString(PyExc_ValueError,"name may not contain (."); + goto error; + } + + if( !( self = (Module*)type->tp_alloc(type,0) ) ) + goto error; + + /* Somewhat a HACK, creates a module structure from scratch. */ + self->module = malloc(sizeof(EXTRACTOR_ExtractorList)); + self->module->libraryHandle = NULL; + self->module->extractMethod = (ExtractMethod)&Module_extractMethod; + self->module->libname = strdup(name); + self->module->options = malloc(12); /* store self as string in options. */ + sprintf(self->module->options,"%i",(int)self); + self->module->next = NULL; + + goto finish; + + error: + Py_XDECREF(self); + self = NULL; + + finish: + return (PyObject*)self; +} + +static int Module_init(Module *self, PyObject *args, PyObject *kwargs) +{ + char *name = NULL, *options = NULL, *optstring = NULL; + char *kwargs_list[] = {"name","options",NULL}; + int namelen = 0, optionslen = 0, i, rv = 0; + + if( !PyArg_ParseTupleAndKeywords(args,kwargs,"s#|z#:__init__",kwargs_list, + &name,&namelen,&options,&optionslen) ) + goto error; + + i = 0; + while( options && options[i] ) + if( options[i++] == ')' ) { + PyErr_SetString(PyExc_ValueError,"option may not contain )."); + goto error; + } + + EXTRACTOR_removeAll(self->module); /* slight crutch, was allocated in */ + self->module = NULL; /* __new__, so that programmer can create subtype. 
*/ + + optstring = malloc(namelen+optionslen+3); + if( options ) + sprintf(optstring,"%s(%s)",name,options); + else + sprintf(optstring,"%s",name); + if( !( self->module = EXTRACTOR_loadConfigLibraries(NULL,optstring) ) ) { + PyErr_SetString(PyExc_ValueError,"could not load module."); + goto error; + } + + goto finish; + + error: + rv = -1; + + finish: + if( optstring ) + free(optstring); + return rv; +} + +static PyObject *Module_getattr(Module *self, char *name) +{ + if( !strcmp(name,"libname") ) + return PyString_FromString(self->module->libname); + else if( !strcmp(name,"options") ) + return PyString_FromString(self->module->options); + else if( !strcmp(name,"mlist") ) + return (PyObject*)self->mlist; + PyErr_SetString(PyExc_AttributeError,name); + return NULL; +} + +static int Module_setattr(Module *self, char *name, PyObject *value) +{ + if( !strcmp(name,"libname") || !strcmp(name,"options") || + !strcmp(name,"mlist") ) + PyErr_Format(PyExc_AttributeError,"cannot set %s.",name); + else + PyErr_SetString(PyExc_AttributeError,name); + return -1; +} + +static PyObject *Module_repr(Module *self) +{ + if( self->module->options ) + return PyString_FromFormat("%s(\"%s\",\"%s\")",self->ob_type->tp_name, + self->module->libname,self->module->options); + else + return PyString_FromFormat("%s(\"%s\")",self->ob_type->tp_name, + self->module->libname); +} + +static long Module_hash(Module *self) +{ + return (int)self->module; +} + +static int Module_traverse(Module *self, visitproc visit, void *arg) +{ +#ifdef Py_VISIT + Py_VISIT((PyObject*)self->mlist); +#endif + return 0; +} + +static int Module_clear(Module *self) +{ + printf("Removing module in clear: %s.\n",self->module->libname); +#ifdef Py_CLEAR + Py_CLEAR(self->mlist); +#endif + return 0; +} + +static void Module_dealloc(Module *self) +{ + Module_clear(self); + printf("Removing module: %s.\n",self->module->libname); + self->module->next = NULL; + EXTRACTOR_removeAll(self->module); + 
self->ob_type->tp_free((PyObject*)self); +} + +static PyMethodDef Module_methods[] = { + {NULL} /* Sentinel */ +}; + +static PyTypeObject ModuleType = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "extractor.Module", /*tp_name*/ + sizeof(Module), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)Module_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)Module_getattr, /*tp_getattr*/ + (setattrfunc)Module_setattr, /*tp_setattr*/ + 0, /*tp_compare*/ + (reprfunc)Module_repr, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + (hashfunc)Module_hash, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + "Module objects", /* tp_doc */ + (traverseproc)Module_traverse, /* tp_traverse */ + (inquiry)Module_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Module_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Module_init, /* tp_init */ + 0, /* tp_alloc */ + Module_new, /* tp_new */ +}; + +/* Module level. 
*/ + +static PyMethodDef Extractor_Module_methods[] = { + {NULL} /* Sentinel */ +}; + +PyMODINIT_FUNC initextractor(void) +{ + PyObject *m; + + if( PyType_Ready(&ModuleListType) ) + return; + if( PyType_Ready(&ModuleType) ) + return; + + m = Py_InitModule3("extractor",Extractor_Module_methods,"Extractor module."); + if (m == NULL) + return; + + Py_INCREF(&ModuleListType); + Py_INCREF(&ModuleType); + PyModule_AddObject(m,"ModuleList",(PyObject*)&ModuleListType); + PyModule_AddObject(m,"Module",(PyObject*)&ModuleType); +} diff --git a/libextractor_python_setup.py b/libextractor_python_setup.py @@ -0,0 +1,17 @@ +from distutils.core import Extension, setup +import sys + +path=sys.argv[0] +sys.argv = sys.argv[1:] + +cmod = Extension("extractor",["libextractor_python.c"], + libraries=["extractor"], + include_dirs=["../include"], + library_dirs=[path]) + +setup(name="Extractor", + version="0.5.0", + ext_modules=[cmod], + author="Christian Grothoff, Heiko Wundram", + author_email="libextractor@gnu.org") +