commit be2c5b9b55104558d9ea57299d0f760901c047f7
parent 7d439ed996f8e7f94e4f15024980a954337ae951
Author: Christian Grothoff <christian@grothoff.org>
Date: Fri, 9 Sep 2011 19:13:37 +0000
updating binding to work with LE 0.6
Diffstat:
5 files changed, 56 insertions(+), 255 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,8 +1,9 @@
+Fri Sep 9 21:09:47 CEST 2011
+ Updated Python binding to GNU libextractor 0.6-API. -CG
+
Sun May 28 00:42:00 CEST 2006
Rewritten using ctypes.
-
+
Mon Jul 4 17:19:33 CEST 2005
Moved python binding into separate package.
-
-
diff --git a/README b/README
@@ -18,8 +18,8 @@ Dependencies
* python >= 2.3
web site: http://www.python.org/
- * libextractor > 0.5
- web site: http://gnunet.org/libextractor
+ * libextractor >= 0.6
+ web site: http://www.gnu.org/software/libextractor/
* ctypes >= 0.9
web site: http://starship.python.net/crew/theller/ctypes/
@@ -62,6 +62,7 @@ Copyright
=========
Copyright (C) 2006 Bader Ladjemi <bader@tele2.fr>
+ Copyright (C) 2011 Christian Grothoff <christian@grothoff.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/extract.py b/extract.py
@@ -24,10 +24,22 @@ Little demo how to use the libextractor Python binding.
"""
import extractor
import sys
+from ctypes import *
+import struct
xtract = extractor.Extractor()
+
+def print_k(xt, plugin, type, format, mime, data, datalen):
+ mstr = cast (data, c_char_p)
+# FIXME: this ignores 'datalen', not that great...
+# (in general, depending on the mime type and format, only
+# the first 'datalen' bytes in 'data' should be used).
+ if (format == extractor.EXTRACTOR_METAFORMAT_UTF8):
+ print "%s - %s" % (xtract.keywordTypes()[type], mstr.value)
+ return 0
+
+
for arg in sys.argv[1:]:
print "Keywords from %s:" % arg
- keys = xtract.extract(arg)
- for keyword_type, keyword in keys:
- print "%s - %s" % (keyword_type.encode('iso-8859-1'), keyword.encode('iso-8859-1'))
+ xtract.extract(print_k, None, arg)
+
diff --git a/extractor.py b/extractor.py
@@ -2,6 +2,7 @@
## Python bindings for GNU libextractor
##
## Copyright (C) 2006 Bader Ladjemi <bader@tele2.fr>
+## Copyright (C) 2011 Christian Grothoff <christian@grothoff.org>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -34,56 +35,34 @@ from ctypes import *
#fake cdll import
try:
#loading shared object file
- libextractor = cdll.LoadLibrary('libextractor.so.1')
+ libextractor = cdll.LoadLibrary('libextractor.so.3')
except OSError:
libextractor = cdll.extractor
-__all__ = ['Extractor', 'isBinaryType', 'EXTRACTOR_ENCODING', 'DEFAULT_LIBRARIES', 'EXTRACTOR_THUMBNAIL_DATA']
-__version__ = "0.5"
+__all__ = ['Extractor']
+__version__ = "0.6"
__licence__ = "GNU GPL"
"""
keyword's charset encoding
"""
-EXTRACTOR_ENCODING = "utf-8"
-
KeywordType = c_int
-Keywords_p = POINTER('Keywords')
-class Keywords(Structure):
- """
- EXTRACTOR_Keywords struct
- """
- _fields_ = [('keyword', c_char_p),
- ('keywordType', KeywordType),
- ('next', Keywords_p)]
-SetPointerType(Keywords_p, Keywords)
-
-KEYWORDS = POINTER(Keywords)
+MetaType = c_int
-libextractor.EXTRACTOR_getKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywords2.restype = KEYWORDS
-libextractor.EXTRACTOR_removeDuplicateKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywordTypeAsString.restype = c_char_p
+EXTRACT_CB = CFUNCTYPE(c_int, c_void_p, c_char_p, KeywordType, MetaType, c_char_p, c_void_p, c_size_t)
-libextractor.EXTRACTOR_getDefaultLibraries.restype = c_char_p
+libextractor.EXTRACTOR_metatype_get_max.restype = KeywordType
+libextractor.EXTRACTOR_metatype_to_description.restype = c_char_p
+libextractor.EXTRACTOR_metatype_to_string.restype = c_char_p
+libextractor.EXTRACTOR_plugin_add_defaults.restype = c_void_p
+libextractor.EXTRACTOR_extract.argtypes = [c_void_p, c_char_p, c_void_p, c_size_t, EXTRACT_CB, c_void_p]
-"""
-thumbnail keyword type (binary)
-"""
-EXTRACTOR_THUMBNAIL_DATA = 70
-def isBinaryType(keyword_type):
- """
- returns if the given keyword_type is binary
+EXTRACTOR_METAFORMAT_UNKNOWN = 0
+EXTRACTOR_METAFORMAT_UTF8 = 1
+EXTRACTOR_METAFORMAT_BINARY = 2
+EXTRACTOR_METAFORMAT_C_STRING = 3
- @param keyword_type: keyword type (int)
- """
- return keyword_type == EXTRACTOR_THUMBNAIL_DATA
-
-"""
-default loaded libraries
-"""
-DEFAULT_LIBRARIES = libextractor.EXTRACTOR_getDefaultLibraries().split(':')
class Extractor(object):
"""
@@ -99,177 +78,37 @@ class Extractor(object):
libraries that should be used.
"""
- def __init__(self, defaults=True, libraries=None, lang=None, languages=None, hash=None, use_filename=False, split_keywords=False):
+ def __init__(self, defaults=True, libraries=None):
"""
Initialize Extractor's instance
- @param extractors: list of strings that contains extractor's name (supported types)
+ @param libraries: list of strings that contains extractor's name (supported types)
@param defaults: load default plugins
- @param lang: use the generic plaintext extractor for the language with the 2-letter language code LANG
- @param languages: list of lang
- @param hash: compute hash using the given algorithm (currently 'sha1' or 'md5')
- @param use_filename: use the filename as a keyword (add filename-extractor library)
- @param split_keywords: use keyword splitting (add split-extractor library)
-
- >>> Extractor() #doctest: +ELLIPSIS
- <__main__.Extractor object at 0x...>
-
- >>> extractor = Extractor(defaults=False)
- >>> extractor.libraries
- ()
-
- >>> extractor = Extractor()
- >>> sorted(extractor.libraries) == sorted(tuple(DEFAULT_LIBRARIES))
- True
-
- >>> extractor = Extractor(hash='md5')
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'md5' in library:
- ... found = True
- ... break
- >>> found
- True
-
- >>> extractor = Extractor(use_filename=True)
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'filename' in library:
- ... found = True
- ... break
- >>> found
- True
-
- >>> extractor = Extractor(split_keywords=True)
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'split' in library:
- ... found = True
- ... break
- >>> found
- True
"""
- self._libraries = {}
self.extractors = None
if defaults:
- self.extractors = libextractor.EXTRACTOR_loadDefaultLibraries()
- self._libraries = dict([(library, None) for library in DEFAULT_LIBRARIES])
- if use_filename:
- self.addLibrary("libextractor_filename")
+ self.extractors = libextractor.EXTRACTOR_plugin_add_defaults(0)
if libraries:
- self.extractors = libextractor.EXTRACTOR_loadConfigLibraries(self.extractors, libraries)
- self._libraries.update(dict([(library, None) for library in libraries.split(':')]))
- if isinstance(lang, str):
- self.addLibraryLast("libextractor_printable_%s" % lang)
- if isinstance(hash, str):
- self.addLibraryLast("libextractor_hash_%s" % hash)
- if languages:
- [self.addLibraryLast("libextractor_printable_%s" % language) for language in languages]
- if split_keywords:
- self.addLibraryLast("libextractor_split")
+ self.extractors = libextractor.EXTRACTOR_plugin_add_config (self.extractors, libraries, 0)
- def extract(self, filename=None, data=None, size=None):
+ def extract(self, proc, proc_cls, filename=None, data=None, size=0):
"""Extract keywords from a file, or from its data.
@param filename: filename string
@param data: data contents
@param size: data size
+ @param proc: function to call on each value
+ @param proc_cls: closure to proc
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
-
- If you give data, size had to be given too.
+ If you give data, size has to be given as well.
"""
if not filename and not (data and size):
return None
- elif filename:
- return self.extractFromFile(filename)
else:
- return self.extractFromData(data, size)
+ libextractor.EXTRACTOR_extract (self.extractors, filename, data, size, EXTRACT_CB(proc), proc_cls)
- def extractFromFile(self, filename):
- """Extract keywords from a file using its filename.
-
- @param filename: filename string
-
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
-
- >>> import os
- >>> extractor = Extractor()
- >>> filename = os.tmpnam()
- >>> f = file(filename, 'w')
- >>> extractor.extract(filename)
- []
-
- >>> import os
- >>> extractor = Extractor()
- >>> filename = '../Extractor/test/test.png'
- >>> extractor.extract(filename)
- [(u'comment', u'Testing keyword extraction\\n'), (u'resource-identifier', u'dc6c58c971715e8043baef058b675eec'), (u'size', u'4x4'), (u'mimetype', u'image/png')]
-
- >>> import os, glob
- >>> extractor = Extractor()
- >>> filename = glob.glob('dist/*.gz')[0]
- >>> extracted = extractor.extract(filename)
- >>> filename_count = 0
- >>> for keyword_type, keyword in extracted:
- ... if keyword_type == 'filename':
- ... filename_count += 1
- >>> filename_count > 1
- True
-
- """
- self.keywords_p = libextractor.EXTRACTOR_getKeywords(self.extractors, filename)
- return self._extract()
-
- def extractFromData(self, data, size):
- """Extract keywords using its data.
-
- @param data: data contents
- @param size: data size
-
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
-
- """
- self.keywords_p = libextractor.EXTRACTOR_getKeywords2(self.extractors, data, size)
- return self._extract()
-
- def _extract(self):
- self.extracted = []
-
- if not self.keywords_p:
- return self.extracted
-
- try:
- self.keywords = self.keywords_p.contents
- except ValueError:
- return self.extracted
-
- while True:
- keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(self.keywords.keywordType).decode(EXTRACTOR_ENCODING)
- keyword = self.keywords.keyword
-
- if not isBinaryType(self.keywords.keywordType):
- keyword = keyword.decode(EXTRACTOR_ENCODING)
-
- self.extracted.append((keyword_type, keyword))
- try:
- self.keywords = self.keywords.next.contents
- except ValueError:
- libextractor.EXTRACTOR_freeKeywords(self.keywords_p)
- self.keywords_p = None
- return self.extracted
-
def addLibrary(self, library):
"""
Add given library to the extractor. Invoke with a string with the name
@@ -285,19 +124,7 @@ class Extractor(object):
@param library: library's name
"""
- self._libraries[library] = None
-
- self.extractors = libextractor.EXTRACTOR_addLibrary(self.extractors, library)
-
- def addLibraryLast(self, library):
- """
- Same as addLibrary but the library is added at the last.
-
- @param library: library's name
- """
- self._libraries[library] = None
-
- self.extractors = libextractor.EXTRACTOR_addLibraryLast(self.extractors, library)
+ self.extractors = libextractor.EXTRACTOR_plugin_add (self.extractors, library, NULL, 0)
def removeLibrary(self, library):
"""
@@ -312,12 +139,8 @@ class Extractor(object):
@param library: library's name
"""
- try:
- del self._libraries[library]
- except KeyError:
- raise ValueError, "No such loaded library"
-
- self.extractors = libextractor.EXTRACTOR_removeLibrary(self.extractors, library)
+
+ self.extractors = libextractor.EXTRACTOR_plugin_remove(self.extractors, library)
def addLibraries(self, libraries):
"""
@@ -326,38 +149,29 @@ class Extractor(object):
@param libraries: list of libraries names
"""
- for library in libraries:
- if isinstance(library, str):
- self.addLibrary(library)
+
+ self.extractors = libextractor.EXTRACTOR_plugin_add_config(self.extractors, libraries)
def removeAllLibraries(self):
"""
Remove all libraries.
- >>> extractor = Extractor()
- >>> extractor.removeAllLibraries()
- >>> extractor.libraries
- ()
"""
- self._libraries = {}
- if self.extractors:
- libextractor.EXTRACTOR_removeAll(self.extractors)
- self.extractors = None
+
+ libextractor.EXTRACTOR_plugin_remove_all(self.extractors)
+ self.extractors = None
def keywordTypes(self):
"""
Returns the list of all keywords types.
@return: list of all keywords types
- >>> extractor = Extractor()
- >>> extractor.keywordTypes()
- ('unknown', 'filename', 'mimetype', 'title', 'author', 'artist', 'description', 'comment', 'date', 'publisher', 'language', 'album', 'genre', 'location', 'version', 'organization', 'copyright', 'subject', 'keywords', 'contributor', 'resource-type', 'format', 'resource-identifier', 'source', 'relation', 'coverage', 'software', 'disclaimer', 'warning', 'translated', 'creation date', 'modification date', 'creator', 'producer', 'page count', 'page orientation', 'paper size', 'used fonts', 'page order', 'created for', 'magnification', 'release', 'group', 'size', 'summary', 'packager', 'vendor', 'license', 'distribution', 'build-host', 'os', 'dependency', 'MD4', 'MD5', 'SHA-0', 'SHA-1', 'RipeMD160', 'resolution', 'category', 'book title', 'priority', 'conflicts', 'replaces', 'provides', 'conductor', 'interpreter', 'owner', 'lyrics', 'media type', 'contact', 'binary thumbnail data', 'publication date', 'camera make', 'camera model', 'exposure', 'aperture', 'exposure bias', 'flash', 'flash bias', 'focal length', 'focal length (35mm equivalent)', 'iso speed', 'exposure mode', 'metering mode', 'macro mode', 'image quality', 'white balance', 'orientation')
"""
i = 0
keyword_types = []
while True:
- keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(i)
+ keyword_type = libextractor.EXTRACTOR_metatype_to_string(i)
if not keyword_type:
break
keyword_types.append(keyword_type)
@@ -365,30 +179,6 @@ class Extractor(object):
return tuple(keyword_types)
- def _get_libraries(self):
- """
- Return current libraries
- @return: current libraries
- """
- return tuple(self._libraries.keys())
-
- def _set_libraries(self, libraries):
- """
- Add libraries to load (don't replace current ones)
-
- @param libraries: list of libraries
-
- >>> extractor = Extractor()
- >>> extractor.libraries = ('libextractor_filename', )
- >>> 'libextractor_filename' in extractor.libraries
- True
- >>> len(extractor.libraries) == len(DEFAULT_LIBRARIES)+1
- True
-
- """
- self.addLibraries(libraries)
-
- libraries = property(fget=_get_libraries, fset=_set_libraries, fdel=removeAllLibraries, doc='tuple of loaded libraries')
def __del__(self):
"""
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@ except ImportError:
setup(
name = "Extractor",
- version = "0.5",
+ version = "0.6",
py_modules = ['extractor'],
scripts = ['extract.py'],
@@ -20,9 +20,6 @@ setup(
license = "GNU GPL",
keywords = "libextractor binding tag metadata",
url = "http://gnunet.org/libextractor/",
-
- dependency_links=['http://starship.python.net/crew/theller/ctypes/',],
-
long_description="""libextractor is a simple library for keyword extraction. libextractor
does not support all formats but supports a simple plugging mechanism
such that you can quickly add extractors for additional formats, even