libextractor-python

GNU libextractor
Log | Files | Refs | README | LICENSE

extractor.py (6531B)


      1 # -*- coding: utf-8 -*-
      2 # Python bindings for GNU libextractor
      3 #
      4 # Copyright (C) 2006 Bader Ladjemi <bader@tele2.fr>
      5 # Copyright (C) 2011 Christian Grothoff <christian@grothoff.org>
      6 # Copyright (C) 2017, 2018 Nikita Gillmann <nikita@n0.is>
      7 #
      8 # This program is free software; you can redistribute it and/or modify
      9 # it under the terms of the GNU General Public License as published by
     10 # the Free Software Foundation; either version 3 of the License, or
     11 # (at your option) any later version.
     12 #
     13 # This program is distributed in the hope that it will be useful,
     14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 # GNU General Public License for more details.
     17 #
     18 # You should have received a copy of the GNU General Public License
     19 # along with this program; see the file COPYING. If not, write to the
     20 # Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
     21 # USA.
     22 
     23 """
     24 Python bindings for GNU libextractor
     25 
     26 libextractor is a simple library for keyword extraction.  libextractor
     27 does not support all formats but supports a simple plugging mechanism
     28 such that you can quickly add extractors for additional formats, even
     29 without recompiling libextractor. libextractor typically ships with a
     30 dozen helper-libraries that can be used to obtain keywords from common
     31 file-types.
     32 
     33 libextractor is a part of the GNU project (http://www.gnu.org/).
     34 """
     35 from ctypes import *
     36 from ctypes import CDLL
     37 from ctypes.util import find_library
     38 import os
     39 import logging
     40 import faulthandler
     41 
     42 faulthandler.enable()
     43 
     44 # Can be useful:
     45 # "DYLD_LIBRARY_PATH" in os.environ
     46 
     47 # fake cdll import
     48 # loading shared object file
     49 try:
     50     if os.uname()[0] == 'Linux':
     51         libextractor = CDLL('libextractor.so.3')
     52     elif os.uname()[0] == 'Linux':
     53         libextractor = cdll.LoadLibrary(find_library('libextractor.so.3'))
     54     else:
     55         libextractor = cdll.extractor
     56 except:
     57     raise ImportError("Could not find shared 'libextractor' library.")
     58 
     59 
     60 __all__ = ['Extractor']
     61 __version__ = "1.7"
     62 __licence__ = "GNU GPL"
     63 
     64 """
     65 keyword's charset encoding
     66 """
     67 KeywordType = c_int
     68 MetaType = c_int
     69 
     70 EXTRACT_CB = CFUNCTYPE(c_int, c_void_p, c_char_p, KeywordType, MetaType, c_char_p, c_void_p, c_size_t)
     71 
     72 libextractor.EXTRACTOR_metatype_get_max.restype = KeywordType
     73 libextractor.EXTRACTOR_metatype_to_description.restype = c_char_p
     74 libextractor.EXTRACTOR_metatype_to_string.restype = c_char_p
     75 libextractor.EXTRACTOR_plugin_add_defaults.restype = c_void_p
     76 libextractor.EXTRACTOR_extract.argtypes = [c_void_p, c_char_p, c_void_p, c_size_t, EXTRACT_CB, c_void_p]
     77 
     78 
     79 EXTRACTOR_METAFORMAT_UNKNOWN = 0
     80 EXTRACTOR_METAFORMAT_UTF8 = 1
     81 EXTRACTOR_METAFORMAT_BINARY = 2
     82 EXTRACTOR_METAFORMAT_C_STRING = 3
     83 
     84 
     85 class Extractor(object):
     86     """
     87     Main class for extracting meta-data with GNU libextractor.
     88 
     89     You may create multiple instances of Extractor to use
     90     different sets of library.  Initially each Extractor
     91     will start with the default set of libraries.
     92 
     93     Use the extract method to obtain keywords from a file.
     94 
     95     Use the add and remove libraries methods to change the list of
     96     libraries that should be used.
     97     """
     98 
     99     def __init__(self, defaults=True, libraries=None):
    100         """
    101         Initialize Extractor's instance
    102 
    103         @param libraries: list of strings that contains extractor's name (supported types)
    104         @param defaults: load default plugins
    105         """
    106         self.extractors = None
    107         if defaults:
    108             self.extractors = libextractor.EXTRACTOR_plugin_add_defaults(0)
    109         if libraries:
    110             self.extractors = libextractor.EXTRACTOR_plugin_add_config(self.extractors, libraries, 0)
    111 
    112     def extract(self, proc, proc_cls, filename=None, data=None, size=0):
    113         """
    114         Extract keywords from a file, or from its data.
    115 
    116         @param filename: filename string
    117         @param data: data contents
    118         @param size: data size
    119         @param proc: function to call on each value
    120         @param proc_cls: closure to proc
    121 
    122         If you give data, size has to be given as well.
    123 
    124         """
    125         if not filename and not (data and size):
    126             return None
    127         else:
    128             libextractor.EXTRACTOR_extract(self.extractors, filename, data, size, EXTRACT_CB(proc), proc_cls)
    129 
    130     def addLibrary(self, library):
    131         """
    132         Add given library to the extractor. Invoke with a string with the name
    133         of the library that should be added.  For example,
    134 
    135         'libextractor_filename'
    136 
    137         will prepend the extractor that just adds the filename as a
    138         keyword.
    139 
    140         No errors are reported if the library is not
    141         found.
    142 
    143         @param library: library's name
    144         """
    145         self.extractors = libextractor.EXTRACTOR_plugin_add(self.extractors, library, NULL, 0)
    146 
    147     def removeLibrary(self, library):
    148         """
    149         Remove a library.  Pass the name of the library that is to
    150         be removed.  Only one library can be removed at a time.
    151         For example,
    152 
    153         'libextractor_pdf'
    154 
    155         removes the PDF extractor (if added).
    156         ValueError will be thrown if no library match.
    157 
    158         @param library: library's name
    159         """
    160 
    161         self.extractors = libextractor.EXTRACTOR_plugin_remove(self.extractors, library)
    162 
    163     def addLibraries(self, libraries):
    164         """
    165         Add given libraries.
    166         Same as addLibary but libraries is a list of library's names.
    167 
    168         @param libraries: list of libraries names
    169         """
    170 
    171         self.extractors = libextractor.EXTRACTOR_plugin_add_config(self.extractors, libraries)
    172 
    173     def removeAllLibraries(self):
    174         """
    175         Remove all libraries.
    176         """
    177 
    178         libextractor.EXTRACTOR_plugin_remove_all(self.extractors)
    179         self.extractors = None
    180 
    181     def keywordTypes(self):
    182         """
    183         Returns the list of all keywords types.
    184         @return: list of all keywords types
    185         """
    186         i = 0
    187         keyword_types = []
    188 
    189         while True:
    190             keyword_type = libextractor.EXTRACTOR_metatype_to_string(i)
    191             if not keyword_type:
    192                 break
    193             keyword_types.append(keyword_type)
    194             i += 1
    195 
    196         return tuple(keyword_types)
    197 
    198     def __del__(self):
    199         """
    200         >>> extractor = Extractor()
    201         >>> del extractor
    202         """
    203         if self.extractors:
    204             self.removeAllLibraries()
    205 
    206 
    207 if __name__ == "__main__":
    208     try:
    209         import doctest
    210         doctest.testmod()
    211     except Exception as e:
    212         print(e)