extractor.py (6531B)
# -*- coding: utf-8 -*-
# Python bindings for GNU libextractor
#
# Copyright (C) 2006 Bader Ladjemi <bader@tele2.fr>
# Copyright (C) 2011 Christian Grothoff <christian@grothoff.org>
# Copyright (C) 2017, 2018 Nikita Gillmann <nikita@n0.is>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
# USA.

"""
Python bindings for GNU libextractor

libextractor is a simple library for keyword extraction.  libextractor
does not support all formats but supports a simple plugging mechanism
such that you can quickly add extractors for additional formats, even
without recompiling libextractor.  libextractor typically ships with a
dozen helper-libraries that can be used to obtain keywords from common
file-types.

libextractor is a part of the GNU project (http://www.gnu.org/).
"""
# The wildcard import is kept on purpose: the module has always exposed the
# ctypes names, and the prototypes below rely on them.
from ctypes import *  # noqa: F401,F403
from ctypes import CDLL
from ctypes.util import find_library
import os
import sys
import logging
import faulthandler

# Dump a Python traceback if the native library crashes the interpreter.
faulthandler.enable()

# Can be useful:
# "DYLD_LIBRARY_PATH" in os.environ


def _load_libextractor():
    """
    Locate and load the libextractor shared library.

    Candidates are tried in order: the versioned soname on Linux, whatever
    ctypes.util.find_library() reports for 'extractor', and finally the
    bare name 'extractor' (what ``cdll.extractor`` resolves to).

    @return: the loaded CDLL handle
    @raise ImportError: if none of the candidates can be loaded
    """
    candidates = []
    if sys.platform.startswith('linux'):
        candidates.append('libextractor.so.3')
    # find_library() expects the bare name, without 'lib' prefix or suffix.
    located = find_library('extractor')
    if located:
        candidates.append(located)
    candidates.append('extractor')

    last_error = None
    for candidate in candidates:
        try:
            return CDLL(candidate)
        except OSError as err:
            last_error = err
    raise ImportError("Could not find shared 'libextractor' library.") from last_error


libextractor = _load_libextractor()


__all__ = ['Extractor']
__version__ = "1.7"
__licence__ = "GNU GPL"

# C enums are exchanged with the library as plain ints.
KeywordType = c_int
MetaType = c_int

# Signature of the per-item callback handed to EXTRACTOR_extract.
EXTRACT_CB = CFUNCTYPE(c_int, c_void_p, c_char_p, KeywordType, MetaType, c_char_p, c_void_p, c_size_t)

# Every function taking or returning the plugin list needs an explicit
# pointer prototype: ctypes otherwise assumes C int and would truncate the
# pointer on 64-bit platforms.
libextractor.EXTRACTOR_metatype_get_max.argtypes = []
libextractor.EXTRACTOR_metatype_get_max.restype = KeywordType
libextractor.EXTRACTOR_metatype_to_description.argtypes = [KeywordType]
libextractor.EXTRACTOR_metatype_to_description.restype = c_char_p
libextractor.EXTRACTOR_metatype_to_string.argtypes = [KeywordType]
libextractor.EXTRACTOR_metatype_to_string.restype = c_char_p
libextractor.EXTRACTOR_plugin_add_defaults.argtypes = [c_int]
libextractor.EXTRACTOR_plugin_add_defaults.restype = c_void_p
libextractor.EXTRACTOR_plugin_add.argtypes = [c_void_p, c_char_p, c_char_p, c_int]
libextractor.EXTRACTOR_plugin_add.restype = c_void_p
libextractor.EXTRACTOR_plugin_add_config.argtypes = [c_void_p, c_char_p, c_int]
libextractor.EXTRACTOR_plugin_add_config.restype = c_void_p
libextractor.EXTRACTOR_plugin_remove.argtypes = [c_void_p, c_char_p]
libextractor.EXTRACTOR_plugin_remove.restype = c_void_p
libextractor.EXTRACTOR_plugin_remove_all.argtypes = [c_void_p]
libextractor.EXTRACTOR_plugin_remove_all.restype = None
libextractor.EXTRACTOR_extract.argtypes = [c_void_p, c_char_p, c_void_p, c_size_t, EXTRACT_CB, c_void_p]
libextractor.EXTRACTOR_extract.restype = None


EXTRACTOR_METAFORMAT_UNKNOWN = 0
EXTRACTOR_METAFORMAT_UTF8 = 1
EXTRACTOR_METAFORMAT_BINARY = 2
EXTRACTOR_METAFORMAT_C_STRING = 3


def _as_bytes(value):
    """
    Convert a str or path-like value to bytes for a C ``const char *``.

    None and bytes are returned unchanged.
    """
    if value is None or isinstance(value, bytes):
        return value
    return os.fsencode(value)


def _plugin_config(libraries):
    """
    Build the plugin configuration string expected by libextractor.

    @param libraries: a ready-made config string (str or bytes), or an
                      iterable of library names, which are joined with ':'
    @return: the configuration as bytes
    """
    if isinstance(libraries, (str, bytes)):
        return _as_bytes(libraries)
    return b':'.join(_as_bytes(name) for name in libraries)


class Extractor(object):
    """
    Main class for extracting meta-data with GNU libextractor.

    You may create multiple instances of Extractor to use
    different sets of library.  Initially each Extractor
    will start with the default set of libraries.

    Use the extract method to obtain keywords from a file.

    Use the add and remove libraries methods to change the list of
    libraries that should be used.
    """

    def __init__(self, defaults=True, libraries=None):
        """
        Initialize Extractor's instance

        @param libraries: list of strings that contains extractor's name
                          (supported types), or a single ':'-separated
                          configuration string
        @param defaults: load default plugins
        """
        # Opaque handle to the native plugin list; None means "no plugins".
        self.extractors = None
        if defaults:
            self.extractors = libextractor.EXTRACTOR_plugin_add_defaults(0)
        if libraries:
            self.extractors = libextractor.EXTRACTOR_plugin_add_config(
                self.extractors, _plugin_config(libraries), 0)

    def extract(self, proc, proc_cls, filename=None, data=None, size=0):
        """
        Extract keywords from a file, or from its data.

        @param filename: filename string (str, bytes or path-like)
        @param data: data contents
        @param size: data size
        @param proc: function to call on each value; it receives the
                     arguments described by EXTRACT_CB
        @param proc_cls: closure to proc

        If you give data, size has to be given as well.
        Nothing is done when neither a filename nor data with a size
        is supplied.
        """
        if not filename and not (data and size):
            return None

        # Hold a reference to the ctypes thunk for the whole native call.
        callback = proc if isinstance(proc, EXTRACT_CB) else EXTRACT_CB(proc)
        # An empty filename is passed as NULL so the in-memory data is used.
        c_filename = _as_bytes(filename) if filename else None
        libextractor.EXTRACTOR_extract(self.extractors, c_filename, data, size, callback, proc_cls)
        return None

    def addLibrary(self, library):
        """
        Add given library to the extractor. Invoke with a string with the name
        of the library that should be added.  For example,

        'libextractor_filename'

        will prepend the extractor that just adds the filename as a
        keyword.

        No errors are reported if the library is not
        found.

        @param library: library's name
        """
        # No plugin options (NULL) and default flags (0).
        self.extractors = libextractor.EXTRACTOR_plugin_add(
            self.extractors, _as_bytes(library), None, 0)

    def removeLibrary(self, library):
        """
        Remove a library.  Pass the name of the library that is to
        be removed.  Only one library can be removed at a time.
        For example,

        'libextractor_pdf'

        removes the PDF extractor (if added).
        This wrapper does not raise if no library matches; the plugin
        list returned by libextractor is stored as-is.

        @param library: library's name
        """
        self.extractors = libextractor.EXTRACTOR_plugin_remove(
            self.extractors, _as_bytes(library))

    def addLibraries(self, libraries):
        """
        Add given libraries.
        Same as addLibrary but libraries is a list of library's names.

        @param libraries: list of libraries names, or a single
                          ':'-separated configuration string
        """
        self.extractors = libextractor.EXTRACTOR_plugin_add_config(
            self.extractors, _plugin_config(libraries), 0)

    def removeAllLibraries(self):
        """
        Remove all libraries.
        """
        libextractor.EXTRACTOR_plugin_remove_all(self.extractors)
        # The native list is gone; drop the handle so it is not reused.
        self.extractors = None

    def keywordTypes(self):
        """
        Returns the list of all keywords types.
        @return: tuple of all keywords types, as returned by the library
        """
        keyword_types = []
        index = 0
        while True:
            # A NULL result (None) ends the enumeration.
            keyword_type = libextractor.EXTRACTOR_metatype_to_string(index)
            if not keyword_type:
                break
            keyword_types.append(keyword_type)
            index += 1
        return tuple(keyword_types)

    def __del__(self):
        """
        >>> extractor = Extractor()
        >>> del extractor
        """
        # getattr: __init__ may have failed before the attribute was set.
        if getattr(self, 'extractors', None):
            self.removeAllLibraries()


if __name__ == "__main__":
    try:
        import doctest
        doctest.testmod()
    except Exception as e:
        print(e)