libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 304c2d93317f614b247d0d7d5cfcbea458e3e0d8
parent bff37ddc5da45ef43c94ea86820b25599a84806b
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 13 Dec 2009 23:02:19 +0000

new API for GNU libextractor, converted first 3 plugins as well

Diffstat:
M AUTHORS | 3 ++-
M ChangeLog | 4 ++++
M configure.ac | 12 ++++++------
M doc/extract.1 | 28 ++++++----------------------
M doc/libextractor.3 | 68 ++++++++++++++++++++++++--------------------------------------------
M doc/version.texi | 4 ++--
M src/include/Makefile.am | 1 -
M src/include/extractor.h | 789 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
D src/include/winproc.h | 44 --------------------------------------------
M src/main/Makefile.am | 28 ++--------------------------
M src/main/extract.c | 723 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/main/extractor.c | 2473 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/main/iconv.c | 5 +++--
D src/main/test_binary.c | 66 ------------------------------------------------------------------
D src/main/winproc.c | 51 ---------------------------------------------------
M src/plugins/Makefile.am | 87 +++++++++++++++++++++++++++----------------------------------------------------
D src/plugins/filenameextractor.c | 72 ------------------------------------------------------------------------
A src/plugins/html_extractor.c | 403 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/htmlextractor.c | 446 -------------------------------------------------------------------------------
A src/plugins/it_extractor.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/itextractor.c | 107 -------------------------------------------------------------------------------
D src/plugins/lowerextractor.c | 80 -------------------------------------------------------------------------------
A src/plugins/mime_extractor.c | 320 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/mimeextractor.c | 333 -------------------------------------------------------------------------------
D src/plugins/splitextractor.c | 157 -------------------------------------------------------------------------------
25 files changed, 3073 insertions(+), 3333 deletions(-)

diff --git a/AUTHORS b/AUTHORS @@ -1,6 +1,6 @@ Core Team: -Vidyut Samanta <vids@cs.ucla.edu> Christian Grothoff <christian@grothoff.org> +Nils Durner <durner@gnunet.org> Formats: html - core team with code from libhtmlparse 0.1.13, http://msalem.translator.cx/libhtmlparse.html @@ -55,6 +55,7 @@ Heiko Wundram <modelnine@ceosg.de> Ronan MELENNEC <ronan.melennec@cena.fr> Vasil Dimov <vd@freebsd.org> Pavol Rusnak <prusnak@suse.cz> +Vidyut Samanta <vids@cs.ucla.edu> Translations: German - Karl Eichwalder <ke@gnu.franken.de> diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,7 @@ +Sun Dec 13 16:53:35 CET 2009 + Starting with major API breakage with the goal to fix all of + the not-so-nice things that have accumulated since version 0.0.0. -CG + Sat Dec 5 11:32:30 CET 2009 Adding extraction of Iptc data using exiv2. diff --git a/configure.ac b/configure.ac @@ -1,14 +1,14 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.61) -AC_INIT([libextractor], [0.5.23], [bug-libextractor@gnu.org]) +AC_INIT([libextractor], [0.6.0], [bug-libextractor@gnu.org]) AC_CONFIG_AUX_DIR([libltdl/config]) -AM_INIT_AUTOMAKE([libextractor], [0.5.23]) +AM_INIT_AUTOMAKE([libextractor], [0.6.0]) AC_CONFIG_HEADERS([config.h]) AH_TOP([#define _GNU_SOURCE 1]) -LIB_VERSION_CURRENT=2 -LIB_VERSION_REVISION=1 -LIB_VERSION_AGE=1 +LIB_VERSION_CURRENT=3 +LIB_VERSION_REVISION=0 +LIB_VERSION_AGE=0 AC_SUBST(LIB_VERSION_CURRENT) AC_SUBST(LIB_VERSION_REVISION) AC_SUBST(LIB_VERSION_AGE) @@ -540,7 +540,7 @@ AC_CACHE_CHECK([whether -export-symbols-regex works], if test "x$gn_cv_export_symbols_regex_works" = "xyes" then LE_LIB_LDFLAGS="$LE_LIB_LDFLAGS -export-symbols-regex \"EXTRACTOR_@<:@a-zA-Z0-9_@:>@*\"" - LE_PLUGIN_LDFLAGS="$LE_PLUGIN_LDFLAGS -export-symbols-regex \"libextractor_@<:@a-zA-Z0-9_@:>@*_extract\"" + LE_PLUGIN_LDFLAGS="$LE_PLUGIN_LDFLAGS -export-symbols-regex \"EXTRACTOR_@<:@a-zA-Z0-9_@:>@*_extract\"" fi AC_SUBST(LE_LIB_LDFLAGS) AC_SUBST(LE_PLUGIN_LDFLAGS) diff --git 
a/doc/extract.1 b/doc/extract.1 @@ -1,4 +1,4 @@ -.TH EXTRACT 1 "Dec 29, 2006" "libextractor 0.5.17" +.TH EXTRACT 1 "Dec 14, 2009" "libextractor 0.6.0" .\" $Id .SH NAME extract @@ -6,7 +6,7 @@ extract .SH SYNOPSIS .B extract [ -.B \-abdfghLnrsvV +.B \-bghLnvV ] [ .B \-B @@ -32,7 +32,7 @@ extract \&... .br .SH DESCRIPTION -This manual page documents version 0.5.17 of the +This manual page documents version 0.6.0 of the .B extract command. .PP @@ -46,9 +46,6 @@ option. .SH OPTIONS .TP 8 -.B \-a -Do not remove any duplicates, even if the keywords match exactly and have the same type (i.e. because the same keyword was found by different extractor libraries). -.TP 8 .B \-b Display the output in BiBTeX format. This implies the .B \-d @@ -57,12 +54,6 @@ option .B \-B LANG Use the generic plaintext extractor for the language with the 2\-letter language code LANG. Supported languages are DA (Danish), DE (German), EN (English), ES (Spanish), FI (Finnish), FR (French), GA (Gaelic), IT (Italian), NO (Norwegian) and SV (Swedish). .TP 8 -.B \-d -Remove duplicates only if the types match exactly. By default, duplicates are removed if the types match or if one of the types is \I unknown (in this case, the duplicate of unknown type is removed). -.TP 8 -.B \-f -add the filename(s) (without directory) to the list of keywords. -.TP 8 .B \-g Use grep\-friendly output (all keywords on a single line for each file). Use the verbose option to print the filename first, followed by the keywords. Use the verbose option twice to also display the keyword types. This option will not print keyword types or non\-textual metadata. .TP 8 @@ -78,12 +69,6 @@ Print a list of all known keyword types. .B \-n Do not use the default set of extractors (typically all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime\-types), use only the extractors specified with the .B \-l option. .TP 8 -.B \-r -Remove all duplicates disregarding differences in the keyword type. 
-.TP 8 -.B \-s -Split keywords at delimiters (space, comma, colon, etc.) and list split keywords to be of .I unknown type. This can also be done by loading the split\-library. Using this option guarantees that the splitting is performed after all other libraries have been run. It is always performed before duplicate elimination. -.TP 8 .B \-v Print the version number and exit. .TP 8 @@ -111,10 +96,9 @@ $ extract test/test.jpg comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1 mimetype \- image/jpeg -$ extract \-Vf \-x comment test/test.jpg +$ extract \-V \-x comment test/test.jpg Keywords for file test/test.jpg: mimetype \- image/jpeg -filename \- test.jpg $ extract \-p comment test/test.jpg comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1 @@ -125,7 +109,7 @@ Keywords for file test/test.png: comment \- Testing keyword extraction .SH LEGAL NOTICE -libextractor and the extract tool are released under the GPL. libextractor is a GNU project. +libextractor and the extract tool are released under the GPL. libextractor is a GNU package. .SH BUGS A couple of file\-formats (on the order of 10^3) are not recognized... @@ -138,4 +122,4 @@ to contact the current maintainer(s). 
.SH AVAILABILITY You can obtain the original author's latest version from -http://gnunet.org/libextractor/ +http://www.gnu.org/software/libextractor/ diff --git a/doc/libextractor.3 b/doc/libextractor.3 @@ -1,75 +1,55 @@ -.TH LIBEXTRACTOR 3 "Jul 14, 2005" +.TH LIBEXTRACTOR 3 "Dec 14, 2009" .SH NAME -libextractor \- meta\-information extraction library 0.5.11 +libextractor \- meta\-information extraction library 0.6.0 .SH SYNOPSIS \fB#include <extractor.h> - \fBtypedef struct EXTRACTOR_Keywords { - char * \fIkeyword\fB; - EXTRACTOR_KeywordType \fIkeywordType\fB; - struct EXTRACTOR_Keywords * \fInext\fB; - } EXTRACTOR_KeywordList;\FB +\fBconst char *EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType \fItype\fB); +\fBconst char *EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType \fItype\fB); - \fBEXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries (); +\fBenum EXTRACTOR_MetaTypeEXTRACTOR_metatype_get_max (void); - \fBconst char * EXTRACTOR_getKeywordTypeAsString (const EXTRACTOR_KeywordType \fItype\fB); +\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options \fIflags\fB); - \fBEXTRACTOR_ExtractorList * EXTRACTOR_loadConfigLibraries (EXTRACTOR_ExtractorList * \fIprev\fB, const char * \fIconfig\fB); +\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * \fIprev\fB, const char * \fIlibrary\fB, const char * \fIoptions\fB, enum EXTRACTOR_Options \fIflags\fB); - \fBEXTRACTOR_ExtractorList * EXTRACTOR_addLibrary (EXTRACTOR_ExtractorList * \fIprev\fB, const char * \fIlibrary\fB); - \fBEXTRACTOR_ExtractorList * EXTRACTOR_addLibraryLast (EXTRACTOR_ExtractorList * \fIprev\fB, const char * \fIlibrary\fB); +\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_last(struct EXTRACTOR_PluginList *\fIprev\fB, const char *\fIlibrary\fB, const char *\fIoptions\fB, enum EXTRACTOR_Options \fIflags\fB); - \fBEXTRACTOR_ExtractorList * EXTRACTOR_removeLibrary (EXTRACTOR_ExtractorList * \fIprev\fB, const char * 
\fIlibrary\fB); +\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * \fIprev\fB, const char *\fIconfig\fB, enum EXTRACTOR_Options \fIflags\fB); + +\fBstruct EXTRACTOR_PluginList *EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * \fIprev\fB, const char * \fIlibrary\fB); - \fBvoid EXTRACTOR_removeAll (EXTRACTOR_ExtractorList * \fIprev\fB); +\fBvoid EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *\fIplugins\fB); - \fBEXTRACTOR_KeywordList * EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * \fIextractor\fB, const char * \fIfilename\fB); +\fBvoid EXTRACTOR_extract(struct EXTRACTOR_PluginList *\fIplugins\fB, const char *\fIfilename\fB, const void *\fIdata\fB, size_t \fIsize\fB, EXTRACTOR_MetaDataProcessor \fIproc\fB, void *\fIproc_cls\fB); - \fBEXTRACTOR_KeywordList * EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * \fIextractor\fB, const char * \fIdata\fB, size_t \fIsize\fB); +\fBint EXTRACTOR_meta_data_print(void * \fIhandle\fB, const char *\fIplugin_name\fB, enum EXTRACTOR_MetaType \fItype\fB, enum EXTRACTOR_MetaFormat \fIformat\fB, const char *\fIdata_mime_type\fB, const char *\fIdata\fB, size_t \fIdata_len\fB); - \fBEXTRACTOR_KeywordList * EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * \fIlist\fB); - - \fBEXTRACTOR_KeywordList * EXTRACTOR_removeDuplicateKeywords (EXTRACTOR_KeywordList * \fIlist\fB, const unsigned int \fIoptions\fB); - - \fBvoid EXTRACTOR_printKeywords (FILE * \fIhandle\fB, EXTRACTOR_KeywordList * \fIkeywords\fB); - - \fBvoid EXTRACTOR_freeKeywords (EXTRACTOR_KeywordList * \fIkeywords\fB); - - \fBconst char * EXTRACTOR_extractLast (const EXTRACTOR_KeywordType * \fItype\fB, EXTRACTOR_KeywordList * \fIkeywords\fB); - - \fBconst char * EXTRACTOR_extractLastByString (const char * \fItype\fB, EXTRACTOR_KeywordList * \fIkeywords\fB); - - \fBunsigned int EXTRACTOR_countKeywords (EXTRACTOR_KeywordList * \fIkeywords\fB); - - \fBEXTRACTOR_DEFAULT_LIBRARIES - - \fBEXTRACTOR_VERSION +\fBEXTRACTOR_VERSION 
.SH DESCRIPTION .P -libextractor is a simple library for keyword extraction. libExtractor does not support all formats but supports a simple plugging mechanism such that you can quickly add extractors for additional formats, even without recompiling libExtractor. libExtractor typically ships with one or more helper-libraries that can be used to obtain keywords from common file-types. If you want to write your own extractor for some filetype, all you need to do is write a little library that implements a single method with this signature: +GNU libextractor is a simple library for keyword extraction. libextractor does not support all formats but supports a simple plugging mechanism such that you can quickly add extractors for additional formats, even without recompiling libextractor. libextractor typically ships with dozens of plugins that can be used to obtain meta data from common file-types. If you want to write your own plugin for some filetype, all you need to do is write a little library that implements a single method with this signature: - \fBEXTRACTOR_KeywordList * LIBRARYNAME_extract(const char * \fIfilename\fB, - char * \fIdata\fB, - size_t \fIsize\fB, - EXTRACTOR_KeywordList * \fIprev\fB); + \fBint EXTRACTOR_name_extract(const char *\fIdata\fB, size_t \fIdatasize\fB, EXTRACTOR_MetaDataProcessor \fIproc\fB, void *\fIproc_cls\fB, const char *\fIoptions\fB); .P -The filename is the name of the file, data is a pointer to the contents of the file and size is the size of the file. The extract method must prepend keywords that it finds to the linked list 'prev' and return the new head. The library must allocate (malloc) the entry in the keyword list and the memory for the filename since both will be free'ed by libExtractor once the application calls freeKeywords. An example implementation can be found in \fImp3extractor.c\fP. The application extract gives an example how to use libExtractor. 
- +Data is a pointer to the contents of the file and datasize is the size of data. The extract method must call proc for meta data that it finds. The interpretation of options is up to the plugin. The function should return 0 if 'proc' always returned 0, otherwise 1. After 'proc' returned a non-zero value, proc should not be called again. An example implementation can be found in \fIhtml_extractor.c\fP. Plugins should be automatically found and used once they are installed in the respective directory (typically something like /usr/lib/libextractor/). +.P +The application extract gives an example how to use libextractor. .P -The basic use of libextractor is to load the plugins (for example with \fBEXTRACTOR_loadDefaultLibraries\fP), then to extract the keyword list using \fBEXTRACTOR_getKeywords\fP, processing the list (using application specific code and possibly some of the postprocessing convenience functions like \fBEXTRACTOR_removeDuplicateKeywords\fP), freeing the keyword list (using \fBEXTRACTOR_freeKeywords\fP) and finally unloading the plugins (with \fBEXTRACTOR_removeAll\fP). +The basic use of libextractor is to load the plugins (for example with \fBEXTRACTOR_plugin_add_defaults\fP), then to extract the keyword list using \fBEXTRACTOR_extract\fP, and finally unloading the plugins (with \fBEXTRACTOR_plugin_remove_all\fP). .P -The keywords obtained from libextractor are supposed to be UTF-8 encoded. The EXTRACTOR_printKeywords function converts the UTF-8 keywords to the character set from the current locale before printing them. Plugins are supposed to convert meta-data to UTF-8 if necessary. +Textual meta data obtained from libextractor is supposed to be UTF-8 encoded if the text encoding is known. Plugins are supposed to convert meta-data to UTF-8 if necessary. The EXTRACTOR_meta_data_print function converts the UTF-8 keywords to the character set from the current locale before printing them. 
.P .SH "SEE ALSO" extract(1) .SH LEGAL NOTICE -libextractor is released under the GPL and a GNU project (http://www.gnu.org/). +libextractor is released under the GPL and a GNU package (http://www.gnu.org/). .SH BUGS A couple of file-formats (on the order of 10^3) are not recognized... @@ -78,4 +58,4 @@ A couple of file-formats (on the order of 10^3) are not recognized... extract was originally written by Christian Grothoff <christian@grothoff.org> and Vidyut Samanta <vids@cs.ucla.edu>. Use <libextractor@gnu.org> to contact the current maintainer(s). .SH AVAILABILITY -You can obtain the original author's latest version from http://gnunet.org/libextractor/. +You can obtain the original author's latest version from http://www.gnu.org/software/libextractor/. diff --git a/doc/version.texi b/doc/version.texi @@ -1,4 +1,4 @@ @set UPDATED 1 October 2009 @set UPDATED-MONTH October 2009 -@set EDITION 0.5.23 -@set VERSION 0.5.23 +@set EDITION 0.6.0 +@set VERSION 0.6.0 diff --git a/src/include/Makefile.am b/src/include/Makefile.am @@ -3,6 +3,5 @@ include_HEADERS = \ extractor.h EXTRA_DIST = \ plibc.h \ - winproc.h \ platform.h \ gettext.h diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -1,6 +1,6 @@ /* This file is part of libextractor. 
- (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2004, 2005, 2006, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -32,432 +32,503 @@ extern "C" { * 0.2.6-1 => 0x00020601 * 4.5.2-0 => 0x04050200 */ -#define EXTRACTOR_VERSION 0x00052301 +#define EXTRACTOR_VERSION 0x00060000 #include <stdio.h> -/* ignore the 'type' of the keyword when eliminating duplicates */ -#define EXTRACTOR_DUPLICATES_TYPELESS 1 -/* remove type 'UNKNOWN' if there is a duplicate keyword of - known type, even if usually different types should be - preserved */ -#define EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN 2 - -#define EXTRACTOR_DEFAULT_LIBRARIES EXTRACTOR_getDefaultLibraries() - -const char * EXTRACTOR_getDefaultLibraries(void); /** - * Enumeration defining various sources of keywords. - * See also - * http://dublincore.org/documents/1998/09/dces/ + * Options for how plugin execution should be done. 
*/ -typedef enum { - EXTRACTOR_UNKNOWN = 0, - EXTRACTOR_FILENAME = 1, - EXTRACTOR_MIMETYPE = 2, - EXTRACTOR_TITLE = 3, - EXTRACTOR_AUTHOR = 4, - EXTRACTOR_ARTIST = 5, - EXTRACTOR_DESCRIPTION = 6, - EXTRACTOR_COMMENT = 7, - EXTRACTOR_DATE = 8, - EXTRACTOR_PUBLISHER = 9, - EXTRACTOR_LANGUAGE = 10, - EXTRACTOR_ALBUM = 11, - EXTRACTOR_GENRE = 12, - EXTRACTOR_LOCATION = 13, - EXTRACTOR_VERSIONNUMBER = 14, - EXTRACTOR_ORGANIZATION = 15, - EXTRACTOR_COPYRIGHT = 16, - EXTRACTOR_SUBJECT = 17, - EXTRACTOR_KEYWORDS = 18, - EXTRACTOR_CONTRIBUTOR = 19, - EXTRACTOR_RESOURCE_TYPE = 20, - EXTRACTOR_FORMAT = 21, - EXTRACTOR_RESOURCE_IDENTIFIER = 22, - EXTRACTOR_SOURCE = 23, - EXTRACTOR_RELATION = 24, - EXTRACTOR_COVERAGE = 25, - EXTRACTOR_SOFTWARE = 26, - EXTRACTOR_DISCLAIMER = 27, - EXTRACTOR_WARNING = 28, - EXTRACTOR_TRANSLATED = 29, - EXTRACTOR_CREATION_DATE = 30, - EXTRACTOR_MODIFICATION_DATE = 31, - EXTRACTOR_CREATOR = 32, - EXTRACTOR_PRODUCER = 33, - EXTRACTOR_PAGE_COUNT = 34, - EXTRACTOR_PAGE_ORIENTATION = 35, - EXTRACTOR_PAPER_SIZE = 36, - EXTRACTOR_USED_FONTS = 37, - EXTRACTOR_PAGE_ORDER = 38, - EXTRACTOR_CREATED_FOR = 39, - EXTRACTOR_MAGNIFICATION = 40, - EXTRACTOR_RELEASE = 41, - EXTRACTOR_GROUP = 42, - EXTRACTOR_SIZE = 43, - EXTRACTOR_SUMMARY = 44, - EXTRACTOR_PACKAGER = 45, - EXTRACTOR_VENDOR = 46, - EXTRACTOR_LICENSE = 47, - EXTRACTOR_DISTRIBUTION = 48, - EXTRACTOR_BUILDHOST = 49, - EXTRACTOR_OS = 50, - EXTRACTOR_DEPENDENCY = 51, - EXTRACTOR_HASH_MD4 = 52, - EXTRACTOR_HASH_MD5 = 53, - EXTRACTOR_HASH_SHA0 = 54, - EXTRACTOR_HASH_SHA1 = 55, - EXTRACTOR_HASH_RMD160 = 56, - EXTRACTOR_RESOLUTION = 57, - EXTRACTOR_CATEGORY = 58, - EXTRACTOR_BOOKTITLE = 59, - EXTRACTOR_PRIORITY = 60, - EXTRACTOR_CONFLICTS = 61, - EXTRACTOR_REPLACES = 62, - EXTRACTOR_PROVIDES = 63, - EXTRACTOR_CONDUCTOR = 64, - EXTRACTOR_INTERPRET = 65, - EXTRACTOR_OWNER = 66, - EXTRACTOR_LYRICS = 67, - EXTRACTOR_MEDIA_TYPE = 68, - EXTRACTOR_CONTACT = 69, - EXTRACTOR_THUMBNAIL_DATA = 70, - 
EXTRACTOR_PUBLICATION_DATE = 71, - EXTRACTOR_CAMERA_MAKE = 72, - EXTRACTOR_CAMERA_MODEL = 73, - EXTRACTOR_EXPOSURE = 74, - EXTRACTOR_APERTURE = 75, - EXTRACTOR_EXPOSURE_BIAS = 76, - EXTRACTOR_FLASH = 77, - EXTRACTOR_FLASH_BIAS = 78, - EXTRACTOR_FOCAL_LENGTH = 79, - EXTRACTOR_FOCAL_LENGTH_35MM = 80, - EXTRACTOR_ISO_SPEED = 81, - EXTRACTOR_EXPOSURE_MODE = 82, - EXTRACTOR_METERING_MODE = 83, - EXTRACTOR_MACRO_MODE = 84, - EXTRACTOR_IMAGE_QUALITY = 85, - EXTRACTOR_WHITE_BALANCE = 86, - EXTRACTOR_ORIENTATION = 87, - EXTRACTOR_TEMPLATE = 88, - EXTRACTOR_SPLIT = 89, - EXTRACTOR_PRODUCTVERSION = 90, - EXTRACTOR_LAST_SAVED_BY = 91, - EXTRACTOR_LAST_PRINTED = 92, - EXTRACTOR_WORD_COUNT = 93, - EXTRACTOR_CHARACTER_COUNT = 94, - EXTRACTOR_TOTAL_EDITING_TIME = 95, - EXTRACTOR_THUMBNAILS = 96, - EXTRACTOR_SECURITY = 97, - EXTRACTOR_CREATED_BY_SOFTWARE = 98, - EXTRACTOR_MODIFIED_BY_SOFTWARE = 99, - EXTRACTOR_REVISION_HISTORY = 100, - EXTRACTOR_LOWERCASE = 101, - EXTRACTOR_COMPANY = 102, - EXTRACTOR_GENERATOR = 103, - EXTRACTOR_CHARACTER_SET = 104, - EXTRACTOR_LINE_COUNT = 105, - EXTRACTOR_PARAGRAPH_COUNT = 106, - EXTRACTOR_EDITING_CYCLES = 107, - EXTRACTOR_SCALE = 108, - EXTRACTOR_MANAGER = 109, - EXTRACTOR_MOVIE_DIRECTOR = 110, - EXTRACTOR_DURATION = 111, - EXTRACTOR_INFORMATION = 112, - EXTRACTOR_FULL_NAME = 113, - EXTRACTOR_CHAPTER = 114, - EXTRACTOR_YEAR = 115, - EXTRACTOR_LINK = 116, - EXTRACTOR_MUSIC_CD_IDENTIFIER = 117, - EXTRACTOR_PLAY_COUNTER = 118, - EXTRACTOR_POPULARITY_METER = 119, - EXTRACTOR_CONTENT_TYPE = 120, - EXTRACTOR_ENCODED_BY = 121, - EXTRACTOR_TIME = 122, - EXTRACTOR_MUSICIAN_CREDITS_LIST = 123, - EXTRACTOR_MOOD = 124, - EXTRACTOR_FORMAT_VERSION = 125, - EXTRACTOR_TELEVISION_SYSTEM = 126, - EXTRACTOR_SONG_COUNT = 127, - EXTRACTOR_STARTING_SONG = 128, - EXTRACTOR_HARDWARE_DEPENDENCY = 129, - EXTRACTOR_RIPPER = 130, - EXTRACTOR_FILE_SIZE = 131, - EXTRACTOR_TRACK_NUMBER = 132, - EXTRACTOR_ISRC = 133, - EXTRACTOR_DISC_NUMBER = 134, - 
EXTRACTOR_GNUNET_DISPLAY_TYPE = 135, - EXTRACTOR_GNUNET_ECBC_URI = 136, - EXTRACTOR_GNUNET_FULL_DATA = 137, - EXTRACTOR_LOCATION_CITY = 138, - EXTRACTOR_LOCATION_COUNTRY = 139, - EXTRACTOR_LOCATION_SUBLOCATION = 140, - EXTRACTOR_GPS_LATITUDE_REF = 141, - EXTRACTOR_GPS_LATITUDE = 142, - EXTRACTOR_GPS_LONGITUDE_REF = 143, - EXTRACTOR_GPS_LONGITUDE = 144, - EXTRACTOR_RATING = 145, - EXTRACTOR_COUNTRY_CODE = 146 -} EXTRACTOR_KeywordType; +enum EXTRACTOR_Options + { + /** + * Run plugins in-process. + */ + EXTRACTOR_OPTION_NONE = 0, + + /** + * Run plugins out-of-process, starting the process + * once at the time the plugin is loaded. This will + * prevent the main process crashing if a plugin dies. + * Ignored on platforms where out-of-process starts + * are not supported. + */ + EXTRACTOR_OPTION_OUT_OF_PROCESS = 1, + + /** + * If a plugin crashes, automatically restart the respective + * process for the next file. Implies + * EXTRACTOR_OPTION_OUT_OF_PROCESS. + */ + EXTRACTOR_OPTION_AUTO_RESTART = 2 + + }; -/** - * Test if a given LE type contains binary data. - */ -#define EXTRACTOR_isBinaryType(type) (type == EXTRACTOR_THUMBNAIL_DATA) /** - * A linked list of keywords. This structure is passed around - * in libExtractor and is typically the result of any keyword - * extraction operation. - * <p> - * Each entry in the keyword list consists of a string (the - * keyword) and the keyword type (of type KeywordType) - * describing how/from where the keyword was obtained. + * Format in which the extracted meta data is presented. */ -typedef struct EXTRACTOR_Keywords { - /* the keyword that was found */ - char * keyword; - /* the type of the keyword (classification) */ - EXTRACTOR_KeywordType keywordType; - /* the next entry in the list */ - struct EXTRACTOR_Keywords * next; -} EXTRACTOR_KeywordList; +enum EXTRACTOR_MetaFormat + { + /** + * Format is unknown. + */ + EXTRACTOR_METAFORMAT_UNKNOWN = 0, + + /** + * 0-terminated, UTF-8 encoded string. 
"data_len" + * is strlen(data)+1. + */ + EXTRACTOR_METAFORMAT_UTF8 = 1, + + /** + * Some kind of binary format, see given Mime type. + */ + EXTRACTOR_METAFORMAT_BINARY = 2, + + /** + * 0-terminated string. The specific encoding is unknown. + * "data_len" is strlen(data)+1. + */ + EXTRACTOR_METAFORMAT_C_STRING = 3 + }; -/** - * Signature of the extract method that each plugin - * must provide. - * - * @param filename MAYBE NULL (!) - * @param data must not be modified (!) - */ -typedef EXTRACTOR_KeywordList * -(*ExtractMethod)(const char * filename, - char * data, - size_t filesize, - EXTRACTOR_KeywordList * next, - const char * options); /** - * Linked list of extractor helper-libraries. An application - * builds this list by telling libextractor to load various - * keyword-extraction libraries. Libraries can also be unloaded - * (removed from this list, see removeLibrary). - * <p> - * Client code should never be concerned with the internals of - * this struct. + * Enumeration defining various sources of keywords. 
See also + * http://dublincore.org/documents/1998/09/dces/ */ -typedef struct EXTRACTOR_Extractor { - void * libraryHandle; - char * libname; - ExtractMethod extractMethod; - struct EXTRACTOR_Extractor * next; - char * options; -} EXTRACTOR_ExtractorList; +enum EXTRACTOR_MetaType + { + /* fundamental types */ + EXTRACTOR_METATYPE_RESERVED = 0, + EXTRACTOR_METATYPE_MIMETYPE = 1, + EXTRACTOR_METATYPE_FILENAME = 2, + EXTRACTOR_METATYPE_COMMENT = 3, + + /* Standard types from bibtex */ + EXTRACTOR_METATYPE_TITLE = 4, + EXTRACTOR_METATYPE_BOOK_TITLE = 5, + EXTRACTOR_METATYPE_BOOK_EDITION = 6, + EXTRACTOR_METATYPE_BOOK_CHAPTER_NUMBER = 7, + EXTRACTOR_METATYPE_JOURNAL_NAME = 8, + EXTRACTOR_METATYPE_JOURNAL_VOLUME = 9, + EXTRACTOR_METATYPE_JOURNAL_NUMBER = 10, + EXTRACTOR_METATYPE_PAGE_COUNT = 11, + EXTRACTOR_METATYPE_PAGE_RANGE = 12, + EXTRACTOR_METATYPE_AUTHOR_NAME = 13, + EXTRACTOR_METATYPE_AUTHOR_EMAIL = 14, + EXTRACTOR_METATYPE_AUTHOR_INSTITUTION = 15, + EXTRACTOR_METATYPE_PUBLISHER = 16, + EXTRACTOR_METATYPE_PUBLISHER_ADDRESS = 17, + EXTRACTOR_METATYPE_PUBLISHER_INSTITUTION = 18, + EXTRACTOR_METATYPE_PUBLISHER_SERIES = 19, + EXTRACTOR_METATYPE_PUBLICATION_TYPE = 20, + EXTRACTOR_METATYPE_PUBLICATION_YEAR = 21, + EXTRACTOR_METATYPE_PUBLICATION_MONTH = 22, + EXTRACTOR_METATYPE_PUBLICATION_DAY = 23, + EXTRACTOR_METATYPE_PUBLICATION_DATE = 24, + EXTRACTOR_METATYPE_BIBTEX_EPRINT = 25, + EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE = 26, + EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE = 27, + EXTRACTOR_METATYPE_CREATION_TIME = 28, + EXTRACTOR_METATYPE_URL = 29, + + /* "unique" document identifiers */ + EXTRACTOR_METATYPE_URI = 30, + EXTRACTOR_METATYPE_ISRC = 31, + EXTRACTOR_METATYPE_HASH_MD4 = 32, + EXTRACTOR_METATYPE_HASH_MD5 = 33, + EXTRACTOR_METATYPE_HASH_SHA0 = 34, + EXTRACTOR_METATYPE_HASH_SHA1 = 35, + EXTRACTOR_METATYPE_HASH_RMD160 = 36, + + /* identifiers of a location */ + EXTRACTOR_METATYPE_GPS_LATITUDE_REF = 37, + EXTRACTOR_METATYPE_GPS_LATITUDE = 38, + 
EXTRACTOR_METATYPE_GPS_LONGITUDE_REF = 39, + EXTRACTOR_METATYPE_GPS_LONGITUDE = 40, + EXTRACTOR_METATYPE_LOCATION_CITY = 41, + EXTRACTOR_METATYPE_LOCATION_SUBLOCATION = 42, + EXTRACTOR_METATYPE_LOCATION_COUNTRY = 43, + EXTRACTOR_METATYPE_LOCATION_COUNTRY_CODE = 44, + + /* generic attributes */ + EXTRACTOR_METATYPE_UNKNOWN = 45, + EXTRACTOR_METATYPE_DESCRIPTION = 46, + EXTRACTOR_METATYPE_COPYRIGHT = 47, + EXTRACTOR_METATYPE_RIGHTS = 48, + EXTRACTOR_METATYPE_KEYWORDS = 49, + EXTRACTOR_METATYPE_ABSTRACT = 50, + EXTRACTOR_METATYPE_SUMMARY = 51, + EXTRACTOR_METATYPE_SUBJECT = 52, + EXTRACTOR_METATYPE_CREATOR = 53, + EXTRACTOR_METATYPE_FORMAT = 54, + EXTRACTOR_METATYPE_FORMAT_VERSION = 55, + + /* processing history */ + EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE = 56, + EXTRACTOR_METATYPE_UNKNOWN_DATE = 57, + EXTRACTOR_METATYPE_CREATION_DATE = 58, + EXTRACTOR_METATYPE_MODIFICATION_DATE = 59, + EXTRACTOR_METATYPE_LAST_PRINTED = 60, + EXTRACTOR_METATYPE_LAST_SAVED_BY = 61, + EXTRACTOR_METATYPE_TOTAL_EDITING_TIME = 62, + EXTRACTOR_METATYPE_EDITING_CYCLES = 63, + EXTRACTOR_METATYPE_MODIFIED_BY_SOFTWARE = 64, + EXTRACTOR_METATYPE_REVISION_HISTORY = 65, + + /* FIXME... 
*/ + + /* software package specifics (deb, rpm, tgz) */ + EXTRACTOR_METATYPE_PACKAGER = 45, + EXTRACTOR_METATYPE_VENDOR = 46, + EXTRACTOR_METATYPE_LICENSE = 47, + EXTRACTOR_METATYPE_DISTRIBUTION = 48, + EXTRACTOR_METATYPE_BUILDHOST = 49, + EXTRACTOR_METATYPE_TARGET_OS = 50, + EXTRACTOR_METATYPE_DEPENDENCY = 51, + EXTRACTOR_METATYPE_CONFLICTS = 61, + EXTRACTOR_METATYPE_REPLACES = 62, + EXTRACTOR_METATYPE_PROVIDES = 63, + + /* (text) document processing specifics */ + EXTRACTOR_METATYPE_CHARACTER_SET = 104, + EXTRACTOR_METATYPE_LINE_COUNT = 105, + EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106, + EXTRACTOR_METATYPE_WORD_COUNT = 93, + EXTRACTOR_METATYPE_CHARACTER_COUNT = 94, + EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35, + EXTRACTOR_METATYPE_PAPER_SIZE = 36, + EXTRACTOR_METATYPE_USED_FONTS = 37, + EXTRACTOR_METATYPE_PAGE_ORDER = 38, + + /* music / video specifics */ + EXTRACTOR_METATYPE_LYRICS = 67, + EXTRACTOR_METATYPE_CONDUCTOR = 64, + EXTRACTOR_METATYPE_INTERPRET = 65, + EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, + EXTRACTOR_METATYPE_PLAY_COUNTER = 118, + EXTRACTOR_METATYPE_DURATION = 111, + EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 110, + EXTRACTOR_METATYPE_SONG_COUNT = 127, + EXTRACTOR_METATYPE_STARTING_SONG = 128, + EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123, + EXTRACTOR_METATYPE_TRACK_NUMBER = 132, + EXTRACTOR_METATYPE_DISC_NUMBER = 134, + EXTRACTOR_METATYPE_ALBUM = 11, + EXTRACTOR_METATYPE_ARTIST = 5, + EXTRACTOR_METATYPE_GENRE = 12, + + /* image specifics */ + EXTRACTOR_METATYPE_THUMBNAIL_DATA = 70, + EXTRACTOR_METATYPE_RESOLUTION = 57, + EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 43, + EXTRACTOR_METATYPE_SCALE = 108, + + /* photography specifics */ + EXTRACTOR_METATYPE_CAMERA_MAKE = 72, + EXTRACTOR_METATYPE_CAMERA_MODEL = 73, + EXTRACTOR_METATYPE_EXPOSURE = 74, + EXTRACTOR_METATYPE_APERTURE = 75, + EXTRACTOR_METATYPE_EXPOSURE_BIAS = 76, + EXTRACTOR_METATYPE_FLASH = 77, + EXTRACTOR_METATYPE_FLASH_BIAS = 78, + EXTRACTOR_METATYPE_FOCAL_LENGTH = 79, + 
EXTRACTOR_METATYPE_FOCAL_LENGTH_35MM = 80, + EXTRACTOR_METATYPE_ISO_SPEED = 81, + EXTRACTOR_METATYPE_EXPOSURE_MODE = 82, + EXTRACTOR_METATYPE_METERING_MODE = 83, + EXTRACTOR_METATYPE_MACRO_MODE = 84, + EXTRACTOR_METATYPE_IMAGE_QUALITY = 85, + EXTRACTOR_METATYPE_WHITE_BALANCE = 86, + EXTRACTOR_METATYPE_ORIENTATION = 87, + EXTRACTOR_METATYPE_MAGNIFICATION = 40, + + /* numeric metrics */ + EXTRACTOR_METATYPE_POPULARITY_METER = 119, + EXTRACTOR_METATYPE_RATING = 145, + EXTRACTOR_METATYPE_PRIORITY = 60, + + /* gnunet specific attributes */ + EXTRACTOR_METATYPE_GNUNET_DISPLAY_TYPE = 135, + EXTRACTOR_METATYPE_GNUNET_ECBC_URI = 136, + + + /* misc (see if these are still needed...) */ + + EXTRACTOR_METATYPE_GENERATOR = 103, + EXTRACTOR_METATYPE_ENCODED_BY = 121, + EXTRACTOR_METATYPE_PRODUCTVERSION = 90, + + EXTRACTOR_METATYPE_DISCLAIMER = 27, + EXTRACTOR_METATYPE_FILE_SIZE = 131, + EXTRACTOR_METATYPE_FULL_DATA = 137, + EXTRACTOR_METATYPE_VERSIONNUMBER = 14, + + EXTRACTOR_METATYPE_ORGANIZATION = 15, + EXTRACTOR_METATYPE_CONTRIBUTOR = 19, + EXTRACTOR_METATYPE_RESOURCE_TYPE = 20, + EXTRACTOR_METATYPE_SOURCE = 23, + EXTRACTOR_METATYPE_RELATION = 24, + EXTRACTOR_METATYPE_COVERAGE = 25, + EXTRACTOR_METATYPE_SOFTWARE = 26, + EXTRACTOR_METATYPE_WARNING = 28, + EXTRACTOR_METATYPE_TRANSLATED = 29, + EXTRACTOR_METATYPE_PRODUCER = 33, + EXTRACTOR_METATYPE_CREATED_FOR = 39, + EXTRACTOR_METATYPE_RELEASE = 41, + EXTRACTOR_METATYPE_GROUP = 42, + EXTRACTOR_METATYPE_CATEGORY = 58, + EXTRACTOR_METATYPE_OWNER = 66, + EXTRACTOR_METATYPE_MEDIA_TYPE = 68, + EXTRACTOR_METATYPE_CONTACT = 69, + EXTRACTOR_METATYPE_TEMPLATE = 88, + EXTRACTOR_METATYPE_SECURITY = 97, + EXTRACTOR_METATYPE_COMPANY = 102, + EXTRACTOR_METATYPE_MANAGER = 109, + EXTRACTOR_METATYPE_INFORMATION = 112, + EXTRACTOR_METATYPE_FULL_NAME = 113, + EXTRACTOR_METATYPE_LINK = 116, + EXTRACTOR_METATYPE_TIME = 122, + EXTRACTOR_METATYPE_MOOD = 124, + EXTRACTOR_METATYPE_TELEVISION_SYSTEM = 126, + EXTRACTOR_METATYPE_HARDWARE_DEPENDENCY = 129, 
+ EXTRACTOR_METATYPE_RIPPER = 130, + }; -/** - * Load the default set of libraries. - * @return the default set of libraries. - */ -EXTRACTOR_ExtractorList * EXTRACTOR_loadDefaultLibraries(void); /** * Get the textual name of the keyword. - * @return NULL if the type is not known + * + * @param type meta type to get a UTF-8 string for + * @return NULL if the type is not known, otherwise + * an English (locale: C) string describing the type; + * translate using 'dgettext ("libextractor", rval)' */ const char * -EXTRACTOR_getKeywordTypeAsString(EXTRACTOR_KeywordType type); +EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType type); -/** - * Return the highest type number, exclusive as in [0,highest). - */ -EXTRACTOR_KeywordType -EXTRACTOR_getHighestKeywordTypeNumber(void); /** - * Load multiple libraries as specified by the user. - * @param config a string given by the user that defines which - * libraries should be loaded. Has the format - * "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*". - * For example, - * libextractor_mp3.so:libextractor_ogg.so loads the - * mp3 and the ogg library. The '-' before the LIBRARYNAME - * indicates that the library should be added to the end - * of the library list (addLibraryLast). - * @param prev the previous list of libraries, may be NULL - * @return the new list of libraries, equal to prev iff an error occured - * or if config was empty (or NULL). + * Get a long description for the meta type. + * + * @param type meta type to get a UTF-8 description for + * @return NULL if the type is not known, otherwise + * an English (locale: C) string describing the type; + * translate using 'dgettext ("libextractor", rval)' */ -EXTRACTOR_ExtractorList * -EXTRACTOR_loadConfigLibraries(EXTRACTOR_ExtractorList * prev, - const char * config); +const char * +EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType type); + /** - * Add a library for keyword extraction. 
- * @param prev the previous list of libraries, may be NULL - * @param library the name of the library - * @return the new list of libraries, equal to prev iff an error occured + * Return the highest type number, exclusive as in [0,max). + * + * @return highest legal metatype number for this version of libextractor */ -EXTRACTOR_ExtractorList * -EXTRACTOR_addLibrary(EXTRACTOR_ExtractorList * prev, - const char * library); +enum EXTRACTOR_MetaType +EXTRACTOR_metatype_get_max (void); + /** - * Add a library for keyword extraction at the END of the list. - * @param prev the previous list of libraries, may be NULL - * @param library the name of the library - * @return the new list of libraries, always equal to prev - * except if prev was NULL and no error occurs - */ -EXTRACTOR_ExtractorList * -EXTRACTOR_addLibraryLast(EXTRACTOR_ExtractorList * prev, - const char * library); - + * Type of a function that libextractor calls for each + * meta data item found. + * + * @param cls closure (user-defined) + * @param plugin_name name of the plugin that produced this value; + * special values can be used (i.e. '<zlib>' for zlib being + * used in the main libextractor library and yielding + * meta data). + * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return 0 to continue extracting, 1 to abort + */ +typedef int (*EXTRACTOR_MetaDataProcessor)(void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len); + + /** - * Remove a library for keyword extraction. 
- * @param prev the current list of libraries - * @param library the name of the library to remove - * @return the reduced list, unchanged if the library was not loaded + * Signature of the extract method that each plugin + * must provide. + * + * @param data data to process + * @param datasize number of bytes available in data + * @param proc function to call for meta data found + * @param proc_cls cls argument to proc + * @param options options for this plugin; can be NULL + * @return 0 if all calls to proc returned 0, otherwise 1 */ -EXTRACTOR_ExtractorList * -EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev, - const char * library); +typedef int (*EXTRACTOR_ExtractMethod)(const char *data, + size_t datasize, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options); -/** - * Remove all extractors. - * @param libraries the list of extractors - */ -void EXTRACTOR_removeAll(EXTRACTOR_ExtractorList * libraries); /** - * Extract keywords from a file using the available extractors. - * @param extractor the list of extractor libraries - * @param filename the name of the file - * @return the list of keywords found in the file, NULL if none - * were found (or other errors) + * Linked list of extractor plugins. An application builds this list + * by telling libextractor to load various keyword-extraction + * plugins. Libraries can also be unloaded (removed from this list, + * see EXTRACTOR_plugin_remove). */ -EXTRACTOR_KeywordList * -EXTRACTOR_getKeywords(EXTRACTOR_ExtractorList * extractor, - const char * filename); +struct EXTRACTOR_PluginList; /** - * Extract keywords from a buffer in memory - * using the available extractors. + * Load the default set of plugins. The default can be changed + * by setting the LIBEXTRACTOR_LIBRARIES environment variable; + * If it is set to "env", then this function will return + * EXTRACTOR_plugin_add_config (NULL, env, flags). 
+ * + * If LIBEXTRACTOR_LIBRARIES is not set, the function will attempt + * to locate the installed plugins and load all of them. + * The directory where the code will search for plugins is typically + * automatically determined; it can be specified explicitly using the + * "LIBEXTRACTOR_PREFIX" environment variable. + * + * This environment variable must be set to the precise directory with + * the plugins (i.e. "/usr/lib/libextractor", not "/usr"). Note that + * setting the environment variable will disable all of the methods + * that are typically used to determine the location of plugins. + * Multiple paths can be specified using ':' to separate them. * - * @param extractor the list of extractor libraries - * @param data the data of the file - * @param size the number of bytes in data - * @return the list of keywords found in the file, NULL if none - * were found (or other errors) + * @param flags options for all of the plugins loaded + * @return the default set of plugins, NULL if no plugins were found */ -EXTRACTOR_KeywordList * -EXTRACTOR_getKeywords2(EXTRACTOR_ExtractorList * extractor, - const void * data, - size_t size); +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags); /** - * Remove duplicate keywords from the list. - * @param list the original keyword list (destroyed in the process!) - * @param options a set of options (DUPLICATES_XXXX) - * @return a list of keywords without duplicates + * Add a library for keyword extraction. 
+ * + * @param prev the previous list of libraries, may be NULL + * @param library the name of the library (full path) + * @param options options to give to the library + * @param flags options to use + * @return the new list of libraries, equal to prev iff an error occured */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeDuplicateKeywords(EXTRACTOR_KeywordList * list, - unsigned int options); +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev, + const char * library, + const char *options, + enum EXTRACTOR_Options flags); /** - * Remove empty (all-whitespace) keywords from the list. - * @param list the original keyword list (destroyed in the process!) - * @return a list of keywords without duplicates + * Add a library for keyword extraction at the END of the list. + * @param prev the previous list of libraries, may be NULL + * @param library the name of the library (full path) + * @param options options to give to the library + * @param flags options to use + * @return the new list of libraries, always equal to prev + * except if prev was NULL and no error occurs */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list); +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_last(struct EXTRACTOR_PluginList *prev, + const char *library, + const char *options, + enum EXTRACTOR_Options flags); -/** - * Remove keywords of a particular type from the list. - * @param list the original keyword list (altered in the process!) - * @param type the type to remove - * @return a list of keywords without entries of given type - */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list, - EXTRACTOR_KeywordType type); /** - * Print a keyword list to a file. - * For debugging. 
- * @param handle the file to write to (stdout, stderr), must NOT be NULL - * @param keywords the list of keywords to print, may be NULL - */ -void EXTRACTOR_printKeywords(FILE * handle, - EXTRACTOR_KeywordList * keywords); - -/** - * Free the memory occupied by the keyword list (and the - * keyword strings in it!) - * @param keywords the list to free + * Load multiple libraries as specified by the user. + * + * @param config a string given by the user that defines which + * libraries should be loaded. Has the format + * "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*". + * For example, + * /usr/lib/libextractor/libextractor_mp3.so:/usr/lib/libextractor/libextractor_ogg.so loads the + * mp3 and the ogg library. The '-' before the LIBRARYNAME + * indicates that the library should be added to the end + * of the library list (addLibraryLast). + * @param prev the previous list of libraries, may be NULL + * @param flags options to use + * @return the new list of libraries, equal to prev iff an error occured + * or if config was empty (or NULL). */ -void EXTRACTOR_freeKeywords(EXTRACTOR_KeywordList * keywords); +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * prev, + const char *config, + enum EXTRACTOR_Options flags); + /** - * Extract the last keyword that of the given type from the keyword list. - * @param type the type of the keyword - * @param keywords the keyword list - * @return the last matching keyword, or NULL if none matches; - * the string returned is aliased in the keywords list and must - * not be freed or manipulated by the client. It will become - * invalid once the keyword list is freed. + * Remove a plugin from a list. 
+ * + * @param prev the current list of plugins + * @param library the name of the plugin to remove (full path) + * @return the reduced list, unchanged if the plugin was not loaded */ -const char * EXTRACTOR_extractLast(EXTRACTOR_KeywordType type, - EXTRACTOR_KeywordList * keywords); +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * prev, + const char * library); -/** - * Extract the last keyword of the given string from the keyword list. - * @param type the string describing the type of the keyword - * @param keywords the keyword list - * @return the last matching keyword, or NULL if none matches; - * the string returned is aliased in the keywords list and must - * not be freed or manipulated by the client. It will become - * invalid once the keyword list is freed. - */ -const char * EXTRACTOR_extractLastByString(const char * type, - EXTRACTOR_KeywordList * keywords); /** - * Count the number of keywords in the keyword list. - * @param keywords the keyword list - * @return the number of keywords in the list + * Remove all plugins from the given list (destroys the list). + * + * @param plugin the list of plugins */ -unsigned int EXTRACTOR_countKeywords(EXTRACTOR_KeywordList * keywords); +void +EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *plugins); /** - * This function can be used to decode the binary data - * encoded in the libextractor metadata (i.e. for - * the thumbnails). + * Extract keywords from a file using the given set of plugins. 
* - * @param in 0-terminated string from the meta-data - * @return 1 on error, 0 on success + * @param plugins the list of plugins to use + * @param filename the name of the file, can be NULL if data is not NULL + * @param data data of the file in memory, can be NULL (in which + * case libextractor will open file) if filename is not NULL + * @param size number of bytes in data, ignored if data is NULL + * @param proc function to call for each meta data item found + * @param proc_cls cls argument to proc */ -int EXTRACTOR_binaryDecode(const char * in, - unsigned char ** out, - size_t * outSize); +void +EXTRACTOR_extract(struct EXTRACTOR_PluginList *plugins, + const char *filename, + const void *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls); /** - * Encode the given binary data object - * as a 0-terminated C-string according - * to the LE binary data encoding standard. - * - * @return NULL on error, the 0-terminated - * encoding otherwise + * Simple EXTRACTOR_MetaDataProcessor implementation that simply + * prints the extracted meta data to the given file. Only prints + * those keywords that are in UTF-8 format. + * + * @param handle the file to write to (stdout, stderr), must NOT be NULL, + * must be of type "FILE *". + * @param plugin_name name of the plugin that produced this value + * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return non-zero if printing failed, otherwise 0. 
*/ -char * EXTRACTOR_binaryEncode(const unsigned char * data, - size_t size); +int +EXTRACTOR_meta_data_print(void * handle, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len); #if 0 /* keep Emacsens' auto-indent happy */ diff --git a/src/include/winproc.h b/src/include/winproc.h @@ -1,44 +0,0 @@ -/* - This file is part of libextractor. - (C) 2001, 2002, 2003, 2003, 2005 Christian Grothoff (and other contributing authors) - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. 
-*/ - -/** - * @file include/winproc.h - * @brief Definitions for MS Windows - * @author Nils Durner - * @note This file differs from GNUnet's winproc.h - */ - -#ifndef WINPROC_H -#define WINPROC_H - -#include "platform.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void InitWinEnv(); -void ShutdownWinEnv(); - -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif diff --git a/src/main/Makefile.am b/src/main/Makefile.am @@ -29,31 +29,19 @@ if HAVE_BZ2 bz2lib = -lbz2 endif -if HAVE_GLIB -if WITH_GSF - GSF_LIBS_X = $(GSF_LIBS) -endif -endif - libextractor_la_LDFLAGS = \ $(LE_LIB_LDFLAGS) -version-info @LIB_VERSION_CURRENT@:@LIB_VERSION_REVISION@:@LIB_VERSION_AGE@ libextractor_la_LIBADD = \ - $(LIBLTDL) $(dlflag) $(zlib) $(bz2lib) $(GSF_LIBS_X) $(LIBICONV) -lpthread + $(LIBLTDL) $(dlflag) $(zlib) $(bz2lib) $(LIBICONV) -lrt EXTRA_DIST = \ - winproc.c \ iconv.c -if MINGW - winproc = winproc.c -endif - libextractor_la_CPPFLAGS = -DPLUGINDIR=\"@RPLUGINDIR@\" $(AM_CPPFLAGS) libextractor_la_SOURCES = \ - extractor.c \ - $(winproc) + extractor.c extract_SOURCES = \ extract.c \ @@ -62,15 +50,3 @@ extract_SOURCES = \ getopt1.c -check_PROGRAMS = \ - test_binary - -TESTS = $(check_PROGRAMS) - -test_binary_SOURCES = \ - test_binary.c -test_binary_LDADD = \ - $(top_builddir)/src/main/libextractor.la - - - diff --git a/src/main/extract.c b/src/main/extract.c @@ -26,6 +26,17 @@ #define NO 0 +/** + * Which keyword types should we print? + */ +static int * print; + +/** + * How verbose are we supposed to be? 
+ */ +static int verbose; + + typedef struct { char shortArg; char * longArg; @@ -122,16 +133,10 @@ static void printHelp () { static Help help[] = { - { 'a', "all", NULL, - gettext_noop("do not remove any duplicates") }, { 'b', "bibtex", NULL, gettext_noop("print output in bibtex format") }, { 'B', "binary", "LANG", gettext_noop("use the generic plaintext extractor for the language with the 2-letter language code LANG") }, - { 'd', "duplicates", NULL, - gettext_noop("remove duplicates only if types match") }, - { 'f', "filename", NULL, - gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") }, { 'g', "grep-friendly", NULL, gettext_noop("produce grep-friendly output (all results on one line per file)") }, { 'h', "help", NULL, @@ -146,10 +151,6 @@ printHelp () gettext_noop("do not use the default set of extractor plugins") }, { 'p', "print", "TYPE", gettext_noop("print only keywords of the given TYPE (use -L to get a list)") }, - { 'r', "remove-duplicates", NULL, - gettext_noop("remove duplicates even if keyword types do not match") }, - { 's', "split", NULL, - gettext_noop("use keyword splitting (loads split-extractor plugin)") }, { 'v', "version", NULL, gettext_noop("print the version number") }, { 'V', "verbose", NULL, @@ -166,109 +167,159 @@ printHelp () #include "iconv.c" - /** * Print a keyword list to a file. * - * @param handle the file to write to (stdout, stderr), may NOT be NULL - * @param keywords the list of keywords to print, may be NULL - * @param print array indicating which types to print - */ -static void -printSelectedKeywords(FILE * handle, - EXTRACTOR_KeywordList * keywords, - const int * print, - const int verbose) -{ + * @param cls closure, not used + * @param plugin_name name of the plugin that produced this value; + * special values can be used (i.e. '<zlib>' for zlib being + * used in the main libextractor library and yielding + * meta data). 
+ * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return 0 to continue extracting, 1 to abort + */ +static int +print_selected_keywords (void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len) +{ char * keyword; iconv_t cd; + const char *stype; - cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); - while (keywords != NULL) { - if (EXTRACTOR_isBinaryType(keywords->keywordType)) { - fprintf (handle, - _("%s - (binary)\n"), - _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType))); - } else { + if (print[type] != YES) + return 0; + stype = gettext(EXTRACTOR_metatype_to_string(type)); + switch (format) + { + case EXTRACTOR_METAFORMAT_UNKNOWN: + fprintf (stdout, + _("%s - (unknown, %u bytes)\n"), + stype, + (unsigned int) data_len); + break; + case EXTRACTOR_METAFORMAT_UTF8: + cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); if (cd != (iconv_t) -1) - keyword = iconvHelper(cd, - keywords->keyword); + keyword = iconv_helper(cd, + data); else - keyword = strdup(keywords->keyword); - if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) { - if (verbose == YES) { - fprintf(handle, - _("INVALID TYPE - %s\n"), - keyword); - } - } else if (print[keywords->keywordType] == YES) - fprintf (handle, - "%s - %s\n", - _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)), - keyword); + keyword = strdup(data); + fprintf (stdout, + "%s - %s\n", + stype, + keyword); free(keyword); + if (cd != (iconv_t) -1) + iconv_close(cd); + break; + case EXTRACTOR_METAFORMAT_BINARY: + fprintf (stdout, + _("%s - (binary, %u bytes)\n"), + stype, + (unsigned int) data_len); + break; + case 
EXTRACTOR_METAFORMAT_C_STRING: + fprintf (stdout, + "%s - %s\n", + stype, + data); + break; + + default: + break; } - keywords = keywords->next; - } - if (cd != (iconv_t) -1) - iconv_close(cd); + return 0; } + + /** - * Print a keyword list to a file in a grep-friendly manner. + * Print a keyword list to a file without new lines. * - * @param handle the file to write to (stdout, stderr), may NOT be NULL - * @param keywords the list of keywords to print, may be NULL - * @param print array indicating which types to print - */ -static void -printSelectedKeywordsGrepFriendly(FILE * handle, - EXTRACTOR_KeywordList * keywords, - const int * print, - const int verbose) -{ + * @param cls closure, not used + * @param plugin_name name of the plugin that produced this value; + * special values can be used (i.e. '<zlib>' for zlib being + * used in the main libextractor library and yielding + * meta data). + * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return 0 to continue extracting, 1 to abort + */ +static int +print_selected_keywords_grep_friendly (void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len) +{ char * keyword; iconv_t cd; - size_t pos; - - cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); - while (keywords != NULL) { - if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) && - (print[keywords->keywordType] == YES) ) { - if (verbose > 1) - fprintf(handle, - "%s: ", - _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType))); + + if (print[type] != YES) + return 0; + switch (format) + { + case EXTRACTOR_METAFORMAT_UNKNOWN: + break; + case EXTRACTOR_METAFORMAT_UTF8: + if (verbose > 
1) + fprintf (stdout, + "%s: ", + gettext(EXTRACTOR_metatype_to_string(type))); + cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); if (cd != (iconv_t) -1) - keyword = iconvHelper(cd, - keywords->keyword); + keyword = iconv_helper(cd, + data); else - keyword = strdup(keywords->keyword); - pos = 0; - while (keyword[pos] != '\0') { - if (iscntrl(keyword[pos])) - keyword[pos] = ' '; - pos++; - } - fprintf (handle, - (keywords->next == NULL) ? "%s" : "%s ", + keyword = strdup(data); + fprintf (stdout, + "'%s' ", keyword); free(keyword); + if (cd != (iconv_t) -1) + iconv_close(cd); + break; + case EXTRACTOR_METAFORMAT_BINARY: + break; + case EXTRACTOR_METAFORMAT_C_STRING: + if (verbose > 1) + fprintf (stdout, + "%s ", + gettext(EXTRACTOR_metatype_to_string(type))); + fprintf (stdout, + "'%s'", + data); + break; + default: + break; } - keywords = keywords->next; - } - fprintf(handle, "\n"); - if (cd != (iconv_t) -1) - iconv_close(cd); + return 0; } + /** * Take title, auth, year and return a string */ static char * str_splice(const char * title, - const char * auth, - const char * year) { + const char * year, + const char * auth) { char * temp = malloc(16); int i = 0; @@ -287,190 +338,202 @@ str_splice(const char * title, return temp; } + +/** + * Entry in the map we construct for each file. + */ +struct BibTexMap +{ + const char *bibTexName; + enum EXTRACTOR_MetaType le_type; + char *value; +}; + + +/** + * Type of the entry for bibtex. + */ +static char *entry_type; + +/** + * Mapping between bibTeX strings, libextractor + * meta data types and values for the current document. 
+ */ +static struct BibTexMap btm[] = + { + { "title", EXTRACTOR_METATYPE_TITLE, NULL}, + { "year", EXTRACTOR_METATYPE_PUBLICATION_YEAR, NULL }, + { "author", EXTRACTOR_METATYPE_AUTHOR_NAME, NULL }, + { "book", EXTRACTOR_METATYPE_BOOK_TITLE, NULL}, + { "edition", EXTRACTOR_METATYPE_BOOK_EDITION, NULL}, + { "chapter", EXTRACTOR_METATYPE_BOOK_CHAPTER_NUMBER, NULL}, + { "journal", EXTRACTOR_METATYPE_JOURNAL_NAME, NULL}, + { "volume", EXTRACTOR_METATYPE_JOURNAL_VOLUME, NULL}, + { "number", EXTRACTOR_METATYPE_JOURNAL_NUMBER, NULL}, + { "pages", EXTRACTOR_METATYPE_PAGE_COUNT, NULL }, + { "pages", EXTRACTOR_METATYPE_PAGE_RANGE, NULL }, + { "school", EXTRACTOR_METATYPE_AUTHOR_INSTITUTION, NULL}, + { "publisher", EXTRACTOR_METATYPE_PUBLISHER, NULL }, + { "address", EXTRACTOR_METATYPE_PUBLISHER_ADDRESS, NULL }, + { "institution", EXTRACTOR_METATYPE_PUBLISHER_INSTITUTION, NULL }, + { "series", EXTRACTOR_METATYPE_PUBLISHER_SERIES, NULL}, + { "month", EXTRACTOR_METATYPE_PUBLICATION_MONTH, NULL }, + { "url", EXTRACTOR_METATYPE_URL, NULL}, + { "note", EXTRACTOR_METATYPE_COMMENT, NULL}, + { "eprint", EXTRACTOR_METATYPE_BIBTEX_EPRINT, NULL }, + { "type", EXTRACTOR_METATYPE_PUBLICATION_TYPE, NULL }, + { NULL, 0, NULL } + }; + + /** - * Print a keyword list in bibtex format to a file. - * FIXME: We should generate the three letter abbrev of the month - * @param handle the file to write to (stdout, stderr), may NOT be NULL - * @param keywords the list of keywords to print, may be NULL - * @param print array indicating which types to print + * Clean up the bibtex processor in preparation for the next round. */ +static void +start_bibtex () +{ + int i; + + i = 0; + while (btm[i].bibTexName != NULL) + { + free (btm[i].value); + btm[i].value = NULL; + i++; + } + free (entry_type); + entry_type = NULL; +} + + +/** + * Callback function for printing meta data in bibtex format. 
+ * + * @param cls closure, not used + * @param plugin_name name of the plugin that produced this value; + * special values can be used (i.e. '<zlib>' for zlib being + * used in the main libextractor library and yielding + * meta data). + * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return 0 to continue extracting (always) + */ +static int +print_bibtex (void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len) +{ + int i; + + if (print[type] != YES) + return 0; + if (format != EXTRACTOR_METAFORMAT_UTF8) + return 0; + if (type == EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE) + { + entry_type = strdup (data); + return 0; + } + i = 0; + while (btm[i].bibTexName != NULL) + { + if ( (btm[i].value == NULL) && + (btm[i].le_type == type) ) + btm[i].value = strdup (data); + i++; + } + return 0; +} + + static void -printSelectedKeywordsBibtex (FILE * handle, - EXTRACTOR_KeywordList * keywords, - const int * print, - const char * filename) +finish_bibtex (const char *fn) { - const char * last = NULL; - if (keywords == NULL) - return; - if (print[keywords->keywordType] == YES) + int i; + char *tya; + const char *et; + + if (entry_type != NULL) + et = entry_type; + else + et = "misc"; + if ( (btm[0].value == NULL) || + (btm[1].value == NULL) || + (btm[2].value == NULL) ) + fprintf (stdout, + "@%s %s { ", + et, + fn); + else { - const char * title = NULL; - const char * author = NULL; - const char * note = NULL; - const char * date = NULL; - const char * publisher = NULL; - const char * organization = NULL; - const char * key = NULL; - const char * pages = NULL; - char * year = NULL; - char * month = NULL; 
- char * tmp; - - title = EXTRACTOR_extractLastByString(_("title"), keywords); - if ( !title ) - title = EXTRACTOR_extractLastByString(_("filename"), keywords); - if ( !title ) - title = (char*)filename; - last = title; - - author = EXTRACTOR_extractLastByString(_("author"), keywords); - if ( author ) - last = author; - - note = EXTRACTOR_extractLastByString(_("description"), keywords); - if ( !note ) - note = EXTRACTOR_extractLastByString(_("keywords"), keywords); - if ( !note ) - note = EXTRACTOR_extractLastByString(_("comment"), keywords); - if ( note ) - last = note; - - date = EXTRACTOR_extractLastByString(_("date"), keywords); - if ( !date ) - date = EXTRACTOR_extractLastByString(_("creation date"), keywords); - if ( date ) { - if ( strlen(keywords->keyword) >= 7 ) { - year = (char*)malloc(sizeof(char)*5); - memset(year, 0, sizeof(char)*5); - month = (char*)malloc(sizeof(char)*3); - memset(month, 0, sizeof(char)*3); - year[0] = keywords->keyword[0]; - year[1] = keywords->keyword[1]; - year[2] = keywords->keyword[2]; - year[3] = keywords->keyword[3]; - month[0] = keywords->keyword[4]; - month[1] = keywords->keyword[5]; - } else if ( strlen(keywords->keyword) >= 4 ) { - year = (char*)malloc(sizeof(char)*5); - memset(year, 0, sizeof(char)*5); - year[0] = keywords->keyword[0]; - year[1] = keywords->keyword[1]; - year[2] = keywords->keyword[2]; - year[3] = keywords->keyword[3]; - } - } - if ( year ) - last = year; - - if ( month ) - last = month; - - publisher = EXTRACTOR_extractLastByString(_("publisher"), keywords); - if ( publisher ) - last = publisher; - - organization = EXTRACTOR_extractLastByString(_("organization"), keywords); - if ( organization ) - last = organization; - - key = EXTRACTOR_extractLastByString(_("subject"), keywords); - if ( key ) - last = key; - - pages = EXTRACTOR_extractLastByString(_("page count"), keywords); - if ( pages ) - last = pages; - - tmp = str_splice(title, author, year); - fprintf(handle, - "@misc{ %s,\n", - tmp); - 
free(tmp); - if ( title ) - fprintf(handle, " title = \"%s\"%s\n", title, - (last == title)?"":","); - if ( author ) - fprintf(handle, " author = \"%s\"%s\n", author, - (last == author)?"":","); - if ( note ) - fprintf(handle, " note = \"%s\"%s\n", note, - (last == note)?"":","); - if ( year ) - fprintf(handle, " year = \"%s\"%s\n", year, - (last == year)?"":","); - if ( month ) - fprintf(handle, " month = \"%s\"%s\n", month, - (last == month)?"":","); - if ( publisher ) - fprintf(handle, " publisher = \"%s\"%s\n", publisher, - (last == publisher)?"":","); - if ( organization ) - fprintf(handle, " organization = \"%s\"%s\n", organization, - (last == organization)?"":","); - if ( key ) - fprintf(handle, " key = \"%s\"%s\n", key, - (last == key)?"":","); - if ( pages ) - fprintf(handle, " pages = \"%s\"%s\n", pages, - (last == pages)?"":","); - if (month != NULL) - free(month); - if (year != NULL) - free(year); - fprintf(handle, "}\n\n"); + tya = str_splice (btm[0].value, + btm[1].value, + btm[2].value); + fprintf (stdout, + "@%s %s { ", + et, + tya); + free (tya); } + + + i = 0; + while (btm[i].bibTexName != NULL) + { + if (btm[i].value != NULL) + fprintf (stdout, + "\t%s = {%s},\n", + btm[i].bibTexName, + btm[i].value); + i++; + } + fprintf(stdout, "}\n\n"); } + /** - * Demo for libExtractor. - * <p> - * Invoke with a list of filenames to extract keywords - * from (demo will use all the extractor libraries that - * are available by default). + * Main function for the 'extract' tool. Invoke with a list of + * filenames to extract keywords from. 
*/ int main (int argc, char *argv[]) { int i; - EXTRACTOR_ExtractorList *extractors; - EXTRACTOR_KeywordList *keywords; + struct EXTRACTOR_PluginList *plugins; int option_index; int c; char * libraries = NULL; char * hash = NULL; - int splitKeywords = NO; - int verbose = 0; - int useFilename = NO; int nodefault = NO; - int *print; int defaultAll = YES; - int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN; int bibtex = NO; int grepfriendly = NO; char * binary = NULL; + char * name; int ret = 0; + EXTRACTOR_MetaDataProcessor processor = NULL; -#ifdef MINGW - InitWinEnv(); -#endif #if ENABLE_NLS setlocale(LC_ALL, ""); - textdomain("libextractor"); - BINDTEXTDOMAIN("libextractor", LOCALEDIR); + textdomain(PACKAGE); #endif - print = malloc (sizeof (int) * EXTRACTOR_getHighestKeywordTypeNumber ()); - for (i = 0; i < EXTRACTOR_getHighestKeywordTypeNumber (); i++) + print = malloc (sizeof (int) * EXTRACTOR_metatype_get_max ()); + for (i = 0; i < EXTRACTOR_metatype_get_max (); i++) print[i] = YES; /* default: print everything */ while (1) { static struct option long_options[] = { - {"all", 0, 0, 'a'}, {"binary", 1, 0, 'B'}, {"bibtex", 0, 0, 'b'}, - {"duplicates", 0, 0, 'd'}, - {"filename", 0, 0, 'f'}, {"grep-friendly", 0, 0, 'g'}, {"help", 0, 0, 'h'}, {"hash", 1, 0, 'H'}, @@ -478,8 +541,6 @@ main (int argc, char *argv[]) {"library", 1, 0, 'l'}, {"nodefault", 0, 0, 'n'}, {"print", 1, 0, 'p'}, - {"remove-duplicates", 0, 0, 'r'}, - {"split", 0, 0, 's'}, {"verbose", 0, 0, 'V'}, {"version", 0, 0, 'v'}, {"exclude", 1, 0, 'x'}, @@ -487,7 +548,8 @@ main (int argc, char *argv[]) }; option_index = 0; c = getopt_long (argc, - argv, "vhbgl:nsH:fp:x:LVdraB:", + argv, + "abB:ghH:l:Lnp:vVx:", long_options, &option_index); @@ -495,23 +557,28 @@ main (int argc, char *argv[]) break; /* No more flags to process */ switch (c) { - case 'a': - duplicates = -1; - break; case 'b': bibtex = YES; + if (processor != NULL) + { + fprintf (stderr, + _("Illegal combination of options, cannot combine 
multiple styles of printing.\n")); + return 0; + } + processor = &print_bibtex; break; case 'B': binary = optarg; break; - case 'd': - duplicates = 0; - break; - case 'f': - useFilename = YES; - break; case 'g': grepfriendly = YES; + if (processor != NULL) + { + fprintf (stderr, + _("Illegal combination of options, cannot combine multiple styles of printing.\n")); + return 0; + } + processor = &print_selected_keywords_grep_friendly; break; case 'h': printHelp(); @@ -524,32 +591,35 @@ main (int argc, char *argv[]) break; case 'L': i = 0; - while (NULL != EXTRACTOR_getKeywordTypeAsString (i)) + while (NULL != EXTRACTOR_metatype_to_string (i)) printf ("%s\n", - _(EXTRACTOR_getKeywordTypeAsString (i++))); + gettext(EXTRACTOR_metatype_to_string (i++))); return 0; case 'n': nodefault = YES; break; case 'p': - if (optarg == NULL) { - fprintf(stderr, - _("You must specify an argument for the `%s' option (option ignored).\n"), - "-p"); - break; - } + if (optarg == NULL) + { + fprintf(stderr, + _("You must specify an argument for the `%s' option (option ignored).\n"), + "-p"); + break; + } if (defaultAll == YES) { defaultAll = NO; i = 0; - while (NULL != EXTRACTOR_getKeywordTypeAsString (i)) + while (NULL != EXTRACTOR_metatype_to_string (i)) print[i++] = NO; } i = 0; - while (NULL != EXTRACTOR_getKeywordTypeAsString (i)) + while (NULL != EXTRACTOR_metatype_to_string (i)) { - if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) || - (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) ) + if ( (0 == strcmp (optarg, + EXTRACTOR_metatype_to_string (i))) || + (0 == strcmp (optarg, + gettext(EXTRACTOR_metatype_to_string (i)))) ) { print[i] = YES; @@ -557,7 +627,7 @@ main (int argc, char *argv[]) } i++; } - if (NULL == EXTRACTOR_getKeywordTypeAsString (i)) + if (NULL == EXTRACTOR_metatype_to_string (i)) { fprintf(stderr, "Unknown keyword type `%s', use option `%s' to get a list.\n", @@ -566,12 +636,6 @@ main (int argc, char *argv[]) return -1; } break; - 
case 'r': - duplicates = EXTRACTOR_DUPLICATES_TYPELESS; - break; - case 's': - splitKeywords = YES; - break; case 'v': printf ("extract v%s\n", PACKAGE_VERSION); return 0; @@ -580,34 +644,30 @@ main (int argc, char *argv[]) break; case 'x': i = 0; - while (NULL != EXTRACTOR_getKeywordTypeAsString (i)) + while (NULL != EXTRACTOR_metatype_to_string (i)) { - if ( (0 == strcmp (optarg, EXTRACTOR_getKeywordTypeAsString (i))) || - (0 == strcmp (optarg, _(EXTRACTOR_getKeywordTypeAsString (i)))) ) + if ( (0 == strcmp (optarg, + EXTRACTOR_metatype_to_string (i))) || + (0 == strcmp (optarg, + gettext(EXTRACTOR_metatype_to_string (i)))) ) { print[i] = NO; break; } i++; } - if (NULL == EXTRACTOR_getKeywordTypeAsString (i)) + if (NULL == EXTRACTOR_metatype_to_string (i)) { fprintf (stderr, "Unknown keyword type `%s', use option `%s' to get a list.\n", optarg, "-L"); -#ifdef MINGW - ShutdownWinEnv(); -#endif return -1; } break; default: fprintf (stderr, _("Use --help to get a list of options.\n")); -#ifdef MINGW - ShutdownWinEnv(); -#endif return -1; } /* end of parsing commandline */ } /* while (1) */ @@ -616,54 +676,66 @@ main (int argc, char *argv[]) { fprintf (stderr, "Invoke with list of filenames to extract keywords form!\n"); -#ifdef MINGW - ShutdownWinEnv(); -#endif free (print); return -1; } /* build list of libraries */ if (nodefault == NO) - extractors = EXTRACTOR_loadDefaultLibraries (); + plugins = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_NONE); else - extractors = NULL; - if (useFilename == YES) - extractors = EXTRACTOR_addLibrary (extractors, - "libextractor_filename"); + plugins = NULL; if (libraries != NULL) - extractors = EXTRACTOR_loadConfigLibraries (extractors, libraries); - - if (binary != NULL) { - char * name; - name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1); - strcpy(name, "libextractor_printable_"); - strcat(name, binary); - extractors = EXTRACTOR_addLibraryLast(extractors, - name); - free(name); - } - if (hash != NULL) { - 
char * name; - name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1); - strcpy(name, "libextractor_hash_"); - strcat(name, hash); - extractors = EXTRACTOR_addLibraryLast(extractors, - name); - free(name); - } + plugins = EXTRACTOR_plugin_add_config (plugins, + libraries, + EXTRACTOR_OPTION_NONE); + if (binary != NULL) + { + /* FIXME: need full path here now... */ + name = malloc(strlen(binary) + strlen("libextractor_printable_") + 1); + strcpy(name, "libextractor_printable_"); + strcat(name, binary); + plugins = EXTRACTOR_plugin_add_last(plugins, + name, + NULL, + EXTRACTOR_OPTION_NONE); + free(name); + } + if (hash != NULL) + { + /* FIXME: need full path here now... */ + name = malloc(strlen(hash) + strlen("libextractor_hash_") + 1); + strcpy(name, "libextractor_hash_"); + strcat(name, hash); + plugins = EXTRACTOR_plugin_add_last(plugins, + name, + NULL, + EXTRACTOR_OPTION_NONE); + free(name); + } - if (splitKeywords == YES) - extractors = EXTRACTOR_addLibraryLast(extractors, - "libextractor_split"); + if (processor == NULL) + processor = &print_selected_keywords; /* extract keywords */ - if ( bibtex == YES ) + if (bibtex == YES) fprintf(stdout, _("%% BiBTeX file\n")); for (i = optind; i < argc; i++) { errno = 0; - keywords = EXTRACTOR_getKeywords (extractors, argv[i]); + if (grepfriendly == YES) + fprintf (stdout, "%s ", argv[i]); + else if (bibtex == NO) + fprintf (stdout, + _("Keywords for file %s:\n"), + argv[i]); + else + start_bibtex (); + EXTRACTOR_extract (plugins, + argv[i], + NULL, 0, + processor, + NULL); if (0 != errno) { if (verbose > 0) { fprintf(stderr, @@ -671,34 +743,21 @@ main (int argc, char *argv[]) argv[0], argv[i], strerror(errno)); } ret = 1; - continue; - } - if ( (duplicates != -1) || (bibtex == YES)) - keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates); - if ( (verbose > 0) - && (bibtex == NO) ) { if (grepfriendly == YES) - printf ("%s ", argv[i]); - else - printf (_("Keywords for file %s:\n"), - argv[i]); + 
fprintf (stdout, "\n"); + continue; } - if (bibtex == YES) - printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]); - else if (grepfriendly == YES) - printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose); - else - printSelectedKeywords (stdout, keywords, print, verbose); - if (verbose > 0 && bibtex == NO) + if (grepfriendly == YES) + fprintf (stdout, "\n"); + if (bibtex) + finish_bibtex (argv[i]); + if (verbose > 0) printf ("\n"); - EXTRACTOR_freeKeywords (keywords); } free (print); - EXTRACTOR_removeAll (extractors); - -#ifdef MINGW - ShutdownWinEnv(); -#endif - + EXTRACTOR_plugin_remove_all (plugins); + start_bibtex (); /* actually free's stuff */ return ret; } + +/* end of extract.c */ diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -20,7 +20,32 @@ #include "platform.h" #include "extractor.h" -#include <pthread.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/shm.h> +#include <signal.h> + + +/** + * How many bytes do we actually try to scan? (from the beginning + * of the file). Limit to 32 MB. + */ +#define MAX_READ 32 * 1024 * 1024 + +/** + * How many bytes do we actually try to decompress? (from the beginning + * of the file). Limit to 16 MB. + */ +#define MAX_DECOMPRESS 16 * 1024 * 1024 + +/** + * Maximum length of a Mime-Type string. + */ +#define MAX_MIME_LEN 256 + +#define DEBUG 0 + #if HAVE_LTDL_H #include <ltdl.h> @@ -36,20 +61,171 @@ #include <zlib.h> #endif -#define DEBUG 0 + +struct MetaTypeDescription +{ + const char *short_description; + + const char *long_description; +}; + /** * The sources of keywords as strings. 
*/ -static const char *keywordTypes[] = { - gettext_noop("unknown"), /* 0 */ - gettext_noop("filename"), - gettext_noop("mimetype"), - gettext_noop("title"), +static const struct MetaTypeDescription meta_type_descriptions[] = { + /* 0 */ + { gettext_noop ("reserved"), + gettext_noop ("reserved value, do not use") }, + { gettext_noop ("mimetype"), + gettext_noop ("mime type") }, + { gettext_noop ("embedded filename"), + gettext_noop ("filename that was embedded (not necessarily the current filename)") }, + { gettext_noop ("comment"), + gettext_noop ("comment about the content") }, + { gettext_noop ("title"), + gettext_noop ("title of the work")}, + /* 5 */ + { gettext_noop ("book title"), + gettext_noop ("title of the book containing the work") }, + { gettext_noop ("book edition"), + gettext_noop ("edition of the book (or book containing the work)") }, + { gettext_noop ("book chapter"), + gettext_noop ("chapter number") }, + { gettext_noop ("journal name"), + gettext_noop ("journal or magazine the work was published in") }, + { gettext_noop ("journal volume"), + gettext_noop ("volume of a journal or multi-volume book") }, + /* 10 */ + { gettext_noop ("journal number"), + gettext_noop ("number of a journal, magazine or tech-report") }, + { gettext_noop ("page count"), + gettext_noop ("total number of pages of the work") }, + { gettext_noop ("page range"), + gettext_noop ("page numbers of the publication in the respective journal or book") }, + { gettext_noop ("author name"), + gettext_noop ("name of the author(s)") }, + { gettext_noop ("author email"), + gettext_noop ("e-mail of the author(s)") }, + /* 15 */ + { gettext_noop ("author institution"), + gettext_noop ("institution the author worked for") }, + { gettext_noop ("publisher"), + gettext_noop ("name of the publisher") }, + { gettext_noop ("publisher's address"), + gettext_noop ("Address of the publisher (often only the city)") }, + { gettext_noop ("publishing institution"), + gettext_noop ("institution that 
was involved in the publishing, but not necessarily the publisher") }, + { gettext_noop ("publication series"), + gettext_noop ("series of books the book was published in") }, + /* 20 */ + { gettext_noop ("publication type"), + gettext_noop ("type of the tech-report") }, + { gettext_noop ("publication year"), + gettext_noop ("year of publication (or, if unpublished, the year of creation)") }, + { gettext_noop ("publication month"), + gettext_noop ("month of publication (or, if unpublished, the month of creation)") }, + { gettext_noop ("publication day"), + gettext_noop ("day of publication (or, if unpublished, the day of creation), relative to the given month") }, + { gettext_noop ("publication date"), + gettext_noop ("date of publication (or, if unpublished, the date of creation)") }, + /* 25 */ + { gettext_noop ("bibtex eprint"), + gettext_noop ("specification of an electronic publication") }, + { gettext_noop ("bibtex entry type"), + gettext_noop ("type of the publication for bibTeX bibliographies") }, + { gettext_noop ("language"), + gettext_noop ("language the work uses") }, + { gettext_noop ("creation time"), + gettext_noop ("time and date of creation") }, + { gettext_noop ("URL"), + gettext_noop ("universal resource location (where the work is made available)") }, + /* 30 */ + { gettext_noop ("URI"), + gettext_noop ("universal resource identifier") }, + { gettext_noop ("international standard recording code"), + gettext_noop ("ISRC number identifying the work") }, + { gettext_noop ("MD4"), + gettext_noop ("MD4 hash") }, + { gettext_noop ("MD5"), + gettext_noop ("MD5 hash") }, + { gettext_noop ("SHA-0"), + gettext_noop ("SHA-0 hash") }, + /* 35 */ + { gettext_noop ("SHA-1"), + gettext_noop ("SHA-1 hash") }, + { gettext_noop ("RipeMD160"), + gettext_noop ("RipeMD150 hash") }, + { gettext_noop ("GPS latitude ref"), + gettext_noop ("GPS latitude ref") }, + { gettext_noop ("GPS latitude"), + gettext_noop ("GPS latitude") }, + { gettext_noop ("GPS longitude ref"), 
+ gettext_noop ("GPS longitude ref") }, + /* 40 */ + { gettext_noop ("GPS longitude"), + gettext_noop ("GPS longitude") }, + { gettext_noop ("city"), + gettext_noop ("name of the city where the document originated") }, + { gettext_noop ("sublocation"), + gettext_noop ("more specific location of the geographic origin") }, + { gettext_noop ("country"), + gettext_noop ("name of the country where the document originated") }, + { gettext_noop ("country code"), + gettext_noop ("ISO 2-letter country code for the country of origin") }, + /* 45 */ + { gettext_noop ("unknown"), + gettext_noop ("specifics are not known") }, + { gettext_noop ("description"), + gettext_noop ("description") }, + { gettext_noop ("copyright"), + gettext_noop ("copyright information") }, + { gettext_noop ("rights"), + gettext_noop ("information about rights") }, + { gettext_noop ("keywords"), + gettext_noop ("keywords") }, + /* 50 */ + { gettext_noop ("abstract"), + gettext_noop ("abstract") }, + { gettext_noop ("summary"), + gettext_noop ("summary") }, + { gettext_noop ("subject"), + gettext_noop ("subject matter") }, + { gettext_noop ("creator"), + gettext_noop ("name of the person who created the document") }, + { gettext_noop ("format"), + gettext_noop ("name of the document format") }, + /* 55 */ + { gettext_noop ("format version"), + gettext_noop ("version of the document format") }, + { gettext_noop ("created by software"), + gettext_noop ("name of the software that created the document") }, + { gettext_noop ("unknown date"), + gettext_noop ("ambiguous date (could specify creation time, modification time or access time)") }, + { gettext_noop ("creation date"), + gettext_noop ("date the document was created") }, + { gettext_noop ("modification date"), + gettext_noop ("date the document was modified") }, + /* 60 */ + { gettext_noop ("last printed"), + gettext_noop ("date the document was last printed") }, + { gettext_noop ("last saved by"), + gettext_noop ("name of the user who saved the 
document last") }, + { gettext_noop ("total editing time"), + gettext_noop ("time spent editing the document") }, + { gettext_noop ("editing cycles"), + gettext_noop ("number of editing cycles") }, + { gettext_noop ("modified by software"), + gettext_noop ("name of software making modifications") }, + /* 65 */ + { gettext_noop ("revision history"), + gettext_noop ("information about the revision history") }, + +#if 0 + gettext_noop("author"), gettext_noop("artist"), /* 5 */ gettext_noop("description"), - gettext_noop("comment"), gettext_noop("date"), gettext_noop("publisher"), gettext_noop("language"), /* 10 */ @@ -94,11 +270,6 @@ static const char *keywordTypes[] = { gettext_noop("build-host"), gettext_noop("operating system"), /* 50 */ gettext_noop("dependency"), - gettext_noop("MD4"), - gettext_noop("MD5"), - gettext_noop("SHA-0"), - gettext_noop("SHA-1"), /* 55 */ - gettext_noop("RipeMD160"), gettext_noop("resolution"), gettext_noop("category"), gettext_noop("book title"), @@ -143,7 +314,6 @@ static const char *keywordTypes[] = { gettext_noop("created by software"), gettext_noop("modified by software"), gettext_noop("revision history"), /* 100 */ - gettext_noop("lower case conversion"), gettext_noop("company"), gettext_noop("generator"), gettext_noop("character set"), @@ -175,137 +345,131 @@ static const char *keywordTypes[] = { gettext_noop("ripper"), /* 130 */ gettext_noop("filesize"), gettext_noop("track number"), - gettext_noop("international standard recording code"), gettext_noop("disc number"), gettext_noop("preferred display style (GNUnet)"), /* 135 */ gettext_noop("GNUnet URI of ECBC data"), gettext_noop("Complete file data (for non-binary files only)"), - gettext_noop("city"), - gettext_noop("country"), - gettext_noop("sublocation"), /* 140 */ - gettext_noop("GPS latitude ref"), - gettext_noop("GPS latitude"), - gettext_noop("GPS longitude ref"), - gettext_noop("GPS longitude"), gettext_noop("rating"), /* 145 */ - gettext_noop("country code"), - NULL 
-}; - -/* the number of keyword types (for bounds-checking) */ -#define HIGHEST_TYPE_NUMBER 147 -#ifdef HAVE_LIBOGG -#if HAVE_VORBIS -#define WITH_OGG 1 -#endif #endif +}; -#if HAVE_VORBISFILE -#define WITH_OGG 1 -#endif +/** + * Total number of keyword types (for bounds-checking) + */ +#define HIGHEST_METATYPE_NUMBER (sizeof (meta_type_descriptions) / sizeof(*meta_type_descriptions)) -#if HAVE_EXIV2 -#define EXSO "libextractor_exiv2:" -#else -#define EXSO "" -#endif -#if WITH_OGG -#define OGGSO "libextractor_ogg:" -#else -#define OGGSO "" -#endif +/** + * Get the textual name of the keyword. + * + * @param type meta type to get a UTF-8 string for + * @return NULL if the type is not known, otherwise + * an English (locale: C) string describing the type; + * translate using 'dgettext ("libextractor", rval)' + */ +const char * +EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType type) +{ + if ((type < 0) || (type >= HIGHEST_METATYPE_NUMBER)) + return NULL; + return meta_type_descriptions[type].short_description; +} -#if HAVE_FLAC -#define FLACSO "libextractor_flac:" -#else -#define FLACSO "" -#endif -#if HAVE_ZLIB -#define QTSO "libextractor_qt:" -#else -#define QTSO "" -#endif +/** + * Get a long description for the meta type. + * + * @param type meta type to get a UTF-8 description for + * @return NULL if the type is not known, otherwise + * an English (locale: C) string describing the type; + * translate using 'dgettext ("libextractor", rval)' + */ +const char * +EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType type) +{ + if ((type < 0) || (type >= HIGHEST_METATYPE_NUMBER)) + return NULL; + return meta_type_descriptions[type].long_description; +} -#if HAVE_GSF -#define OLESO "libextractor_ole2:" -#else -#define OLESO "" -#endif -#if HAVE_MPEG2 -#define MPEGSO "libextractor_mpeg:" -#else -#define MPEGSO "" -#endif - -/* ATTN: order matters (for performance!) since - mime-types can be used to avoid parsing once - the type has been established! 
*/ -#define DEFSO \ -"libextractor_html:\ -libextractor_man:\ -libextractor_ps:\ -libextractor_pdf:\ -libextractor_mp3:\ -libextractor_id3v2:\ -libextractor_id3v23:\ -libextractor_id3v24:\ -libextractor_mime:\ -libextractor_tar:\ -libextractor_dvi:\ -libextractor_deb:\ -libextractor_png:\ -libextractor_gif:\ -libextractor_wav:\ -libextractor_flv:\ -libextractor_real:\ -libextractor_jpeg:\ -libextractor_tiff:\ -libextractor_zip:\ -libextractor_rpm:\ -libextractor_riff:\ -libextractor_applefile:\ -libextractor_elf:\ -libextractor_oo:\ -libextractor_asf:\ -libextractor_sid:\ -libextractor_nsfe:\ -libextractor_nsf:\ -libextractor_it:\ -libextractor_xm:\ -libextractor_s3m" - -#define DEFAULT_LIBRARIES MPEGSO EXSO OLESO OGGSO FLACSO QTSO DEFSO - -const char * EXTRACTOR_getDefaultLibraries() { - return DEFAULT_LIBRARIES; +/** + * Return the highest type number, exclusive as in [0,max). + * + * @return highest legal metatype number for this version of libextractor + */ +enum EXTRACTOR_MetaType +EXTRACTOR_metatype_get_max () +{ + return HIGHEST_METATYPE_NUMBER; } -/* determine installation path */ -static char * cut_bin(char * in) { - size_t p; +/** + * Linked list of extractor plugins. An application builds this list + * by telling libextractor to load various keyword-extraction + * plugins. Libraries can also be unloaded (removed from this list, + * see EXTRACTOR_plugin_remove). + */ +struct EXTRACTOR_PluginList +{ + /** + * This is a linked list. + */ + struct EXTRACTOR_PluginList *next; + + /** + * Pointer to the plugin (as returned by lt_dlopen). + */ + void * libraryHandle; + + /** + * Name of the library (i.e., 'libextractor_foo.so') + */ + char *libname; + + /** + * Pointer to the function used for meta data extraction. + */ + EXTRACTOR_ExtractMethod extractMethod; + + /** + * Options for the plugin. + */ + char * plugin_options; + + /** + * Flags to control how the plugin is executed. 
+ */ + enum EXTRACTOR_Options flags; + + /** + * Process ID of the child process for this plugin. 0 for + * none. + */ + pid_t cpid; + + /** + * Pipe used to send information about shared memory segments to + * the child process. NULL if not initialized. + */ + FILE *cpipe_in; + + /** + * Pipe used to read information about extracted meta data from + * the child process. -1 if not initialized. + */ + int cpipe_out; + +}; - if (in == NULL) - return NULL; - p = strlen(in); - if (p > 4) { - if ( (in[p-1] == '/') || - (in[p-1] == '\\') ) - in[--p] = '\0'; - if (0 == strcmp(&in[p-3], - "bin")) { - in[p-3] = '\0'; - p -= 3; - } - } - return in; -} -static char * cut_lib(char * in) { +/** + * Remove a trailing '/bin' from in (if present). + */ +static char * +cut_bin(char * in) { size_t p; if (in == NULL) @@ -316,7 +480,7 @@ static char * cut_lib(char * in) { (in[p-1] == '\\') ) in[--p] = '\0'; if (0 == strcmp(&in[p-3], - "lib")) { + "bin")) { in[p-3] = '\0'; p -= 3; } @@ -324,7 +488,6 @@ static char * cut_lib(char * in) { return in; } - #if LINUX /** * Try to determine path by reading /proc/PID/exe or @@ -490,404 +653,337 @@ get_path_from_PATH() { return NULL; } -static char * -get_path_from_ENV_PREFIX() { - const char * p; - p = getenv("LIBEXTRACTOR_PREFIX"); - if (p != NULL) { - char * s = malloc(strlen(p) + 6); - if (s != NULL) { - int len; - strcpy(s, p); - s = cut_bin(cut_lib(s)); - len = strlen(s); - s = realloc(s, len + 6); - if (len > 0 && s[len-1] != '/') - strcat(s, "/lib/"); - else - strcat(s, "lib/"); - return s; - } - } - return NULL; -} +/** + * Function to call on paths. + * + * @param cls closure + * @param path a directory path + */ +typedef void (*PathProcessor)(void *cls, + const char *path); -/* - * @brief get the path to the plugin directory - * @return a pointer to the dir path (to be freed by the caller) + +/** + * Create a filename by appending 'fname' to 'path'. 
+ * + * @param path the base path + * @param fname the filename to append + * @return '$path/$fname' */ -static char * os_get_installation_path() { - size_t n; - char * tmp; - char * lpref; - char * pexe; - char * modu; - char * dima; - char * path; +static char * +append_to_dir (const char *path, + const char *fname) +{ + char *ret; - lpref = get_path_from_ENV_PREFIX(); -#if LINUX - pexe = get_path_from_proc_exe(); -#else - pexe = NULL; -#endif -#if WINDOWS - modu = get_path_from_module_filename(); -#else - modu = NULL; -#endif -#if DARWIN - dima = get_path_from_dyld_image(); - path = NULL; + ret = malloc (strlen (path) + strlen(fname) + 2); + sprintf (ret, +#ifdef MINGW + "%s\%s", #else - dima = NULL; - path = get_path_from_PATH(); + "%s/%s", #endif - n = 1; - if (lpref != NULL) - n += strlen(lpref) + strlen(PLUGINDIR "/:"); - if (pexe != NULL) - n += strlen(pexe) + strlen(PLUGINDIR "/:"); - if (modu != NULL) - n += strlen(modu) + strlen(PLUGINDIR "/:"); - if (dima != NULL) - n += strlen(dima) + strlen(PLUGINDIR "/:"); - if (path != NULL) - n += strlen(path) + strlen(PLUGINDIR "/:"); - tmp = malloc(n); - tmp[0] = '\0'; - if (lpref != NULL) { - strcat(tmp, lpref); - strcat(tmp, PLUGINDIR "/:"); - free(lpref); - } - if (pexe != NULL) { - strcat(tmp, pexe); - strcat(tmp, PLUGINDIR "/:"); - free(pexe); - } - if (modu != NULL) { - strcat(tmp, modu); - strcat(tmp, PLUGINDIR "/:"); - free(modu); - } - if (dima != NULL) { - strcat(tmp, dima); - strcat(tmp, PLUGINDIR "/:"); - free(dima); - } - if (path != NULL) { - strcat(tmp, path); - strcat(tmp, PLUGINDIR "/:"); - free(path); - } - if (strlen(tmp) > 0) - tmp[strlen(tmp)-1] = '\0'; - if (strlen(tmp) == 0) { - free(tmp); - return NULL; - } - return tmp; + path, + fname); + return ret; } -/* ************library initialization ***************** */ - -static char * old_dlsearchpath = NULL; - -/* using libtool, needs init! 
*/ -void __attribute__ ((constructor)) le_ltdl_init() { - int err; - const char * opath; +/** + * Iterate over all paths where we expect to find GNU libextractor + * plugins. + * + * @param pp function to call for each path + * @param pp_cls cls argument for pp. + */ +static void +get_installation_paths (PathProcessor pp, + void *pp_cls) +{ + const char *p; char * path; - char * cpath; + char * prefix; + char * d; -#if ENABLE_NLS - BINDTEXTDOMAIN(PACKAGE, LOCALEDIR); - BINDTEXTDOMAIN("iso-639", ISOLOCALEDIR); /* used by wordextractor */ + prefix = NULL; + p = getenv("LIBEXTRACTOR_PREFIX"); + if (p != NULL) + { + d = strdup (p); + prefix = strtok (d, ":"); + while (NULL != prefix) + { + pp (pp_cls, prefix); + prefix = strtok (NULL, ":"); + } + free (d); + return; + } +#if LINUX + if (prefix == NULL) + prefix = get_path_from_proc_exe(); #endif - err = lt_dlinit (); - if (err > 0) { -#if DEBUG - fprintf(stderr, - _("Initialization of plugin mechanism failed: %s!\n"), - lt_dlerror()); +#if WINDOWS + if (prefix == NULL) + prefix = get_path_from_module_filename(); +#endif +#if DARWIN + if (prefix == NULL) + prefix = get_path_from_dyld_image(); #endif + if (prefix == NULL) + prefix = get_path_from_PATH(); + if (prefix == NULL) return; - } - opath = lt_dlgetsearchpath(); - if (opath != NULL) - old_dlsearchpath = strdup(opath); - path = os_get_installation_path(); - if (path != NULL) { - if (opath != NULL) { - cpath = malloc(strlen(path) + strlen(opath) + 4); - strcpy(cpath, opath); - strcat(cpath, ":"); - strcat(cpath, path); - lt_dlsetsearchpath(cpath); - free(path); - free(cpath); - } else { - lt_dlsetsearchpath(path); - free(path); + if (prefix != NULL) + { + path = append_to_dir (prefix, PLUGINDIR); + pp (pp_cls, path); + free (path); + free (prefix); + return; } - } -#ifdef MINGW - InitWinEnv(); -#endif } -void __attribute__ ((destructor)) le_ltdl_fini() { - lt_dlsetsearchpath(old_dlsearchpath); - if (old_dlsearchpath != NULL) { - free(old_dlsearchpath); - 
old_dlsearchpath = NULL; - } -#ifdef MINGW - ShutdownWinEnv(); -#endif - lt_dlexit (); -} -/** - * Open a file - */ -static int fileopen(const char *filename, int oflag, ...) +struct DefaultLoaderContext { - int mode; - char *fn; - -#ifdef MINGW - char szFile[_MAX_PATH + 1]; - long lRet; - - if ((lRet = plibc_conv_to_win_path(filename, szFile)) != ERROR_SUCCESS) - { - errno = ENOENT; - SetLastError(lRet); - - return -1; - } - fn = szFile; -#else - fn = (char *) filename; -#endif - - if (oflag & O_CREAT) - { - va_list arg; - va_start(arg, oflag); - mode = va_arg(arg, int); - va_end(arg); - } - else - { - mode = 0; - } + struct EXTRACTOR_PluginList *res; + enum EXTRACTOR_Options flags; +}; -#ifdef MINGW - /* Set binary mode */ - mode |= O_BINARY; -#endif - return open(fn, oflag, mode); +/** + * Load all plugins from the given directory. + * + * @param cls pointer to the "struct EXTRACTOR_PluginList*" to extend + * @param path path to a directory with plugins + */ +static void +load_plugins_from_dir (void *cls, + const char *path) +{ + struct DefaultLoaderContext *dlc = cls; + DIR *dir; + struct dirent *ent; + char *fname; + const char *la; + + dir = opendir (path); + if (NULL == dir) + return; + while (NULL != (ent = readdir (dir))) + { + if (ent->d_name[0] == '.') + continue; + if ( (NULL != (la = strstr (ent->d_name, ".la"))) && + (la[3] == '\0') ) + continue; /* only load '.so' and '.dll' */ + fname = append_to_dir (path, ent->d_name); + dlc->res = EXTRACTOR_plugin_add (dlc->res, + fname, + NULL, + dlc->flags); + free (fname); + } + closedir (dir); } - /** - * Load the default set of libraries. The default set of - * libraries consists of the libraries that are part of - * the libextractor distribution (except split and filename - * extractor) plus the extractors that are specified - * in the environment variable "LIBEXTRACTOR_LIBRARIES". + * Load the default set of plugins. The default can be changed + * by setting the LIBEXTRACTOR_LIBRARIES environment variable. 
+ * If it is set to "env", then this function will return + * EXTRACTOR_plugin_add_config (NULL, env, flags). Otherwise, + * it will load all of the installed plugins and return them. * - * @return the default set of libraries. + * @param flags options for all of the plugins loaded + * @return the default set of plugins, NULL if no plugins were found */ -EXTRACTOR_ExtractorList * -EXTRACTOR_loadDefaultLibraries () +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags) { + struct DefaultLoaderContext dlc; char *env; - char *tmp; - EXTRACTOR_ExtractorList *res; - env = getenv ("LIBEXTRACTOR_LIBRARIES"); - if (env == NULL) - { - return EXTRACTOR_loadConfigLibraries (NULL, DEFAULT_LIBRARIES); - } - tmp = malloc (strlen (env) + strlen (DEFAULT_LIBRARIES) + 2); - strcpy (tmp, env); - strcat (tmp, ":"); - strcat (tmp, DEFAULT_LIBRARIES); - res = EXTRACTOR_loadConfigLibraries (NULL, tmp); - free (tmp); - return res; + if (env != NULL) + return EXTRACTOR_plugin_add_config (NULL, env, flags); + dlc.res = NULL; + dlc.flags = flags; + get_installation_paths (&load_plugins_from_dir, + &dlc); + return dlc.res; } + /** - * Get the textual name of the keyword. - * @return NULL if the type is not known + * Try to resolve a plugin function. 
+ * + * @param lib_handle library to search for the symbol + * @param prefix prefix to add + * @param sym_name base name for the symbol + * @return NULL on error, otherwise pointer to the symbol */ -const char * -EXTRACTOR_getKeywordTypeAsString(const EXTRACTOR_KeywordType type) +static void * +get_symbol_with_prefix(void *lib_handle, + const char *prefix) { - if ((type >= 0) && (type < HIGHEST_TYPE_NUMBER)) - return keywordTypes[type]; - else + char *name; + void *symbol; + const char *sym_name; + char *sym; + char *dot; + + sym_name = strstr (prefix, "_"); + if (sym_name == NULL) return NULL; -} - -static pthread_mutex_t ltdl_lock = PTHREAD_MUTEX_INITIALIZER; - -#define LTDL_MUTEX_LOCK \ - if (pthread_mutex_lock (&ltdl_lock) != 0) \ - abort(); -#define LTDL_MUTEX_UNLOCK \ - if (pthread_mutex_unlock (&ltdl_lock) != 0) \ - abort(); - -static void *getSymbolWithPrefix(void *lib_handle, - const char *lib_name, - const char *sym_name) -{ - size_t name_size - = strlen(lib_name) - + strlen(sym_name) - + 1 /* for the zero delim. */ - + 1 /* for the optional '_' prefix */; - char *name=malloc(name_size),*first_error; - void *symbol=NULL; - - snprintf(name, - name_size, - "_%s%s", - lib_name, - sym_name); - - LTDL_MUTEX_LOCK - symbol=lt_dlsym(lib_handle,name+1 /* skip the '_' */); - if (symbol==NULL) { - first_error=strdup(lt_dlerror()); - symbol=lt_dlsym(lib_handle,name /* now try with the '_' */); + sym_name++; + sym = strdup (sym_name); + dot = strstr (sym, "."); + if (dot != NULL) + *dot = '\0'; + name = malloc(strlen(sym) + 32); + sprintf(name, + "_EXTRACTOR_%s_extract", + sym); + free (sym); + /* try without '_' first */ + symbol = lt_dlsym(lib_handle, name + 1); + if (symbol==NULL) + { + /* now try with the '_' */ #if DEBUG - fprintf(stderr, - _("Resolving symbol `%s' in library `%s' failed, " - "so I tried `%s', but that failed also. 
Errors are: " - "`%s' and `%s'.\n"), - name+1, - lib_name, - name, - first_error, - lt_dlerror()); + char *first_error = strdup (lt_dlerror()); #endif - free(first_error); - } - LTDL_MUTEX_UNLOCK + symbol = lt_dlsym(lib_handle, name); +#if DEBUG + if (NULL == symbol) + { + fprintf(stderr, + "Resolving symbol `%s' failed, " + "so I tried `%s', but that failed also. Errors are: " + "`%s' and `%s'.\n", + name+1, + name, + first_error, + lt_dlerror()); + } + free(first_error); +#endif + } free(name); return symbol; } + /** - * Load a dynamic library. - * @return 1 on success, -1 on error + * Load a plugin. + * + * @param name name of the plugin + * @param libhandle set to the handle for the plugin + * @param method set to the extraction method + * @return 0 on success, -1 on error */ static int -loadLibrary (const char *name, +plugin_load (const char *name, void **libHandle, - ExtractMethod * method) + EXTRACTOR_ExtractMethod * method) { lt_dladvise advise; - LTDL_MUTEX_LOCK - lt_dladvise_init(&advise); - lt_dladvise_ext(&advise); - lt_dladvise_local(&advise); + lt_dladvise_init (&advise); + lt_dladvise_ext (&advise); + lt_dladvise_local (&advise); *libHandle = lt_dlopenadvise (name, advise); lt_dladvise_destroy(&advise); if (*libHandle == NULL) { #if DEBUG fprintf (stderr, - _("Loading `%s' plugin failed: %s\n"), + "Loading `%s' plugin failed: %s\n", name, lt_dlerror ()); #endif - LTDL_MUTEX_UNLOCK return -1; } - LTDL_MUTEX_UNLOCK - - *method = (ExtractMethod) getSymbolWithPrefix (*libHandle, name, "_extract"); - if (*method == NULL) { - LTDL_MUTEX_LOCK - lt_dlclose (*libHandle); - LTDL_MUTEX_UNLOCK - return -1; - } - return 1; + *method = get_symbol_with_prefix (*libHandle, name); + if (*method == NULL) + { + lt_dlclose (*libHandle); + return -1; + } + return 0; } -/* Internal function that accepts options. 
*/ -static EXTRACTOR_ExtractorList * -EXTRACTOR_addLibrary2 (EXTRACTOR_ExtractorList * prev, - const char *library, const char *options) + +/** + * Add a library for keyword extraction. + * + * @param prev the previous list of libraries, may be NULL + * @param library the name of the library + * @param flags options to use + * @return the new list of libraries, equal to prev iff an error occured + */ +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add (struct EXTRACTOR_PluginList * prev, + const char *library, + const char *options, + enum EXTRACTOR_Options flags) { - EXTRACTOR_ExtractorList *result; + struct EXTRACTOR_PluginList *result; void *handle; - ExtractMethod method; + EXTRACTOR_ExtractMethod method; - if (-1 == loadLibrary (library, &handle, &method)) + if (0 != plugin_load (library, &handle, &method)) return prev; - result = malloc (sizeof (EXTRACTOR_ExtractorList)); + result = malloc (sizeof (struct EXTRACTOR_PluginList)); result->next = prev; result->libraryHandle = handle; result->extractMethod = method; result->libname = strdup (library); - if( options ) - result->options = strdup (options); + result->flags = flags; + if (NULL != options) + result->plugin_options = strdup (options); else - result->options = NULL; + result->plugin_options = NULL; return result; } + /** - * Add a library for keyword extraction. + * Add a library for keyword extraction at the END of the list. * @param prev the previous list of libraries, may be NULL * @param library the name of the library - * @return the new list of libraries, equal to prev iff an error occured + * @param options options to give to the library + * @param flags options to use + * @return the new list of libraries, always equal to prev + * except if prev was NULL and no error occurs */ -EXTRACTOR_ExtractorList * -EXTRACTOR_addLibrary (EXTRACTOR_ExtractorList * prev, - const char *library) -{ - return EXTRACTOR_addLibrary2(prev, library, NULL); -} - -/* Internal function which takes options. 
*/ -static EXTRACTOR_ExtractorList * -EXTRACTOR_addLibraryLast2 (EXTRACTOR_ExtractorList * prev, - const char *library, const char *options) +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_last(struct EXTRACTOR_PluginList *prev, + const char *library, + const char *options, + enum EXTRACTOR_Options flags) { - EXTRACTOR_ExtractorList *result; - EXTRACTOR_ExtractorList *pos; + struct EXTRACTOR_PluginList *result; + struct EXTRACTOR_PluginList *pos; void *handle; - ExtractMethod method; + EXTRACTOR_ExtractMethod method; - if (-1 == loadLibrary (library, &handle, &method)) + if (0 != plugin_load (library, &handle, &method)) return prev; - result = malloc (sizeof (EXTRACTOR_ExtractorList)); + result = malloc (sizeof (struct EXTRACTOR_PluginList)); result->next = NULL; result->libraryHandle = handle; result->extractMethod = method; result->libname = strdup (library); if( options ) - result->options = strdup (options); + result->plugin_options = strdup (options); else - result->options = NULL; + result->plugin_options = NULL; + result->flags = flags; if (prev == NULL) return result; pos = prev; @@ -897,42 +993,33 @@ EXTRACTOR_addLibraryLast2 (EXTRACTOR_ExtractorList * prev, return prev; } -/** - * Add a library for keyword extraction at the END of the list. - * @param prev the previous list of libraries, may be NULL - * @param library the name of the library - * @return the new list of libraries, always equal to prev - * except if prev was NULL and no error occurs - */ -EXTRACTOR_ExtractorList * -EXTRACTOR_addLibraryLast (EXTRACTOR_ExtractorList * prev, - const char *library) -{ - return EXTRACTOR_addLibraryLast2(prev, library, NULL); -} /** * Load multiple libraries as specified by the user. + * * @param config a string given by the user that defines which * libraries should be loaded. Has the format - * "[[-]LIBRARYNAME[:[-]LIBRARYNAME]*]". 
For example, - * libextractor_mp3.so:libextractor_ogg.so loads the + * "[[-]LIBRARYNAME[(options)][:[-]LIBRARYNAME[(options)]]]*". + * For example, + * /usr/lib/libextractor/libextractor_mp3.so:/usr/lib/libextractor/libextractor_ogg.so loads the * mp3 and the ogg library. The '-' before the LIBRARYNAME * indicates that the library should be added to the end * of the library list (addLibraryLast). * @param prev the previous list of libraries, may be NULL + * @param flags options to use * @return the new list of libraries, equal to prev iff an error occured * or if config was empty (or NULL). */ -EXTRACTOR_ExtractorList * -EXTRACTOR_loadConfigLibraries (EXTRACTOR_ExtractorList * prev, - const char *config) +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_add_config (struct EXTRACTOR_PluginList * prev, + const char *config, + enum EXTRACTOR_Options flags) { char *cpy; - int pos; - int last; - int lastconf; - int len; + size_t pos; + size_t last; + ssize_t lastconf; + size_t len; if (config == NULL) return prev; @@ -966,36 +1053,39 @@ EXTRACTOR_loadConfigLibraries (EXTRACTOR_ExtractorList * prev, if (cpy[last] == '-') { last++; - if( lastconf != -1 ) - prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last], - &cpy[lastconf]); - else - prev = EXTRACTOR_addLibraryLast2 (prev, &cpy[last], NULL); + prev = EXTRACTOR_plugin_add_last (prev, + &cpy[last], + (lastconf != -1) ? &cpy[lastconf] : NULL, + flags); } else - if( lastconf != -1 ) - prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], &cpy[lastconf]); - else - prev = EXTRACTOR_addLibrary2 (prev, &cpy[last], NULL); - + { + prev = EXTRACTOR_plugin_add (prev, + &cpy[last], + (lastconf != -1) ? &cpy[lastconf] : NULL, + flags); + } last = pos; } free (cpy); return prev; } + /** - * Remove a library for keyword extraction. - * @param prev the current list of libraries - * @param library the name of the library to remove - * @return the reduced list, unchanged if the library was not loaded + * Remove a plugin from a list. 
+ * + * @param prev the current list of plugins + * @param library the name of the plugin to remove + * @return the reduced list, unchanged if the plugin was not loaded */ -EXTRACTOR_ExtractorList * -EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev, - const char *library) +struct EXTRACTOR_PluginList * +EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * prev, + const char * library) { - EXTRACTOR_ExtractorList *pos; - EXTRACTOR_ExtractorList *first; + struct EXTRACTOR_PluginList *pos; + struct EXTRACTOR_PluginList *first; + pos = prev; first = prev; while ((pos != NULL) && (0 != strcmp (pos->libname, library))) @@ -1011,57 +1101,462 @@ EXTRACTOR_removeLibrary(EXTRACTOR_ExtractorList * prev, else prev->next = pos->next; /* found */ + /* FIXME: stop sub-process! */ free (pos->libname); - if( pos->options ) - free (pos->options); - if( pos->libraryHandle ) { - LTDL_MUTEX_LOCK - lt_dlclose (pos->libraryHandle); - LTDL_MUTEX_UNLOCK - } + free (pos->plugin_options); + if (NULL != pos->libraryHandle) + lt_dlclose (pos->libraryHandle); free (pos); } #if DEBUG else fprintf(stderr, - _("Unloading plugin `%s' failed!\n"), + "Unloading plugin `%s' failed!\n", library); #endif return first; } + /** - * Remove all extractors. - * @param libraries the list of extractors + * Remove all plugins from the given list (destroys the list). 
+ * + * @param plugin the list of plugins */ -void -EXTRACTOR_removeAll (EXTRACTOR_ExtractorList * libraries) +void +EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *plugins) { - while (libraries != NULL) - libraries = EXTRACTOR_removeLibrary (libraries, libraries->libname); + while (plugins != NULL) + plugins = EXTRACTOR_plugin_remove (plugins, plugins->libname); } +static int +write_all (int fd, + const void *buf, + size_t size) +{ + const char *data = buf; + size_t off = 0; + ssize_t ret; + + while (off < size) + { + ret = write (fd, &data[off], size - off); + if (ret <= 0) + return -1; + off += ret; + } + return 0; +} + + +static int +read_all (int fd, + void *buf, + size_t size) +{ + char *data = buf; + size_t off = 0; + ssize_t ret; + + while (off < size) + { + ret = read (fd, &data[off], size - off); + if (ret <= 0) + return -1; + off += ret; + } + return 0; +} + /** - * How many bytes do we actually try to scan? (from the beginning - * of the file). Limit to 1 GB. + * Header used for our IPC replies. A header + * with all fields being zero is used to indicate + * the end of the stream. */ -#define MAX_READ 1024 * 1024 * 1024 +struct IpcHeader +{ + enum EXTRACTOR_MetaType type; + enum EXTRACTOR_MetaFormat format; + size_t data_len; + size_t mime_len; +}; + /** - * How many bytes do we actually try to decompress? (from the beginning - * of the file). Limit to 16 MB. + * Function called by a plugin in a child process. Transmits + * the meta data back to the parent process. + * + * @param cls closure, "int*" of the FD for transmission + * @param plugin_name name of the plugin that produced this value; + * special values can be used (i.e. '<zlib>' for zlib being + * used in the main libextractor library and yielding + * meta data). 
+ * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return 0 to continue extracting, 1 to abort (transmission error) + */ +static int +transmit_reply (void *cls, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len) +{ + int *cpipe_out = cls; + struct IpcHeader hdr; + size_t mime_len; + + if (data_mime_type == NULL) + mime_len = 0; + else + mime_len = strlen (data_mime_type) + 1; + if (mime_len > MAX_MIME_LEN) + mime_len = MAX_MIME_LEN; + hdr.type = type; + hdr.format = format; + hdr.data_len = data_len; + hdr.mime_len = mime_len; + if ( (hdr.type == 0) && + (hdr.format == 0) && + (hdr.data_len == 0) && + (hdr.mime_len == 0) ) + return 0; /* better skip this one, would signal termination... */ + if ( (0 != write_all (*cpipe_out, &hdr, sizeof(hdr))) || + (0 != write_all (*cpipe_out, data_mime_type, mime_len)) || + (0 != write_all (*cpipe_out, data, data_len)) ) + return 1; + return 0; +} + + + + +/** + * 'main' function of the child process. + * Reads shm-filenames from 'in' (line-by-line) and + * writes meta data blocks to 'out'. The meta data + * stream is terminated by an empty entry. 
+ * + * @param plugin extractor plugin to use + * @param in stream to read from + * @param out stream to write to */ -#define MAX_DECOMPRESS 16 * 1024 * 1024 +static void +process_requests (struct EXTRACTOR_PluginList *plugin, + int in, + int out) +{ + char fn[256]; + FILE *fin; + void *ptr; + int shmid; + struct stat sbuf; + struct IpcHeader hdr; + + memset (&hdr, 0, sizeof (hdr)); + fin = fdopen (in, "r"); + while (NULL != fgets (fn, sizeof(fn), fin)) + { + if ( (-1 != (shmid = shm_open (fn, O_RDONLY, 0))) && + (0 == fstat (shmid, &sbuf)) && + (NULL != (ptr = shmat (shmid, NULL, SHM_RDONLY))) ) + { + if (0 != plugin->extractMethod (ptr, + sbuf.st_size, + &transmit_reply, + &out, + plugin->plugin_options)) + break; + if (0 != write_all (out, &hdr, sizeof(hdr))) + break; + } + if (ptr != NULL) + shmdt (ptr); + if (-1 != shmid) + close (shmid); + } + fclose (fin); + close (out); +} + +/** + * Start the process for the given plugin. + */ +static void +start_process (struct EXTRACTOR_PluginList *plugin) +{ + int p1[2]; + int p2[2]; + pid_t pid; + + if (0 != pipe (p1)) + { + plugin->cpid = -1; + return; + } + if (0 != pipe (p2)) + { + close (p1[0]); + close (p1[1]); + plugin->cpid = -1; + return; + } + pid = fork (); + if (pid == -1) + { + close (p1[0]); + close (p1[1]); + close (p2[0]); + close (p2[1]); + plugin->cpid = -1; + return; + } + if (pid == 0) + { + close (p1[1]); + close (p2[0]); + process_requests (plugin, p1[0], p2[1]); + _exit (0); + } + plugin->cpid = 0; + close (p1[0]); + close (p2[1]); + plugin->cpipe_in = fdopen (p1[1], "w"); + plugin->cpipe_out = p2[0]; +} + + +/** + * Stop the child process of this plugin. 
+ */ +static void +stop_process (struct EXTRACTOR_PluginList *plugin) +{ + int status; + + if (plugin->cpid == -1) + return; + kill (plugin->cpid, SIGKILL); + waitpid (plugin->cpid, &status, 0); + plugin->cpid = -1; + close (plugin->cpipe_out); + plugin->cpipe_out = -1; + fclose (plugin->cpipe_in); + plugin->cpipe_in = NULL; +} + + +/** + * Extract meta data using the given plugin, running the + * actual code of the plugin out-of-process. + * + * @param plugin which plugin to call + * @param shmfn file name of the shared memory segment + * @param proc function to call on the meta data + * @param proc_cls cls for proc + * @return 0 if proc did not return non-zero + */ +static int +extract_oop (struct EXTRACTOR_PluginList *plugin, + const char *shmfn, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + struct IpcHeader hdr; + char mimetype[MAX_MIME_LEN + 1]; + char *data; -static EXTRACTOR_KeywordList * -getKeywords (EXTRACTOR_ExtractorList * extractor, - const char * filename, - const unsigned char * data, - size_t size) { - EXTRACTOR_KeywordList *result; + if (0 <= fprintf (plugin->cpipe_in, "%s\n", shmfn)) + { + stop_process (plugin); + plugin->cpid = -1; + return 0; + } + while (1) + { + if (0 != read_all (plugin->cpipe_out, + &hdr, + sizeof(hdr))) + { + return 0; + } + if ( (hdr.type == 0) && + (hdr.format == 0) && + (hdr.data_len == 0) && + (hdr.mime_len == 0) ) + break; + if (hdr.mime_len > MAX_MIME_LEN) + { + stop_process (plugin); + return 0; + } + data = malloc (hdr.data_len); + if (data == NULL) + { + stop_process (plugin); + return 1; + } + if ( (0 != (read_all (plugin->cpipe_out, + mimetype, + hdr.mime_len))) || + (0 != (read_all (plugin->cpipe_out, + data, + hdr.data_len))) ) + { + stop_process (plugin); + free (data); + return 0; + } + mimetype[hdr.mime_len] = '\0'; + if ( (proc != NULL) && + (0 != proc (proc_cls, + plugin->libname, + hdr.type, + hdr.format, + mimetype, + data, + hdr.data_len)) ) + proc = NULL; + free (data); + } + if (NULL == 
proc) + return 1; + return 0; +} + + +/** + * Extract keywords from a file using the given set of plugins. + * + * @param plugins the list of plugins to use + * @param filename the name of the file, can be NULL + * @param data data to process, never NULL + * @param size number of bytes in data, ignored if data is NULL + * @param proc function to call for each meta data item found + * @param proc_cls cls argument to proc + */ +static void +extract (struct EXTRACTOR_PluginList *plugins, + const char * filename, + const char * data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + struct EXTRACTOR_PluginList *ppos; + int shmid; + enum EXTRACTOR_Options flags; + void *ptr; + char fn[255]; + int want_shm; + + want_shm = 0; + ppos = plugins; + while (NULL != ppos) + { + switch (ppos->flags) + { + case EXTRACTOR_OPTION_NONE: + break; + case EXTRACTOR_OPTION_OUT_OF_PROCESS: + if (0 == plugins->cpid) + start_process (plugins); + want_shm = 1; + break; + case EXTRACTOR_OPTION_AUTO_RESTART: + if ( (0 == plugins->cpid) || + (-1 == plugins->cpid) ) + start_process (plugins); + want_shm = 1; + break; + } + ppos = ppos->next; + } + + if (want_shm) + { + sprintf (fn, + "/tmp/libextractor-shm-%u-XXXXXX", + getpid()); + mktemp (fn); + shmid = shm_open (fn, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + ptr = NULL; + if (shmid != -1) + { + if ( (0 != ftruncate (shmid, size)) || + (NULL == (ptr = shmat (shmid, NULL, 0))) ) + { + close (shmid); + shmid = -1; + } + memcpy (ptr, data, size); + } + } + ppos = plugins; + while (NULL != ppos) + { + flags = ppos->flags; + if (shmid == -1) + flags = EXTRACTOR_OPTION_NONE; + switch (flags) + { + case EXTRACTOR_OPTION_NONE: + if (0 != ppos->extractMethod (data, + size, + proc, + proc_cls, + ppos->plugin_options)) + return; + break; + case EXTRACTOR_OPTION_OUT_OF_PROCESS: + case EXTRACTOR_OPTION_AUTO_RESTART: + if (0 != extract_oop (ppos, fn, proc, proc_cls)) + return; + break; + } + ppos = ppos->next; + } + if (want_shm) + { + 
if (NULL != ptr) + shmdt (ptr); + if (shmid != -1) + close (shmid); + shm_unlink (fn); + unlink (fn); + } +} + + +/** + * If the given data is compressed using gzip or bzip2, decompress + * it. Run 'extract' on the decompressed contents (or the original + * contents if they were not compressed). + * + * @param plugins the list of plugins to use + * @param filename the name of the file, can be NULL + * @param data data to process, never NULL + * @param size number of bytes in data, ignored if data is NULL + * @param proc function to call for each meta data item found + * @param proc_cls cls argument to proc + */ +static void +decompress_and_extract (struct EXTRACTOR_PluginList *plugins, + const char * filename, + const unsigned char * data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) { unsigned char * buf; size_t dsize; #if HAVE_ZLIB @@ -1075,7 +1570,6 @@ getKeywords (EXTRACTOR_ExtractorList * extractor, size_t bpos; #endif - result = NULL; buf = NULL; dsize = 0; #if HAVE_ZLIB @@ -1083,687 +1577,416 @@ getKeywords (EXTRACTOR_ExtractorList * extractor, if ( (size >= 12) && (data[0] == 0x1f) && (data[1] == 0x8b) && - (data[2] == 0x08) ) { - - /* - * Skip gzip header - we might want to retrieve parts of it as keywords - */ - unsigned gzip_header_length = 10; - - if (data[3] & 0x4) /* FEXTRA set */ - gzip_header_length += 2 + (unsigned) (data[10] & 0xff) - + (((unsigned) (data[11] & 0xff)) * 256); - - if(data[3] & 0x8) /* FNAME set */ - { - const unsigned char * cptr = data + gzip_header_length; - - /* - * stored file name is here - * extremely long file names might break the following code. 
- */ - - while(cptr < data + size) - { - if('\0' == *cptr) - break; - - cptr++; - } - gzip_header_length = (cptr - data) + 1; - } - - if(data[3] & 0x16) /* FCOMMENT set */ + (data[2] == 0x08) ) { - const unsigned char * cptr = data + gzip_header_length; - - /* - * stored comment is here - */ - - while(cptr < data + size) - { - if('\0' == *cptr) - break; - - cptr ++; - } - - gzip_header_length = (cptr - data) + 1; - } - - if(data[3] & 0x2) /* FCHRC set */ - gzip_header_length += 2; - - memset(&strm, - 0, - sizeof(z_stream)); + /* Process gzip header */ + unsigned int gzip_header_length = 10; + + if (data[3] & 0x4) /* FEXTRA set */ + gzip_header_length += 2 + (unsigned) (data[10] & 0xff) + + (((unsigned) (data[11] & 0xff)) * 256); + + if (data[3] & 0x8) /* FNAME set */ + { + const unsigned char * cptr = data + gzip_header_length; + /* stored file name is here */ + while (cptr < data + size) + { + if ('\0' == *cptr) + break; + cptr++; + } + if (0 != proc (proc_cls, + "<zlib>", + EXTRACTOR_METATYPE_FILENAME, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + (const char*) (data + gzip_header_length), + cptr - (data + gzip_header_length))) + return; /* done */ + gzip_header_length = (cptr - data) + 1; + } + if (data[3] & 0x16) /* FCOMMENT set */ + { + const unsigned char * cptr = data + gzip_header_length; + /* stored comment is here */ + while (cptr < data + size) + { + if('\0' == *cptr) + break; + cptr ++; + } + if (0 != proc (proc_cls, + "<zlib>", + EXTRACTOR_METATYPE_COMMENT, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + (const char*) (data + gzip_header_length), + cptr - (data + gzip_header_length))) + return; /* done */ + gzip_header_length = (cptr - data) + 1; + } + if(data[3] & 0x2) /* FCHRC set */ + gzip_header_length += 2; + memset(&strm, + 0, + sizeof(z_stream)); #ifdef ZLIB_VERNUM - gzip_header_length = 0; + gzip_header_length = 0; #endif - if (size > gzip_header_length) { - strm.next_in = (Bytef*) data + gzip_header_length; - strm.avail_in = size - 
gzip_header_length; - } else { - strm.next_in = (Bytef*) data; - strm.avail_in = 0; - } - strm.total_in = 0; - strm.zalloc = NULL; - strm.zfree = NULL; - strm.opaque = NULL; - - /* - * note: maybe plain inflateInit(&strm) is adequate, - * it looks more backward-compatible also ; - * - * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ; - * there might be a better check. - */ + if (size > gzip_header_length) + { + strm.next_in = (Bytef*) data + gzip_header_length; + strm.avail_in = size - gzip_header_length; + } + else + { + strm.next_in = (Bytef*) data; + strm.avail_in = 0; + } + strm.total_in = 0; + strm.zalloc = NULL; + strm.zfree = NULL; + strm.opaque = NULL; + + /* + * note: maybe plain inflateInit(&strm) is adequate, + * it looks more backward-compatible also ; + * + * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ; + * there might be a better check. + */ + if (Z_OK == inflateInit2(&strm, #ifdef ZLIB_VERNUM - if (Z_OK == inflateInit2(&strm, - 15 + 32)) { + 15 + 32 #else - if (Z_OK == inflateInit2(&strm, - -MAX_WBITS)) { + -MAX_WBITS #endif - dsize = 2 * size; - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - buf = malloc(dsize); - pos = 0; - if (buf == NULL) { - inflateEnd(&strm); - } else { - strm.next_out = (Bytef*) buf; - strm.avail_out = dsize; - do { - ret = inflate(&strm, - Z_SYNC_FLUSH); - if (ret == Z_OK) { - if (dsize == MAX_DECOMPRESS) - break; - pos += strm.total_out; - strm.total_out = 0; - dsize *= 2; - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - buf = realloc(buf, dsize); - strm.next_out = (Bytef*) &buf[pos]; - strm.avail_out = dsize - pos; - } else if (ret != Z_STREAM_END) { - /* error */ - free(buf); - buf = NULL; + )) { + dsize = 2 * size; + if (dsize > MAX_DECOMPRESS) + dsize = MAX_DECOMPRESS; + buf = malloc(dsize); + pos = 0; + if (buf == NULL) + { + inflateEnd(&strm); + } + else + { + strm.next_out = (Bytef*) buf; + strm.avail_out = dsize; + do + { + ret = inflate(&strm, + Z_SYNC_FLUSH); + if (ret == Z_OK) + { + if 
(dsize == MAX_DECOMPRESS) + break; + pos += strm.total_out; + strm.total_out = 0; + dsize *= 2; + if (dsize > MAX_DECOMPRESS) + dsize = MAX_DECOMPRESS; + buf = realloc(buf, dsize); + strm.next_out = (Bytef*) &buf[pos]; + strm.avail_out = dsize - pos; + } + else if (ret != Z_STREAM_END) + { + /* error */ + free(buf); + buf = NULL; + } + } while ( (buf != NULL) && + (ret != Z_STREAM_END) ); + dsize = pos + strm.total_out; + inflateEnd(&strm); + if (dsize == 0) { + free(buf); + buf = NULL; + } } - } while ( (buf != NULL) && - (ret != Z_STREAM_END) ); - dsize = pos + strm.total_out; - inflateEnd(&strm); - if (dsize == 0) { - free(buf); - buf = NULL; - } } } - } #endif - + #if HAVE_LIBBZ2 if ( (size >= 4) && (data[0] == 'B') && (data[1] == 'Z') && - (data[2] == 'h') ) { - /* now try bz2 decompression */ - memset(&bstrm, - 0, - sizeof(bz_stream)); - bstrm.next_in = (char*) data; - bstrm.avail_in = size; - bstrm.total_in_lo32 = 0; - bstrm.total_in_hi32 = 0; - bstrm.bzalloc = NULL; - bstrm.bzfree = NULL; - bstrm.opaque = NULL; - if ( (buf == NULL) && - (BZ_OK == BZ2_bzDecompressInit(&bstrm, - 0, - 0)) ) { - dsize = 2 * size; - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - buf = malloc(dsize); - bpos = 0; - if (buf == NULL) { - BZ2_bzDecompressEnd(&bstrm); - } else { - bstrm.next_out = (char*) buf; - bstrm.avail_out = dsize; - do { - bret = BZ2_bzDecompress(&bstrm); - if (bret == Z_OK) { - if (dsize == MAX_DECOMPRESS) - break; - bpos += bstrm.total_out_lo32; - bstrm.total_out_lo32 = 0; - dsize *= 2; - if (dsize > MAX_DECOMPRESS) - dsize = MAX_DECOMPRESS; - buf = realloc(buf, dsize); - bstrm.next_out = (char*) &buf[bpos]; - bstrm.avail_out = dsize - bpos; - } else if (bret != BZ_STREAM_END) { - /* error */ - free(buf); - buf = NULL; - } - } while ( (buf != NULL) && - (bret != BZ_STREAM_END) ); - dsize = bpos + bstrm.total_out_lo32; - BZ2_bzDecompressEnd(&bstrm); - if (dsize == 0) { - free(buf); - buf = NULL; + (data[2] == 'h') ) + { + /* now try bz2 decompression 
*/ + memset(&bstrm, + 0, + sizeof(bz_stream)); + bstrm.next_in = (char*) data; + bstrm.avail_in = size; + bstrm.total_in_lo32 = 0; + bstrm.total_in_hi32 = 0; + bstrm.bzalloc = NULL; + bstrm.bzfree = NULL; + bstrm.opaque = NULL; + if ( (buf == NULL) && + (BZ_OK == BZ2_bzDecompressInit(&bstrm, + 0, + 0)) ) + { + dsize = 2 * size; + if (dsize > MAX_DECOMPRESS) + dsize = MAX_DECOMPRESS; + buf = malloc(dsize); + bpos = 0; + if (buf == NULL) + { + BZ2_bzDecompressEnd(&bstrm); + } + else + { + bstrm.next_out = (char*) buf; + bstrm.avail_out = dsize; + do { + bret = BZ2_bzDecompress(&bstrm); + if (bret == Z_OK) + { + if (dsize == MAX_DECOMPRESS) + break; + bpos += bstrm.total_out_lo32; + bstrm.total_out_lo32 = 0; + dsize *= 2; + if (dsize > MAX_DECOMPRESS) + dsize = MAX_DECOMPRESS; + buf = realloc(buf, dsize); + bstrm.next_out = (char*) &buf[bpos]; + bstrm.avail_out = dsize - bpos; + } + else if (bret != BZ_STREAM_END) + { + /* error */ + free(buf); + buf = NULL; + } + } while ( (buf != NULL) && + (bret != BZ_STREAM_END) ); + dsize = bpos + bstrm.total_out_lo32; + BZ2_bzDecompressEnd(&bstrm); + if (dsize == 0) + { + free(buf); + buf = NULL; + } + } } - } } - } -#endif - - - /* finally, call plugins */ - if (buf != NULL) { - data = buf; - size = dsize; - } - while (extractor != NULL) { - result = extractor->extractMethod(filename, - (char*) data, - size, - result, - extractor->options); - extractor = extractor->next; - } +#endif + if (buf != NULL) + { + data = buf; + size = dsize; + } + extract (plugins, + filename, + (const char*) data, + size, + proc, + proc_cls); if (buf != NULL) free(buf); errno = 0; /* kill transient errors */ - return result; } + /** - * Extract keywords from a file using the available extractors. 
- * @param extractor the list of extractor libraries - * @param filename the name of the file - * @return the list of keywords found in the file, NULL if none - * were found (or other errors) + * Open a file */ -EXTRACTOR_KeywordList * -EXTRACTOR_getKeywords (EXTRACTOR_ExtractorList * extractor, - const char * filename) { - EXTRACTOR_KeywordList *result; - int file; - void * buffer; - struct stat fstatbuf; - size_t size; - int eno, dir; - - if (-1 == STAT(filename, &fstatbuf)) - return NULL; +static int file_open(const char *filename, int oflag, ...) +{ + int mode; + const char *fn; +#ifdef MINGW + char szFile[_MAX_PATH + 1]; + long lRet; - if (!S_ISDIR(fstatbuf.st_mode)) { - dir = 0; - -#ifdef O_LARGEFILE - file = fileopen(filename, O_RDONLY | O_LARGEFILE); + if ((lRet = plibc_conv_to_win_path(filename, szFile)) != ERROR_SUCCESS) + { + errno = ENOENT; + SetLastError(lRet); + return -1; + } + fn = szFile; #else - file = fileopen(filename, O_RDONLY); + fn = filename; #endif - if (-1 == file) - return NULL; - - size = (fstatbuf.st_size > 0xFFFFFFFF) ? 0xFFFFFFFF : fstatbuf.st_size; - if (size == 0) { - close(file); - return NULL; - } - - if (size > MAX_READ) - size = MAX_READ; /* do not mmap/read more than 1 GB! */ - buffer = MMAP(NULL, size, PROT_READ, MAP_PRIVATE, file, 0); - if ( (buffer == NULL) || (buffer == (void *) -1) ) { - eno = errno; - close(file); - errno = eno; - return NULL; - } - } - else { - dir = 1; - - size = 0; - buffer = malloc(1); - } - - result = getKeywords(extractor, - filename, - buffer, - size); - - if (dir) - free(buffer); - else { - MUNMAP (buffer, size); - close(file); - } - return result; + mode = 0; +#ifdef MINGW + /* Set binary mode */ + mode |= O_BINARY; +#endif + return open(fn, oflag, mode); } +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif -/** - * Extract keywords from a buffer in memory - * using the available extractors. 
- * - * @param extractor the list of extractor libraries - * @param data the data of the file - * @param size the number of bytes in data - * @return the list of keywords found in the file, NULL if none - * were found (or other errors) - */ -EXTRACTOR_KeywordList * -EXTRACTOR_getKeywords2(EXTRACTOR_ExtractorList * extractor, - const void * data, - size_t size) { - if (data == NULL) - return NULL; - return getKeywords(extractor, - NULL, - data, - size); -} - -static void -removeKeyword (const char *keyword, - const EXTRACTOR_KeywordType type, - const unsigned int options, - EXTRACTOR_KeywordList ** list, - EXTRACTOR_KeywordList * current) { - EXTRACTOR_KeywordList *first; - EXTRACTOR_KeywordList *pos; - EXTRACTOR_KeywordList *prev; - EXTRACTOR_KeywordList *next; - - first = *list; - pos = first; - prev = NULL; - while (pos != NULL) { - if (pos == current) { - prev = pos; - pos = current->next; - } - if (pos == NULL) - break; - if ( (0 == strcmp (pos->keyword, keyword)) && - ( (pos->keywordType == type) || - ( ((options & EXTRACTOR_DUPLICATES_TYPELESS) > 0) && - ( (pos->keywordType == EXTRACTOR_SPLIT) || - (type != EXTRACTOR_SPLIT)) ) || - ( ((options & EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN) > 0) && - (pos->keywordType == EXTRACTOR_UNKNOWN)) ) ) { - /* remove! */ - if (prev == NULL) - first = pos->next; - else - prev->next = pos->next; - next = pos->next; - free (pos->keyword); - free (pos); - pos = next; - } else { - prev = pos; - pos = pos->next; - } - } /* end while */ - *list = first; -} /** - * Remove duplicate keywords from the list. - * @param list the original keyword list (destroyed in the process!) 
- * @param options a set of options (DUPLICATES_XXXX) - * @return a list of keywords without duplicates - */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeDuplicateKeywords (EXTRACTOR_KeywordList * list, - const unsigned int options) { - EXTRACTOR_KeywordList *pos; - - pos = list; - while (pos != NULL) { - removeKeyword(pos->keyword, - pos->keywordType, - options, - &list, - pos); - pos = pos->next; - } - return list; -} - -/** - * Remove empty (all-whitespace) keywords from the list. - * @param list the original keyword list (destroyed in the process!) - * @return a list of keywords without duplicates + * Extract keywords from a file using the given set of plugins. + * If needed, opens the file and loads its data (via mmap). Then + * decompresses it if the data is compressed. Finally runs the + * plugins on the (now possibly decompressed) data. + * + * @param plugins the list of plugins to use + * @param filename the name of the file, can be NULL if data is not NULL + * @param data data of the file in memory, can be NULL (in which + * case libextractor will open file) if filename is not NULL + * @param size number of bytes in data, ignored if data is NULL + * @param proc function to call for each meta data item found + * @param proc_cls cls argument to proc */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list) { - EXTRACTOR_KeywordList * pos; - EXTRACTOR_KeywordList * last; - - last = NULL; - pos = list; - while (pos != NULL) - { - int allWhite; - int i; - allWhite = 1; - for (i=strlen(pos->keyword)-1;i>=0;i--) - if (! 
isspace(pos->keyword[i])) - { - allWhite = 0; - break; - } - if (allWhite) +void +EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, + const char *filename, + const void *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + int fd; + void * buffer; + struct stat fstatbuf; + size_t fsize; + int eno; + + fd = -1; + buffer = NULL; + if ( (data == NULL) && + (filename != NULL) && + (0 == STAT(filename, &fstatbuf)) && + (!S_ISDIR(fstatbuf.st_mode)) && + (-1 != (fd = file_open (filename, + O_RDONLY | O_LARGEFILE))) ) + { + fsize = (fstatbuf.st_size > 0xFFFFFFFF) ? 0xFFFFFFFF : fstatbuf.st_size; + if (fsize == 0) { - EXTRACTOR_KeywordList * next; - next = pos->next; - if (last == NULL) - list = next; - else - last->next = next; - free(pos->keyword); - free(pos); - pos = next; + close(fd); + return; } - else + if (fsize > MAX_READ) + fsize = MAX_READ; + buffer = MMAP(NULL, fsize, PROT_READ, MAP_PRIVATE, fd, 0); + if ( (buffer == NULL) || (buffer == (void *) -1) ) { - last = pos; - pos = pos->next; + eno = errno; + close(fd); + errno = eno; + return; } } - return list; -} - -/** - * Remove keywords of a particular type from the list. - * @param list the original keyword list (altered in the process!) - * @param type the type to remove - * @return a list of keywords without entries of given type - */ -EXTRACTOR_KeywordList * -EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list, - EXTRACTOR_KeywordType type) { - EXTRACTOR_KeywordList * pos; - EXTRACTOR_KeywordList * last; - - last = NULL; - pos = list; - while (pos != NULL) { - if (pos->keywordType == type) { - EXTRACTOR_KeywordList * next; - next = pos->next; - if (last == NULL) - list = next; - else - last->next = next; - free(pos->keyword); - free(pos); - pos = next; - } else { - last = pos; - pos = pos->next; - } - } - return list; + if ( (buffer == NULL) && + (data == NULL) ) + return; + decompress_and_extract (plugins, + filename, + buffer != NULL ? 
buffer : data, + buffer != NULL ? fsize : size, + proc, + proc_cls); + if (buffer != NULL) + MUNMAP (buffer, size); + if (-1 != fd) + close(fd); } #include "iconv.c" /** - * Print a keyword list to a file. - * For debugging. - * @param handle the file to write to (stdout, stderr), may NOT be NULL - * @param keywords the list of keywords to print, may be NULL + * Simple EXTRACTOR_MetaDataProcessor implementation that simply + * prints the extracted meta data to the given file. Only prints + * those keywords that are in UTF-8 format. + * + * @param handle the file to write to (stdout, stderr), must NOT be NULL, + * must be of type "FILE *". + * @param plugin_name name of the plugin that produced this value + * @param type libextractor-type describing the meta data + * @param format basic format information about data + * @param data_mime_type mime-type of data (not of the original file); + * can be NULL (if mime-type is not known) + * @param data actual meta-data found + * @param data_len number of bytes in data + * @return non-zero if printing failed, otherwise 0. 
*/ -void -EXTRACTOR_printKeywords(FILE * handle, - EXTRACTOR_KeywordList * keywords) +int +EXTRACTOR_meta_data_print(void * handle, + const char *plugin_name, + enum EXTRACTOR_MetaType type, + enum EXTRACTOR_MetaFormat format, + const char *data_mime_type, + const char *data, + size_t data_len) { iconv_t cd; char * buf; + int ret; - cd = iconv_open( - nl_langinfo(CODESET) - , "UTF-8"); - while (keywords != NULL) - { - if (cd == (iconv_t) -1) - buf = strdup(keywords->keyword); - else - buf = iconvHelper(cd, - keywords->keyword); - if (keywords->keywordType == EXTRACTOR_THUMBNAIL_DATA) { - fprintf(handle, - _("%s - (binary)\n"), - _(keywordTypes[keywords->keywordType])); - } else { - if (keywords->keywordType >= HIGHEST_TYPE_NUMBER) - fprintf(handle, - _("INVALID TYPE - %s\n"), - buf); - else - fprintf(handle, - "%s - %s\n", - _(keywordTypes[keywords->keywordType]), - buf); - } - free(buf); - keywords = keywords->next; - } - if (cd != (iconv_t) -1) - iconv_close(cd); -} - -/** - * Free the memory occupied by the keyword list (and the - * keyword strings in it!) - * @param keywords the list to free - */ -void -EXTRACTOR_freeKeywords (EXTRACTOR_KeywordList * keywords) -{ - EXTRACTOR_KeywordList *prev; - while (keywords != NULL) - { - prev = keywords; - keywords = keywords->next; - free (prev->keyword); - free (prev); - } -} - -/** - * Return the highest type number, exclusive as in [0,highest). - */ -EXTRACTOR_KeywordType -EXTRACTOR_getHighestKeywordTypeNumber () -{ - return HIGHEST_TYPE_NUMBER; -} - -/** - * Extract the last keyword that of the given type from the keyword list. 
- * @param type the type of the keyword - * @param keywords the keyword list - * @return the last matching keyword, or NULL if none matches - */ -const char * -EXTRACTOR_extractLast (const EXTRACTOR_KeywordType type, - EXTRACTOR_KeywordList * keywords) -{ - char *result = NULL; - while (keywords != NULL) - { - if (keywords->keywordType == type) - result = keywords->keyword; - keywords = keywords->next; - } - return result; + if (format != EXTRACTOR_METAFORMAT_UTF8) + return 0; + cd = iconv_open(nl_langinfo(CODESET), + "UTF-8"); + if (cd == (iconv_t) -1) + return 1; + buf = iconv_helper(cd, data); + ret = fprintf(handle, + "%s - %s\n", + dgettext ("libextractor", + EXTRACTOR_metatype_to_string (type)), + buf); + free(buf); + iconv_close(cd); + if (ret < 0) + return 1; + return 0; } -/** - * Extract the last keyword of the given string from the keyword list. - * @param type the string describing the type of the keyword - * @param keywords the keyword list - * @return the last matching keyword, or NULL if none matches - */ -const char * -EXTRACTOR_extractLastByString (const char * type, - EXTRACTOR_KeywordList * keywords) -{ - char * result = NULL; - - if (type == NULL) - return NULL; - while (keywords != NULL) { - if ( (0 == strcmp(_(keywordTypes[keywords->keywordType]), type)) || - (0 == strcmp(keywordTypes[keywords->keywordType], type) ) ) - result = keywords->keyword; - keywords = keywords->next; - } - return result; -} -/** - * Count the number of keywords in the keyword list. - * @param keywords the keyword list - * @return the number of keywords in the list - */ -unsigned int -EXTRACTOR_countKeywords (EXTRACTOR_KeywordList * keywords) -{ - int count = 0; - while (keywords != NULL) - { - count++; - keywords = keywords->next; - } - return count; -} /** - * Encode the given binary data object - * as a 0-terminated C-string according - * to the LE binary data encoding standard. 
- * - * @return NULL on error, the 0-terminated - * encoding otherwise + * Initialize gettext and libltdl (and W32 if needed). */ -char * EXTRACTOR_binaryEncode(const unsigned char * data, - size_t size) { - - char * binary; - size_t pos; - size_t end; - size_t wpos; - size_t i; - unsigned int markers[8]; /* 256 bits */ - unsigned char marker; - - /* encode! */ - binary = malloc(2 + size + (size+256) / 254); - if (binary == NULL) - return NULL; +void __attribute__ ((constructor)) EXTRACTOR_ltdl_init() { + int err; - pos = 0; - wpos = 0; - while (pos < size) { - /* find unused value between 1 and 255 in - the next 254 bytes */ - end = pos + 254; - if (end < pos) - break; /* integer overflow! */ - if (end > size) - end = size; - memset(markers, - 0, - sizeof(markers)); - for (i=pos;i<end;i++) - markers[data[i]&7] |= 1 << (data[i] >> 3); - marker = 1; - while (markers[marker&7] & (1 << (marker >> 3))) { - marker++; - if (marker == 0) { - /* assertion failed... */ - free(binary); - return NULL; - } - } - /* recode */ - binary[wpos++] = marker; - for (i=pos;i<end;i++) - binary[wpos++] = data[i] == 0 ? marker : data[i]; - pos = end; +#if ENABLE_NLS + BINDTEXTDOMAIN(PACKAGE, LOCALEDIR); + BINDTEXTDOMAIN("iso-639", ISOLOCALEDIR); /* used by wordextractor */ +#endif + err = lt_dlinit (); + if (err > 0) { +#if DEBUG + fprintf(stderr, + _("Initialization of plugin mechanism failed: %s!\n"), + lt_dlerror()); +#endif + return; } - binary[wpos++] = 0; /* 0-termination! */ - return binary; +#ifdef MINGW + plibc_init("GNU", PACKAGE); +#endif } /** - * This function can be used to decode the binary data - * encoded in the libextractor metadata (i.e. for - * the thumbnails). - * - * @param in 0-terminated string from the meta-data - * @return 1 on error, 0 on success + * Deinit. 
*/ -int EXTRACTOR_binaryDecode(const char * in, - unsigned char ** out, - size_t * outSize) { - unsigned char * buf; - size_t pos; - size_t wpos; - unsigned char marker; - size_t i; - size_t end; - size_t inSize; - - inSize = strlen(in); - if (inSize == 0) { - *out = NULL; - *outSize = 0; - return 0; - } - - buf = malloc(inSize); /* slightly more than needed ;-) */ - if (buf == NULL) - return 1; /* error */ - *out = buf; - - pos = 0; - wpos = 0; - while (pos < inSize) { - end = pos + 255; /* 255 here: count the marker! */ - if (end > inSize) - end = inSize; - marker = in[pos++]; - for (i=pos;i<end;i++) - buf[wpos++] = (in[i] == (char) marker) ? 0 : in[i]; - pos = end; - } - *outSize = wpos; - return 0; +void __attribute__ ((destructor)) EXTRACTOR_ltdl_fini() { +#ifdef MINGW + plibc_shutdown(); +#endif + lt_dlexit (); } diff --git a/src/main/iconv.c b/src/main/iconv.c @@ -22,8 +22,9 @@ * Convert the given input using the given converter * and return as a 0-terminated string. */ -static char * iconvHelper(iconv_t cd, - const char * in) { +static char * +iconv_helper(iconv_t cd, + const char * in) { size_t inSize; char * buf; char * ibuf; diff --git a/src/main/test_binary.c b/src/main/test_binary.c @@ -1,66 +0,0 @@ -/* - This file is part of libextractor. - (C) 2005 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. 
If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -#include "platform.h" -#include "extractor.h" - -static int test(const char * buf, - size_t size) { - char * enc; - unsigned char * dec; - size_t out; - - enc = EXTRACTOR_binaryEncode((const unsigned char*) buf, - size); - if (0 != EXTRACTOR_binaryDecode(enc, - &dec, - &out)) { - free(enc); - return 0; - } - free(enc); - if (out != size) { - free(dec); - return 0; - } - if (0 != memcmp(buf, - dec, - size)) { - free(dec); - return 0; - } - free(dec); - return 1; -} - -#define MAX 1024 - -int main(int argc, - char * argv[]) { - unsigned int i; - char buf[MAX]; - - for (i=0;i<MAX;i++) { - buf[i] = (char) rand(); - if (! test(buf, i)) - return -1; - } - return 0; -} diff --git a/src/main/winproc.c b/src/main/winproc.c @@ -1,51 +0,0 @@ -/* - This file is part of GNUnet. - (C) 2001, 2002, 2003, 2004, 2005 Christian Grothoff (and other contributing authors) - - GNUnet is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - GNUnet is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with GNUnet; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. 
-*/ - -/** - * @file util/win/winproc.c - * @brief Functions for MS Windows - * @author Nils Durner - * @note This file differs from GNUnet's winproc.c - */ - -#include "platform.h" - -#ifdef MINGW - -/** - * Initialize PlibC and set up Windows environment - * @return Error code from winerror.h, ERROR_SUCCESS on success -*/ -void InitWinEnv() -{ - plibc_init("GNU", PACKAGE); -} - -/** - * Clean up Windows environment - */ -void ShutdownWinEnv() -{ - plibc_shutdown(); -} - -#endif /* MINGW */ - -/* end of winproc.c */ diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -60,8 +60,8 @@ if HAVE_MPEG2 endif # toggle for development -# SUBDIRS = . -SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) $(rpm) $(xpdfdir) $(exiv2dir) +SUBDIRS = . +# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) $(rpm) $(xpdfdir) $(exiv2dir) if HAVE_VORBISFILE @@ -85,25 +85,46 @@ extraqt = libextractor_qt.la oodir = oo endif -plugin_LTLIBRARIES = $(pdfplugin) \ +plugin_LTLIBRARIES = \ + libextractor_html.la \ + libextractor_it.la \ + libextractor_mime.la + +libextractor_html_la_SOURCES = \ + html_extractor.c +libextractor_html_la_LDFLAGS = \ + $(PLUGINFLAGS) +libextractor_html_la_LIBADD = \ + $(top_builddir)/src/common/libextractor_common.la + +libextractor_it_la_SOURCES = \ + it_extractor.c +libextractor_it_la_LDFLAGS = \ + $(PLUGINFLAGS) + + +libextractor_mime_la_SOURCES = \ + mime_extractor.c +libextractor_mime_la_LDFLAGS = \ + $(PLUGINFLAGS) + + + +OLD_LIBS = \ + $(pdfplugin) \ libextractor_applefile.la \ libextractor_asf.la \ libextractor_deb.la \ libextractor_dvi.la \ libextractor_elf.la \ - libextractor_filename.la \ $(extraflac) \ libextractor_flv.la \ libextractor_gif.la \ - libextractor_html.la \ libextractor_id3v2.la \ libextractor_id3v24.la \ libextractor_id3v23.la \ - libextractor_it.la \ libextractor_jpeg.la \ - libextractor_lower.la \ libextractor_man.la \ - libextractor_mime.la \ libextractor_mp3.la \ 
$(extrampeg) \ libextractor_nsf.la \ @@ -116,11 +137,9 @@ plugin_LTLIBRARIES = $(pdfplugin) \ libextractor_riff.la \ libextractor_s3m.la \ libextractor_sid.la \ - libextractor_split.la \ libextractor_tar.la \ libextractor_tiff.la \ $(thumbqt) \ - libextractor_translit.la \ libextractor_wav.la \ libextractor_xm.la \ libextractor_zip.la @@ -205,13 +224,6 @@ libextractor_id3v24_la_LDFLAGS = \ libextractor_id3v24_la_LIBADD = \ $(top_builddir)/src/common/libextractor_common.la -libextractor_it_la_SOURCES = \ - itextractor.c -libextractor_it_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_it_la_LIBADD = \ - $(top_builddir)/src/main/libextractor.la - libextractor_dvi_la_SOURCES = \ dviextractor.c libextractor_dvi_la_LDFLAGS = \ @@ -231,11 +243,6 @@ libextractor_tar_la_LIBADD = \ $(top_builddir)/src/main/libextractor.la -lz endif -libextractor_lower_la_SOURCES = \ - lowerextractor.c -libextractor_lower_la_LDFLAGS = \ - $(PLUGINFLAGS) - libextractor_gif_la_SOURCES = \ gifextractor.c libextractor_gif_la_LDFLAGS = \ @@ -279,14 +286,6 @@ libextractor_jpeg_la_LDFLAGS = \ libextractor_jpeg_la_LIBADD = \ $(LE_LIBINTL) -libextractor_html_la_SOURCES = \ - htmlextractor.c -libextractor_html_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_html_la_LIBADD = \ - $(top_builddir)/src/main/libextractor.la \ - $(top_builddir)/src/common/libextractor_common.la - libextractor_flv_la_SOURCES = \ flvextractor.c libextractor_flv_la_LDFLAGS = \ @@ -299,13 +298,6 @@ libextractor_real_la_SOURCES = \ libextractor_real_la_LDFLAGS = \ $(PLUGINFLAGS) -libextractor_mime_la_SOURCES = \ - mimeextractor.c -libextractor_mime_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_mime_la_LIBADD = \ - $(top_builddir)/src/main/libextractor.la - if HAVE_MPEG2 libextractor_mpeg_la_SOURCES = \ mpegextractor.c @@ -354,15 +346,6 @@ libextractor_png_la_LIBADD = \ -lz endif -libextractor_filename_la_SOURCES = \ - filenameextractor.c -libextractor_filename_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_filename_la_LIBADD = \ - 
$(top_builddir)/src/main/libextractor.la \ - $(top_builddir)/src/common/libextractor_common.la \ - $(LE_LIBINTL) - libextractor_sid_la_SOURCES = \ sidextractor.c libextractor_sid_la_LDFLAGS = \ @@ -398,18 +381,8 @@ libextractor_s3m_la_LDFLAGS = \ libextractor_s3m_la_LIBADD = \ $(top_builddir)/src/main/libextractor.la -libextractor_split_la_SOURCES = \ - splitextractor.c -libextractor_split_la_LDFLAGS = \ - $(PLUGINFLAGS) - -libextractor_translit_la_SOURCES = \ - translitextractor.c -libextractor_translit_la_LDFLAGS = \ - $(PLUGINFLAGS) - libextractor_thumbnailqt_la_SOURCES = \ - thumbnailextractorqt.cc + thumbnailextractorqt.cc libextractor_thumbnailqt_la_LDFLAGS = \ $(PLUGINFLAGS) libextractor_thumbnailqt_la_LIBADD = \ diff --git a/src/plugins/filenameextractor.c b/src/plugins/filenameextractor.c @@ -1,72 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. 
- */ - -#include "platform.h" -#include "extractor.h" -#include "convert.h" - - -/* "extract" the 'filename' as a keyword */ -struct EXTRACTOR_Keywords * -libextractor_filename_extract (const char *filename, - char *date, - size_t size, struct EXTRACTOR_Keywords *prev) -{ - EXTRACTOR_KeywordList *keyword; - struct stat fstatbuf; - const char *filenameRoot = filename; - int res; - - /* get filename */ - if (filename == NULL) - return prev; - for (res = strlen (filename) - 1; res >= 0; res--) - if (filename[res] == DIR_SEPARATOR) - { - filenameRoot = &filename[res + 1]; - break; - } - keyword = malloc (sizeof (EXTRACTOR_KeywordList)); - keyword->next = prev; - keyword->keyword = EXTRACTOR_common_convert_to_utf8 (filenameRoot, - strlen (filenameRoot), - nl_langinfo (CODESET)); - keyword->keywordType = EXTRACTOR_FILENAME; - prev = keyword; - if (-1 == STAT(filename, &fstatbuf)) - return prev; - keyword = malloc (sizeof (EXTRACTOR_KeywordList)); - keyword->next = prev; - keyword->keyword = malloc (14); - keyword->keywordType = EXTRACTOR_FILE_SIZE; - - if (size >= 1000000000) - snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000000000.0, - _("GB")); - else if (size >= 1000000) - snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000000.0, _("MB")); - else if (size >= 1000) - snprintf (keyword->keyword, 14, "%.2f %s", fstatbuf.st_size / 1000.0, _("KB")); - else - snprintf (keyword->keyword, 14, "%.2f %s", (double) fstatbuf.st_size, _("Bytes")); - - prev = keyword; - return prev; -} diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c @@ -0,0 +1,403 @@ +/* + This file is part of libextractor. + (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. 
+ + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + + */ + +#include "platform.h" +#include "extractor.h" +#include <string.h> +#include "convert.h" + +static struct +{ + const char *name; + enum EXTRACTOR_MetaType type; +} tagmap[] = { + { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, + { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME }, + { "title", EXTRACTOR_METATYPE_TITLE }, + { "dc.title", EXTRACTOR_METATYPE_TITLE}, + { "description", EXTRACTOR_METATYPE_DESCRIPTION }, + { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION }, + { "subject", EXTRACTOR_METATYPE_SUBJECT}, + { "dc.subject", EXTRACTOR_METATYPE_SUBJECT}, + { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, + { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE}, + { "publisher", EXTRACTOR_METATYPE_PUBLISHER }, + { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER}, + { "rights", EXTRACTOR_METATYPE_RIGHTS }, + { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, + { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, + { "language", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, + { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, + { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, + { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, + { "dc.creator", EXTRACTOR_METATYPE_CREATOR}, + { "dc.identifier", EXTRACTOR_METATYPE_URI }, + { "dc.format", EXTRACTOR_METATYPE_FORMAT }, + { NULL, EXTRACTOR_METATYPE_RESERVED } +}; + +static const char *relevantTags[] = { + "title", + "meta", + NULL, +}; + +typedef struct TI +{ + struct TI *next; + const char *tagStart; + const char *tagEnd; + const char *dataStart; + const char *dataEnd; +} TagInfo; + + 
+ + +/* ******************** parser helper functions ************** */ + +static int +tagMatch (const char *tag, const char *s, const char *e) +{ + return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); +} + +static int +lookFor (char c, size_t * pos, const char *data, size_t size) +{ + size_t p = *pos; + + while ((p < size) && (data[p] != c)) + { + if (data[p] == '\0') + return 0; + p++; + } + *pos = p; + return p < size; +} + +static int +skipWhitespace (size_t * pos, const char *data, size_t size) +{ + size_t p = *pos; + + while ((p < size) && (isspace (data[p]))) + { + if (data[p] == '\0') + return 0; + p++; + } + *pos = p; + return p < size; +} + +static int +skipLetters (size_t * pos, const char *data, size_t size) +{ + size_t p = *pos; + + while ((p < size) && (isalpha (data[p]))) + { + if (data[p] == '\0') + return 0; + p++; + } + *pos = p; + return p < size; +} + +static int +lookForMultiple (const char *c, size_t * pos, const char *data, size_t size) +{ + size_t p = *pos; + + while ((p < size) && (strchr (c, data[p]) == NULL)) + { + if (data[p] == '\0') + return 0; + p++; + } + *pos = p; + return p < size; +} + +static void +findEntry (const char *key, + const char *start, + const char *end, const char **mstart, const char **mend) +{ + size_t len; + + *mstart = NULL; + *mend = NULL; + len = strlen (key); + while (start < end - len - 1) + { + start++; + if (start[len] != '=') + continue; + if (0 == strncmp (start, key, len)) + { + start += len + 1; + *mstart = start; + if ((*start == '\"') || (*start == '\'')) + { + start++; + while ((start < end) && (*start != **mstart)) + start++; + (*mstart)++; /* skip quote */ + } + else + { + while ((start < end) && (!isspace (*start))) + start++; + } + *mend = start; + return; + } + } +} + +/** + * Search all tags that correspond to "tagname". 
Example: + * If the tag is <meta name="foo" desc="bar">, and + * tagname == "meta", keyname="name", keyvalue="foo", + * and searchname="desc", then this function returns a + * copy (!) of "bar". Easy enough? + * + * @return NULL if nothing is found + */ +static char * +findInTags (TagInfo * t, + const char *tagname, + const char *keyname, const char *keyvalue, const char *searchname) +{ + const char *pstart; + const char *pend; + + while (t != NULL) + { + if (tagMatch (tagname, t->tagStart, t->tagEnd)) + { + findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); + if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) + { + findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); + if (pstart != NULL) + { + char *ret = malloc (pend - pstart + 1); + memcpy (ret, pstart, pend - pstart); + ret[pend - pstart] = '\0'; + return ret; + } + } + } + t = t->next; + } + return NULL; +} + + +/* mimetype = text/html */ +int +EXTRACTOR_html_extract (const char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + size_t xsize; + TagInfo *tags; + TagInfo *t; + TagInfo tag; + size_t pos; + size_t tpos; + int i; + char *charset; + char *tmp; + char *xtmp; + int ret; + + ret = 0; + if (size == 0) + return 0; + /* only scan first 32k */ + if (size > 1024 * 32) + xsize = 1024 * 32; + else + xsize = size; + tags = NULL; + tag.next = NULL; + pos = 0; + while (pos < xsize) + { + if (!lookFor ('<', &pos, data, size)) + break; + tag.tagStart = &data[++pos]; + if (!skipLetters (&pos, data, size)) + break; + tag.tagEnd = &data[pos]; + if (!skipWhitespace (&pos, data, size)) + break; + STEP3: + if (!lookForMultiple (">\"\'", &pos, data, size)) + break; + if (data[pos] != '>') + { + /* find end-quote, ignore escaped quotes (\') */ + do + { + tpos = pos; + pos++; + if (!lookFor (data[tpos], &pos, data, size)) + break; + } + while (data[pos - 1] == '\\'); + pos++; + goto STEP3; + } + pos++; + if (!skipWhitespace (&pos, data, 
size)) + break; + tag.dataStart = &data[pos]; + if (!lookFor ('<', &pos, data, size)) + break; + tag.dataEnd = &data[pos]; + i = 0; + while (relevantTags[i] != NULL) + { + if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && + (0 == strncasecmp (relevantTags[i], + tag.tagStart, tag.tagEnd - tag.tagStart))) + { + t = malloc (sizeof (TagInfo)); + *t = tag; + t->next = tags; + tags = t; + break; + } + i++; + } + /* abort early if we hit the body tag */ + if (tagMatch ("body", tag.tagStart, tag.tagEnd)) + break; + } + + /* fast exit */ + if (tags == NULL) + return 0; + + charset = NULL; + /* first, try to determine mime type and/or character set */ + tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); + if (tmp != NULL) + { + /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; + if text/html is present, we take that as the mime-type; if charset= + is present, we try to use that for character set conversion. */ + if (0 == strncmp (tmp, "text/html", strlen ("text/html"))) + ret = proc (proc_cls, + "html", + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "text/html", + strlen ("text/html")+1); + charset = strstr (tmp, "charset="); + if (charset != NULL) + charset = strdup (&charset[strlen ("charset=")]); + free (tmp); + } + i = 0; + while (tagmap[i].name != NULL) + { + tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); + if ( (tmp != NULL) && + (ret == 0) ) + { + if (charset == NULL) + ret = proc (proc_cls, + "html", + tagmap[i].type, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + tmp, + strlen (tmp) + 1); + else + { + xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, + strlen (tmp), + charset); + ret = proc (proc_cls, + "html", + tagmap[i].type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + xtmp, + strlen (xtmp) + 1); + free (xtmp); + } + free (tmp); + } + i++; + } + while (tags != NULL) + { + t = tags; + if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && + (ret == 0) 
) + { + if (charset == NULL) + { + xtmp = malloc (t->dataEnd - t->dataStart + 1); + memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); + xtmp[t->dataEnd - t->dataStart] = '\0'; + ret = proc (proc_cls, + "html", + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + xtmp, + strlen (xtmp) + 1); + free (xtmp); + } + else + { + xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, + strlen (tmp), + charset); + ret = proc (proc_cls, + "html", + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + xtmp, + strlen (xtmp) + 1); + free (xtmp); + } + } + tags = t->next; + free (t); + } + free (charset); + return ret; +} diff --git a/src/plugins/htmlextractor.c b/src/plugins/htmlextractor.c @@ -1,446 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. 
- - */ - -#include "platform.h" -#include "extractor.h" -#include <string.h> -#include "convert.h" - -static struct -{ - char *name; - EXTRACTOR_KeywordType type; -} tagmap[] = -{ - { - "author", EXTRACTOR_AUTHOR}, - { - "title", EXTRACTOR_TITLE}, - { - "description", EXTRACTOR_DESCRIPTION}, - { - "language", EXTRACTOR_LANGUAGE}, - { - "rights", EXTRACTOR_COPYRIGHT}, - { - "publisher", EXTRACTOR_PUBLISHER}, - { - "formatter", EXTRACTOR_SOFTWARE}, - { - "copyright", EXTRACTOR_COPYRIGHT}, - { - "abstract", EXTRACTOR_SUMMARY}, - { - "subject", EXTRACTOR_SUBJECT}, - { - "abstract", EXTRACTOR_SUMMARY}, - { - "date", EXTRACTOR_DATE}, - { - "keywords", EXTRACTOR_KEYWORDS}, - { - "dc.author", EXTRACTOR_AUTHOR}, - { - "dc.title", EXTRACTOR_TITLE}, - { - "dc.description", EXTRACTOR_DESCRIPTION}, - { - "dc.subject", EXTRACTOR_SUBJECT}, - { - "dc.creator", EXTRACTOR_CREATOR}, - { - "dc.publisher", EXTRACTOR_PUBLISHER}, - { - "dc.date", EXTRACTOR_DATE}, - { - "dc.format", EXTRACTOR_FORMAT}, - { - "dc.identifier", EXTRACTOR_RESOURCE_IDENTIFIER}, - { - "dc.rights", EXTRACTOR_COPYRIGHT}, - { -NULL, EXTRACTOR_UNKNOWN},}; - -static char *relevantTags[] = { - "title", - "meta", - NULL, -}; - -/* which mime-types should not be subjected to - the HTML extractor (no use trying & parsing - is expensive!) 
*/ -static char *blacklist[] = { - "image/jpeg", - "image/gif", - "image/png", - "image/x-png", - "image/xcf", - "image/tiff", - "application/java", - "application/pdf", - "application/postscript", - "application/elf", - "application/gnunet-directory", - "application/x-gzip", - "application/bz2", - "application/x-rpm", - "application/x-rar", - "application/x-zip", - "application/x-arj", - "application/x-compress", - "application/x-tar", - "application/x-lha", - "application/x-gtar", - "application/x-dpkg", - "application/ogg", - "audio/real", - "audio/x-wav", - "audio/avi", - "audio/midi", - "audio/mpeg", - "video/real", - "video/asf", - "video/quicktime", - NULL, -}; - -typedef struct TI -{ - struct TI *next; - const char *tagStart; - const char *tagEnd; - const char *dataStart; - const char *dataEnd; -} TagInfo; - -/** - * Add a keyword. - */ -static struct EXTRACTOR_Keywords * -addKeyword (EXTRACTOR_KeywordType type, - char *keyword, struct EXTRACTOR_Keywords *next) -{ - EXTRACTOR_KeywordList *result; - - result = malloc (sizeof (EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = keyword; - result->keywordType = type; - return result; -} - -/* ******************** parser helper functions ************** */ - -static int -tagMatch (const char *tag, const char *s, const char *e) -{ - return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); -} - -static int -lookFor (char c, size_t * pos, const char *data, size_t size) -{ - size_t p = *pos; - - while ((p < size) && (data[p] != c)) - { - if (data[p] == '\0') - return 0; - p++; - } - *pos = p; - return p < size; -} - -static int -skipWhitespace (size_t * pos, const char *data, size_t size) -{ - size_t p = *pos; - - while ((p < size) && (isspace (data[p]))) - { - if (data[p] == '\0') - return 0; - p++; - } - *pos = p; - return p < size; -} - -static int -skipLetters (size_t * pos, const char *data, size_t size) -{ - size_t p = *pos; - - while ((p < size) && (isalpha (data[p]))) - { - 
if (data[p] == '\0') - return 0; - p++; - } - *pos = p; - return p < size; -} - -static int -lookForMultiple (const char *c, size_t * pos, const char *data, size_t size) -{ - size_t p = *pos; - - while ((p < size) && (strchr (c, data[p]) == NULL)) - { - if (data[p] == '\0') - return 0; - p++; - } - *pos = p; - return p < size; -} - -static void -findEntry (const char *key, - const char *start, - const char *end, const char **mstart, const char **mend) -{ - size_t len; - - *mstart = NULL; - *mend = NULL; - len = strlen (key); - while (start < end - len - 1) - { - start++; - if (start[len] != '=') - continue; - if (0 == strncmp (start, key, len)) - { - start += len + 1; - *mstart = start; - if ((*start == '\"') || (*start == '\'')) - { - start++; - while ((start < end) && (*start != **mstart)) - start++; - (*mstart)++; /* skip quote */ - } - else - { - while ((start < end) && (!isspace (*start))) - start++; - } - *mend = start; - return; - } - } -} - -/** - * Search all tags that correspond to "tagname". Example: - * If the tag is <meta name="foo" desc="bar">, and - * tagname == "meta", keyname="name", keyvalue="foo", - * and searchname="desc", then this function returns a - * copy (!) of "bar". Easy enough? 
- * - * @return NULL if nothing is found - */ -static char * -findInTags (TagInfo * t, - const char *tagname, - const char *keyname, const char *keyvalue, const char *searchname) -{ - const char *pstart; - const char *pend; - - while (t != NULL) - { - if (tagMatch (tagname, t->tagStart, t->tagEnd)) - { - findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); - if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) - { - findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); - if (pstart != NULL) - { - char *ret = malloc (pend - pstart + 1); - memcpy (ret, pstart, pend - pstart); - ret[pend - pstart] = '\0'; - return ret; - } - } - } - t = t->next; - } - return NULL; -} - - -/* mimetype = text/html */ -struct EXTRACTOR_Keywords * -libextractor_html_extract (const char *filename, - const char *data, - const size_t size, struct EXTRACTOR_Keywords *prev) -{ - size_t xsize; - const char *mime; - TagInfo *tags; - TagInfo *t; - TagInfo tag; - size_t pos; - size_t tpos; - int i; - char *charset; - char *tmp; - - if (size == 0) - return prev; - - mime = EXTRACTOR_extractLast (EXTRACTOR_MIMETYPE, prev); - if (mime != NULL) - { - int j; - j = 0; - while (blacklist[j] != NULL) - { - if (0 == strcmp (blacklist[j], mime)) - return prev; - j++; - } - } - - /* only scan first 32k */ - if (size > 1024 * 32) - xsize = 1024 * 32; - else - xsize = size; - tags = NULL; - tag.next = NULL; - pos = 0; - while (pos < xsize) - { - if (!lookFor ('<', &pos, data, size)) - break; - tag.tagStart = &data[++pos]; - if (!skipLetters (&pos, data, size)) - break; - tag.tagEnd = &data[pos]; - if (!skipWhitespace (&pos, data, size)) - break; - STEP3: - if (!lookForMultiple (">\"\'", &pos, data, size)) - break; - if (data[pos] != '>') - { - /* find end-quote, ignore escaped quotes (\') */ - do - { - tpos = pos; - pos++; - if (!lookFor (data[tpos], &pos, data, size)) - break; - } - while (data[pos - 1] == '\\'); - pos++; - goto STEP3; - } - pos++; - if (!skipWhitespace (&pos, 
data, size)) - break; - tag.dataStart = &data[pos]; - if (!lookFor ('<', &pos, data, size)) - break; - tag.dataEnd = &data[pos]; - i = 0; - while (relevantTags[i] != NULL) - { - if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && - (0 == strncasecmp (relevantTags[i], - tag.tagStart, tag.tagEnd - tag.tagStart))) - { - t = malloc (sizeof (TagInfo)); - *t = tag; - t->next = tags; - tags = t; - break; - } - i++; - } - /* abort early if we hit the body tag */ - if (tagMatch ("body", tag.tagStart, tag.tagEnd)) - break; - } - - /* fast exit */ - if (tags == NULL) - return prev; - - charset = NULL; - - /* first, try to determine mime type and/or character set */ - tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); - if (tmp != NULL) - { - /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; - if text/html is present, we take that as the mime-type; if charset= - is present, we try to use that for character set conversion. */ - if (0 == strncmp (tmp, "text/html", strlen ("text/html"))) - prev = addKeyword (EXTRACTOR_MIMETYPE, strdup ("text/html"), prev); - - charset = strstr (tmp, "charset="); - - if (charset != NULL) - charset = strdup (&charset[strlen ("charset=")]); - free (tmp); - } - if (charset == NULL) - charset = strdup ("ISO-8859-1"); /* try a sensible default */ - - - i = 0; - while (tagmap[i].name != NULL) - { - tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); - if (tmp != NULL) - { - prev = addKeyword (tagmap[i].type, - EXTRACTOR_common_convert_to_utf8 (tmp, - strlen (tmp), charset), prev); - free (tmp); - } - i++; - } - - - while (tags != NULL) - { - t = tags; - if (tagMatch ("title", t->tagStart, t->tagEnd)) - prev = addKeyword (EXTRACTOR_TITLE, - EXTRACTOR_common_convert_to_utf8 (t->dataStart, - t->dataEnd - t->dataStart, - charset), prev); - tags = t->next; - free (t); - } - free (charset); - - return prev; -} diff --git a/src/plugins/it_extractor.c b/src/plugins/it_extractor.c @@ -0,0 
+1,102 @@ +/* + * This file is part of libextractor. + * (C) 2008 Toni Ruottu + * + * libextractor is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2, or (at your + * option) any later version. + * + * libextractor is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libextractor; see the file COPYING. If not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include "platform.h" +#include "extractor.h" + +#define HEADER_SIZE 0xD0 + +struct header +{ + char magicid[4]; + char title[26]; + char hilight[2]; + char orders[2]; + char instruments[2]; + char samples[2]; + char patterns[2]; + char version[2]; + char compatible[2]; + char flags[2]; + char special[2]; +}; + +/* "extract" keyword from an Impulse Tracker module + * + * ITTECH.TXT as taken from IT 2.14p5 was used, + * while this piece of software was originally + * written. 
+ * + */ +int +EXTRACTOR_mime_extract (const char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + char title[27]; + char itversion[8]; + struct header *head; + + /* Check header size */ + if (size < HEADER_SIZE) + return 0; + head = (struct header *) data; + /* Check "magic" id bytes */ + if (memcmp (head->magicid, "IMPM", 4)) + return 0; + /* Mime-type */ + if (0 != proc (proc_cls, + "it", + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "audio/x-it", + strlen("audio/x-it")+1)) + return 1; + + /* Version of Tracker */ + sprintf (itversion, + "%d.%d", + (head->version[0]& 0x01),head->version[1]); + if (0 != proc (proc_cls, + "it", + EXTRACTOR_METATYPE_FORMAT_VERSION, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + itversion, + strlen(itversion)+1)) + return 1; + + /* Song title */ + memcpy (&title, head->title, 26); + title[26] = '\0'; + if (0 != proc (proc_cls, + "it", + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + title, + strlen(title)+1)) + return 1; + return 0; +} diff --git a/src/plugins/itextractor.c b/src/plugins/itextractor.c @@ -1,107 +0,0 @@ -/* - * This file is part of libextractor. - * (C) 2008 Toni Ruottu - * - * libextractor is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published - * by the Free Software Foundation; either version 2, or (at your - * option) any later version. - * - * libextractor is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with libextractor; see the file COPYING. If not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. 
- * - */ - -#include "platform.h" -#include "extractor.h" -#include "convert.h" - -#define HEADER_SIZE 0xD0 - -struct header -{ - char magicid[4]; - char title[26]; - char hilight[2]; - char orders[2]; - char instruments[2]; - char samples[2]; - char patterns[2]; - char version[2]; - char compatible[2]; - char flags[2]; - char special[2]; -}; - - -static struct EXTRACTOR_Keywords *addkword - (EXTRACTOR_KeywordList * oldhead, - const char *phrase, EXTRACTOR_KeywordType type) -{ - EXTRACTOR_KeywordList *keyword; - - keyword = malloc (sizeof (EXTRACTOR_KeywordList)); - keyword->next = oldhead; - keyword->keyword = strdup (phrase); - keyword->keywordType = type; - return (keyword); -} - - -/* "extract" keyword from an Impulse Tracker module - * - * ITTECH.TXT as taken from IT 2.14p5 was used, - * while this piece of software was originally - * written. - * - */ -struct EXTRACTOR_Keywords *libextractor_it_extract - (const char *filename, - char *data, size_t size, struct EXTRACTOR_Keywords *prev) -{ - char title[27]; - char itversion[8]; - struct header *head; - - /* Check header size */ - - if (size < HEADER_SIZE) - { - return (prev); - } - - head = (struct header *) data; - - /* Check "magic" id bytes */ - - if (memcmp (head->magicid, "IMPM", 4)) - { - return (prev); - } - - /* Mime-type */ - - prev = addkword (prev, "audio/x-it", EXTRACTOR_MIMETYPE); - - - /* Version of Tracker */ - - sprintf (itversion, "%d.%d", (head->version[0]& 0x01),head->version[1]); - prev = addkword (prev, itversion, EXTRACTOR_FORMAT_VERSION); - - /* Song title */ - - memcpy (&title, head->title, 26); - title[26] = '\0'; - prev = addkword (prev, title, EXTRACTOR_TITLE); - - return (prev); - -} diff --git a/src/plugins/lowerextractor.c b/src/plugins/lowerextractor.c @@ -1,80 +0,0 @@ -/* - This file is part of libextractor. 
- (C) 2002, 2003 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -#include "platform.h" -#include "extractor.h" - -static void -addKeyword (struct EXTRACTOR_Keywords **list, - const char *keyword, EXTRACTOR_KeywordType type) -{ - EXTRACTOR_KeywordList *next; - next = malloc (sizeof (EXTRACTOR_KeywordList)); - next->next = *list; - next->keyword = strdup (keyword); - next->keywordType = type; - *list = next; -} - -/* convert other keywords to lower case */ -struct EXTRACTOR_Keywords * -libextractor_lower_extract (char *filename, - char *data, - size_t size, struct EXTRACTOR_Keywords *prev) -{ - struct EXTRACTOR_Keywords *pos; - char *lower; - unsigned int mem, needed, i; - - pos = prev; - lower = NULL; - mem = 0; - - while (pos != NULL) - { - if (pos->keywordType == EXTRACTOR_FILE_SIZE) - { - pos = pos->next; - continue; - } - - needed = strlen (pos->keyword) + 1; - if (needed > mem) - { - lower = (lower == NULL) ? 
realloc (lower, needed) : malloc (needed); - mem = needed; - } - - for (i = 0; i < needed; i++) - { - lower[i] = tolower (pos->keyword[i]); - } - - if (strcmp (pos->keyword, lower)) - { - addKeyword (&prev, lower, EXTRACTOR_LOWERCASE); - } - pos = pos->next; - } - if (lower != NULL) - free (lower); - - return prev; -} diff --git a/src/plugins/mime_extractor.c b/src/plugins/mime_extractor.c @@ -0,0 +1,320 @@ +/* + This file is part of libextractor. + (C) 2002, 2003, 2006 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +#include "platform.h" +#include "extractor.h" + + +/** + * Detect a file-type. + * @param data the contents of the file + * @param len the length of the file + * @param arg closure... + * @return 0 if the file does not match, 1 if it does + **/ +typedef int (*Detector) (const char *data, size_t len, void *arg); + +/** + * Detect a file-type. + * @param data the contents of the file + * @param len the length of the file + * @return always 1 + **/ +static int +defaultDetector (const char *data, size_t len, void *arg) +{ + return 1; +} + +/** + * Detect a file-type. 
+ * @param data the contents of the file + * @param len the length of the file + * @return always 0 + **/ +static int +disableDetector (const char *data, size_t len, void *arg) +{ + return 0; +} + +typedef struct ExtraPattern +{ + int pos; + int len; + char *pattern; +} ExtraPattern; + +/** + * Define special matching rules for complicated formats... + **/ +static ExtraPattern xpatterns[] = { +#define AVI_XPATTERN 0 + {8, 4, "AVI "}, + {0, 0, NULL}, +#define WAVE_XPATTERN 2 + {8, 4, "WAVE"}, + {0, 0, NULL}, +#define ACE_XPATTERN 4 + {4, 10, "\x00\x00\x90**ACE**"}, + {0, 0, NULL}, +#define TAR_XPATTERN 6 + {257, 6, "ustar\x00"}, + {0, 0, NULL}, +#define GTAR_XPATTERN 8 + {257, 8, "ustar\040\040\0"}, + {0, 0, NULL}, +#define RMID_XPATTERN 10 + {8, 4, "RMID"}, + {0, 0, NULL}, +#define ACON_XPATTERN 12 + {8, 4, "ACON"}, + {0, 0, NULL}, +#define CR2_PATTERN 14 + {8, 3, "CR\x02"}, + {0, 0, NULL}, +}; + +/** + * Detect AVI. A pattern matches if all XPatterns until the next {0, + * 0, NULL} slot match. OR-ing patterns can be achieved using multiple + * entries in the main table, so this "AND" (all match) semantics are + * the only reasonable answer. 
+ **/ +static int +xPatternMatcher (const char *data, size_t len, void *cls) +{ + ExtraPattern *arg = cls; + + while (arg->pattern != NULL) + { + if (arg->pos + arg->len > len) + return 0; + if (0 != memcmp (&data[arg->pos], arg->pattern, arg->len)) + return 0; + arg++; + } + return 1; +} + +/** + * Detect SVG + */ +static int +svgMatcher (const char *data, size_t len, void *cls) +{ + enum + { XMLSTART, XMLCLOSE, SVGSTART } state; + size_t i; + + i = 0; + state = XMLSTART; + + while (i < len) + { + if (!isprint (data[i])) + return 0; + switch (state) + { + case XMLSTART: + if (i + 6 >= len) + return 0; + else if (memcmp (data + i, "<?xml", 5) == 0 + && isspace (*(data + i + 5))) + state = XMLCLOSE; + break; + case XMLCLOSE: + if (i + 2 >= len) + return 0; + else if (memcmp (data + i, "?>", 2) == 0) + state = SVGSTART; + break; + case SVGSTART: + if (i + 5 >= len) + return 0; + else if (memcmp (data + i, "<svg", 4) == 0 + && isspace (*(data + i + 4))) + return 1; + break; + default: + /* do nothing */ + break; + } + i++; + } + return 0; +} + +/** + * Use this detector, if the simple header-prefix matching is + * sufficient. + **/ +#define DEFAULT &defaultDetector, NULL + +/** + * Use this detector, to disable the mime-type (effectively comment it + * out). 
+ **/ +#define DISABLED &disableDetector, NULL + +/** + * Select an entry in xpatterns for matching + **/ +#define XPATTERN(a) &xPatternMatcher, &xpatterns[(a)] + +typedef struct Pattern +{ + char *pattern; + int size; + char *mimetype; + Detector detector; + void *arg; +} Pattern; + +static Pattern patterns[] = { + {"\xFF\xD8", 2, "image/jpeg", DEFAULT}, + {"\211PNG\r\n\032\n", 8, "image/png", DEFAULT}, + {"/* XPM */", 9, "image/x-xpm", DEFAULT}, + {"GIF8", 4, "image/gif", DEFAULT}, + {"P1", 2, "image/x-portable-bitmap", DEFAULT}, + {"P2", 2, "image/x-portable-graymap", DEFAULT}, + {"P3", 2, "image/x-portable-pixmap", DEFAULT}, + {"P4", 2, "image/x-portable-bitmap", DEFAULT}, + {"P5", 2, "image/x-portable-graymap", DEFAULT}, + {"P6", 2, "image/x-portable-pixmap", DEFAULT}, + {"P7", 2, "image/x-portable-anymap", DEFAULT}, + {"BM", 2, "image/x-bmp", DEFAULT}, + {"fLaC", 4, "audio/flac", DEFAULT}, + {"\x89PNG", 4, "image/x-png", DEFAULT}, + {"id=ImageMagick", 14, "application/x-imagemagick-image", DEFAULT}, + {"hsi1", 4, "image/x-jpeg-proprietary", DEFAULT}, + {"FLV", 3, "video/x-flv", DEFAULT}, + {"FWS", 3, "application/x-shockwave-flash", DEFAULT}, + {"CWS", 3, "application/x-shockwave-flash", DEFAULT}, + {"\x2E\x52\x4d\x46", 4, "video/real", DEFAULT}, + {"\x2e\x72\x61\xfd", 4, "audio/real", DEFAULT}, + {"\x00\x05\x16\x00", 4, "application/applefile", DEFAULT}, + {"\x00\x05\x16\x07", 4, "application/applefile", DEFAULT}, + {"\177ELF", 4, "application/x-executable", DEFAULT}, + /* FIXME: correct MIME-type for an ELF!? */ + {"\xca\xfe\xba\xbe", 4, "application/java", DEFAULT}, + /* FIXME: correct MIME for a class-file? 
*/ + {"gimp xcf", 8, "image/xcf", DEFAULT}, + {"II\x2a\x00\x10", 5, "image/x-canon-cr2", XPATTERN (CR2_PATTERN)}, + {"IIN1", 4, "image/tiff", DEFAULT}, + {"MM\x00\x2a", 4, "image/tiff", DEFAULT}, /* big-endian */ + {"II\x2a\x00", 4, "image/tiff", DEFAULT}, /* little-endian */ + {"%PDF", 4, "application/pdf", DEFAULT}, + {"%!PS-Adobe-", 11, "application/postscript", DEFAULT}, + {"\004%!PS-Adobe-", 12, "application/postscript", DEFAULT}, + {"RIFF", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)}, + {"RIFF", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)}, + {"RIFX", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)}, + {"RIFX", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)}, + {"RIFF", 4, "audio/midi", XPATTERN (RMID_XPATTERN)}, + {"RIFX", 4, "audio/midi", XPATTERN (RMID_XPATTERN)}, + {"RIFF", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)}, + {"RIFX", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)}, + {"\211GND\r\n\032\n", 8, "application/gnunet-directory", DEFAULT}, + {"{\\rtf", 5, "application/rtf", DEFAULT}, + {"\xf7\x02", 2, "application/x-dvi", DEFAULT}, + {"\x1F\x8B\x08\x00", 4, "application/x-gzip", DEFAULT}, + {"BZh91AY&SY", 10, "application/bz2", DEFAULT}, + {"\xED\xAB\xEE\xDB", 4, "application/x-rpm", DEFAULT}, /* binary */ + {"!<arch>\ndebian", 14, "application/x-dpkg", DEFAULT}, /* .deb */ + {"PK\x03\x04", 4, "application/x-zip", DEFAULT}, + {"\xea\x60", 2, "application/x-arj", DEFAULT}, + {"\037\235", 2, "application/x-compress", DEFAULT}, + {"Rar!", 4, "application/x-rar", DEFAULT}, + {"", 0, "application/x-ace", XPATTERN (ACE_XPATTERN)}, + {"", 0, "application/x-tar", XPATTERN (TAR_XPATTERN)}, + {"", 0, "application/x-gtar", XPATTERN (GTAR_XPATTERN)}, + {"-lh0-", 5, "application/x-lha", DEFAULT}, + {"-lh1-", 5, "application/x-lha", DEFAULT}, + {"-lh2-", 5, "application/x-lha", DEFAULT}, + {"-lh3-", 5, "application/x-lha", DEFAULT}, + {"-lh4-", 5, "application/x-lha", DEFAULT}, + {"-lh5-", 5, "application/x-lha", DEFAULT}, + {"-lh6-", 5, 
"application/x-lha", DEFAULT}, + {"-lh7-", 5, "application/x-lha", DEFAULT}, + {"-lhd-", 5, "application/x-lha", DEFAULT}, + {"-lh\40-", 5, "application/x-lha", DEFAULT}, + {"-lz4-", 5, "application/x-lha", DEFAULT}, + {"-lz5-", 5, "application/x-lha", DEFAULT}, + {"-lzs-", 5, "application/x-lha", DEFAULT}, + {"\xFD\x76", 2, "application/x-lzh", DEFAULT}, + {"\x00\x00\x01\xb3", 4, "video/mpeg", DEFAULT}, + {"\x00\x00\x01\xba", 4, "video/mpeg", DEFAULT}, + {"moov", 4, "video/quicktime", DEFAULT}, + {"mdat", 4, "video/quicktime", DEFAULT}, + {"\x8aMNG", 4, "video/x-mng", DEFAULT}, + {"\x30\x26\xb2\x75\x8e\x66", 6, "video/asf", DEFAULT}, /* same as .wmv ? */ + {"FWS", 3, "application/x-shockwave-flash", DEFAULT}, + {"MThd", 4, "audio/midi", DEFAULT}, + {"ID3", 3, "audio/mpeg", DEFAULT}, + {"\xFF\xFA", 2, "audio/mpeg", DEFAULT}, + {"\xFF\xFB", 2, "audio/mpeg", DEFAULT}, + {"\xFF\xFC", 2, "audio/mpeg", DEFAULT}, + {"\xFF\xFD", 2, "audio/mpeg", DEFAULT}, + {"\xFF\xFE", 2, "audio/mpeg", DEFAULT}, + {"\xFF\xFF", 2, "audio/mpeg", DEFAULT}, + {"OggS", 4, "application/ogg", DEFAULT}, + {"#!/bin/sh", 9, "application/x-shellscript", DEFAULT}, + {"#!/bin/bash", 11, "application/x-shellscript", DEFAULT}, + {"#!/bin/csh", 10, "application/x-shellscript", DEFAULT}, + {"#!/bin/tcsh", 11, "application/x-shellscript", DEFAULT}, + {"#!/bin/perl", 11, "application/x-perl", DEFAULT}, + {"<?xml", 5, "image/svg+xml", svgMatcher, NULL}, + {NULL, 0, NULL, DISABLED}, +}; + + +int +EXTRACTOR_mime_extract (const char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + int i; + + i = 0; + while (patterns[i].pattern != NULL) + { + if (size < patterns[i].size) + { + i++; + continue; + } + if (0 == memcmp (patterns[i].pattern, data, patterns[i].size)) + { + if (patterns[i].detector (data, size, patterns[i].arg)) + return proc (proc_cls, + "mime", + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + patterns[i].mimetype, + 
strlen(patterns[i].mimetype)+1); + } + i++; + } + return 0; +} diff --git a/src/plugins/mimeextractor.c b/src/plugins/mimeextractor.c @@ -1,333 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2006 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -#include "platform.h" -#include "extractor.h" - - -static EXTRACTOR_KeywordList * -addKeyword (EXTRACTOR_KeywordType type, - char *keyword, EXTRACTOR_KeywordList * next) -{ - EXTRACTOR_KeywordList *result; - - if (keyword == NULL) - return next; - result = malloc (sizeof (EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = keyword; - result->keywordType = type; - return result; -} - -/** - * Detect a file-type. - * @param data the contents of the file - * @param len the length of the file - * @param arg closure... - * @return 0 if the file does not match, 1 if it does - **/ -typedef int (*Detector) (const char *data, size_t len, void *arg); - -/** - * Detect a file-type. - * @param data the contents of the file - * @param len the length of the file - * @return always 1 - **/ -static int -defaultDetector (const char *data, size_t len, void *arg) -{ - return 1; -} - -/** - * Detect a file-type. 
- * @param data the contents of the file - * @param len the length of the file - * @return always 0 - **/ -static int -disableDetector (const char *data, size_t len, void *arg) -{ - return 0; -} - -typedef struct ExtraPattern -{ - int pos; - int len; - char *pattern; -} ExtraPattern; - -/** - * Define special matching rules for complicated formats... - **/ -static ExtraPattern xpatterns[] = { -#define AVI_XPATTERN 0 - {8, 4, "AVI "}, - {0, 0, NULL}, -#define WAVE_XPATTERN 2 - {8, 4, "WAVE"}, - {0, 0, NULL}, -#define ACE_XPATTERN 4 - {4, 10, "\x00\x00\x90**ACE**"}, - {0, 0, NULL}, -#define TAR_XPATTERN 6 - {257, 6, "ustar\x00"}, - {0, 0, NULL}, -#define GTAR_XPATTERN 8 - {257, 8, "ustar\040\040\0"}, - {0, 0, NULL}, -#define RMID_XPATTERN 10 - {8, 4, "RMID"}, - {0, 0, NULL}, -#define ACON_XPATTERN 12 - {8, 4, "ACON"}, - {0, 0, NULL}, -#define CR2_PATTERN 14 - {8, 3, "CR\x02"}, - {0, 0, NULL}, -}; - -/** - * Detect AVI. A pattern matches if all XPatterns until the next {0, - * 0, NULL} slot match. OR-ing patterns can be achieved using multiple - * entries in the main table, so this "AND" (all match) semantics are - * the only reasonable answer. 
- **/ -static int -xPatternMatcher (const char *data, size_t len, void *cls) -{ - ExtraPattern *arg = cls; - - while (arg->pattern != NULL) - { - if (arg->pos + arg->len > len) - return 0; - if (0 != memcmp (&data[arg->pos], arg->pattern, arg->len)) - return 0; - arg++; - } - return 1; -} - -/** - * Detect SVG - */ -static int -svgMatcher (const char *data, size_t len, void *cls) -{ - enum - { XMLSTART, XMLCLOSE, SVGSTART } state; - size_t i; - - i = 0; - state = XMLSTART; - - while (i < len) - { - if (!isprint (data[i])) - return 0; - switch (state) - { - case XMLSTART: - if (i + 6 >= len) - return 0; - else if (memcmp (data + i, "<?xml", 5) == 0 - && isspace (*(data + i + 5))) - state = XMLCLOSE; - break; - case XMLCLOSE: - if (i + 2 >= len) - return 0; - else if (memcmp (data + i, "?>", 2) == 0) - state = SVGSTART; - break; - case SVGSTART: - if (i + 5 >= len) - return 0; - else if (memcmp (data + i, "<svg", 4) == 0 - && isspace (*(data + i + 4))) - return 1; - break; - default: - /* do nothing */ - break; - } - i++; - } - return 0; -} - -/** - * Use this detector, if the simple header-prefix matching is - * sufficient. - **/ -#define DEFAULT &defaultDetector, NULL - -/** - * Use this detector, to disable the mime-type (effectively comment it - * out). 
- **/ -#define DISABLED &disableDetector, NULL - -/** - * Select an entry in xpatterns for matching - **/ -#define XPATTERN(a) &xPatternMatcher, &xpatterns[(a)] - -typedef struct Pattern -{ - char *pattern; - int size; - char *mimetype; - Detector detector; - void *arg; -} Pattern; - -static Pattern patterns[] = { - {"\xFF\xD8", 2, "image/jpeg", DEFAULT}, - {"\211PNG\r\n\032\n", 8, "image/png", DEFAULT}, - {"/* XPM */", 9, "image/x-xpm", DEFAULT}, - {"GIF8", 4, "image/gif", DEFAULT}, - {"P1", 2, "image/x-portable-bitmap", DEFAULT}, - {"P2", 2, "image/x-portable-graymap", DEFAULT}, - {"P3", 2, "image/x-portable-pixmap", DEFAULT}, - {"P4", 2, "image/x-portable-bitmap", DEFAULT}, - {"P5", 2, "image/x-portable-graymap", DEFAULT}, - {"P6", 2, "image/x-portable-pixmap", DEFAULT}, - {"P7", 2, "image/x-portable-anymap", DEFAULT}, - {"BM", 2, "image/x-bmp", DEFAULT}, - {"fLaC", 4, "audio/flac", DEFAULT}, - {"\x89PNG", 4, "image/x-png", DEFAULT}, - {"id=ImageMagick", 14, "application/x-imagemagick-image", DEFAULT}, - {"hsi1", 4, "image/x-jpeg-proprietary", DEFAULT}, - {"FLV", 3, "video/x-flv", DEFAULT}, - {"FWS", 3, "application/x-shockwave-flash", DEFAULT}, - {"CWS", 3, "application/x-shockwave-flash", DEFAULT}, - {"\x2E\x52\x4d\x46", 4, "video/real", DEFAULT}, - {"\x2e\x72\x61\xfd", 4, "audio/real", DEFAULT}, - {"\x00\x05\x16\x00", 4, "application/applefile", DEFAULT}, - {"\x00\x05\x16\x07", 4, "application/applefile", DEFAULT}, - {"\177ELF", 4, "application/x-executable", DEFAULT}, - /* FIXME: correct MIME-type for an ELF!? */ - {"\xca\xfe\xba\xbe", 4, "application/java", DEFAULT}, - /* FIXME: correct MIME for a class-file? 
*/ - {"gimp xcf", 8, "image/xcf", DEFAULT}, - {"II\x2a\x00\x10", 5, "image/x-canon-cr2", XPATTERN (CR2_PATTERN)}, - {"IIN1", 4, "image/tiff", DEFAULT}, - {"MM\x00\x2a", 4, "image/tiff", DEFAULT}, /* big-endian */ - {"II\x2a\x00", 4, "image/tiff", DEFAULT}, /* little-endian */ - {"%PDF", 4, "application/pdf", DEFAULT}, - {"%!PS-Adobe-", 11, "application/postscript", DEFAULT}, - {"\004%!PS-Adobe-", 12, "application/postscript", DEFAULT}, - {"RIFF", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)}, - {"RIFF", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)}, - {"RIFX", 4, "video/x-msvideo", XPATTERN (AVI_XPATTERN)}, - {"RIFX", 4, "audio/x-wav", XPATTERN (WAVE_XPATTERN)}, - {"RIFF", 4, "audio/midi", XPATTERN (RMID_XPATTERN)}, - {"RIFX", 4, "audio/midi", XPATTERN (RMID_XPATTERN)}, - {"RIFF", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)}, - {"RIFX", 4, "image/x-animated-cursor", XPATTERN (ACON_XPATTERN)}, - {"\211GND\r\n\032\n", 8, "application/gnunet-directory", DEFAULT}, - {"{\\rtf", 5, "application/rtf", DEFAULT}, - {"\xf7\x02", 2, "application/x-dvi", DEFAULT}, - {"\x1F\x8B\x08\x00", 4, "application/x-gzip", DEFAULT}, - {"BZh91AY&SY", 10, "application/bz2", DEFAULT}, - {"\xED\xAB\xEE\xDB", 4, "application/x-rpm", DEFAULT}, /* binary */ - {"!<arch>\ndebian", 14, "application/x-dpkg", DEFAULT}, /* .deb */ - {"PK\x03\x04", 4, "application/x-zip", DEFAULT}, - {"\xea\x60", 2, "application/x-arj", DEFAULT}, - {"\037\235", 2, "application/x-compress", DEFAULT}, - {"Rar!", 4, "application/x-rar", DEFAULT}, - {"", 0, "application/x-ace", XPATTERN (ACE_XPATTERN)}, - {"", 0, "application/x-tar", XPATTERN (TAR_XPATTERN)}, - {"", 0, "application/x-gtar", XPATTERN (GTAR_XPATTERN)}, - {"-lh0-", 5, "application/x-lha", DEFAULT}, - {"-lh1-", 5, "application/x-lha", DEFAULT}, - {"-lh2-", 5, "application/x-lha", DEFAULT}, - {"-lh3-", 5, "application/x-lha", DEFAULT}, - {"-lh4-", 5, "application/x-lha", DEFAULT}, - {"-lh5-", 5, "application/x-lha", DEFAULT}, - {"-lh6-", 5, 
"application/x-lha", DEFAULT}, - {"-lh7-", 5, "application/x-lha", DEFAULT}, - {"-lhd-", 5, "application/x-lha", DEFAULT}, - {"-lh\40-", 5, "application/x-lha", DEFAULT}, - {"-lz4-", 5, "application/x-lha", DEFAULT}, - {"-lz5-", 5, "application/x-lha", DEFAULT}, - {"-lzs-", 5, "application/x-lha", DEFAULT}, - {"\xFD\x76", 2, "application/x-lzh", DEFAULT}, - {"\x00\x00\x01\xb3", 4, "video/mpeg", DEFAULT}, - {"\x00\x00\x01\xba", 4, "video/mpeg", DEFAULT}, - {"moov", 4, "video/quicktime", DEFAULT}, - {"mdat", 4, "video/quicktime", DEFAULT}, - {"\x8aMNG", 4, "video/x-mng", DEFAULT}, - {"\x30\x26\xb2\x75\x8e\x66", 6, "video/asf", DEFAULT}, /* same as .wmv ? */ - {"FWS", 3, "application/x-shockwave-flash", DEFAULT}, - {"MThd", 4, "audio/midi", DEFAULT}, - {"ID3", 3, "audio/mpeg", DEFAULT}, - {"\xFF\xFA", 2, "audio/mpeg", DEFAULT}, - {"\xFF\xFB", 2, "audio/mpeg", DEFAULT}, - {"\xFF\xFC", 2, "audio/mpeg", DEFAULT}, - {"\xFF\xFD", 2, "audio/mpeg", DEFAULT}, - {"\xFF\xFE", 2, "audio/mpeg", DEFAULT}, - {"\xFF\xFF", 2, "audio/mpeg", DEFAULT}, - {"OggS", 4, "application/ogg", DEFAULT}, - {"#!/bin/sh", 9, "application/x-shellscript", DEFAULT}, - {"#!/bin/bash", 11, "application/x-shellscript", DEFAULT}, - {"#!/bin/csh", 10, "application/x-shellscript", DEFAULT}, - {"#!/bin/tcsh", 11, "application/x-shellscript", DEFAULT}, - {"#!/bin/perl", 11, "application/x-perl", DEFAULT}, - {"<?xml", 5, "image/svg+xml", svgMatcher, NULL}, - {NULL, 0, NULL, DISABLED}, -}; - -struct EXTRACTOR_Keywords * -libextractor_mime_extract (const char *filename, - const char *data, - size_t size, struct EXTRACTOR_Keywords *prev) -{ - int i; - const char *mime; - - mime = EXTRACTOR_extractLast (EXTRACTOR_MIMETYPE, prev); - if (mime != NULL) - return prev; /* if the mime-type has already - been determined, there is no need - to probe again (and potentially be wrong...) 
*/ - i = 0; - while (patterns[i].pattern != NULL) - { - if (size < patterns[i].size) - { - i++; - continue; - } - if (0 == memcmp (patterns[i].pattern, data, patterns[i].size)) - { - if (patterns[i].detector (data, size, patterns[i].arg)) - return addKeyword (EXTRACTOR_MIMETYPE, - strdup (patterns[i].mimetype), prev); - } - i++; - } - return prev; -} diff --git a/src/plugins/splitextractor.c b/src/plugins/splitextractor.c @@ -1,157 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2005, 2006 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -#include "platform.h" -#include "extractor.h" - -/** - * Default split characters. - */ -static const char *TOKENIZERS = "._ ,%@-\n_[](){}"; - -/** - * Do not use keywords shorter than this minimum - * length. 
- */ -static int MINIMUM_KEYWORD_LENGTH = 3; - -static void -addKeyword (struct EXTRACTOR_Keywords **list, const char *keyword) -{ - EXTRACTOR_KeywordList *next; - next = malloc (sizeof (EXTRACTOR_KeywordList)); - next->next = *list; - next->keyword = strdup (keyword); - next->keywordType = EXTRACTOR_SPLIT; - *list = next; -} - -static int -token (char letter, const char *options) -{ - size_t i; - - i = 0; - while (options[i] != '\0') - { - if (letter == options[i]) - return 1; - i++; - } - return 0; -} - -static void -splitKeywords (const char *keyword, - struct EXTRACTOR_Keywords **list, const char *options) -{ - char *dp; - size_t pos; - size_t last; - size_t len; - - dp = strdup (keyword); - len = strlen (dp); - pos = 0; - last = 0; - while (pos < len) - { - while ((0 == token (dp[pos], options)) && (pos < len)) - pos++; - dp[pos++] = '\0'; - if ((pos - last > MINIMUM_KEYWORD_LENGTH) && - (0 != strcmp (keyword, &dp[last]))) - addKeyword (list, &dp[last]); - while ((pos < len) && (1 == token (dp[pos], options))) - pos++; - last = pos; - } - free (dp); -} - -/* split other keywords into multiple keywords */ -struct EXTRACTOR_Keywords * -libextractor_split_extract (const char *filename, - const char *data, - size_t size, - struct EXTRACTOR_Keywords *prev, - const char *options) -{ - struct EXTRACTOR_Keywords *kpos; - char *opt; - char *pos; - - if (options == NULL) - { - opt = strdup (TOKENIZERS); - } - else - { - opt = strdup (options); - pos = opt; - while (pos[0] != '\0') - { - if (pos[0] == '\\') - { - switch (pos[1]) - { - case 'n': - pos[0] = '\n'; - memmove (&pos[1], &pos[2], strlen (&pos[2])); - continue; - case 'r': - pos[0] = '\r'; - memmove (&pos[1], &pos[2], strlen (&pos[2])); - continue; - case 'b': - pos[0] = '\b'; - memmove (&pos[1], &pos[2], strlen (&pos[2])); - continue; - case 't': - pos[0] = '\t'; - memmove (&pos[1], &pos[2], strlen (&pos[2])); - continue; - case '\\': - memmove (&pos[1], &pos[2], strlen (&pos[2])); - continue; - case '\0': /* 
invalid escape, ignore */ - pos[0] = '\0'; - break; - default: /* invalid escape, skip */ - memmove (&pos[0], &pos[2], strlen (&pos[2])); - continue; - } - } - pos++; - } - } - kpos = prev; - while (kpos != NULL) - { - if (kpos->keywordType != EXTRACTOR_FILE_SIZE) - splitKeywords (kpos->keyword, &prev, opt); - - kpos = kpos->next; - } - free (opt); - return prev; -} - -/* end of splitextractor.c */