aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2010-01-13 13:42:34 +0000
committerChristian Grothoff <christian@grothoff.org>2010-01-13 13:42:34 +0000
commit40b9c39604e1d2d9db792940500aa48f933d5588 (patch)
tree62875e81b544532bcfddcf7333ec330c31b0ff25
parent8372891411f4e97914386b4626f1dcdb5ec167e8 (diff)
downloadlibextractor-40b9c39604e1d2d9db792940500aa48f933d5588.tar.gz
libextractor-40b9c39604e1d2d9db792940500aa48f933d5588.zip
adding support for tail extraction, documenting, using it for ID3v1
-rw-r--r--doc/extractor.texi212
-rw-r--r--doc/version.texi2
-rw-r--r--src/main/extractor.c361
-rw-r--r--src/plugins/Makefile.am8
-rw-r--r--src/plugins/id3_extractor.c305
-rw-r--r--src/plugins/mp3_extractor.c275
6 files changed, 753 insertions, 410 deletions
diff --git a/doc/extractor.texi b/doc/extractor.texi
index d382aed..4bf6743 100644
--- a/doc/extractor.texi
+++ b/doc/extractor.texi
@@ -10,8 +10,10 @@
10@c %**end of header 10@c %**end of header
11@copying 11@copying
12This manual is for GNU libextractor 12This manual is for GNU libextractor
13(version @value{VERSION}, @value{UPDATED}), 13(version @value{VERSION}, @value{UPDATED}).
14which is GNU's library for meta data extraction. 14
15GNU libextractor is a GNU package.
16
15 17
16Copyright @copyright{} 2007, 2010 Christian Grothoff 18Copyright @copyright{} 2007, 2010 Christian Grothoff
17 19
@@ -73,7 +75,7 @@ Free Documentation License".
73@code{NULL} 75@code{NULL}
74@end macro 76@end macro
75 77
76@macro le{} 78@macro gnule{}
77@acronym{GNU libextractor} 79@acronym{GNU libextractor}
78@end macro 80@end macro
79 81
@@ -84,24 +86,22 @@ Free Documentation License".
84@insertcopying 86@insertcopying
85@end ifnottex 87@end ifnottex
86 88
87GNU libextractor is a GNU package.
88
89@menu 89@menu
90* Introduction:: What is @le{}. 90* Introduction:: What is @gnule{}.
91* Preparation:: What you should do before using the library. 91* Preparation:: What you should do before using the library.
92* Generalities:: General library functions and data types. 92* Generalities:: General library functions and data types.
93* Extracting meta data:: How to use @le{} to obtain meta data. 93* Extracting meta data:: How to use @gnule{} to obtain meta data.
94* Language bindings:: How to use @le{} from languages other than C. 94* Language bindings:: How to use @gnule{} from languages other than C.
95* Utility functions:: Utility functions of @le{}. 95* Utility functions:: Utility functions of @gnule{}.
96* Existing Plugins:: What plugins are available. 96* Existing Plugins:: What plugins are available.
97* Writing new Plugins:: How to write new plugins for @le{}. 97* Writing new Plugins:: How to write new plugins for @gnule{}.
98* Internal utility functions:: Utility functions of @le{} for writing plugins. 98* Internal utility functions:: Utility functions of @gnule{} for writing plugins.
99* Reporting bugs:: How to report bugs or request new features. 99* Reporting bugs:: How to report bugs or request new features.
100 100
101Appendices 101Appendices
102 102
103* Copying:: The GNU General Public License says how you 103* Copying:: The GNU General Public License says how you
104 can copy and share some parts of @le{}. 104 can copy and share some parts of @gnule{}.
105 105
106Indices 106Indices
107 107
@@ -120,7 +120,7 @@ Indices
120@chapter Introduction 120@chapter Introduction
121 121
122@cindex error handling 122@cindex error handling
123@le{} is GNU's library for extracting meta data from 123@gnule{} is GNU's library for extracting meta data from
124files. Meta data includes format information (such as mime type, 124files. Meta data includes format information (such as mime type,
125image dimensions, color depth, recording frequency), content 125image dimensions, color depth, recording frequency), content
126descriptions (such as document title or document description) and 126descriptions (such as document title or document description) and
@@ -128,55 +128,55 @@ copyright information (such as license, author and contributors).
128Meta data extraction is an inherently uncertain business --- a parse 128Meta data extraction is an inherently uncertain business --- a parse
129error can be a corrupt file, an incompatibility in the file format 129error can be a corrupt file, an incompatibility in the file format
130version, an entirely different file format or a bug in the parser. As 130version, an entirely different file format or a bug in the parser. As
131a result of this uncertainty, @le{} deliberately 131a result of this uncertainty, @gnule{} deliberately
132avoids to ever report any errors. Unexpected file contents simply 132avoids to ever report any errors. Unexpected file contents simply
133result in less or possibly no meta data being extracted. 133result in less or possibly no meta data being extracted.
134 134
135@cindex plugin 135@cindex plugin
136@le{} uses plugins to handle various file formats. 136@gnule{} uses plugins to handle various file formats.
137Technically a plugin can support multiple file formats; however, most 137Technically a plugin can support multiple file formats; however, most
138plugins only support one particular format. By default, 138plugins only support one particular format. By default,
139@le{} will use all plugins that are available and found 139@gnule{} will use all plugins that are available and found
140in the plugin installation directory. Applications can 140in the plugin installation directory. Applications can
141request the use of only specific plugins or the exclusion of 141request the use of only specific plugins or the exclusion of
142certain plugins. 142certain plugins.
143 143
144@le{} is distributed with the @command{extract} 144@gnule{} is distributed with the @command{extract}
145command@footnote{Some distributions ship @command{extract} in a 145command@footnote{Some distributions ship @command{extract} in a
146separate package.} which is a command-line tool for extracting 146separate package.} which is a command-line tool for extracting
147meta data. @command{extract} is given a list of filenames and 147meta data. @command{extract} is given a list of filenames and
148prints the resulting meta data to the console. The @command{extract} 148prints the resulting meta data to the console. The @command{extract}
149source code also serves as an advanced example for how to use 149source code also serves as an advanced example for how to use
150@le{}. 150@gnule{}.
151 151
152This manual focuses on providing documentation for writing software 152This manual focuses on providing documentation for writing software
153with @le{}. The only relevant parts for end-users 153with @gnule{}. The only relevant parts for end-users
154are the chapter on compiling and installing @le{} 154are the chapter on compiling and installing @gnule{}
155(@xref{Preparation}.). Also, the chapter on existing plugins may be of 155(@xref{Preparation}.). Also, the chapter on existing plugins may be of
156interest (@xref{Existing Plugins}.). Additional documentation for 156interest (@xref{Existing Plugins}.). Additional documentation for
157end-users can be found in the man page on @command{extract} (using 157end-users can be found in the man page on @command{extract} (using
158@verb{|man extract|}). 158@verb{|man extract|}).
159 159
160@cindex license 160@cindex license
161@le{} is licensed under the GNU General Public License. The 161@gnule{} is licensed under the GNU General Public License. The
162developers have frequently received requests to license GNU 162developers have frequently received requests to license GNU
163libextractor under alternative terms. However, @le{} 163libextractor under alternative terms. However, @gnule{}
164borrows plenty of GPL-licensed code from various other projects. 164borrows plenty of GPL-licensed code from various other projects.
165Hence we cannot change the license (even if we wanted to).@footnote{It 165Hence we cannot change the license (even if we wanted to).@footnote{It
166maybe possible to switch to GPLv3 in the future. For this, an audit 166maybe possible to switch to GPLv3 in the future. For this, an audit
167of the license status of our dependencies would be required. The new 167of the license status of our dependencies would be required. The new
168code that was developed specifically for @le{} has 168code that was developed specifically for @gnule{} has
169always been licensed under GPLv2 @emph{or any later version}.} 169always been licensed under GPLv2 @emph{or any later version}.}
170 170
171@node Preparation 171@node Preparation
172@chapter Preparation 172@chapter Preparation
173 173
174Compiling @le{} follows the standard GNU autotools 174Compiling @gnule{} follows the standard GNU autotools
175build process using @command{configure} and @command{make}. For 175build process using @command{configure} and @command{make}. For
176details, read the @file{INSTALL} file and query 176details, read the @file{INSTALL} file and query
177@verb{|./configure --help|} for additional options. 177@verb{|./configure --help|} for additional options.
178 178
179@le{} has various dependencies, some of which are optional. 179@gnule{} has various dependencies, some of which are optional.
180Instead of specifying the names of the software packages, we 180Instead of specifying the names of the software packages, we
181will give the list in terms of the names of the respective 181will give the list in terms of the names of the respective
182Debian (unstable) packages that should be installed. 182Debian (unstable) packages that should be installed.
@@ -246,29 +246,29 @@ Please notify us if we missed some dependencies (note that the list is
246supposed to only list direct dependencies, not transitive 246supposed to only list direct dependencies, not transitive
247dependencies). 247dependencies).
248 248
249Once you have compiled and installed @le{}, you should have a file 249Once you have compiled and installed @gnule{}, you should have a file
250@file{extractor.h} installed in your @file{include/} directory. This 250@file{extractor.h} installed in your @file{include/} directory. This
251file should be the starting point for your C and C++ development with 251file should be the starting point for your C and C++ development with
252@le{}. The build process also installs the @file{extract} binary and 252@gnule{}. The build process also installs the @file{extract} binary and
253man pages for @file{extract} and @le{}. The @file{extract} man page 253man pages for @file{extract} and @gnule{}. The @file{extract} man page
254documents the @file{extract} tool. The @le{} man page gives a brief 254documents the @file{extract} tool. The @gnule{} man page gives a brief
255summary of the C API for @le{}. 255summary of the C API for @gnule{}.
256 256
257@cindex packageing 257@cindex packageing
258@cindex directory structure 258@cindex directory structure
259@cindex plugin 259@cindex plugin
260@cindex environment variables 260@cindex environment variables
261@vindex LIBEXTRACTOR_PREFIX 261@vindex LIBEXTRACTOR_PREFIX
262When you install @le{}, various plugins will be 262When you install @gnule{}, various plugins will be
263installed in the @file{lib/libextractor/} directory. The main library 263installed in the @file{lib/libextractor/} directory. The main library
264will be installed as @file{lib/libextractor.so}. Note that 264will be installed as @file{lib/libextractor.so}. Note that
265@le{} will attempt to find the plugins relative to the 265@gnule{} will attempt to find the plugins relative to the
266path of the main library. Consequently, a package manager can move 266path of the main library. Consequently, a package manager can move
267the library and its plugins to a different location later --- as long 267the library and its plugins to a different location later --- as long
268as the relative path between the main library and the plugins is 268as the relative path between the main library and the plugins is
269preserved. As a method of last resort, the user can specify an 269preserved. As a method of last resort, the user can specify an
270environment variable @verb{|LIBEXTRACTOR_PREFIX|}. If 270environment variable @verb{|LIBEXTRACTOR_PREFIX|}. If
271@le{} cannot locate a plugin, it will look in 271@gnule{} cannot locate a plugin, it will look in
272@verb{|LIBEXTRACTOR_PREFIX/lib/libextractor/|}. 272@verb{|LIBEXTRACTOR_PREFIX/lib/libextractor/|}.
273 273
274@section Note to package maintainers 274@section Note to package maintainers
@@ -304,9 +304,9 @@ resources.
304@node Generalities 304@node Generalities
305@chapter Generalities 305@chapter Generalities
306 306
307Each public symbol exported by @le{} has the prefix 307Each public symbol exported by @gnule{} has the prefix
308@verb{|EXTRACTOR_|}. All-caps names are used for constants. For the 308@verb{|EXTRACTOR_|}. All-caps names are used for constants. For the
309impatient, the minimal C code for using @le{} (on the 309impatient, the minimal C code for using @gnule{} (on the
310executing binary itself) looks like this: 310executing binary itself) looks like this:
311 311
312@verbatim 312@verbatim
@@ -326,6 +326,13 @@ int main(int argc, char ** argv) {
326@node Extracting meta data 326@node Extracting meta data
327@chapter Extracting meta data 327@chapter Extracting meta data
328 328
329In order to extract meta data with @gnule{} you first need to
330load the respective plugins and then call the extraction API
331with the plugins and the data to process. This section
332documents how to load and unload plugins, the various types
333and formats in which meta data is returned to the application
334and finally the extraction API itself.
335
329@menu 336@menu
330* Plugin management:: How to load and unload plugins 337* Plugin management:: How to load and unload plugins
331* Meta types:: About meta types 338* Meta types:: About meta types
@@ -350,7 +357,7 @@ from multiple threads at the same time is not safe. Creating multiple
350plugin lists and using them concurrently is supported as long as 357plugin lists and using them concurrently is supported as long as
351the @code{EXTRACTOR_OPTION_IN_PROCESS} option is not used. 358the @code{EXTRACTOR_OPTION_IN_PROCESS} option is not used.
352 359
353Generally, @le{} is fully thread-safe and mostly reentrant. 360Generally, @gnule{} is fully thread-safe and mostly reentrant.
354All plugin code is required to be reentrant and state-less, 361All plugin code is required to be reentrant and state-less,
355but due to the extensive use of 3rd party libraries this cannot 362but due to the extensive use of 3rd party libraries this cannot
356be guaranteed. Hence plugins are executed (by default) out of 363be guaranteed. Hence plugins are executed (by default) out of
@@ -402,7 +409,7 @@ Loads and unloads plugins based on a configuration string, modifying the existin
402@deftypefun {struct EXTRACTOR_PluginList *} EXTRACTOR_plugin_add_defaults (enum EXTRACTOR_Options flags) 409@deftypefun {struct EXTRACTOR_PluginList *} EXTRACTOR_plugin_add_defaults (enum EXTRACTOR_Options flags)
403@findex EXTRACTOR_plugin_add_defaults 410@findex EXTRACTOR_plugin_add_defaults
404 411
405Loads all of the plugins in the plugin directory. This function is what most @le{} applications should use to setup the plugins. 412Loads all of the plugins in the plugin directory. This function is what most @gnule{} applications should use to setup the plugins.
406@end deftypefun 413@end deftypefun
407 414
408 415
@@ -414,14 +421,14 @@ Loads all of the plugins in the plugin directory. This function is what most @l
414@tindex enum EXTRACTOR_MetaType 421@tindex enum EXTRACTOR_MetaType
415@findex EXTRACTOR_metatype_get_max 422@findex EXTRACTOR_metatype_get_max
416 423
417@verb{|enum EXTRACTOR_MetaType|} is a C enum which defines a list of over 100 different types of meta data. The total number can differ between different @le{} releases; the maximum value for the current release can be obtained using the @verb{|EXTRACTOR_metatype_get_max|} function. All values in this enumeration are of the form @verb{|EXTRACTOR_METATYPE_XXX|}. 424@verb{|enum EXTRACTOR_MetaType|} is a C enum which defines a list of over 100 different types of meta data. The total number can differ between different @gnule{} releases; the maximum value for the current release can be obtained using the @verb{|EXTRACTOR_metatype_get_max|} function. All values in this enumeration are of the form @verb{|EXTRACTOR_METATYPE_XXX|}.
418 425
419@deftypefun {const char *} EXTRACTOR_metatype_to_string (enum EXTRACTOR_MetaType type) 426@deftypefun {const char *} EXTRACTOR_metatype_to_string (enum EXTRACTOR_MetaType type)
420@findex EXTRACTOR_metatype_to_string 427@findex EXTRACTOR_metatype_to_string
421@cindex gettext 428@cindex gettext
422@cindex internationalization 429@cindex internationalization
423 430
424The function @verb{|EXTRACTOR_metatype_to_string|} can be used to obtain a short English string @samp{s} describing the meta data type. The string can be translated into other languages using GNU gettext with the domain set to @le{} (@verb{|dgettext("libextractor", s)|}). 431The function @verb{|EXTRACTOR_metatype_to_string|} can be used to obtain a short English string @samp{s} describing the meta data type. The string can be translated into other languages using GNU gettext with the domain set to @gnule{} (@verb{|dgettext("libextractor", s)|}).
425@end deftypefun 432@end deftypefun
426 433
427@deftypefun {const char *} EXTRACTOR_metatype_to_description (enum EXTRACTOR_MetaType type) 434@deftypefun {const char *} EXTRACTOR_metatype_to_description (enum EXTRACTOR_MetaType type)
@@ -429,7 +436,7 @@ The function @verb{|EXTRACTOR_metatype_to_string|} can be used to obtain a short
429@cindex gettext 436@cindex gettext
430@cindex internationalization 437@cindex internationalization
431 438
432The function @verb{|EXTRACTOR_metatype_to_description|} can be used to obtain a longer English string @samp{s} describing the meta data type. The description may be empty if the short description returned by @code{EXTRACTOR_metatype_to_string} is already comprehensive. The string can be translated into other languages using GNU gettext with the domain set to @le{} (@verb{|dgettext("libextractor", s)|}). 439The function @verb{|EXTRACTOR_metatype_to_description|} can be used to obtain a longer English string @samp{s} describing the meta data type. The description may be empty if the short description returned by @code{EXTRACTOR_metatype_to_string} is already comprehensive. The string can be translated into other languages using GNU gettext with the domain set to @gnule{} (@verb{|dgettext("libextractor", s)|}).
433@end deftypefun 440@end deftypefun
434 441
435 442
@@ -490,11 +497,11 @@ Return 0 to continue extracting, 1 to abort.
490@cindex threads 497@cindex threads
491@cindex thread-safety 498@cindex thread-safety
492 499
493This is the main function for extracting keywords with @le{}. The first argument is a plugin list which specifies the set of plugins that should be used for extracting meta data. The @samp{filename} argument is optional and can be used to specify the name of a file to process. If @samp{filename} is NULL, then the @samp{data} argument must point to the in-memory data to extract meta data from. If @samp{filename} is non-NULL, @samp{data} can be NULL. If @samp{data} is non-null, then @samp{size} is the size of @samp{data} in bytes. Otherwise @samp{size} should be zero. For each meta data item found, GNU libextractor will call the @samp{proc} function, passing @samp{proc_cls} as the first argument to @samp{proc}. The other arguments to @samp{proc} depend on the specific meta data found. 500This is the main function for extracting keywords with @gnule{}. The first argument is a plugin list which specifies the set of plugins that should be used for extracting meta data. The @samp{filename} argument is optional and can be used to specify the name of a file to process. If @samp{filename} is NULL, then the @samp{data} argument must point to the in-memory data to extract meta data from. If @samp{filename} is non-NULL, @samp{data} can be NULL. If @samp{data} is non-null, then @samp{size} is the size of @samp{data} in bytes. Otherwise @samp{size} should be zero. For each meta data item found, GNU libextractor will call the @samp{proc} function, passing @samp{proc_cls} as the first argument to @samp{proc}. The other arguments to @samp{proc} depend on the specific meta data found.
494 501
495@cindex SIGBUS 502@cindex SIGBUS
496@cindex bus error 503@cindex bus error
497Meta data extraction should never really fail --- at worst, @le{} should not call @samp{proc} with any meta data. By design, @le{} should never crash or leak memory, even given corrupt files as input. Note however, that running @le{} on a corrupt file system (or incorrectly @verb{|mmap|}ed files) can result in the operating system sending a SIGBUS (bus error) to the process. While @le{} runs plugins out-of-process, it first maps the file into memory and then attempts to decompress it. During decompression it is possible to encounter a SIGBUS. @le{} will @emph{not} attempt to catch this signal and your application is likely to crash. Note again that this should only happen if the file @emph{system} is corrupt (not if individual files are corrupt). If this is not acceptable, you might want to consider running @le{} itself also out-of-process (as done, for example, by @url{http://grothoff.org/christian/doodle/,doodle}). 504Meta data extraction should never really fail --- at worst, @gnule{} should not call @samp{proc} with any meta data. By design, @gnule{} should never crash or leak memory, even given corrupt files as input. Note however, that running @gnule{} on a corrupt file system (or incorrectly @verb{|mmap|}ed files) can result in the operating system sending a SIGBUS (bus error) to the process. While @gnule{} runs plugins out-of-process, it first maps the file into memory and then attempts to decompress it. During decompression it is possible to encounter a SIGBUS. @gnule{} will @emph{not} attempt to catch this signal and your application is likely to crash. Note again that this should only happen if the file @emph{system} is corrupt (not if individual files are corrupt). If this is not acceptable, you might want to consider running @gnule{} itself also out-of-process (as done, for example, by @url{http://grothoff.org/christian/doodle/,doodle}).
498 505
499@end deftypefun 506@end deftypefun
500 507
@@ -509,7 +516,7 @@ Meta data extraction should never really fail --- at worst, @le{} should not cal
509@cindex PHP 516@cindex PHP
510@cindex Ruby 517@cindex Ruby
511 518
512@le{} works immediately with C and C++ code. Bindings for Java, Mono, Ruby, Perl, PHP and Python are available for download from the main @le{} website. Documentation for these bindings (if available) is part of the downloads for the respective binding. In all cases, a full installation of the C library is required before the binding can be installed. 519@gnule{} works immediately with C and C++ code. Bindings for Java, Mono, Ruby, Perl, PHP and Python are available for download from the main @gnule{} website. Documentation for these bindings (if available) is part of the downloads for the respective binding. In all cases, a full installation of the C library is required before the binding can be installed.
513 520
514@section Java 521@section Java
515 522
@@ -571,7 +578,7 @@ This binding is undocumented at this point.
571@cindex concurrency 578@cindex concurrency
572@cindex threads 579@cindex threads
573@cindex thread-safety 580@cindex thread-safety
574This chapter describes various utility functions for @le{} usage. All of the functions are reentrant. 581This chapter describes various utility functions for @gnule{} usage. All of the functions are reentrant.
575 582
576@menu 583@menu
577* Utility Constants:: 584* Utility Constants::
@@ -724,6 +731,115 @@ in-process (making it easier to debug) and without any of the other
724plugins. 731plugins.
725 732
726 733
734@section Example for a minimal extract method
735
736The following example shows how a plugin can return the mime type of
737a file.
738@example
739
740int
741EXTRACTOR_mymime_extract
742 (const char *data,
743 size_t data_size,
744 EXTRACTOR_MetaDataProcessor proc,
745 void *proc_cls,
746 const char * options)
747{
748 if (data_size < 4)
749 return 0;
750 if (0 != memcmp (data, "\177ELF", 4))
751 return 0;
752 if (0 != proc (proc_cls,
753 "mymime",
754 EXTRACTOR_METATYPE_MIMETYPE,
755 EXTRACTOR_METAFORMAT_UTF8,
756 "text/plain",
757 "application/x-executable",
758 1 + strlen("application/x-executable")))
759 return 1;
760 /* more calls to 'proc' here as needed */
761 return 0;
762}
763
764@end example
765
766@section Plugin execution options
767
768Plugins can request that their execution be done in a particular way.
769For this, the plugin defines a function with the following signature:
770
771@verbatim
772const char *
773EXTRACTOR_XXX_options (void);
774@end verbatim
775
776The function should return a string with the execution options.
777Individual options in this string should be separated by semicolons.
778Options that are included in the string but not known to the library
779are ignored. The following options are supported:
780
781@itemize @bullet
782@item
783@code{oop-only} ensures that the plugin is only run out-of-process.
784If out-of-process execution is not possible, the plugin will not be
785executed at all.
786
787@item
788@code{close-stderr} ensures that @code{stderr} is closed during the
789execution of the plugin. This is useful if the plugin uses libraries
790that write (error) messages to @code{stderr} and where this behavior cannot be
791turned off. This option only works if the plugin is executed out-of-process.
792
793@item
794@code{close-stdout} ensures that @code{stdout} is closed during the
795execution of the plugin. This is useful if the plugin uses libraries
796that write messages to @code{stdout} and where this behavior cannot be
797turned off. This option only works if the plugin is executed out-of-process.
798
799@item
800@code{force-kill} kills and restarts the plugin process for each
801file that is being analyzed. This is useful if the plugin uses
802libraries that keep global state between runs that is problematic or
803if the plugin uses libraries that are known to have serious resource
804leaks (such as memory leaks).
805
806@item
807@code{want-tail}
808In order to limit memory consumption, limit the amount of reading from
809disk and to keep the API simple, the @samp{data} argument passed to
810the @code{EXTRACTOR_XXX_extract} method is bounded (to 32 MB of normal
811data; for compressed data, a limit of 16 MB is imposed).@footnote{If
812@gnule{} was given a pointer to an existing, uncompressed block of
813data in memory, no bound is imposed for plugins executing in-process;
814for out-of-process plugins, a 32 MB limit is still imposed.} Since
815some file formats contain meta data at the end of the file, this option
816provides a way for plugins to access not the first 16--32 MB of a file
817but instead the last (roughly) 32 MB.
818
819Note that even for files larger than 32 MB, @samp{size} is not
820guaranteed to be 32 MB since @samp{data} will be aligned to the page
821size of the operating system. However, the last byte of @samp{data}
822is guaranteed to be the last byte of the file. Furthermore, if the
823file was large and compressed, unlike in the case of meta data
824extraction from the header, the end of the file will not be
825automatically decompressed by @gnule{}.
826
827@end itemize
828
829Note that using options other than @code{want-tail} is pretty much
830always a kludge and should thus be avoided.
831
832@section Example for an options method
833
834The following example shows how a plugin can set some of the options listed above:
835@example
836const char *
837EXTRACTOR_id3_options ()
838{
839 return "close-stderr;want-tail";
840}
841@end example
842
727@node Internal utility functions 843@node Internal utility functions
728@chapter Internal utility functions 844@chapter Internal utility functions
729 845
@@ -752,12 +868,12 @@ below.
752@cindex UTF-8 868@cindex UTF-8
753@cindex character set 869@cindex character set
754@findex EXTRACTOR_common_convert_to_utf8 870@findex EXTRACTOR_common_convert_to_utf8
755Various @le{} plugins make use of the internal 871Various @gnule{} plugins make use of the internal
756@file{convert.h} header which defines a function 872@file{convert.h} header which defines a function
757 873
758@verb{|EXTRACTOR_common_convert_to_utf8|} which can be used to easily convert text from 874@verb{|EXTRACTOR_common_convert_to_utf8|} which can be used to easily convert text from
759any character set to UTF-8. This conversion is important since the 875any character set to UTF-8. This conversion is important since the
760linked list of keywords that is returned by @le{} is 876linked list of keywords that is returned by @gnule{} is
761expected to contain only UTF-8 strings. Naturally, proper conversion 877expected to contain only UTF-8 strings. Naturally, proper conversion
762may not always be possible since some file formats fail to specify the 878may not always be possible since some file formats fail to specify the
763character set. In that case, it is often better to not convert at 879character set. In that case, it is often better to not convert at
@@ -781,9 +897,9 @@ caller, so storing the string in the keyword list is acceptable.
781@chapter Reporting bugs 897@chapter Reporting bugs
782 898
783@cindex bug 899@cindex bug
784@le{} uses the @url{http://gnunet.org/bugs/,Mantis bugtracking 900@gnule{} uses the @url{http://gnunet.org/bugs/,Mantis bugtracking
785system}. If possible, please report bugs there. You can also e-mail 901system}. If possible, please report bugs there. You can also e-mail
786the @le{} mailinglist at @url{libextractor@@gnu.org}. 902the @gnule{} mailinglist at @url{libextractor@@gnu.org}.
787 903
788 904
789 905
diff --git a/doc/version.texi b/doc/version.texi
index 0715790..6358b99 100644
--- a/doc/version.texi
+++ b/doc/version.texi
@@ -1,4 +1,4 @@
1@set UPDATED 1 January 2010 1@set UPDATED 13 January 2010
2@set UPDATED-MONTH January 2010 2@set UPDATED-MONTH January 2010
3@set EDITION 0.6.0 3@set EDITION 0.6.0
4@set VERSION 0.6.0 4@set VERSION 0.6.0
diff --git a/src/main/extractor.c b/src/main/extractor.c
index b29676b..09d402b 100644
--- a/src/main/extractor.c
+++ b/src/main/extractor.c
@@ -630,6 +630,7 @@ EXTRACTOR_plugin_add_defaults(enum EXTRACTOR_Options flags)
630 */ 630 */
631static void * 631static void *
632get_symbol_with_prefix(void *lib_handle, 632get_symbol_with_prefix(void *lib_handle,
633 const char *template,
633 const char *prefix, 634 const char *prefix,
634 const char **options) 635 const char **options)
635{ 636{
@@ -649,9 +650,9 @@ get_symbol_with_prefix(void *lib_handle,
649 dot = strstr (sym, "."); 650 dot = strstr (sym, ".");
650 if (dot != NULL) 651 if (dot != NULL)
651 *dot = '\0'; 652 *dot = '\0';
652 name = malloc(strlen(sym) + 32); 653 name = malloc(strlen(sym) + strlen(template) + 1);
653 sprintf(name, 654 sprintf(name,
654 "_EXTRACTOR_%s_extract", 655 template,
655 sym); 656 sym);
656 /* try without '_' first */ 657 /* try without '_' first */
657 symbol = lt_dlsym(lib_handle, name + 1); 658 symbol = lt_dlsym(lib_handle, name + 1);
@@ -678,7 +679,8 @@ get_symbol_with_prefix(void *lib_handle,
678#endif 679#endif
679 } 680 }
680 681
681 if (symbol != NULL) 682 if ( (symbol != NULL) &&
683 (NULL != options) )
682 { 684 {
683 /* get special options */ 685 /* get special options */
684 sprintf(name, 686 sprintf(name,
@@ -741,6 +743,7 @@ plugin_load (struct EXTRACTOR_PluginList *plugin)
741 return -1; 743 return -1;
742 } 744 }
743 plugin->extractMethod = get_symbol_with_prefix (plugin->libraryHandle, 745 plugin->extractMethod = get_symbol_with_prefix (plugin->libraryHandle,
746 "_EXTRACTOR_%s_extract",
744 plugin->libname, 747 plugin->libname,
745 &plugin->specials); 748 &plugin->specials);
746 if (plugin->extractMethod == NULL) 749 if (plugin->extractMethod == NULL)
@@ -1094,10 +1097,9 @@ transmit_reply (void *cls,
1094 1097
1095 1098
1096/** 1099/**
1097 * 'main' function of the child process. 1100 * 'main' function of the child process. Reads shm-filenames from
1098 * Reads shm-filenames from 'in' (line-by-line) and 1101 * 'in' (line-by-line) and writes meta data blocks to 'out'. The meta
1099 * writes meta data blocks to 'out'. The meta data 1102 * data stream is terminated by an empty entry.
1100 * stream is terminated by an empty entry.
1101 * 1103 *
1102 * @param plugin extractor plugin to use 1104 * @param plugin extractor plugin to use
1103 * @param in stream to read from 1105 * @param in stream to read from
@@ -1108,12 +1110,15 @@ process_requests (struct EXTRACTOR_PluginList *plugin,
1108 int in, 1110 int in,
1109 int out) 1111 int out)
1110{ 1112{
1111 char fn[256]; 1113 char hfn[256];
1114 char tfn[256];
1115 char *fn;
1112 FILE *fin; 1116 FILE *fin;
1113 void *ptr; 1117 void *ptr;
1114 int shmid; 1118 int shmid;
1115 struct IpcHeader hdr; 1119 struct IpcHeader hdr;
1116 size_t size; 1120 size_t size;
1121 int want_tail;
1117#ifdef WINDOWS 1122#ifdef WINDOWS
1118 HANDLE map; 1123 HANDLE map;
1119#endif 1124#endif
@@ -1129,6 +1134,13 @@ process_requests (struct EXTRACTOR_PluginList *plugin,
1129#endif 1134#endif
1130 return; 1135 return;
1131 } 1136 }
1137 want_tail = 0;
1138 if ( (plugin->specials != NULL) &&
1139 (NULL != strstr (plugin->specials,
1140 "want-tail")) )
1141 {
1142 want_tail = 1;
1143 }
1132 if ( (plugin->specials != NULL) && 1144 if ( (plugin->specials != NULL) &&
1133 (NULL != strstr (plugin->specials, 1145 (NULL != strstr (plugin->specials,
1134 "close-stderr")) ) 1146 "close-stderr")) )
@@ -1144,12 +1156,27 @@ process_requests (struct EXTRACTOR_PluginList *plugin,
1144 1156
1145 memset (&hdr, 0, sizeof (hdr)); 1157 memset (&hdr, 0, sizeof (hdr));
1146 fin = fdopen (in, "r"); 1158 fin = fdopen (in, "r");
1147 while (NULL != fgets (fn, sizeof(fn), fin)) 1159 while (NULL != fgets (hfn, sizeof(hfn), fin))
1148 { 1160 {
1149 if (strlen (fn) == 0) 1161 if (strlen (hfn) <= 1)
1150 break; 1162 break;
1151 ptr = NULL; 1163 ptr = NULL;
1152 fn[strlen(fn)-1] = '\0'; /* kill newline */ 1164 hfn[strlen(hfn)-1] = '\0'; /* kill newline */
1165 if (NULL == fgets (tfn, sizeof(tfn), fin))
1166 break;
1167 if ('!' != tfn[0])
1168 break;
1169 tfn[strlen(tfn)-1] = '\0'; /* kill newline */
1170 if ( (want_tail) &&
1171 (strlen (tfn) > 1) )
1172 {
1173 fn = &tfn[1];
1174 }
1175 else
1176 {
1177 fn = hfn;
1178 }
1179
1153#ifndef WINDOWS 1180#ifndef WINDOWS
1154 if ( (-1 != (shmid = shm_open (fn, O_RDONLY, 0))) && 1181 if ( (-1 != (shmid = shm_open (fn, O_RDONLY, 0))) &&
1155 (((off_t)-1) != (size = lseek (shmid, 0, SEEK_END))) && 1182 (((off_t)-1) != (size = lseek (shmid, 0, SEEK_END))) &&
@@ -1161,12 +1188,13 @@ process_requests (struct EXTRACTOR_PluginList *plugin,
1161 if (ptr != NULL) 1188 if (ptr != NULL)
1162#endif 1189#endif
1163 { 1190 {
1164 if (0 != plugin->extractMethod (ptr, 1191 if ( (plugin->extractMethod != NULL) &&
1165 size, 1192 (0 != plugin->extractMethod (ptr,
1166 &transmit_reply, 1193 size,
1167 &out, 1194 &transmit_reply,
1168 plugin->plugin_options)) 1195 &out,
1169 break; 1196 plugin->plugin_options)) )
1197 break;
1170 if (0 != write_all (out, &hdr, sizeof(hdr))) 1198 if (0 != write_all (out, &hdr, sizeof(hdr)))
1171 break; 1199 break;
1172 } 1200 }
@@ -1195,8 +1223,10 @@ process_requests (struct EXTRACTOR_PluginList *plugin,
1195 close (out); 1223 close (out);
1196} 1224}
1197 1225
1226
1198#ifdef WINDOWS 1227#ifdef WINDOWS
1199static void write_plugin_data (HANDLE h, const struct EXTRACTOR_PluginList *plugin) 1228static void
1229write_plugin_data (HANDLE h, const struct EXTRACTOR_PluginList *plugin)
1200{ 1230{
1201 size_t i; 1231 size_t i;
1202 DWORD len; 1232 DWORD len;
@@ -1217,7 +1247,9 @@ static void write_plugin_data (HANDLE h, const struct EXTRACTOR_PluginList *plug
1217 WriteFile (h, plugin->plugin_options, i, &len, NULL); 1247 WriteFile (h, plugin->plugin_options, i, &len, NULL);
1218} 1248}
1219 1249
1220static struct EXTRACTOR_PluginList *read_plugin_data (FILE *f) 1250
1251static struct EXTRACTOR_PluginList *
1252read_plugin_data (FILE *f)
1221{ 1253{
1222 struct EXTRACTOR_PluginList *ret; 1254 struct EXTRACTOR_PluginList *ret;
1223 size_t i; 1255 size_t i;
@@ -1239,7 +1271,9 @@ static struct EXTRACTOR_PluginList *read_plugin_data (FILE *f)
1239 return ret; 1271 return ret;
1240} 1272}
1241 1273
1242void CALLBACK RundllEntryPoint(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow) 1274
1275void CALLBACK
1276RundllEntryPoint(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
1243{ 1277{
1244 int in, out; 1278 int in, out;
1245 1279
@@ -1253,6 +1287,7 @@ void CALLBACK RundllEntryPoint(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, in
1253} 1287}
1254#endif 1288#endif
1255 1289
1290
1256/** 1291/**
1257 * Start the process for the given plugin. 1292 * Start the process for the given plugin.
1258 */ 1293 */
@@ -1331,6 +1366,7 @@ start_process (struct EXTRACTOR_PluginList *plugin)
1331 * 1366 *
1332 * @param plugin which plugin to call 1367 * @param plugin which plugin to call
1333 * @param shmfn file name of the shared memory segment 1368 * @param shmfn file name of the shared memory segment
1369 * @param tshmfn file name of the shared memory segment for the end of the data
1334 * @param proc function to call on the meta data 1370 * @param proc function to call on the meta data
1335 * @param proc_cls cls for proc 1371 * @param proc_cls cls for proc
1336 * @return 0 if proc did not return non-zero 1372 * @return 0 if proc did not return non-zero
@@ -1338,6 +1374,7 @@ start_process (struct EXTRACTOR_PluginList *plugin)
1338static int 1374static int
1339extract_oop (struct EXTRACTOR_PluginList *plugin, 1375extract_oop (struct EXTRACTOR_PluginList *plugin,
1340 const char *shmfn, 1376 const char *shmfn,
1377 const char *tshmfn,
1341 EXTRACTOR_MetaDataProcessor proc, 1378 EXTRACTOR_MetaDataProcessor proc,
1342 void *proc_cls) 1379 void *proc_cls)
1343{ 1380{
@@ -1347,7 +1384,19 @@ extract_oop (struct EXTRACTOR_PluginList *plugin,
1347 1384
1348 if (plugin->cpid == -1) 1385 if (plugin->cpid == -1)
1349 return 0; 1386 return 0;
1350 if (0 >= fprintf (plugin->cpipe_in, "%s\n", shmfn)) 1387 if (0 >= fprintf (plugin->cpipe_in,
1388 "%s\n",
1389 shmfn))
1390 {
1391 stop_process (plugin);
1392 plugin->cpid = -1;
1393 if (plugin->flags != EXTRACTOR_OPTION_DEFAULT_POLICY)
1394 plugin->flags = EXTRACTOR_OPTION_DISABLED;
1395 return 0;
1396 }
1397 if (0 >= fprintf (plugin->cpipe_in,
1398 "!%s\n",
1399 (tshmfn != NULL) ? tshmfn : ""))
1351 { 1400 {
1352 stop_process (plugin); 1401 stop_process (plugin);
1353 plugin->cpid = -1; 1402 plugin->cpid = -1;
@@ -1420,33 +1469,108 @@ extract_oop (struct EXTRACTOR_PluginList *plugin,
1420 1469
1421 1470
1422/** 1471/**
1423 * Extract keywords from a file using the given set of plugins. 1472 * Setup a shared memory segment.
1473 *
1474 * @param ptr set to the location of the shm segment
1475 * @param shmid where to store the shm ID
1476 * @param fn name of the shared segment
1477 * @param fn_size size available in fn
1478 * @param size number of bytes to allocate for the segment
1479 * @return 0 on success
1480 */
static int
make_shm (int is_tail,
          void **ptr,
#ifndef WINDOWS
          int *shmid,
#else
          HANDLE *mappedFile,
          HANDLE *map,
#endif
          char *fn,
          size_t fn_size,
          size_t size)
{
  /* is_tail only influences the generated name: a "t" is inserted so
     that the segment for the end of the file gets a name distinct from
     the one for the beginning.  Returns 0 on success, 1 on failure; on
     failure *ptr is NULL. */
  /* NOTE(review): on WINDOWS the "%TEMP%\\" prefix is part of the
     printf format string ("%T" is not a valid conversion) -- confirm
     the intended path handling. */
  snprintf (fn,
            fn_size,
#ifdef WINDOWS
            "%TEMP%\\"
#else
            "/"
#endif
            "libextractor-%sshm-%u-%u",
            (is_tail) ? "t" : "",
            (unsigned int) getpid(),
            (unsigned int) RANDOM());
  *ptr = NULL;
#ifndef WINDOWS
  *shmid = shm_open (fn, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
  if (-1 == (*shmid))
    return 1;
  if (0 == ftruncate (*shmid, size))
    {
      *ptr = mmap (NULL, size, PROT_WRITE, MAP_SHARED, *shmid, 0);
      if ( (NULL != *ptr) &&
           (MAP_FAILED != *ptr) )
        return 0;
    }
  /* Sizing or mapping failed: do not hand MAP_FAILED back to the caller
     and do not leave the freshly created name behind. */
  *ptr = NULL;
  close (*shmid);
  *shmid = -1;
  shm_unlink (fn);
  return 1;
#else
  *mappedFile = CreateFile (fn,
                            GENERIC_READ | GENERIC_WRITE,
                            FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, CREATE_ALWAYS,
                            FILE_FLAG_DELETE_ON_CLOSE, NULL);
  /* NOTE(review): the mapping size is given as (high=1, low=0) and
     'size' is not used on this path -- confirm this is intended. */
  *map = CreateFileMapping (*mappedFile, NULL, PAGE_READWRITE, 1, 0, NULL);
  /* The view must be stored through 'ptr' (the caller copies data into
     it), and it must be writable for that copy to succeed. */
  *ptr = MapViewOfFile (*map, FILE_MAP_WRITE, 0, 0, 0);
  if (NULL == *ptr)
    {
      CloseHandle (*map);
      CloseHandle (*mappedFile);
      return 1;
    }
  return 0;
#endif
}
1535
1536
1537/**
1538 * Extract keywords using the given set of plugins.
1424 * 1539 *
1425 * @param plugins the list of plugins to use 1540 * @param plugins the list of plugins to use
1426 * @param filename the name of the file, can be NULL
1427 * @param data data to process, never NULL 1541 * @param data data to process, never NULL
1428 * @param size number of bytes in data, ignored if data is NULL 1542 * @param size number of bytes in data, ignored if data is NULL
1543 * @param tdata end of file data, or NULL
1544 * @param tsize number of bytes in tdata
1429 * @param proc function to call for each meta data item found 1545 * @param proc function to call for each meta data item found
1430 * @param proc_cls cls argument to proc 1546 * @param proc_cls cls argument to proc
1431 */ 1547 */
1432static void 1548static void
1433extract (struct EXTRACTOR_PluginList *plugins, 1549extract (struct EXTRACTOR_PluginList *plugins,
1434 const char * filename,
1435 const char * data, 1550 const char * data,
1436 size_t size, 1551 size_t size,
1552 const char * tdata,
1553 size_t tsize,
1437 EXTRACTOR_MetaDataProcessor proc, 1554 EXTRACTOR_MetaDataProcessor proc,
1438 void *proc_cls) 1555 void *proc_cls)
1439{ 1556{
1440 struct EXTRACTOR_PluginList *ppos; 1557 struct EXTRACTOR_PluginList *ppos;
1441#ifndef WINDOWS
1442 int shmid;
1443#else
1444 HANDLE map, mappedFile;
1445#endif
1446 enum EXTRACTOR_Options flags; 1558 enum EXTRACTOR_Options flags;
1447 void *ptr; 1559 void *ptr;
1560 void *tptr;
1448 char fn[255]; 1561 char fn[255];
1562 char tfn[255];
1449 int want_shm; 1563 int want_shm;
1564 int want_tail;
1565#ifndef WINDOWS
1566 int shmid;
1567 int tshmid;
1568#else
1569 HANDLE map;
1570 HANDLE mappedFile;
1571 HANDLE tmap;
1572 HANDLE tmappedFile;
1573#endif
1450 1574
1451 want_shm = 0; 1575 want_shm = 0;
1452 ppos = plugins; 1576 ppos = plugins;
@@ -1472,100 +1596,106 @@ extract (struct EXTRACTOR_PluginList *plugins,
1472 } 1596 }
1473 ppos = ppos->next; 1597 ppos = ppos->next;
1474 } 1598 }
1599 ptr = NULL;
1600 tptr = NULL;
1475 if (want_shm) 1601 if (want_shm)
1476 { 1602 {
1477 snprintf (fn, 1603 if (size > MAX_READ)
1478 sizeof(fn), 1604 size = MAX_READ;
1479#ifdef WINDOWS 1605 if (0 == make_shm (0,
1480 "%TEMP%\\" 1606 &ptr,
1607#ifndef WINDOWS
1608 &shmid,
1481#else 1609#else
1482 "/" 1610 &mappedFile,
1611 &map,
1483#endif 1612#endif
1484 "libextractor-shm-%u-%u", 1613 fn, sizeof(fn), size))
1485 getpid(),
1486 (unsigned int) RANDOM());
1487#ifndef WINDOWS
1488 shmid = shm_open (fn, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1489 ptr = NULL;
1490 if (shmid != -1)
1491 { 1614 {
1492 if ( (0 != ftruncate (shmid, size)) || 1615 memcpy (ptr, data, size);
1493 (NULL == (ptr = mmap (NULL, size, PROT_WRITE, MAP_SHARED, shmid, 0))) || 1616 if ( (tdata != NULL) &&
1494 (ptr == (void*) -1) ) 1617 (0 == make_shm (1,
1618 &tptr,
1619#ifndef WINDOWS
1620 &tshmid,
1621#else
1622 &tmappedFile,
1623 &tmap,
1624#endif
1625 tfn, sizeof(tfn), tsize)) )
1495 { 1626 {
1496 close (shmid); 1627 memcpy (tptr, tdata, tsize);
1497 shmid = -1;
1498 } 1628 }
1499 else 1629 else
1500 { 1630 {
1501 memcpy (ptr, data, size); 1631 tptr = NULL;
1502 } 1632 }
1503 } 1633 }
1504#else
1505 mappedFile = CreateFile (fn, GENERIC_READ | GENERIC_WRITE,
1506 FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, CREATE_ALWAYS,
1507 FILE_FLAG_DELETE_ON_CLOSE, NULL);
1508 map = CreateFileMapping (mappedFile, NULL, PAGE_READWRITE, 1, 0, NULL);
1509 ptr = MapViewOfFile (map, FILE_MAP_READ, 0, 0, 0);
1510 if (ptr == NULL)
1511 {
1512 CloseHandle (map);
1513 CloseHandle (mappedFile);
1514 map = NULL;
1515 }
1516 else 1634 else
1517 memcpy (ptr, data, size); 1635 {
1518#endif 1636 want_shm = 0;
1637 }
1519 } 1638 }
1520 else
1521#ifndef WINDOWS
1522 shmid = -1;
1523 if (want_shm && (shmid == -1))
1524 _exit(1);
1525#else
1526 map = NULL;
1527 if (want_shm && map == NULL)
1528 _exit(1);
1529#endif
1530 ppos = plugins; 1639 ppos = plugins;
1531 while (NULL != ppos) 1640 while (NULL != ppos)
1532 { 1641 {
1533 flags = ppos->flags; 1642 flags = ppos->flags;
1534#ifndef WINDOWS 1643 if (! want_shm)
1535 if (shmid == -1)
1536#else
1537 if (map == NULL)
1538#endif
1539 flags = EXTRACTOR_OPTION_IN_PROCESS; 1644 flags = EXTRACTOR_OPTION_IN_PROCESS;
1540 switch (flags) 1645 switch (flags)
1541 { 1646 {
1542 case EXTRACTOR_OPTION_DEFAULT_POLICY: 1647 case EXTRACTOR_OPTION_DEFAULT_POLICY:
1543 if (0 != extract_oop (ppos, fn, proc, proc_cls)) 1648 if (0 != extract_oop (ppos, fn,
1649 (tptr != NULL) ? tfn : NULL,
1650 proc, proc_cls))
1544 return; 1651 return;
1545 if (ppos->cpid == -1) 1652 if (ppos->cpid == -1)
1546 { 1653 {
1547 start_process (ppos); 1654 start_process (ppos);
1548 if (0 != extract_oop (ppos, fn, proc, proc_cls)) 1655 if (0 != extract_oop (ppos, fn,
1656 (tptr != NULL) ? tfn : NULL,
1657 proc, proc_cls))
1549 return; 1658 return;
1550 } 1659 }
1551 break; 1660 break;
1552 case EXTRACTOR_OPTION_OUT_OF_PROCESS_NO_RESTART: 1661 case EXTRACTOR_OPTION_OUT_OF_PROCESS_NO_RESTART:
1553 if (0 != extract_oop (ppos, fn, proc, proc_cls)) 1662 if (0 != extract_oop (ppos, fn,
1663 (tptr != NULL) ? tfn : NULL,
1664 proc, proc_cls))
1554 return; 1665 return;
1555 break; 1666 break;
1556 case EXTRACTOR_OPTION_IN_PROCESS: 1667 case EXTRACTOR_OPTION_IN_PROCESS:
1557 if (NULL == ppos->extractMethod) 1668 want_tail = ( (ppos->specials != NULL) &&
1669 (NULL != strstr (ppos->specials,
1670 "want-tail")));
1671 if (NULL == ppos->extractMethod)
1558 plugin_load (ppos); 1672 plugin_load (ppos);
1559 if ( ( (ppos->specials == NULL) || 1673 if ( ( (ppos->specials == NULL) ||
1560 (NULL == strstr (ppos->specials, 1674 (NULL == strstr (ppos->specials,
1561 "oop-only")) ) && 1675 "oop-only")) ) )
1562 (NULL != ppos->extractMethod) && 1676 {
1563 (0 != ppos->extractMethod (data, 1677 if (want_tail)
1564 size, 1678 {
1565 proc, 1679 if ( (NULL != ppos->extractMethod) &&
1566 proc_cls, 1680 (tdata != NULL) &&
1567 ppos->plugin_options)) ) 1681 (0 != ppos->extractMethod (tdata,
1568 return; 1682 tsize,
1683 proc,
1684 proc_cls,
1685 ppos->plugin_options)) )
1686 return;
1687 }
1688 else
1689 {
1690 if ( (NULL != ppos->extractMethod) &&
1691 (0 != ppos->extractMethod (data,
1692 size,
1693 proc,
1694 proc_cls,
1695 ppos->plugin_options)) )
1696 return;
1697 }
1698 }
1569 break; 1699 break;
1570 case EXTRACTOR_OPTION_DISABLED: 1700 case EXTRACTOR_OPTION_DISABLED:
1571 break; 1701 break;
@@ -1580,10 +1710,21 @@ extract (struct EXTRACTOR_PluginList *plugins,
1580 if (shmid != -1) 1710 if (shmid != -1)
1581 close (shmid); 1711 close (shmid);
1582 shm_unlink (fn); 1712 shm_unlink (fn);
1713 if (NULL != tptr)
1714 munmap (tptr, tsize);
1715 if (tshmid != -1)
1716 close (tshmid);
1717 shm_unlink (tfn);
1583#else 1718#else
1584 UnmapViewOfFile (ptr); 1719 UnmapViewOfFile (ptr);
1585 CloseHandle (map); 1720 CloseHandle (map);
1586 CloseHandle (mappedFile); 1721 CloseHandle (mappedFile);
1722 if (tptr != NULL)
1723 {
1724 UnmapViewOfFile (tptr);
1725 CloseHandle (tmap);
1726 CloseHandle (tmappedFile);
1727 }
1587#endif 1728#endif
1588 } 1729 }
1589} 1730}
@@ -1595,17 +1736,19 @@ extract (struct EXTRACTOR_PluginList *plugins,
1595 * contents if they were not compressed). 1736 * contents if they were not compressed).
1596 * 1737 *
1597 * @param plugins the list of plugins to use 1738 * @param plugins the list of plugins to use
1598 * @param filename the name of the file, can be NULL
1599 * @param data data to process, never NULL 1739 * @param data data to process, never NULL
1600 * @param size number of bytes in data, ignored if data is NULL 1740 * @param size number of bytes in data
1741 * @param tdata end of file data, or NULL
1742 * @param tsize number of bytes in tdata
1601 * @param proc function to call for each meta data item found 1743 * @param proc function to call for each meta data item found
1602 * @param proc_cls cls argument to proc 1744 * @param proc_cls cls argument to proc
1603 */ 1745 */
1604static void 1746static void
1605decompress_and_extract (struct EXTRACTOR_PluginList *plugins, 1747decompress_and_extract (struct EXTRACTOR_PluginList *plugins,
1606 const char * filename,
1607 const unsigned char * data, 1748 const unsigned char * data,
1608 size_t size, 1749 size_t size,
1750 const char * tdata,
1751 size_t tsize,
1609 EXTRACTOR_MetaDataProcessor proc, 1752 EXTRACTOR_MetaDataProcessor proc,
1610 void *proc_cls) { 1753 void *proc_cls) {
1611 unsigned char * buf; 1754 unsigned char * buf;
@@ -1838,9 +1981,10 @@ decompress_and_extract (struct EXTRACTOR_PluginList *plugins,
1838 size = dsize; 1981 size = dsize;
1839 } 1982 }
1840 extract (plugins, 1983 extract (plugins,
1841 filename,
1842 (const char*) data, 1984 (const char*) data,
1843 size, 1985 size,
1986 tdata,
1987 tsize,
1844 proc, 1988 proc,
1845 proc_cls); 1989 proc_cls);
1846 if (buf != NULL) 1990 if (buf != NULL)
@@ -1908,9 +2052,13 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
1908{ 2052{
1909 int fd; 2053 int fd;
1910 void * buffer; 2054 void * buffer;
2055 void * tbuffer;
1911 struct stat fstatbuf; 2056 struct stat fstatbuf;
1912 size_t fsize; 2057 size_t fsize;
2058 size_t tsize;
1913 int eno; 2059 int eno;
2060 off_t offset;
2061 long pg;
1914 2062
1915 fd = -1; 2063 fd = -1;
1916 buffer = NULL; 2064 buffer = NULL;
@@ -1941,14 +2089,41 @@ EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
1941 if ( (buffer == NULL) && 2089 if ( (buffer == NULL) &&
1942 (data == NULL) ) 2090 (data == NULL) )
1943 return; 2091 return;
2092 /* for footer extraction */
2093 tsize = 0;
2094 tbuffer = NULL;
2095 if ( (data == NULL) &&
2096 (fstatbuf.st_size > fsize) &&
2097 (fstatbuf.st_size > MAX_READ) )
2098 {
2099 pg = sysconf (_SC_PAGE_SIZE);
2100 if ( (pg > 0) &&
2101 (pg < MAX_READ) )
2102 {
2103 offset = (1 + (fstatbuf.st_size - MAX_READ) / pg) * pg;
2104 if (offset < fstatbuf.st_size)
2105 {
2106 tsize = fstatbuf.st_size - offset;
2107 tbuffer = MMAP (NULL, tsize, PROT_READ, MAP_PRIVATE, fd, offset);
2108 if ( (tbuffer == NULL) || (tbuffer == (void *) -1) )
2109 {
2110 tsize = 0;
2111 tbuffer = NULL;
2112 }
2113 }
2114 }
2115 }
1944 decompress_and_extract (plugins, 2116 decompress_and_extract (plugins,
1945 filename,
1946 buffer != NULL ? buffer : data, 2117 buffer != NULL ? buffer : data,
1947 buffer != NULL ? fsize : size, 2118 buffer != NULL ? fsize : size,
2119 tbuffer,
2120 tsize,
1948 proc, 2121 proc,
1949 proc_cls); 2122 proc_cls);
1950 if (buffer != NULL) 2123 if (buffer != NULL)
1951 MUNMAP (buffer, fsize); 2124 MUNMAP (buffer, fsize);
2125 if (tbuffer != NULL)
2126 MUNMAP (tbuffer, tsize);
1952 if (-1 != fd) 2127 if (-1 != fd)
1953 close(fd); 2128 close(fd);
1954} 2129}
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
index 0868ebd..07ecb63 100644
--- a/src/plugins/Makefile.am
+++ b/src/plugins/Makefile.am
@@ -86,6 +86,7 @@ plugin_LTLIBRARIES = \
86 libextractor_flv.la \ 86 libextractor_flv.la \
87 libextractor_gif.la \ 87 libextractor_gif.la \
88 libextractor_html.la \ 88 libextractor_html.la \
89 libextractor_id3.la \
89 libextractor_id3v2.la \ 90 libextractor_id3v2.la \
90 libextractor_id3v23.la \ 91 libextractor_id3v23.la \
91 libextractor_id3v24.la \ 92 libextractor_id3v24.la \
@@ -186,6 +187,13 @@ libextractor_html_la_LDFLAGS = \
186libextractor_html_la_LIBADD = \ 187libextractor_html_la_LIBADD = \
187 $(top_builddir)/src/common/libextractor_common.la 188 $(top_builddir)/src/common/libextractor_common.la
188 189
190libextractor_id3_la_SOURCES = \
191 id3_extractor.c
192libextractor_id3_la_LDFLAGS = \
193 $(PLUGINFLAGS)
194libextractor_id3_la_LIBADD = \
195 $(top_builddir)/src/common/libextractor_common.la
196
189libextractor_id3v2_la_SOURCES = \ 197libextractor_id3v2_la_SOURCES = \
190 id3v2_extractor.c 198 id3v2_extractor.c
191libextractor_id3v2_la_LDFLAGS = \ 199libextractor_id3v2_la_LDFLAGS = \
diff --git a/src/plugins/id3_extractor.c b/src/plugins/id3_extractor.c
new file mode 100644
index 0000000..be399e0
--- /dev/null
+++ b/src/plugins/id3_extractor.c
@@ -0,0 +1,305 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2006, 2009, 2010 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19
20 */
21
22#include "platform.h"
23#include "extractor.h"
24#include "convert.h"
25#include <string.h>
26#include <stdio.h>
27#include <sys/types.h>
28#include <sys/stat.h>
29#include <unistd.h>
30#include <stdlib.h>
31
32typedef struct
33{
34 char *title;
35 char *artist;
36 char *album;
37 char *year;
38 char *comment;
39 const char *genre;
40 unsigned int track_number;
41} id3tag;
42
43static const char *const genre_names[] = {
44 gettext_noop ("Blues"),
45 gettext_noop ("Classic Rock"),
46 gettext_noop ("Country"),
47 gettext_noop ("Dance"),
48 gettext_noop ("Disco"),
49 gettext_noop ("Funk"),
50 gettext_noop ("Grunge"),
51 gettext_noop ("Hip-Hop"),
52 gettext_noop ("Jazz"),
53 gettext_noop ("Metal"),
54 gettext_noop ("New Age"),
55 gettext_noop ("Oldies"),
56 gettext_noop ("Other"),
57 gettext_noop ("Pop"),
58 gettext_noop ("R&B"),
59 gettext_noop ("Rap"),
60 gettext_noop ("Reggae"),
61 gettext_noop ("Rock"),
62 gettext_noop ("Techno"),
63 gettext_noop ("Industrial"),
64 gettext_noop ("Alternative"),
65 gettext_noop ("Ska"),
66 gettext_noop ("Death Metal"),
67 gettext_noop ("Pranks"),
68 gettext_noop ("Soundtrack"),
69 gettext_noop ("Euro-Techno"),
70 gettext_noop ("Ambient"),
71 gettext_noop ("Trip-Hop"),
72 gettext_noop ("Vocal"),
73 gettext_noop ("Jazz+Funk"),
74 gettext_noop ("Fusion"),
75 gettext_noop ("Trance"),
76 gettext_noop ("Classical"),
77 gettext_noop ("Instrumental"),
78 gettext_noop ("Acid"),
79 gettext_noop ("House"),
80 gettext_noop ("Game"),
81 gettext_noop ("Sound Clip"),
82 gettext_noop ("Gospel"),
83 gettext_noop ("Noise"),
84 gettext_noop ("Alt. Rock"),
85 gettext_noop ("Bass"),
86 gettext_noop ("Soul"),
87 gettext_noop ("Punk"),
88 gettext_noop ("Space"),
89 gettext_noop ("Meditative"),
90 gettext_noop ("Instrumental Pop"),
91 gettext_noop ("Instrumental Rock"),
92 gettext_noop ("Ethnic"),
93 gettext_noop ("Gothic"),
94 gettext_noop ("Darkwave"),
95 gettext_noop ("Techno-Industrial"),
96 gettext_noop ("Electronic"),
97 gettext_noop ("Pop-Folk"),
98 gettext_noop ("Eurodance"),
99 gettext_noop ("Dream"),
100 gettext_noop ("Southern Rock"),
101 gettext_noop ("Comedy"),
102 gettext_noop ("Cult"),
103 gettext_noop ("Gangsta Rap"),
104 gettext_noop ("Top 40"),
105 gettext_noop ("Christian Rap"),
106 gettext_noop ("Pop/Funk"),
107 gettext_noop ("Jungle"),
108 gettext_noop ("Native American"),
109 gettext_noop ("Cabaret"),
110 gettext_noop ("New Wave"),
111 gettext_noop ("Psychedelic"),
112 gettext_noop ("Rave"),
113 gettext_noop ("Showtunes"),
114 gettext_noop ("Trailer"),
115 gettext_noop ("Lo-Fi"),
116 gettext_noop ("Tribal"),
117 gettext_noop ("Acid Punk"),
118 gettext_noop ("Acid Jazz"),
119 gettext_noop ("Polka"),
120 gettext_noop ("Retro"),
121 gettext_noop ("Musical"),
122 gettext_noop ("Rock & Roll"),
123 gettext_noop ("Hard Rock"),
124 gettext_noop ("Folk"),
125 gettext_noop ("Folk/Rock"),
126 gettext_noop ("National Folk"),
127 gettext_noop ("Swing"),
128 gettext_noop ("Fast-Fusion"),
129 gettext_noop ("Bebob"),
130 gettext_noop ("Latin"),
131 gettext_noop ("Revival"),
132 gettext_noop ("Celtic"),
133 gettext_noop ("Bluegrass"),
134 gettext_noop ("Avantgarde"),
135 gettext_noop ("Gothic Rock"),
136 gettext_noop ("Progressive Rock"),
137 gettext_noop ("Psychedelic Rock"),
138 gettext_noop ("Symphonic Rock"),
139 gettext_noop ("Slow Rock"),
140 gettext_noop ("Big Band"),
141 gettext_noop ("Chorus"),
142 gettext_noop ("Easy Listening"),
143 gettext_noop ("Acoustic"),
144 gettext_noop ("Humour"),
145 gettext_noop ("Speech"),
146 gettext_noop ("Chanson"),
147 gettext_noop ("Opera"),
148 gettext_noop ("Chamber Music"),
149 gettext_noop ("Sonata"),
150 gettext_noop ("Symphony"),
151 gettext_noop ("Booty Bass"),
152 gettext_noop ("Primus"),
153 gettext_noop ("Porn Groove"),
154 gettext_noop ("Satire"),
155 gettext_noop ("Slow Jam"),
156 gettext_noop ("Club"),
157 gettext_noop ("Tango"),
158 gettext_noop ("Samba"),
159 gettext_noop ("Folklore"),
160 gettext_noop ("Ballad"),
161 gettext_noop ("Power Ballad"),
162 gettext_noop ("Rhythmic Soul"),
163 gettext_noop ("Freestyle"),
164 gettext_noop ("Duet"),
165 gettext_noop ("Punk Rock"),
166 gettext_noop ("Drum Solo"),
167 gettext_noop ("A Cappella"),
168 gettext_noop ("Euro-House"),
169 gettext_noop ("Dance Hall"),
170 gettext_noop ("Goa"),
171 gettext_noop ("Drum & Bass"),
172 gettext_noop ("Club-House"),
173 gettext_noop ("Hardcore"),
174 gettext_noop ("Terror"),
175 gettext_noop ("Indie"),
176 gettext_noop ("BritPop"),
177 gettext_noop ("Negerpunk"),
178 gettext_noop ("Polsk Punk"),
179 gettext_noop ("Beat"),
180 gettext_noop ("Christian Gangsta Rap"),
181 gettext_noop ("Heavy Metal"),
182 gettext_noop ("Black Metal"),
183 gettext_noop ("Crossover"),
184 gettext_noop ("Contemporary Christian"),
185 gettext_noop ("Christian Rock"),
186 gettext_noop ("Merengue"),
187 gettext_noop ("Salsa"),
188 gettext_noop ("Thrash Metal"),
189 gettext_noop ("Anime"),
190 gettext_noop ("JPop"),
191 gettext_noop ("Synthpop"),
192};
193
194#define GENRE_NAME_COUNT \
195 ((unsigned int)(sizeof genre_names / sizeof (const char *const)))
196
197
198
199#define OK 0
200#define INVALID_ID3 1
201
/**
 * Remove trailing whitespace from a string in place.
 *
 * @param k 0-terminated, writable string; shortened by overwriting
 *        trailing whitespace characters with 0-terminators
 */
static void
trim (char *k)
{
  size_t len = strlen (k);

  /* <ctype.h> functions require a value representable as unsigned char;
     plain char may be negative for bytes >= 0x80. */
  while ( (len > 0) &&
          (isspace ((unsigned char) k[len - 1])) )
    k[--len] = '\0';
}
208
209static int
210get_id3 (const char *data, size_t size, id3tag * id3)
211{
212 const char *pos;
213
214 if (size < 128)
215 return INVALID_ID3;
216
217 pos = &data[size - 128];
218 if (0 != strncmp ("TAG", pos, 3))
219 return INVALID_ID3;
220 pos += 3;
221
222 id3->title = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
223 trim (id3->title);
224 pos += 30;
225 id3->artist = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
226 trim (id3->artist);
227 pos += 30;
228 id3->album = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
229 trim (id3->album);
230 pos += 30;
231 id3->year = EXTRACTOR_common_convert_to_utf8 (pos, 4, "ISO-8859-1");
232 trim (id3->year);
233 pos += 4;
234 id3->comment = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
235 trim (id3->comment);
236 if ( (pos[28] == '\0') &&
237 (pos[29] != '\0') )
238 {
239 /* ID3v1.1 */
240 id3->track_number = pos[29];
241 }
242 else
243 {
244 id3->track_number = 0;
245 }
246 pos += 30;
247 id3->genre = "";
248 if (pos[0] < GENRE_NAME_COUNT)
249 id3->genre = dgettext (PACKAGE, genre_names[(unsigned) pos[0]]);
250 return OK;
251}
252
253
254#define ADD(s,t) do { if (0 != (ret = proc (proc_cls, "id3", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1))) goto FINISH; } while (0)
255
256
/**
 * Report the special options of this plugin.
 *
 * @return the constant string "want-tail"
 */
const char *
EXTRACTOR_id3_options (void)
{
  return "want-tail";
}
262
263
264int
265EXTRACTOR_id3_extract (const char *data,
266 size_t size,
267 EXTRACTOR_MetaDataProcessor proc,
268 void *proc_cls,
269 const char *options)
270{
271 id3tag info;
272 char track[16];
273 int ret;
274
275 fprintf (stderr, "called with %llu bytes\n", (unsigned long long) size);
276 if (OK != get_id3 (data, size, &info))
277 return 0;
278 if (strlen (info.title) > 0)
279 ADD (info.title, EXTRACTOR_METATYPE_TITLE);
280 if (strlen (info.artist) > 0)
281 ADD (info.artist, EXTRACTOR_METATYPE_ARTIST);
282 if (strlen (info.album) > 0)
283 ADD (info.album, EXTRACTOR_METATYPE_ALBUM);
284 if (strlen (info.year) > 0)
285 ADD (info.year, EXTRACTOR_METATYPE_PUBLICATION_YEAR);
286 if (strlen (info.genre) > 0)
287 ADD (info.genre, EXTRACTOR_METATYPE_GENRE);
288 if (strlen (info.comment) > 0)
289 ADD (info.comment, EXTRACTOR_METATYPE_COMMENT);
290 if (info.track_number != 0)
291 {
292 snprintf(track,
293 sizeof(track), "%u", info.track_number);
294 ADD (track, EXTRACTOR_METATYPE_TRACK_NUMBER);
295 }
296FINISH:
297 free (info.title);
298 free (info.year);
299 free (info.album);
300 free (info.artist);
301 free (info.comment);
302 return ret;
303}
304
305/* end of id3_extractor.c */
diff --git a/src/plugins/mp3_extractor.c b/src/plugins/mp3_extractor.c
index 2696431..a60754a 100644
--- a/src/plugins/mp3_extractor.c
+++ b/src/plugins/mp3_extractor.c
@@ -36,172 +36,6 @@
36#include <unistd.h> 36#include <unistd.h>
37#include <stdlib.h> 37#include <stdlib.h>
38 38
/* Fields of an ID3v1 tag as filled in by get_id3().  The five char *
   members hold strings returned by EXTRACTOR_common_convert_to_utf8()
   and are released with free() by EXTRACTOR_mp3_extract(). */
39typedef struct
40{
41 char *title; /* 30-byte title field, converted and trimmed */
42 char *artist; /* 30-byte artist field, converted and trimmed */
43 char *album; /* 30-byte album field, converted and trimmed */
44 char *year; /* 4-byte year field, converted and trimmed */
45 char *comment; /* 30-byte comment field, converted and trimmed */
46 const char *genre; /* "" or a dgettext() result; never freed */
47 unsigned int track_number; /* 0 unless get_id3() saw the ID3v1.1 track byte */
48} id3tag;
49
/* Genre names for ID3v1 tags.  get_id3() uses the genre byte of the tag
   as an index into this table and passes the selected entry through
   dgettext(), so the order of the entries must not change.  Every entry
   is wrapped in gettext_noop(); the strings double as translation
   message ids and are therefore kept exactly as spelled here. */
50static const char *const genre_names[] = {
51 gettext_noop ("Blues"),
52 gettext_noop ("Classic Rock"),
53 gettext_noop ("Country"),
54 gettext_noop ("Dance"),
55 gettext_noop ("Disco"),
56 gettext_noop ("Funk"),
57 gettext_noop ("Grunge"),
58 gettext_noop ("Hip-Hop"),
59 gettext_noop ("Jazz"),
60 gettext_noop ("Metal"),
61 gettext_noop ("New Age"),
62 gettext_noop ("Oldies"),
63 gettext_noop ("Other"),
64 gettext_noop ("Pop"),
65 gettext_noop ("R&B"),
66 gettext_noop ("Rap"),
67 gettext_noop ("Reggae"),
68 gettext_noop ("Rock"),
69 gettext_noop ("Techno"),
70 gettext_noop ("Industrial"),
71 gettext_noop ("Alternative"),
72 gettext_noop ("Ska"),
73 gettext_noop ("Death Metal"),
74 gettext_noop ("Pranks"),
75 gettext_noop ("Soundtrack"),
76 gettext_noop ("Euro-Techno"),
77 gettext_noop ("Ambient"),
78 gettext_noop ("Trip-Hop"),
79 gettext_noop ("Vocal"),
80 gettext_noop ("Jazz+Funk"),
81 gettext_noop ("Fusion"),
82 gettext_noop ("Trance"),
83 gettext_noop ("Classical"),
84 gettext_noop ("Instrumental"),
85 gettext_noop ("Acid"),
86 gettext_noop ("House"),
87 gettext_noop ("Game"),
88 gettext_noop ("Sound Clip"),
89 gettext_noop ("Gospel"),
90 gettext_noop ("Noise"),
91 gettext_noop ("Alt. Rock"),
92 gettext_noop ("Bass"),
93 gettext_noop ("Soul"),
94 gettext_noop ("Punk"),
95 gettext_noop ("Space"),
96 gettext_noop ("Meditative"),
97 gettext_noop ("Instrumental Pop"),
98 gettext_noop ("Instrumental Rock"),
99 gettext_noop ("Ethnic"),
100 gettext_noop ("Gothic"),
101 gettext_noop ("Darkwave"),
102 gettext_noop ("Techno-Industrial"),
103 gettext_noop ("Electronic"),
104 gettext_noop ("Pop-Folk"),
105 gettext_noop ("Eurodance"),
106 gettext_noop ("Dream"),
107 gettext_noop ("Southern Rock"),
108 gettext_noop ("Comedy"),
109 gettext_noop ("Cult"),
110 gettext_noop ("Gangsta Rap"),
111 gettext_noop ("Top 40"),
112 gettext_noop ("Christian Rap"),
113 gettext_noop ("Pop/Funk"),
114 gettext_noop ("Jungle"),
115 gettext_noop ("Native American"),
116 gettext_noop ("Cabaret"),
117 gettext_noop ("New Wave"),
118 gettext_noop ("Psychedelic"),
119 gettext_noop ("Rave"),
120 gettext_noop ("Showtunes"),
121 gettext_noop ("Trailer"),
122 gettext_noop ("Lo-Fi"),
123 gettext_noop ("Tribal"),
124 gettext_noop ("Acid Punk"),
125 gettext_noop ("Acid Jazz"),
126 gettext_noop ("Polka"),
127 gettext_noop ("Retro"),
128 gettext_noop ("Musical"),
129 gettext_noop ("Rock & Roll"),
130 gettext_noop ("Hard Rock"),
131 gettext_noop ("Folk"),
132 gettext_noop ("Folk/Rock"),
133 gettext_noop ("National Folk"),
134 gettext_noop ("Swing"),
135 gettext_noop ("Fast-Fusion"),
136 gettext_noop ("Bebob"),
137 gettext_noop ("Latin"),
138 gettext_noop ("Revival"),
139 gettext_noop ("Celtic"),
140 gettext_noop ("Bluegrass"),
141 gettext_noop ("Avantgarde"),
142 gettext_noop ("Gothic Rock"),
143 gettext_noop ("Progressive Rock"),
144 gettext_noop ("Psychedelic Rock"),
145 gettext_noop ("Symphonic Rock"),
146 gettext_noop ("Slow Rock"),
147 gettext_noop ("Big Band"),
148 gettext_noop ("Chorus"),
149 gettext_noop ("Easy Listening"),
150 gettext_noop ("Acoustic"),
151 gettext_noop ("Humour"),
152 gettext_noop ("Speech"),
153 gettext_noop ("Chanson"),
154 gettext_noop ("Opera"),
155 gettext_noop ("Chamber Music"),
156 gettext_noop ("Sonata"),
157 gettext_noop ("Symphony"),
158 gettext_noop ("Booty Bass"),
159 gettext_noop ("Primus"),
160 gettext_noop ("Porn Groove"),
161 gettext_noop ("Satire"),
162 gettext_noop ("Slow Jam"),
163 gettext_noop ("Club"),
164 gettext_noop ("Tango"),
165 gettext_noop ("Samba"),
166 gettext_noop ("Folklore"),
167 gettext_noop ("Ballad"),
168 gettext_noop ("Power Ballad"),
169 gettext_noop ("Rhythmic Soul"),
170 gettext_noop ("Freestyle"),
171 gettext_noop ("Duet"),
172 gettext_noop ("Punk Rock"),
173 gettext_noop ("Drum Solo"),
174 gettext_noop ("A Cappella"),
175 gettext_noop ("Euro-House"),
176 gettext_noop ("Dance Hall"),
177 gettext_noop ("Goa"),
178 gettext_noop ("Drum & Bass"),
179 gettext_noop ("Club-House"),
180 gettext_noop ("Hardcore"),
181 gettext_noop ("Terror"),
182 gettext_noop ("Indie"),
183 gettext_noop ("BritPop"),
184 gettext_noop ("Negerpunk"),
185 gettext_noop ("Polsk Punk"),
186 gettext_noop ("Beat"),
187 gettext_noop ("Christian Gangsta Rap"),
188 gettext_noop ("Heavy Metal"),
189 gettext_noop ("Black Metal"),
190 gettext_noop ("Crossover"),
191 gettext_noop ("Contemporary Christian"),
192 gettext_noop ("Christian Rock"),
193 gettext_noop ("Merengue"),
194 gettext_noop ("Salsa"),
195 gettext_noop ("Thrash Metal"),
196 gettext_noop ("Anime"),
197 gettext_noop ("JPop"),
198 gettext_noop ("Synthpop"),
199};
200
/* Number of entries in genre_names, as an unsigned int; get_id3() only
   indexes the table with genre bytes below this value. */
201#define GENRE_NAME_COUNT \
202 ((unsigned int)(sizeof genre_names / sizeof (const char *const)))
203
204
205#define MAX_MP3_SCAN_DEEP 16768 39#define MAX_MP3_SCAN_DEEP 16768
206const int max_frames_scan = 1024; 40const int max_frames_scan = 1024;
207enum 41enum
@@ -270,64 +104,15 @@ static const char * const layer_names[3] = {
270#define SYSERR 1 104#define SYSERR 1
271#define INVALID_ID3 2 105#define INVALID_ID3 2
272 106
/**
 * Remove trailing whitespace from a string, in place.
 *
 * @param k NUL-terminated, writable string to shorten; NULL is ignored
 */
static void
trim (char *k)
{
  size_t len;

  if (NULL == k)
    return;
  len = strlen (k);
  /* isspace() is only defined for EOF and values representable as
     unsigned char, so a plain (possibly negative) char must be
     converted first; non-ASCII bytes are expected in converted tags. */
  while ((len > 0) && isspace ((unsigned char) k[len - 1]))
    len--;
  k[len] = '\0';
}
279
280static int
281get_id3 (const char *data, size_t size, id3tag * id3)
282{
283 const char *pos;
284
285 if (size < 128)
286 return INVALID_ID3;
287
288 pos = &data[size - 128];
289 if (0 != strncmp ("TAG", pos, 3))
290 return INVALID_ID3;
291 pos += 3;
292
293 id3->title = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
294 trim (id3->title);
295 pos += 30;
296 id3->artist = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
297 trim (id3->artist);
298 pos += 30;
299 id3->album = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
300 trim (id3->album);
301 pos += 30;
302 id3->year = EXTRACTOR_common_convert_to_utf8 (pos, 4, "ISO-8859-1");
303 trim (id3->year);
304 pos += 4;
305 id3->comment = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1");
306 trim (id3->comment);
307 if ( (pos[28] == '\0') &&
308 (pos[29] != '\0') )
309 {
310 /* ID3v1.1 */
311 id3->track_number = pos[29];
312 }
313 else
314 {
315 id3->track_number = 0;
316 }
317 pos += 30;
318 id3->genre = "";
319 if (pos[0] < GENRE_NAME_COUNT)
320 id3->genre = dgettext (PACKAGE, genre_names[(unsigned) pos[0]]);
321 return OK;
322}
323
324
325#define ADDR(s,t) do { if (0 != proc (proc_cls, "mp3", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) return 1; } while (0) 107#define ADDR(s,t) do { if (0 != proc (proc_cls, "mp3", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) return 1; } while (0)
326 108
327static int 109/* mimetype = audio/mpeg */
328mp3parse (const unsigned char *data, size_t size, 110int
329 EXTRACTOR_MetaDataProcessor proc, 111EXTRACTOR_mp3_extract (const unsigned char *data,
330 void *proc_cls) 112 size_t size,
113 EXTRACTOR_MetaDataProcessor proc,
114 void *proc_cls,
115 const char *options)
331{ 116{
332 unsigned int header; 117 unsigned int header;
333 int counter = 0; 118 int counter = 0;
@@ -474,50 +259,4 @@ mp3parse (const unsigned char *data, size_t size,
474 return 0; 259 return 0;
475} 260}
476 261
477
/* Hand the NUL-terminated string `s` to the metadata processor as an item
   of meta type `t` (plugin name "mp3", UTF-8 format, mime type
   "text/plain", length including the terminating NUL).  The processor's
   result is stored in `ret`; a non-zero result jumps to the FINISH label
   of the enclosing function.  Can only be expanded where `proc`,
   `proc_cls`, `ret` and a FINISH label are in scope.  `s` is evaluated
   twice. */
#define ADD(s,t) \
  do \
    { \
      ret = proc (proc_cls, "mp3", t, EXTRACTOR_METAFORMAT_UTF8, \
                  "text/plain", s, strlen (s) + 1); \
      if (0 != ret) \
        goto FINISH; \
    } \
  while (0)
479
480
481/* mimetype = audio/mpeg */
482int
483EXTRACTOR_mp3_extract (const char *data,
484 size_t size,
485 EXTRACTOR_MetaDataProcessor proc,
486 void *proc_cls,
487 const char *options)
488{
489 id3tag info;
490 char track[16];
491 int ret;
492
493 if (0 != get_id3 (data, size, &info))
494 return 0;
495 if (strlen (info.title) > 0)
496 ADD (info.title, EXTRACTOR_METATYPE_TITLE);
497 if (strlen (info.artist) > 0)
498 ADD (info.artist, EXTRACTOR_METATYPE_ARTIST);
499 if (strlen (info.album) > 0)
500 ADD (info.album, EXTRACTOR_METATYPE_ALBUM);
501 if (strlen (info.year) > 0)
502 ADD (info.year, EXTRACTOR_METATYPE_PUBLICATION_YEAR);
503 if (strlen (info.genre) > 0)
504 ADD (info.genre, EXTRACTOR_METATYPE_GENRE);
505 if (strlen (info.comment) > 0)
506 ADD (info.comment, EXTRACTOR_METATYPE_COMMENT);
507 if (info.track_number != 0)
508 {
509 snprintf(track,
510 sizeof(track), "%u", info.track_number);
511 ADD (track, EXTRACTOR_METATYPE_TRACK_NUMBER);
512 }
513 ret = mp3parse ((const unsigned char *) data, size, proc, proc_cls);
514FINISH:
515 free (info.title);
516 free (info.year);
517 free (info.album);
518 free (info.artist);
519 free (info.comment);
520 return ret;
521}
522
523/* end of mp3_extractor.c */ 262/* end of mp3_extractor.c */