diff options
Diffstat (limited to 'doc/extractor.texi')
-rw-r--r-- | doc/extractor.texi | 264 |
1 files changed, 60 insertions, 204 deletions
diff --git a/doc/extractor.texi b/doc/extractor.texi index 31fdf79..6fe55fe 100644 --- a/doc/extractor.texi +++ b/doc/extractor.texi | |||
@@ -158,15 +158,9 @@ end-users can be find in the man page on @command{extract} (using | |||
158 | @verb{|man extract|}). | 158 | @verb{|man extract|}). |
159 | 159 | ||
160 | @cindex license | 160 | @cindex license |
161 | @gnule{} is licensed under the GNU General Public License. The | 161 | @gnule{} is licensed under the GNU General Public License; |
162 | developers have frequently received requests to license GNU | 162 | specifically, since version 0.7, @gnule{} is licensed under GPLv3 |
163 | libextractor under alternative terms. However, @gnule{} | 163 | @emph{or any later version}. |
164 | borrows plenty of GPL-licensed code from various other projects. | ||
165 | Hence we cannot change the license (even if we wanted to).@footnote{It | ||
166 | maybe possible to switch to GPLv3 in the future. For this, an audit | ||
167 | of the license status of our dependencies would be required. The new | ||
168 | code that was developed specifically for @gnule{} has | ||
169 | always been licensed under GPLv2 @emph{or any later version}.} | ||
170 | 164 | ||
171 | @node Preparation | 165 | @node Preparation |
172 | @chapter Preparation | 166 | @chapter Preparation |
@@ -181,7 +175,7 @@ using @command{configure} and @command{make}. For details on the GNU | |||
181 | autotools build process, read the @file{INSTALL} file and query | 175 | autotools build process, read the @file{INSTALL} file and query |
182 | @verb{|./configure --help|} for additional options. | 176 | @verb{|./configure --help|} for additional options. |
183 | 177 | ||
184 | @gnule{} has various dependencies, some of which are optional. | 178 | @gnule{} has various dependencies, most of which are optional. |
185 | Instead of specifying the names of the software packages, we | 179 | Instead of specifying the names of the software packages, we |
186 | will give the list in terms of the names of the respective | 180 | will give the list in terms of the names of the respective |
187 | Debian (unstable) packages that should be installed. | 181 | Debian (unstable) packages that should be installed. |
@@ -199,38 +193,34 @@ make | |||
199 | g++ | 193 | g++ |
200 | @item | 194 | @item |
201 | libltdl7-dev | 195 | libltdl7-dev |
202 | @item | ||
203 | zlib1g-dev | ||
204 | @item | ||
205 | libbz2-dev | ||
206 | @end itemize | 196 | @end itemize |
207 | 197 | ||
208 | Recommended dependencies are: | 198 | Recommended dependencies are: |
209 | @itemize @bullet | 199 | @itemize @bullet |
210 | @item | 200 | @item |
211 | libgtk2.0-dev | 201 | zlib1g-dev |
202 | @item | ||
203 | libbz2-dev | ||
204 | @item | ||
205 | libgif-dev | ||
212 | @item | 206 | @item |
213 | libvorbis-dev | 207 | libvorbis-dev |
214 | @item | 208 | @item |
215 | libflac-dev | 209 | libflac-dev |
216 | @item | 210 | @item |
217 | libgsf-1-dev | ||
218 | @item | ||
219 | libmpeg2-4-dev | 211 | libmpeg2-4-dev |
220 | @item | 212 | @item |
221 | libqt4-dev | ||
222 | @item | ||
223 | librpm-dev | 213 | librpm-dev |
224 | @item | 214 | @item |
215 | libgtk2.0-dev | ||
216 | @item | ||
217 | libgsf-1-dev | ||
218 | @item | ||
219 | libqt4-dev | ||
220 | @item | ||
225 | libpoppler-dev | 221 | libpoppler-dev |
226 | @item | 222 | @item |
227 | libexiv2-dev | 223 | libexiv2-dev |
228 | @end itemize | ||
229 | |||
230 | Optional dependencies (you would need to additionally specify | ||
231 | the configure option @code{--enable-ffmpeg}) to make use of these | ||
232 | are: | ||
233 | @itemize @bullet | ||
234 | @item | 224 | @item |
235 | libavformat-dev | 225 | libavformat-dev |
236 | @item | 226 | @item |
@@ -355,7 +345,8 @@ to LDFLAGS. | |||
355 | // hello.c | 345 | // hello.c |
356 | #include <Extractor/extractor.h> | 346 | #include <Extractor/extractor.h> |
357 | 347 | ||
358 | int main() | 348 | int |
349 | main (int argc, char **argv) | ||
359 | { | 350 | { |
360 | struct EXTRACTOR_PluginList *el; | 351 | struct EXTRACTOR_PluginList *el; |
361 | el = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_DEFAULT_POLICY); | 352 | el = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_DEFAULT_POLICY); |
@@ -408,9 +399,7 @@ Notice the difference in the @code{#include} line. | |||
408 | @section Note to package maintainers | 399 | @section Note to package maintainers |
409 | 400 | ||
410 | The suggested way to package GNU libextractor is to split it into | 401 | The suggested way to package GNU libextractor is to split it into |
411 | roughly the following binary packages:@footnote{Debian policy | 402 | roughly the following binary packages: |
412 | furthermore requires a @file{-dev} (meta) package that would depend on | ||
413 | all of the above packages.} | ||
414 | 403 | ||
415 | @itemize @bullet | 404 | @itemize @bullet |
416 | @item | 405 | @item |
@@ -491,7 +480,10 @@ executing binary itself) looks like this: | |||
491 | 480 | ||
492 | @verbatim | 481 | @verbatim |
493 | #include <extractor.h> | 482 | #include <extractor.h> |
494 | int main(int argc, char ** argv) { | 483 | |
484 | int | ||
485 | main (int argc, char ** argv) | ||
486 | { | ||
495 | struct EXTRACTOR_PluginList *plugins | 487 | struct EXTRACTOR_PluginList *plugins |
496 | = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_DEFAULT_POLICY); | 488 | = EXTRACTOR_plugin_add_defaults (EXTRACTOR_OPTION_DEFAULT_POLICY); |
497 | EXTRACTOR_extract (plugins, argv[1], | 489 | EXTRACTOR_extract (plugins, argv[1], |
@@ -740,7 +732,7 @@ with older versions of Java. | |||
740 | 732 | ||
741 | @section Mono | 733 | @section Mono |
742 | 734 | ||
743 | This binding is undocumented at this point. | 735 | This binding is undocumented at this point. |
744 | 736 | ||
745 | @section Perl | 737 | @section Perl |
746 | 738 | ||
@@ -802,77 +794,28 @@ The @verb{|EXTRACTOR_meta_data_print|} is a simple function which prints the met | |||
802 | 794 | ||
803 | @itemize @bullet | 795 | @itemize @bullet |
804 | @item | 796 | @item |
805 | APPLEFILE | 797 | EXIV2 (using libexiv2) |
806 | @item | 798 | @item |
807 | ASF | 799 | FLAC (using libFLAC) |
808 | @item | ||
809 | DEB | ||
810 | @item | ||
811 | DVI | ||
812 | @item | ||
813 | ELF | ||
814 | @item | ||
815 | EXIV2 | ||
816 | @item | ||
817 | FLAC | ||
818 | @item | ||
819 | FLV | ||
820 | @item | ||
821 | GIF | ||
822 | @item | ||
823 | HTML | ||
824 | @item | ||
825 | ID3 (v2.0, v2.3, v2.4) | ||
826 | @item | 800 | @item |
827 | IT | 801 | GIF (using libgif) |
828 | @item | 802 | @item |
829 | JPEG | 803 | JPEG |
830 | @item | 804 | @item |
831 | OLE2 | 805 | MIME (using libmagic) |
832 | @item | ||
833 | thumbnail (GTK, QT or FFMPEG-based) | ||
834 | @item | ||
835 | MAN | ||
836 | @item | ||
837 | MIME | ||
838 | @item | 806 | @item |
839 | MP3 (ID3v1) | 807 | MP3 (ID3v1) |
840 | @item | 808 | @item |
841 | MPEG | 809 | MPEG (using libmpeg2) |
842 | @item | ||
843 | NSF and NSFE | ||
844 | @item | ||
845 | ODF | ||
846 | @item | 810 | @item |
847 | PNG | 811 | PNG |
848 | @item | 812 | @item |
849 | PS (PostScript) | 813 | RPM (using librpm) |
850 | @item | ||
851 | QT (QuickTime) | ||
852 | @item | ||
853 | REAL | ||
854 | @item | ||
855 | RIFF | ||
856 | @item | ||
857 | RPM | ||
858 | @item | ||
859 | S3M | ||
860 | @item | ||
861 | SID | ||
862 | @item | ||
863 | TAR | ||
864 | @item | ||
865 | TIFF | ||
866 | @item | ||
867 | WAV | ||
868 | @item | ||
869 | XM | ||
870 | @item | ||
871 | ZIP | ||
872 | @end itemize | 814 | @end itemize |
873 | 815 | ||
874 | @file{gzip} and @file{bzip2} compressed versions of these formats are | 816 | @file{gzip} and @file{bzip2} compressed versions of these formats are |
875 | also supported (as well as meta data embedded by @file{gzip} itself). | 817 | also supported (as well as meta data embedded by @file{gzip} itself) |
818 | if zlib or libbz2 are available. | ||
876 | 819 | ||
877 | @node Writing new Plugins | 820 | @node Writing new Plugins |
878 | @chapter Writing new Plugins | 821 | @chapter Writing new Plugins |
@@ -891,28 +834,20 @@ assume that the remainder of the file is well formed. | |||
891 | 834 | ||
892 | The plugin library must be called libextractor_XXX.so, where XXX | 835 | The plugin library must be called libextractor_XXX.so, where XXX |
893 | denotes the file format of the plugin. The library must export a | 836 | denotes the file format of the plugin. The library must export a |
894 | method @verb{|libextractor_XXX_extract|}, with the following | 837 | method @verb{|EXTRACTOR_XXX_extract_method|}, with the following |
895 | signature: | 838 | signature: |
896 | @verbatim | 839 | @verbatim |
897 | int | 840 | void |
898 | EXTRACTOR_XXX_extract | 841 | EXTRACTOR_XXX_extract_method (struct EXTRACTOR_ExtractContext *ec); |
899 | (const char *data, | ||
900 | size_t data_size, | ||
901 | EXTRACTOR_MetaDataProcessor proc, | ||
902 | void *proc_cls, | ||
903 | const char * options); | ||
904 | @end verbatim | 842 | @end verbatim |
905 | 843 | ||
906 | @samp{data} is a pointer to the typically memory mapped contents of | 844 | @samp{ec} contains various information the plugin may need for its |
907 | the file. Note that plugins cannot ignore the @verb{|const|} | 845 | execution. Most importantly, it contains functions for reading |
908 | annotation since the memory mapping may have been done read-only (and | 846 | (``read'') and seeking (``seek'') the input data and for returning |
909 | thus writes to this page will result in an error). The @samp{data_size} | 847 | extracted data (``proc''). The ``config'' member can contain |
910 | argument specifies the size of the @samp{data} buffer in bytes. | 848 | additional configuration options. ``proc'' should be called on |
911 | 849 | each meta data item found. If ``proc'' returns non-zero, | |
912 | @samp{proc} should be called on each meta data item found. If @samp{proc} | 850 | processing should be aborted (if possible). |
913 | returns non-zero, processing should be aborted and the @code{extract} | ||
914 | function must return 1. Otherwise @code{extract} should always return zero. | ||
915 | |||
916 | 851 | ||
917 | In order to test new plugins, the @file{extract} command can be run | 852 | In order to test new plugins, the @file{extract} command can be run |
918 | with the options ``-ni'' and ``-l XXX''. This will run the plugin | 853 | with the options ``-ni'' and ``-l XXX''. This will run the plugin |
@@ -926,110 +861,31 @@ The following example shows how a plugin can return the mime type of | |||
926 | a file. | 861 | a file. |
927 | @example | 862 | @example |
928 | @verbatim | 863 | @verbatim |
929 | int | 864 | void |
930 | EXTRACTOR_mymime_extract | 865 | EXTRACTOR_mymime_extract_method (struct EXTRACTOR_ExtractContext *ec) |
931 | (const char *data, | ||
932 | size_t data_size, | ||
933 | EXTRACTOR_MetaDataProcessor proc, | ||
934 | void *proc_cls, | ||
935 | const char * options) | ||
936 | { | 866 | { |
867 | void *data; | ||
868 | ssize_t data_size; | ||
869 | |||
870 | if (-1 == (data_size = ec->read (ec->cls, &data, 4))) | ||
871 | return; /* read error */ | ||
937 | if (data_size < 4) | 872 | if (data_size < 4) |
938 | return 0; | 873 | return; /* file too small */ |
939 | if (0 != memcmp (data, "\177ELF", 4)) | 874 | if (0 != memcmp (data, "\177ELF", 4)) |
940 | return 0; | 875 | return; /* not ELF */ |
941 | if (0 != proc (proc_cls, | 876 | if (0 != ec->proc (ec->cls, |
942 | "mymime", | 877 | "mymime", |
943 | EXTRACTOR_METATYPE_MIMETYPE, | 878 | EXTRACTOR_METATYPE_MIMETYPE, |
944 | EXTRACTOR_METAFORMAT_UTF8, | 879 | EXTRACTOR_METAFORMAT_UTF8, |
945 | "text/plain", | 880 | "text/plain", |
946 | "application/x-executable", | 881 | "application/x-executable", |
947 | 1 + strlen("application/x-executable"))) | 882 | 1 + strlen("application/x-executable"))) |
948 | return 1; | 883 | return; |
949 | /* more calls to 'proc' here as needed */ | 884 | /* more calls to 'proc' here as needed */ |
950 | return 0; | ||
951 | } | 885 | } |
952 | @end verbatim | 886 | @end verbatim |
953 | @end example | 887 | @end example |
954 | 888 | ||
955 | @section Plugin execution options | ||
956 | |||
957 | Plugins can request that their execution be done in a particular way. | ||
958 | For this, the plugin defines a function with the following signature: | ||
959 | |||
960 | @verbatim | ||
961 | const char * | ||
962 | EXTRACTOR_XXX_options (void); | ||
963 | @end verbatim | ||
964 | |||
965 | The function should return a string with the execution options. | ||
966 | Individual options in this string should be separated by semicolons. | ||
967 | Options that are included in the string but not known to the library | ||
968 | are ignored. The following options are supported: | ||
969 | |||
970 | @itemize @bullet | ||
971 | @item | ||
972 | @code{oop-only} ensures that the plugin is only run out-of-process; if | ||
973 | this is not possible, the plugin will not be executed at all if this | ||
974 | option is set. | ||
975 | |||
976 | @item | ||
977 | @code{close-stderr} ensures that @code{stderr} is closed during the | ||
978 | execution of the plugin. This is useful if the plugin uses libraries | ||
979 | that write (error) messages to @code{stderr} and where this behavior cannot be | ||
980 | turned off. This option only works if the plugin is executed out-of-process. | ||
981 | |||
982 | @item | ||
983 | @code{close-stdout} ensures that @code{stdout} is closed during the | ||
984 | execution of the plugin. This is useful if the plugin uses libraries | ||
985 | that write messages to @code{stdout} and where this behavior cannot be | ||
986 | turned off. This option only works if the plugin is executed out-of-process. | ||
987 | |||
988 | @item | ||
989 | @code{force-kill} kills and restarts the plugin process for each | ||
990 | file that is being analyzed. This is useful if the plugin uses | ||
991 | libraries that keep global state between runs that is problematic or | ||
992 | if the plugin uses libraries that are known to have serious resource | ||
993 | leaks (such as memory leaks). | ||
994 | |||
995 | @item | ||
996 | @code{want-tail} | ||
997 | In order to limit memory consumption, limit the amount if reading from | ||
998 | disk and to keep the API simple, the @samp{data} argument passed to | ||
999 | the @code{EXTRACTOR_XXX_extract} method bounded (to 32 MB of normal | ||
1000 | data; for compressed data, a limit of 16 MB is imposed).@footnote{If | ||
1001 | @gnule{} was given a pointer to an existing, uncompressed block of | ||
1002 | data in memory, no bound is imposed for plugins executing in-process; | ||
1003 | for out-of-process plugins, a 32 MB limit is still imposed.} Since | ||
1004 | some file formats contain meta data at the end of the file, this option | ||
1005 | provides a way for plugins to access not the first 16--32 MB of a file | ||
1006 | but instead the last (roughly) 32 MB. | ||
1007 | |||
1008 | Note that even for files larger than 32 MB, @samp{size} is not | ||
1009 | guaranteed to be 32 MB since @samp{data} will be aligned to the page | ||
1010 | size of the operating system. However, the last byte of @samp{data} | ||
1011 | is guaranteed to be the last byte of the file. Furthermore, if the | ||
1012 | file was large and compressed, unlike in the case of meta data | ||
1013 | extraction from the header, the end of the file will not be | ||
1014 | automatically decompressed by @gnule{}. | ||
1015 | |||
1016 | @end itemize | ||
1017 | |||
1018 | Note that using options other than @code{want-tail} is pretty much | ||
1019 | always a kludge and should thus be avoided. | ||
1020 | |||
1021 | @section Example for an options method | ||
1022 | |||
1023 | The following example shows how a plugin can set some of the options listed above: | ||
1024 | @example | ||
1025 | @verbatim | ||
1026 | const char * | ||
1027 | EXTRACTOR_id3_options () | ||
1028 | { | ||
1029 | return "close-stderr;want-tail"; | ||
1030 | } | ||
1031 | @end verbatim | ||
1032 | @end example | ||
1033 | 889 | ||
1034 | @node Internal utility functions | 890 | @node Internal utility functions |
1035 | @chapter Internal utility functions | 891 | @chapter Internal utility functions |
@@ -1055,7 +911,7 @@ byte order at the same time. | |||
1055 | @file{convert.h} provides a function for character set conversion described | 911 | @file{convert.h} provides a function for character set conversion described |
1056 | below. | 912 | below. |
1057 | 913 | ||
1058 | @deftypefun {char *} EXTRACTOR_common_convert_to_utf8 (const char *input, size_t len, const char * charset) | 914 | @deftypefun {char *} EXTRACTOR_common_convert_to_utf8 (const char *input, size_t len, const char *charset) |
1059 | @cindex UTF-8 | 915 | @cindex UTF-8 |
1060 | @cindex character set | 916 | @cindex character set |
1061 | @findex EXTRACTOR_common_convert_to_utf8 | 917 | @findex EXTRACTOR_common_convert_to_utf8 |