diff options
author | Christian Grothoff <christian@grothoff.org> | 2009-12-20 00:06:45 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2009-12-20 00:06:45 +0000 |
commit | 949dae1583254b789e3dafe569e030140a621846 (patch) | |
tree | a6e356030fcb030bd88d96b74d7aa62e1dd5aeb7 | |
parent | 73b50507dba0570f2182f21f8b1c27a95886e4e7 (diff) | |
download | libextractor-949dae1583254b789e3dafe569e030140a621846.tar.gz libextractor-949dae1583254b789e3dafe569e030140a621846.zip |
id3vx
-rw-r--r-- | TODO | 2 | ||||
-rw-r--r-- | src/include/extractor.h | 15 | ||||
-rw-r--r-- | src/main/extractor_metatypes.c | 21 | ||||
-rw-r--r-- | src/plugins/id3v23_extractor.c | 330 | ||||
-rw-r--r-- | src/plugins/id3v24_extractor.c | 432 | ||||
-rw-r--r-- | src/plugins/id3v2_extractor.c | 2 | ||||
-rw-r--r-- | src/plugins/odf_extractor.c | 2 | ||||
-rw-r--r-- | src/plugins/ole2_extractor.c | 4 | ||||
-rw-r--r-- | src/plugins/png_extractor.c | 2 | ||||
-rw-r--r-- | src/plugins/qt_extractor.c | 8 |
10 files changed, 609 insertions, 209 deletions
@@ -17,12 +17,12 @@ Core: | |||
17 | 17 | ||
18 | Incomplete code (missing features): | 18 | Incomplete code (missing features): |
19 | * RIFF (idx1 attribute) | 19 | * RIFF (idx1 attribute) |
20 | * IDv2{3,4} (some attributes, make testcases in test/id3v2/ work) | ||
21 | * StarOffice sdw (some attributes, see doc/) | 20 | * StarOffice sdw (some attributes, see doc/) |
22 | * man pages (interpret sections for authors, brief description) | 21 | * man pages (interpret sections for authors, brief description) |
23 | * pdf: full-text extraction! | 22 | * pdf: full-text extraction! |
24 | * EXIV2 | 23 | * EXIV2 |
25 | * ELF: 64-bit support, lists of architectures, OSes, etc. are incomplete | 24 | * ELF: 64-bit support, lists of architectures, OSes, etc. are incomplete |
25 | * ID3v2x: unsynchronization support, (de)compression support, footer support (24) | ||
26 | 26 | ||
27 | Desirable missing formats: | 27 | Desirable missing formats: |
28 | * mbox / various e-mail formats | 28 | * mbox / various e-mail formats |
diff --git a/src/include/extractor.h b/src/include/extractor.h index 9c4ae60..ffebd5c 100644 --- a/src/include/extractor.h +++ b/src/include/extractor.h | |||
@@ -280,8 +280,8 @@ enum EXTRACTOR_MetaType | |||
280 | EXTRACTOR_METATYPE_PRODUCT_VERSION = 148, | 280 | EXTRACTOR_METATYPE_PRODUCT_VERSION = 148, |
281 | EXTRACTOR_METATYPE_CONTRIBUTOR_NAME = 149, | 281 | EXTRACTOR_METATYPE_CONTRIBUTOR_NAME = 149, |
282 | EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 150, | 282 | EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 150, |
283 | EXTRACTOR_METATYPE_TV_NETWORK_NAME = 151, | 283 | EXTRACTOR_METATYPE_NETWORK_NAME = 151, |
284 | EXTRACTOR_METATYPE_TV_SHOW_NAME = 152, | 284 | EXTRACTOR_METATYPE_SHOW_NAME = 152, |
285 | EXTRACTOR_METATYPE_CHAPTER_NAME = 153, | 285 | EXTRACTOR_METATYPE_CHAPTER_NAME = 153, |
286 | EXTRACTOR_METATYPE_SONG_COUNT = 154, | 286 | EXTRACTOR_METATYPE_SONG_COUNT = 154, |
287 | EXTRACTOR_METATYPE_STARTING_SONG = 155, | 287 | EXTRACTOR_METATYPE_STARTING_SONG = 155, |
@@ -295,13 +295,17 @@ enum EXTRACTOR_MetaType | |||
295 | EXTRACTOR_METATYPE_ORIGINAL_ARTIST = 163, | 295 | EXTRACTOR_METATYPE_ORIGINAL_ARTIST = 163, |
296 | EXTRACTOR_METATYPE_ORIGINAL_WRITER = 164, | 296 | EXTRACTOR_METATYPE_ORIGINAL_WRITER = 164, |
297 | EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR = 165, | 297 | EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR = 165, |
298 | EXTRACTOR_METATYPE_LYRICS = 166, | 298 | EXTRACTOR_METATYPE_ORIGINAL_PERFORMER = 166, |
299 | EXTRACTOR_METATYPE_POPULARITY_METER = 167, | 299 | EXTRACTOR_METATYPE_LYRICS = 167, |
300 | EXTRACTOR_METATYPE_POPULARITY_METER = 168, | ||
301 | EXTRACTOR_METATYPE_LICENSEE = 169, | ||
302 | EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 170, | ||
303 | EXTRACTOR_METATYPE_MOOD = 171, | ||
304 | EXTRACTOR_METATYPE_SUBTITLE = 172, | ||
300 | 305 | ||
301 | /* fixme: used up to here! */ | 306 | /* fixme: used up to here! */ |
302 | 307 | ||
303 | EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, | 308 | EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, |
304 | EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123, | ||
305 | 309 | ||
306 | 310 | ||
307 | EXTRACTOR_METATYPE_SCALE = 108, | 311 | EXTRACTOR_METATYPE_SCALE = 108, |
@@ -342,7 +346,6 @@ enum EXTRACTOR_MetaType | |||
342 | EXTRACTOR_METATYPE_FULL_NAME = 113, | 346 | EXTRACTOR_METATYPE_FULL_NAME = 113, |
343 | EXTRACTOR_METATYPE_LINK = 116, | 347 | EXTRACTOR_METATYPE_LINK = 116, |
344 | EXTRACTOR_METATYPE_TIME = 122, | 348 | EXTRACTOR_METATYPE_TIME = 122, |
345 | EXTRACTOR_METATYPE_MOOD = 124, | ||
346 | EXTRACTOR_METATYPE_TELEVISION_SYSTEM = 126, | 349 | EXTRACTOR_METATYPE_TELEVISION_SYSTEM = 126, |
347 | EXTRACTOR_METATYPE_HARDWARE_DEPENDENCY = 129, | 350 | EXTRACTOR_METATYPE_HARDWARE_DEPENDENCY = 129, |
348 | EXTRACTOR_METATYPE_RIPPER = 130, | 351 | EXTRACTOR_METATYPE_RIPPER = 130, |
diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c index b98a395..c97fc3e 100644 --- a/src/main/extractor_metatypes.c +++ b/src/main/extractor_metatypes.c | |||
@@ -369,10 +369,10 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { | |||
369 | /* 150 */ | 369 | /* 150 */ |
370 | { gettext_noop ("movie director"), | 370 | { gettext_noop ("movie director"), |
371 | gettext_noop ("name of the director") }, | 371 | gettext_noop ("name of the director") }, |
372 | { gettext_noop ("TV network"), | 372 | { gettext_noop ("network"), |
373 | gettext_noop ("name of the broadcasting TV network") }, | 373 | gettext_noop ("name of the broadcasting network or station") }, |
374 | { gettext_noop ("TV show"), | 374 | { gettext_noop ("show"), |
375 | gettext_noop ("name of the TV show") }, | 375 | gettext_noop ("name of the show") }, |
376 | { gettext_noop ("chapter name"), | 376 | { gettext_noop ("chapter name"), |
377 | gettext_noop ("name of the chapter") }, | 377 | gettext_noop ("name of the chapter") }, |
378 | { gettext_noop ("song count"), | 378 | { gettext_noop ("song count"), |
@@ -402,10 +402,23 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { | |||
402 | /* 165 */ | 402 | /* 165 */ |
403 | { gettext_noop ("original release year"), | 403 | { gettext_noop ("original release year"), |
404 | gettext_noop ("year of the original release") }, | 404 | gettext_noop ("year of the original release") }, |
405 | { gettext_noop ("original performer"), | ||
406 | gettext_noop ("name of the original performer") }, | ||
405 | { gettext_noop ("lyrics"), | 407 | { gettext_noop ("lyrics"), |
406 | gettext_noop ("lyrics of the song or text description of vocal activities") }, | 408 | gettext_noop ("lyrics of the song or text description of vocal activities") }, |
407 | { gettext_noop ("popularity"), | 409 | { gettext_noop ("popularity"), |
408 | gettext_noop ("information about the file's popularity") }, | 410 | gettext_noop ("information about the file's popularity") }, |
411 | { gettext_noop ("licensee"), | ||
412 | gettext_noop ("name of the owner or licensee of the file") }, | ||
413 | /* 170 */ | ||
414 | { gettext_noop ("musician credit list"), | ||
415 | gettext_noop ("names of contributing musicians") }, | ||
416 | { gettext_noop ("mood"), | ||
417 | gettext_noop ("keywords reflecting the mood of the piece") }, | ||
418 | { gettext_noop ("subtitle"), | ||
419 | gettext_noop ("subtitle of this part") }, | ||
420 | { gettext_noop (""), | ||
421 | gettext_noop ("") }, | ||
409 | { gettext_noop (""), | 422 | { gettext_noop (""), |
410 | gettext_noop ("") }, | 423 | gettext_noop ("") }, |
411 | #if 0 | 424 | #if 0 |
diff --git a/src/plugins/id3v23_extractor.c b/src/plugins/id3v23_extractor.c index 71553c2..4ab8116 100644 --- a/src/plugins/id3v23_extractor.c +++ b/src/plugins/id3v23_extractor.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | This file is part of libextractor. | 2 | This file is part of libextractor. |
3 | (C) 2002, 2003, 2004, 2006, 2007 Vidyut Samanta and Christian Grothoff | 3 | (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff |
4 | 4 | ||
5 | libextractor is free software; you can redistribute it and/or modify | 5 | libextractor is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published | 6 | it under the terms of the GNU General Public License as published |
@@ -35,54 +35,83 @@ | |||
35 | 35 | ||
36 | #include "convert.h" | 36 | #include "convert.h" |
37 | 37 | ||
38 | enum Id3v23Fmt | ||
39 | { | ||
40 | T, /* simple, 0-terminated string, prefixed by encoding */ | ||
41 | U, /* 0-terminated ASCII string, no encoding */ | ||
42 | UL, /* unsync'ed lyrics */ | ||
43 | SL, /* sync'ed lyrics */ | ||
44 | L, /* string with language prefix */ | ||
45 | I /* image */ | ||
46 | }; | ||
47 | |||
38 | typedef struct | 48 | typedef struct |
39 | { | 49 | { |
40 | const char *text; | 50 | const char *text; |
41 | enum EXTRACTOR_MetaType type; | 51 | enum EXTRACTOR_MetaType type; |
52 | enum Id3v23Fmt fmt; | ||
42 | } Matches; | 53 | } Matches; |
43 | 54 | ||
44 | static Matches tmap[] = { | 55 | static Matches tmap[] = { |
45 | {"COMM", EXTRACTOR_METATYPE_COMMENT}, | 56 | {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, |
46 | {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 57 | {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, |
47 | {"LINK", EXTRACTOR_METATYPE_LINK}, | 58 | {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, |
48 | {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, | 59 | {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
49 | {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, | 60 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, |
50 | {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, | 61 | /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, */ |
51 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, | 62 | /* TDLY */ |
52 | {"TDAT", EXTRACTOR_METATYPE_DATE}, | 63 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, |
53 | {"TCON", EXTRACTOR_METATYPE_CONTENT_TYPE}, | 64 | {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, |
54 | {"TIT1", EXTRACTOR_METATYPE_GENRE}, | 65 | {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, |
55 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, | 66 | /* TIME */ |
56 | {"TEXT", EXTRACTOR_METATYPE_LYRICS}, | 67 | {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, |
57 | {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 68 | {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, |
58 | {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 69 | {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
59 | {"TOWN", EXTRACTOR_METATYPE_OWNER}, | 70 | /* TKEY */ |
60 | {"TPE1", EXTRACTOR_METATYPE_ARTIST}, | 71 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, |
61 | {"TPE2", EXTRACTOR_METATYPE_ARTIST}, | 72 | {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ |
62 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, | 73 | {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, |
63 | {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, | 74 | {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, |
64 | {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, | 75 | {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, |
65 | {"TCOM", EXTRACTOR_METATYPE_CREATOR}, | 76 | {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, |
66 | {"TIME", EXTRACTOR_METATYPE_TIME}, | 77 | {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, |
67 | {"TOFN", EXTRACTOR_METATYPE_FILENAME}, | 78 | {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, |
68 | {"TOPE", EXTRACTOR_METATYPE_ARTIST}, | 79 | {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, |
69 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, | 80 | {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, |
70 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, | 81 | {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, |
71 | {"TRSC", EXTRACTOR_METATYPE_ISRC}, | 82 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, |
72 | {"TRSN", EXTRACTOR_METATYPE_SOURCE}, | 83 | {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, |
73 | {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, | 84 | {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, |
74 | {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, | 85 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T}, |
75 | {"TOAL", EXTRACTOR_METATYPE_ALBUM}, | 86 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, |
76 | {"TALB", EXTRACTOR_METATYPE_ALBUM}, | 87 | /* TRDA */ |
77 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, | 88 | {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T}, |
78 | {"TYER", EXTRACTOR_METATYPE_YEAR}, | 89 | /* TRSO */ |
79 | {"TLEN", EXTRACTOR_METATYPE_DURATION}, | 90 | {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, |
80 | {"TIT2", EXTRACTOR_METATYPE_TITLE}, | 91 | {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, |
81 | {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, | 92 | /* TSSE */ |
82 | {"WCOM", EXTRACTOR_METATYPE_RELEASE}, | 93 | {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, |
83 | {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, | 94 | {"WCOM", EXTRACTOR_METATYPE_URL, U}, |
84 | {"", EXTRACTOR_METATYPE_KEYWORDS}, | 95 | {"WCOP", EXTRACTOR_METATYPE_URL, U}, |
85 | {NULL, 0} | 96 | {"WOAF", EXTRACTOR_METATYPE_URL, U}, |
97 | {"WOAS", EXTRACTOR_METATYPE_URL, U}, | ||
98 | {"WORS", EXTRACTOR_METATYPE_URL, U}, | ||
99 | {"WPAY", EXTRACTOR_METATYPE_URL, U}, | ||
100 | {"WPUB", EXTRACTOR_METATYPE_URL, U}, | ||
101 | {"WXXX", EXTRACTOR_METATYPE_URL, T}, | ||
102 | {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, | ||
103 | /* ... */ | ||
104 | {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, | ||
105 | {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, | ||
106 | {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, | ||
107 | /* ... */ | ||
108 | {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, | ||
109 | /* ... */ | ||
110 | {"LINK", EXTRACTOR_METATYPE_URL, U}, | ||
111 | /* ... */ | ||
112 | {"USER", EXTRACTOR_METATYPE_LICENSE, T}, | ||
113 | /* ... */ | ||
114 | {NULL, 0, T} | ||
86 | }; | 115 | }; |
87 | 116 | ||
88 | 117 | ||
@@ -104,6 +133,9 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
104 | uint32_t csize; | 133 | uint32_t csize; |
105 | int i; | 134 | int i; |
106 | uint16_t flags; | 135 | uint16_t flags; |
136 | char *mime; | ||
137 | enum EXTRACTOR_MetaType type; | ||
138 | size_t off; | ||
107 | 139 | ||
108 | if ((size < 16) || | 140 | if ((size < 16) || |
109 | (data[0] != 0x49) || | 141 | (data[0] != 0x49) || |
@@ -111,12 +143,16 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
111 | (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) | 143 | (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) |
112 | return 0; | 144 | return 0; |
113 | unsync = (data[5] & 0x80) > 0; | 145 | unsync = (data[5] & 0x80) > 0; |
146 | if (unsync) | ||
147 | return 0; /* not supported */ | ||
114 | extendedHdr = (data[5] & 0x40) > 0; | 148 | extendedHdr = (data[5] & 0x40) > 0; |
115 | experimental = (data[5] & 0x20) > 0; | 149 | experimental = (data[5] & 0x20) > 0; |
150 | if (experimental) | ||
151 | return 0; | ||
116 | tsize = (((data[6] & 0x7F) << 21) | | 152 | tsize = (((data[6] & 0x7F) << 21) | |
117 | ((data[7] & 0x7F) << 14) | | 153 | ((data[7] & 0x7F) << 14) | |
118 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); | 154 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); |
119 | if ((tsize + 10 > size) || (experimental)) | 155 | if (tsize + 10 > size) |
120 | return 0; | 156 | return 0; |
121 | pos = 10; | 157 | pos = 10; |
122 | padding = 0; | 158 | padding = 0; |
@@ -142,7 +178,8 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
142 | csize = | 178 | csize = |
143 | (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + | 179 | (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + |
144 | data[pos + 7]; | 180 | data[pos + 7]; |
145 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) | 181 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || |
182 | (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos)) | ||
146 | break; | 183 | break; |
147 | flags = (data[pos + 8] << 8) + data[pos + 9]; | 184 | flags = (data[pos + 8] << 8) + data[pos + 9]; |
148 | if (((flags & 0x80) > 0) /* compressed, not yet supported */ || | 185 | if (((flags & 0x80) > 0) /* compressed, not yet supported */ || |
@@ -163,32 +200,191 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
163 | pos++; | 200 | pos++; |
164 | csize--; | 201 | csize--; |
165 | } | 202 | } |
166 | csize--; | 203 | switch (tmap[i].fmt) |
167 | /* this byte describes the encoding | 204 | { |
168 | try to convert strings to UTF-8 | 205 | case T: |
169 | if it fails, then forget it */ | 206 | /* this byte describes the encoding |
170 | switch (data[pos + 10]) | 207 | try to convert strings to UTF-8 |
171 | { | 208 | if it fails, then forget it */ |
172 | case 0x00: | 209 | switch (data[pos + 10]) |
173 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 210 | { |
174 | csize, "ISO-8859-1"); | 211 | case 0x00: |
175 | break; | 212 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
176 | case 0x01: | 213 | csize - 1, "ISO-8859-1"); |
177 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 214 | break; |
178 | csize, "UCS-2"); | 215 | case 0x01: |
179 | break; | 216 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
180 | default: | 217 | csize - 1, "UCS-2"); |
181 | /* bad encoding byte, | 218 | break; |
182 | try to convert from iso-8859-1 */ | 219 | default: |
183 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 220 | /* bad encoding byte, |
184 | csize, "ISO-8859-1"); | 221 | try to convert from iso-8859-1 */ |
185 | break; | 222 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
186 | } | 223 | csize - 1, "ISO-8859-1"); |
187 | pos++; | 224 | break; |
225 | } | ||
226 | break; | ||
227 | case U: | ||
228 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], | ||
229 | csize, "ISO-8859-1"); | ||
230 | break; | ||
231 | case UL: | ||
232 | if (csize < 6) | ||
233 | return 0; /* malformed */ | ||
234 | /* find end of description */ | ||
235 | off = 14; | ||
236 | while ( (off < size) && | ||
237 | (off - pos < csize) && | ||
238 | (data[pos + off] == '\0') ) | ||
239 | off++; | ||
240 | if ( (off >= csize) || | ||
241 | (data[pos+off] != '\0') ) | ||
242 | return 0; /* malformed */ | ||
243 | off++; | ||
244 | switch (data[pos + 10]) | ||
245 | { | ||
246 | case 0x00: | ||
247 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
248 | csize - off, "ISO-8859-1"); | ||
249 | break; | ||
250 | case 0x01: | ||
251 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
252 | csize - off, "UCS-2"); | ||
253 | break; | ||
254 | default: | ||
255 | /* bad encoding byte, | ||
256 | try to convert from iso-8859-1 */ | ||
257 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
258 | csize - off, "ISO-8859-1"); | ||
259 | break; | ||
260 | } | ||
261 | break; | ||
262 | case SL: | ||
263 | if (csize < 7) | ||
264 | return 0; /* malformed */ | ||
265 | /* find end of description */ | ||
266 | switch (data[pos + 10]) | ||
267 | { | ||
268 | case 0x00: | ||
269 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
270 | csize - 6, "ISO-8859-1"); | ||
271 | break; | ||
272 | case 0x01: | ||
273 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
274 | csize - 6, "UCS-2"); | ||
275 | break; | ||
276 | default: | ||
277 | /* bad encoding byte, | ||
278 | try to convert from iso-8859-1 */ | ||
279 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
280 | csize - 6, "ISO-8859-1"); | ||
281 | break; | ||
282 | } | ||
283 | break; | ||
284 | case L: | ||
285 | if (csize < 5) | ||
286 | return 0; /* malformed */ | ||
287 | /* find end of description */ | ||
288 | switch (data[pos + 10]) | ||
289 | { | ||
290 | case 0x00: | ||
291 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
292 | csize - 4, "ISO-8859-1"); | ||
293 | break; | ||
294 | case 0x01: | ||
295 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
296 | csize - 4, "UCS-2"); | ||
297 | break; | ||
298 | default: | ||
299 | /* bad encoding byte, | ||
300 | try to convert from iso-8859-1 */ | ||
301 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
302 | csize - 4, "ISO-8859-1"); | ||
303 | break; | ||
304 | } | ||
305 | break; | ||
306 | case I: | ||
307 | if (csize < 2) | ||
308 | return 0; /* malformed */ | ||
309 | /* find end of mime type */ | ||
310 | off = 11; | ||
311 | while ( (off < size) && | ||
312 | (off - pos < csize) && | ||
313 | (data[pos + off] == '\0') ) | ||
314 | off++; | ||
315 | if ( (off >= csize) || | ||
316 | (data[pos+off] != '\0') ) | ||
317 | return 0; /* malformed */ | ||
318 | off++; | ||
319 | mime = strdup ((const char*) &data[pos + 11]); | ||
320 | |||
321 | switch (data[pos+off]) | ||
322 | { | ||
323 | case 0x03: | ||
324 | case 0x04: | ||
325 | type = EXTRACTOR_METATYPE_COVER_PICTURE; | ||
326 | break; | ||
327 | case 0x07: | ||
328 | case 0x08: | ||
329 | case 0x09: | ||
330 | case 0x0A: | ||
331 | case 0x0B: | ||
332 | case 0x0C: | ||
333 | type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; | ||
334 | break; | ||
335 | case 0x0D: | ||
336 | case 0x0E: | ||
337 | case 0x0F: | ||
338 | type = EXTRACTOR_METATYPE_EVENT_PICTURE; | ||
339 | break; | ||
340 | case 0x14: | ||
341 | type = EXTRACTOR_METATYPE_LOGO; | ||
342 | type = EXTRACTOR_METATYPE_LOGO; | ||
343 | break; | ||
344 | default: | ||
345 | type = EXTRACTOR_METATYPE_PICTURE; | ||
346 | break; | ||
347 | } | ||
348 | off++; | ||
349 | |||
350 | /* find end of description */ | ||
351 | while ( (off < size) && | ||
352 | (off - pos < csize) && | ||
353 | (data[pos + off] == '\0') ) | ||
354 | off++; | ||
355 | if ( (off >= csize) || | ||
356 | (data[pos+off] != '\0') ) | ||
357 | return 0; /* malformed */ | ||
358 | off++; | ||
359 | if (0 == strcasecmp ("-->", | ||
360 | mime)) | ||
361 | { | ||
362 | /* not supported */ | ||
363 | } | ||
364 | else | ||
365 | { | ||
366 | if (0 != proc (proc_cls, | ||
367 | "id3v23", | ||
368 | type, | ||
369 | EXTRACTOR_METAFORMAT_BINARY, | ||
370 | mime, | ||
371 | (const char*) &data[pos + off], | ||
372 | csize + 6 - off)) | ||
373 | { | ||
374 | free (mime); | ||
375 | return 1; | ||
376 | } | ||
377 | } | ||
378 | free (mime); | ||
379 | word = NULL; | ||
380 | break; | ||
381 | default: | ||
382 | return 0; | ||
383 | } | ||
188 | if ((word != NULL) && (strlen (word) > 0)) | 384 | if ((word != NULL) && (strlen (word) > 0)) |
189 | { | 385 | { |
190 | if (0 != proc (proc_cls, | 386 | if (0 != proc (proc_cls, |
191 | "id3v2", | 387 | "id3v23", |
192 | tmap[i].type, | 388 | tmap[i].type, |
193 | EXTRACTOR_METAFORMAT_UTF8, | 389 | EXTRACTOR_METAFORMAT_UTF8, |
194 | "text/plain", | 390 | "text/plain", |
diff --git a/src/plugins/id3v24_extractor.c b/src/plugins/id3v24_extractor.c index ec11e4a..acc76af 100644 --- a/src/plugins/id3v24_extractor.c +++ b/src/plugins/id3v24_extractor.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | This file is part of libextractor. | 2 | This file is part of libextractor. |
3 | (C) 2002, 2003, 2004, 2006, 2009 Vidyut Samanta and Christian Grothoff | 3 | (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff |
4 | 4 | ||
5 | libextractor is free software; you can redistribute it and/or modify | 5 | libextractor is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published | 6 | it under the terms of the GNU General Public License as published |
@@ -18,7 +18,6 @@ | |||
18 | Boston, MA 02111-1307, USA. | 18 | Boston, MA 02111-1307, USA. |
19 | 19 | ||
20 | */ | 20 | */ |
21 | |||
22 | #define DEBUG_EXTRACT_ID3v24 0 | 21 | #define DEBUG_EXTRACT_ID3v24 0 |
23 | 22 | ||
24 | #include "platform.h" | 23 | #include "platform.h" |
@@ -33,72 +32,98 @@ | |||
33 | #ifndef MINGW | 32 | #ifndef MINGW |
34 | #include <sys/mman.h> | 33 | #include <sys/mman.h> |
35 | #endif | 34 | #endif |
35 | |||
36 | #include "convert.h" | 36 | #include "convert.h" |
37 | 37 | ||
38 | 38 | enum Id3v24Fmt | |
39 | static struct EXTRACTOR_Keywords * | 39 | { |
40 | addKeyword (EXTRACTOR_KeywordList * oldhead, | 40 | T, /* simple, 0-terminated string, prefixed by encoding */ |
41 | char *phrase, EXTRACTOR_KeywordType type) | 41 | U, /* 0-terminated ASCII string, no encoding */ |
42 | { | 42 | UL, /* unsync'ed lyrics */ |
43 | EXTRACTOR_KeywordList *keyword; | 43 | SL, /* sync'ed lyrics */ |
44 | 44 | L, /* string with language prefix */ | |
45 | keyword = malloc (sizeof (EXTRACTOR_KeywordList)); | 45 | I /* image */ |
46 | keyword->next = oldhead; | 46 | }; |
47 | keyword->keyword = phrase; | ||
48 | keyword->keywordType = type; | ||
49 | return keyword; | ||
50 | } | ||
51 | 47 | ||
52 | typedef struct | 48 | typedef struct |
53 | { | 49 | { |
54 | char *text; | 50 | const char *text; |
55 | enum EXTRACTOR_MetaType type; | 51 | enum EXTRACTOR_MetaType type; |
52 | enum Id3v24Fmt fmt; | ||
56 | } Matches; | 53 | } Matches; |
57 | 54 | ||
58 | static Matches tmap[] = { | 55 | static Matches tmap[] = { |
59 | {"COMM", EXTRACTOR_METATYPE_COMMENT}, | 56 | {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, |
60 | {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 57 | {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, |
61 | {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 58 | {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, |
62 | {"TMOO", EXTRACTOR_METATYPE_MOOD}, | 59 | {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
63 | {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST}, | 60 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, |
64 | {"LINK", EXTRACTOR_METATYPE_LINK}, | 61 | /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, deprecated in 24 */ |
65 | {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, | 62 | /* TDLY */ |
66 | {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, | 63 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, |
67 | {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, | 64 | {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, |
68 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, | 65 | {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, |
69 | {"TDRC", EXTRACTOR_METATYPE_DATE}, | 66 | /* TIME, deprecated in 24 */ |
70 | {"TCON", EXTRACTOR_METATYPE_GENRE}, | 67 | {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, |
71 | {"TIT1", EXTRACTOR_METATYPE_GENRE}, | 68 | {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, |
72 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, | 69 | {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
73 | {"TEXT", EXTRACTOR_METATYPE_LYRICS}, | 70 | /* TKEY */ |
74 | {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 71 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, |
75 | {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 72 | {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ |
76 | {"TOWN", EXTRACTOR_METATYPE_OWNER}, | 73 | {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, |
77 | {"TPE1", EXTRACTOR_METATYPE_ARTIST}, | 74 | {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, |
78 | {"TPE2", EXTRACTOR_METATYPE_ARTIST}, | 75 | {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, |
79 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, | 76 | {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, |
80 | {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, | 77 | {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, |
81 | {"TIME", EXTRACTOR_METATYPE_TIME}, | 78 | /* {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, deprecated in 24 */ |
82 | {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, | 79 | {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, |
83 | {"TCOM", EXTRACTOR_METATYPE_CREATOR}, | 80 | {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, |
84 | {"TOFN", EXTRACTOR_METATYPE_FILENAME}, | 81 | {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, |
85 | {"TOPE", EXTRACTOR_METATYPE_ARTIST}, | 82 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, |
86 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, | 83 | {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, |
87 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, | 84 | {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, |
88 | {"TRSC", EXTRACTOR_METATYPE_ISRC}, | 85 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T}, |
89 | {"TRSN", EXTRACTOR_METATYPE_SOURCE}, | 86 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, |
90 | {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, | 87 | /* TRDA, deprecated in 24 */ |
91 | {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, | 88 | {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T}, |
92 | {"TYER", EXTRACTOR_METATYPE_YEAR}, | 89 | /* TRSO */ |
93 | {"TOAL", EXTRACTOR_METATYPE_ALBUM}, | 90 | /* {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, deprecated in 24 */ |
94 | {"TALB", EXTRACTOR_METATYPE_ALBUM}, | 91 | {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, |
95 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, | 92 | /* TSSE */ |
96 | {"TIT2", EXTRACTOR_METATYPE_TITLE}, | 93 | /* {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, deprecated in 24 */ |
97 | {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, | 94 | {"WCOM", EXTRACTOR_METATYPE_URL, U}, |
98 | {"WCOM", EXTRACTOR_METATYPE_RELEASE}, | 95 | {"WCOP", EXTRACTOR_METATYPE_URL, U}, |
99 | {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, | 96 | {"WOAF", EXTRACTOR_METATYPE_URL, U}, |
100 | {"", EXTRACTOR_METATYPE_KEYWORDS}, | 97 | {"WOAS", EXTRACTOR_METATYPE_URL, U}, |
101 | {NULL, 0} | 98 | {"WORS", EXTRACTOR_METATYPE_URL, U}, |
99 | {"WPAY", EXTRACTOR_METATYPE_URL, U}, | ||
100 | {"WPUB", EXTRACTOR_METATYPE_URL, U}, | ||
101 | {"WXXX", EXTRACTOR_METATYPE_URL, T}, | ||
102 | /* {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, deprecated in 24 */ | ||
103 | /* ... */ | ||
104 | {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, | ||
105 | {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, | ||
106 | {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, | ||
107 | /* ... */ | ||
108 | {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, | ||
109 | /* ... */ | ||
110 | {"LINK", EXTRACTOR_METATYPE_URL, U}, | ||
111 | /* ... */ | ||
112 | {"USER", EXTRACTOR_METATYPE_LICENSE, T}, | ||
113 | /* ... */ | ||
114 | /* new frames in 24 */ | ||
115 | /* ASPI, EQU2, RVA2, SEEK, SIGN, TDEN */ | ||
116 | {"TDOR", EXTRACTOR_METATYPE_PUBLICATION_DATE, T}, | ||
117 | /* TDRC, TDRL, TDTG */ | ||
118 | {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, | ||
119 | {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST, T}, | ||
120 | {"TMOO", EXTRACTOR_METATYPE_MOOD, T}, | ||
121 | {"TPRO", EXTRACTOR_METATYPE_COPYRIGHT, T}, | ||
122 | {"TSOA", EXTRACTOR_METATYPE_ALBUM, T}, | ||
123 | {"TSOP", EXTRACTOR_METATYPE_PERFORMER, T}, | ||
124 | {"TSOT", EXTRACTOR_METATYPE_TITLE, T}, | ||
125 | {"TSST", EXTRACTOR_METATYPE_SUBTITLE, T}, | ||
126 | {NULL, 0, T} | ||
102 | }; | 127 | }; |
103 | 128 | ||
104 | 129 | ||
@@ -114,54 +139,60 @@ EXTRACTOR_id3v24_extract (const unsigned char *data, | |||
114 | int extendedHdr; | 139 | int extendedHdr; |
115 | int experimental; | 140 | int experimental; |
116 | int footer; | 141 | int footer; |
117 | unsigned int tsize; | 142 | uint32_t tsize; |
118 | unsigned int pos; | 143 | uint32_t pos; |
119 | unsigned int ehdrSize; | 144 | uint32_t ehdrSize; |
120 | unsigned int padding; | 145 | uint32_t padding; |
146 | uint32_t csize; | ||
147 | int i; | ||
148 | uint16_t flags; | ||
149 | char *mime; | ||
150 | enum EXTRACTOR_MetaType type; | ||
151 | size_t off; | ||
121 | 152 | ||
122 | if ((size < 16) || | 153 | if ((size < 16) || |
123 | (data[0] != 0x49) || | 154 | (data[0] != 0x49) || |
124 | (data[1] != 0x44) || | 155 | (data[1] != 0x44) || |
125 | (data[2] != 0x33) || (data[3] != 0x04) || (data[4] != 0x00)) | 156 | (data[2] != 0x33) || (data[3] != 0x04) || (data[4] != 0x00)) |
126 | return prev; | 157 | return 0; |
127 | unsync = (data[5] & 0x80) > 0; | 158 | unsync = (data[5] & 0x80) > 0; |
159 | if (unsync) | ||
160 | return 0; /* not supported */ | ||
128 | extendedHdr = (data[5] & 0x40) > 0; | 161 | extendedHdr = (data[5] & 0x40) > 0; |
129 | experimental = (data[5] & 0x20) > 0; | 162 | experimental = (data[5] & 0x20) > 0; |
163 | if (experimental) | ||
164 | return 0; | ||
130 | footer = (data[5] & 0x10) > 0; | 165 | footer = (data[5] & 0x10) > 0; |
131 | tsize = (((data[6] & 0x7F) << 21) | | 166 | tsize = (((data[6] & 0x7F) << 21) | |
132 | ((data[7] & 0x7F) << 14) | | 167 | ((data[7] & 0x7F) << 14) | |
133 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); | 168 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); |
134 | if ((tsize + 10 > size) || (experimental)) | 169 | if (tsize + 10 > size) |
135 | return prev; | 170 | return 0; |
136 | pos = 10; | 171 | pos = 10; |
137 | padding = 0; | 172 | padding = 0; |
138 | if (extendedHdr) | 173 | if (extendedHdr) |
139 | { | 174 | { |
140 | ehdrSize = (((data[10] & 0x7F) << 21) | | 175 | ehdrSize = (((data[10] & 0x7F) << 21) | |
141 | ((data[11] & 0x7F) << 14) | | 176 | ((data[11] & 0x7F) << 14) | |
142 | ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0)); | 177 | ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0)); |
143 | pos += ehdrSize; | 178 | pos += 4 + ehdrSize; |
179 | if (ehdrSize > tsize) | ||
180 | return 0; | ||
144 | } | 181 | } |
145 | |||
146 | |||
147 | while (pos < tsize) | 182 | while (pos < tsize) |
148 | { | 183 | { |
149 | size_t csize; | ||
150 | int i; | ||
151 | unsigned short flags; | ||
152 | |||
153 | if (pos + 10 > tsize) | 184 | if (pos + 10 > tsize) |
154 | return prev; | 185 | return 0; |
155 | 186 | csize = | |
156 | csize = (((data[pos + 4] & 0x7F) << 21) | | 187 | (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + |
157 | ((data[pos + 5] & 0x7F) << 14) | | 188 | data[pos + 7]; |
158 | ((data[pos + 6] & 0x7F) << 7) | ((data[pos + 7] & 0x7F) << 0)); | 189 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || |
159 | 190 | (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos)) | |
160 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) | ||
161 | break; | 191 | break; |
162 | flags = (data[pos + 8] << 8) + data[pos + 9]; | 192 | flags = (data[pos + 8] << 8) + data[pos + 9]; |
163 | if (((flags & 0x80) > 0) /* compressed, not yet supported */ || | 193 | if (((flags & 0x08) > 0) /* compressed, not yet supported */ || |
164 | ((flags & 0x40) > 0) /* encrypted, not supported */ ) | 194 | ((flags & 0x04) > 0) /* encrypted, not supported */ || |
195 | ((flags & 0x02) > 0) /* unsynchronized, not supported */ ) | ||
165 | { | 196 | { |
166 | pos += 10 + csize; | 197 | pos += 10 + csize; |
167 | continue; | 198 | continue; |
@@ -172,59 +203,216 @@ EXTRACTOR_id3v24_extract (const unsigned char *data, | |||
172 | if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4)) | 203 | if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4)) |
173 | { | 204 | { |
174 | char *word; | 205 | char *word; |
175 | if ((flags & 0x20) > 0) | 206 | if ((flags & 0x40) > 0) |
176 | { | 207 | { |
177 | /* "group" identifier, skip a byte */ | 208 | /* "group" identifier, skip a byte */ |
178 | pos++; | 209 | pos++; |
179 | csize--; | 210 | csize--; |
180 | } | 211 | } |
181 | 212 | ||
182 | /* this byte describes the encoding | 213 | switch (tmap[i].fmt) |
183 | try to convert strings to UTF-8 | 214 | { |
184 | if it fails, then forget it */ | 215 | case T: |
185 | csize--; | 216 | /* this byte describes the encoding |
186 | switch (data[pos + 10]) | 217 | try to convert strings to UTF-8 |
187 | { | 218 | if it fails, then forget it */ |
188 | case 0x00: | 219 | switch (data[pos + 10]) |
189 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 220 | { |
190 | csize, "ISO-8859-1"); | 221 | case 0x00: |
191 | break; | 222 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
192 | case 0x01: | 223 | csize - 1, "ISO-8859-1"); |
193 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 224 | break; |
194 | csize, "UTF-16"); | 225 | case 0x01: |
195 | break; | 226 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
196 | case 0x02: | 227 | csize - 1, "UCS-2"); |
197 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 228 | break; |
198 | csize, "UTF-16BE"); | 229 | default: |
199 | break; | 230 | /* bad encoding byte, |
200 | case 0x03: | 231 | try to convert from iso-8859-1 */ |
201 | word = malloc (csize + 1); | 232 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
202 | memcpy (word, &data[pos + 11], csize); | 233 | csize - 1, "ISO-8859-1"); |
203 | word[csize] = '\0'; | 234 | break; |
204 | break; | 235 | } |
205 | default: | 236 | break; |
206 | /* bad encoding byte, | 237 | case U: |
207 | try to convert from iso-8859-1 */ | 238 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], |
208 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 239 | csize, "ISO-8859-1"); |
209 | csize, "ISO-8859-1"); | 240 | break; |
210 | break; | 241 | case UL: |
211 | } | 242 | if (csize < 6) |
212 | pos++; | 243 | return 0; /* malformed */ |
244 | /* find end of description */ | ||
245 | off = 14; | ||
246 | while ( (off < size) && | ||
247 | (off - pos < csize) && | ||
248 | (data[pos + off] == '\0') ) | ||
249 | off++; | ||
250 | if ( (off >= csize) || | ||
251 | (data[pos+off] != '\0') ) | ||
252 | return 0; /* malformed */ | ||
253 | off++; | ||
254 | switch (data[pos + 10]) | ||
255 | { | ||
256 | case 0x00: | ||
257 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
258 | csize - off, "ISO-8859-1"); | ||
259 | break; | ||
260 | case 0x01: | ||
261 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
262 | csize - off, "UCS-2"); | ||
263 | break; | ||
264 | default: | ||
265 | /* bad encoding byte, | ||
266 | try to convert from iso-8859-1 */ | ||
267 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
268 | csize - off, "ISO-8859-1"); | ||
269 | break; | ||
270 | } | ||
271 | break; | ||
272 | case SL: | ||
273 | if (csize < 7) | ||
274 | return 0; /* malformed */ | ||
275 | /* find end of description */ | ||
276 | switch (data[pos + 10]) | ||
277 | { | ||
278 | case 0x00: | ||
279 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
280 | csize - 6, "ISO-8859-1"); | ||
281 | break; | ||
282 | case 0x01: | ||
283 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
284 | csize - 6, "UCS-2"); | ||
285 | break; | ||
286 | default: | ||
287 | /* bad encoding byte, | ||
288 | try to convert from iso-8859-1 */ | ||
289 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
290 | csize - 6, "ISO-8859-1"); | ||
291 | break; | ||
292 | } | ||
293 | break; | ||
294 | case L: | ||
295 | if (csize < 5) | ||
296 | return 0; /* malformed */ | ||
297 | /* find end of description */ | ||
298 | switch (data[pos + 10]) | ||
299 | { | ||
300 | case 0x00: | ||
301 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
302 | csize - 4, "ISO-8859-1"); | ||
303 | break; | ||
304 | case 0x01: | ||
305 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
306 | csize - 4, "UCS-2"); | ||
307 | break; | ||
308 | default: | ||
309 | /* bad encoding byte, | ||
310 | try to convert from iso-8859-1 */ | ||
311 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
312 | csize - 4, "ISO-8859-1"); | ||
313 | break; | ||
314 | } | ||
315 | break; | ||
316 | case I: | ||
317 | if (csize < 2) | ||
318 | return 0; /* malformed */ | ||
319 | /* find end of mime type */ | ||
320 | off = 11; | ||
321 | while ( (off < size) && | ||
322 | (off - pos < csize) && | ||
323 | (data[pos + off] == '\0') ) | ||
324 | off++; | ||
325 | if ( (off >= csize) || | ||
326 | (data[pos+off] != '\0') ) | ||
327 | return 0; /* malformed */ | ||
328 | off++; | ||
329 | mime = strdup ((const char*) &data[pos + 11]); | ||
330 | |||
331 | switch (data[pos+off]) | ||
332 | { | ||
333 | case 0x03: | ||
334 | case 0x04: | ||
335 | type = EXTRACTOR_METATYPE_COVER_PICTURE; | ||
336 | break; | ||
337 | case 0x07: | ||
338 | case 0x08: | ||
339 | case 0x09: | ||
340 | case 0x0A: | ||
341 | case 0x0B: | ||
342 | case 0x0C: | ||
343 | type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; | ||
344 | break; | ||
345 | case 0x0D: | ||
346 | case 0x0E: | ||
347 | case 0x0F: | ||
348 | type = EXTRACTOR_METATYPE_EVENT_PICTURE; | ||
349 | break; | ||
350 | case 0x14: | ||
351 | type = EXTRACTOR_METATYPE_LOGO; | ||
352 | type = EXTRACTOR_METATYPE_LOGO; | ||
353 | break; | ||
354 | default: | ||
355 | type = EXTRACTOR_METATYPE_PICTURE; | ||
356 | break; | ||
357 | } | ||
358 | off++; | ||
359 | |||
360 | /* find end of description */ | ||
361 | while ( (off < size) && | ||
362 | (off - pos < csize) && | ||
363 | (data[pos + off] == '\0') ) | ||
364 | off++; | ||
365 | if ( (off >= csize) || | ||
366 | (data[pos+off] != '\0') ) | ||
367 | return 0; /* malformed */ | ||
368 | off++; | ||
369 | if (0 == strcasecmp ("-->", | ||
370 | mime)) | ||
371 | { | ||
372 | /* not supported */ | ||
373 | } | ||
374 | else | ||
375 | { | ||
376 | if (0 != proc (proc_cls, | ||
377 | "id3v24", | ||
378 | type, | ||
379 | EXTRACTOR_METAFORMAT_BINARY, | ||
380 | mime, | ||
381 | (const char*) &data[pos + off], | ||
382 | csize + 6 - off)) | ||
383 | { | ||
384 | free (mime); | ||
385 | return 1; | ||
386 | } | ||
387 | } | ||
388 | free (mime); | ||
389 | word = NULL; | ||
390 | break; | ||
391 | default: | ||
392 | return 0; | ||
393 | } | ||
213 | if ((word != NULL) && (strlen (word) > 0)) | 394 | if ((word != NULL) && (strlen (word) > 0)) |
214 | { | 395 | { |
215 | prev = addKeyword (prev, word, tmap[i].type); | 396 | if (0 != proc (proc_cls, |
216 | } | 397 | "id3v24", |
217 | else | 398 | tmap[i].type, |
218 | { | 399 | EXTRACTOR_METAFORMAT_UTF8, |
219 | free (word); | 400 | "text/plain", |
401 | word, | ||
402 | strlen(word)+1)) | ||
403 | { | ||
404 | free (word); | ||
405 | return 1; | ||
406 | } | ||
220 | } | 407 | } |
408 | free (word); | ||
221 | break; | 409 | break; |
222 | } | 410 | } |
223 | i++; | 411 | i++; |
224 | } | 412 | } |
225 | pos += 10 + csize; | 413 | pos += 10 + csize; |
226 | } | 414 | } |
227 | return prev; | 415 | return 0; |
228 | } | 416 | } |
229 | 417 | ||
230 | /* end of id3v24_extractor.c */ | 418 | /* end of id3v24_extractor.c */ |
diff --git a/src/plugins/id3v2_extractor.c b/src/plugins/id3v2_extractor.c index da5c6d9..f0227f1 100644 --- a/src/plugins/id3v2_extractor.c +++ b/src/plugins/id3v2_extractor.c | |||
@@ -113,7 +113,7 @@ static Matches tmap[] = { | |||
113 | /* skipping CRM */ | 113 | /* skipping CRM */ |
114 | /* skipping CRA */ | 114 | /* skipping CRA */ |
115 | /* {"LNK", EXTRACTOR_METATYPE_URL, XXX}, */ | 115 | /* {"LNK", EXTRACTOR_METATYPE_URL, XXX}, */ |
116 | {NULL, 0}, | 116 | {NULL, 0, T}, |
117 | }; | 117 | }; |
118 | 118 | ||
119 | 119 | ||
diff --git a/src/plugins/odf_extractor.c b/src/plugins/odf_extractor.c index 479f5ca..9a8c827 100644 --- a/src/plugins/odf_extractor.c +++ b/src/plugins/odf_extractor.c | |||
@@ -44,7 +44,7 @@ static Matches tmap[] = { | |||
44 | { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, | 44 | { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, |
45 | { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, | 45 | { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, |
46 | { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, | 46 | { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, |
47 | { "dc:language", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, | 47 | { "dc:language", EXTRACTOR_METATYPE_LANGUAGE }, |
48 | { "dc:title", EXTRACTOR_METATYPE_TITLE }, | 48 | { "dc:title", EXTRACTOR_METATYPE_TITLE }, |
49 | { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, | 49 | { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, |
50 | { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, | 50 | { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, |
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c index 1b8d5cf..5f6cd01 100644 --- a/src/plugins/ole2_extractor.c +++ b/src/plugins/ole2_extractor.c | |||
@@ -107,7 +107,7 @@ static Matches tmap[] = { | |||
107 | { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | 107 | { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, |
108 | { "meta:template", EXTRACTOR_METATYPE_TEMPLATE }, | 108 | { "meta:template", EXTRACTOR_METATYPE_TEMPLATE }, |
109 | { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, | 109 | { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, |
110 | /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, */ | 110 | /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE }, */ |
111 | /* { "gsf:security", EXTRACTOR_SECURITY }, */ | 111 | /* { "gsf:security", EXTRACTOR_SECURITY }, */ |
112 | /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ | 112 | /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ |
113 | /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */ | 113 | /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */ |
@@ -544,7 +544,7 @@ EXTRACTOR_ole2_extract (const char *data, | |||
544 | if ( (lang != NULL) && (ret == 0) ) | 544 | if ( (lang != NULL) && (ret == 0) ) |
545 | ret = addKeyword(proc, proc_cls, | 545 | ret = addKeyword(proc, proc_cls, |
546 | lang, | 546 | lang, |
547 | EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE); | 547 | EXTRACTOR_METATYPE_LANGUAGE); |
548 | if (lcb >= 6) { | 548 | if (lcb >= 6) { |
549 | for (i=0;i<gsf_infile_num_children(infile);i++) { | 549 | for (i=0;i<gsf_infile_num_children(infile);i++) { |
550 | if (ret != 0) | 550 | if (ret != 0) |
diff --git a/src/plugins/png_extractor.c b/src/plugins/png_extractor.c index 5fb2b9d..bdba922 100644 --- a/src/plugins/png_extractor.c +++ b/src/plugins/png_extractor.c | |||
@@ -141,7 +141,7 @@ processiTXt (const char *data, | |||
141 | language = &data[pos]; | 141 | language = &data[pos]; |
142 | ret = 0; | 142 | ret = 0; |
143 | if (stnlen (language, length - pos) > 0) | 143 | if (stnlen (language, length - pos) > 0) |
144 | ADDF (EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE, | 144 | ADDF (EXTRACTOR_METATYPE_LANGUAGE, |
145 | stndup (language, length - pos)); | 145 | stndup (language, length - pos)); |
146 | pos += stnlen (language, length - pos) + 1; | 146 | pos += stnlen (language, length - pos) + 1; |
147 | if (pos + 1 >= length) | 147 | if (pos + 1 >= length) |
diff --git a/src/plugins/qt_extractor.c b/src/plugins/qt_extractor.c index 3abd543..123e0f5 100644 --- a/src/plugins/qt_extractor.c +++ b/src/plugins/qt_extractor.c | |||
@@ -395,9 +395,9 @@ static ITTagConversionEntry it_to_extr_table[] = { | |||
395 | {"catg", EXTRACTOR_METATYPE_SECTION}, | 395 | {"catg", EXTRACTOR_METATYPE_SECTION}, |
396 | {"keyw", EXTRACTOR_METATYPE_KEYWORDS}, | 396 | {"keyw", EXTRACTOR_METATYPE_KEYWORDS}, |
397 | {"desc", EXTRACTOR_METATYPE_DESCRIPTION}, | 397 | {"desc", EXTRACTOR_METATYPE_DESCRIPTION}, |
398 | {"tvnn", EXTRACTOR_METATYPE_TV_NETWORK_NAME}, | 398 | {"tvnn", EXTRACTOR_METATYPE_NETWORK_NAME}, |
399 | {"tvsh", EXTRACTOR_METATYPE_TV_SHOW_NAME}, | 399 | {"tvsh", EXTRACTOR_METATYPE_SHOW_NAME}, |
400 | {"tven", EXTRACTOR_METATYPE_TV_NETWORK_NAME}, | 400 | {"tven", EXTRACTOR_METATYPE_NETWORK_NAME}, |
401 | {NULL, EXTRACTOR_METATYPE_RESERVED} | 401 | {NULL, EXTRACTOR_METATYPE_RESERVED} |
402 | }; | 402 | }; |
403 | 403 | ||
@@ -850,7 +850,7 @@ processTextTag (const char *input, | |||
850 | lang = ntohs (txt->language); | 850 | lang = ntohs (txt->language); |
851 | if (lang >= sizeof (languages) / sizeof (char *)) | 851 | if (lang >= sizeof (languages) / sizeof (char *)) |
852 | return 0; /* invalid */ | 852 | return 0; /* invalid */ |
853 | addKeyword (EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE, languages[lang], ec); | 853 | addKeyword (EXTRACTOR_METATYPE_LANGUAGE, languages[lang], ec); |
854 | 854 | ||
855 | meta = malloc (len + 1); | 855 | meta = malloc (len + 1); |
856 | memcpy (meta, &txt[1], len); | 856 | memcpy (meta, &txt[1], len); |