diff options
Diffstat (limited to 'src/plugins/id3v23_extractor.c')
-rw-r--r-- | src/plugins/id3v23_extractor.c | 330 |
1 files changed, 263 insertions, 67 deletions
diff --git a/src/plugins/id3v23_extractor.c b/src/plugins/id3v23_extractor.c index 71553c2..4ab8116 100644 --- a/src/plugins/id3v23_extractor.c +++ b/src/plugins/id3v23_extractor.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | This file is part of libextractor. | 2 | This file is part of libextractor. |
3 | (C) 2002, 2003, 2004, 2006, 2007 Vidyut Samanta and Christian Grothoff | 3 | (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff |
4 | 4 | ||
5 | libextractor is free software; you can redistribute it and/or modify | 5 | libextractor is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published | 6 | it under the terms of the GNU General Public License as published |
@@ -35,54 +35,83 @@ | |||
35 | 35 | ||
36 | #include "convert.h" | 36 | #include "convert.h" |
37 | 37 | ||
38 | enum Id3v23Fmt | ||
39 | { | ||
40 | T, /* simple, 0-terminated string, prefixed by encoding */ | ||
41 | U, /* 0-terminated ASCII string, no encoding */ | ||
42 | UL, /* unsync'ed lyrics */ | ||
43 | SL, /* sync'ed lyrics */ | ||
44 | L, /* string with language prefix */ | ||
45 | I /* image */ | ||
46 | }; | ||
47 | |||
38 | typedef struct | 48 | typedef struct |
39 | { | 49 | { |
40 | const char *text; | 50 | const char *text; |
41 | enum EXTRACTOR_MetaType type; | 51 | enum EXTRACTOR_MetaType type; |
52 | enum Id3v23Fmt fmt; | ||
42 | } Matches; | 53 | } Matches; |
43 | 54 | ||
44 | static Matches tmap[] = { | 55 | static Matches tmap[] = { |
45 | {"COMM", EXTRACTOR_METATYPE_COMMENT}, | 56 | {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, |
46 | {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 57 | {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, |
47 | {"LINK", EXTRACTOR_METATYPE_LINK}, | 58 | {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, |
48 | {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, | 59 | {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
49 | {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, | 60 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, |
50 | {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, | 61 | /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, */ |
51 | {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, | 62 | /* TDLY */ |
52 | {"TDAT", EXTRACTOR_METATYPE_DATE}, | 63 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, |
53 | {"TCON", EXTRACTOR_METATYPE_CONTENT_TYPE}, | 64 | {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, |
54 | {"TIT1", EXTRACTOR_METATYPE_GENRE}, | 65 | {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, |
55 | {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, | 66 | /* TIME */ |
56 | {"TEXT", EXTRACTOR_METATYPE_LYRICS}, | 67 | {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, |
57 | {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 68 | {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, |
58 | {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, | 69 | {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, |
59 | {"TOWN", EXTRACTOR_METATYPE_OWNER}, | 70 | /* TKEY */ |
60 | {"TPE1", EXTRACTOR_METATYPE_ARTIST}, | 71 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, |
61 | {"TPE2", EXTRACTOR_METATYPE_ARTIST}, | 72 | {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ |
62 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, | 73 | {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, |
63 | {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, | 74 | {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, |
64 | {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, | 75 | {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, |
65 | {"TCOM", EXTRACTOR_METATYPE_CREATOR}, | 76 | {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, |
66 | {"TIME", EXTRACTOR_METATYPE_TIME}, | 77 | {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, |
67 | {"TOFN", EXTRACTOR_METATYPE_FILENAME}, | 78 | {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, |
68 | {"TOPE", EXTRACTOR_METATYPE_ARTIST}, | 79 | {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, |
69 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, | 80 | {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, |
70 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, | 81 | {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, |
71 | {"TRSC", EXTRACTOR_METATYPE_ISRC}, | 82 | {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, |
72 | {"TRSN", EXTRACTOR_METATYPE_SOURCE}, | 83 | {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, |
73 | {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, | 84 | {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, |
74 | {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, | 85 | {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T}, |
75 | {"TOAL", EXTRACTOR_METATYPE_ALBUM}, | 86 | {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, |
76 | {"TALB", EXTRACTOR_METATYPE_ALBUM}, | 87 | /* TRDA */ |
77 | {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, | 88 | {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T}, |
78 | {"TYER", EXTRACTOR_METATYPE_YEAR}, | 89 | /* TRSO */ |
79 | {"TLEN", EXTRACTOR_METATYPE_DURATION}, | 90 | {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, |
80 | {"TIT2", EXTRACTOR_METATYPE_TITLE}, | 91 | {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, |
81 | {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, | 92 | /* TSSE */ |
82 | {"WCOM", EXTRACTOR_METATYPE_RELEASE}, | 93 | {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, |
83 | {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, | 94 | {"WCOM", EXTRACTOR_METATYPE_URL, U}, |
84 | {"", EXTRACTOR_METATYPE_KEYWORDS}, | 95 | {"WCOP", EXTRACTOR_METATYPE_URL, U}, |
85 | {NULL, 0} | 96 | {"WOAF", EXTRACTOR_METATYPE_URL, U}, |
97 | {"WOAS", EXTRACTOR_METATYPE_URL, U}, | ||
98 | {"WORS", EXTRACTOR_METATYPE_URL, U}, | ||
99 | {"WPAY", EXTRACTOR_METATYPE_URL, U}, | ||
100 | {"WPUB", EXTRACTOR_METATYPE_URL, U}, | ||
101 | {"WXXX", EXTRACTOR_METATYPE_URL, T}, | ||
102 | {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, | ||
103 | /* ... */ | ||
104 | {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, | ||
105 | {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, | ||
106 | {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, | ||
107 | /* ... */ | ||
108 | {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, | ||
109 | /* ... */ | ||
110 | {"LINK", EXTRACTOR_METATYPE_URL, U}, | ||
111 | /* ... */ | ||
112 | {"USER", EXTRACTOR_METATYPE_LICENSE, T}, | ||
113 | /* ... */ | ||
114 | {NULL, 0, T} | ||
86 | }; | 115 | }; |
87 | 116 | ||
88 | 117 | ||
@@ -104,6 +133,9 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
104 | uint32_t csize; | 133 | uint32_t csize; |
105 | int i; | 134 | int i; |
106 | uint16_t flags; | 135 | uint16_t flags; |
136 | char *mime; | ||
137 | enum EXTRACTOR_MetaType type; | ||
138 | size_t off; | ||
107 | 139 | ||
108 | if ((size < 16) || | 140 | if ((size < 16) || |
109 | (data[0] != 0x49) || | 141 | (data[0] != 0x49) || |
@@ -111,12 +143,16 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
111 | (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) | 143 | (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) |
112 | return 0; | 144 | return 0; |
113 | unsync = (data[5] & 0x80) > 0; | 145 | unsync = (data[5] & 0x80) > 0; |
146 | if (unsync) | ||
147 | return 0; /* not supported */ | ||
114 | extendedHdr = (data[5] & 0x40) > 0; | 148 | extendedHdr = (data[5] & 0x40) > 0; |
115 | experimental = (data[5] & 0x20) > 0; | 149 | experimental = (data[5] & 0x20) > 0; |
150 | if (experimental) | ||
151 | return 0; | ||
116 | tsize = (((data[6] & 0x7F) << 21) | | 152 | tsize = (((data[6] & 0x7F) << 21) | |
117 | ((data[7] & 0x7F) << 14) | | 153 | ((data[7] & 0x7F) << 14) | |
118 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); | 154 | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); |
119 | if ((tsize + 10 > size) || (experimental)) | 155 | if (tsize + 10 > size) |
120 | return 0; | 156 | return 0; |
121 | pos = 10; | 157 | pos = 10; |
122 | padding = 0; | 158 | padding = 0; |
@@ -142,7 +178,8 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
142 | csize = | 178 | csize = |
143 | (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + | 179 | (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + |
144 | data[pos + 7]; | 180 | data[pos + 7]; |
145 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) | 181 | if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || |
182 | (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos)) | ||
146 | break; | 183 | break; |
147 | flags = (data[pos + 8] << 8) + data[pos + 9]; | 184 | flags = (data[pos + 8] << 8) + data[pos + 9]; |
148 | if (((flags & 0x80) > 0) /* compressed, not yet supported */ || | 185 | if (((flags & 0x80) > 0) /* compressed, not yet supported */ || |
@@ -163,32 +200,191 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, | |||
163 | pos++; | 200 | pos++; |
164 | csize--; | 201 | csize--; |
165 | } | 202 | } |
166 | csize--; | 203 | switch (tmap[i].fmt) |
167 | /* this byte describes the encoding | 204 | { |
168 | try to convert strings to UTF-8 | 205 | case T: |
169 | if it fails, then forget it */ | 206 | /* this byte describes the encoding |
170 | switch (data[pos + 10]) | 207 | try to convert strings to UTF-8 |
171 | { | 208 | if it fails, then forget it */ |
172 | case 0x00: | 209 | switch (data[pos + 10]) |
173 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 210 | { |
174 | csize, "ISO-8859-1"); | 211 | case 0x00: |
175 | break; | 212 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
176 | case 0x01: | 213 | csize - 1, "ISO-8859-1"); |
177 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 214 | break; |
178 | csize, "UCS-2"); | 215 | case 0x01: |
179 | break; | 216 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
180 | default: | 217 | csize - 1, "UCS-2"); |
181 | /* bad encoding byte, | 218 | break; |
182 | try to convert from iso-8859-1 */ | 219 | default: |
183 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], | 220 | /* bad encoding byte, |
184 | csize, "ISO-8859-1"); | 221 | try to convert from iso-8859-1 */ |
185 | break; | 222 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], |
186 | } | 223 | csize - 1, "ISO-8859-1"); |
187 | pos++; | 224 | break; |
225 | } | ||
226 | break; | ||
227 | case U: | ||
228 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], | ||
229 | csize, "ISO-8859-1"); | ||
230 | break; | ||
231 | case UL: | ||
232 | if (csize < 6) | ||
233 | return 0; /* malformed */ | ||
234 | /* find end of description */ | ||
235 | off = 14; | ||
236 | while ( (off < size) && | ||
237 | (off - pos < csize) && | ||
238 | (data[pos + off] == '\0') ) | ||
239 | off++; | ||
240 | if ( (off >= csize) || | ||
241 | (data[pos+off] != '\0') ) | ||
242 | return 0; /* malformed */ | ||
243 | off++; | ||
244 | switch (data[pos + 10]) | ||
245 | { | ||
246 | case 0x00: | ||
247 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
248 | csize - off, "ISO-8859-1"); | ||
249 | break; | ||
250 | case 0x01: | ||
251 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
252 | csize - off, "UCS-2"); | ||
253 | break; | ||
254 | default: | ||
255 | /* bad encoding byte, | ||
256 | try to convert from iso-8859-1 */ | ||
257 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], | ||
258 | csize - off, "ISO-8859-1"); | ||
259 | break; | ||
260 | } | ||
261 | break; | ||
262 | case SL: | ||
263 | if (csize < 7) | ||
264 | return 0; /* malformed */ | ||
265 | /* find end of description */ | ||
266 | switch (data[pos + 10]) | ||
267 | { | ||
268 | case 0x00: | ||
269 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
270 | csize - 6, "ISO-8859-1"); | ||
271 | break; | ||
272 | case 0x01: | ||
273 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
274 | csize - 6, "UCS-2"); | ||
275 | break; | ||
276 | default: | ||
277 | /* bad encoding byte, | ||
278 | try to convert from iso-8859-1 */ | ||
279 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], | ||
280 | csize - 6, "ISO-8859-1"); | ||
281 | break; | ||
282 | } | ||
283 | break; | ||
284 | case L: | ||
285 | if (csize < 5) | ||
286 | return 0; /* malformed */ | ||
287 | /* find end of description */ | ||
288 | switch (data[pos + 10]) | ||
289 | { | ||
290 | case 0x00: | ||
291 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
292 | csize - 4, "ISO-8859-1"); | ||
293 | break; | ||
294 | case 0x01: | ||
295 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
296 | csize - 4, "UCS-2"); | ||
297 | break; | ||
298 | default: | ||
299 | /* bad encoding byte, | ||
300 | try to convert from iso-8859-1 */ | ||
301 | word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], | ||
302 | csize - 4, "ISO-8859-1"); | ||
303 | break; | ||
304 | } | ||
305 | break; | ||
306 | case I: | ||
307 | if (csize < 2) | ||
308 | return 0; /* malformed */ | ||
309 | /* find end of mime type */ | ||
310 | off = 11; | ||
311 | while ( (off < size) && | ||
312 | (off - pos < csize) && | ||
313 | (data[pos + off] == '\0') ) | ||
314 | off++; | ||
315 | if ( (off >= csize) || | ||
316 | (data[pos+off] != '\0') ) | ||
317 | return 0; /* malformed */ | ||
318 | off++; | ||
319 | mime = strdup ((const char*) &data[pos + 11]); | ||
320 | |||
321 | switch (data[pos+off]) | ||
322 | { | ||
323 | case 0x03: | ||
324 | case 0x04: | ||
325 | type = EXTRACTOR_METATYPE_COVER_PICTURE; | ||
326 | break; | ||
327 | case 0x07: | ||
328 | case 0x08: | ||
329 | case 0x09: | ||
330 | case 0x0A: | ||
331 | case 0x0B: | ||
332 | case 0x0C: | ||
333 | type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; | ||
334 | break; | ||
335 | case 0x0D: | ||
336 | case 0x0E: | ||
337 | case 0x0F: | ||
338 | type = EXTRACTOR_METATYPE_EVENT_PICTURE; | ||
339 | break; | ||
340 | case 0x14: | ||
341 | type = EXTRACTOR_METATYPE_LOGO; | ||
342 | type = EXTRACTOR_METATYPE_LOGO; | ||
343 | break; | ||
344 | default: | ||
345 | type = EXTRACTOR_METATYPE_PICTURE; | ||
346 | break; | ||
347 | } | ||
348 | off++; | ||
349 | |||
350 | /* find end of description */ | ||
351 | while ( (off < size) && | ||
352 | (off - pos < csize) && | ||
353 | (data[pos + off] == '\0') ) | ||
354 | off++; | ||
355 | if ( (off >= csize) || | ||
356 | (data[pos+off] != '\0') ) | ||
357 | return 0; /* malformed */ | ||
358 | off++; | ||
359 | if (0 == strcasecmp ("-->", | ||
360 | mime)) | ||
361 | { | ||
362 | /* not supported */ | ||
363 | } | ||
364 | else | ||
365 | { | ||
366 | if (0 != proc (proc_cls, | ||
367 | "id3v23", | ||
368 | type, | ||
369 | EXTRACTOR_METAFORMAT_BINARY, | ||
370 | mime, | ||
371 | (const char*) &data[pos + off], | ||
372 | csize + 6 - off)) | ||
373 | { | ||
374 | free (mime); | ||
375 | return 1; | ||
376 | } | ||
377 | } | ||
378 | free (mime); | ||
379 | word = NULL; | ||
380 | break; | ||
381 | default: | ||
382 | return 0; | ||
383 | } | ||
188 | if ((word != NULL) && (strlen (word) > 0)) | 384 | if ((word != NULL) && (strlen (word) > 0)) |
189 | { | 385 | { |
190 | if (0 != proc (proc_cls, | 386 | if (0 != proc (proc_cls, |
191 | "id3v2", | 387 | "id3v23", |
192 | tmap[i].type, | 388 | tmap[i].type, |
193 | EXTRACTOR_METAFORMAT_UTF8, | 389 | EXTRACTOR_METAFORMAT_UTF8, |
194 | "text/plain", | 390 | "text/plain", |