aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/id3v23_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/id3v23_extractor.c')
-rw-r--r--src/plugins/id3v23_extractor.c330
1 files changed, 263 insertions, 67 deletions
diff --git a/src/plugins/id3v23_extractor.c b/src/plugins/id3v23_extractor.c
index 71553c2..4ab8116 100644
--- a/src/plugins/id3v23_extractor.c
+++ b/src/plugins/id3v23_extractor.c
@@ -1,6 +1,6 @@
1/* 1/*
2 This file is part of libextractor. 2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2006, 2007 Vidyut Samanta and Christian Grothoff 3 (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff
4 4
5 libextractor is free software; you can redistribute it and/or modify 5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published 6 it under the terms of the GNU General Public License as published
@@ -35,54 +35,83 @@
35 35
36#include "convert.h" 36#include "convert.h"
37 37
38enum Id3v23Fmt
39 {
40 T, /* simple, 0-terminated string, prefixed by encoding */
41 U, /* 0-terminated ASCII string, no encoding */
42 UL, /* unsync'ed lyrics */
43 SL, /* sync'ed lyrics */
44 L, /* string with language prefix */
45 I /* image */
46 };
47
38typedef struct 48typedef struct
39{ 49{
40 const char *text; 50 const char *text;
41 enum EXTRACTOR_MetaType type; 51 enum EXTRACTOR_MetaType type;
52 enum Id3v23Fmt fmt;
42} Matches; 53} Matches;
43 54
44static Matches tmap[] = { 55static Matches tmap[] = {
45 {"COMM", EXTRACTOR_METATYPE_COMMENT}, 56 {"TALB", EXTRACTOR_METATYPE_ALBUM, T},
46 {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, 57 {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T},
47 {"LINK", EXTRACTOR_METATYPE_LINK}, 58 {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T},
48 {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, 59 {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T},
49 {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, 60 {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T},
50 {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, 61 /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, */
51 {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, 62 /* TDLY */
52 {"TDAT", EXTRACTOR_METATYPE_DATE}, 63 {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T},
53 {"TCON", EXTRACTOR_METATYPE_CONTENT_TYPE}, 64 {"TEXT", EXTRACTOR_METATYPE_WRITER, T},
54 {"TIT1", EXTRACTOR_METATYPE_GENRE}, 65 {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T},
55 {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, 66 /* TIME */
56 {"TEXT", EXTRACTOR_METATYPE_LYRICS}, 67 {"TIT1", EXTRACTOR_METATYPE_SECTION, T},
57 {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, 68 {"TIT2", EXTRACTOR_METATYPE_TITLE, T},
58 {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, 69 {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T},
59 {"TOWN", EXTRACTOR_METATYPE_OWNER}, 70 /* TKEY */
60 {"TPE1", EXTRACTOR_METATYPE_ARTIST}, 71 {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T},
61 {"TPE2", EXTRACTOR_METATYPE_ARTIST}, 72 {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */
62 {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, 73 {"TMED", EXTRACTOR_METATYPE_SOURCE, T},
63 {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, 74 {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T},
64 {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, 75 {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T},
65 {"TCOM", EXTRACTOR_METATYPE_CREATOR}, 76 {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T},
66 {"TIME", EXTRACTOR_METATYPE_TIME}, 77 {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T},
67 {"TOFN", EXTRACTOR_METATYPE_FILENAME}, 78 {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T},
68 {"TOPE", EXTRACTOR_METATYPE_ARTIST}, 79 {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T},
69 {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, 80 {"TPE1", EXTRACTOR_METATYPE_ARTIST, T},
70 {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, 81 {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T},
71 {"TRSC", EXTRACTOR_METATYPE_ISRC}, 82 {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T},
72 {"TRSN", EXTRACTOR_METATYPE_SOURCE}, 83 {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T},
73 {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, 84 {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T},
74 {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, 85 {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T},
75 {"TOAL", EXTRACTOR_METATYPE_ALBUM}, 86 {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T},
76 {"TALB", EXTRACTOR_METATYPE_ALBUM}, 87 /* TRDA */
77 {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, 88 {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T},
78 {"TYER", EXTRACTOR_METATYPE_YEAR}, 89 /* TRSO */
79 {"TLEN", EXTRACTOR_METATYPE_DURATION}, 90 {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T},
80 {"TIT2", EXTRACTOR_METATYPE_TITLE}, 91 {"TSRC", EXTRACTOR_METATYPE_ISRC, T},
81 {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, 92 /* TSSE */
82 {"WCOM", EXTRACTOR_METATYPE_RELEASE}, 93 {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T},
83 {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, 94 {"WCOM", EXTRACTOR_METATYPE_URL, U},
84 {"", EXTRACTOR_METATYPE_KEYWORDS}, 95 {"WCOP", EXTRACTOR_METATYPE_URL, U},
85 {NULL, 0} 96 {"WOAF", EXTRACTOR_METATYPE_URL, U},
97 {"WOAS", EXTRACTOR_METATYPE_URL, U},
98 {"WORS", EXTRACTOR_METATYPE_URL, U},
99 {"WPAY", EXTRACTOR_METATYPE_URL, U},
100 {"WPUB", EXTRACTOR_METATYPE_URL, U},
101 {"WXXX", EXTRACTOR_METATYPE_URL, T},
102 {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T},
103 /* ... */
104 {"USLT", EXTRACTOR_METATYPE_LYRICS, UL },
105 {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL },
106 {"COMM", EXTRACTOR_METATYPE_COMMENT, L},
107 /* ... */
108 {"APIC", EXTRACTOR_METATYPE_PICTURE, I},
109 /* ... */
110 {"LINK", EXTRACTOR_METATYPE_URL, U},
111 /* ... */
112 {"USER", EXTRACTOR_METATYPE_LICENSE, T},
113 /* ... */
114 {NULL, 0, T}
86}; 115};
87 116
88 117
@@ -104,6 +133,9 @@ EXTRACTOR_id3v23_extract (const unsigned char *data,
104 uint32_t csize; 133 uint32_t csize;
105 int i; 134 int i;
106 uint16_t flags; 135 uint16_t flags;
136 char *mime;
137 enum EXTRACTOR_MetaType type;
138 size_t off;
107 139
108 if ((size < 16) || 140 if ((size < 16) ||
109 (data[0] != 0x49) || 141 (data[0] != 0x49) ||
@@ -111,12 +143,16 @@ EXTRACTOR_id3v23_extract (const unsigned char *data,
111 (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) 143 (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00))
112 return 0; 144 return 0;
113 unsync = (data[5] & 0x80) > 0; 145 unsync = (data[5] & 0x80) > 0;
146 if (unsync)
147 return 0; /* not supported */
114 extendedHdr = (data[5] & 0x40) > 0; 148 extendedHdr = (data[5] & 0x40) > 0;
115 experimental = (data[5] & 0x20) > 0; 149 experimental = (data[5] & 0x20) > 0;
150 if (experimental)
151 return 0;
116 tsize = (((data[6] & 0x7F) << 21) | 152 tsize = (((data[6] & 0x7F) << 21) |
117 ((data[7] & 0x7F) << 14) | 153 ((data[7] & 0x7F) << 14) |
118 ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); 154 ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0));
119 if ((tsize + 10 > size) || (experimental)) 155 if (tsize + 10 > size)
120 return 0; 156 return 0;
121 pos = 10; 157 pos = 10;
122 padding = 0; 158 padding = 0;
@@ -142,7 +178,8 @@ EXTRACTOR_id3v23_extract (const unsigned char *data,
142 csize = 178 csize =
143 (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + 179 (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) +
144 data[pos + 7]; 180 data[pos + 7];
145 if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) 181 if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) ||
182 (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos))
146 break; 183 break;
147 flags = (data[pos + 8] << 8) + data[pos + 9]; 184 flags = (data[pos + 8] << 8) + data[pos + 9];
148 if (((flags & 0x80) > 0) /* compressed, not yet supported */ || 185 if (((flags & 0x80) > 0) /* compressed, not yet supported */ ||
@@ -163,32 +200,191 @@ EXTRACTOR_id3v23_extract (const unsigned char *data,
163 pos++; 200 pos++;
164 csize--; 201 csize--;
165 } 202 }
166 csize--; 203 switch (tmap[i].fmt)
167 /* this byte describes the encoding 204 {
168 try to convert strings to UTF-8 205 case T:
169 if it fails, then forget it */ 206 /* this byte describes the encoding
170 switch (data[pos + 10]) 207 try to convert strings to UTF-8
171 { 208 if it fails, then forget it */
172 case 0x00: 209 switch (data[pos + 10])
173 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 210 {
174 csize, "ISO-8859-1"); 211 case 0x00:
175 break; 212 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
176 case 0x01: 213 csize - 1, "ISO-8859-1");
177 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 214 break;
178 csize, "UCS-2"); 215 case 0x01:
179 break; 216 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
180 default: 217 csize - 1, "UCS-2");
181 /* bad encoding byte, 218 break;
182 try to convert from iso-8859-1 */ 219 default:
183 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 220 /* bad encoding byte,
184 csize, "ISO-8859-1"); 221 try to convert from iso-8859-1 */
185 break; 222 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
186 } 223 csize - 1, "ISO-8859-1");
187 pos++; 224 break;
225 }
226 break;
227 case U:
228 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10],
229 csize, "ISO-8859-1");
230 break;
231 case UL:
232 if (csize < 6)
233 return 0; /* malformed */
234 /* find end of description */
235 off = 14;
236 while ( (off < size) &&
237 (off - pos < csize) &&
238 (data[pos + off] == '\0') )
239 off++;
240 if ( (off >= csize) ||
241 (data[pos+off] != '\0') )
242 return 0; /* malformed */
243 off++;
244 switch (data[pos + 10])
245 {
246 case 0x00:
247 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
248 csize - off, "ISO-8859-1");
249 break;
250 case 0x01:
251 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
252 csize - off, "UCS-2");
253 break;
254 default:
255 /* bad encoding byte,
256 try to convert from iso-8859-1 */
257 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
258 csize - off, "ISO-8859-1");
259 break;
260 }
261 break;
262 case SL:
263 if (csize < 7)
264 return 0; /* malformed */
265 /* find end of description */
266 switch (data[pos + 10])
267 {
268 case 0x00:
269 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
270 csize - 6, "ISO-8859-1");
271 break;
272 case 0x01:
273 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
274 csize - 6, "UCS-2");
275 break;
276 default:
277 /* bad encoding byte,
278 try to convert from iso-8859-1 */
279 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
280 csize - 6, "ISO-8859-1");
281 break;
282 }
283 break;
284 case L:
285 if (csize < 5)
286 return 0; /* malformed */
287 /* find end of description */
288 switch (data[pos + 10])
289 {
290 case 0x00:
291 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
292 csize - 4, "ISO-8859-1");
293 break;
294 case 0x01:
295 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
296 csize - 4, "UCS-2");
297 break;
298 default:
299 /* bad encoding byte,
300 try to convert from iso-8859-1 */
301 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
302 csize - 4, "ISO-8859-1");
303 break;
304 }
305 break;
306 case I:
307 if (csize < 2)
308 return 0; /* malformed */
309 /* find end of mime type */
310 off = 11;
311 while ( (off < size) &&
312 (off - pos < csize) &&
313 (data[pos + off] == '\0') )
314 off++;
315 if ( (off >= csize) ||
316 (data[pos+off] != '\0') )
317 return 0; /* malformed */
318 off++;
319 mime = strdup ((const char*) &data[pos + 11]);
320
321 switch (data[pos+off])
322 {
323 case 0x03:
324 case 0x04:
325 type = EXTRACTOR_METATYPE_COVER_PICTURE;
326 break;
327 case 0x07:
328 case 0x08:
329 case 0x09:
330 case 0x0A:
331 case 0x0B:
332 case 0x0C:
333 type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE;
334 break;
335 case 0x0D:
336 case 0x0E:
337 case 0x0F:
338 type = EXTRACTOR_METATYPE_EVENT_PICTURE;
339 break;
340 case 0x14:
341 type = EXTRACTOR_METATYPE_LOGO;
342 type = EXTRACTOR_METATYPE_LOGO;
343 break;
344 default:
345 type = EXTRACTOR_METATYPE_PICTURE;
346 break;
347 }
348 off++;
349
350 /* find end of description */
351 while ( (off < size) &&
352 (off - pos < csize) &&
353 (data[pos + off] == '\0') )
354 off++;
355 if ( (off >= csize) ||
356 (data[pos+off] != '\0') )
357 return 0; /* malformed */
358 off++;
359 if (0 == strcasecmp ("-->",
360 mime))
361 {
362 /* not supported */
363 }
364 else
365 {
366 if (0 != proc (proc_cls,
367 "id3v23",
368 type,
369 EXTRACTOR_METAFORMAT_BINARY,
370 mime,
371 (const char*) &data[pos + off],
372 csize + 6 - off))
373 {
374 free (mime);
375 return 1;
376 }
377 }
378 free (mime);
379 word = NULL;
380 break;
381 default:
382 return 0;
383 }
188 if ((word != NULL) && (strlen (word) > 0)) 384 if ((word != NULL) && (strlen (word) > 0))
189 { 385 {
190 if (0 != proc (proc_cls, 386 if (0 != proc (proc_cls,
191 "id3v2", 387 "id3v23",
192 tmap[i].type, 388 tmap[i].type,
193 EXTRACTOR_METAFORMAT_UTF8, 389 EXTRACTOR_METAFORMAT_UTF8,
194 "text/plain", 390 "text/plain",