aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/id3v24_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/id3v24_extractor.c')
-rw-r--r--src/plugins/id3v24_extractor.c432
1 files changed, 310 insertions, 122 deletions
diff --git a/src/plugins/id3v24_extractor.c b/src/plugins/id3v24_extractor.c
index ec11e4a..acc76af 100644
--- a/src/plugins/id3v24_extractor.c
+++ b/src/plugins/id3v24_extractor.c
@@ -1,6 +1,6 @@
1/* 1/*
2 This file is part of libextractor. 2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2006, 2009 Vidyut Samanta and Christian Grothoff 3 (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff
4 4
5 libextractor is free software; you can redistribute it and/or modify 5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published 6 it under the terms of the GNU General Public License as published
@@ -18,7 +18,6 @@
18 Boston, MA 02111-1307, USA. 18 Boston, MA 02111-1307, USA.
19 19
20 */ 20 */
21
22#define DEBUG_EXTRACT_ID3v24 0 21#define DEBUG_EXTRACT_ID3v24 0
23 22
24#include "platform.h" 23#include "platform.h"
@@ -33,72 +32,98 @@
33#ifndef MINGW 32#ifndef MINGW
34#include <sys/mman.h> 33#include <sys/mman.h>
35#endif 34#endif
35
36#include "convert.h" 36#include "convert.h"
37 37
38 38enum Id3v24Fmt
39static struct EXTRACTOR_Keywords * 39 {
40addKeyword (EXTRACTOR_KeywordList * oldhead, 40 T, /* simple, 0-terminated string, prefixed by encoding */
41 char *phrase, EXTRACTOR_KeywordType type) 41 U, /* 0-terminated ASCII string, no encoding */
42{ 42 UL, /* unsync'ed lyrics */
43 EXTRACTOR_KeywordList *keyword; 43 SL, /* sync'ed lyrics */
44 44 L, /* string with language prefix */
45 keyword = malloc (sizeof (EXTRACTOR_KeywordList)); 45 I /* image */
46 keyword->next = oldhead; 46 };
47 keyword->keyword = phrase;
48 keyword->keywordType = type;
49 return keyword;
50}
51 47
52typedef struct 48typedef struct
53{ 49{
54 char *text; 50 const char *text;
55 enum EXTRACTOR_MetaType type; 51 enum EXTRACTOR_MetaType type;
52 enum Id3v24Fmt fmt;
56} Matches; 53} Matches;
57 54
58static Matches tmap[] = { 55static Matches tmap[] = {
59 {"COMM", EXTRACTOR_METATYPE_COMMENT}, 56 {"TALB", EXTRACTOR_METATYPE_ALBUM, T},
60 {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, 57 {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T},
61 {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR}, 58 {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T},
62 {"TMOO", EXTRACTOR_METATYPE_MOOD}, 59 {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T},
63 {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST}, 60 {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T},
64 {"LINK", EXTRACTOR_METATYPE_LINK}, 61 /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, deprecated in 24 */
65 {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, 62 /* TDLY */
66 {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, 63 {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T},
67 {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, 64 {"TEXT", EXTRACTOR_METATYPE_WRITER, T},
68 {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, 65 {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T},
69 {"TDRC", EXTRACTOR_METATYPE_DATE}, 66 /* TIME, deprecated in 24 */
70 {"TCON", EXTRACTOR_METATYPE_GENRE}, 67 {"TIT1", EXTRACTOR_METATYPE_SECTION, T},
71 {"TIT1", EXTRACTOR_METATYPE_GENRE}, 68 {"TIT2", EXTRACTOR_METATYPE_TITLE, T},
72 {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, 69 {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T},
73 {"TEXT", EXTRACTOR_METATYPE_LYRICS}, 70 /* TKEY */
74 {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, 71 {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T},
75 {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, 72 {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */
76 {"TOWN", EXTRACTOR_METATYPE_OWNER}, 73 {"TMED", EXTRACTOR_METATYPE_SOURCE, T},
77 {"TPE1", EXTRACTOR_METATYPE_ARTIST}, 74 {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T},
78 {"TPE2", EXTRACTOR_METATYPE_ARTIST}, 75 {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T},
79 {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, 76 {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T},
80 {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, 77 {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T},
81 {"TIME", EXTRACTOR_METATYPE_TIME}, 78 /* {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, deprecated in 24 */
82 {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, 79 {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T},
83 {"TCOM", EXTRACTOR_METATYPE_CREATOR}, 80 {"TPE1", EXTRACTOR_METATYPE_ARTIST, T},
84 {"TOFN", EXTRACTOR_METATYPE_FILENAME}, 81 {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T},
85 {"TOPE", EXTRACTOR_METATYPE_ARTIST}, 82 {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T},
86 {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, 83 {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T},
87 {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, 84 {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T},
88 {"TRSC", EXTRACTOR_METATYPE_ISRC}, 85 {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T},
89 {"TRSN", EXTRACTOR_METATYPE_SOURCE}, 86 {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T},
90 {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, 87 /* TRDA, deprecated in 24 */
91 {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, 88 {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T},
92 {"TYER", EXTRACTOR_METATYPE_YEAR}, 89 /* TRSO */
93 {"TOAL", EXTRACTOR_METATYPE_ALBUM}, 90 /* {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, deprecated in 24 */
94 {"TALB", EXTRACTOR_METATYPE_ALBUM}, 91 {"TSRC", EXTRACTOR_METATYPE_ISRC, T},
95 {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, 92 /* TSSE */
96 {"TIT2", EXTRACTOR_METATYPE_TITLE}, 93 /* {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, deprecated in 24 */
97 {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, 94 {"WCOM", EXTRACTOR_METATYPE_URL, U},
98 {"WCOM", EXTRACTOR_METATYPE_RELEASE}, 95 {"WCOP", EXTRACTOR_METATYPE_URL, U},
99 {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, 96 {"WOAF", EXTRACTOR_METATYPE_URL, U},
100 {"", EXTRACTOR_METATYPE_KEYWORDS}, 97 {"WOAS", EXTRACTOR_METATYPE_URL, U},
101 {NULL, 0} 98 {"WORS", EXTRACTOR_METATYPE_URL, U},
99 {"WPAY", EXTRACTOR_METATYPE_URL, U},
100 {"WPUB", EXTRACTOR_METATYPE_URL, U},
101 {"WXXX", EXTRACTOR_METATYPE_URL, T},
102 /* {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, deprecated in 24 */
103 /* ... */
104 {"USLT", EXTRACTOR_METATYPE_LYRICS, UL },
105 {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL },
106 {"COMM", EXTRACTOR_METATYPE_COMMENT, L},
107 /* ... */
108 {"APIC", EXTRACTOR_METATYPE_PICTURE, I},
109 /* ... */
110 {"LINK", EXTRACTOR_METATYPE_URL, U},
111 /* ... */
112 {"USER", EXTRACTOR_METATYPE_LICENSE, T},
113 /* ... */
114 /* new frames in 24 */
115 /* ASPI, EQU2, RVA2, SEEK, SIGN, TDEN */
116 {"TDOR", EXTRACTOR_METATYPE_PUBLICATION_DATE, T},
117 /* TDRC, TDRL, TDTG */
118 {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T},
119 {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST, T},
120 {"TMOO", EXTRACTOR_METATYPE_MOOD, T},
121 {"TPRO", EXTRACTOR_METATYPE_COPYRIGHT, T},
122 {"TSOA", EXTRACTOR_METATYPE_ALBUM, T},
123 {"TSOP", EXTRACTOR_METATYPE_PERFORMER, T},
124 {"TSOT", EXTRACTOR_METATYPE_TITLE, T},
125 {"TSST", EXTRACTOR_METATYPE_SUBTITLE, T},
126 {NULL, 0, T}
102}; 127};
103 128
104 129
@@ -114,54 +139,60 @@ EXTRACTOR_id3v24_extract (const unsigned char *data,
114 int extendedHdr; 139 int extendedHdr;
115 int experimental; 140 int experimental;
116 int footer; 141 int footer;
117 unsigned int tsize; 142 uint32_t tsize;
118 unsigned int pos; 143 uint32_t pos;
119 unsigned int ehdrSize; 144 uint32_t ehdrSize;
120 unsigned int padding; 145 uint32_t padding;
146 uint32_t csize;
147 int i;
148 uint16_t flags;
149 char *mime;
150 enum EXTRACTOR_MetaType type;
151 size_t off;
121 152
122 if ((size < 16) || 153 if ((size < 16) ||
123 (data[0] != 0x49) || 154 (data[0] != 0x49) ||
124 (data[1] != 0x44) || 155 (data[1] != 0x44) ||
125 (data[2] != 0x33) || (data[3] != 0x04) || (data[4] != 0x00)) 156 (data[2] != 0x33) || (data[3] != 0x04) || (data[4] != 0x00))
126 return prev; 157 return 0;
127 unsync = (data[5] & 0x80) > 0; 158 unsync = (data[5] & 0x80) > 0;
159 if (unsync)
160 return 0; /* not supported */
128 extendedHdr = (data[5] & 0x40) > 0; 161 extendedHdr = (data[5] & 0x40) > 0;
129 experimental = (data[5] & 0x20) > 0; 162 experimental = (data[5] & 0x20) > 0;
163 if (experimental)
164 return 0;
130 footer = (data[5] & 0x10) > 0; 165 footer = (data[5] & 0x10) > 0;
131 tsize = (((data[6] & 0x7F) << 21) | 166 tsize = (((data[6] & 0x7F) << 21) |
132 ((data[7] & 0x7F) << 14) | 167 ((data[7] & 0x7F) << 14) |
133 ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); 168 ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0));
134 if ((tsize + 10 > size) || (experimental)) 169 if (tsize + 10 > size)
135 return prev; 170 return 0;
136 pos = 10; 171 pos = 10;
137 padding = 0; 172 padding = 0;
138 if (extendedHdr) 173 if (extendedHdr)
139 { 174 {
140 ehdrSize = (((data[10] & 0x7F) << 21) | 175 ehdrSize = (((data[10] & 0x7F) << 21) |
141 ((data[11] & 0x7F) << 14) | 176 ((data[11] & 0x7F) << 14) |
142 ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0)); 177 ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0));
143 pos += ehdrSize; 178 pos += 4 + ehdrSize;
179 if (ehdrSize > tsize)
180 return 0;
144 } 181 }
145
146
147 while (pos < tsize) 182 while (pos < tsize)
148 { 183 {
149 size_t csize;
150 int i;
151 unsigned short flags;
152
153 if (pos + 10 > tsize) 184 if (pos + 10 > tsize)
154 return prev; 185 return 0;
155 186 csize =
156 csize = (((data[pos + 4] & 0x7F) << 21) | 187 (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) +
157 ((data[pos + 5] & 0x7F) << 14) | 188 data[pos + 7];
158 ((data[pos + 6] & 0x7F) << 7) | ((data[pos + 7] & 0x7F) << 0)); 189 if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) ||
159 190 (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos))
160 if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0))
161 break; 191 break;
162 flags = (data[pos + 8] << 8) + data[pos + 9]; 192 flags = (data[pos + 8] << 8) + data[pos + 9];
163 if (((flags & 0x80) > 0) /* compressed, not yet supported */ || 193 if (((flags & 0x08) > 0) /* compressed, not yet supported */ ||
164 ((flags & 0x40) > 0) /* encrypted, not supported */ ) 194 ((flags & 0x04) > 0) /* encrypted, not supported */ ||
195 ((flags & 0x02) > 0) /* unsynchronized, not supported */ )
165 { 196 {
166 pos += 10 + csize; 197 pos += 10 + csize;
167 continue; 198 continue;
@@ -172,59 +203,216 @@ EXTRACTOR_id3v24_extract (const unsigned char *data,
172 if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4)) 203 if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4))
173 { 204 {
174 char *word; 205 char *word;
175 if ((flags & 0x20) > 0) 206 if ((flags & 0x40) > 0)
176 { 207 {
177 /* "group" identifier, skip a byte */ 208 /* "group" identifier, skip a byte */
178 pos++; 209 pos++;
179 csize--; 210 csize--;
180 } 211 }
181 212
182 /* this byte describes the encoding 213 switch (tmap[i].fmt)
183 try to convert strings to UTF-8 214 {
184 if it fails, then forget it */ 215 case T:
185 csize--; 216 /* this byte describes the encoding
186 switch (data[pos + 10]) 217 try to convert strings to UTF-8
187 { 218 if it fails, then forget it */
188 case 0x00: 219 switch (data[pos + 10])
189 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 220 {
190 csize, "ISO-8859-1"); 221 case 0x00:
191 break; 222 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
192 case 0x01: 223 csize - 1, "ISO-8859-1");
193 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 224 break;
194 csize, "UTF-16"); 225 case 0x01:
195 break; 226 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
196 case 0x02: 227 csize - 1, "UCS-2");
197 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 228 break;
198 csize, "UTF-16BE"); 229 default:
199 break; 230 /* bad encoding byte,
200 case 0x03: 231 try to convert from iso-8859-1 */
201 word = malloc (csize + 1); 232 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11],
202 memcpy (word, &data[pos + 11], csize); 233 csize - 1, "ISO-8859-1");
203 word[csize] = '\0'; 234 break;
204 break; 235 }
205 default: 236 break;
206 /* bad encoding byte, 237 case U:
207 try to convert from iso-8859-1 */ 238 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10],
208 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], 239 csize, "ISO-8859-1");
209 csize, "ISO-8859-1"); 240 break;
210 break; 241 case UL:
211 } 242 if (csize < 6)
212 pos++; 243 return 0; /* malformed */
244 /* find end of description */
245 off = 14;
246 while ( (off < size) &&
247 (off - pos < csize) &&
248 (data[pos + off] == '\0') )
249 off++;
250 if ( (off >= csize) ||
251 (data[pos+off] != '\0') )
252 return 0; /* malformed */
253 off++;
254 switch (data[pos + 10])
255 {
256 case 0x00:
257 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
258 csize - off, "ISO-8859-1");
259 break;
260 case 0x01:
261 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
262 csize - off, "UCS-2");
263 break;
264 default:
265 /* bad encoding byte,
266 try to convert from iso-8859-1 */
267 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off],
268 csize - off, "ISO-8859-1");
269 break;
270 }
271 break;
272 case SL:
273 if (csize < 7)
274 return 0; /* malformed */
275 /* find end of description */
276 switch (data[pos + 10])
277 {
278 case 0x00:
279 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
280 csize - 6, "ISO-8859-1");
281 break;
282 case 0x01:
283 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
284 csize - 6, "UCS-2");
285 break;
286 default:
287 /* bad encoding byte,
288 try to convert from iso-8859-1 */
289 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16],
290 csize - 6, "ISO-8859-1");
291 break;
292 }
293 break;
294 case L:
295 if (csize < 5)
296 return 0; /* malformed */
297 /* find end of description */
298 switch (data[pos + 10])
299 {
300 case 0x00:
301 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
302 csize - 4, "ISO-8859-1");
303 break;
304 case 0x01:
305 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
306 csize - 4, "UCS-2");
307 break;
308 default:
309 /* bad encoding byte,
310 try to convert from iso-8859-1 */
311 word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14],
312 csize - 4, "ISO-8859-1");
313 break;
314 }
315 break;
316 case I:
317 if (csize < 2)
318 return 0; /* malformed */
319 /* find end of mime type */
320 off = 11;
321 while ( (off < size) &&
322 (off - pos < csize) &&
323 (data[pos + off] == '\0') )
324 off++;
325 if ( (off >= csize) ||
326 (data[pos+off] != '\0') )
327 return 0; /* malformed */
328 off++;
329 mime = strdup ((const char*) &data[pos + 11]);
330
331 switch (data[pos+off])
332 {
333 case 0x03:
334 case 0x04:
335 type = EXTRACTOR_METATYPE_COVER_PICTURE;
336 break;
337 case 0x07:
338 case 0x08:
339 case 0x09:
340 case 0x0A:
341 case 0x0B:
342 case 0x0C:
343 type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE;
344 break;
345 case 0x0D:
346 case 0x0E:
347 case 0x0F:
348 type = EXTRACTOR_METATYPE_EVENT_PICTURE;
349 break;
350 case 0x14:
351 type = EXTRACTOR_METATYPE_LOGO;
352 type = EXTRACTOR_METATYPE_LOGO;
353 break;
354 default:
355 type = EXTRACTOR_METATYPE_PICTURE;
356 break;
357 }
358 off++;
359
360 /* find end of description */
361 while ( (off < size) &&
362 (off - pos < csize) &&
363 (data[pos + off] == '\0') )
364 off++;
365 if ( (off >= csize) ||
366 (data[pos+off] != '\0') )
367 return 0; /* malformed */
368 off++;
369 if (0 == strcasecmp ("-->",
370 mime))
371 {
372 /* not supported */
373 }
374 else
375 {
376 if (0 != proc (proc_cls,
377 "id3v24",
378 type,
379 EXTRACTOR_METAFORMAT_BINARY,
380 mime,
381 (const char*) &data[pos + off],
382 csize + 6 - off))
383 {
384 free (mime);
385 return 1;
386 }
387 }
388 free (mime);
389 word = NULL;
390 break;
391 default:
392 return 0;
393 }
213 if ((word != NULL) && (strlen (word) > 0)) 394 if ((word != NULL) && (strlen (word) > 0))
214 { 395 {
215 prev = addKeyword (prev, word, tmap[i].type); 396 if (0 != proc (proc_cls,
216 } 397 "id3v24",
217 else 398 tmap[i].type,
218 { 399 EXTRACTOR_METAFORMAT_UTF8,
219 free (word); 400 "text/plain",
401 word,
402 strlen(word)+1))
403 {
404 free (word);
405 return 1;
406 }
220 } 407 }
408 free (word);
221 break; 409 break;
222 } 410 }
223 i++; 411 i++;
224 } 412 }
225 pos += 10 + csize; 413 pos += 10 + csize;
226 } 414 }
227 return prev; 415 return 0;
228} 416}
229 417
230/* end of id3v24_extractor.c */ 418/* end of id3v24_extractor.c */