/* This file is part of libextractor. (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. libextractor is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with libextractor; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #define DEBUG_EXTRACT_ID3v23 0 #include "platform.h" #include "extractor.h" #include #include #include #include #include #include #include #ifndef MINGW #include #endif #include "convert.h" enum Id3v23Fmt { T, /* simple, 0-terminated string, prefixed by encoding */ U, /* 0-terminated ASCII string, no encoding */ UL, /* unsync'ed lyrics */ SL, /* sync'ed lyrics */ L, /* string with language prefix */ I /* image */ }; typedef struct { const char *text; enum EXTRACTOR_MetaType type; enum Id3v23Fmt fmt; } Matches; static Matches tmap[] = { {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, */ /* TDLY */ {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, /* TIME */ {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, /* TKEY */ {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T}, {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, /* TRDA */ {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T}, /* TRSO */ {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, /* TSSE */ {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, {"WCOM", EXTRACTOR_METATYPE_URL, U}, {"WCOP", EXTRACTOR_METATYPE_URL, U}, {"WOAF", EXTRACTOR_METATYPE_URL, U}, {"WOAS", EXTRACTOR_METATYPE_URL, U}, {"WORS", EXTRACTOR_METATYPE_URL, U}, {"WPAY", EXTRACTOR_METATYPE_URL, U}, {"WPUB", EXTRACTOR_METATYPE_URL, U}, {"WXXX", EXTRACTOR_METATYPE_URL, T}, {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, /* ... */ {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, /* ... */ {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, /* ... */ {"LINK", EXTRACTOR_METATYPE_URL, U}, /* ... */ {"USER", EXTRACTOR_METATYPE_LICENSE, T}, /* ... */ {NULL, 0, T} }; /* mimetype = audio/mpeg */ int EXTRACTOR_id3v23_extract (const unsigned char *data, size_t size, EXTRACTOR_MetaDataProcessor proc, void *proc_cls, const char *options) { int unsync; int extendedHdr; int experimental; uint32_t tsize; uint32_t pos; uint32_t ehdrSize; uint32_t padding; uint32_t csize; int i; uint16_t flags; char *mime; enum EXTRACTOR_MetaType type; size_t off; if ((size < 16) || (data[0] != 0x49) || (data[1] != 0x44) || (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) return 0; unsync = (data[5] & 0x80) > 0; if (unsync) return 0; /* not supported */ extendedHdr = (data[5] & 0x40) > 0; experimental = (data[5] & 0x20) > 0; if (experimental) return 0; tsize = (((data[6] & 0x7F) << 21) | ((data[7] & 0x7F) << 14) | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); if (tsize + 10 > size) return 0; pos = 10; padding = 0; if (extendedHdr) { ehdrSize = (((data[10]) << 24) | ((data[11]) << 16) | ((data[12]) << 8) | ((data[12]) << 0)); padding = (((data[15]) << 24) | ((data[16]) << 16) | ((data[17]) << 8) | ((data[18]) << 0)); pos += 4 + ehdrSize; if (padding < tsize) tsize -= padding; else return 0; } while (pos < tsize) { if (pos + 10 > tsize) return 0; csize = (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + data[pos + 7]; if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos)) break; flags = (data[pos + 8] << 8) + data[pos + 9]; if (((flags & 0x80) > 0) /* compressed, not yet supported */ || ((flags & 0x40) > 0) /* encrypted, not supported */ ) { pos += 10 + csize; continue; } i = 0; while (tmap[i].text != NULL) { if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4)) { char *word; if ((flags & 0x20) > 0) { /* "group" identifier, skip a byte */ pos++; csize--; } switch (tmap[i].fmt) { case T: /* this byte describes the encoding try to convert strings to UTF-8 if it fails, then forget it */ switch (data[pos + 10]) { case 0x00: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], csize - 1, "ISO-8859-1"); break; case 0x01: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], csize - 1, "UCS-2"); break; default: /* bad encoding byte, try to convert from iso-8859-1 */ word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], csize - 1, "ISO-8859-1"); break; } break; case U: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], csize, "ISO-8859-1"); break; case UL: if (csize < 6) return 0; /* malformed */ /* find end of description */ off = 14; while ( (off < size) && (off - pos < csize) && (data[pos + off] == '\0') ) off++; if ( (off >= csize) || (data[pos+off] != '\0') ) return 0; /* malformed */ off++; switch (data[pos + 10]) { case 0x00: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], csize - off, "ISO-8859-1"); break; case 0x01: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], csize - off, "UCS-2"); break; default: /* bad encoding byte, try to convert from iso-8859-1 */ word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], csize - off, "ISO-8859-1"); break; } break; case SL: if (csize < 7) return 0; /* malformed */ /* find end of description */ switch (data[pos + 10]) { case 0x00: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], csize - 6, "ISO-8859-1"); break; case 0x01: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], csize - 6, "UCS-2"); break; default: /* bad encoding byte, try to convert from iso-8859-1 */ word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], csize - 6, "ISO-8859-1"); break; } break; case L: if (csize < 5) return 0; /* malformed */ /* find end of description */ switch (data[pos + 10]) { case 0x00: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], csize - 4, "ISO-8859-1"); break; case 0x01: word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], csize - 4, "UCS-2"); break; default: /* bad encoding byte, try to convert from iso-8859-1 */ word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], csize - 4, "ISO-8859-1"); break; } break; case I: if (csize < 2) return 0; /* malformed */ /* find end of mime type */ off = 11; while ( (off < size) && (off - pos < csize) && (data[pos + off] == '\0') ) off++; if ( (off >= csize) || (data[pos+off] != '\0') ) return 0; /* malformed */ off++; mime = strdup ((const char*) &data[pos + 11]); switch (data[pos+off]) { case 0x03: case 0x04: type = EXTRACTOR_METATYPE_COVER_PICTURE; break; case 0x07: case 0x08: case 0x09: case 0x0A: case 0x0B: case 0x0C: type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; break; case 0x0D: case 0x0E: case 0x0F: type = EXTRACTOR_METATYPE_EVENT_PICTURE; break; case 0x14: type = EXTRACTOR_METATYPE_LOGO; type = EXTRACTOR_METATYPE_LOGO; break; default: type = EXTRACTOR_METATYPE_PICTURE; break; } off++; /* find end of description */ while ( (off < size) && (off - pos < csize) && (data[pos + off] == '\0') ) off++; if ( (off >= csize) || (data[pos+off] != '\0') ) return 0; /* malformed */ off++; if ( (mime != NULL) && (0 == strcasecmp ("-->", mime)) ) { /* not supported */ } else { if (0 != proc (proc_cls, "id3v23", type, EXTRACTOR_METAFORMAT_BINARY, mime, (const char*) &data[pos + off], csize + 6 - off)) { if (mime != NULL) free (mime); return 1; } } if (mime != NULL) free (mime); word = NULL; break; default: return 0; } if ((word != NULL) && (strlen (word) > 0)) { if (0 != proc (proc_cls, "id3v23", tmap[i].type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", word, strlen(word)+1)) { free (word); return 1; } } if (word != NULL) free (word); break; } i++; } pos += 10 + csize; } return 0; } /* end of id3v23_extractor.c */