libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 949dae1583254b789e3dafe569e030140a621846
parent 73b50507dba0570f2182f21f8b1c27a95886e4e7
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 20 Dec 2009 00:06:45 +0000

id3vx

Diffstat:
M TODO | 2 +-
M src/include/extractor.h | 15 +++++++++------
M src/main/extractor_metatypes.c | 21 +++++++++++++++++----
M src/plugins/id3v23_extractor.c | 330 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M src/plugins/id3v24_extractor.c | 432 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M src/plugins/id3v2_extractor.c | 2 +-
M src/plugins/odf_extractor.c | 2 +-
M src/plugins/ole2_extractor.c | 4 ++--
M src/plugins/png_extractor.c | 2 +-
M src/plugins/qt_extractor.c | 8 ++++----
10 files changed, 609 insertions(+), 209 deletions(-)

diff --git a/TODO b/TODO @@ -17,12 +17,12 @@ Core: Incomplete code (missing features): * RIFF (idx1 attribute) -* IDv2{3,4} (some attributes, make testcases in test/id3v2/ work) * StarOffice sdw (some attributes, see doc/) * man pages (interpret sections for authors, brief description) * pdf: full-text extraction! * EXIV2 * ELF: 64-bit support, lists of architectures, OSes, etc. are incomplete +* ID3v2x: unsynchronization support, (de)compression support, footer support (24) Desirable missing formats: * mbox / various e-mail formats diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -280,8 +280,8 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_PRODUCT_VERSION = 148, EXTRACTOR_METATYPE_CONTRIBUTOR_NAME = 149, EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 150, - EXTRACTOR_METATYPE_TV_NETWORK_NAME = 151, - EXTRACTOR_METATYPE_TV_SHOW_NAME = 152, + EXTRACTOR_METATYPE_NETWORK_NAME = 151, + EXTRACTOR_METATYPE_SHOW_NAME = 152, EXTRACTOR_METATYPE_CHAPTER_NAME = 153, EXTRACTOR_METATYPE_SONG_COUNT = 154, EXTRACTOR_METATYPE_STARTING_SONG = 155, @@ -295,13 +295,17 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_ORIGINAL_ARTIST = 163, EXTRACTOR_METATYPE_ORIGINAL_WRITER = 164, EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR = 165, - EXTRACTOR_METATYPE_LYRICS = 166, - EXTRACTOR_METATYPE_POPULARITY_METER = 167, + EXTRACTOR_METATYPE_ORIGINAL_PERFORMER = 166, + EXTRACTOR_METATYPE_LYRICS = 167, + EXTRACTOR_METATYPE_POPULARITY_METER = 168, + EXTRACTOR_METATYPE_LICENSEE = 169, + EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 170, + EXTRACTOR_METATYPE_MOOD = 171, + EXTRACTOR_METATYPE_SUBTITLE = 172, /* fixme: used up to here! 
*/ EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, - EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123, EXTRACTOR_METATYPE_SCALE = 108, @@ -342,7 +346,6 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_FULL_NAME = 113, EXTRACTOR_METATYPE_LINK = 116, EXTRACTOR_METATYPE_TIME = 122, - EXTRACTOR_METATYPE_MOOD = 124, EXTRACTOR_METATYPE_TELEVISION_SYSTEM = 126, EXTRACTOR_METATYPE_HARDWARE_DEPENDENCY = 129, EXTRACTOR_METATYPE_RIPPER = 130, diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c @@ -369,10 +369,10 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { /* 150 */ { gettext_noop ("movie director"), gettext_noop ("name of the director") }, - { gettext_noop ("TV network"), - gettext_noop ("name of the broadcasting TV network") }, - { gettext_noop ("TV show"), - gettext_noop ("name of the TV show") }, + { gettext_noop ("network"), + gettext_noop ("name of the broadcasting network or station") }, + { gettext_noop ("show"), + gettext_noop ("name of the show") }, { gettext_noop ("chapter name"), gettext_noop ("name of the chapter") }, { gettext_noop ("song count"), @@ -402,10 +402,23 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { /* 165 */ { gettext_noop ("original release year"), gettext_noop ("year of the original release") }, + { gettext_noop ("original performer"), + gettext_noop ("name of the original performer") }, { gettext_noop ("lyrics"), gettext_noop ("lyrics of the song or text description of vocal activities") }, { gettext_noop ("popularity"), gettext_noop ("information about the file's popularity") }, + { gettext_noop ("licensee"), + gettext_noop ("name of the owner or licensee of the file") }, + /* 170 */ + { gettext_noop ("musician credit list"), + gettext_noop ("names of contributing musicians") }, + { gettext_noop ("mood"), + gettext_noop ("keywords reflecting the mood of the piece") }, + { gettext_noop ("subtitle"), + gettext_noop ("subtitle of this part") }, + { gettext_noop (""), + 
gettext_noop ("") }, { gettext_noop (""), gettext_noop ("") }, #if 0 diff --git a/src/plugins/id3v23_extractor.c b/src/plugins/id3v23_extractor.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2002, 2003, 2004, 2006, 2007 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -35,54 +35,83 @@ #include "convert.h" +enum Id3v23Fmt + { + T, /* simple, 0-terminated string, prefixed by encoding */ + U, /* 0-terminated ASCII string, no encoding */ + UL, /* unsync'ed lyrics */ + SL, /* sync'ed lyrics */ + L, /* string with language prefix */ + I /* image */ + }; + typedef struct { const char *text; enum EXTRACTOR_MetaType type; + enum Id3v23Fmt fmt; } Matches; static Matches tmap[] = { - {"COMM", EXTRACTOR_METATYPE_COMMENT}, - {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"LINK", EXTRACTOR_METATYPE_LINK}, - {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, - {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, - {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, - {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, - {"TDAT", EXTRACTOR_METATYPE_DATE}, - {"TCON", EXTRACTOR_METATYPE_CONTENT_TYPE}, - {"TIT1", EXTRACTOR_METATYPE_GENRE}, - {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, - {"TEXT", EXTRACTOR_METATYPE_LYRICS}, - {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TOWN", EXTRACTOR_METATYPE_OWNER}, - {"TPE1", EXTRACTOR_METATYPE_ARTIST}, - {"TPE2", EXTRACTOR_METATYPE_ARTIST}, - {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, - {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, - {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, - {"TCOM", EXTRACTOR_METATYPE_CREATOR}, - {"TIME", EXTRACTOR_METATYPE_TIME}, - {"TOFN", EXTRACTOR_METATYPE_FILENAME}, - {"TOPE", EXTRACTOR_METATYPE_ARTIST}, - {"TPUB", EXTRACTOR_METATYPE_PUBLISHER}, - {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, - {"TRSC", 
EXTRACTOR_METATYPE_ISRC}, - {"TRSN", EXTRACTOR_METATYPE_SOURCE}, - {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, - {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, - {"TOAL", EXTRACTOR_METATYPE_ALBUM}, - {"TALB", EXTRACTOR_METATYPE_ALBUM}, - {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, - {"TYER", EXTRACTOR_METATYPE_YEAR}, - {"TLEN", EXTRACTOR_METATYPE_DURATION}, - {"TIT2", EXTRACTOR_METATYPE_TITLE}, - {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, - {"WCOM", EXTRACTOR_METATYPE_RELEASE}, - {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, - {"", EXTRACTOR_METATYPE_KEYWORDS}, - {NULL, 0} + {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, + {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, + {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, + {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, + {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, + /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, */ + /* TDLY */ + {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, + {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, + {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, + /* TIME */ + {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, + {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, + {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, + /* TKEY */ + {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, + {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ + {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, + {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, + {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, + {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, + {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, + {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, + {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, + {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, + {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, + {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, + {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, + {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, + {"TPUB", EXTRACTOR_METATYPE_PUBLISHER, T}, + {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, + /* TRDA */ + {"TRSN", 
EXTRACTOR_METATYPE_NETWORK_NAME, T}, + /* TRSO */ + {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, + {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, + /* TSSE */ + {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, + {"WCOM", EXTRACTOR_METATYPE_URL, U}, + {"WCOP", EXTRACTOR_METATYPE_URL, U}, + {"WOAF", EXTRACTOR_METATYPE_URL, U}, + {"WOAS", EXTRACTOR_METATYPE_URL, U}, + {"WORS", EXTRACTOR_METATYPE_URL, U}, + {"WPAY", EXTRACTOR_METATYPE_URL, U}, + {"WPUB", EXTRACTOR_METATYPE_URL, U}, + {"WXXX", EXTRACTOR_METATYPE_URL, T}, + {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, + /* ... */ + {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, + {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, + {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, + /* ... */ + {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, + /* ... */ + {"LINK", EXTRACTOR_METATYPE_URL, U}, + /* ... */ + {"USER", EXTRACTOR_METATYPE_LICENSE, T}, + /* ... */ + {NULL, 0, T} }; @@ -104,6 +133,9 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, uint32_t csize; int i; uint16_t flags; + char *mime; + enum EXTRACTOR_MetaType type; + size_t off; if ((size < 16) || (data[0] != 0x49) || @@ -111,12 +143,16 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, (data[2] != 0x33) || (data[3] != 0x03) || (data[4] != 0x00)) return 0; unsync = (data[5] & 0x80) > 0; + if (unsync) + return 0; /* not supported */ extendedHdr = (data[5] & 0x40) > 0; experimental = (data[5] & 0x20) > 0; + if (experimental) + return 0; tsize = (((data[6] & 0x7F) << 21) | ((data[7] & 0x7F) << 14) | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); - if ((tsize + 10 > size) || (experimental)) + if (tsize + 10 > size) return 0; pos = 10; padding = 0; @@ -142,7 +178,8 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, csize = (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + data[pos + 7]; - if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) + if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || + (pos + 10 + csize <= pos 
+ 10) || (pos + 10 <= pos)) break; flags = (data[pos + 8] << 8) + data[pos + 9]; if (((flags & 0x80) > 0) /* compressed, not yet supported */ || @@ -163,32 +200,191 @@ EXTRACTOR_id3v23_extract (const unsigned char *data, pos++; csize--; } - csize--; - /* this byte describes the encoding - try to convert strings to UTF-8 - if it fails, then forget it */ - switch (data[pos + 10]) - { - case 0x00: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "ISO-8859-1"); - break; - case 0x01: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "UCS-2"); - break; - default: - /* bad encoding byte, - try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "ISO-8859-1"); - break; - } - pos++; + switch (tmap[i].fmt) + { + case T: + /* this byte describes the encoding + try to convert strings to UTF-8 + if it fails, then forget it */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "ISO-8859-1"); + break; + } + break; + case U: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], + csize, "ISO-8859-1"); + break; + case UL: + if (csize < 6) + return 0; /* malformed */ + /* find end of description */ + off = 14; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + case 0x01: + 
word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + } + break; + case SL: + if (csize < 7) + return 0; /* malformed */ + /* find end of description */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "ISO-8859-1"); + break; + } + break; + case L: + if (csize < 5) + return 0; /* malformed */ + /* find end of description */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "ISO-8859-1"); + break; + } + break; + case I: + if (csize < 2) + return 0; /* malformed */ + /* find end of mime type */ + off = 11; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + mime = strdup ((const char*) &data[pos + 11]); + + switch (data[pos+off]) + { + case 0x03: + case 0x04: + type = EXTRACTOR_METATYPE_COVER_PICTURE; + break; + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; + break; + 
case 0x0D: + case 0x0E: + case 0x0F: + type = EXTRACTOR_METATYPE_EVENT_PICTURE; + break; + case 0x14: + type = EXTRACTOR_METATYPE_LOGO; + type = EXTRACTOR_METATYPE_LOGO; + break; + default: + type = EXTRACTOR_METATYPE_PICTURE; + break; + } + off++; + + /* find end of description */ + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + if (0 == strcasecmp ("-->", + mime)) + { + /* not supported */ + } + else + { + if (0 != proc (proc_cls, + "id3v23", + type, + EXTRACTOR_METAFORMAT_BINARY, + mime, + (const char*) &data[pos + off], + csize + 6 - off)) + { + free (mime); + return 1; + } + } + free (mime); + word = NULL; + break; + default: + return 0; + } if ((word != NULL) && (strlen (word) > 0)) { if (0 != proc (proc_cls, - "id3v2", + "id3v23", tmap[i].type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", diff --git a/src/plugins/id3v24_extractor.c b/src/plugins/id3v24_extractor.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2002, 2003, 2004, 2006, 2009 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2004, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -18,7 +18,6 @@ Boston, MA 02111-1307, USA. 
*/ - #define DEBUG_EXTRACT_ID3v24 0 #include "platform.h" @@ -33,72 +32,98 @@ #ifndef MINGW #include <sys/mman.h> #endif + #include "convert.h" - -static struct EXTRACTOR_Keywords * -addKeyword (EXTRACTOR_KeywordList * oldhead, - char *phrase, EXTRACTOR_KeywordType type) -{ - EXTRACTOR_KeywordList *keyword; - - keyword = malloc (sizeof (EXTRACTOR_KeywordList)); - keyword->next = oldhead; - keyword->keyword = phrase; - keyword->keywordType = type; - return keyword; -} +enum Id3v24Fmt + { + T, /* simple, 0-terminated string, prefixed by encoding */ + U, /* 0-terminated ASCII string, no encoding */ + UL, /* unsync'ed lyrics */ + SL, /* sync'ed lyrics */ + L, /* string with language prefix */ + I /* image */ + }; typedef struct { - char *text; + const char *text; enum EXTRACTOR_MetaType type; + enum Id3v24Fmt fmt; } Matches; static Matches tmap[] = { - {"COMM", EXTRACTOR_METATYPE_COMMENT}, - {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TMOO", EXTRACTOR_METATYPE_MOOD}, - {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST}, - {"LINK", EXTRACTOR_METATYPE_LINK}, - {"MCDI", EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER}, - {"PCNT", EXTRACTOR_METATYPE_PLAY_COUNTER}, - {"POPM", EXTRACTOR_METATYPE_POPULARITY_METER}, - {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT}, - {"TDRC", EXTRACTOR_METATYPE_DATE}, - {"TCON", EXTRACTOR_METATYPE_GENRE}, - {"TIT1", EXTRACTOR_METATYPE_GENRE}, - {"TENC", EXTRACTOR_METATYPE_ENCODED_BY}, - {"TEXT", EXTRACTOR_METATYPE_LYRICS}, - {"TOLY", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TOPE", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TOWN", EXTRACTOR_METATYPE_OWNER}, - {"TPE1", EXTRACTOR_METATYPE_ARTIST}, - {"TPE2", EXTRACTOR_METATYPE_ARTIST}, - {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR}, - {"TPE4", EXTRACTOR_METATYPE_INTERPRET}, - {"TIME", EXTRACTOR_METATYPE_TIME}, - {"TMED", EXTRACTOR_METATYPE_MEDIA_TYPE}, - {"TCOM", EXTRACTOR_METATYPE_CREATOR}, - {"TOFN", EXTRACTOR_METATYPE_FILENAME}, - {"TOPE", EXTRACTOR_METATYPE_ARTIST}, - {"TPUB", 
EXTRACTOR_METATYPE_PUBLISHER}, - {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER}, - {"TRSC", EXTRACTOR_METATYPE_ISRC}, - {"TRSN", EXTRACTOR_METATYPE_SOURCE}, - {"TRSO", EXTRACTOR_METATYPE_CREATED_FOR}, - {"TSRC", EXTRACTOR_METATYPE_RESOURCE_IDENTIFIER}, - {"TYER", EXTRACTOR_METATYPE_YEAR}, - {"TOAL", EXTRACTOR_METATYPE_ALBUM}, - {"TALB", EXTRACTOR_METATYPE_ALBUM}, - {"TLAN", EXTRACTOR_METATYPE_LANGUAGE}, - {"TIT2", EXTRACTOR_METATYPE_TITLE}, - {"TIT3", EXTRACTOR_METATYPE_DESCRIPTION}, - {"WCOM", EXTRACTOR_METATYPE_RELEASE}, - {"WCOP", EXTRACTOR_METATYPE_DISCLAIMER}, - {"", EXTRACTOR_METATYPE_KEYWORDS}, - {NULL, 0} + {"TALB", EXTRACTOR_METATYPE_ALBUM, T}, + {"TBPM", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, + {"TCOM", EXTRACTOR_METATYPE_COMPOSER, T}, + {"TCON", EXTRACTOR_METATYPE_SONG_VERSION, T}, + {"TCOP", EXTRACTOR_METATYPE_COPYRIGHT, T}, + /* {"TDAT", EXTRACTOR_METATYPE_CREATION_DATE, T}, deprecated in 24 */ + /* TDLY */ + {"TENC", EXTRACTOR_METATYPE_ENCODED_BY, T}, + {"TEXT", EXTRACTOR_METATYPE_WRITER, T}, + {"TFLT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, + /* TIME, deprecated in 24 */ + {"TIT1", EXTRACTOR_METATYPE_SECTION, T}, + {"TIT2", EXTRACTOR_METATYPE_TITLE, T}, + {"TIT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, + /* TKEY */ + {"TLAN", EXTRACTOR_METATYPE_LANGUAGE, T}, + {"TLEN", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ + {"TMED", EXTRACTOR_METATYPE_SOURCE, T}, + {"TOAL", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, + {"TOFN", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, + {"TOLY", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, + {"TOPE", EXTRACTOR_METATYPE_ORIGINAL_PERFORMER, T}, + /* {"TORY", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, deprecated in 24 */ + {"TOWN", EXTRACTOR_METATYPE_LICENSEE, T}, + {"TPE1", EXTRACTOR_METATYPE_ARTIST, T}, + {"TPE2", EXTRACTOR_METATYPE_PERFORMER, T}, + {"TPE3", EXTRACTOR_METATYPE_CONDUCTOR, T}, + {"TPE4", EXTRACTOR_METATYPE_INTERPRETATION, T}, + {"TPOS", EXTRACTOR_METATYPE_DISC_NUMBER, T}, + {"TPUB", 
EXTRACTOR_METATYPE_PUBLISHER, T}, + {"TRCK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, + /* TRDA, deprecated in 24 */ + {"TRSN", EXTRACTOR_METATYPE_NETWORK_NAME, T}, + /* TRSO */ + /* {"TSIZ", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, deprecated in 24 */ + {"TSRC", EXTRACTOR_METATYPE_ISRC, T}, + /* TSSE */ + /* {"TYER", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, deprecated in 24 */ + {"WCOM", EXTRACTOR_METATYPE_URL, U}, + {"WCOP", EXTRACTOR_METATYPE_URL, U}, + {"WOAF", EXTRACTOR_METATYPE_URL, U}, + {"WOAS", EXTRACTOR_METATYPE_URL, U}, + {"WORS", EXTRACTOR_METATYPE_URL, U}, + {"WPAY", EXTRACTOR_METATYPE_URL, U}, + {"WPUB", EXTRACTOR_METATYPE_URL, U}, + {"WXXX", EXTRACTOR_METATYPE_URL, T}, + /* {"IPLS", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, deprecated in 24 */ + /* ... */ + {"USLT", EXTRACTOR_METATYPE_LYRICS, UL }, + {"SYLT", EXTRACTOR_METATYPE_LYRICS, SL }, + {"COMM", EXTRACTOR_METATYPE_COMMENT, L}, + /* ... */ + {"APIC", EXTRACTOR_METATYPE_PICTURE, I}, + /* ... */ + {"LINK", EXTRACTOR_METATYPE_URL, U}, + /* ... */ + {"USER", EXTRACTOR_METATYPE_LICENSE, T}, + /* ... 
*/ + /* new frames in 24 */ + /* ASPI, EQU2, RVA2, SEEK, SIGN, TDEN */ + {"TDOR", EXTRACTOR_METATYPE_PUBLICATION_DATE, T}, + /* TDRC, TDRL, TDTG */ + {"TIPL", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, + {"TMCL", EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST, T}, + {"TMOO", EXTRACTOR_METATYPE_MOOD, T}, + {"TPRO", EXTRACTOR_METATYPE_COPYRIGHT, T}, + {"TSOA", EXTRACTOR_METATYPE_ALBUM, T}, + {"TSOP", EXTRACTOR_METATYPE_PERFORMER, T}, + {"TSOT", EXTRACTOR_METATYPE_TITLE, T}, + {"TSST", EXTRACTOR_METATYPE_SUBTITLE, T}, + {NULL, 0, T} }; @@ -114,54 +139,60 @@ EXTRACTOR_id3v24_extract (const unsigned char *data, int extendedHdr; int experimental; int footer; - unsigned int tsize; - unsigned int pos; - unsigned int ehdrSize; - unsigned int padding; + uint32_t tsize; + uint32_t pos; + uint32_t ehdrSize; + uint32_t padding; + uint32_t csize; + int i; + uint16_t flags; + char *mime; + enum EXTRACTOR_MetaType type; + size_t off; if ((size < 16) || (data[0] != 0x49) || (data[1] != 0x44) || (data[2] != 0x33) || (data[3] != 0x04) || (data[4] != 0x00)) - return prev; + return 0; unsync = (data[5] & 0x80) > 0; + if (unsync) + return 0; /* not supported */ extendedHdr = (data[5] & 0x40) > 0; experimental = (data[5] & 0x20) > 0; + if (experimental) + return 0; footer = (data[5] & 0x10) > 0; tsize = (((data[6] & 0x7F) << 21) | ((data[7] & 0x7F) << 14) | ((data[8] & 0x7F) << 7) | ((data[9] & 0x7F) << 0)); - if ((tsize + 10 > size) || (experimental)) - return prev; + if (tsize + 10 > size) + return 0; pos = 10; padding = 0; if (extendedHdr) { ehdrSize = (((data[10] & 0x7F) << 21) | - ((data[11] & 0x7F) << 14) | - ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0)); - pos += ehdrSize; + ((data[11] & 0x7F) << 14) | + ((data[12] & 0x7F) << 7) | ((data[13] & 0x7F) << 0)); + pos += 4 + ehdrSize; + if (ehdrSize > tsize) + return 0; } - - while (pos < tsize) { - size_t csize; - int i; - unsigned short flags; - if (pos + 10 > tsize) - return prev; - - csize = (((data[pos + 4] & 0x7F) << 21) | - 
((data[pos + 5] & 0x7F) << 14) | - ((data[pos + 6] & 0x7F) << 7) | ((data[pos + 7] & 0x7F) << 0)); - - if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0)) + return 0; + csize = + (data[pos + 4] << 24) + (data[pos + 5] << 16) + (data[pos + 6] << 8) + + data[pos + 7]; + if ((pos + 10 + csize > tsize) || (csize > tsize) || (csize == 0) || + (pos + 10 + csize <= pos + 10) || (pos + 10 <= pos)) break; flags = (data[pos + 8] << 8) + data[pos + 9]; - if (((flags & 0x80) > 0) /* compressed, not yet supported */ || - ((flags & 0x40) > 0) /* encrypted, not supported */ ) + if (((flags & 0x08) > 0) /* compressed, not yet supported */ || + ((flags & 0x04) > 0) /* encrypted, not supported */ || + ((flags & 0x02) > 0) /* unsynchronized, not supported */ ) { pos += 10 + csize; continue; @@ -172,59 +203,216 @@ EXTRACTOR_id3v24_extract (const unsigned char *data, if (0 == strncmp (tmap[i].text, (const char *) &data[pos], 4)) { char *word; - if ((flags & 0x20) > 0) + if ((flags & 0x40) > 0) { /* "group" identifier, skip a byte */ pos++; csize--; } - /* this byte describes the encoding - try to convert strings to UTF-8 - if it fails, then forget it */ - csize--; - switch (data[pos + 10]) - { - case 0x00: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "ISO-8859-1"); - break; - case 0x01: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "UTF-16"); - break; - case 0x02: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "UTF-16BE"); - break; - case 0x03: - word = malloc (csize + 1); - memcpy (word, &data[pos + 11], csize); - word[csize] = '\0'; - break; - default: - /* bad encoding byte, - try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], - csize, "ISO-8859-1"); - break; - } - pos++; + switch (tmap[i].fmt) + { + case T: + /* this byte describes the encoding + try to convert strings to UTF-8 + if it fails, then 
forget it */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 11], + csize - 1, "ISO-8859-1"); + break; + } + break; + case U: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], + csize, "ISO-8859-1"); + break; + case UL: + if (csize < 6) + return 0; /* malformed */ + /* find end of description */ + off = 14; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + } + break; + case SL: + if (csize < 7) + return 0; /* malformed */ + /* find end of description */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 16], + csize - 6, "ISO-8859-1"); + break; + } + break; + case L: + if (csize < 5) + return 0; /* malformed */ + /* find end of 
description */ + switch (data[pos + 10]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 14], + csize - 4, "ISO-8859-1"); + break; + } + break; + case I: + if (csize < 2) + return 0; /* malformed */ + /* find end of mime type */ + off = 11; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + mime = strdup ((const char*) &data[pos + 11]); + + switch (data[pos+off]) + { + case 0x03: + case 0x04: + type = EXTRACTOR_METATYPE_COVER_PICTURE; + break; + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; + break; + case 0x0D: + case 0x0E: + case 0x0F: + type = EXTRACTOR_METATYPE_EVENT_PICTURE; + break; + case 0x14: + type = EXTRACTOR_METATYPE_LOGO; + type = EXTRACTOR_METATYPE_LOGO; + break; + default: + type = EXTRACTOR_METATYPE_PICTURE; + break; + } + off++; + + /* find end of description */ + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + if (0 == strcasecmp ("-->", + mime)) + { + /* not supported */ + } + else + { + if (0 != proc (proc_cls, + "id3v24", + type, + EXTRACTOR_METAFORMAT_BINARY, + mime, + (const char*) &data[pos + off], + csize + 6 - off)) + { + free (mime); + return 1; + } + } + free (mime); + word = NULL; + break; + default: + return 0; + } if ((word != NULL) && (strlen (word) > 0)) { - prev = addKeyword (prev, word, tmap[i].type); - } - else - { - free (word); + if (0 != proc (proc_cls, + 
"id3v24", + tmap[i].type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + word, + strlen(word)+1)) + { + free (word); + return 1; + } } + free (word); break; } i++; } pos += 10 + csize; } - return prev; + return 0; } /* end of id3v24_extractor.c */ diff --git a/src/plugins/id3v2_extractor.c b/src/plugins/id3v2_extractor.c @@ -113,7 +113,7 @@ static Matches tmap[] = { /* skipping CRM */ /* skipping CRA */ /* {"LNK", EXTRACTOR_METATYPE_URL, XXX}, */ - {NULL, 0}, + {NULL, 0, T}, }; diff --git a/src/plugins/odf_extractor.c b/src/plugins/odf_extractor.c @@ -44,7 +44,7 @@ static Matches tmap[] = { { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, - { "dc:language", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, + { "dc:language", EXTRACTOR_METATYPE_LANGUAGE }, { "dc:title", EXTRACTOR_METATYPE_TITLE }, { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c @@ -107,7 +107,7 @@ static Matches tmap[] = { { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, { "meta:template", EXTRACTOR_METATYPE_TEMPLATE }, { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, - /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, */ + /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE }, */ /* { "gsf:security", EXTRACTOR_SECURITY }, */ /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? 
*/ @@ -544,7 +544,7 @@ EXTRACTOR_ole2_extract (const char *data, if ( (lang != NULL) && (ret == 0) ) ret = addKeyword(proc, proc_cls, lang, - EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE); + EXTRACTOR_METATYPE_LANGUAGE); if (lcb >= 6) { for (i=0;i<gsf_infile_num_children(infile);i++) { if (ret != 0) diff --git a/src/plugins/png_extractor.c b/src/plugins/png_extractor.c @@ -141,7 +141,7 @@ processiTXt (const char *data, language = &data[pos]; ret = 0; if (stnlen (language, length - pos) > 0) - ADDF (EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE, + ADDF (EXTRACTOR_METATYPE_LANGUAGE, stndup (language, length - pos)); pos += stnlen (language, length - pos) + 1; if (pos + 1 >= length) diff --git a/src/plugins/qt_extractor.c b/src/plugins/qt_extractor.c @@ -395,9 +395,9 @@ static ITTagConversionEntry it_to_extr_table[] = { {"catg", EXTRACTOR_METATYPE_SECTION}, {"keyw", EXTRACTOR_METATYPE_KEYWORDS}, {"desc", EXTRACTOR_METATYPE_DESCRIPTION}, - {"tvnn", EXTRACTOR_METATYPE_TV_NETWORK_NAME}, - {"tvsh", EXTRACTOR_METATYPE_TV_SHOW_NAME}, - {"tven", EXTRACTOR_METATYPE_TV_NETWORK_NAME}, + {"tvnn", EXTRACTOR_METATYPE_NETWORK_NAME}, + {"tvsh", EXTRACTOR_METATYPE_SHOW_NAME}, + {"tven", EXTRACTOR_METATYPE_NETWORK_NAME}, {NULL, EXTRACTOR_METATYPE_RESERVED} }; @@ -850,7 +850,7 @@ processTextTag (const char *input, lang = ntohs (txt->language); if (lang >= sizeof (languages) / sizeof (char *)) return 0; /* invalid */ - addKeyword (EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE, languages[lang], ec); + addKeyword (EXTRACTOR_METATYPE_LANGUAGE, languages[lang], ec); meta = malloc (len + 1); memcpy (meta, &txt[1], len);