libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit fa7592ccb96d40353ff0270c57efe28057c81d7f
parent 5307edba27e65305173177ebbeb5759c0c60217a
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 19 Dec 2009 21:10:55 +0000

id3v2

Diffstat:
M src/include/extractor.h | 19 +++++++++++++------
M src/main/extractor_metatypes.c | 32 +++++++++++++++++++++++++-------
M src/plugins/html_extractor.c | 2 +-
M src/plugins/id3v2_extractor.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
4 files changed, 305 insertions(+), 75 deletions(-)

diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -136,7 +136,7 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_PUBLICATION_DATE = 24, EXTRACTOR_METATYPE_BIBTEX_EPRINT = 25, EXTRACTOR_METATYPE_BIBTEX_ENTRY_TYPE = 26, - EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE = 27, + EXTRACTOR_METATYPE_LANGUAGE = 27, EXTRACTOR_METATYPE_CREATION_TIME = 28, EXTRACTOR_METATYPE_URL = 29, @@ -285,12 +285,21 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_CHAPTER_NAME = 153, EXTRACTOR_METATYPE_SONG_COUNT = 154, EXTRACTOR_METATYPE_STARTING_SONG = 155, + EXTRACTOR_METATYPE_PLAY_COUNTER = 156, + EXTRACTOR_METATYPE_CONDUCTOR = 157, + EXTRACTOR_METATYPE_INTERPRETATION = 158, + EXTRACTOR_METATYPE_COMPOSER = 159, + EXTRACTOR_METATYPE_BEATS_PER_MINUTE = 160, + EXTRACTOR_METATYPE_ENCODED_BY = 161, + EXTRACTOR_METATYPE_ORIGINAL_TITLE = 162, + EXTRACTOR_METATYPE_ORIGINAL_ARTIST = 163, + EXTRACTOR_METATYPE_ORIGINAL_WRITER = 164, + EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR = 165, + EXTRACTOR_METATYPE_LYRICS = 166, + EXTRACTOR_METATYPE_POPULARITY_METER = 167, /* fixme: used up to here! */ - EXTRACTOR_METATYPE_LYRICS = 67, - EXTRACTOR_METATYPE_CONDUCTOR = 64, - EXTRACTOR_METATYPE_INTERPRET = 65, EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123, @@ -304,7 +313,6 @@ enum EXTRACTOR_MetaType /* numeric metrics */ - EXTRACTOR_METATYPE_POPULARITY_METER = 119, EXTRACTOR_METATYPE_RATING = 145, EXTRACTOR_METATYPE_PRIORITY = 60, @@ -316,7 +324,6 @@ enum EXTRACTOR_MetaType /* misc (see if these are still needed...) 
*/ EXTRACTOR_METATYPE_GENERATOR = 103, - EXTRACTOR_METATYPE_ENCODED_BY = 121, EXTRACTOR_METATYPE_FULL_DATA = 137, diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c @@ -380,14 +380,32 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { /* 155 */ { gettext_noop ("starting song"), gettext_noop ("number of the first song to play") }, - { gettext_noop (""), - gettext_noop ("") }, - { gettext_noop (""), - gettext_noop ("") }, - { gettext_noop (""), - gettext_noop ("") }, - { gettext_noop (""), + { gettext_noop ("play counter"), + gettext_noop ("number of times the media has been played") }, + { gettext_noop ("conductor"), + gettext_noop ("name of the conductor") }, + { gettext_noop ("interpretation"), + gettext_noop ("information about the people behind interpretations of an existing piece") }, + { gettext_noop ("composer"), + gettext_noop ("name of the composer") }, + /* 160 */ + { gettext_noop ("beats per minute"), gettext_noop ("") }, + { gettext_noop ("encoded by"), + gettext_noop ("name of person or organization that encoded the file") }, + { gettext_noop ("original title"), + gettext_noop ("title of the original work") }, + { gettext_noop ("original artist"), + gettext_noop ("name of the original artist") }, + { gettext_noop ("original writer"), + gettext_noop ("name of the original lyricist or writer") }, + /* 165 */ + { gettext_noop ("original release year"), + gettext_noop ("year of the original release") }, + { gettext_noop ("lyrics"), + gettext_noop ("lyrics of the song or text description of vocal activities") }, + { gettext_noop ("popularity"), + gettext_noop ("information about the file's popularity") }, { gettext_noop (""), gettext_noop ("") }, #if 0 diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c @@ -44,7 +44,7 @@ static struct { "rights", EXTRACTOR_METATYPE_RIGHTS }, { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, - { "language", 
EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, + { "language", EXTRACTOR_METATYPE_LANGUAGE }, { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, diff --git a/src/plugins/id3v2_extractor.c b/src/plugins/id3v2_extractor.c @@ -28,46 +28,91 @@ #define DEBUG_EXTRACT_ID3v2 0 +enum Id3v2Fmt + { + T, /* simple, 0-terminated string, prefixed by encoding */ + U, /* 0-terminated ASCII string, no encoding */ + UL, /* unsync'ed lyrics */ + SL, /* sync'ed lyrics */ + L, /* string with language prefix */ + I /* image */ + }; + typedef struct { const char *text; enum EXTRACTOR_MetaType type; + enum Id3v2Fmt fmt; } Matches; static Matches tmap[] = { - {"TAL", EXTRACTOR_METATYPE_TITLE}, - {"TT1", EXTRACTOR_METATYPE_GROUP}, - {"TT2", EXTRACTOR_METATYPE_TITLE}, - {"TT3", EXTRACTOR_METATYPE_TITLE}, - {"TXT", EXTRACTOR_METATYPE_DESCRIPTION}, - {"TPB", EXTRACTOR_METATYPE_PUBLISHER}, - {"WAF", EXTRACTOR_METATYPE_LOCATION}, - {"WAR", EXTRACTOR_METATYPE_LOCATION}, - {"WAS", EXTRACTOR_METATYPE_LOCATION}, - {"WCP", EXTRACTOR_METATYPE_COPYRIGHT}, - {"WAF", EXTRACTOR_METATYPE_LOCATION}, - {"WCM", EXTRACTOR_METATYPE_DISCLAIMER}, - {"TSS", EXTRACTOR_METATYPE_FORMAT}, - {"TYE", EXTRACTOR_METATYPE_DATE}, - {"TLA", EXTRACTOR_METATYPE_LANGUAGE}, - {"TP1", EXTRACTOR_METATYPE_ARTIST}, - {"TP2", EXTRACTOR_METATYPE_ARTIST}, - {"TP3", EXTRACTOR_METATYPE_CONDUCTOR}, - {"TP4", EXTRACTOR_METATYPE_INTERPRET}, - {"IPL", EXTRACTOR_METATYPE_CONTRIBUTOR}, - {"TOF", EXTRACTOR_METATYPE_FILENAME}, - {"TEN", EXTRACTOR_METATYPE_PRODUCER}, - {"TCO", EXTRACTOR_METATYPE_SUBJECT}, - {"TCR", EXTRACTOR_METATYPE_COPYRIGHT}, - {"SLT", EXTRACTOR_METATYPE_LYRICS}, - {"TOA", EXTRACTOR_METATYPE_ARTIST}, - {"TRC", EXTRACTOR_METATYPE_ISRC}, - {"TRK", EXTRACTOR_METATYPE_TRACK_NUMBER}, - {"TCM", EXTRACTOR_METATYPE_CREATOR}, - {"TOT", EXTRACTOR_METATYPE_ALBUM}, - {"TOL", EXTRACTOR_METATYPE_AUTHOR}, - {"COM", EXTRACTOR_METATYPE_COMMENT}, - {"", 
EXTRACTOR_METATYPE_KEYWORDS}, + /* skipping UFI */ + {"TT1", EXTRACTOR_METATYPE_SECTION, T}, + {"TT2", EXTRACTOR_METATYPE_TITLE, T}, + {"TT3", EXTRACTOR_METATYPE_SONG_VERSION, T}, + {"TP1", EXTRACTOR_METATYPE_ARTIST, T}, + {"TP2", EXTRACTOR_METATYPE_PERFORMER, T}, + {"TP3", EXTRACTOR_METATYPE_CONDUCTOR, T}, + {"TP4", EXTRACTOR_METATYPE_INTERPRETATION, T}, + {"TCM", EXTRACTOR_METATYPE_COMPOSER, T}, + {"TXT", EXTRACTOR_METATYPE_WRITER, T}, + {"TLA", EXTRACTOR_METATYPE_LANGUAGE, T}, + {"TCO", EXTRACTOR_METATYPE_GENRE, T}, + {"TAL", EXTRACTOR_METATYPE_ALBUM, T}, + {"TPA", EXTRACTOR_METATYPE_DISC_NUMBER, T}, + {"TRK", EXTRACTOR_METATYPE_TRACK_NUMBER, T}, + {"TRC", EXTRACTOR_METATYPE_ISRC, T}, + {"TYE", EXTRACTOR_METATYPE_PUBLICATION_YEAR, T}, + /* + FIXME: these two and TYE should be combined into + the actual publication date (if TRD is missing) + {"TDA", EXTRACTOR_METATYPE_PUBLICATION_DATE}, + {"TIM", EXTRACTOR_METATYPE_PUBLICATION_DATE}, + */ + {"TRD", EXTRACTOR_METATYPE_CREATION_TIME, T}, + {"TMT", EXTRACTOR_METATYPE_SOURCE, T}, + {"TFT", EXTRACTOR_METATYPE_FORMAT_VERSION, T}, + {"TBP", EXTRACTOR_METATYPE_BEATS_PER_MINUTE, T}, + {"TCR", EXTRACTOR_METATYPE_COPYRIGHT, T}, + {"TPB", EXTRACTOR_METATYPE_PUBLISHER, T}, + {"TEN", EXTRACTOR_METATYPE_ENCODED_BY, T}, + {"TSS", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE, T}, + {"TOF", EXTRACTOR_METATYPE_FILENAME, T}, + {"TLE", EXTRACTOR_METATYPE_DURATION, T}, /* FIXME: should append 'ms' as unit */ + {"TSI", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE, T}, + /* skipping TDY, TKE */ + {"TOT", EXTRACTOR_METATYPE_ORIGINAL_TITLE, T}, + {"TOA", EXTRACTOR_METATYPE_ORIGINAL_ARTIST, T}, + {"TOL", EXTRACTOR_METATYPE_ORIGINAL_WRITER, T}, + {"TOR", EXTRACTOR_METATYPE_ORIGINAL_RELEASE_YEAR, T}, + /* skipping TXX */ + + {"WAF", EXTRACTOR_METATYPE_URL, U}, + {"WAR", EXTRACTOR_METATYPE_URL, U}, + {"WAS", EXTRACTOR_METATYPE_URL, U}, + {"WCM", EXTRACTOR_METATYPE_URL, U}, + {"WCP", EXTRACTOR_METATYPE_RIGHTS, U}, + {"WCB", EXTRACTOR_METATYPE_URL, U}, + 
/* skipping WXX */ + {"IPL", EXTRACTOR_METATYPE_CONTRIBUTOR_NAME, T}, + /* skipping MCI */ + /* skipping ETC */ + /* skipping MLL */ + /* skipping STC */ + {"ULT", EXTRACTOR_METATYPE_LYRICS, UL}, + {"SLT", EXTRACTOR_METATYPE_LYRICS, SL}, + {"COM", EXTRACTOR_METATYPE_COMMENT, L}, + /* skipping RVA */ + /* skipping EQU */ + /* skipping REV */ + {"PIC", EXTRACTOR_METATYPE_PICTURE, I}, + /* skipping GEN */ + /* {"CNT", EXTRACTOR_METATYPE_PLAY_COUNTER, XXX}, */ + /* {"POP", EXTRACTOR_METATYPE_POPULARITY_METER, XXX}, */ + /* skipping BUF */ + /* skipping CRM */ + /* skipping CRA */ + /* {"LNK", EXTRACTOR_METATYPE_URL, XXX}, */ {NULL, 0}, }; @@ -83,6 +128,9 @@ EXTRACTOR_id3v2_extract (const unsigned char *data, int unsync; unsigned int tsize; unsigned int pos; + unsigned int off; + enum EXTRACTOR_MetaType type; + const char *mime; if ((size < 16) || (data[0] != 0x49) || @@ -102,10 +150,10 @@ EXTRACTOR_id3v2_extract (const unsigned char *data, size_t csize; int i; - if (pos + 6 > tsize) + if (pos + 7 > tsize) return 0; csize = (data[pos + 3] << 16) + (data[pos + 4] << 8) + data[pos + 5]; - if ((pos + 6 + csize > tsize) || (csize > tsize) || (csize == 0)) + if ((pos + 7 + csize > tsize) || (csize > tsize) || (csize == 0)) break; i = 0; while (tmap[i].text != NULL) @@ -116,33 +164,190 @@ EXTRACTOR_id3v2_extract (const unsigned char *data, /* this byte describes the encoding try to convert strings to UTF-8 if it fails, then forget it */ - switch (data[pos + 6]) - { - case 0x00: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], - csize, "ISO-8859-1"); - break; - case 0x01: - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], - csize, "UCS-2"); - break; - default: - /* bad encoding byte, - try to convert from iso-8859-1 */ - word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], - csize, "ISO-8859-1"); - break; - } - pos++; - csize--; - if ((word != NULL) && (strlen (word) > 0)) - { - prev = addKeyword (prev, word, 
tmap[i].type); - } - else + switch (tmap[i].fmt) + { + case T: + switch (data[pos + 6]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], + csize - 1, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], + csize - 1, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 7], + csize - 1, "ISO-8859-1"); + break; + } + break; + case U: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 6], + csize, "ISO-8859-1"); + break; + case UL: + if (csize < 6) + return 0; /* malformed */ + /* find end of description */ + off = 10; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + switch (data[pos + 6]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + off], + csize - off, "ISO-8859-1"); + break; + } + break; + case SL: + if (csize < 7) + return 0; /* malformed */ + /* find end of description */ + switch (data[pos + 6]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 12], + csize - 6, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 12], + csize - 6, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 12], + csize - 6, "ISO-8859-1"); + break; + } + break; + case L: + if (csize < 5) + return 
0; /* malformed */ + /* find end of description */ + switch (data[pos + 6]) + { + case 0x00: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], + csize - 4, "ISO-8859-1"); + break; + case 0x01: + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], + csize - 4, "UCS-2"); + break; + default: + /* bad encoding byte, + try to convert from iso-8859-1 */ + word = EXTRACTOR_common_convert_to_utf8 ((const char *) &data[pos + 10], + csize - 4, "ISO-8859-1"); + break; + } + break; + case I: + if (csize < 6) + return 0; /* malformed */ + /* find end of description */ + off = 12; + while ( (off < size) && + (off - pos < csize) && + (data[pos + off] == '\0') ) + off++; + if ( (off >= csize) || + (data[pos+off] != '\0') ) + return 0; /* malformed */ + off++; + switch (data[pos+11]) + { + case 0x03: + case 0x04: + type = EXTRACTOR_METATYPE_COVER_PICTURE; + break; + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + type = EXTRACTOR_METATYPE_CONTRIBUTOR_PICTURE; + break; + case 0x0D: + case 0x0E: + case 0x0F: + type = EXTRACTOR_METATYPE_EVENT_PICTURE; + break; + case 0x14: + type = EXTRACTOR_METATYPE_LOGO; + type = EXTRACTOR_METATYPE_LOGO; + break; + default: + type = EXTRACTOR_METATYPE_PICTURE; + break; + } + if (0 == strncasecmp ("PNG", + (const char*) &data[pos + 7], 3)) + mime = "image/png"; + else if (0 == strncasecmp ("JPG", + (const char*) &data[pos + 7], 3)) + mime = "image/jpeg"; + else + mime = NULL; + if (0 == strncasecmp ("-->", + (const char*) &data[pos + 7], 3)) + { + /* not supported */ + } + else + { + if (0 != proc (proc_cls, + "id3v2", + type, + EXTRACTOR_METAFORMAT_BINARY, + mime, + (const char*) &data[pos + off], + csize + 6 - off)) + return 1; + } + word = NULL; + break; + default: + return 0; + } + if ((word != NULL) && (strlen (word) > 0)) { - free (word); - } + if (0 != proc (proc_cls, + "id3v2", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + word, + strlen(word)+1)) + { + free 
(word); + return 1; + } + } + free (word); break; } i++;