libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit c4108ce3eb6805afb493cbfae288eb62f41808fa
parent df57c70280d34506c4221980f4446aa8cce4322e
Author: Christian Grothoff <christian@grothoff.org>
Date:   Thu, 17 Dec 2009 13:41:48 +0000

mp3

Diffstat:
M src/include/extractor.h | 30 +++++++++++++++++-------------
M src/main/extractor_metatypes.c | 16 ++++++++++++++++
M src/plugins/Makefile.am | 18 +++++++++---------
A src/plugins/mp3_extractor.c | 523 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/mp3extractor.c | 536 -------------------------------------------------------------------------------
5 files changed, 565 insertions(+), 558 deletions(-)

diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -254,32 +254,36 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_COMPANY = 125, EXTRACTOR_METATYPE_MANAGER = 126, EXTRACTOR_METATYPE_REVISION_NUMBER = 127, - - /* fixme: used up to here! */ - EXTRACTOR_METATYPE_SCALE = 108, - - /* FIXME: transcribe & renumber those below */ - EXTRACTOR_METATYPE_USED_FONTS = 37, - EXTRACTOR_METATYPE_PAGE_ORDER = 38, - /* music / video specifics */ + EXTRACTOR_METATYPE_DURATION = 111, + EXTRACTOR_METATYPE_ALBUM = 11, + EXTRACTOR_METATYPE_ARTIST = 5, + EXTRACTOR_METATYPE_GENRE = 12, + EXTRACTOR_METATYPE_TRACK_NUMBER = 132, + EXTRACTOR_METATYPE_LYRICS = 67, EXTRACTOR_METATYPE_CONDUCTOR = 64, EXTRACTOR_METATYPE_INTERPRET = 65, EXTRACTOR_METATYPE_MUSIC_CD_IDENTIFIER = 117, EXTRACTOR_METATYPE_PLAY_COUNTER = 118, - EXTRACTOR_METATYPE_DURATION = 111, EXTRACTOR_METATYPE_MOVIE_DIRECTOR = 110, EXTRACTOR_METATYPE_SONG_COUNT = 127, EXTRACTOR_METATYPE_STARTING_SONG = 128, EXTRACTOR_METATYPE_MUSICIAN_CREDITS_LIST = 123, - EXTRACTOR_METATYPE_TRACK_NUMBER = 132, EXTRACTOR_METATYPE_DISC_NUMBER = 134, - EXTRACTOR_METATYPE_ALBUM = 11, - EXTRACTOR_METATYPE_ARTIST = 5, - EXTRACTOR_METATYPE_GENRE = 12, + + + /* fixme: used up to here! 
*/ + EXTRACTOR_METATYPE_SCALE = 108, + + + + /* FIXME: transcribe & renumber those below */ + EXTRACTOR_METATYPE_USED_FONTS = 37, + EXTRACTOR_METATYPE_PAGE_ORDER = 38, + /* numeric metrics */ EXTRACTOR_METATYPE_POPULARITY_METER = 119, diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c @@ -300,6 +300,7 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { gettext_noop ("number of lines") }, { gettext_noop ("paragraph count"), gettext_noop ("number o paragraphs") }, + /* 120 */ { gettext_noop ("word count"), gettext_noop ("number of words") }, { gettext_noop ("page orientation"), @@ -310,10 +311,25 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { gettext_noop ("template the document uses or is based on") }, { gettext_noop ("company"), gettext_noop ("") }, + /* 125 */ { gettext_noop ("manager"), gettext_noop ("") }, { gettext_noop ("revision number"), gettext_noop ("") }, + { gettext_noop ("duration"), + gettext_noop ("play time for the medium") }, + { gettext_noop ("album"), + gettext_noop ("name of the album") }, + { gettext_noop ("artist"), + gettext_noop ("name of the artist or band") }, + { gettext_noop ("genre"), + gettext_noop ("") }, + { gettext_noop ("track number"), + gettext_noop ("original number of the track on the distribution medium") }, + { gettext_noop (""), + gettext_noop ("") }, + { gettext_noop (""), + gettext_noop ("") }, { gettext_noop (""), gettext_noop ("") }, { gettext_noop (""), diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -79,6 +79,7 @@ plugin_LTLIBRARIES = \ libextractor_jpeg.la \ libextractor_man.la \ libextractor_mime.la \ + libextractor_mp3.la \ $(ole2) \ libextractor_odf.la \ $(pdf) \ @@ -162,6 +163,14 @@ libextractor_mime_la_SOURCES = \ libextractor_mime_la_LDFLAGS = \ $(PLUGINFLAGS) +libextractor_mp3_la_SOURCES = \ + mp3_extractor.c +libextractor_mp3_la_LDFLAGS = \ + $(PLUGINFLAGS) +libextractor_mp3_la_LIBADD = \ + 
$(top_builddir)/src/common/libextractor_common.la \ + $(LE_LIBINTL) + libextractor_ole2_la_SOURCES = \ ole2_extractor.c libextractor_ole2_la_CFLAGS = \ @@ -218,7 +227,6 @@ OLD_LIBS = \ libextractor_id3v2.la \ libextractor_id3v24.la \ libextractor_id3v23.la \ - libextractor_mp3.la \ $(extrampeg) \ libextractor_nsf.la \ libextractor_nsfe.la \ @@ -281,14 +289,6 @@ libextractor_wav_la_LDFLAGS = \ libextractor_wav_la_LIBADD = \ $(LE_LIBINTL) -libextractor_mp3_la_SOURCES = \ - mp3extractor.c -libextractor_mp3_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_mp3_la_LIBADD = \ - $(top_builddir)/src/common/libextractor_common.la \ - $(LE_LIBINTL) - libextractor_id3v2_la_SOURCES = \ id3v2extractor.c libextractor_id3v2_la_LDFLAGS = \ diff --git a/src/plugins/mp3_extractor.c b/src/plugins/mp3_extractor.c @@ -0,0 +1,523 @@ +/* + This file is part of libextractor. + (C) 2002, 2003, 2004, 2006, 2009 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. 
+ + + Some of this code is based on AVInfo 1.0 alpha 11 + (c) George Shuklin, gs]AT[shounen.ru, 2002-2004 + http://shounen.ru/soft/avinfo/ + + */ + +#define DEBUG_EXTRACT_MP3 0 + +#include "platform.h" +#include "extractor.h" +#include "convert.h" +#include <string.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdlib.h> + +typedef struct +{ + char *title; + char *artist; + char *album; + char *year; + char *comment; + const char *genre; + unsigned int track_number; +} id3tag; + +static const char *const genre_names[] = { + gettext_noop ("Blues"), + gettext_noop ("Classic Rock"), + gettext_noop ("Country"), + gettext_noop ("Dance"), + gettext_noop ("Disco"), + gettext_noop ("Funk"), + gettext_noop ("Grunge"), + gettext_noop ("Hip-Hop"), + gettext_noop ("Jazz"), + gettext_noop ("Metal"), + gettext_noop ("New Age"), + gettext_noop ("Oldies"), + gettext_noop ("Other"), + gettext_noop ("Pop"), + gettext_noop ("R&B"), + gettext_noop ("Rap"), + gettext_noop ("Reggae"), + gettext_noop ("Rock"), + gettext_noop ("Techno"), + gettext_noop ("Industrial"), + gettext_noop ("Alternative"), + gettext_noop ("Ska"), + gettext_noop ("Death Metal"), + gettext_noop ("Pranks"), + gettext_noop ("Soundtrack"), + gettext_noop ("Euro-Techno"), + gettext_noop ("Ambient"), + gettext_noop ("Trip-Hop"), + gettext_noop ("Vocal"), + gettext_noop ("Jazz+Funk"), + gettext_noop ("Fusion"), + gettext_noop ("Trance"), + gettext_noop ("Classical"), + gettext_noop ("Instrumental"), + gettext_noop ("Acid"), + gettext_noop ("House"), + gettext_noop ("Game"), + gettext_noop ("Sound Clip"), + gettext_noop ("Gospel"), + gettext_noop ("Noise"), + gettext_noop ("Alt. 
Rock"), + gettext_noop ("Bass"), + gettext_noop ("Soul"), + gettext_noop ("Punk"), + gettext_noop ("Space"), + gettext_noop ("Meditative"), + gettext_noop ("Instrumental Pop"), + gettext_noop ("Instrumental Rock"), + gettext_noop ("Ethnic"), + gettext_noop ("Gothic"), + gettext_noop ("Darkwave"), + gettext_noop ("Techno-Industrial"), + gettext_noop ("Electronic"), + gettext_noop ("Pop-Folk"), + gettext_noop ("Eurodance"), + gettext_noop ("Dream"), + gettext_noop ("Southern Rock"), + gettext_noop ("Comedy"), + gettext_noop ("Cult"), + gettext_noop ("Gangsta Rap"), + gettext_noop ("Top 40"), + gettext_noop ("Christian Rap"), + gettext_noop ("Pop/Funk"), + gettext_noop ("Jungle"), + gettext_noop ("Native American"), + gettext_noop ("Cabaret"), + gettext_noop ("New Wave"), + gettext_noop ("Psychedelic"), + gettext_noop ("Rave"), + gettext_noop ("Showtunes"), + gettext_noop ("Trailer"), + gettext_noop ("Lo-Fi"), + gettext_noop ("Tribal"), + gettext_noop ("Acid Punk"), + gettext_noop ("Acid Jazz"), + gettext_noop ("Polka"), + gettext_noop ("Retro"), + gettext_noop ("Musical"), + gettext_noop ("Rock & Roll"), + gettext_noop ("Hard Rock"), + gettext_noop ("Folk"), + gettext_noop ("Folk/Rock"), + gettext_noop ("National Folk"), + gettext_noop ("Swing"), + gettext_noop ("Fast-Fusion"), + gettext_noop ("Bebob"), + gettext_noop ("Latin"), + gettext_noop ("Revival"), + gettext_noop ("Celtic"), + gettext_noop ("Bluegrass"), + gettext_noop ("Avantgarde"), + gettext_noop ("Gothic Rock"), + gettext_noop ("Progressive Rock"), + gettext_noop ("Psychedelic Rock"), + gettext_noop ("Symphonic Rock"), + gettext_noop ("Slow Rock"), + gettext_noop ("Big Band"), + gettext_noop ("Chorus"), + gettext_noop ("Easy Listening"), + gettext_noop ("Acoustic"), + gettext_noop ("Humour"), + gettext_noop ("Speech"), + gettext_noop ("Chanson"), + gettext_noop ("Opera"), + gettext_noop ("Chamber Music"), + gettext_noop ("Sonata"), + gettext_noop ("Symphony"), + gettext_noop ("Booty Bass"), + gettext_noop 
("Primus"), + gettext_noop ("Porn Groove"), + gettext_noop ("Satire"), + gettext_noop ("Slow Jam"), + gettext_noop ("Club"), + gettext_noop ("Tango"), + gettext_noop ("Samba"), + gettext_noop ("Folklore"), + gettext_noop ("Ballad"), + gettext_noop ("Power Ballad"), + gettext_noop ("Rhythmic Soul"), + gettext_noop ("Freestyle"), + gettext_noop ("Duet"), + gettext_noop ("Punk Rock"), + gettext_noop ("Drum Solo"), + gettext_noop ("A Cappella"), + gettext_noop ("Euro-House"), + gettext_noop ("Dance Hall"), + gettext_noop ("Goa"), + gettext_noop ("Drum & Bass"), + gettext_noop ("Club-House"), + gettext_noop ("Hardcore"), + gettext_noop ("Terror"), + gettext_noop ("Indie"), + gettext_noop ("BritPop"), + gettext_noop ("Negerpunk"), + gettext_noop ("Polsk Punk"), + gettext_noop ("Beat"), + gettext_noop ("Christian Gangsta Rap"), + gettext_noop ("Heavy Metal"), + gettext_noop ("Black Metal"), + gettext_noop ("Crossover"), + gettext_noop ("Contemporary Christian"), + gettext_noop ("Christian Rock"), + gettext_noop ("Merengue"), + gettext_noop ("Salsa"), + gettext_noop ("Thrash Metal"), + gettext_noop ("Anime"), + gettext_noop ("JPop"), + gettext_noop ("Synthpop"), +}; + +#define GENRE_NAME_COUNT \ + ((unsigned int)(sizeof genre_names / sizeof (const char *const))) + + +#define MAX_MP3_SCAN_DEEP 16768 +const int max_frames_scan = 1024; +enum +{ MPEG_ERR = 0, MPEG_V1 = 1, MPEG_V2 = 2, MPEG_V25 = 3 }; + +enum +{ LAYER_ERR = 0, LAYER_1 = 1, LAYER_2 = 2, LAYER_3 = 3 }; + +#define MPA_SYNC_MASK ((unsigned int) 0xFFE00000) +#define MPA_LAST_SYNC_BIT_MASK ((unsigned int) 0x00100000) +#define MPA_VERSION_MASK ((unsigned int) 0x00080000) +#define MPA_LAYER_MASK ((unsigned int) 0x3) +#define MPA_LAYER_SHIFT 17 +#define MPA_BITRATE_MASK ((unsigned int) 0xF) +#define MPA_BITRATE_SHIFT 12 +#define MPA_FREQ_MASK ((unsigned int) 0x3) +#define MPA_FREQ_SHIFT 10 +#define MPA_CHMODE_MASK ((unsigned int) 0x3) +#define MPA_CHMODE_SHIFT 6 +#define MPA_PADDING_SHIFT 9 +#define MPA_COPYRIGHT_SHIFT 
3 +#define MPA_ORIGINAL_SHIFT 2 + +static const unsigned int bitrate_table[16][6] = { + {0, 0, 0, 0, 0, 0}, + {32, 32, 32, 32, 8, 8}, + {64, 48, 40, 48, 16, 16}, + {96, 56, 48, 56, 24, 24}, + {128, 64, 56, 64, 32, 32}, + {160, 80, 64, 80, 40, 40}, + {192, 96, 80, 96, 48, 48}, + {224, 112, 96, 112, 56, 56}, + {256, 128, 112, 128, 64, 64}, + {288, 160, 128, 144, 80, 80}, + {320, 192, 160, 160, 96, 96}, + {352, 224, 192, 176, 112, 112}, + {384, 256, 224, 192, 128, 128}, + {416, 320, 256, 224, 144, 144}, + {448, 384, 320, 256, 160, 160}, + {-1, -1, -1, -1, -1, -1} +}; +static const int freq_table[4][3] = { + {44100, 22050, 11025}, + {48000, 24000, 12000}, + {32000, 16000, 8000} +}; +static const char * const channel_modes[4] = { + gettext_noop("stereo"), + gettext_noop("joint stereo"), + gettext_noop("dual channel"), + gettext_noop("mono") +}; +static const char * const mpeg_versions[3] = { + gettext_noop("MPEG-1"), + gettext_noop("MPEG-2"), + gettext_noop("MPEG-2.5") +}; +static const char * const layer_names[3] = { + gettext_noop("Layer I"), + gettext_noop("Layer II"), + gettext_noop("Layer III") +}; + + +#define OK 0 +#define SYSERR 1 +#define INVALID_ID3 2 + +static void +trim (char *k) +{ + while ((strlen (k) > 0) && (isspace (k[strlen (k) - 1]))) + k[strlen (k) - 1] = '\0'; +} + +static int +get_id3 (const char *data, size_t size, id3tag * id3) +{ + const char *pos; + + if (size < 128) + return INVALID_ID3; + + pos = &data[size - 128]; + if (0 != strncmp ("TAG", pos, 3)) + return INVALID_ID3; + pos += 3; + + id3->title = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); + trim (id3->title); + pos += 30; + id3->artist = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); + trim (id3->artist); + pos += 30; + id3->album = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); + trim (id3->album); + pos += 30; + id3->year = EXTRACTOR_common_convert_to_utf8 (pos, 4, "ISO-8859-1"); + trim (id3->year); + pos += 4; + id3->comment = 
EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); + trim (id3->comment); + if ( (pos[28] == '\0') && + (pos[29] != '\0') ) + { + /* ID3v1.1 */ + id3->track_number = pos[29]; + } + else + { + id3->track_number = 0; + } + pos += 30; + id3->genre = ""; + if (pos[0] < GENRE_NAME_COUNT) + id3->genre = dgettext (PACKAGE, genre_names[(unsigned) pos[0]]); + return OK; +} + + +#define ADDR(s,t) do { if (0 != proc (proc_cls, "mp3", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) return 1; } while (0) + +static int +mp3parse (const unsigned char *data, size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + unsigned int header; + int counter = 0; + char mpeg_ver = 0; + char layer = 0; + int idx_num = 0; + int bitrate = 0; /*used for each frame */ + int avg_bps = 0; /*average bitrate */ + int vbr_flag = 0; + int copyright_flag = 0; + int original_flag = 0; + int length = 0; + int sample_rate = 0; + int ch = 0; + int frame_size; + int frames = 0; + size_t pos = 0; + char format[512]; + + do + { + /* seek for frame start */ + if (pos + sizeof (header) > size) + { + return 0; + } /*unable to find header */ + header = (data[pos] << 24) | (data[pos+1] << 16) | + (data[pos+2] << 8) | data[pos+3]; + if ((header & MPA_SYNC_MASK) == MPA_SYNC_MASK) + break; /*found header sync */ + pos++; + counter++; /*next try */ + } + while (counter < MAX_MP3_SCAN_DEEP); + if (counter >= MAX_MP3_SCAN_DEEP) + return 0; + ADDR ("audio/mpeg", EXTRACTOR_METATYPE_MIMETYPE); + + do + { /*ok, now we found a mp3 frame header */ + frames++; + switch (header & (MPA_LAST_SYNC_BIT_MASK | MPA_VERSION_MASK)) + { + case (MPA_LAST_SYNC_BIT_MASK | MPA_VERSION_MASK): + mpeg_ver = MPEG_V1; + break; + case (MPA_LAST_SYNC_BIT_MASK): + mpeg_ver = MPEG_V2; + break; + case 0: + mpeg_ver = MPEG_V25; + break; + case (MPA_VERSION_MASK): + default: + mpeg_ver = MPEG_ERR; /*error */ + break; + } + switch (header & (MPA_LAYER_MASK << MPA_LAYER_SHIFT)) + { + case (0x1 << MPA_LAYER_SHIFT): + 
layer = LAYER_3; + break; + case (0x2 << MPA_LAYER_SHIFT): + layer = LAYER_2; + break; + case (0x3 << MPA_LAYER_SHIFT): + layer = LAYER_1; + break; + case 0x0: + default: + layer = LAYER_ERR; /*error */ + } + if (!layer || !mpeg_ver) + return 0; /*unknown mpeg type */ + if (mpeg_ver < MPEG_V25) + idx_num = (mpeg_ver - 1) * 3 + layer - 1; + else + idx_num = 2 + layer; + bitrate = 1000 * bitrate_table[(header >> MPA_BITRATE_SHIFT) & + MPA_BITRATE_MASK][idx_num]; + if (bitrate < 0) + { + frames--; + break; + } /*error in header */ + sample_rate = freq_table[(header >> MPA_FREQ_SHIFT) & + MPA_FREQ_MASK][mpeg_ver - 1]; + if (sample_rate < 0) + { + frames--; + break; + } /*error in header */ + ch = ((header >> MPA_CHMODE_SHIFT) & MPA_CHMODE_MASK); + copyright_flag = (header >> MPA_COPYRIGHT_SHIFT) & 0x1; + original_flag = (header >> MPA_ORIGINAL_SHIFT) & 0x1; + frame_size = + 144 * bitrate / (sample_rate ? sample_rate : 1) + + ((header >> MPA_PADDING_SHIFT) & 0x1); + avg_bps += bitrate / 1000; + + pos += frame_size - 4; + if (frames > max_frames_scan) + break; /*optimization */ + if (avg_bps / frames != bitrate / 1000) + vbr_flag = 1; + if (pos + sizeof (header) > size) + break; /* EOF */ + header = (data[pos] << 24) | (data[pos+1] << 16) | + (data[pos+2] << 8) | data[pos+3]; + } + while ((header & MPA_SYNC_MASK) == MPA_SYNC_MASK); + + if (!frames) + return 0; /*no valid frames */ + avg_bps = avg_bps / frames; + if (max_frames_scan) + { /*if not all frames scaned */ + length = + size / (avg_bps ? avg_bps : bitrate ? bitrate : 0xFFFFFFFF) / 125; + } + else + { + length = 1152 * frames / (sample_rate ? sample_rate : 0xFFFFFFFF); + } + + ADDR (mpeg_versions[mpeg_ver-1], EXTRACTOR_METATYPE_FORMAT_VERSION); + snprintf (format, + sizeof(format), + "%s %s audio, %d kbps (%s), %d Hz, %s, %s, %s", + mpeg_versions[mpeg_ver-1], + layer_names[layer-1], + avg_bps, + vbr_flag ? _("VBR") : _("CBR"), + sample_rate, + channel_modes[ch], + copyright_flag ? 
_("copyright") : _("no copyright"), + original_flag ? _("original") : _("copy") ); + + ADDR (format, EXTRACTOR_METATYPE_RESOURCE_TYPE); + snprintf (format, + sizeof (format), "%dm%02d", + length / 60, length % 60); + ADDR (format, EXTRACTOR_METATYPE_DURATION); + return 0; +} + + +#define ADD(s,t) do { if (0 != proc (proc_cls, "mp3", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) goto FINISH; } while (0) + + +/* mimetype = audio/mpeg */ +int +EXTRACTOR_mp3_extract (const char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + id3tag info; + char track[16]; + int ret; + + if (0 != get_id3 (data, size, &info)) + return 0; + if (strlen (info.title) > 0) + ADD (info.title, EXTRACTOR_METATYPE_TITLE); + if (strlen (info.artist) > 0) + ADD (info.artist, EXTRACTOR_METATYPE_ARTIST); + if (strlen (info.album) > 0) + ADD (info.album, EXTRACTOR_METATYPE_ALBUM); + if (strlen (info.year) > 0) + ADD (info.year, EXTRACTOR_METATYPE_PUBLICATION_YEAR); + if (strlen (info.genre) > 0) + ADD (info.genre, EXTRACTOR_METATYPE_GENRE); + if (strlen (info.comment) > 0) + ADD (info.comment, EXTRACTOR_METATYPE_COMMENT); + if (info.track_number != 0) + { + snprintf(track, + sizeof(track), "%u", info.track_number); + ADD (track, EXTRACTOR_METATYPE_TRACK_NUMBER); + } + ret = mp3parse ((const unsigned char *) data, size, proc, proc_cls); +FINISH: + free (info.title); + free (info.year); + free (info.album); + free (info.artist); + free (info.comment); + return ret; +} + +/* end of mp3_extractor.c */ diff --git a/src/plugins/mp3extractor.c b/src/plugins/mp3extractor.c @@ -1,536 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003, 2004, 2006 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. 
- - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - - - Some of this code is based on AVInfo 1.0 alpha 11 - (c) George Shuklin, gs]AT[shounen.ru, 2002-2004 - http://shounen.ru/soft/avinfo/ - - */ - -#define DEBUG_EXTRACT_MP3 0 - -#include "platform.h" -#include "extractor.h" -#include "convert.h" -#include <string.h> -#include <stdio.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> -#include <stdlib.h> - -typedef struct -{ - char *title; - char *artist; - char *album; - char *year; - char *comment; - const char *genre; - unsigned int track_number; -} id3tag; - -static const char *const genre_names[] = { - gettext_noop ("Blues"), - gettext_noop ("Classic Rock"), - gettext_noop ("Country"), - gettext_noop ("Dance"), - gettext_noop ("Disco"), - gettext_noop ("Funk"), - gettext_noop ("Grunge"), - gettext_noop ("Hip-Hop"), - gettext_noop ("Jazz"), - gettext_noop ("Metal"), - gettext_noop ("New Age"), - gettext_noop ("Oldies"), - gettext_noop ("Other"), - gettext_noop ("Pop"), - gettext_noop ("R&B"), - gettext_noop ("Rap"), - gettext_noop ("Reggae"), - gettext_noop ("Rock"), - gettext_noop ("Techno"), - gettext_noop ("Industrial"), - gettext_noop ("Alternative"), - gettext_noop ("Ska"), - gettext_noop ("Death Metal"), - gettext_noop ("Pranks"), - gettext_noop ("Soundtrack"), - gettext_noop ("Euro-Techno"), - gettext_noop ("Ambient"), - gettext_noop ("Trip-Hop"), - gettext_noop ("Vocal"), - gettext_noop ("Jazz+Funk"), - gettext_noop ("Fusion"), - gettext_noop ("Trance"), - gettext_noop ("Classical"), - gettext_noop ("Instrumental"), 
- gettext_noop ("Acid"), - gettext_noop ("House"), - gettext_noop ("Game"), - gettext_noop ("Sound Clip"), - gettext_noop ("Gospel"), - gettext_noop ("Noise"), - gettext_noop ("Alt. Rock"), - gettext_noop ("Bass"), - gettext_noop ("Soul"), - gettext_noop ("Punk"), - gettext_noop ("Space"), - gettext_noop ("Meditative"), - gettext_noop ("Instrumental Pop"), - gettext_noop ("Instrumental Rock"), - gettext_noop ("Ethnic"), - gettext_noop ("Gothic"), - gettext_noop ("Darkwave"), - gettext_noop ("Techno-Industrial"), - gettext_noop ("Electronic"), - gettext_noop ("Pop-Folk"), - gettext_noop ("Eurodance"), - gettext_noop ("Dream"), - gettext_noop ("Southern Rock"), - gettext_noop ("Comedy"), - gettext_noop ("Cult"), - gettext_noop ("Gangsta Rap"), - gettext_noop ("Top 40"), - gettext_noop ("Christian Rap"), - gettext_noop ("Pop/Funk"), - gettext_noop ("Jungle"), - gettext_noop ("Native American"), - gettext_noop ("Cabaret"), - gettext_noop ("New Wave"), - gettext_noop ("Psychedelic"), - gettext_noop ("Rave"), - gettext_noop ("Showtunes"), - gettext_noop ("Trailer"), - gettext_noop ("Lo-Fi"), - gettext_noop ("Tribal"), - gettext_noop ("Acid Punk"), - gettext_noop ("Acid Jazz"), - gettext_noop ("Polka"), - gettext_noop ("Retro"), - gettext_noop ("Musical"), - gettext_noop ("Rock & Roll"), - gettext_noop ("Hard Rock"), - gettext_noop ("Folk"), - gettext_noop ("Folk/Rock"), - gettext_noop ("National Folk"), - gettext_noop ("Swing"), - gettext_noop ("Fast-Fusion"), - gettext_noop ("Bebob"), - gettext_noop ("Latin"), - gettext_noop ("Revival"), - gettext_noop ("Celtic"), - gettext_noop ("Bluegrass"), - gettext_noop ("Avantgarde"), - gettext_noop ("Gothic Rock"), - gettext_noop ("Progressive Rock"), - gettext_noop ("Psychedelic Rock"), - gettext_noop ("Symphonic Rock"), - gettext_noop ("Slow Rock"), - gettext_noop ("Big Band"), - gettext_noop ("Chorus"), - gettext_noop ("Easy Listening"), - gettext_noop ("Acoustic"), - gettext_noop ("Humour"), - gettext_noop ("Speech"), - 
gettext_noop ("Chanson"), - gettext_noop ("Opera"), - gettext_noop ("Chamber Music"), - gettext_noop ("Sonata"), - gettext_noop ("Symphony"), - gettext_noop ("Booty Bass"), - gettext_noop ("Primus"), - gettext_noop ("Porn Groove"), - gettext_noop ("Satire"), - gettext_noop ("Slow Jam"), - gettext_noop ("Club"), - gettext_noop ("Tango"), - gettext_noop ("Samba"), - gettext_noop ("Folklore"), - gettext_noop ("Ballad"), - gettext_noop ("Power Ballad"), - gettext_noop ("Rhythmic Soul"), - gettext_noop ("Freestyle"), - gettext_noop ("Duet"), - gettext_noop ("Punk Rock"), - gettext_noop ("Drum Solo"), - gettext_noop ("A Cappella"), - gettext_noop ("Euro-House"), - gettext_noop ("Dance Hall"), - gettext_noop ("Goa"), - gettext_noop ("Drum & Bass"), - gettext_noop ("Club-House"), - gettext_noop ("Hardcore"), - gettext_noop ("Terror"), - gettext_noop ("Indie"), - gettext_noop ("BritPop"), - gettext_noop ("Negerpunk"), - gettext_noop ("Polsk Punk"), - gettext_noop ("Beat"), - gettext_noop ("Christian Gangsta Rap"), - gettext_noop ("Heavy Metal"), - gettext_noop ("Black Metal"), - gettext_noop ("Crossover"), - gettext_noop ("Contemporary Christian"), - gettext_noop ("Christian Rock"), - gettext_noop ("Merengue"), - gettext_noop ("Salsa"), - gettext_noop ("Thrash Metal"), - gettext_noop ("Anime"), - gettext_noop ("JPop"), - gettext_noop ("Synthpop"), -}; - -#define GENRE_NAME_COUNT \ - ((unsigned int)(sizeof genre_names / sizeof (const char *const))) - - -#define MAX_MP3_SCAN_DEEP 16768 -const int max_frames_scan = 1024; -enum -{ MPEG_ERR = 0, MPEG_V1 = 1, MPEG_V2 = 2, MPEG_V25 = 3 }; - -enum -{ LAYER_ERR = 0, LAYER_1 = 1, LAYER_2 = 2, LAYER_3 = 3 }; - -#define MPA_SYNC_MASK ((unsigned int) 0xFFE00000) -#define MPA_LAST_SYNC_BIT_MASK ((unsigned int) 0x00100000) -#define MPA_VERSION_MASK ((unsigned int) 0x00080000) -#define MPA_LAYER_MASK ((unsigned int) 0x3) -#define MPA_LAYER_SHIFT 17 -#define MPA_BITRATE_MASK ((unsigned int) 0xF) -#define MPA_BITRATE_SHIFT 12 -#define 
MPA_FREQ_MASK ((unsigned int) 0x3) -#define MPA_FREQ_SHIFT 10 -#define MPA_CHMODE_MASK ((unsigned int) 0x3) -#define MPA_CHMODE_SHIFT 6 -#define MPA_PADDING_SHIFT 9 -#define MPA_COPYRIGHT_SHIFT 3 -#define MPA_ORIGINAL_SHIFT 2 - -static const unsigned int bitrate_table[16][6] = { - {0, 0, 0, 0, 0, 0}, - {32, 32, 32, 32, 8, 8}, - {64, 48, 40, 48, 16, 16}, - {96, 56, 48, 56, 24, 24}, - {128, 64, 56, 64, 32, 32}, - {160, 80, 64, 80, 40, 40}, - {192, 96, 80, 96, 48, 48}, - {224, 112, 96, 112, 56, 56}, - {256, 128, 112, 128, 64, 64}, - {288, 160, 128, 144, 80, 80}, - {320, 192, 160, 160, 96, 96}, - {352, 224, 192, 176, 112, 112}, - {384, 256, 224, 192, 128, 128}, - {416, 320, 256, 224, 144, 144}, - {448, 384, 320, 256, 160, 160}, - {-1, -1, -1, -1, -1, -1} -}; -static const int freq_table[4][3] = { - {44100, 22050, 11025}, - {48000, 24000, 12000}, - {32000, 16000, 8000} -}; -static const char * const channel_modes[4] = { - gettext_noop("stereo"), - gettext_noop("joint stereo"), - gettext_noop("dual channel"), - gettext_noop("mono") -}; -static const char * const mpeg_versions[3] = { - gettext_noop("MPEG-1"), - gettext_noop("MPEG-2"), - gettext_noop("MPEG-2.5") -}; -static const char * const layer_names[3] = { - gettext_noop("Layer I"), - gettext_noop("Layer II"), - gettext_noop("Layer III") -}; - - -#define OK 0 -#define SYSERR 1 -#define INVALID_ID3 2 - -static void -trim (char *k) -{ - while ((strlen (k) > 0) && (isspace (k[strlen (k) - 1]))) - k[strlen (k) - 1] = '\0'; -} - -static int -get_id3 (const char *data, size_t size, id3tag * id3) -{ - const char *pos; - - if (size < 128) - return INVALID_ID3; - - pos = &data[size - 128]; - if (0 != strncmp ("TAG", pos, 3)) - return INVALID_ID3; - pos += 3; - - id3->title = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); - trim (id3->title); - pos += 30; - id3->artist = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); - trim (id3->artist); - pos += 30; - id3->album = EXTRACTOR_common_convert_to_utf8 (pos, 
30, "ISO-8859-1"); - trim (id3->album); - pos += 30; - id3->year = EXTRACTOR_common_convert_to_utf8 (pos, 4, "ISO-8859-1"); - trim (id3->year); - pos += 4; - id3->comment = EXTRACTOR_common_convert_to_utf8 (pos, 30, "ISO-8859-1"); - trim (id3->comment); - if ( (pos[28] == '\0') && - (pos[29] != '\0') ) - { - /* ID3v1.1 */ - id3->track_number = pos[29]; - } - else - { - id3->track_number = 0; - } - pos += 30; - id3->genre = ""; - if (pos[0] < GENRE_NAME_COUNT) - id3->genre = dgettext (PACKAGE, genre_names[(unsigned) pos[0]]); - return OK; -} - -static struct EXTRACTOR_Keywords * -addkword (EXTRACTOR_KeywordList * oldhead, - const char *phrase, EXTRACTOR_KeywordType type) -{ - EXTRACTOR_KeywordList *keyword; - - keyword = malloc (sizeof (EXTRACTOR_KeywordList)); - keyword->next = oldhead; - keyword->keyword = strdup (phrase); - keyword->keywordType = type; - return keyword; -} - - - -static struct EXTRACTOR_Keywords * -mp3parse (const unsigned char *data, size_t size, struct EXTRACTOR_Keywords *prev) -{ - unsigned int header; - int counter = 0; - char mpeg_ver = 0; - char layer = 0; - int idx_num = 0; - int bitrate = 0; /*used for each frame */ - int avg_bps = 0; /*average bitrate */ - int vbr_flag = 0; - int copyright_flag = 0; - int original_flag = 0; - int length = 0; - int sample_rate = 0; - int ch = 0; - int frame_size; - int frames = 0; - size_t pos = 0; - char *format; - - do - { - /* seek for frame start */ - if (pos + sizeof (header) > size) - { - return prev; - } /*unable to find header */ - header = (data[pos] << 24) | (data[pos+1] << 16) | - (data[pos+2] << 8) | data[pos+3]; - if ((header & MPA_SYNC_MASK) == MPA_SYNC_MASK) - break; /*found header sync */ - pos++; - counter++; /*next try */ - } - while (counter < MAX_MP3_SCAN_DEEP); - if (counter >= MAX_MP3_SCAN_DEEP) - { - return prev; - }; /*give up to find mp3 header */ - - prev = addkword (prev, "audio/mpeg", EXTRACTOR_MIMETYPE); - - do - { /*ok, now we found a mp3 frame header */ - frames++; - switch 
(header & (MPA_LAST_SYNC_BIT_MASK | MPA_VERSION_MASK)) - { - case (MPA_LAST_SYNC_BIT_MASK | MPA_VERSION_MASK): - mpeg_ver = MPEG_V1; - break; - case (MPA_LAST_SYNC_BIT_MASK): - mpeg_ver = MPEG_V2; - break; - case 0: - mpeg_ver = MPEG_V25; - break; - case (MPA_VERSION_MASK): - default: - mpeg_ver = MPEG_ERR; /*error */ - break; - } - switch (header & (MPA_LAYER_MASK << MPA_LAYER_SHIFT)) - { - case (0x1 << MPA_LAYER_SHIFT): - layer = LAYER_3; - break; - case (0x2 << MPA_LAYER_SHIFT): - layer = LAYER_2; - break; - case (0x3 << MPA_LAYER_SHIFT): - layer = LAYER_1; - break; - case 0x0: - default: - layer = LAYER_ERR; /*error */ - } - if (!layer || !mpeg_ver) - return prev; /*unknown mpeg type */ - if (mpeg_ver < MPEG_V25) - idx_num = (mpeg_ver - 1) * 3 + layer - 1; - else - idx_num = 2 + layer; - bitrate = 1000 * bitrate_table[(header >> MPA_BITRATE_SHIFT) & - MPA_BITRATE_MASK][idx_num]; - if (bitrate < 0) - { - frames--; - break; - } /*error in header */ - sample_rate = freq_table[(header >> MPA_FREQ_SHIFT) & - MPA_FREQ_MASK][mpeg_ver - 1]; - if (sample_rate < 0) - { - frames--; - break; - } /*error in header */ - ch = ((header >> MPA_CHMODE_SHIFT) & MPA_CHMODE_MASK); - copyright_flag = (header >> MPA_COPYRIGHT_SHIFT) & 0x1; - original_flag = (header >> MPA_ORIGINAL_SHIFT) & 0x1; - frame_size = - 144 * bitrate / (sample_rate ? sample_rate : 1) + - ((header >> MPA_PADDING_SHIFT) & 0x1); - avg_bps += bitrate / 1000; - - pos += frame_size - 4; - if (frames > max_frames_scan) - break; /*optimization */ - if (avg_bps / frames != bitrate / 1000) - vbr_flag = 1; - if (pos + sizeof (header) > size) - break; /* EOF */ - header = (data[pos] << 24) | (data[pos+1] << 16) | - (data[pos+2] << 8) | data[pos+3]; - } - while ((header & MPA_SYNC_MASK) == MPA_SYNC_MASK); - - if (!frames) - return prev; /*no valid frames */ - avg_bps = avg_bps / frames; - if (max_frames_scan) - { /*if not all frames scaned */ - length = - size / (avg_bps ? avg_bps : bitrate ? 
bitrate : 0xFFFFFFFF) / 125; - } - else - { - length = 1152 * frames / (sample_rate ? sample_rate : 0xFFFFFFFF); - } - - prev = addkword (prev, mpeg_versions[mpeg_ver-1], EXTRACTOR_RESOURCE_TYPE); - format = malloc (512); - snprintf (format, 512, "%s %s audio, %d kbps (%s), %d Hz, %s, %s, %s", - mpeg_versions[mpeg_ver-1], - layer_names[layer-1], - avg_bps, - vbr_flag ? _("VBR") : _("CBR"), - sample_rate, - channel_modes[ch], - copyright_flag ? _("copyright") : _("no copyright"), - original_flag ? _("original") : _("copy") ); - prev = addkword (prev, format, EXTRACTOR_FORMAT); - snprintf (format, 512, "%dm%02d", - length / 60, length % 60); - prev = addkword (prev, format, EXTRACTOR_DURATION); - free (format); - return prev; -} - - -/* mimetype = audio/mpeg */ -struct EXTRACTOR_Keywords * -libextractor_mp3_extract (const char *filename, - const char *data, - size_t size, struct EXTRACTOR_Keywords *klist) -{ - id3tag info; - char *word; - char track[16]; - - if (0 != get_id3 (data, size, &info)) - return klist; - - if (strlen (info.title) > 0) - klist = addkword (klist, info.title, EXTRACTOR_TITLE); - if (strlen (info.artist) > 0) - klist = addkword (klist, info.artist, EXTRACTOR_ARTIST); - if (strlen (info.album) > 0) - klist = addkword (klist, info.album, EXTRACTOR_ALBUM); - if (strlen (info.year) > 0) - klist = addkword (klist, info.year, EXTRACTOR_YEAR); - if (strlen (info.genre) > 0) - klist = addkword (klist, info.genre, EXTRACTOR_GENRE); - if (strlen (info.comment) > 0) - klist = addkword (klist, info.comment, EXTRACTOR_COMMENT); - if (info.track_number != 0) - { - snprintf(track, 15, "%u", info.track_number); - klist = addkword (klist, track, EXTRACTOR_TRACK_NUMBER); - } - - /* A keyword that has all of the information together) */ - word = malloc (strlen (info.artist) + strlen (info.title) + - strlen (info.album) + 6); - sprintf (word, "%s: %s (%s)", info.artist, info.title, info.album); - klist = addkword (klist, word, EXTRACTOR_DESCRIPTION); - - free 
(word); - free (info.title); - free (info.year); - free (info.album); - free (info.artist); - free (info.comment); - - return mp3parse ((unsigned char *) data, size, klist); -} - -/* end of mp3extractor.c */