libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 069ebf37916afe510bcb98af4a87db8e929c6053
parent 42b645632bfdde2c24fb8ae7d2e76ecf0f76697f
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 23 Sep 2005 03:56:15 +0000

sync

Diffstat:
M AUTHORS | 2 +-
M ChangeLog | 7 +++++++
M po/de.po | 93 +++++++++++++++++++++++++++++--------------------------------------------------
M src/plugins/tarextractor.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
4 files changed, 150 insertions(+), 98 deletions(-)

diff --git a/AUTHORS b/AUTHORS @@ -21,7 +21,7 @@ printable - core team based in part on code from GNUnet (bloom filters) avi - core team based in part on code from avinfo 1.0.0 alpha 11 and bitcollider 0.6.0 mpeg - core team based in part on code from avinfo 1.0.0 alpha 11 and bitcollider 0.6.0 ole2 - core team based on code from libgsf -tar - core team +tar - core team and Ronan MELENNEC <ronan.melennec@cena.fr> tar.gz - core team using zlib deb - core team using zlib man - core team using zlib (for man.gz) diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,10 @@ +Thu Sep 22 21:05:53 PDT 2005 + Improved TAR extractor: + - it now accepts old-style (UNIX V7) archives + - it produces a mimetype for old-style archives + - it outputs the file names in the same order as in the TAR file + - its end-of-file mark detection is more robust + Wed Sep 21 13:54:19 PDT 2005 Added Irish translation. diff --git a/po/de.po b/po/de.po @@ -7,10 +7,10 @@ # msgid "" msgstr "" -"Project-Id-Version: libextractor 0.5.6\n" +"Project-Id-Version: libextractor 0.5.6a\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" "POT-Creation-Date: 2005-09-20 23:59-0700\n" -"PO-Revision-Date: 2005-09-21 07:27+0200\n" +"PO-Revision-Date: 2005-09-22 10:07+0200\n" "Last-Translator: Karl Eichwalder <ke@suse.de>\n" "Language-Team: German <de@li.org>\n" "MIME-Version: 1.0\n" @@ -21,7 +21,7 @@ msgstr "" #: src/plugins/language/language-compiler.c:37 #, c-format msgid "Please provide a list of klp files as arguments.\n" -msgstr "" +msgstr "Geben Sie eine Liste der klp-Dateien als Argument an.\n" #: src/plugins/language/language-compiler.c:48 #: src/plugins/printable/dictionary-builder.c:113 @@ -228,9 +228,8 @@ msgid "Pranks" msgstr "Punk" #: src/plugins/mp3extractor.c:73 -#, fuzzy msgid "Soundtrack" -msgstr "Sonate" +msgstr "Filmmusik (Soundtrack)" #: src/plugins/mp3extractor.c:74 #, fuzzy @@ -272,7 +271,7 @@ msgstr "Instrumental" #: src/plugins/mp3extractor.c:83 msgid "Acid" -msgstr "" +msgstr "Acid" #: 
src/plugins/mp3extractor.c:84 msgid "House" @@ -390,9 +389,8 @@ msgid "Christian Rap" msgstr "" #: src/plugins/mp3extractor.c:111 -#, fuzzy msgid "Pop/Funk" -msgstr "Punk" +msgstr "Pop/Funk" #: src/plugins/mp3extractor.c:112 msgid "Jungle" @@ -415,9 +413,8 @@ msgid "Psychedelic" msgstr "" #: src/plugins/mp3extractor.c:117 -#, fuzzy msgid "Rave" -msgstr "Reggae" +msgstr "Rave" #: src/plugins/mp3extractor.c:118 msgid "Showtunes" @@ -446,27 +443,24 @@ msgid "Acid Jazz" msgstr "Jazz" #: src/plugins/mp3extractor.c:124 -#, fuzzy msgid "Polka" -msgstr "Folk" +msgstr "Polka" #: src/plugins/mp3extractor.c:125 msgid "Retro" msgstr "" #: src/plugins/mp3extractor.c:126 -#, fuzzy msgid "Musical" -msgstr "Klassik" +msgstr "Musical" #: src/plugins/mp3extractor.c:127 msgid "Rock & Roll" msgstr "Rock & Roll" #: src/plugins/mp3extractor.c:128 -#, fuzzy msgid "Hard Rock" -msgstr "Rock" +msgstr "Hard Rock" #: src/plugins/mp3extractor.c:129 msgid "Folk" @@ -491,12 +485,11 @@ msgstr "Fusion" #: src/plugins/mp3extractor.c:134 msgid "Bebob" -msgstr "" +msgstr "Bebob" #: src/plugins/mp3extractor.c:135 -#, fuzzy msgid "Latin" -msgstr "Ort" +msgstr "Latin" #: src/plugins/mp3extractor.c:136 msgid "Revival" @@ -513,7 +506,7 @@ msgstr "Blues" #: src/plugins/mp3extractor.c:139 msgid "Avantgarde" -msgstr "" +msgstr "Avantgarde" #: src/plugins/mp3extractor.c:140 #, fuzzy @@ -540,11 +533,11 @@ msgstr "Rock" #: src/plugins/mp3extractor.c:145 msgid "Big Band" -msgstr "" +msgstr "Big Band" #: src/plugins/mp3extractor.c:146 msgid "Chorus" -msgstr "" +msgstr "Chor" #: src/plugins/mp3extractor.c:147 msgid "Easy Listening" @@ -595,9 +588,8 @@ msgid "Porn Groove" msgstr "" #: src/plugins/mp3extractor.c:159 -#, fuzzy msgid "Satire" -msgstr "Datum" +msgstr "Satire" #: src/plugins/mp3extractor.c:160 msgid "Slow Jam" @@ -616,9 +608,8 @@ msgid "Samba" msgstr "Samba" #: src/plugins/mp3extractor.c:164 -#, fuzzy msgid "Folklore" -msgstr "Folk" +msgstr "Folklore" #: src/plugins/mp3extractor.c:165 msgid "Ballad" @@ 
-704,7 +695,7 @@ msgstr "" #: src/plugins/mp3extractor.c:184 msgid "Beat" -msgstr "" +msgstr "Beat" #: src/plugins/mp3extractor.c:185 msgid "Christian Gangsta Rap" @@ -729,9 +720,8 @@ msgid "Contemporary Christian" msgstr "" #: src/plugins/mp3extractor.c:190 -#, fuzzy msgid "Christian Rock" -msgstr "Klassischer Rock" +msgstr "Christlicher Rock" #: src/plugins/mp3extractor.c:191 msgid "Merengue" @@ -830,8 +820,7 @@ msgstr "" #: src/main/extract.c:52 #, c-format -msgid "" -"Arguments mandatory for long options are also mandatory for short options.\n" +msgid "Arguments mandatory for long options are also mandatory for short options.\n" msgstr "" "Argumente, die für lange Optionen notwendig sind, sind ebenfalls für die\n" "Optionen in Kurzform notwendig.\n" @@ -845,12 +834,8 @@ msgid "print output in bibtex format" msgstr "Ausgabe im BibTeX format" #: src/main/extract.c:130 -msgid "" -"use the generic plaintext extractor for the language with the 2-letter " -"language code LANG" -msgstr "" -"generischen Klartext-extractor für die Sprache mit dem 2-Buchstabenkürzel " -"LANG verwenden" +msgid "use the generic plaintext extractor for the language with the 2-letter language code LANG" +msgstr "generischen Klartext-extractor für die Sprache mit dem 2-Buchstabenkürzel LANG verwenden" #: src/main/extract.c:132 msgid "remove duplicates only if types match" @@ -858,9 +843,7 @@ msgstr "doppelte Einträge nur entfernen, wenn die Art übereinstimmt" #: src/main/extract.c:134 msgid "use the filename as a keyword (loads filename-extractor plugin)" -msgstr "" -"Dateinamen als Schlüsselwort verwenden (filename-extractor-Erweiterung wird " -"geladen)" +msgstr "Dateinamen als Schlüsselwort verwenden (filename-extractor-Erweiterung wird geladen)" #: src/main/extract.c:136 msgid "print this help" @@ -868,8 +851,7 @@ msgstr "diese Hilfe anzeigen" #: src/main/extract.c:138 msgid "compute hash using the given ALGORITHM (currently sha1 or md5)" -msgstr "" -"Hash gemäß dem angegebenen 
ALGORITHMUS errechnen (z.Zt. »sha1« oder »md5«)" +msgstr "Hash gemäß dem angegebenen ALGORITHMUS errechnen (z.Zt. »sha1« oder »md5«)" #: src/main/extract.c:140 msgid "load an extractor plugin named LIBRARY" @@ -885,9 +867,7 @@ msgstr "Standardsatz der extractor-Erweiterungen nicht verwenden" #: src/main/extract.c:146 msgid "print only keywords of the given TYPE (use -L to get a list)" -msgstr "" -"nur Schlüsselwörter einer bestimmten ART ausgeben (mit -L die Liste anzeigen " -"lassen)" +msgstr "nur Schlüsselwörter einer bestimmten ART ausgeben (mit -L die Liste anzeigen lassen)" #: src/main/extract.c:148 msgid "remove duplicates even if keyword types do not match" @@ -976,11 +956,9 @@ msgid "page count" msgstr "Seitenanzahl" #: src/main/extract.c:473 -#, fuzzy, c-format +#, c-format msgid "You must specify an argument for the `%s' option (option ignored).\n" -msgstr "" -"Sie müssen ein Argument für die Option »%s« angeben (Option wird " -"ignoriert).\n" +msgstr "Sie müssen ein Argument für die Option »%s« angeben (Option wird ignoriert).\n" #: src/main/extract.c:540 #, c-format @@ -1239,9 +1217,8 @@ msgid "binary thumbnail data" msgstr "" #: src/main/extractor.c:115 -#, fuzzy msgid "publication date" -msgstr "Datum der Erstellung" +msgstr "Datum der Veröffentlichung" #: src/main/extractor.c:116 msgid "camera make" @@ -1265,7 +1242,7 @@ msgstr "" #: src/main/extractor.c:121 msgid "flash" -msgstr "" +msgstr "Blitz" #: src/main/extractor.c:122 msgid "flash bias" @@ -1297,7 +1274,7 @@ msgstr "" #: src/main/extractor.c:129 msgid "image quality" -msgstr "" +msgstr "Bildqualität" #: src/main/extractor.c:130 msgid "white balance" @@ -1315,20 +1292,18 @@ msgstr "Initialisierung des Plugin-Mechanismus' ist fehlgeschlagen: %s.\n" #: src/main/extractor.c:372 #, c-format -msgid "" -"Resolving symbol `%s' in library `%s' failed, so I tried `%s', but that " -"failed also. 
Errors are: `%s' and `%s'.\n" +msgid "Resolving symbol `%s' in library `%s' failed, so I tried `%s', but that failed also. Errors are: `%s' and `%s'.\n" msgstr "" #: src/main/extractor.c:401 -#, fuzzy, c-format +#, c-format msgid "Loading `%s' plugin failed: %s\n" msgstr "Laden des »%s«-Plugins ist fehlgeschlagen: %s\n" #: src/main/extractor.c:606 -#, fuzzy, c-format +#, c-format msgid "Unloading plugin `%s' failed!\n" -msgstr "Entladen des »%s«-Erweiterung ist fehlgeschlagen.\n" +msgstr "Entladen des »%s«-Plugins ist fehlgeschlagen!\n" #~ msgid "Fatal: could not allocate (%s at %s:%d).\n" #~ msgstr "Fatal: Allokieren nicht möglich (%s bei %s:%d).\n" diff --git a/src/plugins/tarextractor.c b/src/plugins/tarextractor.c @@ -20,43 +20,42 @@ #include "platform.h" #include "extractor.h" -#include <zlib.h> /* * Note that this code is not complete! + * It will not report correct results for very long member filenames + * (> 99 octets) when the archive was made with GNU tar or Solaris tar. * * References: * http://www.mkssoftware.com/docs/man4/tar.4.asp + * (does document USTAR format common nowadays, + * but not other extended formats such as the one produced + * by GNU tar 1.13 when very long filenames are met.) 
*/ - -static EXTRACTOR_KeywordList * addKeyword(EXTRACTOR_KeywordType type, - char * keyword, - EXTRACTOR_KeywordList * next) { +static EXTRACTOR_KeywordList * appendKeyword(EXTRACTOR_KeywordType type, + char * keyword, + EXTRACTOR_KeywordList * last) { EXTRACTOR_KeywordList * result; + if ( (last != NULL) && + (last->next != NULL) ) + abort(); if (keyword == NULL) - return next; + return last; if (strlen(keyword) == 0) { free(keyword); - return next; + return last; } result = malloc(sizeof(EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = keyword; + result->next = last; result->keywordType = type; + result->keyword = keyword; + if (last != NULL) + last->next = result; return result; } -static char * stndup(const char * str, - size_t n) { - char * tmp; - tmp = malloc(n+1); - tmp[n] = '\0'; - memcpy(tmp, str, n); - return tmp; -} - typedef struct { char name[100]; char mode[8]; @@ -86,53 +85,124 @@ libextractor_tar_extract(const char * filename, const char * data, size_t size, struct EXTRACTOR_Keywords * prev) { - TarHeader * tar; - USTarHeader * ustar; + const TarHeader * tar; + const USTarHeader * ustar; size_t pos; + const char * mimetype = NULL; + struct EXTRACTOR_Keywords * last; + + last = prev; + if (last != NULL) + while (last->next != NULL) + last = last->next; if (0 != (size % 512) ) return prev; /* cannot be tar! */ if (size < 1024) - return prev; - size -= 1024; /* last 2 blocks are all zeros */ - /* fixme: we may want to check that the last - 1024 bytes are all zeros here... 
*/ + return prev; /* too short, or somehow truncated */ pos = 0; while (pos + sizeof(TarHeader) < size) { unsigned long long fsize; char buf[13]; + const char * nul_pos; + const char * ustar_prefix = NULL; + unsigned int ustar_prefix_length = 0; + unsigned int tar_name_length; + unsigned int zeropos; + int header_is_empty = 1; + + if (pos + 1024 < size) { + const int * idata = (const int*) data; + for (zeropos = 0; zeropos < 1024 / sizeof(int); zeropos++) { + if(0 != idata[zeropos]) { + header_is_empty = 0; + break; + } + } + } - tar = (TarHeader*) &data[pos]; + if (header_is_empty) /* assume the EOF mark was reached */ + break; + + tar = (const TarHeader*) &data[pos]; /* fixme: we may want to check the header checksum here... */ + /* fixme: we attempt to follow MKS document for long file names, + but no TAR file was found yet which matched what we understood ! */ if (pos + sizeof(USTarHeader) < size) { - ustar = (USTarHeader*) &data[pos]; + + nul_pos = memchr(data + pos, 0, sizeof tar->name); + tar_name_length = (0 == nul_pos) + ? sizeof(tar->name) + : (nul_pos - (data + pos)); + + ustar = (const USTarHeader*) &data[pos]; + + if(0 == mimetype) { + if(0 == memcmp(ustar->magic, "ustar ", 7)) + mimetype = "application/x-gtar"; + else + mimetype = "application/x-tar"; + } + if (0 == strncmp("ustar", - &ustar->magic[0], - strlen("ustar"))) - pos += 512; /* sizeof(USTarHeader); */ - else - pos += 257; /* sizeof(TarHeader); minus gcc alignment... */ + &ustar->magic[0], + strlen("ustar"))) { + if(0 != *ustar->prefix) { + nul_pos = memchr(ustar->prefix, 0, sizeof ustar->prefix); + + ustar_prefix_length = (0 == nul_pos) + ? sizeof ustar->prefix + : nul_pos - ustar->prefix; + ustar_prefix = ustar->prefix; + } + } + + pos += 512; /* V7 Tar, USTar and GNU Tar usual headers take 512 octets */ } else { pos += 257; /* sizeof(TarHeader); minus gcc alignment... */ } memcpy(buf, &tar->filesize[0], 12); buf[12] = '\0'; if (1 != sscanf(buf, "%12llo", &fsize)) /* octal! Yuck yuck! 
*/ - return prev; + break; if ( (pos + fsize > size) || (fsize > size) || (pos + fsize < pos) ) - return prev; - prev = addKeyword(EXTRACTOR_FILENAME, - stndup(&tar->name[0], - 100), - prev); + break; + + if (0 < ustar_prefix_length + tar_name_length) { + char * fname = malloc(1 + ustar_prefix_length + tar_name_length); + + if(0 != fname) { + if(0 < ustar_prefix_length) + memcpy(fname, ustar_prefix, ustar_prefix_length); + if(0 < tar_name_length) + memcpy(fname + ustar_prefix_length, tar->name, tar_name_length); + fname[ustar_prefix_length + tar_name_length]= '\0'; + last = appendKeyword(EXTRACTOR_FILENAME, fname, last); + if (prev == NULL) + prev = last; + } + } + if ( (fsize & 511) != 0) fsize = (fsize | 511)+1; /* round up! */ if (pos + fsize < pos) - return prev; + break; pos += fsize; } + + /* + * a simple guard would be to clobber mimetype to NULL + * whenever something bad happens while reading + * (check break instructions just above). + */ + if (NULL != mimetype) { + last = appendKeyword(EXTRACTOR_MIMETYPE, strdup(mimetype), last); + if (prev == NULL) + prev = last; + } + return prev; }