libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 6725cce0bf18fcca79cf68b45b65298140ae9eab
parent d71ae6bfc70014e2fe7b9c55c20ff1573c288d30
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 29 Dec 2006 03:23:38 +0000

fixing Mantis #1125 and bug in splitextractor

Diffstat:
MChangeLog | 6++++++
Mdoc/extract.1 | 9++++++---
Msrc/main/extract.c | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Msrc/plugins/splitextractor.c | 57+++++++++++++++++++++++++++++++++++----------------------
Msrc/plugins/thumbnail/thumbnailextractor.c | 9+++++----
5 files changed, 114 insertions(+), 45 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,9 @@ +Thu Dec 28 20:22:20 MST 2006 + Fixed bug in splitextractor, addressing also Mantis #1125. + +Thu Dec 28 18:12:15 MST 2006 + Added -g (greppable output, Mantis #1157) option to extact. + Mon Nov 20 22:08:55 EET 2006 Added an SID (C64 music file) plugin diff --git a/doc/extract.1 b/doc/extract.1 @@ -1,4 +1,4 @@ -.TH EXTRACT 1 "April 28, 2005" "libextractor 0.5.11" +.TH EXTRACT 1 "Dec 29, 2006" "libextractor 0.5.17" .\" $Id .SH NAME extract @@ -6,7 +6,7 @@ extract .SH SYNOPSIS .B extract [ -.B \-abdfhLnrsvV +.B \-abdfghLnrsvV ] [ .B \-B @@ -32,7 +32,7 @@ extract \&... .br .SH DESCRIPTION -This manual page documents version 0.5.11 of the +This manual page documents version 0.5.17 of the .B extract command. .PP @@ -63,6 +63,9 @@ Remove duplicates only if the types match exactly. By default, duplicates are re .B \-f add the filename(s) (without directory) to the list of keywords. .TP 8 +.B \-g +Use grep-friendly output (all keywords on a single line for each file). Use the verbose option to print the filename first, followed by the keywords. This option will not print keyword types or non-textual metadata. +.TP 8 .B \-h Print a brief summary of the options. .TP 8 diff --git a/src/main/extract.c b/src/main/extract.c @@ -132,6 +132,8 @@ printHelp () gettext_noop("remove duplicates only if types match") }, { 'f', "filename", NULL, gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") }, + { 'g', "grep-friendly", NULL, + gettext_noop("produce grep-friendly output (all results on one line per file)") }, { 'h', "help", NULL, gettext_noop("print this help") }, { 'H', "hash", "ALGORITHM", @@ -167,7 +169,7 @@ printHelp () /** * Print a keyword list to a file. - * For debugging. + * * @param handle the file to write to (stdout, stderr), may NOT be NULL * @param keywords the list of keywords to print, may be NULL * @param print array indicating which types to print @@ -180,24 +182,19 @@ printSelectedKeywords(FILE * handle, { char * keyword; iconv_t cd; - char * buf; - cd = iconv_open( - nl_langinfo(CODESET) - , "UTF-8"); + cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); while (keywords != NULL) { - buf = NULL; - if (cd != (iconv_t) -1) - keyword = iconvHelper(cd, - keywords->keyword); - else - keyword = strdup(keywords->keyword); - - if (keywords->keywordType == EXTRACTOR_THUMBNAIL_DATA) { + if (EXTRACTOR_isBinaryType(keywords->keywordType)) { fprintf (handle, _("%s - (binary)\n"), _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType))); } else { + if (cd != (iconv_t) -1) + keyword = iconvHelper(cd, + keywords->keyword); + else + keyword = strdup(keywords->keyword); if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) { if (verbose == YES) { fprintf(handle, @@ -209,8 +206,44 @@ printSelectedKeywords(FILE * handle, "%s - %s\n", _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)), keyword); + free(keyword); + } + keywords = keywords->next; + } + if (cd != (iconv_t) -1) + iconv_close(cd); +} + +/** + * Print a keyword list to a file in a grep-friendly manner. + * + * @param handle the file to write to (stdout, stderr), may NOT be NULL + * @param keywords the list of keywords to print, may be NULL + * @param print array indicating which types to print + */ +static void +printSelectedKeywordsGrepFriendly(FILE * handle, + EXTRACTOR_KeywordList * keywords, + const int * print, + const int verbose) +{ + char * keyword; + iconv_t cd; + + cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); + while (keywords != NULL) { + if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) && + (print[keywords->keywordType] == YES) ) { + if (cd != (iconv_t) -1) + keyword = iconvHelper(cd, + keywords->keyword); + else + keyword = strdup(keywords->keyword); + fprintf (handle, + (keywords->next == NULL) ? "%s" : "%s ", + keyword); + free(keyword); } - free(keyword); keywords = keywords->next; } if (cd != (iconv_t) -1) @@ -390,6 +423,7 @@ main (int argc, char *argv[]) int defaultAll = YES; int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN; int bibtex = NO; + int grepfriendly = NO; char * binary = NULL; int ret = 0; @@ -413,6 +447,7 @@ main (int argc, char *argv[]) {"bibtex", 0, 0, 'b'}, {"duplicates", 0, 0, 'd'}, {"filename", 0, 0, 'f'}, + {"grep-friendly", 0, 0, 'g'}, {"help", 0, 0, 'h'}, {"hash", 1, 0, 'H'}, {"list", 0, 0, 'L'}, @@ -451,6 +486,9 @@ main (int argc, char *argv[]) case 'f': useFilename = YES; break; + case 'g': + grepfriendly = YES; + break; case 'h': printHelp(); return 0; @@ -612,10 +650,18 @@ main (int argc, char *argv[]) } if ( (duplicates != -1) || (bibtex == YES)) keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates); - if (verbose == YES && bibtex == NO) - printf (_("Keywords for file %s:\n"), argv[i]); + if ( ( (verbose == YES) || (grepfriendly == YES) ) + && (bibtex == NO) ) { + if (grepfriendly == YES) + printf ("%s", argv[i]); + else + printf (_("Keywords for file %s:"), + argv[i]); + } if (bibtex == YES) printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]); + else if (grepfriendly == YES) + printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose); else printSelectedKeywords (stdout, keywords, print, verbose); if (verbose == YES && bibtex == NO) diff --git a/src/plugins/splitextractor.c b/src/plugins/splitextractor.c @@ -21,53 +21,63 @@ #include "platform.h" #include "extractor.h" -static char * TOKENIZERS = "._ ,%@-\n_[](){}"; +/** + * Default split characters. + */ +static const char * TOKENIZERS = "._ ,%@-\n_[](){}"; + +/** + * Do not use keywords shorter than this minimum + * length. + */ static int MINIMUM_KEYWORD_LENGTH = 4; static void addKeyword(struct EXTRACTOR_Keywords ** list, - const char * keyword, - EXTRACTOR_KeywordType type) { + const char * keyword) { EXTRACTOR_KeywordList * next; next = malloc(sizeof(EXTRACTOR_KeywordList)); next->next = *list; next->keyword = strdup(keyword); - next->keywordType = type; + next->keywordType = EXTRACTOR_SPLIT; *list = next; } static int token(char letter, const char * options) { - int i; - - if (options == NULL) - options = TOKENIZERS; - for (i=0;i<strlen(TOKENIZERS);i++) - if (letter == TOKENIZERS[i]) + size_t i; + + i = 0; + while (options[i] != '\0') { + if (letter == options[i]) return 1; + i++; + } return 0; } static void splitKeywords(const char * keyword, - EXTRACTOR_KeywordType type, struct EXTRACTOR_Keywords ** list, const char * options) { char * dp; - int pos; - int last; - int len; + size_t pos; + size_t last; + size_t len; dp = strdup(keyword); len = strlen(dp); pos = 0; last = 0; while (pos < len) { - while ((!token(dp[pos], - options)) && (pos < len)) + while ( (0 == token(dp[pos], options)) && + (pos < len) ) + pos++; + dp[pos++] = '\0'; + if (pos - last > MINIMUM_KEYWORD_LENGTH) + addKeyword(list, + &dp[last]); + while ( (1 == token(dp[pos], options)) && + (pos < len) ) pos++; - dp[pos++] = 0; - if (strlen(&dp[last]) >= MINIMUM_KEYWORD_LENGTH) { - addKeyword(list, &dp[last], type); - } last = pos; } free(dp); @@ -82,13 +92,16 @@ libextractor_split_extract(const char * filename, const char * options) { struct EXTRACTOR_Keywords * pos; + if (options == NULL) + options = TOKENIZERS; pos = prev; while (pos != NULL) { - splitKeywords(pos->keyword, - EXTRACTOR_SPLIT, + splitKeywords(pos->keyword, &prev, options); pos = pos->next; } return prev; } + +/* end of splitextractor.c */ diff --git a/src/plugins/thumbnail/thumbnailextractor.c b/src/plugins/thumbnail/thumbnailextractor.c @@ -75,10 +75,11 @@ static char * whitelist[] = { NULL, }; -struct EXTRACTOR_Keywords * libextractor_thumbnail_extract(const char * filename, - const unsigned char * data, - size_t size, - struct EXTRACTOR_Keywords * prev) { +struct EXTRACTOR_Keywords * +libextractor_thumbnail_extract(const char * filename, + const unsigned char * data, + size_t size, + struct EXTRACTOR_Keywords * prev) { GdkPixbufLoader * loader; GdkPixbuf * in; GdkPixbuf * out;