fixing Mantis #1125 and bug in splitextractor - libextractor

commit 6725cce0bf18fcca79cf68b45b65298140ae9eab
parent d71ae6bfc70014e2fe7b9c55c20ff1573c288d30
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 29 Dec 2006 03:23:38 +0000

fixing Mantis #1125 and bug in splitextractor

Diffstat:
M ChangeLog  | 6 ++++++
M doc/extract.1  | 9 ++++++---
M src/main/extract.c  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M src/plugins/splitextractor.c  | 57 +++++++++++++++++++++++++++++++++++----------------------
M src/plugins/thumbnail/thumbnailextractor.c  | 9 +++++----

5 files changed, 114 insertions(+), 45 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,9 @@
+Thu Dec 28 20:22:20 MST 2006
+	Fixed bug in splitextractor, addressing also Mantis #1125.
+
+Thu Dec 28 18:12:15 MST 2006
+	Added -g (greppable output, Mantis #1157) option to extract.
+
 Mon Nov 20 22:08:55 EET 2006
 	Added an SID (C64 music file) plugin
 
diff --git a/doc/extract.1 b/doc/extract.1
@@ -1,4 +1,4 @@
-.TH EXTRACT 1 "April 28, 2005" "libextractor 0.5.11"
+.TH EXTRACT 1 "Dec 29, 2006" "libextractor 0.5.17"
 .\" $Id
 .SH NAME
 extract
@@ -6,7 +6,7 @@ extract
 .SH SYNOPSIS
 .B extract
 [
-.B \-abdfhLnrsvV
+.B \-abdfghLnrsvV
 ]
 [
 .B \-B
@@ -32,7 +32,7 @@ extract
 \&...
 .br
 .SH DESCRIPTION
-This manual page documents version 0.5.11 of the 
+This manual page documents version 0.5.17 of the 
 .B extract 
 command.
 .PP
@@ -63,6 +63,9 @@ Remove duplicates only if the types match exactly. By default, duplicates are re
 .B \-f
 add the filename(s) (without directory) to the list of keywords.
 .TP 8
+.B \-g
+Use grep-friendly output (all keywords on a single line for each file).  Use the verbose option to print the filename first, followed by the keywords.  This option will not print keyword types or non-textual metadata.
+.TP 8
 .B \-h
 Print a brief summary of the options.
 .TP 8
diff --git a/src/main/extract.c b/src/main/extract.c
@@ -132,6 +132,8 @@ printHelp ()
       gettext_noop("remove duplicates only if types match") },
     { 'f', "filename", NULL,
       gettext_noop("use the filename as a keyword (loads filename-extractor plugin)") },
+    { 'g', "grep-friendly", NULL,
+      gettext_noop("produce grep-friendly output (all results on one line per file)") },
     { 'h', "help", NULL,
       gettext_noop("print this help") },
     { 'H', "hash", "ALGORITHM",
@@ -167,7 +169,7 @@ printHelp ()
 
 /**
  * Print a keyword list to a file.
- * For debugging.
+ * 
  * @param handle the file to write to (stdout, stderr), may NOT be NULL
  * @param keywords the list of keywords to print, may be NULL
  * @param print array indicating which types to print
@@ -180,24 +182,19 @@ printSelectedKeywords(FILE * handle,
 {
   char * keyword;
   iconv_t cd;
-  char * buf;
 
-  cd = iconv_open(
-    nl_langinfo(CODESET)
-    , "UTF-8");
+  cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
   while (keywords != NULL) {
-    buf = NULL;
-    if (cd != (iconv_t) -1)
-      keyword = iconvHelper(cd,
-			    keywords->keyword);
-    else
-      keyword = strdup(keywords->keyword);
-
-    if (keywords->keywordType == EXTRACTOR_THUMBNAIL_DATA) {
+    if (EXTRACTOR_isBinaryType(keywords->keywordType)) {
       fprintf (handle,
 	       _("%s - (binary)\n"),
 	       _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)));
     } else {
+      if (cd != (iconv_t) -1)
+	keyword = iconvHelper(cd,
+			      keywords->keyword);
+      else
+	keyword = strdup(keywords->keyword);      
       if (NULL == EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)) {
 	if (verbose == YES) {
 	  fprintf(handle,
@@ -209,8 +206,44 @@ printSelectedKeywords(FILE * handle,
 		 "%s - %s\n",
 		 _(EXTRACTOR_getKeywordTypeAsString(keywords->keywordType)),
 		 keyword);
+      free(keyword);
+    }
+    keywords = keywords->next;
+  }
+  if (cd != (iconv_t) -1)
+    iconv_close(cd);
+}
+
+/**
+ * Print a keyword list to a file in a grep-friendly manner.
+ *
+ * @param handle the file to write to (stdout, stderr), may NOT be NULL
+ * @param keywords the list of keywords to print, may be NULL
+ * @param print array indicating which types to print
+ */
+static void
+printSelectedKeywordsGrepFriendly(FILE * handle,
+				  EXTRACTOR_KeywordList * keywords,
+				  const int * print,
+				  const int verbose)
+{
+  char * keyword;
+  iconv_t cd;
+
+  cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
+  while (keywords != NULL) {
+    if ( (EXTRACTOR_isBinaryType(EXTRACTOR_THUMBNAIL_DATA)) &&
+	 (print[keywords->keywordType] == YES) ) {
+      if (cd != (iconv_t) -1)
+	keyword = iconvHelper(cd,
+			      keywords->keyword);
+      else
+	keyword = strdup(keywords->keyword);
+      fprintf (handle,
+	       (keywords->next == NULL) ? "%s" : "%s ",
+	       keyword); 
+      free(keyword);
     }
-    free(keyword);
     keywords = keywords->next;
   }
   if (cd != (iconv_t) -1)
@@ -390,6 +423,7 @@ main (int argc, char *argv[])
   int defaultAll = YES;
   int duplicates = EXTRACTOR_DUPLICATES_REMOVE_UNKNOWN;
   int bibtex = NO;
+  int grepfriendly = NO;
   char * binary = NULL;
   int ret = 0;
 
@@ -413,6 +447,7 @@ main (int argc, char *argv[])
 	{"bibtex", 0, 0, 'b'},
 	{"duplicates", 0, 0, 'd'},
 	{"filename", 0, 0, 'f'},
+	{"grep-friendly", 0, 0, 'g'},
 	{"help", 0, 0, 'h'},
 	{"hash", 1, 0, 'H'},
 	{"list", 0, 0, 'L'},
@@ -451,6 +486,9 @@ main (int argc, char *argv[])
 	case 'f':
 	  useFilename = YES;
 	  break;
+	case 'g':
+	  grepfriendly = YES;
+	  break;
 	case 'h':
 	  printHelp();
 	  return 0;
@@ -612,10 +650,18 @@ main (int argc, char *argv[])
     }
     if ( (duplicates != -1) || (bibtex == YES))
       keywords = EXTRACTOR_removeDuplicateKeywords (keywords, duplicates);
-    if (verbose == YES && bibtex == NO)
-      printf (_("Keywords for file %s:\n"), argv[i]);
+    if ( ( (verbose == YES) || (grepfriendly == YES) )
+	 && (bibtex == NO) ) {
+      if (grepfriendly == YES)
+	printf ("%s", argv[i]);
+      else 
+	printf (_("Keywords for file %s:"), 
+		argv[i]);
+    }
     if (bibtex == YES)
       printSelectedKeywordsBibtex (stdout, keywords, print, argv[i]);
+    else if (grepfriendly == YES)
+      printSelectedKeywordsGrepFriendly(stdout, keywords, print, verbose);
     else
       printSelectedKeywords (stdout, keywords, print, verbose);
     if (verbose == YES && bibtex == NO)
diff --git a/src/plugins/splitextractor.c b/src/plugins/splitextractor.c
@@ -21,53 +21,63 @@
 #include "platform.h"
 #include "extractor.h"
 
-static char * TOKENIZERS = "._ ,%@-\n_[](){}";
+/**
+ * Default split characters.
+ */
+static const char * TOKENIZERS = "._ ,%@-\n_[](){}";
+
+/**
+ * Do not use keywords shorter than this minimum
+ * length.
+ */
 static int MINIMUM_KEYWORD_LENGTH = 4;
 
 static void addKeyword(struct EXTRACTOR_Keywords ** list,
-		       const char * keyword,
-		       EXTRACTOR_KeywordType type) {
+		       const char * keyword) {
   EXTRACTOR_KeywordList * next;
   next = malloc(sizeof(EXTRACTOR_KeywordList));
   next->next = *list;
   next->keyword = strdup(keyword);
-  next->keywordType = type;
+  next->keywordType = EXTRACTOR_SPLIT;
   *list = next;
 }
 
 static int token(char letter,
 		 const char * options) {
-  int i;
-  
-  if (options == NULL)
-    options = TOKENIZERS;
-  for (i=0;i<strlen(TOKENIZERS);i++)
-    if (letter == TOKENIZERS[i])
+  size_t i;
+
+  i = 0;
+  while (options[i] != '\0') {
+    if (letter == options[i])
       return 1;
+    i++;
+  }
   return 0;
 }
 
 static void splitKeywords(const char * keyword,
-			  EXTRACTOR_KeywordType type,
 			  struct EXTRACTOR_Keywords ** list,
 			  const char * options) {
   char * dp;
-  int pos;
-  int last;
-  int len;
+  size_t pos;
+  size_t last;
+  size_t len;
 
   dp = strdup(keyword);
   len = strlen(dp);
   pos = 0;
   last = 0;
   while (pos < len) {
-    while ((!token(dp[pos],
-																			options)) && (pos < len))
+    while ( (0 == token(dp[pos], options)) &&
+	    (pos < len) )
+      pos++;
+    dp[pos++] = '\0';
+    if (pos - last > MINIMUM_KEYWORD_LENGTH) 
+      addKeyword(list, 
+		 &dp[last]);    
+    while ( (1 == token(dp[pos], options)) &&
+	    (pos < len) )
       pos++;
-    dp[pos++] = 0;
-    if (strlen(&dp[last]) >= MINIMUM_KEYWORD_LENGTH) {
-      addKeyword(list, &dp[last], type);
-    }
     last = pos;
   }
   free(dp);
@@ -82,13 +92,16 @@ libextractor_split_extract(const char * filename,
 			   const char * options) {
   struct EXTRACTOR_Keywords * pos;
 
+  if (options == NULL)
+    options = TOKENIZERS;
   pos = prev;
   while (pos != NULL) {
-    splitKeywords(pos->keyword,
-		  EXTRACTOR_SPLIT,
+    splitKeywords(pos->keyword,		  
 		  &prev,
 		  options);
     pos = pos->next;
   }
   return prev;
 }
+
+/* end of splitextractor.c */
diff --git a/src/plugins/thumbnail/thumbnailextractor.c b/src/plugins/thumbnail/thumbnailextractor.c
@@ -75,10 +75,11 @@ static char * whitelist[] = {
   NULL,
 };
 
-struct EXTRACTOR_Keywords * libextractor_thumbnail_extract(const char * filename,
-							   const unsigned char * data,
-							   size_t size,
-							   struct EXTRACTOR_Keywords * prev) {
+struct EXTRACTOR_Keywords *
+libextractor_thumbnail_extract(const char * filename,
+			       const unsigned char * data,
+			       size_t size,
+			       struct EXTRACTOR_Keywords * prev) {
   GdkPixbufLoader * loader;
   GdkPixbuf * in;
   GdkPixbuf * out;

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	ChangeLog	\|	6	++++++
M	doc/extract.1	\|	9	++++++---
M	src/main/extract.c	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	src/plugins/splitextractor.c	\|	57	+++++++++++++++++++++++++++++++++++----------------------
M	src/plugins/thumbnail/thumbnailextractor.c	\|	9	+++++----