le0512 - libextractor - GNU libextractor

commit 31e68de5d7e2877dc6e907a3f073dd2f0e27d469
parent fde2e0a5b59e6beb940e2930db8e01cf250d3c90
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 22 Apr 2006 18:28:39 +0000

le0512

Diffstat:
M ChangeLog  | 4 ++++
M NEWS  | 4 ++++
M configure.ac  | 4 ++--
M contrib/doxygen  | 2 +-
M po/de.po  | 3 ++-
M po/ga.po  | 3 ++-
M po/libextractor.pot  | 3 ++-
M po/ro.po  | 3 ++-
M po/rw.po  | 3 ++-
M src/plugins/pdf/pdfextractor.cc  | 49 ++++++-------------------------------------------
M src/plugins/pdfextractor.c  | 2 +-
M src/plugins/printable/dictionary-builder.c  | 6 +++---
M src/plugins/printable/printableextractor.h  | 9 +++------

13 files changed, 34 insertions(+), 61 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,7 @@
+Sat Apr 22 11:18:56 PDT 2006
+	Final touches to new build of printable extractors.
+	Releasing libextractor 0.5.12.
+
 Tue Apr 18 14:44:37 PDT 2006
 	Improved memory utilization for printable extractors
 	at compile time.  Added dictionaries for Finnish,
diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+Tue Apr 18 14:44:37 PDT 2006
+	Added dictionaries for Finnish, French, Gaelic and Swedish
+	(for printable extractors).
+
 Thu Mar  9 17:55:09 PST 2006
 	Word history extraction works (wordleaker).
 
diff --git a/configure.ac b/configure.ac
@@ -1,8 +1,8 @@
 # Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.57)
-AC_INIT([libextractor], [0.5.11], [bug-libextractor@gnu.org])
+AC_INIT([libextractor], [0.5.12], [bug-libextractor@gnu.org])
 AC_REVISION($Revision: 1.67 $)
-AM_INIT_AUTOMAKE([libextractor], [0.5.11])
+AM_INIT_AUTOMAKE([libextractor], [0.5.12])
 AM_CONFIG_HEADER(src/include/config.h)
 
 AH_TOP([#define _GNU_SOURCE  1])
diff --git a/contrib/doxygen b/contrib/doxygen
@@ -23,7 +23,7 @@ PROJECT_NAME           = libextractor
 # This could be handy for archiving the generated documentation or 
 # if some version control system is used.
 
-PROJECT_NUMBER         = 0.5.11
+PROJECT_NUMBER         = 0.5.12
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
 # base path where the generated documentation will be put. 
diff --git a/po/de.po b/po/de.po
@@ -9,7 +9,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: libextractor 0.5.6a\n"
 "Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
 "PO-Revision-Date: 2005-09-22 10:07+0200\n"
 "Last-Translator: Karl Eichwalder <ke@suse.de>\n"
 "Language-Team: German <de@li.org>\n"
@@ -1300,6 +1300,7 @@ msgstr ""
 "erstellen.  Zum Beispiel:\n"
 
 #: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
 #, c-format
 msgid "Error opening file `%s': %s\n"
 msgstr "Fehler beim Öffnen der Datei »%s«: %s\n"
diff --git a/po/ga.po b/po/ga.po
@@ -6,7 +6,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: libextractor 0.5.6a\n"
 "Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
 "PO-Revision-Date: 2005-09-21 00:46-0700\n"
 "Last-Translator: Kevin Patrick Scannell <scannell@SLU.EDU>\n"
 "Language-Team: Irish <gaeilge-gnulinux@lists.sourceforge.net>\n"
@@ -1304,6 +1304,7 @@ msgstr ""
 "Mar shampla:\n"
 
 #: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
 #, c-format
 msgid "Error opening file `%s': %s\n"
 msgstr "Earráid agus comhad `%s' á oscailt: %s\n"
diff --git a/po/libextractor.pot b/po/libextractor.pot
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
 "Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"
@@ -1282,6 +1282,7 @@ msgid ""
 msgstr ""
 
 #: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
 #, c-format
 msgid "Error opening file `%s': %s\n"
 msgstr ""
diff --git a/po/ro.po b/po/ro.po
@@ -9,7 +9,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: libextractor 0.4.2\n"
 "Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
 "PO-Revision-Date: 2005-02-25 12:00-0500\n"
 "Last-Translator: Laurentiu Buzdugan <lbuz@rolix.org>\n"
 "Language-Team: Romanian <translation-team-ro@lists.sourceforge.net>\n"
@@ -1312,6 +1312,7 @@ msgstr ""
 "un dicţionar.  De exemplu:\n"
 
 #: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
 #, c-format
 msgid "Error opening file `%s': %s\n"
 msgstr "Eroare deschidere fişier `%s': %s\n"
diff --git a/po/rw.po b/po/rw.po
@@ -16,7 +16,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: libextractor 0.4.2\n"
 "Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
 "PO-Revision-Date: 2005-04-04 10:55-0700\n"
 "Last-Translator: Steven Michael Murphy <murf@e-tools.com>\n"
 "Language-Team: Kinyarwanda <translation-team-rw@lists.sourceforge.net>\n"
@@ -1586,6 +1586,7 @@ msgstr "i Izina: Bya i Ururimi Inkoranyamagambo kugirango Urugero"
 
 # basctl/source\basicide\basidesh.src:RID_STR_ERROROPENSTORAGE.text
 #: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
 #, fuzzy, c-format
 msgid "Error opening file `%s': %s\n"
 msgstr "Hari ikibazo mu gufungura dosiye"
diff --git a/src/plugins/pdf/pdfextractor.cc b/src/plugins/pdf/pdfextractor.cc
@@ -73,34 +73,11 @@ extern "C" {
       if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
 	  (((unsigned char)s[1]) & 0xff) == 0xff) {
 	char * result;
-	unsigned char u[2];
-	unsigned int pos;
-	unsigned int len;
-	char * con;
-
-	result = (char*) malloc(s1->getLength() * 4);
-	result[0] = '\0';
-	len = s1->getLength();
-	for (pos=0;pos<len;pos+=2) {
-	  u[0] = s1->getChar(pos+1);
-	  u[1] = s1->getChar(pos);
-	  /* Q: is there a difference between UTF-16 and UNICODE?
-	     Which one is needed here?  And how to do it on solaris
-	     where UNICODE is not known!?
-	     See http://lists.gnu.org/archive/html/libextractor/2006-04/msg00006.html
-	  */
-#ifdef SOLARIS
-	  con = (char*) convertToUtf8((const char*) u, 2, "UTF-16");
-#else
-	  con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
-#endif
-	  strcat(result, con);
-	  free(con);
-	}
+
+	result = convertToUtf8((const char*) &s[2], s1->getLength() - 2, "UTF-16BE");
 	next = addKeyword(type,
-			  strdup(result),
+			  result,
 			  next);
-	free(result);
       } else {
         unsigned int len = (NULL == s) ? 0 : strlen(s);
 
@@ -157,25 +134,11 @@ extern "C" {
 	  (s1->getChar(1) & 0xff) == 0xff) {
 	/* isUnicode */
 	char * result;
-	unsigned char u[2];
-	unsigned int pos;
-	unsigned int len;
-	char * con;
-
-	result = (char*) malloc(s1->getLength() * 4);
-	result[0] = '\0';
-	len = s1->getLength();
-	for (pos=0;pos<len;pos+=2) {
-	  u[0] = s1->getChar(pos+1);
-	  u[1] = s1->getChar(pos);
-	  con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
-	  strcat(result, con);
-	  free(con);
-	}		
+
+	result = convertToUtf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
 	next = addKeyword(type,
-			  strdup(result),
+			  result,
 			  next);
-	free(result);
       } else {
 	if (s[0] == 'D' && s[1] == ':') {
 	  s += 2;
diff --git a/src/plugins/pdfextractor.c b/src/plugins/pdfextractor.c
@@ -203,7 +203,7 @@ charsetDecode(const unsigned char * in,
   } else { 
     return convertToUtf8(&in[2],
 			 size - 2,
-			 "UNICODEBIG");
+			 "UTF-16BE");
   }
 }
 
diff --git a/src/plugins/printable/dictionary-builder.c b/src/plugins/printable/dictionary-builder.c
@@ -136,7 +136,7 @@ int main(int argc,
   }
 
   bf.addressesPerElement = ADDR_PER_ELEMENT;
-  bf.bitArraySize = cnt * 4 / SUBTABLES * SUBTABLES;
+  bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof(int) * SUBTABLES;
   bf.bitArray = malloc(bf.bitArraySize);
   memset(bf.bitArray, 0, bf.bitArraySize);
 
@@ -169,8 +169,8 @@ int main(int argc,
     }
     fprintf(btfile,
 	    "int %s_bits_%d[] = { ", argv[2], j);
-    for (i= j    * bf.bitArraySize/sizeof(int)/SUBTABLES;
-	 i<(j+1) * bf.bitArraySize/sizeof(int)/SUBTABLES;
+    for (i= j    * (bf.bitArraySize/sizeof(int)/SUBTABLES);
+	 i<(j+1) * (bf.bitArraySize/sizeof(int)/SUBTABLES);
 	 i++)
       fprintf(btfile,
 	      "%dL,",
diff --git a/src/plugins/printable/printableextractor.h b/src/plugins/printable/printableextractor.h
@@ -69,10 +69,8 @@ static void testBitCallback(Bloomfilter * bf,
   int * arg = cls;
   if (! testBit(bf->sbitArray,
 		bf->bitArraySize,
-		bit)) {
-    printf("Testing bit %u failed!\n", bit);
-    *arg = 0;
-  }
+		bit)) 
+    *arg = 0;  
 }
 /**
  * Test if an element is in the filter.
@@ -129,9 +127,8 @@ static int wordTest(char * word,
   HashCode160 hc;
   char * lower;
 
-  if (strlen(word) <= (int) (*strlenthreshold)) {
+  if (strlen(word) <= (int) (*strlenthreshold)) 
     return 0;
-  }
   for (i=strlen(word)-1;i>=0;i--)
     if (isdigit(word[i]))
       return 0;

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	ChangeLog	\|	4	++++
M	NEWS	\|	4	++++
M	configure.ac	\|	4	++--
M	contrib/doxygen	\|	2	+-
M	po/de.po	\|	3	++-
M	po/ga.po	\|	3	++-
M	po/libextractor.pot	\|	3	++-
M	po/ro.po	\|	3	++-
M	po/rw.po	\|	3	++-
M	src/plugins/pdf/pdfextractor.cc	\|	49	++++++-------------------------------------------
M	src/plugins/pdfextractor.c	\|	2	+-
M	src/plugins/printable/dictionary-builder.c	\|	6	+++---
M	src/plugins/printable/printableextractor.h	\|	9	+++------