commit 31e68de5d7e2877dc6e907a3f073dd2f0e27d469
parent fde2e0a5b59e6beb940e2930db8e01cf250d3c90
Author: Christian Grothoff <christian@grothoff.org>
Date: Sat, 22 Apr 2006 18:28:39 +0000
le0512
Diffstat:
13 files changed, 34 insertions(+), 61 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,7 @@
+Sat Apr 22 11:18:56 PDT 2006
+ Final touches to new build of printable extractors.
+ Releasing libextractor 0.5.12.
+
Tue Apr 18 14:44:37 PDT 2006
Improved memory utilization for printable extractors
at compile time. Added dictionaries for Finnish,
diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+Tue Apr 18 14:44:37 PDT 2006
+ Added dictionaries for Finnish, French, Gaelic and Swedish
+ (for printable extractors).
+
Thu Mar 9 17:55:09 PST 2006
Word history extraction works (wordleaker).
diff --git a/configure.ac b/configure.ac
@@ -1,8 +1,8 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.57)
-AC_INIT([libextractor], [0.5.11], [bug-libextractor@gnu.org])
+AC_INIT([libextractor], [0.5.12], [bug-libextractor@gnu.org])
AC_REVISION($Revision: 1.67 $)
-AM_INIT_AUTOMAKE([libextractor], [0.5.11])
+AM_INIT_AUTOMAKE([libextractor], [0.5.12])
AM_CONFIG_HEADER(src/include/config.h)
AH_TOP([#define _GNU_SOURCE 1])
diff --git a/contrib/doxygen b/contrib/doxygen
@@ -23,7 +23,7 @@ PROJECT_NAME = libextractor
# This could be handy for archiving the generated documentation or
# if some version control system is used.
-PROJECT_NUMBER = 0.5.11
+PROJECT_NUMBER = 0.5.12
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
diff --git a/po/de.po b/po/de.po
@@ -9,7 +9,7 @@ msgid ""
msgstr ""
"Project-Id-Version: libextractor 0.5.6a\n"
"Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
"PO-Revision-Date: 2005-09-22 10:07+0200\n"
"Last-Translator: Karl Eichwalder <ke@suse.de>\n"
"Language-Team: German <de@li.org>\n"
@@ -1300,6 +1300,7 @@ msgstr ""
"erstellen. Zum Beispiel:\n"
#: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
#, c-format
msgid "Error opening file `%s': %s\n"
msgstr "Fehler beim Öffnen der Datei »%s«: %s\n"
diff --git a/po/ga.po b/po/ga.po
@@ -6,7 +6,7 @@ msgid ""
msgstr ""
"Project-Id-Version: libextractor 0.5.6a\n"
"Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
"PO-Revision-Date: 2005-09-21 00:46-0700\n"
"Last-Translator: Kevin Patrick Scannell <scannell@SLU.EDU>\n"
"Language-Team: Irish <gaeilge-gnulinux@lists.sourceforge.net>\n"
@@ -1304,6 +1304,7 @@ msgstr ""
"Mar shampla:\n"
#: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
#, c-format
msgid "Error opening file `%s': %s\n"
msgstr "Earráid agus comhad `%s' á oscailt: %s\n"
diff --git a/po/libextractor.pot b/po/libextractor.pot
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@@ -1282,6 +1282,7 @@ msgid ""
msgstr ""
#: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
#, c-format
msgid "Error opening file `%s': %s\n"
msgstr ""
diff --git a/po/ro.po b/po/ro.po
@@ -9,7 +9,7 @@ msgid ""
msgstr ""
"Project-Id-Version: libextractor 0.4.2\n"
"Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
"PO-Revision-Date: 2005-02-25 12:00-0500\n"
"Last-Translator: Laurentiu Buzdugan <lbuz@rolix.org>\n"
"Language-Team: Romanian <translation-team-ro@lists.sourceforge.net>\n"
@@ -1312,6 +1312,7 @@ msgstr ""
"un dicţionar. De exemplu:\n"
#: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
#, c-format
msgid "Error opening file `%s': %s\n"
msgstr "Eroare deschidere fişier `%s': %s\n"
diff --git a/po/rw.po b/po/rw.po
@@ -16,7 +16,7 @@ msgid ""
msgstr ""
"Project-Id-Version: libextractor 0.4.2\n"
"Report-Msgid-Bugs-To: libextractor@gnu.org\n"
-"POT-Creation-Date: 2006-04-18 14:46-0700\n"
+"POT-Creation-Date: 2006-04-22 11:52-0700\n"
"PO-Revision-Date: 2005-04-04 10:55-0700\n"
"Last-Translator: Steven Michael Murphy <murf@e-tools.com>\n"
"Language-Team: Kinyarwanda <translation-team-rw@lists.sourceforge.net>\n"
@@ -1586,6 +1586,7 @@ msgstr "i Izina: Bya i Ururimi Inkoranyamagambo kugirango Urugero"
# basctl/source\basicide\basidesh.src:RID_STR_ERROROPENSTORAGE.text
#: src/plugins/printable/dictionary-builder.c:110
+#: src/plugins/printable/dictionary-builder.c:166
#, fuzzy, c-format
msgid "Error opening file `%s': %s\n"
msgstr "Hari ikibazo mu gufungura dosiye"
diff --git a/src/plugins/pdf/pdfextractor.cc b/src/plugins/pdf/pdfextractor.cc
@@ -73,34 +73,11 @@ extern "C" {
if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
(((unsigned char)s[1]) & 0xff) == 0xff) {
char * result;
- unsigned char u[2];
- unsigned int pos;
- unsigned int len;
- char * con;
-
- result = (char*) malloc(s1->getLength() * 4);
- result[0] = '\0';
- len = s1->getLength();
- for (pos=0;pos<len;pos+=2) {
- u[0] = s1->getChar(pos+1);
- u[1] = s1->getChar(pos);
- /* Q: is there a difference between UTF-16 and UNICODE?
- Which one is needed here? And how to do it on solaris
- where UNICODE is not known!?
- See http://lists.gnu.org/archive/html/libextractor/2006-04/msg00006.html
- */
-#ifdef SOLARIS
- con = (char*) convertToUtf8((const char*) u, 2, "UTF-16");
-#else
- con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
-#endif
- strcat(result, con);
- free(con);
- }
+
+ result = convertToUtf8((const char*) &s[2], s1->getLength() - 2, "UTF-16BE");
next = addKeyword(type,
- strdup(result),
+ result,
next);
- free(result);
} else {
unsigned int len = (NULL == s) ? 0 : strlen(s);
@@ -157,25 +134,11 @@ extern "C" {
(s1->getChar(1) & 0xff) == 0xff) {
/* isUnicode */
char * result;
- unsigned char u[2];
- unsigned int pos;
- unsigned int len;
- char * con;
-
- result = (char*) malloc(s1->getLength() * 4);
- result[0] = '\0';
- len = s1->getLength();
- for (pos=0;pos<len;pos+=2) {
- u[0] = s1->getChar(pos+1);
- u[1] = s1->getChar(pos);
- con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
- strcat(result, con);
- free(con);
- }
+
+ result = convertToUtf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE");
next = addKeyword(type,
- strdup(result),
+ result,
next);
- free(result);
} else {
if (s[0] == 'D' && s[1] == ':') {
s += 2;
diff --git a/src/plugins/pdfextractor.c b/src/plugins/pdfextractor.c
@@ -203,7 +203,7 @@ charsetDecode(const unsigned char * in,
} else {
return convertToUtf8(&in[2],
size - 2,
- "UNICODEBIG");
+ "UTF-16BE");
}
}
diff --git a/src/plugins/printable/dictionary-builder.c b/src/plugins/printable/dictionary-builder.c
@@ -136,7 +136,7 @@ int main(int argc,
}
bf.addressesPerElement = ADDR_PER_ELEMENT;
- bf.bitArraySize = cnt * 4 / SUBTABLES * SUBTABLES;
+ bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof(int) * SUBTABLES;
bf.bitArray = malloc(bf.bitArraySize);
memset(bf.bitArray, 0, bf.bitArraySize);
@@ -169,8 +169,8 @@ int main(int argc,
}
fprintf(btfile,
"int %s_bits_%d[] = { ", argv[2], j);
- for (i= j * bf.bitArraySize/sizeof(int)/SUBTABLES;
- i<(j+1) * bf.bitArraySize/sizeof(int)/SUBTABLES;
+ for (i= j * (bf.bitArraySize/sizeof(int)/SUBTABLES);
+ i<(j+1) * (bf.bitArraySize/sizeof(int)/SUBTABLES);
i++)
fprintf(btfile,
"%dL,",
diff --git a/src/plugins/printable/printableextractor.h b/src/plugins/printable/printableextractor.h
@@ -69,10 +69,8 @@ static void testBitCallback(Bloomfilter * bf,
int * arg = cls;
if (! testBit(bf->sbitArray,
bf->bitArraySize,
- bit)) {
- printf("Testing bit %u failed!\n", bit);
- *arg = 0;
- }
+ bit))
+ *arg = 0;
}
/**
* Test if an element is in the filter.
@@ -129,9 +127,8 @@ static int wordTest(char * word,
HashCode160 hc;
char * lower;
- if (strlen(word) <= (int) (*strlenthreshold)) {
+ if (strlen(word) <= (int) (*strlenthreshold))
return 0;
- }
for (i=strlen(word)-1;i>=0;i--)
if (isdigit(word[i]))
return 0;