libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 31e68de5d7e2877dc6e907a3f073dd2f0e27d469
parent fde2e0a5b59e6beb940e2930db8e01cf250d3c90
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 22 Apr 2006 18:28:39 +0000

le0512

Diffstat:
M ChangeLog | 4 ++++
M NEWS | 4 ++++
M configure.ac | 4 ++--
M contrib/doxygen | 2 +-
M po/de.po | 3 ++-
M po/ga.po | 3 ++-
M po/libextractor.pot | 3 ++-
M po/ro.po | 3 ++-
M po/rw.po | 3 ++-
M src/plugins/pdf/pdfextractor.cc | 49 ++++++-------------------------------------------
M src/plugins/pdfextractor.c | 2 +-
M src/plugins/printable/dictionary-builder.c | 6 +++---
M src/plugins/printable/printableextractor.h | 9 +++------
13 files changed, 34 insertions(+), 61 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,7 @@ +Sat Apr 22 11:18:56 PDT 2006 + Final touches to new build of printable extractors. + Releasing libextractor 0.5.12. + Tue Apr 18 14:44:37 PDT 2006 Improved memory utilization for printable extractors at compile time. Added dictionaries for Finnish, diff --git a/NEWS b/NEWS @@ -1,3 +1,7 @@ +Tue Apr 18 14:44:37 PDT 2006 + Added dictionaries for Finnish, French, Gaelic and Swedish + (for printable extractors). + Thu Mar 9 17:55:09 PST 2006 Word history extraction works (wordleaker). diff --git a/configure.ac b/configure.ac @@ -1,8 +1,8 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.57) -AC_INIT([libextractor], [0.5.11], [bug-libextractor@gnu.org]) +AC_INIT([libextractor], [0.5.12], [bug-libextractor@gnu.org]) AC_REVISION($Revision: 1.67 $) -AM_INIT_AUTOMAKE([libextractor], [0.5.11]) +AM_INIT_AUTOMAKE([libextractor], [0.5.12]) AM_CONFIG_HEADER(src/include/config.h) AH_TOP([#define _GNU_SOURCE 1]) diff --git a/contrib/doxygen b/contrib/doxygen @@ -23,7 +23,7 @@ PROJECT_NAME = libextractor # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 0.5.11 +PROJECT_NUMBER = 0.5.12 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/po/de.po b/po/de.po @@ -9,7 +9,7 @@ msgid "" msgstr "" "Project-Id-Version: libextractor 0.5.6a\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" -"POT-Creation-Date: 2006-04-18 14:46-0700\n" +"POT-Creation-Date: 2006-04-22 11:52-0700\n" "PO-Revision-Date: 2005-09-22 10:07+0200\n" "Last-Translator: Karl Eichwalder <ke@suse.de>\n" "Language-Team: German <de@li.org>\n" @@ -1300,6 +1300,7 @@ msgstr "" "erstellen. 
Zum Beispiel:\n" #: src/plugins/printable/dictionary-builder.c:110 +#: src/plugins/printable/dictionary-builder.c:166 #, c-format msgid "Error opening file `%s': %s\n" msgstr "Fehler beim Öffnen der Datei »%s«: %s\n" diff --git a/po/ga.po b/po/ga.po @@ -6,7 +6,7 @@ msgid "" msgstr "" "Project-Id-Version: libextractor 0.5.6a\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" -"POT-Creation-Date: 2006-04-18 14:46-0700\n" +"POT-Creation-Date: 2006-04-22 11:52-0700\n" "PO-Revision-Date: 2005-09-21 00:46-0700\n" "Last-Translator: Kevin Patrick Scannell <scannell@SLU.EDU>\n" "Language-Team: Irish <gaeilge-gnulinux@lists.sourceforge.net>\n" @@ -1304,6 +1304,7 @@ msgstr "" "Mar shampla:\n" #: src/plugins/printable/dictionary-builder.c:110 +#: src/plugins/printable/dictionary-builder.c:166 #, c-format msgid "Error opening file `%s': %s\n" msgstr "Earráid agus comhad `%s' á oscailt: %s\n" diff --git a/po/libextractor.pot b/po/libextractor.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" -"POT-Creation-Date: 2006-04-18 14:46-0700\n" +"POT-Creation-Date: 2006-04-22 11:52-0700\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" "Language-Team: LANGUAGE <LL@li.org>\n" @@ -1282,6 +1282,7 @@ msgid "" msgstr "" #: src/plugins/printable/dictionary-builder.c:110 +#: src/plugins/printable/dictionary-builder.c:166 #, c-format msgid "Error opening file `%s': %s\n" msgstr "" diff --git a/po/ro.po b/po/ro.po @@ -9,7 +9,7 @@ msgid "" msgstr "" "Project-Id-Version: libextractor 0.4.2\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" -"POT-Creation-Date: 2006-04-18 14:46-0700\n" +"POT-Creation-Date: 2006-04-22 11:52-0700\n" "PO-Revision-Date: 2005-02-25 12:00-0500\n" "Last-Translator: Laurentiu Buzdugan <lbuz@rolix.org>\n" "Language-Team: Romanian <translation-team-ro@lists.sourceforge.net>\n" @@ -1312,6 +1312,7 @@ msgstr "" "un dicţionar. 
De exemplu:\n" #: src/plugins/printable/dictionary-builder.c:110 +#: src/plugins/printable/dictionary-builder.c:166 #, c-format msgid "Error opening file `%s': %s\n" msgstr "Eroare deschidere fişier `%s': %s\n" diff --git a/po/rw.po b/po/rw.po @@ -16,7 +16,7 @@ msgid "" msgstr "" "Project-Id-Version: libextractor 0.4.2\n" "Report-Msgid-Bugs-To: libextractor@gnu.org\n" -"POT-Creation-Date: 2006-04-18 14:46-0700\n" +"POT-Creation-Date: 2006-04-22 11:52-0700\n" "PO-Revision-Date: 2005-04-04 10:55-0700\n" "Last-Translator: Steven Michael Murphy <murf@e-tools.com>\n" "Language-Team: Kinyarwanda <translation-team-rw@lists.sourceforge.net>\n" @@ -1586,6 +1586,7 @@ msgstr "i Izina: Bya i Ururimi Inkoranyamagambo kugirango Urugero" # basctl/source\basicide\basidesh.src:RID_STR_ERROROPENSTORAGE.text #: src/plugins/printable/dictionary-builder.c:110 +#: src/plugins/printable/dictionary-builder.c:166 #, fuzzy, c-format msgid "Error opening file `%s': %s\n" msgstr "Hari ikibazo mu gufungura dosiye" diff --git a/src/plugins/pdf/pdfextractor.cc b/src/plugins/pdf/pdfextractor.cc @@ -73,34 +73,11 @@ extern "C" { if ((((unsigned char)s[0]) & 0xff) == 0xfe && (((unsigned char)s[1]) & 0xff) == 0xff) { char * result; - unsigned char u[2]; - unsigned int pos; - unsigned int len; - char * con; - - result = (char*) malloc(s1->getLength() * 4); - result[0] = '\0'; - len = s1->getLength(); - for (pos=0;pos<len;pos+=2) { - u[0] = s1->getChar(pos+1); - u[1] = s1->getChar(pos); - /* Q: is there a difference between UTF-16 and UNICODE? - Which one is needed here? And how to do it on solaris - where UNICODE is not known!? 
- See http://lists.gnu.org/archive/html/libextractor/2006-04/msg00006.html - */ -#ifdef SOLARIS - con = (char*) convertToUtf8((const char*) u, 2, "UTF-16"); -#else - con = (char*) convertToUtf8((const char*) u, 2, "UNICODE"); -#endif - strcat(result, con); - free(con); - } + + result = convertToUtf8((const char*) &s[2], s1->getLength() - 2, "UTF-16BE"); next = addKeyword(type, - strdup(result), + result, next); - free(result); } else { unsigned int len = (NULL == s) ? 0 : strlen(s); @@ -157,25 +134,11 @@ extern "C" { (s1->getChar(1) & 0xff) == 0xff) { /* isUnicode */ char * result; - unsigned char u[2]; - unsigned int pos; - unsigned int len; - char * con; - - result = (char*) malloc(s1->getLength() * 4); - result[0] = '\0'; - len = s1->getLength(); - for (pos=0;pos<len;pos+=2) { - u[0] = s1->getChar(pos+1); - u[1] = s1->getChar(pos); - con = (char*) convertToUtf8((const char*) u, 2, "UNICODE"); - strcat(result, con); - free(con); - } + + result = convertToUtf8((const char*)&s[2], s1->getLength() - 2, "UTF-16BE"); next = addKeyword(type, - strdup(result), + result, next); - free(result); } else { if (s[0] == 'D' && s[1] == ':') { s += 2; diff --git a/src/plugins/pdfextractor.c b/src/plugins/pdfextractor.c @@ -203,7 +203,7 @@ charsetDecode(const unsigned char * in, } else { return convertToUtf8(&in[2], size - 2, - "UNICODEBIG"); + "UTF-16BE"); } } diff --git a/src/plugins/printable/dictionary-builder.c b/src/plugins/printable/dictionary-builder.c @@ -136,7 +136,7 @@ int main(int argc, } bf.addressesPerElement = ADDR_PER_ELEMENT; - bf.bitArraySize = cnt * 4 / SUBTABLES * SUBTABLES; + bf.bitArraySize = (1 + (cnt / SUBTABLES)) * sizeof(int) * SUBTABLES; bf.bitArray = malloc(bf.bitArraySize); memset(bf.bitArray, 0, bf.bitArraySize); @@ -169,8 +169,8 @@ int main(int argc, } fprintf(btfile, "int %s_bits_%d[] = { ", argv[2], j); - for (i= j * bf.bitArraySize/sizeof(int)/SUBTABLES; - i<(j+1) * bf.bitArraySize/sizeof(int)/SUBTABLES; + for (i= j * 
(bf.bitArraySize/sizeof(int)/SUBTABLES); + i<(j+1) * (bf.bitArraySize/sizeof(int)/SUBTABLES); i++) fprintf(btfile, "%dL,", diff --git a/src/plugins/printable/printableextractor.h b/src/plugins/printable/printableextractor.h @@ -69,10 +69,8 @@ static void testBitCallback(Bloomfilter * bf, int * arg = cls; if (! testBit(bf->sbitArray, bf->bitArraySize, - bit)) { - printf("Testing bit %u failed!\n", bit); - *arg = 0; - } + bit)) + *arg = 0; } /** * Test if an element is in the filter. @@ -129,9 +127,8 @@ static int wordTest(char * word, HashCode160 hc; char * lower; - if (strlen(word) <= (int) (*strlenthreshold)) { + if (strlen(word) <= (int) (*strlenthreshold)) return 0; - } for (i=strlen(word)-1;i>=0;i--) if (isdigit(word[i])) return 0;