libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 59932613b357c12f38afc30643d921183edd156b
parent 6f322d688b606d51d713ac1c0b6265a617152312
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 16 Apr 2006 22:55:07 +0000

fixing memory utilization for compiling printable plugins

Diffstat:
M README | 5 -----
M configure.ac | 2 +-
M src/plugins/printable/Makefile.am | 38 +++++++++++++++++++++++---------------
M src/plugins/printable/bloomfilter-def.h | 3 +++
M src/plugins/printable/bloomfilter.h | 1 +
M src/plugins/printable/dictionary-builder.c | 48 ++++++++++++++++++++++++++++++++++--------------
A src/plugins/printable/makelang | 11 +++++++++++
M src/plugins/printable/printableextractor.h | 12 +++++++++---
8 files changed, 82 insertions(+), 38 deletions(-)

diff --git a/README b/README @@ -74,11 +74,6 @@ An example implementation can be found in mp3extractor.c. Notes ===== -libextractor contains some very large C files. gcc can easily use -over (!) 100 MB of memory to compile them. If you have that much, -libextractor will compile in about a minute. If you don't have that -much, you may want to consider using the binaries. - On Mac OS X, libextractor will avoid using GCC 3.1, because of problems compiling one of the extractors. GCC 3.3 and 2.95.2 are known to work well; as such, libextractor will first look for 3.3 (by diff --git a/configure.ac b/configure.ac @@ -341,7 +341,7 @@ if test "x$printable" = "x0" then AC_MSG_NOTICE([NOTICE: printable plugins disabled]) else - AC_MSG_NOTICE([NOTICE: printable plugins enabled (will need 150 MB memory to compile)]) + AC_MSG_NOTICE([NOTICE: printable plugins enabled]) fi if test "x$without_glib" = "xtrue" diff --git a/src/plugins/printable/Makefile.am b/src/plugins/printable/Makefile.am @@ -2,22 +2,30 @@ include ../Makefile-plugins.am noinst_PROGRAMS = dictionary-builder -CLEANFILES = da.c de.c en.c es.c it.c no.c pt.c peda.c pede.c peen.c pees.c peit.c peno.c pept.c +da_LANG=da_0.c da_1.c da_2.c da_3.c da_4.c da_5.c da_6.c da_7.c da_8.c da_9.c da_10.c da_11.c da_12.c da_13.c da_14.c da_15.c da_16.c da_17.c da_18.c da_19.c da_20.c da_21.c da_22.c da_23.c da_24.c da_25.c da_26.c da_27.c da_28.c da_29.c da_30.c da_31.c +de_LANG=de_0.c de_1.c de_2.c de_3.c de_4.c de_5.c de_6.c de_7.c de_8.c de_9.c de_10.c de_11.c de_12.c de_13.c de_14.c de_15.c de_16.c de_17.c de_18.c de_19.c de_20.c de_21.c de_22.c de_23.c de_24.c de_25.c de_26.c de_27.c de_28.c de_29.c de_30.c de_31.c +en_LANG=en_0.c en_1.c en_2.c en_3.c en_4.c en_5.c en_6.c en_7.c en_8.c en_9.c en_10.c en_11.c en_12.c en_13.c en_14.c en_15.c en_16.c en_17.c en_18.c en_19.c en_20.c en_21.c en_22.c en_23.c en_24.c en_25.c en_26.c en_27.c en_28.c en_29.c en_30.c en_31.c +es_LANG=es_0.c es_1.c es_2.c es_3.c es_4.c es_5.c es_6.c 
es_7.c es_8.c es_9.c es_10.c es_11.c es_12.c es_13.c es_14.c es_15.c es_16.c es_17.c es_18.c es_19.c es_20.c es_21.c es_22.c es_23.c es_24.c es_25.c es_26.c es_27.c es_28.c es_29.c es_30.c es_31.c +it_LANG=it_0.c it_1.c it_2.c it_3.c it_4.c it_5.c it_6.c it_7.c it_8.c it_9.c it_10.c it_11.c it_12.c it_13.c it_14.c it_15.c it_16.c it_17.c it_18.c it_19.c it_20.c it_21.c it_22.c it_23.c it_24.c it_25.c it_26.c it_27.c it_28.c it_29.c it_30.c it_31.c +no_LANG=no_0.c no_1.c no_2.c no_3.c no_4.c no_5.c no_6.c no_7.c no_8.c no_9.c no_10.c no_11.c no_12.c no_13.c no_14.c no_15.c no_16.c no_17.c no_18.c no_19.c no_20.c no_21.c no_22.c no_23.c no_24.c no_25.c no_26.c no_27.c no_28.c no_29.c no_30.c no_31.c +pt_LANG=pt_0.c pt_1.c pt_2.c pt_3.c pt_4.c pt_5.c pt_6.c pt_7.c pt_8.c pt_9.c pt_10.c pt_11.c pt_12.c pt_13.c pt_14.c pt_15.c pt_16.c pt_17.c pt_18.c pt_19.c pt_20.c pt_21.c pt_22.c pt_23.c pt_24.c pt_25.c pt_26.c pt_27.c pt_28.c pt_29.c pt_30.c pt_31.c + +CLEANFILES = da.c de.c en.c es.c it.c no.c pt.c peda.c pede.c peen.c pees.c peit.c peno.c pept.c $(da_LANG) $(de_LANG) $(es_LANG) $(en_LANG) $(it_LANG) $(no_LANG) $(pt_LANG) da.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/da > da.c + ./dictionary-builder $(srcdir)/da da > da.c de.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/de > de.c + ./dictionary-builder $(srcdir)/de de > de.c en.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/en > en.c + ./dictionary-builder $(srcdir)/en en > en.c es.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/es > es.c + ./dictionary-builder $(srcdir)/es es > es.c it.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/it > it.c + ./dictionary-builder $(srcdir)/it it > it.c no.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/no > no.c + ./dictionary-builder $(srcdir)/no no > no.c pt.c: dictionary-builder$(EXEEXT) - ./dictionary-builder $(srcdir)/pt > pt.c + ./dictionary-builder $(srcdir)/pt pt > pt.c 
peda.c: cat peXX.c | sed -e "s/XX/da/" > peda.c @@ -66,36 +74,36 @@ libextractor_printable_pt_la_LIBADD = \ libextractor_printable_da_la_SOURCES = \ - da.c peda.c bloomfilter.h printableextractor.h bloomfilter-def.h + da.c $(da_LANG) peda.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_da_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_de_la_SOURCES = \ - de.c pede.c bloomfilter.h printableextractor.h bloomfilter-def.h + de.c $(de_LANG) pede.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_de_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_en_la_SOURCES = \ - en.c peen.c bloomfilter.h printableextractor.h bloomfilter-def.h + en.c $(en_LANG) peen.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_en_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_es_la_SOURCES = \ - es.c pees.c bloomfilter.h printableextractor.h bloomfilter-def.h + es.c $(es_LANG) pees.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_es_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_it_la_SOURCES = \ - it.c peit.c bloomfilter.h printableextractor.h bloomfilter-def.h + it.c $(it_LANG) peit.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_it_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_no_la_SOURCES = \ - no.c peno.c bloomfilter.h printableextractor.h bloomfilter-def.h + no.c $(no_LANG) peno.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_no_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) libextractor_printable_pt_la_SOURCES = \ - pt.c pept.c bloomfilter.h printableextractor.h bloomfilter-def.h + pt.c $(pt_LANG) pept.c bloomfilter.h printableextractor.h bloomfilter-def.h libextractor_printable_pt_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) diff --git a/src/plugins/printable/bloomfilter-def.h 
b/src/plugins/printable/bloomfilter-def.h @@ -28,11 +28,14 @@ #include "platform.h" #include <string.h> +#define SUBTABLES 32 + typedef struct { /** How many bits we set for each stored element */ unsigned int addressesPerElement; /** The actual bloomfilter bit array */ unsigned char * bitArray; + unsigned char ** sbitArray; /** Size of bitArray in bytes */ unsigned int bitArraySize; } Bloomfilter; diff --git a/src/plugins/printable/bloomfilter.h b/src/plugins/printable/bloomfilter.h @@ -29,6 +29,7 @@ #include <string.h> #include "bloomfilter-def.h" + typedef struct { unsigned char data[20]; } HashCode160; diff --git a/src/plugins/printable/dictionary-builder.c b/src/plugins/printable/dictionary-builder.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -16,11 +16,6 @@ along with libextractor; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - - Portions of this code were adapted from libhtmlparse by - Mooneer Salem (mooneer@translator.cs). The main changes - to libhtmlparse were the removal of globals to make the - code reentrant. */ /** * Tool to build a bloomfilter from a dictionary. @@ -81,11 +76,13 @@ static void addToBloomfilter(Bloomfilter * bf, #define ADDR_PER_ELEMENT 46 + int main(int argc, char ** argv) { Bloomfilter bf; HashCode160 hc; int i; + int j; int cnt; char * fn; char ** words; @@ -95,11 +92,11 @@ int main(int argc, char * charset = NULL; #define ALLOCSIZE 1024*1024 - if (argc<2) { + if (argc<3) { fprintf(stderr, _("Please provide the name of the language you are building\n" "a dictionary for. 
For example:\n")); - fprintf(stderr, "$ ./dictionary-builder en > en.c\n"); + fprintf(stderr, "$ ./dictionary-builder ./en en > en.c\n"); exit(-1); } @@ -139,7 +136,7 @@ int main(int argc, } bf.addressesPerElement = ADDR_PER_ELEMENT; - bf.bitArraySize = cnt*4; + bf.bitArraySize = cnt * 4 / SUBTABLES * SUBTABLES; bf.bitArray = malloc(bf.bitArraySize); memset(bf.bitArray, 0, bf.bitArraySize); @@ -158,12 +155,34 @@ int main(int argc, gcc versions then output tons of warnings about "decimal constant is so large that it is unsigned" (even for unsigned long long[] that warning is generated and dramatically increases compile times). */ + for (j=0;j<SUBTABLES;j++) { + char fn[64]; + FILE * btfile; + + snprintf(fn, 64, "%s_%d.c", argv[1], j); + btfile = fopen(fn, "w+"); + fprintf(btfile, + "int %s_bits_%d[] = { ", argv[2], j); + for (i= j * bf.bitArraySize/sizeof(int)/SUBTABLES; + i<(j+1) * bf.bitArraySize/sizeof(int)/SUBTABLES; + i++) + fprintf(btfile, + "%dL,", + (((int*)bf.bitArray)[i])); + fprintf(btfile, + "};\n"); + fclose(btfile); + fprintf(stdout, + "extern int %s_bits_%d[];\n", argv[2], j); + } + fprintf(stdout, - "static int bits[] = { "); - for (i=0;i<bf.bitArraySize/sizeof(int);i++) + "static int * bits[] = { "); + for (i=0;i<SUBTABLES;i++) fprintf(stdout, - "%dL,", - (((int*)bf.bitArray)[i])); + "%s_bits_%d,", + argv[2], + i); fprintf(stdout, "};\n"); bn = &argv[1][strlen(argv[1])]; @@ -175,7 +194,8 @@ int main(int argc, fprintf(stdout, "Bloomfilter libextractor_printable_%s_filter = {\n" " %u,\n" - " (unsigned char*)bits,\n" + " NULL,\n" /* bitarray */ + " (unsigned char **)bits,\n" /* sbitArray */ " %u };\n", bn, ADDR_PER_ELEMENT, diff --git a/src/plugins/printable/makelang b/src/plugins/printable/makelang @@ -0,0 +1,11 @@ +#!/bin/sh +for l in da de en es it no pt +do + r="" + for n in `seq 0 31` + do + m=`expr $n - 1` + r="$r${l}_$n.c " + done + echo "${l}_LANG=$r" +done diff --git a/src/plugins/printable/printableextractor.h 
b/src/plugins/printable/printableextractor.h @@ -33,6 +33,8 @@ #include <string.h> #include "bloomfilter.h" + + /** * Checks if a bit is active in the bitArray * @@ -40,14 +42,17 @@ * @param bitIdx which bit to test * @return 1 if the bit is set, 0 if not. */ -static int testBit(unsigned char * bitArray, +static int testBit(unsigned char ** bitArray, + unsigned int size, unsigned int bitIdx) { unsigned int slot; unsigned int targetBit; + unsigned int msize; slot = bitIdx / 8; targetBit = (1L << (bitIdx % 8)); - return (bitArray[slot] & targetBit) != 0; + msize = size / SUBTABLES; + return (bitArray[slot / msize][slot % msize] & targetBit) != 0; } @@ -62,7 +67,8 @@ static void testBitCallback(Bloomfilter * bf, unsigned int bit, void * cls) { int * arg = cls; - if (! testBit(bf->bitArray, + if (! testBit(bf->sbitArray, + bf->bitArraySize, bit)) *arg = 0; }