libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit f3f83427975148b9a303bb4cb069a56fa5a4d85e
parent 00072f35a3bd58eb2f9a734d3af1996370da9e0b
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 16 Sep 2005 08:15:07 +0000

update

Diffstat:
M src/plugins/language/Makefile.am | 28 ++++++++++++++++++++++++++--
D src/plugins/language/katlanguagemanager.cpp | 310 -------------------------------------------------------------------------------
A src/plugins/language/language-compiler.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/language/languageextractor.c | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 448 insertions(+), 312 deletions(-)

diff --git a/src/plugins/language/Makefile.am b/src/plugins/language/Makefile.am @@ -1,6 +1,10 @@ -languagedir = $(datadir)/libextractor/language +include ../Makefile-plugins.am -language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \ +noinst_PROGRAMS = language-compiler + +CLEANFILES = languages.c + +EXTRA_DIST = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \ basque.klp belarus.klp bosnian.klp breton.klp bulgarian.klp catalan.klp \ chinese1.klp chinese2.klp chinese.klp croatian.klp czech.klp danish.klp dutch.klp \ english.klp esperanto.klp estonian.klp finnish.klp french.klp frisian.klp \ @@ -13,3 +17,23 @@ language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \ serbian.klp slovak2.klp slovak.klp slovenian2.klp slovenian.klp spanish.klp \ swahili.klp tagalog.klp tamil.klp thai.klp turkish.klp ukrainian.klp \ vietnamese.klp welsh.klp + +language.c: language-compiler$(EXEEXT) + ./language-compiler$(EXEEXT) *.klp > languages.c + +language_compiler_SOURCES = \ + language-compiler.c +language_compiler_LDADD = \ + $(LIBINTL) + +plugin_LTLIBRARIES = \ + libextractor_language.la + +libextractor_language_la_LIBADD = \ + $(top_builddir)/src/main/libextractor.la + +libextractor_language_la_SOURCES = \ + languageextractor.c languages.c +libextractor_language_la_LDFLAGS = \ + $(PLUGINFLAGS) $(retaincommand) + diff --git a/src/plugins/language/katlanguagemanager.cpp b/src/plugins/language/katlanguagemanager.cpp @@ -1,310 +0,0 @@ -/*************************************************************************** - * Copyright (C) 2005 by Roberto Cappuccio and the Kat team * - * Roberto Cappuccio : roberto.cappuccio@gmail.com * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. 
* - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA. * - ***************************************************************************/ - -#include <cstdlib> -#include <kdebug.h> -#include <kstandarddirs.h> -#include <kio/job.h> -#include <kio/jobclasses.h> -#include <qregexp.h> -#include <qdir.h> -#include <qdom.h> - -#include "katlanguagemanager.h" - -int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 ) -{ - NGram* n1 = (NGram*)item1; - NGram* n2 = (NGram*)item2; - - return n2->occurrences - n1->occurrences; -} - -int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 ) -{ - Language* n1 = (Language*)item1; - Language* n2 = (Language*)item2; - - return n2->distance - n1->distance; -} - -KatLanguageManager::KatLanguageManager() -{ -} - -KatLanguageManager::~KatLanguageManager() -{ -} - -void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size ) -{ - QString paddedString( str ); - - paddedString = paddedString.replace( QRegExp( " " ), "_" ); - paddedString = '_' + paddedString + '_'; - - for( int i = 0; i < paddedString.length() - size + 1; i++ ) - ngrams.append( paddedString.mid( i, size ) ); -} - -NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName ) -{ - QFile m_file( fileName ); - QTextStream m_stream( &m_file ); - bool m_open = m_file.open( IO_ReadOnly ); - QString buffer = m_stream.read(); - m_file.close(); - - buffer = buffer.lower(); - buffer = buffer.replace( QRegExp( "[\\W]" ), " " ); - buffer = 
buffer.replace( QRegExp( "[0-9]" ), " " ); - buffer = buffer.simplifyWhiteSpace(); - - return createFingerprintFromQString( buffer ); -} - -NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf ) -{ - QStringList ngrams; - NGramsList wngrams; - - wngrams.setAutoDelete( true ); - - QString buffer( buf ); - buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer - - // extract the ngrams - for ( int size = 1; size <= MAXNGRAMSIZE; ++size ) - extractNGrams( buffer, ngrams, size ); - - // sort the ngrams - ngrams.sort(); - - // count the occurrences of every ngram - // and build the NGramList wngrams - long occurrences; - QStringList::Iterator ngram = ngrams.begin(); - while ( ngram != ngrams.end() ) - { - QString currentNGram = *ngram; - - ngram++; - - occurrences = 1; - while ( *ngram == currentNGram ) - { - occurrences++; - ngram++; - } - - wngrams.inSort( new NGram( currentNGram, occurrences ) ); - } - - // the profile has to contain a maximum of MAXNGRAMS - while ( wngrams.count() > MAXNGRAMS ) - wngrams.removeLast(); - - return wngrams; -} - -QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp ) -{ - long distance; - long minscore = MAXSCORE; - long threshold = minscore; - LanguageList language_list; - language_list.setAutoDelete( true ); - LanguageList candidates; - candidates.setAutoDelete( true ); - - // create the fingerprint of the buffer - NGramsList file_ngrams = createFingerprintFromQString( buffer ); - if ( buffer.length() < MINDOCSIZE ) - return QString( "unknown" ); - - // cycle through the list of managed languages - // and build an ordered list of languages sorted by distance - QMap<QString,LanguageProfile>::Iterator end( lp.end() ); - for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it ) - { - QString lname = it.key(); - LanguageProfile language_ngrams = (LanguageProfile)it.data(); - - // calculate the distance between the 
file profile and the language profile - distance = calculateDistance( file_ngrams, language_ngrams ); - - // calculate the threshold - if ( distance < minscore ) - { - minscore = distance; - threshold = (long)( (double)distance * THRESHOLDVALUE ); - } - - language_list.inSort( new Language( lname, distance ) ); - } - - // now that the list of languages is sorted by distance - // extract at most MAXCANDIDATES candidates - int cnt = 0; - Language* currentLanguage; - QPtrList<Language>::Iterator language = language_list.begin(); - while ( language != language_list.end() ) - { - currentLanguage = *language; - - if ( currentLanguage->distance <= threshold ) - { - cnt++; - if ( cnt == MAXCANDIDATES + 1 ) - break; - - candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) ); - } - - language++; - } - - // If more than MAXCANDIDATES matches are found within the threshold, - // the classifier reports unknown, because the input is obviously confusing - if ( cnt == MAXCANDIDATES + 1 ) { - return QString( "unknown" ); - } else { - Language* first = candidates.getFirst(); - if ( first != 0L ) - return QString( first->language ); - else - return QString( "unknown" ); - } -} - -long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG ) -{ - long fileNGPos = 0L; - long langNGPos = 0L; - long distance = 0L; - - NGramsList::Iterator file_ngram = file_ngrams.begin(); - while ( file_ngram != file_ngrams.end() ) - { - NGram* currentFileNGram = *file_ngram; - - // search the currentFileNGram in language_ngrams - // and calculate the distance - QMap<QString, long>::iterator ng = langNG.find( currentFileNGram->ngram ); - - if ( ng == langNG.end() ) - { - // not found - distance = distance + MAXOUTOFPLACE; - } - else - { - //found - langNGPos = ng.data(); - distance = distance + labs( langNGPos - fileNGPos ); - } - - fileNGPos++; - file_ngram++; - } - - return distance; -} - -LanguageProfileMap* 
KatLanguageManager::loadAllLanguageProfiles() -{ - LanguageProfileMap* lp = new LanguageProfileMap(); - - // clear the language profile - lp->clear(); - - // find the Kat application data path - QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true ); - - //delete files have .klpd extension - QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true ); - QStringList deletedFileLanguage; - QStringList::Iterator end( deletedLanguageList.end() ); - for ( QStringList::Iterator it = deletedLanguageList.begin(); it != end; ++it ) - { - KURL file( *it ); - QString tmp = file.filename().mid( 0, file.filename().length() - 5 ); - kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl; - deletedFileLanguage.append( tmp ); - } - // load the language profiles - QStringList::Iterator endLang( m_languageFiles.end() ); - for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it ) - { - QString lname = (*it).mid( 0, (*it).length()-4 ); - KURL tmpFile( *it ); - QString tmp = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 ); - //it was removed => don't load it - if ( deletedFileLanguage.contains( tmp ) ) - continue; - - QString profilePath = *it ; - QDomDocument doc( profilePath ); - - QFile file( profilePath ); - if ( !file.exists() ) - return lp; - - if ( !file.open( IO_ReadOnly ) ) - { - kdDebug() << "Impossible to open " << profilePath << endl; - return lp; - } - QByteArray m_data = file.readAll(); - - QString qs; - if ( !doc.setContent( QString( m_data ).utf8(), &qs ) ) - { - kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl; - file.close(); - return lp; - } - file.close(); - - // create the list of ngrams of the language profile - LanguageProfile lprofile; - lprofile.clear(); - QDomElement docElem = doc.documentElement(); - QDomNode n = docElem.firstChild(); - long index = 0L; - - while( 
!n.isNull() ) - { - QDomElement e = n.toElement(); - if( !e.isNull() ) - lprofile.insert( QString( e.attribute( "value" ) ), index ); - - index++; - n = n.nextSibling(); - } - - QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 ); - //kdDebug() << " language insert :" << tmpLang << endl; - lp->insert( tmpLang , lprofile ); - } - - return lp; -} - diff --git a/src/plugins/language/language-compiler.c b/src/plugins/language/language-compiler.c @@ -0,0 +1,91 @@ +/* + This file is part of libextractor. + (C) 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +#include "platform.h" + + +int main(int argc, + char ** argv) { + int i; + int cnt; + char * fn; + char ** words; + char line[2048]; /* buffer overflow, here we go */ + FILE *dictin; + char * bn; +#define ALLOCSIZE 1024*1024 + + if (argc<2) { + fprintf(stderr, + _("Please provide a list of klp files as arguments.\n")); + exit(-1); + } + + fn = malloc(strlen(argv[1]) + 6); + strcpy(fn, argv[1]); + strcat(fn, ".txt"); + dictin=fopen(fn,"r"); + free(fn); + if (dictin==NULL) { + fprintf(stderr, + _("Error opening file `%s': %s\n"), + argv[1],strerror(errno)); + exit(-1); + } + + words = malloc(sizeof(char*) * ALLOCSIZE); /* don't we LOVE constant size buffers? 
*/ + if (words == NULL) { + fprintf(stderr, + _("Error allocating: %s\n."), + strerror(errno)); + exit(-1); + } + cnt = 0; + memset(&line[0], 0, 2048); + while (1 == fscanf(dictin, "%s", (char*)&line)) { + words[cnt] = strdup(line); + cnt++; + memset(&line[0], 0, 2048); + if (cnt > ALLOCSIZE) { + fprintf(stderr, + _("Increase ALLOCSIZE (in %s).\n"), + __FILE__); + exit(-1); + } + + } + + bf.addressesPerElement = ADDR_PER_ELEMENT; + bf.bitArraySize = cnt*4; + bf.bitArray = malloc(bf.bitArraySize); + memset(bf.bitArray, 0, bf.bitArraySize); + + fprintf(stdout, + "#include \"somefile.h\"\n"); + fprintf(stdout, + "static int bits[] = { "); + for (i=0;i<bf.bitArraySize/sizeof(int);i++) + fprintf(stdout, + "%dL,", + (((int*)bf.bitArray)[i])); + fprintf(stdout, + "};\n"); + return 0; +} diff --git a/src/plugins/language/languageextractor.c b/src/plugins/language/languageextractor.c @@ -0,0 +1,331 @@ +/* + This file is part of libextractor. + (C) 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. 
+ */ +/* this code was adopted from Kat, original copyright below: */ +/*************************************************************************** + * Copyright (C) 2005 by Roberto Cappuccio and the Kat team * + * Roberto Cappuccio : roberto.cappuccio@gmail.com * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +/** + * @file languageextractor.c + * @author Christian Grothoff + * @brief try to identify the language of the document using + * letter and letter-pair statistics + */ + +#include "platform.h" +#include "extractor.h" + +int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 ) +{ + NGram* n1 = (NGram*)item1; + NGram* n2 = (NGram*)item2; + + return n2->occurrences - n1->occurrences; +} + +int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 ) +{ + Language* n1 = (Language*)item1; + Language* n2 = (Language*)item2; + + return n2->distance - n1->distance; +} + + +void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size ) +{ + QString paddedString( str ); + + paddedString = paddedString.replace( QRegExp( " " ), "_" ); + paddedString = '_' + paddedString + '_'; + + for( int i = 0; i < paddedString.length() - size + 1; i++ ) + ngrams.append( paddedString.mid( i, size ) ); +} + +NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName ) +{ + QFile m_file( fileName ); + QTextStream m_stream( &m_file ); + bool m_open = m_file.open( IO_ReadOnly ); + QString buffer = m_stream.read(); + m_file.close(); + + buffer = buffer.lower(); + buffer = buffer.replace( QRegExp( "[\\W]" ), " " ); + buffer = buffer.replace( QRegExp( "[0-9]" ), " " ); + buffer = buffer.simplifyWhiteSpace(); + + return createFingerprintFromQString( buffer ); +} + +NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf ) +{ + QStringList ngrams; + NGramsList wngrams; + + wngrams.setAutoDelete( true ); + + QString buffer( buf ); + buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer + + // extract the ngrams + for ( int size = 1; size <= MAXNGRAMSIZE; ++size ) + extractNGrams( buffer, ngrams, size ); + + // sort the ngrams + ngrams.sort(); + + // count the 
occurrences of every ngram + // and build the NGramList wngrams + long occurrences; + QStringList::Iterator ngram = ngrams.begin(); + while ( ngram != ngrams.end() ) + { + QString currentNGram = *ngram; + + ngram++; + + occurrences = 1; + while ( *ngram == currentNGram ) + { + occurrences++; + ngram++; + } + + wngrams.inSort( new NGram( currentNGram, occurrences ) ); + } + + // the profile has to contain a maximum of MAXNGRAMS + while ( wngrams.count() > MAXNGRAMS ) + wngrams.removeLast(); + + return wngrams; +} + +QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp ) +{ + long distance; + long minscore = MAXSCORE; + long threshold = minscore; + LanguageList language_list; + language_list.setAutoDelete( true ); + LanguageList candidates; + candidates.setAutoDelete( true ); + + // create the fingerprint of the buffer + NGramsList file_ngrams = createFingerprintFromQString( buffer ); + if ( buffer.length() < MINDOCSIZE ) + return QString( "unknown" ); + + // cycle through the list of managed languages + // and build an ordered list of languages sorted by distance + QMap<QString,LanguageProfile>::Iterator end( lp.end() ); + for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it ) + { + QString lname = it.key(); + LanguageProfile language_ngrams = (LanguageProfile)it.data(); + + // calculate the distance between the file profile and the language profile + distance = calculateDistance( file_ngrams, language_ngrams ); + + // calculate the threshold + if ( distance < minscore ) + { + minscore = distance; + threshold = (long)( (double)distance * THRESHOLDVALUE ); + } + + language_list.inSort( new Language( lname, distance ) ); + } + + // now that the list of languages is sorted by distance + // extract at most MAXCANDIDATES candidates + int cnt = 0; + Language* currentLanguage; + QPtrList<Language>::Iterator language = language_list.begin(); + while ( language != language_list.end() ) + { + currentLanguage = 
*language; + + if ( currentLanguage->distance <= threshold ) + { + cnt++; + if ( cnt == MAXCANDIDATES + 1 ) + break; + + candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) ); + } + + language++; + } + + // If more than MAXCANDIDATES matches are found within the threshold, + // the classifier reports unknown, because the input is obviously confusing + if ( cnt == MAXCANDIDATES + 1 ) { + return QString( "unknown" ); + } else { + Language* first = candidates.getFirst(); + if ( first != 0L ) + return QString( first->language ); + else + return QString( "unknown" ); + } +} + +long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG ) +{ + long fileNGPos = 0L; + long langNGPos = 0L; + long distance = 0L; + + NGramsList::Iterator file_ngram = file_ngrams.begin(); + while ( file_ngram != file_ngrams.end() ) + { + NGram* currentFileNGram = *file_ngram; + + // search the currentFileNGram in language_ngrams + // and calculate the distance + QMap<QString, long>::iterator ng = langNG.find( currentFileNGram->ngram ); + + if ( ng == langNG.end() ) + { + // not found + distance = distance + MAXOUTOFPLACE; + } + else + { + //found + langNGPos = ng.data(); + distance = distance + labs( langNGPos - fileNGPos ); + } + + fileNGPos++; + file_ngram++; + } + + return distance; +} + +LanguageProfileMap* KatLanguageManager::loadAllLanguageProfiles() +{ + LanguageProfileMap* lp = new LanguageProfileMap(); + + // clear the language profile + lp->clear(); + + // find the Kat application data path + QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true ); + + //delete files have .klpd extension + QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true ); + QStringList deletedFileLanguage; + QStringList::Iterator end( deletedLanguageList.end() ); + for ( QStringList::Iterator it = deletedLanguageList.begin(); it != 
end; ++it ) + { + KURL file( *it ); + QString tmp = file.filename().mid( 0, file.filename().length() - 5 ); + kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl; + deletedFileLanguage.append( tmp ); + } + // load the language profiles + QStringList::Iterator endLang( m_languageFiles.end() ); + for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it ) + { + QString lname = (*it).mid( 0, (*it).length()-4 ); + KURL tmpFile( *it ); + QString tmp = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 ); + //it was removed => don't load it + if ( deletedFileLanguage.contains( tmp ) ) + continue; + + QString profilePath = *it ; + QDomDocument doc( profilePath ); + + QFile file( profilePath ); + if ( !file.exists() ) + return lp; + + if ( !file.open( IO_ReadOnly ) ) + { + kdDebug() << "Impossible to open " << profilePath << endl; + return lp; + } + QByteArray m_data = file.readAll(); + + QString qs; + if ( !doc.setContent( QString( m_data ).utf8(), &qs ) ) + { + kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl; + file.close(); + return lp; + } + file.close(); + + // create the list of ngrams of the language profile + LanguageProfile lprofile; + lprofile.clear(); + QDomElement docElem = doc.documentElement(); + QDomNode n = docElem.firstChild(); + long index = 0L; + + while( !n.isNull() ) + { + QDomElement e = n.toElement(); + if( !e.isNull() ) + lprofile.insert( QString( e.attribute( "value" ) ), index ); + + index++; + n = n.nextSibling(); + } + + QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 ); + //kdDebug() << " language insert :" << tmpLang << endl; + lp->insert( tmpLang , lprofile ); + } + + return lp; +} + + + +struct EXTRACTOR_Keywords * +libextractor_language_extract(const char * filename, + const char * buf, + size_t size, + struct EXTRACTOR_Keywords * prev) { + return prev; +}