commit f3f83427975148b9a303bb4cb069a56fa5a4d85e
parent 00072f35a3bd58eb2f9a734d3af1996370da9e0b
Author: Christian Grothoff <christian@grothoff.org>
Date: Fri, 16 Sep 2005 08:15:07 +0000
update
Diffstat:
4 files changed, 448 insertions(+), 312 deletions(-)
diff --git a/src/plugins/language/Makefile.am b/src/plugins/language/Makefile.am
@@ -1,6 +1,10 @@
-languagedir = $(datadir)/libextractor/language
+include ../Makefile-plugins.am
-language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
+noinst_PROGRAMS = language-compiler
+
+CLEANFILES = languages.c
+
+EXTRA_DIST = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
basque.klp belarus.klp bosnian.klp breton.klp bulgarian.klp catalan.klp \
chinese1.klp chinese2.klp chinese.klp croatian.klp czech.klp danish.klp dutch.klp \
english.klp esperanto.klp estonian.klp finnish.klp french.klp frisian.klp \
@@ -13,3 +17,23 @@ language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
serbian.klp slovak2.klp slovak.klp slovenian2.klp slovenian.klp spanish.klp \
swahili.klp tagalog.klp tamil.klp thai.klp turkish.klp ukrainian.klp \
vietnamese.klp welsh.klp
+# Generate languages.c (listed in CLEANFILES and in the plugin's SOURCES) from the .klp profiles.
+languages.c: language-compiler$(EXEEXT)
+	./language-compiler$(EXEEXT) *.klp > languages.c
+
+language_compiler_SOURCES = \
+ language-compiler.c
+language_compiler_LDADD = \
+ $(LIBINTL)
+
+plugin_LTLIBRARIES = \
+ libextractor_language.la
+
+libextractor_language_la_LIBADD = \
+ $(top_builddir)/src/main/libextractor.la
+
+libextractor_language_la_SOURCES = \
+ languageextractor.c languages.c
+libextractor_language_la_LDFLAGS = \
+ $(PLUGINFLAGS) $(retaincommand)
+
diff --git a/src/plugins/language/katlanguagemanager.cpp b/src/plugins/language/katlanguagemanager.cpp
@@ -1,310 +0,0 @@
-/***************************************************************************
- * Copyright (C) 2005 by Roberto Cappuccio and the Kat team *
- * Roberto Cappuccio : roberto.cappuccio@gmail.com *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU General Public License as published by *
- * the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU General Public License for more details. *
- * *
- * You should have received a copy of the GNU General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA. *
- ***************************************************************************/
-
-#include <cstdlib>
-#include <kdebug.h>
-#include <kstandarddirs.h>
-#include <kio/job.h>
-#include <kio/jobclasses.h>
-#include <qregexp.h>
-#include <qdir.h>
-#include <qdom.h>
-
-#include "katlanguagemanager.h"
-
-int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 )
-{
- NGram* n1 = (NGram*)item1;
- NGram* n2 = (NGram*)item2;
-
- return n2->occurrences - n1->occurrences;
-}
-
-int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 )
-{
- Language* n1 = (Language*)item1;
- Language* n2 = (Language*)item2;
-
- return n2->distance - n1->distance;
-}
-
-KatLanguageManager::KatLanguageManager()
-{
-}
-
-KatLanguageManager::~KatLanguageManager()
-{
-}
-
-void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size )
-{
- QString paddedString( str );
-
- paddedString = paddedString.replace( QRegExp( " " ), "_" );
- paddedString = '_' + paddedString + '_';
-
- for( int i = 0; i < paddedString.length() - size + 1; i++ )
- ngrams.append( paddedString.mid( i, size ) );
-}
-
-NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName )
-{
- QFile m_file( fileName );
- QTextStream m_stream( &m_file );
- bool m_open = m_file.open( IO_ReadOnly );
- QString buffer = m_stream.read();
- m_file.close();
-
- buffer = buffer.lower();
- buffer = buffer.replace( QRegExp( "[\\W]" ), " " );
- buffer = buffer.replace( QRegExp( "[0-9]" ), " " );
- buffer = buffer.simplifyWhiteSpace();
-
- return createFingerprintFromQString( buffer );
-}
-
-NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf )
-{
- QStringList ngrams;
- NGramsList wngrams;
-
- wngrams.setAutoDelete( true );
-
- QString buffer( buf );
- buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer
-
- // extract the ngrams
- for ( int size = 1; size <= MAXNGRAMSIZE; ++size )
- extractNGrams( buffer, ngrams, size );
-
- // sort the ngrams
- ngrams.sort();
-
- // count the occurrences of every ngram
- // and build the NGramList wngrams
- long occurrences;
- QStringList::Iterator ngram = ngrams.begin();
- while ( ngram != ngrams.end() )
- {
- QString currentNGram = *ngram;
-
- ngram++;
-
- occurrences = 1;
- while ( *ngram == currentNGram )
- {
- occurrences++;
- ngram++;
- }
-
- wngrams.inSort( new NGram( currentNGram, occurrences ) );
- }
-
- // the profile has to contain a maximum of MAXNGRAMS
- while ( wngrams.count() > MAXNGRAMS )
- wngrams.removeLast();
-
- return wngrams;
-}
-
-QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp )
-{
- long distance;
- long minscore = MAXSCORE;
- long threshold = minscore;
- LanguageList language_list;
- language_list.setAutoDelete( true );
- LanguageList candidates;
- candidates.setAutoDelete( true );
-
- // create the fingerprint of the buffer
- NGramsList file_ngrams = createFingerprintFromQString( buffer );
- if ( buffer.length() < MINDOCSIZE )
- return QString( "unknown" );
-
- // cycle through the list of managed languages
- // and build an ordered list of languages sorted by distance
- QMap<QString,LanguageProfile>::Iterator end( lp.end() );
- for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it )
- {
- QString lname = it.key();
- LanguageProfile language_ngrams = (LanguageProfile)it.data();
-
- // calculate the distance between the file profile and the language profile
- distance = calculateDistance( file_ngrams, language_ngrams );
-
- // calculate the threshold
- if ( distance < minscore )
- {
- minscore = distance;
- threshold = (long)( (double)distance * THRESHOLDVALUE );
- }
-
- language_list.inSort( new Language( lname, distance ) );
- }
-
- // now that the list of languages is sorted by distance
- // extract at most MAXCANDIDATES candidates
- int cnt = 0;
- Language* currentLanguage;
- QPtrList<Language>::Iterator language = language_list.begin();
- while ( language != language_list.end() )
- {
- currentLanguage = *language;
-
- if ( currentLanguage->distance <= threshold )
- {
- cnt++;
- if ( cnt == MAXCANDIDATES + 1 )
- break;
-
- candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) );
- }
-
- language++;
- }
-
- // If more than MAXCANDIDATES matches are found within the threshold,
- // the classifier reports unknown, because the input is obviously confusing
- if ( cnt == MAXCANDIDATES + 1 ) {
- return QString( "unknown" );
- } else {
- Language* first = candidates.getFirst();
- if ( first != 0L )
- return QString( first->language );
- else
- return QString( "unknown" );
- }
-}
-
-long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG )
-{
- long fileNGPos = 0L;
- long langNGPos = 0L;
- long distance = 0L;
-
- NGramsList::Iterator file_ngram = file_ngrams.begin();
- while ( file_ngram != file_ngrams.end() )
- {
- NGram* currentFileNGram = *file_ngram;
-
- // search the currentFileNGram in language_ngrams
- // and calculate the distance
- QMap<QString, long>::iterator ng = langNG.find( currentFileNGram->ngram );
-
- if ( ng == langNG.end() )
- {
- // not found
- distance = distance + MAXOUTOFPLACE;
- }
- else
- {
- //found
- langNGPos = ng.data();
- distance = distance + labs( langNGPos - fileNGPos );
- }
-
- fileNGPos++;
- file_ngram++;
- }
-
- return distance;
-}
-
-LanguageProfileMap* KatLanguageManager::loadAllLanguageProfiles()
-{
- LanguageProfileMap* lp = new LanguageProfileMap();
-
- // clear the language profile
- lp->clear();
-
- // find the Kat application data path
- QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true );
-
- //delete files have .klpd extension
- QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true );
- QStringList deletedFileLanguage;
- QStringList::Iterator end( deletedLanguageList.end() );
- for ( QStringList::Iterator it = deletedLanguageList.begin(); it != end; ++it )
- {
- KURL file( *it );
- QString tmp = file.filename().mid( 0, file.filename().length() - 5 );
- kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl;
- deletedFileLanguage.append( tmp );
- }
- // load the language profiles
- QStringList::Iterator endLang( m_languageFiles.end() );
- for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it )
- {
- QString lname = (*it).mid( 0, (*it).length()-4 );
- KURL tmpFile( *it );
- QString tmp = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
- //it was removed => don't load it
- if ( deletedFileLanguage.contains( tmp ) )
- continue;
-
- QString profilePath = *it ;
- QDomDocument doc( profilePath );
-
- QFile file( profilePath );
- if ( !file.exists() )
- return lp;
-
- if ( !file.open( IO_ReadOnly ) )
- {
- kdDebug() << "Impossible to open " << profilePath << endl;
- return lp;
- }
- QByteArray m_data = file.readAll();
-
- QString qs;
- if ( !doc.setContent( QString( m_data ).utf8(), &qs ) )
- {
- kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl;
- file.close();
- return lp;
- }
- file.close();
-
- // create the list of ngrams of the language profile
- LanguageProfile lprofile;
- lprofile.clear();
- QDomElement docElem = doc.documentElement();
- QDomNode n = docElem.firstChild();
- long index = 0L;
-
- while( !n.isNull() )
- {
- QDomElement e = n.toElement();
- if( !e.isNull() )
- lprofile.insert( QString( e.attribute( "value" ) ), index );
-
- index++;
- n = n.nextSibling();
- }
-
- QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
- //kdDebug() << " language insert :" << tmpLang << endl;
- lp->insert( tmpLang , lprofile );
- }
-
- return lp;
-}
-
diff --git a/src/plugins/language/language-compiler.c b/src/plugins/language/language-compiler.c
@@ -0,0 +1,91 @@
+/*
+ This file is part of libextractor.
+ (C) 2005 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+ */
+
+#include "platform.h"
+
+
+/**
+ * Read whitespace-separated tokens from the file "<argv[1]>.txt" and print
+ * a C source fragment on stdout.  Exits with -1 on usage, I/O or memory errors.
+ */
+int main(int argc,
+         char ** argv) {
+  int i;
+  int cnt;
+  char * fn;
+  char ** words;
+  char line[2048]; /* one token; keep the fscanf width below at size-1 */
+  FILE *dictin;
+#define ALLOCSIZE (1024*1024)
+
+  if (argc<2) {
+    fprintf(stderr, _("Please provide a list of klp files as arguments.\n"));
+    exit(-1);
+  }
+
+  fn = malloc(strlen(argv[1]) + 6);
+  if (fn == NULL) {
+    fprintf(stderr, _("Error allocating: %s\n."), strerror(errno));
+    exit(-1);
+  }
+  strcpy(fn, argv[1]);
+  strcat(fn, ".txt");
+  dictin=fopen(fn,"r");
+  if (dictin==NULL) { /* test before free(): free() may clobber errno */
+    fprintf(stderr, _("Error opening file `%s': %s\n"), argv[1], strerror(errno));
+    exit(-1);
+  }
+  free(fn);
+
+  words = malloc(sizeof(char*) * ALLOCSIZE); /* fixed capacity, enforced below */
+  if (words == NULL) {
+    fprintf(stderr, _("Error allocating: %s\n."), strerror(errno));
+    exit(-1);
+  }
+  cnt = 0;
+  /* the width limit keeps an over-long token from overrunning line[] */
+  while (1 == fscanf(dictin, "%2047s", line)) {
+    if (cnt >= ALLOCSIZE) { /* check before storing: words[] has ALLOCSIZE slots */
+      fprintf(stderr, _("Increase ALLOCSIZE (in %s).\n"), __FILE__);
+      exit(-1);
+    }
+    words[cnt] = strdup(line);
+    if (words[cnt] == NULL) {
+      fprintf(stderr, _("Error allocating: %s\n."), strerror(errno));
+      exit(-1);
+    }
+    cnt++;
+  }
+  fclose(dictin);
+
+  /* NOTE(review): `bf' and ADDR_PER_ELEMENT are not declared in this file (unless
+     platform.h provides them); this tail looks unfinished -- confirm before building. */
+  bf.addressesPerElement = ADDR_PER_ELEMENT;
+  bf.bitArraySize = cnt*4;
+  bf.bitArray = malloc(bf.bitArraySize);
+  memset(bf.bitArray, 0, bf.bitArraySize);
+
+  fprintf(stdout, "#include \"somefile.h\"\n");
+  fprintf(stdout, "static int bits[] = { ");
+  for (i=0;i<bf.bitArraySize/sizeof(int);i++)
+    fprintf(stdout, "%dL,", (((int*)bf.bitArray)[i]));
+  fprintf(stdout, "};\n");
+  return 0;
+}
diff --git a/src/plugins/language/languageextractor.c b/src/plugins/language/languageextractor.c
@@ -0,0 +1,331 @@
+/*
+ This file is part of libextractor.
+ (C) 2005 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+ */
+/* this code was adopted from Kat, original copyright below: */
+/***************************************************************************
+ * Copyright (C) 2005 by Roberto Cappuccio and the Kat team *
+ * Roberto Cappuccio : roberto.cappuccio@gmail.com *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+/**
+ * @file languageextractor.c
+ * @author Christian Grothoff
+ * @brief try to identify the language of the document using
+ * letter and letter-pair statistics
+ */
+
+#include "platform.h"
+#include "extractor.h"
+
+// Returns occurrences(item2) - occurrences(item1): zero for equal counts, positive when item2 has more.
+int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 )
+{
+ NGram* n1 = (NGram*)item1;
+ NGram* n2 = (NGram*)item2;
+ return n2->occurrences - n1->occurrences;
+}
+
+// Returns distance(item2) - distance(item1). NOTE(review): confirm inSort() then puts the closest language first.
+int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 )
+{
+ Language* n1 = (Language*)item1;
+ Language* n2 = (Language*)item2;
+ return n2->distance - n1->distance;
+}
+
+
+// Append every `size'-character substring of "_<str>_" (blanks replaced by '_') to ngrams.
+void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size )
+{
+  QString paddedString( str );
+  paddedString = '_' + paddedString.replace( QRegExp( " " ), "_" ) + '_';
+  // signed bound: with Qt 3's unsigned length(), "length() - size + 1" wraps when size > length() + 1
+  const int lastStart = (int)paddedString.length() - size;
+  for ( int i = 0; i <= lastStart; i++ )
+    ngrams.append( paddedString.mid( i, size ) );
+}
+
+// Read fileName, normalize its text and return the fingerprint of the result.
+NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName )
+{
+ QFile m_file( fileName );
+ QTextStream m_stream( &m_file );
+ bool m_open = m_file.open( IO_ReadOnly ); // NOTE(review): m_open is never checked; a failed open is not reported
+ QString buffer = m_stream.read();
+ m_file.close();
+ // normalize: lower-case, turn non-word characters and digits into blanks, then simplify whitespace
+ buffer = buffer.lower();
+ buffer = buffer.replace( QRegExp( "[\\W]" ), " " );
+ buffer = buffer.replace( QRegExp( "[0-9]" ), " " );
+ buffer = buffer.simplifyWhiteSpace();
+ return createFingerprintFromQString( buffer );
+}
+
+// Build the n-gram profile of buf: n-grams of sizes 1..MAXNGRAMSIZE taken from the first
+// MAXDOCSIZE characters, each with its occurrence count, trimmed to MAXNGRAMS entries.
+NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf )
+{
+  QStringList ngrams;
+  NGramsList wngrams;
+  wngrams.setAutoDelete( true );
+
+  QString buffer( buf );
+  buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer
+
+  // extract the ngrams of every size from 1 to MAXNGRAMSIZE
+  for ( int size = 1; size <= MAXNGRAMSIZE; ++size )
+    extractNGrams( buffer, ngrams, size );
+
+  // sort the ngrams so that equal ngrams become adjacent
+  ngrams.sort();
+
+  // count the occurrences of every ngram
+  // and build the NGramList wngrams
+  QStringList::Iterator ngram = ngrams.begin();
+  while ( ngram != ngrams.end() )
+  {
+    QString currentNGram = *ngram;
+    long occurrences = 1;
+    ++ngram;
+
+    // test for end() first: the last run must not dereference the end iterator
+    while ( ngram != ngrams.end() && *ngram == currentNGram )
+    {
+      occurrences++;
+      ++ngram;
+    }
+
+    wngrams.inSort( new NGram( currentNGram, occurrences ) );
+  }
+
+  // the profile has to contain a maximum of MAXNGRAMS
+  while ( wngrams.count() > MAXNGRAMS )
+    wngrams.removeLast();
+
+  return wngrams;
+}
+
+// Classify buffer against the profiles in lp. Returns the language of the first candidate within the
+// threshold, or "unknown" for short input, too many candidates (> MAXCANDIDATES) or no candidate.
+QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp )
+{
+  // too little text to classify: bail out before doing any fingerprinting work
+  if ( buffer.length() < MINDOCSIZE )
+    return QString( "unknown" );
+
+  long distance;
+  long minscore = MAXSCORE;
+  long threshold = minscore;
+  LanguageList language_list;
+  language_list.setAutoDelete( true );
+  LanguageList candidates;
+  candidates.setAutoDelete( true );
+
+  // create the fingerprint of the buffer
+  NGramsList file_ngrams = createFingerprintFromQString( buffer );
+
+  // cycle through the list of managed languages
+  // and build an ordered list of languages sorted by distance
+  QMap<QString,LanguageProfile>::Iterator end( lp.end() );
+  for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it )
+  {
+    QString lname = it.key();
+    LanguageProfile language_ngrams = (LanguageProfile)it.data();
+
+    // calculate the distance between the file profile and the language profile
+    distance = calculateDistance( file_ngrams, language_ngrams );
+
+    // the threshold follows the smallest distance seen so far
+    if ( distance < minscore )
+    {
+      minscore = distance;
+      threshold = (long)( (double)distance * THRESHOLDVALUE );
+    }
+
+    language_list.inSort( new Language( lname, distance ) );
+  }
+
+  // now that the list of languages is sorted by distance
+  // extract at most MAXCANDIDATES candidates
+  int cnt = 0;
+  Language* currentLanguage;
+  QPtrList<Language>::Iterator language = language_list.begin();
+  while ( language != language_list.end() )
+  {
+    currentLanguage = *language;
+
+    if ( currentLanguage->distance <= threshold )
+    {
+      cnt++;
+      if ( cnt == MAXCANDIDATES + 1 )
+        break;
+
+      candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) );
+    }
+
+    language++;
+  }
+
+  // If more than MAXCANDIDATES matches are found within the threshold,
+  // the classifier reports unknown, because the input is obviously confusing
+  if ( cnt == MAXCANDIDATES + 1 )
+    return QString( "unknown" );
+
+  Language* first = candidates.getFirst();
+  return ( first != 0L ) ? QString( first->language ) : QString( "unknown" );
+}
+
+// Distance between a document fingerprint and a language profile.
+// Walks file_ngrams in list order; an n-gram found in langNG contributes the
+// absolute difference between its two positions, a missing one contributes
+// MAXOUTOFPLACE.  Returns the sum of all contributions.
+long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG )
+{
+  long total = 0L;
+  long rank = 0L;   // position of the current n-gram within file_ngrams
+
+  for ( NGramsList::Iterator cur = file_ngrams.begin();
+        cur != file_ngrams.end();
+        cur++, rank++ )
+  {
+    NGram* entry = *cur;
+
+    // look the n-gram up in the language profile
+    QMap<QString, long>::iterator hit = langNG.find( entry->ngram );
+
+    if ( hit != langNG.end() )
+    {
+      // found: penalize by how far out of place it is
+      const long profileRank = hit.data();
+      total += labs( profileRank - rank );
+    }
+    else
+    {
+      // not found: fixed maximum penalty
+      total += MAXOUTOFPLACE;
+    }
+  }
+
+  return total;
+}
+
+// Load every kat/language/*.klp profile found through KGlobal::dirs() into a newly
+// allocated LanguageProfileMap, skipping languages that also have a *.klpd file.
+LanguageProfileMap* KatLanguageManager::loadAllLanguageProfiles()
+{
+ LanguageProfileMap* lp = new LanguageProfileMap();
+ // clear the language profile
+ lp->clear();
+ // find the Kat application data path
+ QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true );
+
+ // languages marked as deleted have a file with the .klpd extension
+ QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true );
+ QStringList deletedFileLanguage;
+ QStringList::Iterator end( deletedLanguageList.end() );
+ for ( QStringList::Iterator it = deletedLanguageList.begin(); it != end; ++it )
+ {
+ KURL file( *it );
+ QString tmp = file.filename().mid( 0, file.filename().length() - 5 ); // drop the 5-character ".klpd" suffix
+ kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl;
+ deletedFileLanguage.append( tmp );
+ }
+ // load the language profiles
+ QStringList::Iterator endLang( m_languageFiles.end() );
+ for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it )
+ {
+ QString lname = (*it).mid( 0, (*it).length()-4 ); // NOTE(review): lname is not used again in this function
+ KURL tmpFile( *it );
+ QString tmp = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
+ //it was removed => don't load it
+ if ( deletedFileLanguage.contains( tmp ) )
+ continue;
+
+ QString profilePath = *it ;
+ QDomDocument doc( profilePath );
+ // NOTE(review): each failure below returns the partially filled map instead of moving on to the next profile
+ QFile file( profilePath );
+ if ( !file.exists() )
+ return lp;
+
+ if ( !file.open( IO_ReadOnly ) )
+ {
+ kdDebug() << "Impossible to open " << profilePath << endl;
+ return lp;
+ }
+ QByteArray m_data = file.readAll();
+
+ QString qs;
+ if ( !doc.setContent( QString( m_data ).utf8(), &qs ) )
+ {
+ kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl;
+ file.close();
+ return lp;
+ }
+ file.close();
+
+ // create the list of ngrams of the language profile
+ LanguageProfile lprofile;
+ lprofile.clear();
+ QDomElement docElem = doc.documentElement();
+ QDomNode n = docElem.firstChild();
+ long index = 0L;
+
+ while( !n.isNull() )
+ {
+ QDomElement e = n.toElement();
+ if( !e.isNull() )
+ lprofile.insert( QString( e.attribute( "value" ) ), index );
+
+ index++; // advances for every child node, including nodes that are not elements
+ n = n.nextSibling();
+ }
+
+ QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
+ //kdDebug() << " language insert :" << tmpLang << endl;
+ lp->insert( tmpLang , lprofile );
+ }
+
+ return lp;
+}
+
+
+/* Extraction hook of the language plugin. Currently a stub: filename, buf and size are ignored and prev is returned unchanged. */
+struct EXTRACTOR_Keywords *
+libextractor_language_extract(const char * filename,
+ const char * buf,
+ size_t size,
+ struct EXTRACTOR_Keywords * prev) {
+ return prev;
+}