update - libextractor - GNU libextractor

commit f3f83427975148b9a303bb4cb069a56fa5a4d85e
parent 00072f35a3bd58eb2f9a734d3af1996370da9e0b
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 16 Sep 2005 08:15:07 +0000

update

Diffstat:
M src/plugins/language/Makefile.am  | 28 ++++++++++++++++++++++++++--
D src/plugins/language/katlanguagemanager.cpp  | 310 -------------------------------------------------------------------------------
A src/plugins/language/language-compiler.c  | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/language/languageextractor.c  | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 448 insertions(+), 312 deletions(-)
diff --git a/src/plugins/language/Makefile.am b/src/plugins/language/Makefile.am
@@ -1,6 +1,10 @@
-languagedir = $(datadir)/libextractor/language
+include ../Makefile-plugins.am
 
-language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
+noinst_PROGRAMS = language-compiler
+
+CLEANFILES = languages.c
+
+EXTRA_DIST = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
 	basque.klp belarus.klp bosnian.klp breton.klp bulgarian.klp catalan.klp \
 	chinese1.klp chinese2.klp chinese.klp croatian.klp czech.klp danish.klp dutch.klp \
 	english.klp esperanto.klp estonian.klp finnish.klp french.klp frisian.klp \
@@ -13,3 +17,23 @@ language_DATA = afrikaans.klp albanian.klp arabic2.klp arabic.klp armenian.klp \
 	serbian.klp slovak2.klp slovak.klp slovenian2.klp slovenian.klp spanish.klp \
 	swahili.klp tagalog.klp tamil.klp thai.klp turkish.klp ukrainian.klp \
 	vietnamese.klp welsh.klp
+
+# languages.c is generated from the .klp profiles by language-compiler.
+# The target must be spelled exactly like the file named in CLEANFILES and
+# libextractor_language_la_SOURCES, otherwise make has no rule to build it.
+languages.c: language-compiler$(EXEEXT)
+	./language-compiler$(EXEEXT) *.klp > languages.c
+
+language_compiler_SOURCES = \
+ language-compiler.c 
+language_compiler_LDADD = \
+ $(LIBINTL)
+
+plugin_LTLIBRARIES = \
+ libextractor_language.la
+
+libextractor_language_la_LIBADD = \
+  $(top_builddir)/src/main/libextractor.la
+
+libextractor_language_la_SOURCES = \
+  languageextractor.c languages.c
+libextractor_language_la_LDFLAGS = \
+ $(PLUGINFLAGS) $(retaincommand)
+
diff --git a/src/plugins/language/katlanguagemanager.cpp b/src/plugins/language/katlanguagemanager.cpp
@@ -1,310 +0,0 @@
-/***************************************************************************
- *   Copyright (C) 2005 by Roberto Cappuccio and the Kat team              *
- *   Roberto Cappuccio : roberto.cappuccio@gmail.com                       *
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- *   This program is distributed in the hope that it will be useful,       *
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
- *   GNU General Public License for more details.                          *
- *                                                                         *
- *   You should have received a copy of the GNU General Public License     *
- *   along with this program; if not, write to the                         *
- *   Free Software Foundation, Inc.,                                       *
- *   51 Franklin Steet, Fifth Floor, Boston, MA 02110-1301, USA.           *
- ***************************************************************************/
-
-#include <cstdlib>
-#include <kdebug.h>
-#include <kstandarddirs.h>
-#include <kio/job.h>
-#include <kio/jobclasses.h>
-#include <qregexp.h>
-#include <qdir.h>
-#include <qdom.h>
-
-#include "katlanguagemanager.h"
-
-int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 )
-{
-    NGram* n1 = (NGram*)item1;
-    NGram* n2 = (NGram*)item2;
-
-    return n2->occurrences - n1->occurrences;
-}
-
-int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 )
-{
-    Language* n1 = (Language*)item1;
-    Language* n2 = (Language*)item2;
-
-    return n2->distance - n1->distance;
-}
-
-KatLanguageManager::KatLanguageManager()
-{
-}
-
-KatLanguageManager::~KatLanguageManager()
-{
-}
-
-void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size )
-{
-    QString paddedString( str );
-
-    paddedString = paddedString.replace( QRegExp( " " ), "_" );
-    paddedString = '_' + paddedString + '_';
-
-    for( int i = 0; i < paddedString.length() - size + 1; i++ )
-        ngrams.append( paddedString.mid( i, size ) );
-}
-
-NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName )
-{
-    QFile m_file( fileName );
-    QTextStream m_stream( &m_file );
-    bool m_open = m_file.open( IO_ReadOnly );
-    QString buffer = m_stream.read();
-    m_file.close();
-
-    buffer = buffer.lower();
-    buffer = buffer.replace( QRegExp( "[\\W]" ), " " );
-    buffer = buffer.replace( QRegExp( "[0-9]" ), " " );
-    buffer = buffer.simplifyWhiteSpace();
-
-    return createFingerprintFromQString( buffer );
-}
-
-NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf )
-{
-    QStringList ngrams;
-    NGramsList wngrams;
-
-    wngrams.setAutoDelete( true );
-
-    QString buffer( buf );
-    buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer
-
-    // extract the ngrams
-    for ( int size = 1; size <= MAXNGRAMSIZE; ++size )
-        extractNGrams( buffer, ngrams, size );
-
-    // sort the ngrams
-    ngrams.sort();
-
-    // count the occurrences of every ngram
-    // and build the NGramList wngrams
-    long occurrences;
-    QStringList::Iterator ngram = ngrams.begin();
-    while ( ngram != ngrams.end() )
-    {
-        QString currentNGram = *ngram;
-
-        ngram++;
-
-        occurrences = 1;
-        while ( *ngram == currentNGram )
-        {
-            occurrences++;
-            ngram++;
-        }
-
-        wngrams.inSort( new NGram( currentNGram, occurrences ) );
-    }
-
-    // the profile has to contain a maximum of MAXNGRAMS
-    while ( wngrams.count() > MAXNGRAMS )
-        wngrams.removeLast();
-
-    return wngrams;
-}
-
-QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp )
-{
-    long distance;
-    long minscore = MAXSCORE;
-    long threshold = minscore;
-    LanguageList language_list;
-    language_list.setAutoDelete( true );
-    LanguageList candidates;
-    candidates.setAutoDelete( true );
-
-    // create the fingerprint of the buffer
-    NGramsList file_ngrams = createFingerprintFromQString( buffer );
-    if ( buffer.length() < MINDOCSIZE )
-        return QString( "unknown" );
-
-    // cycle through the list of managed languages
-    // and build an ordered list of languages sorted by distance
-    QMap<QString,LanguageProfile>::Iterator end( lp.end() );
-    for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it )
-    {
-        QString lname = it.key();
-        LanguageProfile language_ngrams = (LanguageProfile)it.data();
-
-        // calculate the distance between the file profile and the language profile
-        distance = calculateDistance( file_ngrams, language_ngrams );
-
-        // calculate the threshold
-        if ( distance < minscore )
-        {
-            minscore = distance;
-            threshold = (long)( (double)distance * THRESHOLDVALUE );
-        }
-
-        language_list.inSort( new Language( lname, distance ) );
-    }
-
-    // now that the list of languages is sorted by distance
-    // extract at most MAXCANDIDATES candidates
-    int cnt = 0;
-    Language* currentLanguage;
-    QPtrList<Language>::Iterator language = language_list.begin();
-    while ( language != language_list.end() )
-    {
-        currentLanguage = *language;
-
-        if ( currentLanguage->distance <= threshold )
-        {
-            cnt++;
-            if ( cnt == MAXCANDIDATES + 1 )
-                break;
-
-            candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) );
-        }
-
-        language++;
-    }
-
-    // If more than MAXCANDIDATES matches are found within the threshold,
-    // the classifier reports unknown, because the input is obviously confusing
-    if ( cnt == MAXCANDIDATES + 1 ) {
-        return QString( "unknown" );
-    } else {
-        Language* first = candidates.getFirst();
-        if ( first != 0L )
-            return QString( first->language );
-        else
-            return QString( "unknown" );
-    }
-}
-
-long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG )
-{
-    long fileNGPos = 0L;
-    long langNGPos = 0L;
-    long distance = 0L;
-
-    NGramsList::Iterator file_ngram = file_ngrams.begin();
-    while ( file_ngram != file_ngrams.end() )
-    {
-        NGram* currentFileNGram = *file_ngram;
-
-        // search the currentFileNGram in language_ngrams
-        // and calculate the distance
-        QMap<QString, long>::iterator ng = langNG.find( currentFileNGram->ngram );
-
-        if ( ng == langNG.end() )
-        {
-            // not found
-            distance = distance + MAXOUTOFPLACE;
-        }
-        else
-        {
-            //found
-            langNGPos = ng.data();
-            distance = distance + labs( langNGPos - fileNGPos );
-        }
-
-        fileNGPos++;
-        file_ngram++;
-    }
-
-    return distance;
-}
-
-LanguageProfileMap* KatLanguageManager::loadAllLanguageProfiles()
-{
-    LanguageProfileMap* lp = new LanguageProfileMap();
-
-    // clear the language profile
-    lp->clear();
-
-    // find the Kat application data path
-    QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true );
-
-    //delete files have .klpd extension
-    QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true );
-    QStringList deletedFileLanguage;
-    QStringList::Iterator end( deletedLanguageList.end() );
-    for ( QStringList::Iterator it = deletedLanguageList.begin(); it != end; ++it )
-    {
-        KURL file( *it );
-        QString tmp = file.filename().mid( 0, file.filename().length() - 5 );
-        kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl;
-        deletedFileLanguage.append( tmp );
-    }
-    // load the language profiles
-    QStringList::Iterator endLang( m_languageFiles.end() );
-    for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it )
-    {
-        QString lname = (*it).mid( 0, (*it).length()-4 );
-        KURL tmpFile( *it );
-        QString tmp = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
-        //it was removed => don't load it
-        if ( deletedFileLanguage.contains( tmp ) )
-            continue;
-
-        QString profilePath = *it ;
-        QDomDocument doc( profilePath );
-
-        QFile file( profilePath );
-        if ( !file.exists() )
-            return lp;
-
-        if ( !file.open( IO_ReadOnly ) )
-        {
-            kdDebug() << "Impossible to open " << profilePath << endl;
-            return lp;
-        }
-        QByteArray m_data = file.readAll();
-
-        QString qs;
-        if ( !doc.setContent( QString( m_data ).utf8(), &qs ) )
-        {
-            kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl;
-            file.close();
-            return lp;
-        }
-        file.close();
-
-        // create the list of ngrams of the language profile
-        LanguageProfile lprofile;
-        lprofile.clear();
-        QDomElement docElem = doc.documentElement();
-        QDomNode n = docElem.firstChild();
-        long index = 0L;
-
-        while( !n.isNull() )
-        {
-            QDomElement e = n.toElement();
-            if( !e.isNull() )
-                lprofile.insert( QString( e.attribute( "value" ) ), index );
-
-            index++;
-            n = n.nextSibling();
-        }
-
-        QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
-        //kdDebug() << " language insert :" << tmpLang << endl;
-        lp->insert( tmpLang , lprofile );
-    }
-
-    return lp;
-}
-
diff --git a/src/plugins/language/language-compiler.c b/src/plugins/language/language-compiler.c
@@ -0,0 +1,91 @@
+/*
+     This file is part of libextractor.
+     (C) 2005 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+
+#include "platform.h"
+
+
+/**
+ * Reads whitespace-separated words from the file named argv[1] with
+ * ".txt" appended and writes a C source fragment (an include line and
+ * a static int array) to stdout.
+ *
+ * @param argc number of command-line arguments, at least 2 is required
+ * @param argv argv[1] is the input name (".txt" is appended before opening)
+ * @return 0 on success; the process exits with -1 on usage, I/O or
+ *         allocation errors
+ */
+int main(int argc,
+	 char ** argv) {
+  int i;
+  int cnt;
+  char * fn;
+  char ** words;
+  char line[2048];
+  FILE *dictin;
+/* parenthesised so the macro stays one value inside larger expressions */
+#define ALLOCSIZE (1024*1024)
+
+  if (argc<2) {
+    fprintf(stderr,
+	    _("Please provide a list of klp files as arguments.\n"));
+    exit(-1);
+  }
+
+  fn = malloc(strlen(argv[1]) + 6);
+  if (fn == NULL) {
+    fprintf(stderr,
+	    _("Error allocating: %s.\n"),
+	    strerror(errno));
+    exit(-1);
+  }
+  strcpy(fn, argv[1]);
+  strcat(fn, ".txt");
+  dictin=fopen(fn,"r");
+  free(fn);
+  if (dictin==NULL) {
+    fprintf(stderr,
+	    _("Error opening file `%s': %s\n"),
+	    argv[1],strerror(errno));
+    exit(-1);
+  }
+
+  words = malloc(sizeof(char*) * ALLOCSIZE);
+  if (words == NULL) {
+    fprintf(stderr,
+	    _("Error allocating: %s.\n"),
+	    strerror(errno));
+    exit(-1);
+  }
+  cnt = 0;
+  /* the field width keeps fscanf from writing past the end of line[]
+     (2047 characters plus the terminating NUL) */
+  while (1 == fscanf(dictin, "%2047s", line)) {
+    /* check before storing: words[] has exactly ALLOCSIZE slots */
+    if (cnt >= ALLOCSIZE) {
+      fprintf(stderr,
+	      _("Increase ALLOCSIZE (in %s).\n"),
+	      __FILE__);
+      exit(-1);
+    }
+    words[cnt] = strdup(line);
+    if (words[cnt] == NULL) {
+      fprintf(stderr,
+	      _("Error allocating: %s.\n"),
+	      strerror(errno));
+      exit(-1);
+    }
+    cnt++;
+  }
+  fclose(dictin);
+
+  /* NOTE(review): `bf` and ADDR_PER_ELEMENT are not declared anywhere in
+     this function; confirm where they are supposed to come from before
+     relying on this part of the output. */
+  bf.addressesPerElement = ADDR_PER_ELEMENT;
+  bf.bitArraySize = cnt*4;
+  bf.bitArray = malloc(bf.bitArraySize);
+  memset(bf.bitArray, 0, bf.bitArraySize);
+
+  fprintf(stdout,
+	  "#include \"somefile.h\"\n");
+  fprintf(stdout,
+	  "static int bits[] = { ");
+  for (i=0;i<bf.bitArraySize/sizeof(int);i++)
+    fprintf(stdout,
+	    "%dL,",
+	    (((int*)bf.bitArray)[i]));
+  fprintf(stdout,
+	  "};\n");
+  return 0;
+}
diff --git a/src/plugins/language/languageextractor.c b/src/plugins/language/languageextractor.c
@@ -0,0 +1,331 @@
+/*
+     This file is part of libextractor.
+     (C) 2005 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+/* this code was adapted from Kat, original copyright below: */
+/***************************************************************************
+ *   Copyright (C) 2005 by Roberto Cappuccio and the Kat team              *
+ *   Roberto Cappuccio : roberto.cappuccio@gmail.com                       *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
+ ***************************************************************************/
+
+/**
+ * @file languageextractor.c
+ * @author Christian Grothoff
+ * @brief try to identify the language of the document using 
+ *        letter and letter-pair statistics 
+ */
+
+#include "platform.h"
+#include "extractor.h"
+
+/**
+ * Comparison callback for NGramsList: compares two NGram items by their
+ * occurrence counts.
+ *
+ * @param item1 first item, expected to point to an NGram
+ * @param item2 second item, expected to point to an NGram
+ * @return occurrences of item2 minus occurrences of item1, i.e. a negative
+ *         value when item1 occurs more often than item2
+ */
+int NGramsList::compareItems( QCollection::Item item1, QCollection::Item item2 )
+{
+    const NGram* lhs = static_cast<NGram*>( item1 );
+    const NGram* rhs = static_cast<NGram*>( item2 );
+    return rhs->occurrences - lhs->occurrences;
+}
+
+/**
+ * Comparison callback for LanguageList: compares two Language items by
+ * their distance values.
+ *
+ * NOTE(review): the result is negative when item1 has the LARGER distance.
+ * If the list sorts ascending by this callback, the worst match ends up
+ * first; confirm this is the intended direction.
+ *
+ * @param item1 first item, expected to point to a Language
+ * @param item2 second item, expected to point to a Language
+ * @return distance of item2 minus distance of item1
+ */
+int LanguageList::compareItems( QCollection::Item item1, QCollection::Item item2 )
+{
+    const Language* lhs = static_cast<Language*>( item1 );
+    const Language* rhs = static_cast<Language*>( item2 );
+    return rhs->distance - lhs->distance;
+}
+
+
+/**
+ * Appends every substring of length `size` of the padded input to `ngrams`.
+ * Before slicing, each space in `str` is replaced by '_' and one '_' is
+ * added at the front and at the back.
+ *
+ * @param str    text to slice
+ * @param ngrams list the substrings are appended to (existing entries are kept)
+ * @param size   length of each substring
+ */
+void KatLanguageManager::extractNGrams( const QString& str, QStringList& ngrams, int size )
+{
+    QString paddedString( str );
+
+    paddedString = paddedString.replace( QRegExp( " " ), "_" );
+    paddedString = '_' + paddedString + '_';
+
+    // Compute the bound in signed arithmetic. If length() is unsigned (as in
+    // Qt 3), "length() - size + 1" wraps to a huge value whenever
+    // size > length() + 1, and the loop would append empty strings for a very
+    // long time instead of doing nothing.
+    const int windows = static_cast<int>( paddedString.length() ) - size + 1;
+    for( int i = 0; i < windows; i++ )
+        ngrams.append( paddedString.mid( i, size ) );
+}
+
+/**
+ * Builds an n-gram fingerprint from the contents of a file.
+ * The text is lower-cased, non-word characters and digits are replaced by
+ * spaces, and whitespace is simplified before fingerprinting.
+ *
+ * @param fileName path of the file to read
+ * @return the fingerprint of the normalised text; a file that cannot be
+ *         opened is fingerprinted as empty text
+ */
+NGramsList KatLanguageManager::createFingerprintFromFile( const QString& fileName )
+{
+    QString buffer;
+
+    // Only read from the stream if the file could actually be opened; the
+    // original stored the result of open() in an unused local and read anyway.
+    QFile m_file( fileName );
+    if ( m_file.open( IO_ReadOnly ) )
+    {
+        QTextStream m_stream( &m_file );
+        buffer = m_stream.read();
+        m_file.close();
+    }
+
+    buffer = buffer.lower();
+    buffer = buffer.replace( QRegExp( "[\\W]" ), " " );
+    buffer = buffer.replace( QRegExp( "[0-9]" ), " " );
+    buffer = buffer.simplifyWhiteSpace();
+
+    return createFingerprintFromQString( buffer );
+}
+
+/**
+ * Builds an n-gram fingerprint of a text: extracts all n-grams of length
+ * 1..MAXNGRAMSIZE from the first MAXDOCSIZE characters, counts how often
+ * each one occurs and keeps at most MAXNGRAMS entries.
+ *
+ * NOTE(review): wngrams has auto-delete enabled and is returned by value;
+ * confirm the returned copy cannot end up referring to deleted items.
+ *
+ * @param buf text to fingerprint
+ * @return list of NGram entries, inserted via inSort()
+ */
+NGramsList KatLanguageManager::createFingerprintFromQString( const QString& buf )
+{
+    QStringList ngrams;
+    NGramsList wngrams;
+
+    wngrams.setAutoDelete( true );
+
+    QString buffer( buf );
+    buffer.truncate( MAXDOCSIZE ); // only use the first MAXDOCSIZE characters of the buffer
+
+    // extract the ngrams of every length from 1 to MAXNGRAMSIZE
+    for ( int size = 1; size <= MAXNGRAMSIZE; ++size )
+        extractNGrams( buffer, ngrams, size );
+
+    // sort so that equal ngrams become adjacent and can be counted in one pass
+    ngrams.sort();
+
+    // count the occurrences of every ngram
+    // and build the NGramList wngrams
+    QStringList::Iterator ngram = ngrams.begin();
+    while ( ngram != ngrams.end() )
+    {
+        QString currentNGram = *ngram;
+        long occurrences = 1;
+
+        ++ngram;
+
+        // test against end() first: the last run of equal ngrams finishes at
+        // end(), which must not be dereferenced
+        while ( ngram != ngrams.end() && *ngram == currentNGram )
+        {
+            occurrences++;
+            ++ngram;
+        }
+
+        wngrams.inSort( new NGram( currentNGram, occurrences ) );
+    }
+
+    // the profile has to contain a maximum of MAXNGRAMS
+    while ( wngrams.count() > MAXNGRAMS )
+        wngrams.removeLast();
+
+    return wngrams;
+}
+
+/**
+ * Guesses the language of a text by comparing its n-gram fingerprint with
+ * every profile in `lp`.
+ *
+ * @param buffer text to classify
+ * @param lp     map from language name to language profile
+ * @return the name of the selected language, or "unknown" when the text is
+ *         shorter than MINDOCSIZE, when more than MAXCANDIDATES profiles
+ *         fall within the threshold, or when no candidate is found
+ */
+QString KatLanguageManager::identifyLanguage( const QString& buffer, LanguageProfileMap lp )
+{
+    // reject short documents before doing any work: the fingerprint is not
+    // needed for them, so there is no reason to compute it first
+    if ( buffer.length() < MINDOCSIZE )
+        return QString( "unknown" );
+
+    long distance;
+    long minscore = MAXSCORE;
+    long threshold = minscore;
+    LanguageList language_list;
+    language_list.setAutoDelete( true );
+    LanguageList candidates;
+    candidates.setAutoDelete( true );
+
+    // create the fingerprint of the buffer
+    NGramsList file_ngrams = createFingerprintFromQString( buffer );
+
+    // cycle through the list of managed languages
+    // and build an ordered list of languages sorted by distance
+    QMap<QString,LanguageProfile>::Iterator end( lp.end() );
+    for ( QMap<QString,LanguageProfile>::Iterator it = lp.begin(); it != end; ++it )
+    {
+        QString lname = it.key();
+        LanguageProfile language_ngrams = (LanguageProfile)it.data();
+
+        // calculate the distance between the file profile and the language profile
+        distance = calculateDistance( file_ngrams, language_ngrams );
+
+        // the threshold follows the smallest distance seen so far,
+        // scaled by THRESHOLDVALUE
+        if ( distance < minscore )
+        {
+            minscore = distance;
+            threshold = (long)( (double)distance * THRESHOLDVALUE );
+        }
+
+        language_list.inSort( new Language( lname, distance ) );
+    }
+
+    // now that the list of languages is sorted by distance
+    // extract at most MAXCANDIDATES candidates
+    int cnt = 0;
+    Language* currentLanguage;
+    QPtrList<Language>::Iterator language = language_list.begin();
+    while ( language != language_list.end() )
+    {
+        currentLanguage = *language;
+
+        if ( currentLanguage->distance <= threshold )
+        {
+            cnt++;
+            if ( cnt == MAXCANDIDATES + 1 )
+                break;
+
+            candidates.inSort( new Language( currentLanguage->language, currentLanguage->distance ) );
+        }
+
+        ++language;
+    }
+
+    // If more than MAXCANDIDATES matches are found within the threshold,
+    // the classifier reports unknown, because the input is obviously confusing
+    if ( cnt == MAXCANDIDATES + 1 ) {
+        return QString( "unknown" );
+    } else {
+        // NOTE(review): which candidate is "first" depends on the sort
+        // direction implemented by LanguageList::compareItems; confirm that
+        // it really is the candidate with the smallest distance.
+        Language* first = candidates.getFirst();
+        if ( first != 0L )
+            return QString( first->language );
+        else
+            return QString( "unknown" );
+    }
+}
+
+/**
+ * Computes the distance between a document fingerprint and a language
+ * profile. Each n-gram of the fingerprint contributes the absolute
+ * difference between its position in the fingerprint and the value stored
+ * for it in the profile, or MAXOUTOFPLACE when the profile does not
+ * contain it.
+ *
+ * @param file_ngrams fingerprint of the document
+ * @param langNG      language profile, looked up by n-gram text
+ * @return the sum of all per-n-gram contributions
+ */
+long KatLanguageManager::calculateDistance( NGramsList& file_ngrams, LanguageProfile& langNG )
+{
+    long total = 0L;
+    long rank = 0L;   // position of the current n-gram within file_ngrams
+
+    for ( NGramsList::Iterator cur = file_ngrams.begin();
+          cur != file_ngrams.end();
+          cur++, rank++ )
+    {
+        NGram* entry = *cur;
+        QMap<QString, long>::iterator hit = langNG.find( entry->ngram );
+
+        if ( hit != langNG.end() )
+        {
+            // present in the profile: penalise by how far apart the positions are
+            const long profilePos = hit.data();
+            total += labs( profilePos - rank );
+        }
+        else
+        {
+            // absent from the profile: fixed penalty
+            total += MAXOUTOFPLACE;
+        }
+    }
+
+    return total;
+}
+
+/**
+ * Loads every "kat/language/*.klp" profile found among the "data"
+ * resources into a newly allocated map, skipping languages for which a
+ * matching "*.klpd" marker file exists.
+ *
+ * Each profile is parsed as XML; the "value" attribute of every child
+ * element of the document element becomes a key, mapped to the position of
+ * that child node.
+ *
+ * NOTE(review): a profile that is missing, cannot be opened or cannot be
+ * parsed makes the function return immediately with the profiles loaded so
+ * far; the remaining files are not tried. Confirm this is intended.
+ *
+ * @return map allocated with new, keyed by file name without the ".klp"
+ *         suffix; presumably the caller has to delete it
+ */
+LanguageProfileMap* KatLanguageManager::loadAllLanguageProfiles()
+{
+    LanguageProfileMap* lp = new LanguageProfileMap();
+
+    // clear the language profile
+    lp->clear();
+
+    // find the Kat application data path
+    QStringList m_languageFiles = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klp", false, true );
+
+    // languages marked as deleted have a file with the .klpd extension
+    QStringList deletedLanguageList = KGlobal::dirs()->findAllResources( "data", "kat/language/*.klpd", false, true );
+    QStringList deletedFileLanguage;
+    QStringList::Iterator end( deletedLanguageList.end() );
+    for ( QStringList::Iterator it = deletedLanguageList.begin(); it != end; ++it )
+    {
+        KURL file( *it );
+        // strip the 5-character ".klpd" suffix
+        QString tmp = file.filename().mid( 0, file.filename().length() - 5 );
+        kdDebug() << "loadAllLanguageProfiles tmp :" << tmp << endl;
+        deletedFileLanguage.append( tmp );
+    }
+    // load the language profiles
+    QStringList::Iterator endLang( m_languageFiles.end() );
+    for ( QStringList::Iterator it = m_languageFiles.begin(); it != endLang; ++it )
+    {
+        KURL tmpFile( *it );
+        // language name = file name without the 4-character ".klp" suffix
+        QString tmpLang = tmpFile.filename().mid( 0, tmpFile.filename().length() - 4 );
+        //it was removed => don't load it
+        if ( deletedFileLanguage.contains( tmpLang ) )
+            continue;
+
+        QString profilePath = *it ;
+        QDomDocument doc( profilePath );
+
+        QFile file( profilePath );
+        if ( !file.exists() )
+            return lp;
+
+        if ( !file.open( IO_ReadOnly ) )
+        {
+            kdDebug() << "Impossible to open " << profilePath << endl;
+            return lp;
+        }
+        QByteArray m_data = file.readAll();
+
+        QString qs;
+        if ( !doc.setContent( QString( m_data ).utf8(), &qs ) )
+        {
+            kdDebug() << "Impossible to set content from " << profilePath << " ERROR: " << qs << endl;
+            file.close();
+            return lp;
+        }
+        file.close();
+
+        // create the list of ngrams of the language profile
+        LanguageProfile lprofile;
+        lprofile.clear();
+        QDomElement docElem = doc.documentElement();
+        QDomNode n = docElem.firstChild();
+        long index = 0L;
+
+        while( !n.isNull() )
+        {
+            QDomElement e = n.toElement();
+            if( !e.isNull() )
+                lprofile.insert( QString( e.attribute( "value" ) ), index );
+
+            // the index advances for every child node, element or not
+            index++;
+            n = n.nextSibling();
+        }
+
+        //kdDebug() << " language insert :" << tmpLang << endl;
+        lp->insert( tmpLang , lprofile );
+    }
+
+    return lp;
+}
+
+
+
+/**
+ * Extraction entry point of the language plugin.
+ * Currently a stub: no input is inspected and nothing is added.
+ *
+ * @param filename name of the file being processed (unused)
+ * @param buf contents of the file (unused)
+ * @param size number of bytes in buf (unused)
+ * @param prev keyword list built so far
+ * @return prev, unchanged
+ */
+struct EXTRACTOR_Keywords * 
+libextractor_language_extract(const char * filename,
+			      const char * buf,
+			      size_t size,
+			      struct EXTRACTOR_Keywords * prev) {
+  /* the inputs are deliberately ignored until detection is implemented */
+  (void) filename;
+  (void) buf;
+  (void) size;
+  return prev;
+}

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	src/plugins/language/Makefile.am	\|	28	++++++++++++++++++++++++++--
D	src/plugins/language/katlanguagemanager.cpp	\|	310	-------------------------------------------------------------------------------
A	src/plugins/language/language-compiler.c	\|	91	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/plugins/language/languageextractor.c	\|	331	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++