commit 01e3e60dd64caeb9a441248d602b4c8a60e9413b
parent 4db7e805a2c0082b95b1cd2a735e687897adcf3b
Author: Christian Grothoff <christian@grothoff.org>
Date: Sat, 19 Dec 2009 12:58:35 +0000
dos2unix
Diffstat:
1 file changed, 163 insertions(+), 163 deletions(-)
diff --git a/src/plugins/translitextractor.c b/src/plugins/translitextractor.c
@@ -1,128 +1,128 @@
-/*
- This file is part of libextractor.
- (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA.
- */
-
-/**
- * @brief Transliterate keywords that contain international characters
- * @author Nils Durner
- */
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-/* Language independent chars were taken from glibc's locale/C-translit.h.in
- *
- * This extractor uses two tables: one contains the Unicode
- * characters and the other one contains the transliterations (since
- * transliterations are often used more than once: ä -> ae, æ -> ae).
- * The first table points to an appropriate transliteration stored in the
- * second table.
- *
- * To generate the two tables, a relational database was prepared:
- * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
- * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
- *
- * After that, the data from glibc was converted to a SQL script using
- * "awk -F '\t'":
- * {
- * transl = $2;
- * gsub(/'/, "''", transl);
- * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');";
- * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));";
- * }
- *
- * Then the SQL script was executed, "commit"ted and the relation between the
- * two tables established using:
- * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL);
- * commit;
- *
- * The C arrays were then created with:
- * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
- * Select TRANSL || ', ' from TRANSL order by TRANSLID;
- * and reformatted with:
- * {
- * a = $0;
- * getline;
- * b = $0;
- * getline;
- * c = $0;
- * getline;
- * printf("%s %s %s %s\n", a, b, c, $0);
- * }
- *
- * The unicode values for the other characters were taken from
- * http://bigfield.ddo.jp/unicode/unicode0.html
- */
-unsigned int chars[][2] = {
+/*
+ This file is part of libextractor.
+ (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+ */
+
+/**
+ * @brief Transliterate keywords that contain international characters
+ * @author Nils Durner
+ */
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+
+/* Language independent chars were taken from glibc's locale/C-translit.h.in
+ *
+ * This extractor uses two tables: one contains the Unicode
+ * characters and the other one contains the transliterations (since
+ * transliterations are often used more than once: ä -> ae, æ -> ae).
+ * The first table points to an appropriate transliteration stored in the
+ * second table.
+ *
+ * To generate the two tables, a relational database was prepared:
+ * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
+ * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
+ *
+ * After that, the data from glibc was converted to a SQL script using
+ * "awk -F '\t'":
+ * {
+ * transl = $2;
+ * gsub(/'/, "''", transl);
+ * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');";
+ * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));";
+ * }
+ *
+ * Then the SQL script was executed, "commit"ted and the relation between the
+ * two tables established using:
+ * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL);
+ * commit;
+ *
+ * The C arrays were then created with:
+ * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
+ * Select TRANSL || ', ' from TRANSL order by TRANSLID;
+ * and reformatted with:
+ * {
+ * a = $0;
+ * getline;
+ * b = $0;
+ * getline;
+ * c = $0;
+ * getline;
+ * printf("%s %s %s %s\n", a, b, c, $0);
+ * }
+ *
+ * The unicode values for the other characters were taken from
+ * http://bigfield.ddo.jp/unicode/unicode0.html
+ */
+unsigned int chars[][2] = {
{0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13},
- /* Ä, Ö, Ü, ß */
-{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */
-{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */
-{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */
-{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */
-{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */
-{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */
-{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */
-{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */
-{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */
-
- /* Language independent */
-{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394},
- {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36},
- {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401},
- {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6},
- {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398},
- {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408},
- {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3},
- {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127},
- {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6},
- {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399},
- {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407},
- {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401},
- {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262},
- {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410},
- {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414},
- {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418},
- {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79},
- {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419},
- {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77},
- {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63},
- {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430},
- {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69},
- {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426},
- {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428},
- {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119},
- {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431},
- {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76},
- {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80},
- {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403},
- {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438},
- {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111},
- {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2},
- {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1},
- {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10},
- {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13},
- {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68},
- {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420},
- {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433},
- {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426},
+ /* Ä, Ö, Ü, ß */
+{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */
+{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */
+{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */
+{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */
+{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */
+{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */
+{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */
+{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */
+{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */
+
+ /* Language independent */
+{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394},
+ {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36},
+ {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401},
+ {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6},
+ {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398},
+ {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408},
+ {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3},
+ {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127},
+ {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6},
+ {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399},
+ {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407},
+ {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401},
+ {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262},
+ {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410},
+ {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414},
+ {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418},
+ {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79},
+ {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419},
+ {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77},
+ {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63},
+ {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430},
+ {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69},
+ {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426},
+ {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428},
+ {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119},
+ {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431},
+ {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76},
+ {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80},
+ {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403},
+ {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438},
+ {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111},
+ {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2},
+ {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1},
+ {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10},
+ {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13},
+ {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68},
+ {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420},
+ {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433},
+ {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426},
{0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428},
{0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118},
{0x0001D4B9, 82}, {0x0001D4CA, 5}, {0x0001D4CB, 111}, {0x0001D4CC, 440},
@@ -430,9 +430,9 @@
"log", "lx", "mb", "mil", "mol", "PH", "p.m.", "PPM", "PR", "sr", "Sv", "Wb", "ff", "fi",
"fl", "ffi", "ffl", "st", ";", "?", "!", "(", ")", "{", "}", "#", "&", "$", "%", "@",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "G", "J", "K", "O", "S", "T", "U",
-"W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z",
- /* German */ "Ae", "Oe", "Ue", "ue",
- /* Scandinavian */ "Aa", "aa"
+"W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z",
+ /* German */ "Ae", "Oe", "Ue", "ue",
+ /* Scandinavian */ "Aa", "aa"
};
static void
addKeyword (struct EXTRACTOR_Keywords **list, char *keyword,
@@ -445,7 +445,7 @@ addKeyword (struct EXTRACTOR_Keywords **list,
char *keyword,
next->keywordType = type;
*list = next;
}
-struct EXTRACTOR_Keywords *
+struct EXTRACTOR_Keywords *
libextractor_translit_extract (const char *filename, const char *data,
size_t size, struct EXTRACTOR_Keywords *prev)
{
@@ -456,7 +456,7 @@ libextractor_translit_extract (const char *filename,
const char *data,
mem = 256;
transl = malloc (mem + 1);
while (pos != NULL)
-
+
{
int charlen = 0;
char *srcdata = pos->keyword;
@@ -468,67 +468,67 @@ libextractor_translit_extract (const char *filename,
const char *data,
long long unicode;
int idx;
char *tr;
-
- /* Get length of character */
+
+ /* Get length of character */
c = srcdata[src];
if ((c & 0xC0) == 0xC0)
-
- /* UTF-8 char */
+
+ /* UTF-8 char */
if ((c & 0xE0) == 0xE0)
if ((c & 0xF0) == 0xF0)
charlen = 4;
-
+
else
charlen = 3;
-
+
else
charlen = 2;
-
+
else
charlen = 1;
if (src + charlen - 1 > len)
{
-
- /* incomplete UTF-8 */
+
+ /* incomplete UTF-8 */
src = len;
continue;
}
-
- /* Copy character to destination */
+
+ /* Copy character to destination */
if (charlen > 1)
{
unicode = 0;
if (charlen == 2)
{
-
- /* 5 bits from the first byte and 6 bits from the second.
- 64 = 2^6 */
+
+ /* 5 bits from the first byte and 6 bits from the second.
+ 64 = 2^6 */
unicode =
((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F);
}
-
+
else if (charlen == 3)
{
-
- /* 4 bits from the first byte and 6 bits from the second and third
- byte. 4096 = 2^12 */
- unicode = ((srcdata[src] & 0xF) * 4096) |
+
+ /* 4 bits from the first byte and 6 bits from the second and third
+ byte. 4096 = 2^12 */
+ unicode = ((srcdata[src] & 0xF) * 4096) |
((srcdata[src + 1] & 0x3F) *
64) | (srcdata[src + 2] & 0x3F);
}
-
+
else if (charlen == 4)
{
-
- /* 3 bits from the first byte and 6 bits from the second, third
- and fourth byte. 262144 = 2^18 */
- unicode = ((srcdata[src] & 7) * 262144) |
- ((srcdata[src] & 0xF) * 4096) |
+
+ /* 3 bits from the first byte and 6 bits from the second, third
+ and fourth byte. 262144 = 2^18 */
+ unicode = ((srcdata[src] & 7) * 262144) |
+ ((srcdata[src] & 0xF) * 4096) |
((srcdata[src + 1] & 0x3F) *
64) | (srcdata[src + 2] & 0x3F);
}
-
- /* Look it up */
+
+ /* Look it up */
idx = 0;
tr = srcdata + src;
trlen = charlen;
@@ -536,8 +536,8 @@ libextractor_translit_extract (const char *filename,
const char *data,
{
if (unicode == chars[idx][0])
{
-
- /* Found it */
+
+ /* Found it */
tr = translit[chars[idx][1]];
trlen = strlen (tr);
break;
@@ -545,7 +545,7 @@ libextractor_translit_extract (const char *filename,
const char *data,
idx++;
}
}
-
+
else
trlen = 1;
if (dest + trlen > mem)
@@ -555,11 +555,11 @@ libextractor_translit_extract (const char *filename,
const char *data,
}
if (charlen > 1)
{
-
- /* Copy character to destination string */
+
+ /* Copy character to destination string */
memcpy (transl + dest, tr, trlen);
}
-
+
else
transl[dest] = c;
dest += trlen;
@@ -573,4 +573,4 @@ libextractor_translit_extract (const char *filename,
const char *data,
return prev;
}
-
+