From 01e3e60dd64caeb9a441248d602b4c8a60e9413b Mon Sep 17 00:00:00 2001 From: Christian Grothoff Date: Sat, 19 Dec 2009 12:58:35 +0000 Subject: dos2unix --- src/plugins/translitextractor.c | 326 ++++++++++++++++++++-------------------- 1 file changed, 163 insertions(+), 163 deletions(-) diff --git a/src/plugins/translitextractor.c b/src/plugins/translitextractor.c index 8e8d525..0453156 100644 --- a/src/plugins/translitextractor.c +++ b/src/plugins/translitextractor.c @@ -1,128 +1,128 @@ -/* - This file is part of libextractor. - (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -/** - * @brief Transliterate keywords that contain international characters - * @author Nils Durner - */ - -#include "platform.h" -#include "extractor.h" -#include "convert.h" - -/* Language independent chars were taken from glibc's locale/C-translit.h.in - * - * This extractor uses two tables: one contains the Unicode - * characters and the other one contains the transliterations (since - * transliterations are often used more than once: ä -> ae, æ -> ae). - * The first table points to an appropriate transliteration stored in the - * second table. - * - * To generate the two tables, a relational database was prepared: - * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer); - * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer); - * - * After that, the data from glibc was converted to a SQL script using - * "awk -F '\t'": - * { - * transl = $2; - * gsub(/'/, "''", transl); - * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');"; - * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));"; - * } - * - * Then the SQL script was executed, "commit"ted and the relation between the - * two tables established using: - * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL); - * commit; - * - * The C arrays were then created with: - * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI; - * Select TRANSL || ', ' from TRANSL order by TRANSLID; - * and reformatted with: - * { - * a = $0; - * getline; - * b = $0; - * getline; - * c = $0; - * getline; - * printf("%s %s %s %s\n", a, b, c, $0); - * } - * - * The unicode values for the other characters were taken from - * http://bigfield.ddo.jp/unicode/unicode0.html - */ - unsigned int chars[][2] = { +/* + This file is part of libextractor. + (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +/** + * @brief Transliterate keywords that contain international characters + * @author Nils Durner + */ + +#include "platform.h" +#include "extractor.h" +#include "convert.h" + +/* Language independent chars were taken from glibc's locale/C-translit.h.in + * + * This extractor uses two tables: one contains the Unicode + * characters and the other one contains the transliterations (since + * transliterations are often used more than once: ä -> ae, æ -> ae). + * The first table points to an appropriate transliteration stored in the + * second table. + * + * To generate the two tables, a relational database was prepared: + * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer); + * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer); + * + * After that, the data from glibc was converted to a SQL script using + * "awk -F '\t'": + * { + * transl = $2; + * gsub(/'/, "''", transl); + * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');"; + * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));"; + * } + * + * Then the SQL script was executed, "commit"ted and the relation between the + * two tables established using: + * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL); + * commit; + * + * The C arrays were then created with: + * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI; + * Select TRANSL || ', ' from TRANSL order by TRANSLID; + * and reformatted with: + * { + * a = $0; + * getline; + * b = $0; + * getline; + * c = $0; + * getline; + * printf("%s %s %s %s\n", a, b, c, $0); + * } + * + * The unicode values for the other characters were taken from + * http://bigfield.ddo.jp/unicode/unicode0.html + */ + unsigned int chars[][2] = { {0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13}, - /* Ä, Ö, Ü, ß */ -{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ -{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ -{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ -{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ -{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ -{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ -{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ -{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ -{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */ - - /* Language independent */ -{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, - {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, - {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, - {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, - {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, - {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, - {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, - {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, - {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, - {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, - {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, - {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, - {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, - {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, - {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, - {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, - {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, - {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, - {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, - {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, - {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, - {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, - {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, - {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, - {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, - {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, - {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, - {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, - {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, - {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, - {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, - {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, - {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, - {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, - {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, - {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, - {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, - {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, - {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, + /* Ä, Ö, Ü, ß */ +{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ +{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ +{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ +{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ +{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ +{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ +{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ +{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ +{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */ + + /* Language independent */ +{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, + {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, + {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, + {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, + {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, + {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, + {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, + {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, + {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, + {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, + {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, + {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, + {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, + {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, + {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, + {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, + {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, + {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, + {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, + {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, + {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, + {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, + {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, + {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, + {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, + {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, + {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, + {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, + {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, + {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, + {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, + {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, + {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, + {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, + {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, + {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, + {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, + {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, + {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, {0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428}, {0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118}, {0x0001D4B9, 82}, {0x0001D4CA, 5}, {0x0001D4CB, 111}, {0x0001D4CC, 440}, @@ -430,9 +430,9 @@ "log", "lx", "mb", "mil", "mol", "PH", "p.m.", "PPM", "PR", "sr", "Sv", "Wb", "ff", "fi", "fl", "ffi", "ffl", "st", ";", "?", "!", "(", ")", "{", "}", "#", "&", "$", "%", "@", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "G", "J", "K", "O", "S", "T", "U", -"W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z", - /* German */ "Ae", "Oe", "Ue", "ue", - /* Scandinavian */ "Aa", "aa" +"W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z", + /* German */ "Ae", "Oe", "Ue", "ue", + /* Scandinavian */ "Aa", "aa" }; static void addKeyword (struct EXTRACTOR_Keywords **list, char *keyword, @@ -445,7 +445,7 @@ addKeyword (struct EXTRACTOR_Keywords **list, char *keyword, next->keywordType = type; *list = next; } - struct EXTRACTOR_Keywords * + struct EXTRACTOR_Keywords * libextractor_translit_extract (const char *filename, const char *data, size_t size, struct EXTRACTOR_Keywords *prev) { @@ -456,7 +456,7 @@ libextractor_translit_extract (const char *filename, const char *data, mem = 256; transl = malloc (mem + 1); while (pos != NULL) - + { int charlen = 0; char *srcdata = pos->keyword; @@ -468,67 +468,67 @@ libextractor_translit_extract (const char *filename, const char *data, long long unicode; int idx; char *tr; - - /* Get length of character */ + + /* Get length of character */ c = srcdata[src]; if ((c & 0xC0) == 0xC0) - - /* UTF-8 char */ + + /* UTF-8 char */ if ((c & 0xE0) == 0xE0) if ((c & 0xF0) == 0xF0) charlen = 4; - + else charlen = 3; - + else charlen = 2; - + else charlen = 1; if (src + charlen - 1 > len) { - - /* incomplete UTF-8 */ + + /* incomplete UTF-8 */ src = len; continue; } - - /* Copy character to destination */ + + /* Copy character to destination */ if (charlen > 1) { unicode = 0; if (charlen == 2) { - - /* 5 bits from the first byte and 6 bits from the second. - 64 = 2^6 */ + + /* 5 bits from the first byte and 6 bits from the second. + 64 = 2^6 */ unicode = ((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F); } - + else if (charlen == 3) { - - /* 4 bits from the first byte and 6 bits from the second and third - byte. 4096 = 2^12 */ - unicode = ((srcdata[src] & 0xF) * 4096) | + + /* 4 bits from the first byte and 6 bits from the second and third + byte. 4096 = 2^12 */ + unicode = ((srcdata[src] & 0xF) * 4096) | ((srcdata[src + 1] & 0x3F) * 64) | (srcdata[src + 2] & 0x3F); } - + else if (charlen == 4) { - - /* 3 bits from the first byte and 6 bits from the second, third - and fourth byte. 262144 = 2^18 */ - unicode = ((srcdata[src] & 7) * 262144) | - ((srcdata[src] & 0xF) * 4096) | + + /* 3 bits from the first byte and 6 bits from the second, third + and fourth byte. 262144 = 2^18 */ + unicode = ((srcdata[src] & 7) * 262144) | + ((srcdata[src] & 0xF) * 4096) | ((srcdata[src + 1] & 0x3F) * 64) | (srcdata[src + 2] & 0x3F); } - - /* Look it up */ + + /* Look it up */ idx = 0; tr = srcdata + src; trlen = charlen; @@ -536,8 +536,8 @@ libextractor_translit_extract (const char *filename, const char *data, { if (unicode == chars[idx][0]) { - - /* Found it */ + + /* Found it */ tr = translit[chars[idx][1]]; trlen = strlen (tr); break; @@ -545,7 +545,7 @@ libextractor_translit_extract (const char *filename, const char *data, idx++; } } - + else trlen = 1; if (dest + trlen > mem) @@ -555,11 +555,11 @@ libextractor_translit_extract (const char *filename, const char *data, } if (charlen > 1) { - - /* Copy character to destination string */ + + /* Copy character to destination string */ memcpy (transl + dest, tr, trlen); } - + else transl[dest] = c; dest += trlen; @@ -573,4 +573,4 @@ libextractor_translit_extract (const char *filename, const char *data, return prev; } - + -- cgit v1.2.3