diff options
author | Christian Grothoff <christian@grothoff.org> | 2009-12-19 12:58:35 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2009-12-19 12:58:35 +0000 |
commit | 01e3e60dd64caeb9a441248d602b4c8a60e9413b (patch) | |
tree | 7b861acf0d3098d5d93fa99effdb3e468952ff20 | |
parent | 4db7e805a2c0082b95b1cd2a735e687897adcf3b (diff) | |
download | libextractor-01e3e60dd64caeb9a441248d602b4c8a60e9413b.tar.gz libextractor-01e3e60dd64caeb9a441248d602b4c8a60e9413b.zip |
dos2unix
-rw-r--r-- | src/plugins/translitextractor.c | 326 |
1 files changed, 163 insertions, 163 deletions
diff --git a/src/plugins/translitextractor.c b/src/plugins/translitextractor.c index 8e8d525..0453156 100644 --- a/src/plugins/translitextractor.c +++ b/src/plugins/translitextractor.c | |||
@@ -1,128 +1,128 @@ | |||
1 | /* | 1 | /* |
2 | This file is part of libextractor. | 2 | This file is part of libextractor. |
3 | (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff | 3 | (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff |
4 | 4 | ||
5 | libextractor is free software; you can redistribute it and/or modify | 5 | libextractor is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published | 6 | it under the terms of the GNU General Public License as published |
7 | by the Free Software Foundation; either version 2, or (at your | 7 | by the Free Software Foundation; either version 2, or (at your |
8 | option) any later version. | 8 | option) any later version. |
9 | 9 | ||
10 | libextractor is distributed in the hope that it will be useful, but | 10 | libextractor is distributed in the hope that it will be useful, but |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | 11 | WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | General Public License for more details. | 13 | General Public License for more details. |
14 | 14 | ||
15 | You should have received a copy of the GNU General Public License | 15 | You should have received a copy of the GNU General Public License |
16 | along with libextractor; see the file COPYING. If not, write to the | 16 | along with libextractor; see the file COPYING. If not, write to the |
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
18 | Boston, MA 02111-1307, USA. | 18 | Boston, MA 02111-1307, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | /** | 21 | /** |
22 | * @brief Transliterate keywords that contain international characters | 22 | * @brief Transliterate keywords that contain international characters |
23 | * @author Nils Durner | 23 | * @author Nils Durner |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include "platform.h" | 26 | #include "platform.h" |
27 | #include "extractor.h" | 27 | #include "extractor.h" |
28 | #include "convert.h" | 28 | #include "convert.h" |
29 | 29 | ||
30 | /* Language independent chars were taken from glibc's locale/C-translit.h.in | 30 | /* Language independent chars were taken from glibc's locale/C-translit.h.in |
31 | * | 31 | * |
32 | * This extractor uses two tables: one contains the Unicode | 32 | * This extractor uses two tables: one contains the Unicode |
33 | * characters and the other one contains the transliterations (since | 33 | * characters and the other one contains the transliterations (since |
34 | * transliterations are often used more than once: ä -> ae, æ -> ae). | 34 | * transliterations are often used more than once: ä -> ae, æ -> ae). |
35 | * The first table points to an appropriate transliteration stored in the | 35 | * The first table points to an appropriate transliteration stored in the |
36 | * second table. | 36 | * second table. |
37 | * | 37 | * |
38 | * To generate the two tables, a relational database was prepared: | 38 | * To generate the two tables, a relational database was prepared: |
39 | * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer); | 39 | * create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer); |
40 | * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer); | 40 | * create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer); |
41 | * | 41 | * |
42 | * After that, the data from glibc was converted to a SQL script using | 42 | * After that, the data from glibc was converted to a SQL script using |
43 | * "awk -F '\t'": | 43 | * "awk -F '\t'": |
44 | * { | 44 | * { |
45 | * transl = $2; | 45 | * transl = $2; |
46 | * gsub(/'/, "''", transl); | 46 | * gsub(/'/, "''", transl); |
47 | * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');"; | 47 | * print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, index($3, ">") - 6) "', '" transl "');"; |
48 | * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));"; | 48 | * print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', (Select count(*) from TRANSL));"; |
49 | * } | 49 | * } |
50 | * | 50 | * |
51 | * The SQL script was then executed and committed, and the relation between the | 51 | * The SQL script was then executed and committed, and the relation between the |
52 | * two tables was established using: | 52 | * two tables was established using: |
53 | * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL); | 53 | * update TBL Set TRANSLID = (Select TRANSLID from TRANSL where TRANSL.TRANSL = TBL.TRANSL); |
54 | * commit; | 54 | * commit; |
55 | * | 55 | * |
56 | * The C arrays were then created with: | 56 | * The C arrays were then created with: |
57 | * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI; | 57 | * Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI; |
58 | * Select TRANSL || ', ' from TRANSL order by TRANSLID; | 58 | * Select TRANSL || ', ' from TRANSL order by TRANSLID; |
59 | * and reformatted with: | 59 | * and reformatted with: |
60 | * { | 60 | * { |
61 | * a = $0; | 61 | * a = $0; |
62 | * getline; | 62 | * getline; |
63 | * b = $0; | 63 | * b = $0; |
64 | * getline; | 64 | * getline; |
65 | * c = $0; | 65 | * c = $0; |
66 | * getline; | 66 | * getline; |
67 | * printf("%s %s %s %s\n", a, b, c, $0); | 67 | * printf("%s %s %s %s\n", a, b, c, $0); |
68 | * } | 68 | * } |
69 | * | 69 | * |
70 | * The Unicode values for the other characters were taken from | 70 | * The Unicode values for the other characters were taken from |
71 | * http://bigfield.ddo.jp/unicode/unicode0.html | 71 | * http://bigfield.ddo.jp/unicode/unicode0.html |
72 | */ | 72 | */ |
73 | unsigned int chars[][2] = { | 73 | unsigned int chars[][2] = { |
74 | {0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13}, | 74 | {0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13}, |
75 | /* Ä, Ö, Ü, ß */ | 75 | /* Ä, Ö, Ü, ß */ |
76 | {0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ | 76 | {0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ |
77 | {0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ | 77 | {0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ |
78 | {0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ | 78 | {0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ |
79 | {0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ | 79 | {0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ |
80 | {0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ | 80 | {0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ |
81 | {0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ | 81 | {0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ |
82 | {0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ | 82 | {0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ |
83 | {0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ | 83 | {0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ |
84 | {0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */ | 84 | {0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* ç, Œ, œ, € */ |
85 | 85 | ||
86 | /* Language independent */ | 86 | /* Language independent */ |
87 | {0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, | 87 | {0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, |
88 | {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, | 88 | {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, |
89 | {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, | 89 | {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, |
90 | {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, | 90 | {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, |
91 | {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, | 91 | {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, |
92 | {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, | 92 | {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, |
93 | {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, | 93 | {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, |
94 | {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, | 94 | {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, |
95 | {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, | 95 | {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, |
96 | {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, | 96 | {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, |
97 | {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, | 97 | {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, |
98 | {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, | 98 | {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, |
99 | {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, | 99 | {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, |
100 | {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, | 100 | {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, |
101 | {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, | 101 | {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, |
102 | {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, | 102 | {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, |
103 | {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, | 103 | {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, |
104 | {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, | 104 | {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, |
105 | {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, | 105 | {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, |
106 | {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, | 106 | {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, |
107 | {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, | 107 | {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, |
108 | {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, | 108 | {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, |
109 | {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, | 109 | {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, |
110 | {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, | 110 | {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, |
111 | {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, | 111 | {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, |
112 | {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, | 112 | {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, |
113 | {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, | 113 | {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, |
114 | {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, | 114 | {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, |
115 | {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, | 115 | {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, |
116 | {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, | 116 | {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, |
117 | {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, | 117 | {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, |
118 | {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, | 118 | {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, |
119 | {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, | 119 | {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, |
120 | {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, | 120 | {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, |
121 | {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, | 121 | {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, |
122 | {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, | 122 | {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, |
123 | {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, | 123 | {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, |
124 | {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, | 124 | {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, |
125 | {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, | 125 | {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, |
126 | {0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428}, | 126 | {0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428}, |
127 | {0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118}, | 127 | {0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118}, |
128 | {0x0001D4B9, 82}, {0x0001D4CA, 5}, {0x0001D4CB, 111}, {0x0001D4CC, 440}, | 128 | {0x0001D4B9, 82}, {0x0001D4CA, 5}, {0x0001D4CB, 111}, {0x0001D4CC, 440}, |
@@ -430,9 +430,9 @@ | |||
430 | "log", "lx", "mb", "mil", "mol", "PH", "p.m.", "PPM", "PR", "sr", "Sv", "Wb", "ff", "fi", | 430 | "log", "lx", "mb", "mil", "mol", "PH", "p.m.", "PPM", "PR", "sr", "Sv", "Wb", "ff", "fi", |
431 | "fl", "ffi", "ffl", "st", ";", "?", "!", "(", ")", "{", "}", "#", "&", "$", "%", "@", | 431 | "fl", "ffi", "ffl", "st", ";", "?", "!", "(", ")", "{", "}", "#", "&", "$", "%", "@", |
432 | "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "G", "J", "K", "O", "S", "T", "U", | 432 | "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "G", "J", "K", "O", "S", "T", "U", |
433 | "W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z", | 433 | "W", "Y", "[", "]", "a", "b", "f", "k", "n", "p", "q", "r", "t", "w", "y", "z", "z", |
434 | /* German */ "Ae", "Oe", "Ue", "ue", | 434 | /* German */ "Ae", "Oe", "Ue", "ue", |
435 | /* Scandinavian */ "Aa", "aa" | 435 | /* Scandinavian */ "Aa", "aa" |
436 | }; | 436 | }; |
437 | static void | 437 | static void |
438 | addKeyword (struct EXTRACTOR_Keywords **list, char *keyword, | 438 | addKeyword (struct EXTRACTOR_Keywords **list, char *keyword, |
@@ -445,7 +445,7 @@ addKeyword (struct EXTRACTOR_Keywords **list, char *keyword, | |||
445 | next->keywordType = type; | 445 | next->keywordType = type; |
446 | *list = next; | 446 | *list = next; |
447 | } | 447 | } |
448 | struct EXTRACTOR_Keywords * | 448 | struct EXTRACTOR_Keywords * |
449 | libextractor_translit_extract (const char *filename, const char *data, | 449 | libextractor_translit_extract (const char *filename, const char *data, |
450 | size_t size, struct EXTRACTOR_Keywords *prev) | 450 | size_t size, struct EXTRACTOR_Keywords *prev) |
451 | { | 451 | { |
@@ -456,7 +456,7 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
456 | mem = 256; | 456 | mem = 256; |
457 | transl = malloc (mem + 1); | 457 | transl = malloc (mem + 1); |
458 | while (pos != NULL) | 458 | while (pos != NULL) |
459 | 459 | ||
460 | { | 460 | { |
461 | int charlen = 0; | 461 | int charlen = 0; |
462 | char *srcdata = pos->keyword; | 462 | char *srcdata = pos->keyword; |
@@ -468,67 +468,67 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
468 | long long unicode; | 468 | long long unicode; |
469 | int idx; | 469 | int idx; |
470 | char *tr; | 470 | char *tr; |
471 | 471 | ||
472 | /* Get length of character */ | 472 | /* Get length of character */ |
473 | c = srcdata[src]; | 473 | c = srcdata[src]; |
474 | if ((c & 0xC0) == 0xC0) | 474 | if ((c & 0xC0) == 0xC0) |
475 | 475 | ||
476 | /* UTF-8 char */ | 476 | /* UTF-8 char */ |
477 | if ((c & 0xE0) == 0xE0) | 477 | if ((c & 0xE0) == 0xE0) |
478 | if ((c & 0xF0) == 0xF0) | 478 | if ((c & 0xF0) == 0xF0) |
479 | charlen = 4; | 479 | charlen = 4; |
480 | 480 | ||
481 | else | 481 | else |
482 | charlen = 3; | 482 | charlen = 3; |
483 | 483 | ||
484 | else | 484 | else |
485 | charlen = 2; | 485 | charlen = 2; |
486 | 486 | ||
487 | else | 487 | else |
488 | charlen = 1; | 488 | charlen = 1; |
489 | if (src + charlen - 1 > len) | 489 | if (src + charlen - 1 > len) |
490 | { | 490 | { |
491 | 491 | ||
492 | /* incomplete UTF-8 */ | 492 | /* incomplete UTF-8 */ |
493 | src = len; | 493 | src = len; |
494 | continue; | 494 | continue; |
495 | } | 495 | } |
496 | 496 | ||
497 | /* Copy character to destination */ | 497 | /* Copy character to destination */ |
498 | if (charlen > 1) | 498 | if (charlen > 1) |
499 | { | 499 | { |
500 | unicode = 0; | 500 | unicode = 0; |
501 | if (charlen == 2) | 501 | if (charlen == 2) |
502 | { | 502 | { |
503 | 503 | ||
504 | /* 5 bits from the first byte and 6 bits from the second. | 504 | /* 5 bits from the first byte and 6 bits from the second. |
505 | 64 = 2^6 */ | 505 | 64 = 2^6 */ |
506 | unicode = | 506 | unicode = |
507 | ((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F); | 507 | ((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F); |
508 | } | 508 | } |
509 | 509 | ||
510 | else if (charlen == 3) | 510 | else if (charlen == 3) |
511 | { | 511 | { |
512 | 512 | ||
513 | /* 4 bits from the first byte and 6 bits from the second and third | 513 | /* 4 bits from the first byte and 6 bits from the second and third |
514 | byte. 4096 = 2^12 */ | 514 | byte. 4096 = 2^12 */ |
515 | unicode = ((srcdata[src] & 0xF) * 4096) | | 515 | unicode = ((srcdata[src] & 0xF) * 4096) | |
516 | ((srcdata[src + 1] & 0x3F) * | 516 | ((srcdata[src + 1] & 0x3F) * |
517 | 64) | (srcdata[src + 2] & 0x3F); | 517 | 64) | (srcdata[src + 2] & 0x3F); |
518 | } | 518 | } |
519 | 519 | ||
520 | else if (charlen == 4) | 520 | else if (charlen == 4) |
521 | { | 521 | { |
522 | 522 | ||
523 | /* 3 bits from the first byte and 6 bits from the second, third | 523 | /* 3 bits from the first byte and 6 bits from the second, third |
524 | and fourth byte. 262144 = 2^18 */ | 524 | and fourth byte. 262144 = 2^18 */ |
525 | unicode = ((srcdata[src] & 7) * 262144) | | 525 | unicode = ((srcdata[src] & 7) * 262144) | |
526 | ((srcdata[src] & 0xF) * 4096) | | 526 | ((srcdata[src] & 0xF) * 4096) | |
527 | ((srcdata[src + 1] & 0x3F) * | 527 | ((srcdata[src + 1] & 0x3F) * |
528 | 64) | (srcdata[src + 2] & 0x3F); | 528 | 64) | (srcdata[src + 2] & 0x3F); |
529 | } | 529 | } |
530 | 530 | ||
531 | /* Look it up */ | 531 | /* Look it up */ |
532 | idx = 0; | 532 | idx = 0; |
533 | tr = srcdata + src; | 533 | tr = srcdata + src; |
534 | trlen = charlen; | 534 | trlen = charlen; |
@@ -536,8 +536,8 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
536 | { | 536 | { |
537 | if (unicode == chars[idx][0]) | 537 | if (unicode == chars[idx][0]) |
538 | { | 538 | { |
539 | 539 | ||
540 | /* Found it */ | 540 | /* Found it */ |
541 | tr = translit[chars[idx][1]]; | 541 | tr = translit[chars[idx][1]]; |
542 | trlen = strlen (tr); | 542 | trlen = strlen (tr); |
543 | break; | 543 | break; |
@@ -545,7 +545,7 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
545 | idx++; | 545 | idx++; |
546 | } | 546 | } |
547 | } | 547 | } |
548 | 548 | ||
549 | else | 549 | else |
550 | trlen = 1; | 550 | trlen = 1; |
551 | if (dest + trlen > mem) | 551 | if (dest + trlen > mem) |
@@ -555,11 +555,11 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
555 | } | 555 | } |
556 | if (charlen > 1) | 556 | if (charlen > 1) |
557 | { | 557 | { |
558 | 558 | ||
559 | /* Copy character to destination string */ | 559 | /* Copy character to destination string */ |
560 | memcpy (transl + dest, tr, trlen); | 560 | memcpy (transl + dest, tr, trlen); |
561 | } | 561 | } |
562 | 562 | ||
563 | else | 563 | else |
564 | transl[dest] = c; | 564 | transl[dest] = c; |
565 | dest += trlen; | 565 | dest += trlen; |
@@ -573,4 +573,4 @@ libextractor_translit_extract (const char *filename, const char *data, | |||
573 | return prev; | 573 | return prev; |
574 | } | 574 | } |
575 | 575 | ||
576 | 576 | ||