1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdlib.h>
30 #include <sal/main.h>
31 #include <sal/types.h>
32 #include <rtl/strbuf.hxx>
33 #include <rtl/ustring.hxx>
34 
35 #include <vector>
36 
37 using namespace ::rtl;
38 
// Table generators, one per dictionary type (defined further down in this
// file).  Each one reads the dictionary source from sfp and writes the
// generated C++ tables and accessor functions to cfp.
void make_hhc_char(FILE *sfp, FILE *cfp);
void make_stc_char(FILE *sfp, FILE *cfp);
void make_stc_word(FILE *sfp, FILE *cfp);
42 
43 /* Main Procedure */
44 
45 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
46 {
47 	FILE *sfp, *cfp;
48 
49 	if (argc < 4) exit(-1);
50 
51 
52 	sfp = fopen(argv[2], "rb");	// open the source file for read;
53 	if (sfp == NULL)
54     {
55 	    printf("Open the dictionary source file failed.");
56         return -1;
57     }
58 
59 	// create the C source file to write
60 	cfp = fopen(argv[3], "wb");
61 	if (cfp == NULL) {
62 	    fclose(sfp);
63 	    printf("Can't create the C source file.");
64         return -1;
65 	}
66 
67 	fprintf(cfp, "/*\n");
68 	fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
69 	fprintf(cfp, " * All Rights Reserved.\n");
70 	fprintf(cfp, " */\n\n");
71 	fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
72 	fprintf(cfp, "#include <sal/types.h>\n");
73 	fprintf(cfp, "#include <textconversion.hxx>\n");
74     fprintf(cfp, "\nextern \"C\" {\n");
75 
76     if (strcmp(argv[1], "hhc_char") == 0)
77         make_hhc_char(sfp, cfp);
78     else if (strcmp(argv[1], "stc_char") == 0)
79         make_stc_char(sfp, cfp);
80     else if (strcmp(argv[1], "stc_word") == 0)
81         make_stc_word(sfp, cfp);
82 
83 	fprintf (cfp, "}\n");
84 
85 	fclose(sfp);
86 	fclose(cfp);
87 
88 	return 0;
89 } // end of main
90 
91 // Hangul/Hanja character conversion
92 void make_hhc_char(FILE *sfp, FILE *cfp)
93 {
94 	sal_Int32 count, address, i, j, k;
95 	sal_Unicode Hanja2HangulData[0x10000];
96 	for (i = 0; i < 0x10000; i++) {
97 	    Hanja2HangulData[i] = 0;
98 	}
99     sal_uInt16 Hangul2HanjaData[10000][3];
100 
101 	// generate main dict. data array
102 	fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
103 
104 	sal_Char Cstr[1024];
105 	count = 0;
106     address = 0;
107 	while (fgets(Cstr, 1024, sfp)) {
108 	    // input file is in UTF-8 encoding (Hangul:Hanja)
109 	    // don't convert last new line character to Ostr.
110 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
111 	    const sal_Unicode *Ustr = Ostr.getStr();
112         sal_Int32  len = Ostr.getLength();
113 
114         Hangul2HanjaData[count][0] = Ustr[0];
115         Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
116         Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
117         count++;
118 
119         for (i = 2; i < len; i++) {
120             Hanja2HangulData[Ustr[i]] = Ustr[0];
121             if (address++ % 16 == 0)
122                 fprintf(cfp, "\n\t");
123             fprintf(cfp, "0x%04x, ", Ustr[i]);
124         }
125 	}
126 	fprintf(cfp, "\n};\n");
127 
128 	fprintf(cfp, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n");
129     for (i = 0; i < count; i++)
130         fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
131                         Hangul2HanjaData[i][0],
132                         Hangul2HanjaData[i][1],
133                         Hangul2HanjaData[i][2]);
134 	fprintf(cfp, "};\n");
135 
136 	fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
137 
138     address=0;
139 	for (i = 0; i < 0x10; i++) {
140         fprintf(cfp, "\n\t");
141         for (j = 0; j < 0x10; j++) {
142             for (k = 0; k < 0x100; k++) {
143                 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
144                     break;
145             }
146             fprintf(
147                 cfp, "0x%04lx, ",
148                 sal::static_int_cast< unsigned long >(
149                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
150         }
151 	}
152 	fprintf(cfp, "\n};\n");
153 
154 	fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
155 
156 	for (i = 0; i < 0x100; i++) {
157         for (j = 0; j < 0x100; j++) {
158             if (Hanja2HangulData[i*0x100+j] != 0)
159                 break;
160         }
161         if (j < 0x100) {
162             for (j = 0; j < 0x10; j++) {
163                 fprintf(cfp, "\n\t");
164                 for (k = 0; k < 0x10; k++) {
165 					sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
166                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
167                 }
168             }
169         }
170 	}
171 	fprintf(cfp, "\n};\n");
172 
173 	// create function to return arrays
174 	fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
175 	fprintf (cfp, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
176 	fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n");
177 	fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
178 	fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
179 }
180 
181 // Simplified/Traditional Chinese character conversion
182 void make_stc_char(FILE *sfp, FILE *cfp)
183 {
184 	sal_Int32 address, i, j, k;
185 	sal_Unicode SChinese2TChineseData[0x10000];
186 	sal_Unicode SChinese2VChineseData[0x10000];
187 	sal_Unicode TChinese2SChineseData[0x10000];
188 	for (i = 0; i < 0x10000; i++) {
189 	    SChinese2TChineseData[i] = 0;
190 	    SChinese2VChineseData[i] = 0;
191 	    TChinese2SChineseData[i] = 0;
192 	}
193 
194 	sal_Char Cstr[1024];
195 	while (fgets(Cstr, 1024, sfp)) {
196 	    // input file is in UTF-8 encoding (SChinese:TChinese)
197 	    // don't convert last new line character to Ostr.
198 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
199 	    const sal_Unicode *Ustr = Ostr.getStr();
200         sal_Int32  len = Ostr.getLength();
201         if (Ustr[1] == sal_Unicode('v'))
202             SChinese2VChineseData[Ustr[0]] = Ustr[2];
203         else {
204             SChinese2TChineseData[Ustr[0]] = Ustr[2];
205             if (SChinese2VChineseData[Ustr[0]] == 0)
206                 SChinese2VChineseData[Ustr[0]] = Ustr[2];
207         }
208         for (i = 2; i < len; i++)
209             TChinese2SChineseData[Ustr[i]] = Ustr[0];
210     }
211 
212 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
213 
214     address=0;
215 	for (i = 0; i < 0x10; i++) {
216         fprintf(cfp, "\n\t");
217         for (j = 0; j < 0x10; j++) {
218             for (k = 0; k < 0x100; k++) {
219                 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
220                     break;
221             }
222             fprintf(
223                 cfp, "0x%04lx, ",
224                 sal::static_int_cast< unsigned long >(
225                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
226         }
227 	}
228 	fprintf(cfp, "\n};\n");
229 
230 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
231 
232 	for (i = 0; i < 0x100; i++) {
233         for (j = 0; j < 0x100; j++) {
234             if (SChinese2TChineseData[i*0x100+j] != 0)
235                 break;
236         }
237         if (j < 0x100) {
238             for (j = 0; j < 0x10; j++) {
239                 fprintf(cfp, "\n\t");
240                 for (k = 0; k < 0x10; k++) {
241 					sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
242                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
243                 }
244             }
245         }
246 	}
247 	fprintf(cfp, "\n};\n");
248 
249 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
250 
251     address=0;
252 	for (i = 0; i < 0x10; i++) {
253         fprintf(cfp, "\n\t");
254         for (j = 0; j < 0x10; j++) {
255             for (k = 0; k < 0x100; k++) {
256                 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
257                     break;
258             }
259             fprintf(
260                 cfp, "0x%04lx, ",
261                 sal::static_int_cast< unsigned long >(
262                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
263         }
264 	}
265 	fprintf(cfp, "\n};\n");
266 
267 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
268 
269 	for (i = 0; i < 0x100; i++) {
270         for (j = 0; j < 0x100; j++) {
271             if (SChinese2VChineseData[i*0x100+j] != 0)
272                 break;
273         }
274         if (j < 0x100) {
275             for (j = 0; j < 0x10; j++) {
276                 fprintf(cfp, "\n\t");
277                 for (k = 0; k < 0x10; k++) {
278 					sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
279                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
280                 }
281             }
282         }
283 	}
284 	fprintf(cfp, "\n};\n");
285 
286 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
287 
288     address=0;
289 	for (i = 0; i < 0x10; i++) {
290         fprintf(cfp, "\n\t");
291         for (j = 0; j < 0x10; j++) {
292             for (k = 0; k < 0x100; k++) {
293                 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
294                     break;
295             }
296             fprintf(
297                 cfp, "0x%04lx, ",
298                 sal::static_int_cast< unsigned long >(
299                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
300         }
301 	}
302 	fprintf(cfp, "\n};\n");
303 
304 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
305 
306 	for (i = 0; i < 0x100; i++) {
307         for (j = 0; j < 0x100; j++) {
308             if (TChinese2SChineseData[i*0x100+j] != 0)
309                 break;
310         }
311         if (j < 0x100) {
312             for (j = 0; j < 0x10; j++) {
313                 fprintf(cfp, "\n\t");
314                 for (k = 0; k < 0x10; k++) {
315 					sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
316                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
317                 }
318             }
319         }
320 	}
321 	fprintf(cfp, "\n};\n");
322 
323 	// create function to return arrays
324 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
325 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
326 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
327 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
328 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
329 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
330 }
331 
332 
333 typedef struct {
334     sal_uInt16 address;
335     sal_Int32 len;
336     sal_Unicode *data;
337 } Index;
338 
339 extern "C" {
340 int Index_comp(const void* s1, const void* s2)
341 {
342     Index *p1 = (Index*)s1, *p2 = (Index*)s2;
343     int result = p1->len - p2->len;
344     for (int i = 0; result == 0 && i < p1->len; i++)
345         result = *(p1->data+i) - *(p2->data+i);
346     return result;
347 }
348 }
349 
350 // Simplified/Traditional Chinese word conversion
351 void make_stc_word(FILE *sfp, FILE *cfp)
352 {
353 	sal_Int32 count, i, length;
354     sal_Unicode STC_WordData[0x10000];
355     std::vector<Index> STC_WordEntry_S2T(0x10000);
356     std::vector<Index> STC_WordEntry_T2S(0x10000);
357     sal_Int32 count_S2T = 0, count_T2S = 0;
358     sal_Int32 line = 0, char_total = 0;
359 	sal_Char Cstr[1024];
360 
361 	while (fgets(Cstr, 1024, sfp)) {
362 	    // input file is in UTF-8 encoding (SChinese:TChinese)
363 	    // don't convert last new line character to Ostr.
364 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
365         sal_Int32  len = Ostr.getLength();
366         if (char_total + len + 1 > 0xFFFF) {
367             fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line));
368             return;
369         }
370         sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
371         if (((sep = eq = Ostr.indexOf(sal_Unicode('='))) > 0) ||
372             ((sep = gt = Ostr.indexOf(sal_Unicode('>'))) > 0) ||
373             ((sep = lt = Ostr.indexOf(sal_Unicode('<'))) > 0)) {
374 
375             if (eq > 0 || gt > 0) {
376                 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
377                 STC_WordEntry_S2T[count_S2T].len = sep;
378                 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
379             }
380             if (eq > 0 || lt > 0) {
381                 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
382                 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
383                 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
384             }
385             for (i = 0; i < len; i++)
386                 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
387             STC_WordData[char_total++] = 0;
388         } else {
389             fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line));
390             return;
391         }
392         line++;
393     }
394 
395     if (char_total > 0) {
396         fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
397         for (i = 0; i < char_total; i++) {
398             if (i % 32 == 0) fprintf(cfp, "\n\t");
399             fprintf(cfp, "0x%04x, ", STC_WordData[i]);
400         }
401         fprintf(cfp, "\n};\n");
402 
403         fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total));
404 
405         // create function to return arrays
406         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
407     } else {
408         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
409     }
410 
411     sal_uInt16 STC_WordIndex[0x100];
412 
413     if (count_S2T > 0) {
414         qsort(&STC_WordEntry_S2T[0], count_S2T, sizeof(Index), Index_comp);
415 
416         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
417         count = 0;
418         length = 0;
419         for (i = 0; i < count_S2T; i++) {
420             if (i % 32 == 0) fprintf(cfp, "\n\t");
421             fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
422             if (STC_WordEntry_S2T[i].len != length) {
423                 length = STC_WordEntry_S2T[i].len;
424                 while (count <= length)
425                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
426             }
427         }
428         fprintf(cfp, "\n};\n");
429         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
430 
431         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
432         for (i = 0; i < count; i++) {
433             if (i % 16 == 0) fprintf(cfp, "\n\t");
434             fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
435         }
436         fprintf(cfp, "\n};\n");
437 
438         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length));
439         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
440         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
441     } else {
442         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
443         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
444     }
445 
446     if (count_T2S > 0) {
447         qsort(&STC_WordEntry_T2S[0], count_T2S, sizeof(Index), Index_comp);
448 
449         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
450         count = 0;
451         length = 0;
452         for (i = 0; i < count_T2S; i++) {
453             if (i % 32 == 0) fprintf(cfp, "\n\t");
454             fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
455             if (STC_WordEntry_T2S[i].len != length) {
456                 length = STC_WordEntry_T2S[i].len;
457                 while (count <= length)
458                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
459             }
460         }
461         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
462         fprintf(cfp, "\n};\n");
463 
464         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
465         for (i = 0; i < count; i++) {
466             if (i % 16 == 0) fprintf(cfp, "\n\t");
467             fprintf(cfp, "0x%04x, ",  STC_WordIndex[i]);
468         }
469         fprintf(cfp, "\n};\n");
470 
471         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length));
472         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
473         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
474     } else {
475         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
476         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
477     }
478 }
479 
480