1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <sal/main.h>
35 #include <sal/types.h>
36 #include <rtl/strbuf.hxx>
37 #include <rtl/ustring.hxx>
38 
39 #include <vector>
40 
41 using namespace ::rtl;
42 
// Table generators, selected in main by the first command line argument.
// Each one reads a UTF-8 dictionary from sfp and writes generated C++ source
// (table definitions plus accessor functions) to cfp.
void make_hhc_char(FILE *sfp, FILE *cfp);   // Hangul/Hanja character conversion
void make_stc_char(FILE *sfp, FILE *cfp);   // Simplified/Traditional Chinese character conversion
void make_stc_word(FILE *sfp, FILE *cfp);   // Simplified/Traditional Chinese word conversion
46 
47 /* Main Procedure */
48 
49 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
50 {
51 	FILE *sfp, *cfp;
52 
53 	if (argc < 4) exit(-1);
54 
55 
56 	sfp = fopen(argv[2], "rb");	// open the source file for read;
57 	if (sfp == NULL)
58     {
59 	    printf("Open the dictionary source file failed.");
60         return -1;
61     }
62 
63 	// create the C source file to write
64 	cfp = fopen(argv[3], "wb");
65 	if (cfp == NULL) {
66 	    fclose(sfp);
67 	    printf("Can't create the C source file.");
68         return -1;
69 	}
70 
71 	fprintf(cfp, "/*\n");
72 	fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
73 	fprintf(cfp, " * All Rights Reserved.\n");
74 	fprintf(cfp, " */\n\n");
75 	fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
76 	fprintf(cfp, "#include <sal/types.h>\n");
77 	fprintf(cfp, "#include <textconversion.hxx>\n");
78     fprintf(cfp, "\nextern \"C\" {\n");
79 
80     if (strcmp(argv[1], "hhc_char") == 0)
81         make_hhc_char(sfp, cfp);
82     else if (strcmp(argv[1], "stc_char") == 0)
83         make_stc_char(sfp, cfp);
84     else if (strcmp(argv[1], "stc_word") == 0)
85         make_stc_word(sfp, cfp);
86 
87 	fprintf (cfp, "}\n");
88 
89 	fclose(sfp);
90 	fclose(cfp);
91 
92 	return 0;
93 } // end of main
94 
95 // Hangul/Hanja character conversion
96 void make_hhc_char(FILE *sfp, FILE *cfp)
97 {
98 	sal_Int32 count, address, i, j, k;
99 	sal_Unicode Hanja2HangulData[0x10000];
100 	for (i = 0; i < 0x10000; i++) {
101 	    Hanja2HangulData[i] = 0;
102 	}
103     sal_uInt16 Hangul2HanjaData[10000][3];
104 
105 	// generate main dict. data array
106 	fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
107 
108 	sal_Char Cstr[1024];
109 	count = 0;
110     address = 0;
111 	while (fgets(Cstr, 1024, sfp)) {
112 	    // input file is in UTF-8 encoding (Hangul:Hanja)
113 	    // don't convert last new line character to Ostr.
114 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
115 	    const sal_Unicode *Ustr = Ostr.getStr();
116         sal_Int32  len = Ostr.getLength();
117 
118         Hangul2HanjaData[count][0] = Ustr[0];
119         Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
120         Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
121         count++;
122 
123         for (i = 2; i < len; i++) {
124             Hanja2HangulData[Ustr[i]] = Ustr[0];
125             if (address++ % 16 == 0)
126                 fprintf(cfp, "\n\t");
127             fprintf(cfp, "0x%04x, ", Ustr[i]);
128         }
129 	}
130 	fprintf(cfp, "\n};\n");
131 
132 	fprintf(cfp, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n");
133     for (i = 0; i < count; i++)
134         fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
135                         Hangul2HanjaData[i][0],
136                         Hangul2HanjaData[i][1],
137                         Hangul2HanjaData[i][2]);
138 	fprintf(cfp, "};\n");
139 
140 	fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
141 
142     address=0;
143 	for (i = 0; i < 0x10; i++) {
144         fprintf(cfp, "\n\t");
145         for (j = 0; j < 0x10; j++) {
146             for (k = 0; k < 0x100; k++) {
147                 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
148                     break;
149             }
150             fprintf(
151                 cfp, "0x%04lx, ",
152                 sal::static_int_cast< unsigned long >(
153                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
154         }
155 	}
156 	fprintf(cfp, "\n};\n");
157 
158 	fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
159 
160 	for (i = 0; i < 0x100; i++) {
161         for (j = 0; j < 0x100; j++) {
162             if (Hanja2HangulData[i*0x100+j] != 0)
163                 break;
164         }
165         if (j < 0x100) {
166             for (j = 0; j < 0x10; j++) {
167                 fprintf(cfp, "\n\t");
168                 for (k = 0; k < 0x10; k++) {
169 					sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
170                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
171                 }
172             }
173         }
174 	}
175 	fprintf(cfp, "\n};\n");
176 
177 	// create function to return arrays
178 	fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
179 	fprintf (cfp, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
180 	fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n");
181 	fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
182 	fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
183 }
184 
185 // Simplified/Traditional Chinese character conversion
186 void make_stc_char(FILE *sfp, FILE *cfp)
187 {
188 	sal_Int32 address, i, j, k;
189 	sal_Unicode SChinese2TChineseData[0x10000];
190 	sal_Unicode SChinese2VChineseData[0x10000];
191 	sal_Unicode TChinese2SChineseData[0x10000];
192 	for (i = 0; i < 0x10000; i++) {
193 	    SChinese2TChineseData[i] = 0;
194 	    SChinese2VChineseData[i] = 0;
195 	    TChinese2SChineseData[i] = 0;
196 	}
197 
198 	sal_Char Cstr[1024];
199 	while (fgets(Cstr, 1024, sfp)) {
200 	    // input file is in UTF-8 encoding (SChinese:TChinese)
201 	    // don't convert last new line character to Ostr.
202 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
203 	    const sal_Unicode *Ustr = Ostr.getStr();
204         sal_Int32  len = Ostr.getLength();
205         if (Ustr[1] == sal_Unicode('v'))
206             SChinese2VChineseData[Ustr[0]] = Ustr[2];
207         else {
208             SChinese2TChineseData[Ustr[0]] = Ustr[2];
209             if (SChinese2VChineseData[Ustr[0]] == 0)
210                 SChinese2VChineseData[Ustr[0]] = Ustr[2];
211         }
212         for (i = 2; i < len; i++)
213             TChinese2SChineseData[Ustr[i]] = Ustr[0];
214     }
215 
216 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
217 
218     address=0;
219 	for (i = 0; i < 0x10; i++) {
220         fprintf(cfp, "\n\t");
221         for (j = 0; j < 0x10; j++) {
222             for (k = 0; k < 0x100; k++) {
223                 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
224                     break;
225             }
226             fprintf(
227                 cfp, "0x%04lx, ",
228                 sal::static_int_cast< unsigned long >(
229                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
230         }
231 	}
232 	fprintf(cfp, "\n};\n");
233 
234 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
235 
236 	for (i = 0; i < 0x100; i++) {
237         for (j = 0; j < 0x100; j++) {
238             if (SChinese2TChineseData[i*0x100+j] != 0)
239                 break;
240         }
241         if (j < 0x100) {
242             for (j = 0; j < 0x10; j++) {
243                 fprintf(cfp, "\n\t");
244                 for (k = 0; k < 0x10; k++) {
245 					sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
246                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
247                 }
248             }
249         }
250 	}
251 	fprintf(cfp, "\n};\n");
252 
253 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
254 
255     address=0;
256 	for (i = 0; i < 0x10; i++) {
257         fprintf(cfp, "\n\t");
258         for (j = 0; j < 0x10; j++) {
259             for (k = 0; k < 0x100; k++) {
260                 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
261                     break;
262             }
263             fprintf(
264                 cfp, "0x%04lx, ",
265                 sal::static_int_cast< unsigned long >(
266                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
267         }
268 	}
269 	fprintf(cfp, "\n};\n");
270 
271 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
272 
273 	for (i = 0; i < 0x100; i++) {
274         for (j = 0; j < 0x100; j++) {
275             if (SChinese2VChineseData[i*0x100+j] != 0)
276                 break;
277         }
278         if (j < 0x100) {
279             for (j = 0; j < 0x10; j++) {
280                 fprintf(cfp, "\n\t");
281                 for (k = 0; k < 0x10; k++) {
282 					sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
283                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
284                 }
285             }
286         }
287 	}
288 	fprintf(cfp, "\n};\n");
289 
290 	fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
291 
292     address=0;
293 	for (i = 0; i < 0x10; i++) {
294         fprintf(cfp, "\n\t");
295         for (j = 0; j < 0x10; j++) {
296             for (k = 0; k < 0x100; k++) {
297                 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
298                     break;
299             }
300             fprintf(
301                 cfp, "0x%04lx, ",
302                 sal::static_int_cast< unsigned long >(
303                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
304         }
305 	}
306 	fprintf(cfp, "\n};\n");
307 
308 	fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
309 
310 	for (i = 0; i < 0x100; i++) {
311         for (j = 0; j < 0x100; j++) {
312             if (TChinese2SChineseData[i*0x100+j] != 0)
313                 break;
314         }
315         if (j < 0x100) {
316             for (j = 0; j < 0x10; j++) {
317                 fprintf(cfp, "\n\t");
318                 for (k = 0; k < 0x10; k++) {
319 					sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
320                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
321                 }
322             }
323         }
324 	}
325 	fprintf(cfp, "\n};\n");
326 
327 	// create function to return arrays
328 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
329 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
330 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
331 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
332 	fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
333 	fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
334 }
335 
336 
337 typedef struct {
338     sal_uInt16 address;
339     sal_Int32 len;
340     sal_Unicode *data;
341 } Index;
342 
343 extern "C" {
344 int Index_comp(const void* s1, const void* s2)
345 {
346     Index *p1 = (Index*)s1, *p2 = (Index*)s2;
347     int result = p1->len - p2->len;
348     for (int i = 0; result == 0 && i < p1->len; i++)
349         result = *(p1->data+i) - *(p2->data+i);
350     return result;
351 }
352 }
353 
354 // Simplified/Traditional Chinese word conversion
355 void make_stc_word(FILE *sfp, FILE *cfp)
356 {
357 	sal_Int32 count, i, length;
358     sal_Unicode STC_WordData[0x10000];
359     std::vector<Index> STC_WordEntry_S2T(0x10000);
360     std::vector<Index> STC_WordEntry_T2S(0x10000);
361     sal_Int32 count_S2T = 0, count_T2S = 0;
362     sal_Int32 line = 0, char_total = 0;
363 	sal_Char Cstr[1024];
364 
365 	while (fgets(Cstr, 1024, sfp)) {
366 	    // input file is in UTF-8 encoding (SChinese:TChinese)
367 	    // don't convert last new line character to Ostr.
368 	    OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
369         sal_Int32  len = Ostr.getLength();
370         if (char_total + len + 1 > 0xFFFF) {
371             fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line));
372             return;
373         }
374         sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
375         if (((sep = eq = Ostr.indexOf(sal_Unicode('='))) > 0) ||
376             ((sep = gt = Ostr.indexOf(sal_Unicode('>'))) > 0) ||
377             ((sep = lt = Ostr.indexOf(sal_Unicode('<'))) > 0)) {
378 
379             if (eq > 0 || gt > 0) {
380                 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
381                 STC_WordEntry_S2T[count_S2T].len = sep;
382                 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
383             }
384             if (eq > 0 || lt > 0) {
385                 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
386                 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
387                 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
388             }
389             for (i = 0; i < len; i++)
390                 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
391             STC_WordData[char_total++] = 0;
392         } else {
393             fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line));
394             return;
395         }
396         line++;
397     }
398 
399     if (char_total > 0) {
400         fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
401         for (i = 0; i < char_total; i++) {
402             if (i % 32 == 0) fprintf(cfp, "\n\t");
403             fprintf(cfp, "0x%04x, ", STC_WordData[i]);
404         }
405         fprintf(cfp, "\n};\n");
406 
407         fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total));
408 
409         // create function to return arrays
410         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
411     } else {
412         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
413     }
414 
415     sal_uInt16 STC_WordIndex[0x100];
416 
417     if (count_S2T > 0) {
418         qsort(&STC_WordEntry_S2T[0], count_S2T, sizeof(Index), Index_comp);
419 
420         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
421         count = 0;
422         length = 0;
423         for (i = 0; i < count_S2T; i++) {
424             if (i % 32 == 0) fprintf(cfp, "\n\t");
425             fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
426             if (STC_WordEntry_S2T[i].len != length) {
427                 length = STC_WordEntry_S2T[i].len;
428                 while (count <= length)
429                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
430             }
431         }
432         fprintf(cfp, "\n};\n");
433         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
434 
435         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
436         for (i = 0; i < count; i++) {
437             if (i % 16 == 0) fprintf(cfp, "\n\t");
438             fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
439         }
440         fprintf(cfp, "\n};\n");
441 
442         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length));
443         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
444         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
445     } else {
446         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
447         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
448     }
449 
450     if (count_T2S > 0) {
451         qsort(&STC_WordEntry_T2S[0], count_T2S, sizeof(Index), Index_comp);
452 
453         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
454         count = 0;
455         length = 0;
456         for (i = 0; i < count_T2S; i++) {
457             if (i % 32 == 0) fprintf(cfp, "\n\t");
458             fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
459             if (STC_WordEntry_T2S[i].len != length) {
460                 length = STC_WordEntry_T2S[i].len;
461                 while (count <= length)
462                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
463             }
464         }
465         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
466         fprintf(cfp, "\n};\n");
467 
468         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
469         for (i = 0; i < count; i++) {
470             if (i % 16 == 0) fprintf(cfp, "\n\t");
471             fprintf(cfp, "0x%04x, ",  STC_WordIndex[i]);
472         }
473         fprintf(cfp, "\n};\n");
474 
475         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length));
476         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
477         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
478     } else {
479         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
480         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
481     }
482 }
483 
484