xref: /trunk/main/sal/textenc/generate/gb180302000.pl (revision cdf0e10c)
1#!/usr/bin/perl
2#*************************************************************************
3#
4# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5#
6# Copyright 2000, 2010 Oracle and/or its affiliates.
7#
8# OpenOffice.org - a multi-platform office productivity suite
9#
10# This file is part of OpenOffice.org.
11#
12# OpenOffice.org is free software: you can redistribute it and/or modify
13# it under the terms of the GNU Lesser General Public License version 3
14# only, as published by the Free Software Foundation.
15#
16# OpenOffice.org is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU Lesser General Public License version 3 for more details
20# (a copy is included in the LICENSE file that accompanied this code).
21#
22# You should have received a copy of the GNU Lesser General Public License
23# version 3 along with OpenOffice.org.  If not, see
24# <http://www.openoffice.org/license.html>
25# for a copy of the LGPLv3 License.
26#
27#*************************************************************************
28
29# The following files must be available in a ./input subdir:
30
31# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
32# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
33#  "modified version="3" date="2001-02-21""
34
35$id = "Gb180302000";
36
# Format a Unicode code point for diagnostics as "U+" followed by at least
# four upper-case hexadecimal digits (e.g. 0x4E00 -> "U+4E00").
#
# Argument: the code point as a number.
# Returns:  the formatted string.
sub printUtf32
{
    my ($code_point) = @_;
    return sprintf("U+%04X", $code_point);
}
42
# Format a GB 18030 byte sequence for diagnostics as upper-case hexadecimal,
# two digits per byte and no separators.
#
# Arguments: one, two or four byte values.  The form is chosen by which
#            arguments are defined: a defined third argument selects the
#            four-byte form, otherwise a defined second argument selects the
#            two-byte form, otherwise only the first byte is printed.
# Returns:   the formatted string.
sub printGb
{
    my ($byte1, $byte2, $byte3, $byte4) = @_;

    return sprintf("%02X%02X%02X%02X", $byte1, $byte2, $byte3, $byte4)
        if defined($byte3);
    return sprintf("%02X%02X", $byte1, $byte2)
        if defined($byte2);
    return sprintf("%02X", $byte1);
}
58
# Bookkeeping shared by the parsing and output stages; everything starts at 0.
#   $gb_map_2_count   number of entries stored in @gb_map_2
#   $gb_map_4_count   number of individually listed four-byte mappings
#   $gb_map_4_ranges  number of four-byte codes covered by recorded ranges
#   $gb_map_4_max     highest four-byte linear index seen so far
#   $uni_map_count    number of entries stored in @uni_map
#   $range_count      number of recorded ranges
($gb_map_2_count, $gb_map_4_count, $gb_map_4_ranges, $gb_map_4_max,
 $uni_map_count, $range_count) = (0) x 6;
66
# Convert the four bytes of a GB 18030 four-byte sequence into a zero-based
# linear index.  Bytes 1 and 3 are counted from 0x81 and bytes 2 and 4 from
# 0x30; with the bounds 81 30 81 30 .. FE 39 FE 39 used by the input file
# that gives the weights 12600, 1260, 10 and 1.
#
# Arguments: the four byte values.
# Returns:   the linear index.
sub gb4ToLinear
{
    my ($byte1, $byte2, $byte3, $byte4) = @_;
    return ($byte1 - 0x81) * 12600
               + ($byte2 - 0x30) * 1260
               + ($byte3 - 0x81) * 10
               + ($byte4 - 0x30);
}

# Read input/gb-18030-2000.xml line by line and fill the global tables:
#   @gb_map_2   two-byte GB code (linearised)  -> Unicode
#   @gb_map_4   four-byte GB code (linearised) -> Unicode (single mappings)
#   @uni_map    Unicode -> packed GB bytes
#   @range_*    one entry per recorded <range> element
# Any <a> or <range> element that does not match one of the expected shapes
# aborts the run with "Bad format".
{
    $filename = "gb-18030-2000.xml";
    open(my $xml, '<', "input/" . $filename)
        or die "Cannot read " . $filename . ": " . $!;
    while (<$xml>)
    {
        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte 00..7F: must map to the identical code point and
            # is not stored anywhere.
            my ($utf32, $gb1) = (hex($1), hex($2));
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            # Two-byte mapping.  190 trail-byte slots per lead byte:
            # 0x40..0x7E map to 0..62, 0x80 and up continue at 63.
            my ($utf32, $gb1, $gb2) = (hex($1), hex($2), hex($3));
            my $gb_code = ($gb1 - 0x81) * 190
                              + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            die "Redefined " . printGb($gb1, $gb2)
                if defined($gb_map_2[$gb_code]);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            die "Double Unicode mapping" if defined($uni_map[$utf32]);
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            # Individually listed four-byte mapping.
            my ($utf32, $gb1, $gb2, $gb3, $gb4)
                = (hex($1), hex($2), hex($3), hex($4), hex($5));
            my $gb_code = gb4ToLinear($gb1, $gb2, $gb3, $gb4);
            die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4)
                if defined($gb_map_4[$gb_code]);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            die "Double Unicode mapping" if defined($uni_map[$utf32]);
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif (/<a /)
        {
            die "Bad format";
        }
        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            # Contiguous run of four-byte codes mapping to a contiguous run
            # of code points.
            my ($utf32_first, $utf32_last) = (hex($1), hex($2));
            my @bytes = map { hex($_) } ($3, $4, $5, $6, $7, $8, $9, $10);
            my $linear_first = gb4ToLinear(@bytes[0 .. 3]);
            my $linear_last = gb4ToLinear(@bytes[4 .. 7]);
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";

            # The one range 90 30 81 30 .. E3 32 9A 35 (linear 189000 ..
            # 1237575) is deliberately not recorded.
            # NOTE(review): presumably the supplementary-plane range that the
            # converter handles algorithmically -- confirm.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched to U+DFFF so that the
                # later scan over U+0080..U+FFFF steps over the surrogates.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif (/<range /)
        {
            die "Bad format";
        }
    }
    close $xml;
}
165
# Report what was read and verify it against the expected totals before any
# output is produced.
print "gb_map_2_count = " . $gb_map_2_count
      . ", gb_map_4_count = " . $gb_map_4_count
      . ", gb_map_4_ranges = " . $gb_map_4_ranges
      . ", gb_map_4_max = " . $gb_map_4_max
      . ", uni_map_count = " . $uni_map_count
      . "\n";

# The two-byte table must be completely filled.
die "Bad gb_map_2_count != 23940"
    unless $gb_map_2_count == 23940;

# Singles plus ranges must cover the linear indices 0 .. $gb_map_4_max
# without gaps, i.e. the highest index is one less than their total.
die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
    unless $gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1;

# Every code point of U+0080..U+FFFF outside U+D800..U+DFFF must be mapped,
# either individually or through a range.
die "Bad uni_map_count"
    unless $uni_map_count + $gb_map_4_ranges
               == 0x10000 - (0xE000 - 0xD800) - 0x80;
176
# Append the individually listed four-byte mappings to @gb_map_2 (directly
# after the two-byte entries) in ascending linear order, skipping over the
# recorded ranges.  For every range, and once for the position before the
# first range, $gb_nonrangedataindex[] records the @gb_map_2 index at which
# the singles following that point start.
#
# Reads:   @gb_map_4, @range_linear_first, @range_linear_last, $range_count,
#          $gb_map_4_count, $gb_map_4_ranges
# Updates: @gb_map_2, $gb_map_2_count, @gb_nonrangedataindex
# Dies with "Bad input" if a gap in @gb_map_4 is not the start of the next
# recorded range, or if not every range was encountered.
sub appendGb4Singles
{
    my $range_slot = 0;
    $gb_nonrangedataindex[$range_slot] = $gb_map_2_count;

    # Number of linear codes to visit: 0 .. $gb_map_4_max inclusive, which
    # the sanity check after parsing guarantees equals singles + ranges.
    # The previous bound "< $gb_map_4_max" stopped one short and would have
    # dropped a single mapping sitting at the very last linear index.
    my $linear_end = $gb_map_4_count + $gb_map_4_ranges;

    for (my $linear = 0; $linear < $linear_end; ++$linear)
    {
        if (defined($gb_map_4[$linear]))
        {
            $gb_map_2[$gb_map_2_count++] = $gb_map_4[$linear];
        }
        else
        {
            ($linear == $range_linear_first[$range_slot]) or die "Bad input";
            # Jump to the end of the range; the loop increment then moves on
            # to the first code after it.
            $linear = $range_linear_last[$range_slot];
            ++$range_slot;
            $gb_nonrangedataindex[$range_slot] = $gb_map_2_count;
        }
    }
    ($range_slot == $range_count) or die "Bad input";
}

appendGb4Singles();
194
# Create the output table file (<lower-cased id>.tab); the handle OUT stays
# open for all following output stages.
$filename = lc($id) . ".tab";
open(OUT, '>', $filename) or die "Cannot write " . $filename . ": " . $!;

# Copy the leading comment block of the generator script (<lower-cased
# id>.pl in the current directory) into the output as a C comment.  Copying
# stops at the first line that does not start with '#'.
{
    $filename = lc($id) . ".pl";
    open(my $script, '<', $filename)
        or die "Cannot read " . $filename . ": " . $!;
    my $first = 1;
    while (<$script>)
    {
        # The "#!" interpreter line is not part of the header.
        next if /^\#!.*$/;

        if (/^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                # Opening rule "#****" becomes "/****".
                print OUT "/", $1, "\n";
                $first = 0;
            }
            else
            {
                # Any later rule becomes " ***/": the last star is replaced
                # by the closing slash.
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif (/^\# ?(.*)$/)
        {
            # Ordinary comment line: "#" plus at most one following space is
            # replaced by " *".
            print OUT " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close $script;
}
234
# Emit the guarded #include directives of the generated table, preceded and
# followed by one blank line.
foreach my $line ("",
                  "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H",
                  "#include \"convertgb18030.h\"",
                  "#endif",
                  "",
                  "#ifndef _SAL_TYPES_H_",
                  "#include \"sal/types.h\"",
                  "#endif",
                  "")
{
    print OUT $line . "\n";
}
244
# Emit the GB -> Unicode data array: all $gb_map_2_count entries of
# @gb_map_2 as 0x%04X literals, eight per line.
print OUT "static sal_Unicode const aImpl" . $id . "ToUnicodeData[] = {\n  ";
foreach my $slot (0 .. $gb_map_2_count - 1)
{
    # Start a new row before every eighth entry except the very first.
    print OUT "\n  " if ($slot > 0 && $slot % 8 == 0);
    printf OUT "0x%04X,", $gb_map_2[$slot];
}
print OUT "\n};\n\n";
255
# Emit the GB -> Unicode range table.  Each row holds: index into the data
# array where the singles preceding the range end, first linear code of the
# range, one past its last linear code, and the first code point.  The table
# is closed by a { -1, 0, 0, 0 } row.
print OUT "static ImplGb180302000ToUnicodeRange const\n    aImpl"
          . $id
          . "ToUnicodeRanges[] = {\n";
foreach my $slot (0 .. $range_count - 1)
{
    my @row = ($gb_nonrangedataindex[$slot],
               $range_linear_first[$slot],
               $range_linear_last[$slot] + 1,
               $range_uni_first[$slot]);
    printf OUT "  { %d, %d, %d, 0x%04X },\n", @row;
}
print OUT "  { -1, 0, 0, 0 }\n};\n\n";
268
# Emit the Unicode -> GB data array.  Code points U+0080..U+FFFF are visited
# in order; individually mapped ones are written six per line (two-byte
# codes as 0x%04X padded to the width of a four-byte 0x%08X), recorded
# ranges are skipped.  $uni_nonrangedataindex[] records, once up front and
# once after every skipped range, how many entries had been written by then.
# Dies with "Bad input" if an unmapped code point is not the start of the
# next recorded range, or if not every range was encountered.
print OUT "static sal_uInt32 const aImplUnicodeTo" . $id . "Data[] = {\n  ";
my $emitted = 0;
my $range_slot = 0;
$uni_nonrangedataindex[$range_slot] = $emitted;
my $code_point = 0x80;
while ($code_point <= 0xFFFF)
{
    if (!defined($uni_map[$code_point]))
    {
        ($code_point == $range_uni_first[$range_slot]) or die "Bad input";
        # Continue with the first code point after the range.
        $code_point = $range_uni_last[$range_slot] + 1;
        ++$range_slot;
        $uni_nonrangedataindex[$range_slot] = $emitted;
        next;
    }

    # Start a new row before every sixth entry except the very first.
    print OUT "\n  " if ($emitted > 0 && $emitted % 6 == 0);

    my $bytes = $uni_map[$code_point];
    my $format = "0x%08X,";
    $format = "    0x%04X," if ($bytes <= 0xFFFF);
    printf OUT $format, $bytes;

    ++$emitted;
    ++$code_point;
}
($range_slot == $range_count) or die "Bad input";
print OUT "\n};\n\n";
295
# Emit the Unicode -> GB range table.  Each row holds: number of data-array
# entries written before the range, first and last code point of the range,
# and the linear four-byte code of its first code point.
print OUT "static ImplUnicodeToGb180302000Range const\n    aImplUnicodeTo"
          . $id
          . "Ranges[] = {\n";
foreach my $slot (0 .. $range_count - 1)
{
    my @row = ($uni_nonrangedataindex[$slot],
               $range_uni_first[$slot],
               $range_uni_last[$slot],
               $range_linear_first[$slot]);
    printf OUT "  { %d, 0x%04X, 0x%04X, %d },\n", @row;
}
print OUT "};\n";
308
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failed close means the generated table is incomplete.
close(OUT) or die "Cannot write " . lc($id) . ".tab: " . $!;
310