xref: /trunk/main/sal/textenc/generate/gb180302000.pl (revision c667dd47)
1#!/usr/bin/env perl
2#**************************************************************
3#
4#  Licensed to the Apache Software Foundation (ASF) under one
5#  or more contributor license agreements.  See the NOTICE file
6#  distributed with this work for additional information
7#  regarding copyright ownership.  The ASF licenses this file
8#  to you under the Apache License, Version 2.0 (the
9#  "License"); you may not use this file except in compliance
10#  with the License.  You may obtain a copy of the License at
11#
12#    http://www.apache.org/licenses/LICENSE-2.0
13#
14#  Unless required by applicable law or agreed to in writing,
15#  software distributed under the License is distributed on an
16#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17#  KIND, either express or implied.  See the License for the
18#  specific language governing permissions and limitations
19#  under the License.
20#
21#**************************************************************
22
23
24
25# The following files must be available in a ./input subdir:
26
27# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
28# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
29#  "modified version="3" date="2001-02-21""
30
# Base identifier of this encoding: lower-cased it names the generated .tab
# file, and verbatim it is spliced into the names of the emitted C tables.
$id = 'Gb180302000';
32
# Format a code point for diagnostics: "U+" followed by the value in
# upper-case hexadecimal, zero-padded to at least four digits.
sub printUtf32
{
    my ($code_point) = @_;
    return 'U+' . sprintf('%04X', $code_point);
}
38
# Format a GB 18030 byte sequence for diagnostics as upper-case hex, two
# digits per byte and no separators.  The length is inferred from which
# arguments are defined: a defined third argument selects the four-byte
# form, otherwise a defined second argument selects the two-byte form,
# otherwise only the first byte is printed.
sub printGb
{
    my $byte_count = defined($_[2]) ? 4
                   : defined($_[1]) ? 2
                   :                  1;
    return sprintf('%02X' x $byte_count, @_[0 .. $byte_count - 1]);
}
54
# Totals accumulated while the mapping file is read; every one starts at 0.
(
    $gb_map_2_count,    # two-byte codes given an individual mapping
    $gb_map_4_count,    # four-byte codes given an individual mapping
    $gb_map_4_ranges,   # four-byte codes covered by the recorded ranges
    $gb_map_4_max,      # highest four-byte linear index seen so far
    $uni_map_count,     # code points given an individual (non-range) mapping
    $range_count,       # number of ranges recorded
) = (0) x 6;
62
# Read input/gb-18030-2000.xml line by line and fill the global tables:
#   @gb_map_2   two-byte linear index  -> code point
#   @gb_map_4   four-byte linear index -> code point (individual mappings)
#   @uni_map    code point -> GB bytes packed big-endian into one integer
#   @range_*    first/last code point and first/last four-byte linear index
#               of every recorded range
# together with the counters $gb_map_2_count, $gb_map_4_count,
# $gb_map_4_ranges, $gb_map_4_max, $uni_map_count and $range_count.
# Dies on an <a .../> or <range .../> line that matches none of the expected
# shapes, on a redefined GB code, on a code point mapped twice, and on a
# range whose Unicode and GB extents differ in length.
{
    my $filename = "gb-18030-2000.xml";
    # Three-argument open with a lexical handle; report the OS error too.
    open(my $in, '<', "input/" . $filename)
        or die "Cannot read " . $filename . ": $!";

    # Linear index of a four-byte code b1 b2 b3 b4.  Only the offsets from
    # 0x81 / 0x30 and the weights 12600 / 1260 / 10 / 1 are used; the line
    # patterns below restrict b2 and b4 to 0x30..0x39.
    my $linear4 = sub {
        my ($b1, $b2, $b3, $b4) = @_;
        return ($b1 - 0x81) * 12600
             + ($b2 - 0x30) * 1260
             + ($b3 - 0x81) * 10
             + ($b4 - 0x30);
    };

    while (my $line = <$in>)
    {
        if ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte 0x00..0x7F: nothing is stored, the entry only has
            # to map the byte to the identical code point.
            my ($utf32, $gb1) = (hex($1), hex($2));
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2) = (hex($1), hex($2), hex($3));
            # 190 slots per lead byte: trail bytes 0x40..0x7E take slots
            # 0..62, trail bytes from 0x80 upwards continue at slot 63.
            my $gb_code = ($gb1 - 0x81) * 190
                + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2, $gb3, $gb4)
                = map { hex($_) } ($1, $2, $3, $4, $5);
            my $gb_code = $linear4->($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif ($line =~ /<a /)
        {
            # An <a ...> element that fits none of the three shapes above.
            die "Bad format";
        }
        elsif ($line =~ /^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my @field = map { hex($_) } ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);
            my ($utf32_first, $utf32_last) = @field[0, 1];
            my $linear_first = $linear4->(@field[2 .. 5]);
            my $linear_last  = $linear4->(@field[6 .. 9]);

            # Both sides of a range must span the same number of codes.
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";

            # The one range running from linear index 189000 (bytes
            # 90 30 81 30) to 1237575 (bytes E3 32 9A 35) is not recorded;
            # presumably it is the supplementary-plane range that the
            # consumer of the tables handles separately -- TODO confirm.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched to U+DFFF so that
                # the later walk over code points jumps across
                # U+D800..U+DFFF together with the range.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif ($line =~ /<range /)
        {
            # A <range ...> element that does not fit the shape above.
            die "Bad format";
        }
    }
    close $in;
}
161
# Report the totals gathered from the input on standard output ...
print "gb_map_2_count = $gb_map_2_count"
    . ", gb_map_4_count = $gb_map_4_count"
    . ", gb_map_4_ranges = $gb_map_4_ranges"
    . ", gb_map_4_max = $gb_map_4_max"
    . ", uni_map_count = $uni_map_count\n";

# ... and abort unless they are consistent.
# Exactly 23940 two-byte codes must have been mapped.
die "Bad gb_map_2_count != 23940"
    unless $gb_map_2_count == 23940;
# Individual four-byte mappings plus range members must number exactly
# max + 1, i.e. as many as there are linear indices 0 .. $gb_map_4_max.
die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
    unless $gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1;
# Individual mappings plus range members must account for 0x10000 code
# points less the 0xE000 - 0xD800 surrogates and the first 0x80 code points.
die "Bad uni_map_count"
    unless $uni_map_count + $gb_map_4_ranges
               == 0x10000 - (0xE000 - 0xD800) - 0x80;
172
# Append the individually mapped four-byte codes, in linear order, to the
# end of @gb_map_2 (growing $gb_map_2_count), skipping over every recorded
# range.  $gb_nonrangedataindex[i] receives the data index reached just
# before range i; one more entry is written after the last range.
# Dies with "Bad input" when an unmapped code is not the start of the next
# recorded range, or when not every range was consumed.
$range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
# $gb_map_4_max is the highest linear index itself, not a count, so the
# bound is inclusive.  With "<" a single mapping sitting at the highest
# index was silently left out, and a one-element range there was never
# consumed.
for ($gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        # A gap between individual mappings must be exactly the next range.
        ($range_index < $range_count
            && $gb_code == $range_linear_first[$range_index])
            or die "Bad input";
        # Jump to the end of the range; the loop increment steps past it.
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";
190
# Create the output file <lower-cased id>.tab on the handle OUT, which the
# rest of the script writes to.
$filename = lc($id) . ".tab";
open(OUT, '>', $filename) or die "Cannot write " . $filename . ": $!";

# Copy the leading comment block of <lower-cased id>.pl (looked up in the
# current directory; presumably this script itself) to the output,
# rewritten as a C comment.  Copying stops at the first line that does not
# start with '#'.
{
    my $script = lc($id) . ".pl";
    open(my $header_in, '<', $script)
        or die "Cannot read " . $script . ": $!";
    my $first = 1;
    while (my $line = <$header_in>)
    {
        # The interpreter line is not part of the header.
        next if $line =~ /^\#!.*$/;

        if ($line =~ /^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                # First starred line opens the comment: "#***" -> "/***".
                print OUT "/", $1, "\n";
                $first = 0;
            }
            else
            {
                # Any later starred line closes it: the last character of
                # the line is replaced by "/".
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif ($line =~ /^\# ?(.*)$/)
        {
            # Ordinary comment line: "#", plus one following blank if there
            # is one, becomes " *".
            print OUT " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close $header_in;
}
230
# Emit the guarded #include lines of the generated table file, with one
# blank line before, between and after the two groups.
print OUT <<'END_OF_INCLUDES';

#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H
#include "convertgb18030.h"
#endif

#ifndef _SAL_TYPES_H_
#include "sal/types.h"
#endif

END_OF_INCLUDES
240
# Emit the GB -> Unicode data array: the first $gb_map_2_count entries of
# @gb_map_2 as 0xXXXX literals, eight per line.
print OUT "static sal_Unicode const aImpl" . $id . "ToUnicodeData[] = {\n  ";
my $final_slot = $gb_map_2_count - 1;
foreach my $slot (0 .. $final_slot)
{
    print OUT sprintf("0x%04X,", $gb_map_2[$slot]);
    # Break after every eighth value, except after the very last one.
    print OUT "\n  " if $slot % 8 == 7 && $slot != $final_slot;
}
print OUT "\n};\n\n";
251
# Emit the GB -> Unicode range table.  Each row holds the data index
# reached before the range, the first linear index of the range, the
# linear index one past its end, and its first code point.  A row starting
# with -1 terminates the table.
print OUT "static ImplGb180302000ToUnicodeRange const\n    aImpl"
        . $id
        . "ToUnicodeRanges[] = {\n";
foreach my $row (0 .. $range_count - 1)
{
    my @columns = (
        $gb_nonrangedataindex[$row],
        $range_linear_first[$row],
        $range_linear_last[$row] + 1,
        $range_uni_first[$row],
    );
    print OUT sprintf("  { %d, %d, %d, 0x%04X },\n", @columns);
}
print OUT "  { -1, 0, 0, 0 }\n};\n\n";
264
# Emit the Unicode -> GB data array.  Code points are walked from U+0080
# to U+FFFF: an individually mapped one contributes its packed GB bytes,
# an unmapped one must be the start of the next recorded range, which is
# skipped as a whole.  $uni_nonrangedataindex[i] receives the data index
# reached before range i, with one more entry written after the last
# range.  Dies with "Bad input" when an unmapped code point does not start
# the next range or when not every range was consumed.
print OUT "static sal_uInt32 const aImplUnicodeTo" . $id . "Data[] = {\n  ";
$index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
$utf32 = 0x80;
while ($utf32 <= 0xFFFF)
{
    if (!defined($uni_map[$utf32]))
    {
        die "Bad input" unless $utf32 == $range_uni_first[$range_index];
        # Continue right behind the end of the range.
        $utf32 = $range_uni_last[$range_index] + 1;
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
        next;
    }

    # Six values per line: break before the 7th, 13th, ... value.
    print OUT "\n  " if $index > 0 && $index % 6 == 0;

    $bytes = $uni_map[$utf32];
    # Values that fit in 16 bits are printed as four hex digits, padded
    # with blanks to the width of the eight-digit form.
    my $format = ($bytes <= 0xFFFF ? "    0x%04X," : "0x%08X,");
    printf OUT $format, $bytes;
    ++$index;
    ++$utf32;
}
die "Bad input" unless $range_index == $range_count;
print OUT "\n};\n\n";
291
# Emit the Unicode -> GB range table.  Each row holds the data index
# reached before the range, the first and last code point of the range,
# and its first four-byte linear index.  No terminating row is written.
print OUT "static ImplUnicodeToGb180302000Range const\n    aImplUnicodeTo"
        . $id
        . "Ranges[] = {\n";
foreach my $row (0 .. $range_count - 1)
{
    my @columns = (
        $uni_nonrangedataindex[$row],
        $range_uni_first[$row],
        $range_uni_last[$row],
        $range_linear_first[$row],
    );
    print OUT sprintf("  { %d, 0x%04X, 0x%04X, %d },\n", @columns);
}
print OUT "};\n";
304
# Close the generated table.  Buffered write errors (for example a full
# disk) only surface here, so a failed close must not pass silently.
close(OUT) or die "Cannot write " . lc($id) . ".tab: $!";
306