#!/usr/bin/env perl
#**************************************************************
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#**************************************************************

use strict;
use warnings;

# NOTE: this script re-reads its own leading '#' comment block (see the
# header-copy step below) and stops at the first line that does not start
# with '#'.  Keep the blank line directly after the license banner, or
# later comments will leak into the generated file.

# The following files must be available in a ./input subdir:

# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
# "modified version="3" date="2001-02-21""

# Identifier used to derive the output file name (lower-cased, ".tab"),
# the name of this script (lower-cased, ".pl") and the generated C symbols.
my $id = "Gb180302000";

# Format a code point for diagnostics, e.g. 0x4E00 -> "U+4E00".
sub printUtf32
{
    my $utf32 = $_[0];
    return sprintf("U+%04X", $utf32);
}

# Format a 1-, 2- or 4-byte GB sequence as upper-case hex for diagnostics.
# The length is inferred from which arguments are defined: a defined third
# argument means four bytes, otherwise a defined second argument means two.
sub printGb
{
    if (defined($_[2]))
    {
        return sprintf("%02X%02X%02X%02X", $_[0], $_[1], $_[2], $_[3]);
    }
    elsif (defined($_[1]))
    {
        return sprintf("%02X%02X", $_[0], $_[1]);
    }
    else
    {
        return sprintf("%02X", $_[0]);
    }
}

# Map a four-byte sequence (b1 b2 b3 b4) to its zero-based linear index,
# counting from 81 30 81 30: 10 values for b4, 126 for b3, 10 for b2.
sub linearGb4
{
    my ($gb1, $gb2, $gb3, $gb4) = @_;
    return ($gb1 - 0x81) * 12600
        + ($gb2 - 0x30) * 1260
        + ($gb3 - 0x81) * 10
        + ($gb4 - 0x30);
}

# Parsed data.
my @gb_map_2;               # two-byte linear index -> code point
                            # (later extended with the four-byte singles)
my @gb_map_4;               # four-byte linear index -> code point
my @uni_map;                # code point -> packed GB bytes
my @range_uni_first;        # per <range>: first code point
my @range_uni_last;         # per <range>: last code point
my @range_linear_first;     # per <range>: first four-byte linear index
my @range_linear_last;      # per <range>: last four-byte linear index

my $gb_map_2_count = 0;
my $gb_map_4_count = 0;
my $gb_map_4_ranges = 0;    # total number of code points covered by ranges
my $gb_map_4_max = 0;       # highest four-byte linear index seen
my $uni_map_count = 0;

my $range_count = 0;

# ---- Step 1: parse the ICU XML mapping file -------------------------------
{
    my $input_name = "gb-18030-2000.xml";
    open my $in, '<', "input/" . $input_name
        or die "Cannot read " . $input_name . ": $!";
    while (my $line = <$in>)
    {
        if ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single-byte entries must be the identity mapping; nothing is
            # stored for them.
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);
            # 190 trail bytes per lead byte: 0x40..0x7E (63 values) followed
            # by 0x80 and up.
            my $gb_code = ($gb1 - 0x81) * 190
                + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);
            my $gb3 = hex($4);
            my $gb4 = hex($5);
            my $gb_code = linearGb4($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif ($line =~ /<a /)
        {
            die "Bad format";
        }
        elsif ($line =~ /^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my $utf32_first = hex($1);
            my $utf32_last = hex($2);
            my $linear_first = linearGb4(hex($3), hex($4), hex($5), hex($6));
            my $linear_last = linearGb4(hex($7), hex($8), hex($9), hex($10));
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";
            # The range 189000..1237575 (bytes 90 30 81 30 .. E3 32 9A 35)
            # is deliberately not tabulated; presumably it is the algorithmic
            # mapping of the supplementary planes handled by the converter.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched over the surrogates
                # so that the Unicode walk below skips U+D800..U+DFFF.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif ($line =~ /<range /)
        {
            die "Bad format";
        }
    }
    close $in;
}

# ---- Step 2: report and sanity-check the totals ---------------------------
print "gb_map_2_count = ", $gb_map_2_count,
      ", gb_map_4_count = ", $gb_map_4_count,
      ", gb_map_4_ranges = ", $gb_map_4_ranges,
      ", gb_map_4_max = ", $gb_map_4_max,
      ", uni_map_count = ", $uni_map_count, "\n";
# 23940 = 126 lead bytes * 190 trail bytes: the two-byte table must be dense.
($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";
($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
    or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";
# Every code point in 0x80..0xFFFF except the surrogates must be covered.
($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
    or die "Bad uni_map_count";

# ---- Step 3: append the individually mapped four-byte codes ---------------
# The four-byte singles are appended to @gb_map_2 in linear order; for each
# range the data index at which the range interrupts the data is recorded.
my @gb_nonrangedataindex;
my $range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
# $gb_map_4_max is the last valid linear index, so the bound is inclusive;
# an exclusive bound would drop a single mapping sitting at that index.
for (my $gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        # A hole must be exactly the start of the next range; jump over it.
        ($range_index < $range_count
            && $gb_code == $range_linear_first[$range_index])
            or die "Bad input";
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";

# ---- Step 4: write the generated C tables ---------------------------------
my $tab_name = lc($id) . ".tab";
open my $out, '>', $tab_name or die "Cannot write " . $tab_name . ": $!";

# Copy this script's leading '#' comment block into the output as a C
# comment: the first '#*...' line opens it, the second closes it.
{
    my $script_name = lc($id) . ".pl";
    open my $self, '<', $script_name
        or die "Cannot read " . $script_name . ": $!";
    my $first = 1;
    while (my $line = <$self>)
    {
        if ($line =~ /^\#!.*$/)
        {
            # The shebang line is not copied.
        }
        elsif ($line =~ /^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                print {$out} "/", $1, "\n";
                $first = 0;
            }
            else
            {
                print {$out} " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif ($line =~ /^\# (.*)$/)
        {
            print {$out} " *", $1, "\n";
        }
        elsif ($line =~ /^\#(.*)$/)
        {
            print {$out} " *", $1, "\n";
        }
        else
        {
            # First non-comment line ends the header.
            last;
        }
    }
    close $self;
}

print {$out} "\n",
    "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
    "#include \"convertgb18030.h\"\n",
    "#endif\n",
    "\n",
    "#ifndef _SAL_TYPES_H_\n",
    "#include \"sal/types.h\"\n",
    "#endif\n",
    "\n";

# GB -> Unicode data: two-byte table followed by the four-byte singles,
# eight values per line.
print {$out} "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
for (my $data_index = 0; $data_index < $gb_map_2_count; ++$data_index)
{
    printf {$out} "0x%04X,", $gb_map_2[$data_index];
    if ($data_index % 8 == 7 && $data_index != $gb_map_2_count - 1)
    {
        print {$out} "\n ";
    }
}
print {$out} "\n};\n\n";

# GB -> Unicode ranges: data index, first linear index, one past the last
# linear index, first code point; terminated by a { -1, 0, 0, 0 } entry.
print {$out} "static ImplGb180302000ToUnicodeRange const\n aImpl",
    $id,
    "ToUnicodeRanges[] = {\n";
for (my $r = 0; $r < $range_count; ++$r)
{
    printf {$out} " { %d, %d, %d, 0x%04X },\n",
        $gb_nonrangedataindex[$r],
        $range_linear_first[$r],
        $range_linear_last[$r] + 1,
        $range_uni_first[$r];
}
print {$out} " { -1, 0, 0, 0 }\n};\n\n";

# Unicode -> GB data: packed GB bytes for every individually mapped code
# point from 0x80 up, six values per line; ranges are skipped and the data
# index at which each range starts is recorded.
print {$out} "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
my @uni_nonrangedataindex;
my $index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
for (my $utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
{
    if (defined($uni_map[$utf32]))
    {
        if ($index > 0 && ($index - 1) % 6 == 5)
        {
            print {$out} "\n ";
        }
        my $bytes = $uni_map[$utf32];
        my $format = ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,");
        printf {$out} $format, $bytes;
        ++$index;
    }
    else
    {
        # An unmapped code point must be exactly the start of the next range.
        ($range_index < $range_count
            && $utf32 == $range_uni_first[$range_index])
            or die "Bad input";
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
    }
}
($range_index == $range_count) or die "Bad input";
print {$out} "\n};\n\n";

# Unicode -> GB ranges: data index, first and last code point, first linear
# index.  No terminating entry is written for this table.
print {$out} "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
    $id,
    "Ranges[] = {\n";
for (my $r = 0; $r < $range_count; ++$r)
{
    printf {$out} " { %d, 0x%04X, 0x%04X, %d },\n",
        $uni_nonrangedataindex[$r],
        $range_uni_first[$r],
        $range_uni_last[$r],
        $range_linear_first[$r];
}
print {$out} "};\n";

# Buffered write errors only surface at close, so check it.
close $out or die "Cannot write " . $tab_name . ": $!";