#!/usr/bin/perl
#*************************************************************************
#
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# Copyright 2000, 2010 Oracle and/or its affiliates.
#
# OpenOffice.org - a multi-platform office productivity suite
#
# This file is part of OpenOffice.org.
#
# OpenOffice.org is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License version 3
# only, as published by the Free Software Foundation.
#
# OpenOffice.org is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License version 3 for more details
# (a copy is included in the LICENSE file that accompanied this code).
#
# You should have received a copy of the GNU Lesser General Public License
# version 3 along with OpenOffice.org. If not, see
# <http://www.openoffice.org/license.html>
# for a copy of the LGPLv3 License.
#
#*************************************************************************

# The following files must be available in a ./input subdir:

# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
# "modified version="3" date="2001-02-21""

# Overview
# --------
# Reads input/gb-18030-2000.xml and writes <lc($id)>.tab, a C source
# fragment holding four tables:
#   aImpl<id>ToUnicodeData     individually mapped GB codes -> Unicode
#   aImpl<id>ToUnicodeRanges   contiguous four-byte ranges -> Unicode
#   aImplUnicodeTo<id>Data     individually mapped Unicode -> GB bytes
#   aImplUnicodeTo<id>Ranges   contiguous Unicode ranges -> GB linear codes
#
# The script also reads its own source file (<lc($id)>.pl) and turns the
# leading "#" comment block into the C comment at the top of the output.
# That copy stops at the first line that does not start with "#", so the
# blank line after the closing "#****" line above must stay, and nothing
# may be inserted before it.

use strict;
use warnings;

# Identifier embedded in the generated table names; its lower-cased form
# is the base name of both this script and the output file.
my $id = "Gb180302000";

# Linear four-byte codes of the one <range> element that is deliberately
# left out of the generated tables (bFirst "90 30 81 30" linearises to
# 189000; the matching last code is 1237575).
my $SKIPPED_RANGE_LINEAR_FIRST = 189000;
my $SKIPPED_RANGE_LINEAR_LAST  = 1237575;

# Format a code point for diagnostics, e.g. 0x4E00 -> "U+4E00".
#   $_[0]  numeric code point
# Returns the formatted string.
sub printUtf32
{
    my ($utf32) = @_;
    return sprintf("U+%04X", $utf32);
}

# Format a one-, two- or four-byte GB 18030 sequence as upper-case hex
# without separators, e.g. (0x81, 0x40) -> "8140".
#   @_  one, two or four byte values
# The form is chosen by which arguments are defined: a defined third
# argument selects the four-byte form, otherwise a defined second
# argument selects the two-byte form.
sub printGb
{
    my ($b1, $b2, $b3, $b4) = @_;
    if (defined($b3))
    {
        return sprintf("%02X%02X%02X%02X", $b1, $b2, $b3, $b4);
    }
    elsif (defined($b2))
    {
        return sprintf("%02X%02X", $b1, $b2);
    }
    else
    {
        return sprintf("%02X", $b1);
    }
}

# Linearise a two-byte GB code: 190 trail-byte slots per lead byte
# (lead bytes start at 0x81; trail bytes 0x40..0x7E map to 0..62 and
# trail bytes from 0x80 upwards continue at 63).
sub _linear2
{
    my ($b1, $b2) = @_;
    return ($b1 - 0x81) * 190
        + ($b2 <= 0x7E ? $b2 - 0x40 : $b2 - 0x80 + 63);
}

# Linearise a four-byte GB code (bytes 1 and 3 start at 0x81, bytes 2
# and 4 start at 0x30; weights are 12600, 1260, 10 and 1).
sub _linear4
{
    my ($b1, $b2, $b3, $b4) = @_;
    return ($b1 - 0x81) * 12600
        + ($b2 - 0x30) * 1260
        + ($b3 - 0x81) * 10
        + ($b4 - 0x30);
}

# Parse the mapping file, validate it and write the .tab file.
# Dies with a short message on any malformed or inconsistent input and
# on any file that cannot be opened or written.
sub main
{
    my @gb_map_2;            # linear GB code -> Unicode (two-byte codes
                             # first, four-byte singles appended later)
    my @gb_map_4;            # linear four-byte code -> Unicode
    my @uni_map;             # Unicode -> packed GB byte sequence
    my @range_uni_first;     # per range: first Unicode code point
    my @range_uni_last;      # per range: last Unicode code point
    my @range_linear_first;  # per range: first linear four-byte code
    my @range_linear_last;   # per range: last linear four-byte code

    my $gb_map_2_count  = 0;
    my $gb_map_4_count  = 0;
    my $gb_map_4_ranges = 0; # total number of codes covered by ranges
    my $gb_map_4_max    = 0; # highest linear four-byte code in use
    my $uni_map_count   = 0;
    my $range_count     = 0;

    # ---- Pass 1: read the XML mapping file line by line ----------------
    my $input_name = "gb-18030-2000.xml";
    open my $in, '<', "input/" . $input_name
        or die "Cannot read " . $input_name . ": $!";
    while (my $line = <$in>)
    {
        if ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte: only checked to be an identity mapping.
            my ($utf32, $gb1) = map { hex($_) } ($1, $2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            # Two-byte code.
            my ($utf32, $gb1, $gb2) = map { hex($_) } ($1, $2, $3);
            my $gb_code = _linear2($gb1, $gb2);
            die "Redefined " . printGb($gb1, $gb2)
                if defined($gb_map_2[$gb_code]);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            die "Double Unicode mapping" if defined($uni_map[$utf32]);
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif ($line =~ /^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            # Individually mapped four-byte code.
            my ($utf32, $gb1, $gb2, $gb3, $gb4)
                = map { hex($_) } ($1, $2, $3, $4, $5);
            my $gb_code = _linear4($gb1, $gb2, $gb3, $gb4);
            die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4)
                if defined($gb_map_4[$gb_code]);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            die "Double Unicode mapping" if defined($uni_map[$utf32]);
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif ($line =~ /<a /)
        {
            die "Bad format";
        }
        elsif ($line =~ /^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            # Contiguous four-byte range.
            my ($utf32_first, $utf32_last, @gb)
                = map { hex($_) } ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);
            my $linear_first = _linear4(@gb[0 .. 3]);
            my $linear_last  = _linear4(@gb[4 .. 7]);
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";
            if ($linear_first != $SKIPPED_RANGE_LINEAR_FIRST
                || $linear_last != $SKIPPED_RANGE_LINEAR_LAST)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched to U+DFFF so the
                # Unicode scan below steps over U+D800..U+DFFF as well.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif ($line =~ /<range /)
        {
            die "Bad format";
        }
    }
    close $in;

    # ---- Report and sanity-check the totals ----------------------------
    print "gb_map_2_count = ", $gb_map_2_count,
          ", gb_map_4_count = ", $gb_map_4_count,
          ", gb_map_4_ranges = ", $gb_map_4_ranges,
          ", gb_map_4_max = ", $gb_map_4_max,
          ", uni_map_count = ", $uni_map_count, "\n";
    ($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";
    ($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
        or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";
    ($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
        or die "Bad uni_map_count";

    # ---- Pass 2: append four-byte singles after the two-byte data ------
    # $gb_nonrangedataindex[$r] is the index into @gb_map_2 of the first
    # individually mapped entry that precedes range $r.
    my $range_index = 0;
    my @gb_nonrangedataindex = ($gb_map_2_count);
    # The bound is inclusive: $gb_map_4_max is the highest code in use
    # (see the check above), not one past it.
    for (my $gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
    {
        if (defined($gb_map_4[$gb_code]))
        {
            $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
        }
        else
        {
            # Any gap must be the start of the next range, in order.
            ($range_index < $range_count
             && $gb_code == $range_linear_first[$range_index])
                or die "Bad input";
            $gb_code = $range_linear_last[$range_index];
            ++$range_index;
            $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
        }
    }
    ($range_index == $range_count) or die "Bad input";

    # ---- Write the output file -----------------------------------------
    my $out_name = lc($id) . ".tab";
    open my $out, '>', $out_name
        or die "Cannot write " . $out_name . ": $!";

    # Copy this script's leading comment block as a C comment.
    {
        my $self_name = lc($id) . ".pl";
        open my $self, '<', $self_name
            or die "Cannot read " . $self_name . ": $!";
        my $first = 1;
        while (my $line = <$self>)
        {
            if ($line =~ /^\#!.*$/)
            {
                # Skip the interpreter line.
            }
            elsif ($line =~ /^\#(\*.*)$/)
            {
                my $stars = $1;
                if ($first == 1)
                {
                    # First line of asterisks opens the C comment.
                    print {$out} "/", $stars, "\n";
                    $first = 0;
                }
                else
                {
                    # Later ones close it: last "*" becomes "/".
                    print {$out} " ", substr($stars, 0, length($stars) - 1),
                        "/\n";
                }
            }
            elsif ($line =~ /^\# (.*)$/)
            {
                print {$out} " *", $1, "\n";
            }
            elsif ($line =~ /^\#(.*)$/)
            {
                print {$out} " *", $1, "\n";
            }
            else
            {
                # First non-comment line ends the header.
                last;
            }
        }
        close $self;
    }

    print {$out} "\n",
        "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
        "#include \"convertgb18030.h\"\n",
        "#endif\n",
        "\n",
        "#ifndef _SAL_TYPES_H_\n",
        "#include \"sal/types.h\"\n",
        "#endif\n",
        "\n";

    # GB -> Unicode data, eight entries per output line.
    print {$out} "static sal_Unicode const aImpl", $id,
        "ToUnicodeData[] = {\n ";
    for my $i (0 .. $gb_map_2_count - 1)
    {
        printf {$out} "0x%04X,", $gb_map_2[$i];
        if ($i % 8 == 7 && $i != $gb_map_2_count - 1)
        {
            print {$out} "\n ";
        }
    }
    print {$out} "\n};\n\n";

    # GB -> Unicode ranges; the third field is one past the last code.
    print {$out} "static ImplGb180302000ToUnicodeRange const\n aImpl",
        $id,
        "ToUnicodeRanges[] = {\n";
    for my $r (0 .. $range_count - 1)
    {
        printf {$out} " { %d, %d, %d, 0x%04X },\n",
            $gb_nonrangedataindex[$r],
            $range_linear_first[$r],
            $range_linear_last[$r] + 1,
            $range_uni_first[$r];
    }
    print {$out} " { -1, 0, 0, 0 }\n};\n\n";

    # Unicode -> GB data, six entries per output line.
    print {$out} "static sal_uInt32 const aImplUnicodeTo", $id,
        "Data[] = {\n ";
    my $index = 0;
    $range_index = 0;
    my @uni_nonrangedataindex = ($index);
    for (my $utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
    {
        if (defined($uni_map[$utf32]))
        {
            if ($index > 0 && $index % 6 == 0)
            {
                print {$out} "\n ";
            }
            my $bytes = $uni_map[$utf32];
            my $format = ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,");
            printf {$out} $format, $bytes;
            ++$index;
        }
        else
        {
            # Any gap must be the start of the next range, in order.
            ($range_index < $range_count
             && $utf32 == $range_uni_first[$range_index])
                or die "Bad input";
            $utf32 = $range_uni_last[$range_index];
            ++$range_index;
            $uni_nonrangedataindex[$range_index] = $index;
        }
    }
    ($range_index == $range_count) or die "Bad input";
    print {$out} "\n};\n\n";

    # Unicode -> GB ranges (this table has no terminating entry).
    print {$out} "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
        $id,
        "Ranges[] = {\n";
    for my $r (0 .. $range_count - 1)
    {
        printf {$out} " { %d, 0x%04X, 0x%04X, %d },\n",
            $uni_nonrangedataindex[$r],
            $range_uni_first[$r],
            $range_uni_last[$r],
            $range_linear_first[$r];
    }
    print {$out} "};\n";

    # Buffered write errors only surface here.
    close $out or die "Cannot write " . $out_name . ": $!";
    return;
}

# Run the generator only when executed directly, so the helper subs can
# be loaded on their own.
main() unless caller;

1;