#!/usr/bin/awk -f
# *************************************************************
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# *************************************************************
#
# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
# Run in i18npool/source/isolang
#
# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
# that were matched in already existing defines.
#
# ATTENTION! The sed filter in the command line examples below assures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
#
# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
#
# After html2text best if file cleaned up to _only_ contain the table entries,
# but not necessary, entries are filtered. Check output.
#
# Expects input from the saved page of one of
#
# (1)
# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,blank fields:
# |Afrikaans_-_South_Africa___|0436___|1078___|
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (2)
# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,inputlocales,collection,blank fields:
# |Afrikaans |0436 |1078 |0436:00000409, |Basic |
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (3)
# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
# filtered through ``html2text -nobs ...'', generated table:
# blank,hex,locale,name,blank fields:
# |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
#
# complete command line:
# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
#

# Arrays built by this script, all keyed by the upper-cased hex LANGID
# without the "0x" prefix:
#   lang[HEX]     macro name already #define'd in lang.h
#   all[HEX]      language name of every accepted input row
#   newlang[HEX]  language name of input rows whose HEX is not in lang[]
#   comment[HEX]  " /* locale */" suffix, only for inputs having a locale column
#   filtered[X]   rebuilt text of rows rejected because X is not a hex number

BEGIN {
    # Load the existing definitions. The default whitespace field splitting
    # is still active here, so $2 is the macro name and $3 the 0x... value.
    langfile = "../../inc/i18npool/lang.h"
    while ((status = (getline < langfile)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    # getline yields -1 when the file cannot be read. Without lang.h every
    # input row would be reported as new, so say so instead of staying silent.
    if (status < 0)
        print "Cannot read " langfile ", all entries will be reported as new." >>"/dev/stderr"
    close(langfile)

    # html2text table follows
    # A single-character FS other than space is taken literally, so no
    # escaping is needed (and "\|" is not a defined string escape).
    FS = "|"

    # Input flavours; 0 means "not detected yet".
    filetype = 0
    lcid_all = 1
    xp_lcid = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid] = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field numbers of the name, hex and locale columns for each flavour.
    # Field 1 is the empty text before the leading '|'. A locale field
    # number of 0 means the flavour has no locale column.
    namefield[lcid_all] = 2
    namefield[xp_lcid] = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all] = 3
    hexfield[xp_lcid] = 3
    hexfield[nls_238z] = 2
    locfield[lcid_all] = 0
    locfield[xp_lcid] = 0
    locfield[nls_238z] = 3
}

# Rows with fewer than 5 '|'-separated fields cannot be table entries.
(NF < 5) { next }

# Detect the input flavour from the first row that identifies it; rows seen
# before that are skipped. Once detected, name/hex/loc hold the field numbers
# used by all following rules (including for this same row).
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    name = namefield[filetype]
    hex = hexfield[filetype]
    loc = locfield[filetype]
}

# Normalize the cells of interest.
{
    # drop everything up to and including the first ':' of the name
    gsub( /^[^:]*:/, "", $name)
    # drop everything from the first '.' of the name onwards
    gsub( /\..*/, "", $name)
    # trim the blank / underscore padding html2text puts around cell text
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# strip a leading 0x so only the hex digits remain
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# if only 464 instead of 0464, make it match lang.h
# Pad up to four digits; an empty cell is left empty so that it is rejected
# by the hex check below instead of turning into a bogus "0" entry.
{
    while (length($hex) > 0 && length($hex) < 4)
        $hex = "0" $hex
}

# Anything that is not a pure hex number is remembered for the report at the
# end and excluded from further processing.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# all[HEX]=string
{ all[toupper($hex)] = $name }

# remember the locale as a trailing C comment, if the input has that column
(loc) { comment[toupper($hex)] = " /* " $loc " */" }

# new hex: newlang[HEX]=string
!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"
    # every new language
    for (x in newlang)
    {
        # raw name as found in the input, not a valid #define on purpose
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
        # build an identifier from the alphanumeric words of the name
        n = split(newlang[x],arr,/[^A-Za-z0-9]/)
        def = ""
        for (i=1; i<=n; ++i)
        {
            if (length(arr[i]))
            {
                # each identifier word of the language name
                if (def)
                    def = def "_"
                aup = toupper(arr[i])
                def = def aup
                for (code in lang)
                {
                    # contained in already existing definitions?
                    # aup is purely alphanumeric, a substring test suffices
                    if (index(lang[code], aup) > 0)
                        printf( "// %-50s %s\n", arr[i] ": " lang[code], code)
                }
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
    }
    print "\n// --- reverse check follows ----------------------------------\n"
    # definitions of lang.h that did not show up in the input
    for (x in lang)
    {
        if (!(x in all))
            print "// not in input file: " x " " lang[x]
    }
    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (x in filtered)
        print "// filtered: " x " " filtered[x]
}