#!/usr/bin/awk -f
#
# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
# Run in i18npool/source/isolang
#
# Outputs new #define LANGUAGE_... 0x... and also some commented out substrings
# that were matched in already existing defines.
#
# If ../../inc/i18npool/lang.h cannot be read, an error is printed to stderr
# and the script exits with status 1 instead of reporting every LCID as new.
#
# ATTENTION! The sed filter in the command line examples below assures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
#
# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
#
# After html2text best if file cleaned up to _only_ contain the table entries,
# but not necessary, entries are filtered. Check output.
#
# Expects input from the saved page of one of
#
# (1)
# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,blank fields:
#    |Afrikaans_-_South_Africa___|0436___|1078___|
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (2)
# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,inputlocales,collection,blank fields:
#    |Afrikaans |0436 |1078 |0436:00000409, |Basic |
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (3)
# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
# filtered through ``html2text -nobs ...'', generated table:
# blank,hex,locale,name,blank fields:
#    |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
#
# complete command line:
# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
#

BEGIN {
    # Load the already known definitions: lang[HEX] = NAME (both upper case,
    # HEX without the leading 0x).
    langfile = "../../inc/i18npool/lang.h"
    while ((rc = (getline < langfile)) > 0)
    {
        if ($0 ~ /^#define[ \t]*LANGUAGE_[_A-Za-z0-9]*[ \t]*0x[0-9a-fA-F]/)
            lang[toupper(substr($3, 3))] = toupper($2)
    }
    if (rc < 0)
    {
        # getline returns -1 if the file cannot be opened or read; without the
        # reference definitions every comparison below would be meaningless.
        print "Cannot read " langfile >>"/dev/stderr"
        failed = 1
        exit(1)
    }
    close(langfile)

    # html2text table follows; a single-character FS is taken literally, so
    # no escaping is needed (and "\|" is not a defined string escape).
    FS = "|"

    # Known input layouts and, for each, the field numbers of the language
    # name, the hex LCID and the locale string (0 == layout has no such field).
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}

# Table rows have at least 5 '|' separated fields; skip everything else.
(NF < 5) { next }

# Detect the input layout from the first row that looks like a table entry.
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    name = namefield[filetype]
    hex  = hexfield[filetype]
    loc  = locfield[filetype]
}

# Normalize the fields of interest.
{
    # drop everything up to and including the first ':' of the name
    gsub( /^[^:]*:/, "", $name)
    # drop everything from the first '.' of the name onwards
    gsub( /\..*/, "", $name)
    # trim the blank / underscore padding of the table cells
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# lang.h keys carry no 0x prefix
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# Set aside rows whose hex cell is not a hex number (headers, blank cells).
# This is checked before padding so that an empty cell is not turned into "0"
# and mistaken for a valid LCID.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# if only 464 (or 64) instead of 0464, pad to four digits to match lang.h
{ while (length($hex) < 4) $hex = "0" $hex }

# Record the entry:
#   all[HEX]     = name      every LCID seen in the input
#   comment[HEX] = locale    only for layouts that have a locale field
#   newlang[HEX] = name      LCIDs not yet defined in lang.h
{
    key = toupper($hex)
    all[key] = $name
    if (loc)
        comment[key] = " /* " $loc " */"
    if (!(key in lang))
        newlang[key] = $name
}

END {
    # lang.h could not be read; the error was already reported in BEGIN
    if (failed)
        exit(1)
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"
    # every new language
    for (x in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
        n = split(newlang[x], arr, /[^A-Za-z0-9]/)
        def = ""
        for (i = 1; i <= n; ++i)
        {
            if (!length(arr[i]))
                continue
            # each identifier word of the language name
            aup = toupper(arr[i])
            def = (def ? def "_" : "") aup
            for (known in lang)
            {
                # contained in already existing definitions?
                # (aup is purely alphanumeric, so a plain substring search
                # is equivalent to a regex match)
                if (index(lang[known], aup))
                    printf( "// %-50s %s\n", arr[i] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
    }
    print "\n// --- reverse check follows ----------------------------------\n"
    for (x in lang)
    {
        if (!(x in all))
            print "// not in input file: " x " " lang[x]
    }
    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (x in filtered)
        print "// filtered: " x " " filtered[x]
}