#!/usr/bin/awk -f
#
# Utility to compare MS-LANGID definitions with those defined in
# ../../inc/i18npool/lang.h
# Run in i18npool/source/isolang
#
# Outputs new  #define LANGUAGE_... 0x...  lines and also some commented out
# substrings that were matched in already existing defines.
#
# ATTENTION! The sed filter in the command line examples below assures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may break
# also.
#
# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
#
# After html2text it is best to clean up the file so that it _only_ contains
# the table entries; this is not strictly necessary because entries are
# filtered. Check the output.
#
# Expects input from the saved page of one of
#
# (1)
# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,blank fields:
#    |Afrikaans_-_South_Africa___|0436___|1078___|
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (2)
# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
# filtered through ``html2text -nobs ...'', generated table:
# blank,name,hex,dec,inputlocales,collection,blank fields:
#    |Afrikaans |0436 |1078 |0436:00000409, |Basic |
#
# complete command line:
# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
#
# (3)
# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
# filtered through ``html2text -nobs ...'', generated table:
# blank,hex,locale,name,blank fields:
#    |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
#
# complete command line:
# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
#
# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
#

# Load the existing definitions from lang.h, then set up the field separator
# and the per-file-type column layout tables.
BEGIN {
    langfile = "../../inc/i18npool/lang.h"
    # Read with the default (whitespace) field separator, so that
    # $2 is the macro name and $3 the 0x... value.
    while ((status = (getline < langfile)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    # getline yields -1 if the file could not be opened or read. Without the
    # existing definitions every input entry would be reported as new, so
    # say so instead of failing silently.
    if (status < 0)
        print "Cannot read " langfile >>"/dev/stderr"
    close(langfile)

    # html2text table follows
    # A bracket expression is used because "\|" is not a defined escape
    # sequence in awk string literals.
    FS = "[|]"
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    # Field numbers of the name, hex value and locale columns per file type;
    # 0 means the file type has no such column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}

# Anything with fewer than 5 '|'-separated fields is not a table row.
(NF < 5) { next }

# File type detection; runs for each row until a type has been recognized.
# NF includes the empty fields before the first and after the last '|'.
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    # Column numbers used by all following rules.
    name = namefield[filetype]
    hex = hexfield[filetype]
    loc = locfield[filetype]
}

# Normalize the fields of interest.
{
    # Drop everything up to and including the first ':' of the name ...
    gsub( /^[^:]*:/, "", $name)
    # ... and everything from the first '.' on.
    gsub( /\..*/, "", $name)
    # Strip leading and trailing blanks and underscores.
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# Strip a leading 0x from the hex value.
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# if only 464 instead of 0464, make it match lang.h
(length($hex) < 4) { $hex = "0" $hex }

# Rows whose hex column is not purely hexadecimal are remembered and
# reported at the end.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# all[HEX]=string
{ all[toupper($hex)] = $name }

# Remember the locale column, if the file type has one, as a C comment.
(loc) { comment[toupper($hex)] = "  /* " $loc " */" }

# new hex: newlang[HEX]=string
!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

# Report: proposed new defines, the reverse check, and the filtered rows.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"
    # every new language
    for (x in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
        n = split(newlang[x],arr,/[^A-Za-z0-9]/)
        def = ""
        for (i=1; i<=n; ++i)
        {
            if (length(arr[i]))
            {
                # each identifier word of the language name
                if (def)
                    def = def "_"
                aup = toupper(arr[i])
                def = def aup
                for (l in lang)
                {
                    # contained in already existing definitions?
                    if (lang[l] ~ aup)
                        printf( "// %-50s %s\n", arr[i] ": " lang[l], l)
                }
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
    }
    print "\n// --- reverse check follows ----------------------------------\n"
    for (x in lang)
    {
        if (!(x in all))
            print "// not in input file:   " x "  " lang[x]
    }
    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (x in filtered)
        print "// filtered:   " x "  " filtered[x]
}