xref: /AOO41X/main/i18npool/source/isolang/lcid.awk (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir#!/usr/bin/awk -f
2*cdf0e10cSrcweir#
3*cdf0e10cSrcweir# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
4*cdf0e10cSrcweir# Run in i18npool/source/isolang
5*cdf0e10cSrcweir#
6*cdf0e10cSrcweir# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
7*cdf0e10cSrcweir# that were matched in already existing defines.
8*cdf0e10cSrcweir#
9*cdf0e10cSrcweir# ATTENTION! The sed filter in the command line examples below assures that a
10*cdf0e10cSrcweir# '|' border is drawn by html2text in data tables, and nowhere else, on which
11*cdf0e10cSrcweir# this awk script relies. This script also heavily relies on the column layout
12*cdf0e10cSrcweir# encountered. Should MS decide to change their layout or their CSS names
13*cdf0e10cSrcweir# ("data..."), this would probably break. Should html2text decide that the last
14*cdf0e10cSrcweir# border="..." attribute encountered wins instead of the first, this may break
15*cdf0e10cSrcweir# also.
16*cdf0e10cSrcweir#
17*cdf0e10cSrcweir# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
18*cdf0e10cSrcweir#
# After running html2text it is best to clean up the file so that it contains
# _only_ the table entries. This is not strictly necessary because non-matching
# entries are filtered out, but do check the output.
21*cdf0e10cSrcweir#
22*cdf0e10cSrcweir# Expects input from the saved page of one of
23*cdf0e10cSrcweir#
24*cdf0e10cSrcweir# (1)
25*cdf0e10cSrcweir# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
26*cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
27*cdf0e10cSrcweir# blank,name,hex,dec,blank fields:
28*cdf0e10cSrcweir#    |Afrikaans_-_South_Africa___|0436___|1078___|
29*cdf0e10cSrcweir#
30*cdf0e10cSrcweir# complete command line:
31*cdf0e10cSrcweir# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
32*cdf0e10cSrcweir#
33*cdf0e10cSrcweir#
34*cdf0e10cSrcweir# (2)
35*cdf0e10cSrcweir# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
36*cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
37*cdf0e10cSrcweir# blank,name,hex,dec,inputlocales,collection,blank fields:
38*cdf0e10cSrcweir#    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
39*cdf0e10cSrcweir#
40*cdf0e10cSrcweir# complete command line:
41*cdf0e10cSrcweir# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
42*cdf0e10cSrcweir#
43*cdf0e10cSrcweir#
44*cdf0e10cSrcweir# (3)
45*cdf0e10cSrcweir# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
46*cdf0e10cSrcweir# filtered through ``html2text -nobs ...'', generated table:
47*cdf0e10cSrcweir# blank,hex,locale,name,blank  fields:
48*cdf0e10cSrcweir#   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
49*cdf0e10cSrcweir#
50*cdf0e10cSrcweir# complete command line:
51*cdf0e10cSrcweir# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
52*cdf0e10cSrcweir#
53*cdf0e10cSrcweir# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
54*cdf0e10cSrcweir#
55*cdf0e10cSrcweir
# Load the already known LANGUAGE_... definitions from lang.h and set up the
# per-file-type column layout of the html2text tables that follow as input.
BEGIN {
    langfile = "../../inc/i18npool/lang.h"
    # getline yields 1 per line read, 0 at end of file and -1 on error.
    while ((rc = (getline < langfile)) > 0)
    {
        # Whitespace (space or tab) is required between the three tokens,
        # otherwise $2/$3 below would not be the name and the value.
        if ($0 ~ /^#define[ \t]+LANGUAGE_[_A-Za-z0-9]*[ \t]+0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without the leading "0x", both upper case
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    if (rc < 0)
    {
        # Without lang.h every table entry would be reported as new, so at
        # least tell the user why the output looks that way.
        print "Warning: cannot read " langfile ", run in i18npool/source/isolang" >>"/dev/stderr"
    }
    else
        close(langfile)
    # html2text table follows; a single-character FS is taken literally.
    # ("\|" is an undefined escape sequence in an awk string literal.)
    FS = "|"
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    # Field numbers of the name, hex and locale columns per file type;
    # a locale field of 0 means the table has no locale column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}
86*cdf0e10cSrcweir
# A table row has at least five '|' separated fields; ignore everything else.
NF < 5 { next }

# As long as the table layout is unknown, try to derive it from the current
# row: seven fields mean the xp_lcid layout, five fields mean nls_238z if the
# first column starts with "0x" or lcid_all if it starts with "Afrikaans".
# Rows seen before the layout is recognized are skipped.
filetype == 0 {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    if (filetype == 0)
        next

    # Field numbers of the columns used by the rules that follow.
    name = namefield[filetype]
    hex  = hexfield[filetype]
    loc  = locfield[filetype]
}
105*cdf0e10cSrcweir
# Clean up the columns of interest: drop a leading "...:" part and everything
# from the first '.' on in the name, then strip the blank/underscore padding
# html2text puts around the cell contents.
{
    gsub( /^[^:]*:/, "", $name)
    gsub( /\..*/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# strip a hex prefix, lang.h keys are stored without it
($hex ~ /^0[xX]/) { $hex = substr( $hex, 3) }

# Anything that is not a plain hex number (including an empty cell) is not a
# table entry; remember it for the report and skip it. This check has to come
# before the zero padding, otherwise an empty cell would turn into "0" and
# slip through as a valid LCID.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# if only 464 (or 7F) instead of 0464 (007F), make it match lang.h
(length($hex) < 4) {
    while (length($hex) < 4)
        $hex = "0" $hex
}
121*cdf0e10cSrcweir
# Record a valid table entry under its upper-cased hex value:
#   all[HEX]     = name, for every entry
#   comment[HEX] = locale as a C comment, if the table has a locale column
#   newlang[HEX] = name, if HEX is not yet defined in lang.h
{
    hexkey = toupper($hex)
    all[hexkey] = $name
    if (loc)
        comment[hexkey] = "  /* " $loc " */"
    if (!(hexkey in lang))
        newlang[hexkey] = $name
}
129*cdf0e10cSrcweir
# Report: proposed #defines for every new LCID together with hints on existing
# defines that contain one of the words of the new name, then the defines of
# lang.h that did not occur in the input, then the rows that were filtered.
END {
    if (filetype == 0)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # every new language
    for (hexcode in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[hexcode], hexcode, comment[hexcode])

        # Build the identifier from the alphanumeric words of the name.
        ident = ""
        nwords = split(newlang[hexcode], words, /[^A-Za-z0-9]/)
        for (w = 1; w <= nwords; ++w)
        {
            if (length(words[w]) == 0)
                continue
            word = toupper(words[w])
            ident = (ident != "") ? ident "_" word : word
            # List existing definitions whose name contains this word; the
            # word is purely alphanumeric, so a substring search is the same
            # as a regular expression match against it.
            for (known in lang)
            {
                if (index(lang[known], word) > 0)
                    printf( "// %-50s %s\n", words[w] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, hexcode)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (known in lang)
    {
        if (known in all)
            continue
        print "// not in input file:   " known "  " lang[known]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (bad in filtered)
        print "// filtered:   " bad "  " filtered[bad]
}
172