xref: /aoo42x/main/i18npool/source/isolang/lcid.awk (revision cdf0e10c)
1#!/usr/bin/awk -f
2#
3# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
4# Run in i18npool/source/isolang
5#
6# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
7# that were matched in already existing defines.
8#
9# ATTENTION! The sed filter in the command line examples below assures that a
10# '|' border is drawn by html2text in data tables, and nowhere else, on which
11# this awk script relies. This script also heavily relies on the column layout
12# encountered. Should MS decide to change their layout or their CSS names
13# ("data..."), this would probably break. Should html2text decide that the last
14# border="..." attribute encountered wins instead of the first, this may break
15# also.
16#
17# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
18#
19# After html2text best if file cleaned up to _only_ contain the table entries,
20# but not necessary, entries are filtered. Check output.
21#
22# Expects input from the saved page of one of
23#
24# (1)
25# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
26# filtered through ``html2text -nobs ...'', generated table:
27# blank,name,hex,dec,blank fields:
28#    |Afrikaans_-_South_Africa___|0436___|1078___|
29#
30# complete command line:
31# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
32#
33#
34# (2)
35# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
36# filtered through ``html2text -nobs ...'', generated table:
37# blank,name,hex,dec,inputlocales,collection,blank fields:
38#    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
39#
40# complete command line:
41# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
42#
43#
44# (3)
45# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
46# filtered through ``html2text -nobs ...'', generated table:
47# blank,hex,locale,name,blank  fields:
48#   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
49#
50# complete command line:
51# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
52#
53# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
54#
55
# Load the LANGUAGE_* definitions already present in lang.h and set up the
# per-input-format column layout used by the main rules.
BEGIN {
    langh = "../../inc/i18npool/lang.h"
    # getline yields 1 for each line read, 0 at end of file and -1 on error
    # (e.g. the header is not found because the script was started from a
    # different directory).
    while ((status = (getline < langh)) > 0)
    {
        # The default whitespace field splitting is still in effect here:
        # $1 = "#define", $2 = LANGUAGE_xxx, $3 = 0x....
        # Require real separators (blank or tab) so that $2 and $3 exist.
        if ($0 ~ /^#define[ \t]+LANGUAGE_[_A-Za-z0-9]*[ \t]+0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without the leading "0x"
            lang[toupper(substr($3,3))] = toupper($2)
            #print substr($3,3) "=" $2
        }
    }
    if (status < 0)
        print "Warning: cannot read " langh ", every input entry will be reported as new." >>"/dev/stderr"
    else
        close(langh)
    # html2text table follows; a single-character FS is taken literally,
    # no escaping needed (and "\|" is not a defined string escape).
    FS = "|"
    # Input type, determined from the first usable record; 0 = not yet known.
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    # Field number of the language name for each input type.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    # Field number of the hexadecimal LCID for each input type.
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    # Field number of the locale string; 0 means the input has no such column.
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}
86
# Records with fewer than five '|' separated fields cannot be table rows.
NF < 5 { next }

# Until an input type has been recognized, inspect each candidate row:
# seven fields mean the xp_lcid layout, five fields mean nls_238z if the
# second field starts with "0x" or lcid_all if it starts with "Afrikaans".
# Rows that do not settle the type are skipped.
filetype == 0 {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    if (filetype == 0)
        next

    # Field numbers of the columns of interest for the detected layout.
    name = namefield[filetype]
    hex  = hexfield[filetype]
    loc  = locfield[filetype]
}
105
# Strip decoration from the fields of interest.
{
    # drop a leading "...:" prefix (everything up to the first colon)
    gsub( /^[^:]*:/, "", $name)
    # drop everything from the first dot onwards
    gsub( /\..*/, "", $name)
    # trim leading and trailing blanks and underscores
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# lang.h keys are stored without the "0x" prefix
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# if only 464 (or 64) instead of 0464, make it match lang.h: pad to four
# digits. Only real hex numbers are padded, so an empty or textual cell is
# not turned into something that passes the hex check below.
($hex ~ /^[0-9a-fA-F][0-9a-fA-F]*$/) {
    while (length($hex) < 4)
        $hex = "0" $hex
}

# anything that is not a plain hex number is remembered for the report and
# takes no further part in the comparison
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }
121
# Remember the current table row under its upper-cased hex value:
#   all[HEX]     = name, for every row seen
#   comment[HEX] = locale string wrapped in a C comment, if there is a locale column
#   newlang[HEX] = name, only for hex values not defined in lang.h
{
    lcidkey = toupper($hex)
    all[lcidkey] = $name
    if (loc)
        comment[lcidkey] = "  /* " $loc " */"
    if (!(lcidkey in lang))
        newlang[lcidkey] = $name
}
129
# Report: proposed new defines, then the lang.h entries that were missing
# from the input, then the table rows that were filtered out.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit 1
    }
    print "// assuming " filetypename[filetype] " file"

    # One section per hex value that lang.h does not know yet.
    for (code in newlang)
    {
        # Raw entry as found in the table, marked so it cannot be compiled.
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[code], code, comment[code])

        # Build an identifier from the alphanumeric words of the name.
        count = split(newlang[code], words, /[^A-Za-z0-9]/)
        ident = ""
        for (idx = 1; idx <= count; ++idx)
        {
            if (length(words[idx]) == 0)
                continue
            word = toupper(words[idx])
            ident = (ident == "" ? word : ident "_" word)
            # List existing definitions whose name contains this word.
            for (known in lang)
            {
                if (lang[known] ~ word)
                    printf( "// %-50s %s\n", words[idx] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang)
    {
        if (code in all)
            continue
        print "// not in input file:   " code "  " lang[code]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (code in filtered)
        print "// filtered:   " code "  " filtered[code]
}
172