xref: /trunk/main/i18npool/source/isolang/lcid.awk (revision 5b501c92)
1#!/usr/bin/awk -f
2# *************************************************************
3#
4#  Licensed to the Apache Software Foundation (ASF) under one
5#  or more contributor license agreements.  See the NOTICE file
6#  distributed with this work for additional information
7#  regarding copyright ownership.  The ASF licenses this file
8#  to you under the Apache License, Version 2.0 (the
9#  "License"); you may not use this file except in compliance
10#  with the License.  You may obtain a copy of the License at
11#
12#    http://www.apache.org/licenses/LICENSE-2.0
13#
14#  Unless required by applicable law or agreed to in writing,
15#  software distributed under the License is distributed on an
16#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17#  KIND, either express or implied.  See the License for the
18#  specific language governing permissions and limitations
19#  under the License.
20#
21# *************************************************************
22#
23# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
24# Run in i18npool/source/isolang
25#
26# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
27# that were matched in already existing defines.
28#
# ATTENTION! The sed filter in the command line examples below ensures that a
# '|' border is drawn by html2text in data tables, and nowhere else, on which
# this awk script relies. This script also heavily relies on the column layout
# encountered. Should MS decide to change their layout or their CSS names
# ("data..."), this would probably break. Should html2text decide that the last
# border="..." attribute encountered wins instead of the first, this may also
# break.
36#
37# sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
38#
# After html2text it is best to clean up the file so that it contains _only_
# the table entries. This is not strictly necessary, because other lines are
# filtered out, but check the output.
41#
42# Expects input from the saved page of one of
43#
44# (1)
45# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
46# filtered through ``html2text -nobs ...'', generated table:
47# blank,name,hex,dec,blank fields:
48#    |Afrikaans_-_South_Africa___|0436___|1078___|
49#
50# complete command line:
51# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
52#
53#
54# (2)
55# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
56# filtered through ``html2text -nobs ...'', generated table:
57# blank,name,hex,dec,inputlocales,collection,blank fields:
58#    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
59#
60# complete command line:
61# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
62#
63#
64# (3)
65# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
66# filtered through ``html2text -nobs ...'', generated table:
67# blank,hex,locale,name,blank  fields:
68#   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
69#
70# complete command line:
71# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
72#
73# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
74#
75
# Load the already known LANGUAGE_* defines from lang.h into lang[], then set
# up the field separator and the per-file-type column tables used by the rules
# below.
BEGIN {
    langheader = "../../inc/i18npool/lang.h"
    # getline yields 1 for each line read, 0 at end of file and -1 on error.
    # lang.h is still split on whitespace here; FS is only changed afterwards.
    while ((rc = (getline < langheader)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX without its leading "0x", both upper-cased
            lang[toupper(substr($3,3))] = toupper($2)
        }
    }
    if (rc < 0)
        print "Cannot read " langheader "; no existing LANGUAGE_ defines loaded." >>"/dev/stderr"
    close(langheader)

    # html2text table follows. A single-character FS other than a blank is
    # taken literally, so the '|' needs no escaping.
    FS = "|"

    # Input file types; 0 means "not recognized yet".
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field numbers of the language name, the hex LCID and the locale string
    # for each file type. Field 1 is the empty text before the first '|'.
    # A locfield of 0 means the file type has no locale column.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all]  = 3
    hexfield[xp_lcid]   = 3
    hexfield[nls_238z]  = 2
    locfield[lcid_all]  = 0
    locfield[xp_lcid]   = 0
    locfield[nls_238z]  = 3
}
106
# Rows of the data tables have at least five '|'-separated fields; anything
# shorter is skipped.
NF < 5 { next }

# As long as no file type has been recognized, try to classify the current
# row by its field count and the content of its second field. Rows that do
# not identify a file type are skipped. Once a type is known, remember which
# fields hold the name, the hex value and the locale.
filetype == 0 {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    if (!filetype)
        next

    name = namefield[filetype]
    hex  = hexfield[filetype]
    loc  = locfield[filetype]
}
125
# Normalize the name, hex and (where present) locale fields of the current
# row, and divert rows whose hex field is not a hexadecimal number into
# filtered[] so they show up in the final report instead of being processed.
{
    # Name: drop a leading "...:" prefix and everything from the first '.' on.
    gsub( /^[^:]*:/, "", $name)
    gsub( /\..*/, "", $name)

    # Strip the blank/underscore padding around the cell text (see the sample
    # rows in the file header).
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)

    if ($hex ~ /^0x/)
        $hex = substr( $hex, 3)

    # Validate before padding: an empty or non-hex cell must not be turned
    # into a seemingly valid value such as "0" by the zero padding below.
    if ($hex !~ /^[0-9a-fA-F]+$/)
    {
        filtered[$hex] = $0
        next
    }

    # if only 464 (or fewer digits) instead of 0464, make it match lang.h
    while (length($hex) < 4)
        $hex = "0" $hex
}
141
# Record the current (already normalized) row under its upper-cased hex value:
#   all[HEX]     = name, for every row
#   comment[HEX] = locale wrapped in a C comment, if the file type has one
#   newlang[HEX] = name, if HEX is not among the defines read from lang.h
{
    lcid = toupper($hex)
    all[lcid] = $name
    if (loc)
        comment[lcid] = "  /* " $loc " */"
    if (!(lcid in lang))
        newlang[lcid] = $name
}
149
# Print, as "//" comment lines, every existing define in lang[] whose name
# contains the upper-cased WORD, followed by that define's hex value.
function show_similar_defines(word,    needle, code)
{
    needle = toupper(word)
    for (code in lang)
    {
        if (lang[code] ~ needle)
            printf( "// %-50s %s\n", word ": " lang[code], code)
    }
}

# Report one hex value that is missing from lang.h: first the raw name as
# found in the input, then the existing defines that share a word with it,
# and finally a proposed #define whose identifier is built from the
# upper-cased alphanumeric words of the name joined by '_'.
function report_new_language(code,    words, count, k, ident)
{
    printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[code], code, comment[code])
    count = split(newlang[code], words, /[^A-Za-z0-9]/)
    ident = ""
    for (k = 1; k <= count; k++)
    {
        # consecutive separators produce empty words
        if (!length(words[k]))
            continue
        if (ident)
            ident = ident "_"
        ident = ident toupper(words[k])
        show_similar_defines(words[k])
    }
    printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
}

# Final report: proposed new defines, then the defines from lang.h that did
# not occur in the input, then the table rows that were filtered out.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit 1
    }
    print "// assuming " filetypename[filetype] " file"

    for (code in newlang)
        report_new_language(code)

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang)
    {
        if (code in all)
            continue
        print "// not in input file:   " code "  " lang[code]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (code in filtered)
        print "// filtered:   " code "  " filtered[code]
}
192