1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <fcntl.h>
27 #include <errno.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <ctype.h>
31 #include <sal/alloca.h>
32 
33 #include <rtl/ustring.hxx>
34 
35 #include <map>
36 #include <string>
37 
38 /*****************************************************************************
39  * typedefs
40  *****************************************************************************/
41 
/* Maps a language identifier (first column of the encoding table file) to the
 * text encoding that read_encoding_table() selected for it. */
typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
43 
/* One row of a static lookup table: a textual key and the encoding it denotes. */
struct _pair {
    const char *key;            /* lookup key, compared ignoring ASCII case */
    rtl_TextEncoding value;     /* text encoding associated with key */
};

/* Compares key against pair->key; the sign of the result drives _pair_search(). */
static int _pair_compare (const char *key, const _pair *pair);
/* Binary search for key among the member entries at base; NULL when not found. */
static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
51 
52 
/*
 * Code page identifiers, as read from the second column of the encoding
 * table file, and the text encodings they select.
 *
 * The entries MUST stay sorted by key in the order produced by
 * _pair_compare() -- string order, not numeric order ("1258" sorts before
 * "874") -- because read_encoding_table() looks them up with the binary
 * search in _pair_search().
 */
const _pair _ms_encoding_list[] = {
    { "0",       RTL_TEXTENCODING_UTF8        },
    { "1250",    RTL_TEXTENCODING_MS_1250     },
    { "1251",    RTL_TEXTENCODING_MS_1251     },
    { "1252",    RTL_TEXTENCODING_MS_1252     },
    { "1253",    RTL_TEXTENCODING_MS_1253     },
    { "1254",    RTL_TEXTENCODING_MS_1254     },
    { "1255",    RTL_TEXTENCODING_MS_1255     },
    { "1256",    RTL_TEXTENCODING_MS_1256     },
    { "1257",    RTL_TEXTENCODING_MS_1257     },
    { "1258",    RTL_TEXTENCODING_MS_1258     },
    { "874",     RTL_TEXTENCODING_MS_874      },
    { "932",     RTL_TEXTENCODING_MS_932      },
    { "936",     RTL_TEXTENCODING_MS_936      },
    { "949",     RTL_TEXTENCODING_MS_949      },
    { "950",     RTL_TEXTENCODING_MS_950      }
};
70 
71 
72 /*****************************************************************************
 * fgets replacement that works with Unix line endings on Windows
74  *****************************************************************************/
75 
/*
 * Reads at most n-1 characters from fp into s, one getc() at a time.
 * Reading stops after a '\n' has been stored, or as soon as getc() returns
 * EOF; the result is then NUL-terminated.
 *
 * Returns s when at least one character was stored. Returns NULL, leaving s
 * untouched, when nothing was stored (immediate EOF, or n smaller than 2).
 */
char * my_fgets(char *s, int n, FILE *fp)
{
    int stored = 0;

    while( stored < n-1 )
    {
        const int ch = getc(fp);
        if( ch == EOF )
            break;

        s[stored] = (char) ch;

        /* the newline is kept in the buffer and ends the line */
        if( s[stored++] == '\n' )
            break;
    }

    if( stored == 0 )
        return NULL;

    s[stored] = '\0';
    return s;
}
105 
106 /*****************************************************************************
107  * compare function for binary search
108  *****************************************************************************/
109 
110 static int
_pair_compare(const char * key,const _pair * pair)111 _pair_compare (const char *key, const _pair *pair)
112 {
113     int result = rtl_str_compareIgnoreAsciiCase( key, pair->key );
114     return result;
115 }
116 
117 /*****************************************************************************
118  * binary search on encoding tables
119  *****************************************************************************/
120 
121 static const _pair*
_pair_search(const char * key,const _pair * base,unsigned int member)122 _pair_search (const char *key, const _pair *base, unsigned int member )
123 {
124     unsigned int lower = 0;
125     unsigned int upper = member;
126     unsigned int current;
127     int comparison;
128 
129     /* check for validity of input */
130     if ( (key == NULL) || (base == NULL) || (member == 0) )
131         return NULL;
132 
133     /* binary search */
134     while ( lower < upper )
135     {
136         current = (lower + upper) / 2;
137         comparison = _pair_compare( key, base + current );
138         if (comparison < 0)
139             upper = current;
140         else
141         if (comparison > 0)
142             lower = current + 1;
143         else
144             return base + current;
145     }
146 
147     return NULL;
148 }
149 
150 
151 /************************************************************************
152  * read_encoding_table
153  ************************************************************************/
154 
read_encoding_table(char * file,EncodingMap & aEncodingMap)155 void read_encoding_table(char * file, EncodingMap& aEncodingMap)
156 {
157     FILE * fp = fopen(file, "r");
158     if ( ! fp  ) {
159         fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
160         exit(2);
161     }
162 
163     char buffer[512];
164     while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
165 
166         // strip comment lines
167         if ( buffer[0] == '#' )
168             continue;
169 
170         // find end of language string
171         char * cp;
172         for ( cp = buffer; ! isspace(*cp); cp++ )
173             ;
174         *cp = '\0';
175 
176         // find start of codepage string
177         for ( ++cp; isspace(*cp); ++cp )
178             ;
179         char * codepage = cp;
180 
181         // find end of codepage string
182         for ( ++cp; ! isspace(*cp); ++cp )
183             ;
184         *cp = '\0';
185 
186         // find the correct mapping for codepage
187         const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair );
188         const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members );
189 
190         if ( encoding != NULL ) {
191             const std::string language(buffer);
192             aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) );
193         }
194     }
195 
196     fclose(fp);
197 }
198 
199 /************************************************************************
200  * print_legacy_mixed
201  ************************************************************************/
202 
print_legacy_mixed(FILE * ostream,const rtl::OUString & aString,const std::string & language,EncodingMap & aEncodingMap)203 void print_legacy_mixed(
204     FILE * ostream,
205     const rtl::OUString& aString,
206     const std::string& language,
207     EncodingMap& aEncodingMap)
208 {
209     EncodingMap::iterator iter = aEncodingMap.find(language);
210 
211     if ( iter != aEncodingMap.end() ) {
212         fputs(OUStringToOString(aString, iter->second).getStr(), ostream);
213     } else {
214         fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
215     }
216 }
217 
218 /************************************************************************
219  * print_java_style
220  ************************************************************************/
221 
print_java_style(FILE * ostream,const rtl::OUString & aString)222 void print_java_style(FILE * ostream, const rtl::OUString& aString)
223 {
224     int imax = aString.getLength();
225     for (int i = 0; i < imax; i++) {
226         sal_Unicode uc = aString[i];
227         if ( uc < 128 ) {
228             fprintf(ostream, "%c", (char) uc);
229         } else {
230             fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF );
231         }
232     }
233 }
234 
235 /************************************************************************
236  * main
237  ************************************************************************/
238 
main(int argc,char * const argv[])239 int main( int argc, char * const argv[] )
240 {
241     EncodingMap aEncodingMap;
242 
243     FILE *istream = stdin;
244     FILE *ostream = stdout;
245 
246     char *outfile = NULL;
247 
248     int errflg = 0;
249     int argi;
250 
251     for( argi=1; argi < argc; argi++ )
252     {
253         if( argv[argi][0] == '-' && argv[argi][2] == '\0' )
254         {
255             switch(argv[argi][1]) {
256             case 'o':
257                 if (argi+1 >= argc || argv[argi+1][0] == '-')
258                 {
259                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
260                     errflg++;
261                     break;
262                 }
263 
264                 ++argi;
265                 outfile = argv[argi];
266                 break;
267             case 't':
268                 if (argi+1 >= argc || argv[argi+1][0] == '-')
269                 {
270                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
271                     errflg++;
272                     break;
273                 }
274 
275                 read_encoding_table(argv[++argi], aEncodingMap);
276                 break;
277             default:
278                 fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]);
279                 errflg++;
280             }
281         }
282         else
283         {
284             break;
285         }
286     }
287 
288     if (errflg) {
289       fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
290       exit(2);
291     }
292 
293     /* assign input file to stdin */
294     if ( argi < argc )
295     {
296         istream = fopen(argv[argi], "r");
297         if ( istream  == NULL ) {
298             fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
299             exit(2);
300         }
301     }
302 
303 	/* open output file if any */
304 	if ( outfile )
305 	{
306         ostream = fopen(outfile, "w");
307         if ( ostream == NULL ) {
308             fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
309             fclose(istream);
310             exit(2);
311         }
312 	}
313 
314     /* read line by line from stdin */
315     char buffer[65536];
316     while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
317 
318         /* only handle lines containing " = " */
319         char * cp = strstr(buffer, " = \"");
320         if ( cp ) {
321             rtl::OUString aString;
322 
323             /* find end of lang string */
324             int n;
325             for ( n=0; ! isspace(buffer[n]); n++ )
326                 ;
327 
328             std::string line = buffer;
329             std::string lang(line, 0, n);
330 
331             cp += 4;
332             rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp,
333                 RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
334 
335             fprintf(ostream, "%s = \"", lang.c_str());
336 
337             if ( aEncodingMap.empty() ) {
338                 print_java_style(ostream, aString);
339             } else {
340                 print_legacy_mixed(ostream, aString, lang, aEncodingMap);
341             }
342 
343             fprintf(ostream, "\"\n");
344 
345 
346         } else {
347             fputs(buffer, ostream);
348         }
349     }
350 
351     fclose(ostream);
352     fclose(istream);
353 }
354