xref: /trunk/main/l10ntools/source/gsiconv.cxx (revision 1ecadb572e7010ff3b3382ad9bf179dbc6efadbb)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_l10ntools.hxx"
30 #include <stdio.h>
31 #include <tools/fsys.hxx>
32 #include <tools/stream.hxx>
33 
34 // local includes
35 #include "utf8conv.hxx"
36 
37 #define GSI_FILE_UNKNOWN        0x0000
38 #define GSI_FILE_OLDSTYLE       0x0001
39 #define GSI_FILE_L10NFRAMEWORK  0x0002
40 
41 /*****************************************************************************/
42 sal_uInt16 GetGSIFileType( SvStream &rStream )
43 /*****************************************************************************/
44 {
45     sal_uInt16 nFileType = GSI_FILE_UNKNOWN;
46 
47     sal_uLong nPos( rStream.Tell());
48     rStream.Seek( STREAM_SEEK_TO_BEGIN );
49 
50     ByteString sLine;
51     while( !rStream.IsEof() && !sLine.Len())
52         rStream.ReadLine( sLine );
53 
54     if( sLine.Len()) {
55         if( sLine.Search( "($$)" ) != STRING_NOTFOUND )
56             nFileType = GSI_FILE_OLDSTYLE;
57         else
58             nFileType = GSI_FILE_L10NFRAMEWORK;
59     }
60 
61     rStream.Seek( nPos );
62 
63     return nFileType;
64 }
65 
66 /*****************************************************************************/
67 ByteString GetGSILineId( const ByteString &rLine, sal_uInt16 nFileType )
68 /*****************************************************************************/
69 {
70     ByteString sId;
71     switch ( nFileType ) {
72         case GSI_FILE_OLDSTYLE:
73             sId = rLine;
74             sId.SearchAndReplaceAll( "($$)", "\t" );
75             sId = sId.GetToken( 0, '\t' );
76         break;
77 
78         case GSI_FILE_L10NFRAMEWORK:
79             sId = rLine.GetToken( 0, '\t' );
80             sId += "\t";
81             sId += rLine.GetToken( 1, '\t' );
82             sId += "\t";
83             sId += rLine.GetToken( 4, '\t' );
84             sId += "\t";
85             sId += rLine.GetToken( 5, '\t' );
86         break;
87     }
88     return sId;
89 }
90 
91 /*****************************************************************************/
92 ByteString GetGSILineLangId( const ByteString &rLine, sal_uInt16 nFileType )
93 /*****************************************************************************/
94 {
95     ByteString sLangId;
96     switch ( nFileType ) {
97         case GSI_FILE_OLDSTYLE:
98             sLangId = rLine;
99             sLangId.SearchAndReplaceAll( "($$)", "\t" );
100             sLangId = sLangId.GetToken( 2, '\t' );
101         break;
102 
103         case GSI_FILE_L10NFRAMEWORK:
104             sLangId = rLine.GetToken( 9, '\t' );
105         break;
106     }
107     return sLangId;
108 }
109 
110 /*****************************************************************************/
111 void ConvertGSILine( sal_Bool bToUTF8, ByteString &rLine,
112         rtl_TextEncoding nEncoding, sal_uInt16 nFileType )
113 /*****************************************************************************/
114 {
115     switch ( nFileType ) {
116         case GSI_FILE_OLDSTYLE:
117             if ( bToUTF8 )
118                 rLine = UTF8Converter::ConvertToUTF8( rLine, nEncoding );
119             else
120                 rLine = UTF8Converter::ConvertFromUTF8( rLine, nEncoding );
121         break;
122 
123         case GSI_FILE_L10NFRAMEWORK: {
124             ByteString sConverted;
125             for ( sal_uInt16 i = 0; i < rLine.GetTokenCount( '\t' ); i++ ) {
126                 ByteString sToken = rLine.GetToken( i, '\t' );
127                 if (( i > 9 ) && ( i < 14 )) {
128                     if( bToUTF8 )
129                         sToken = UTF8Converter::ConvertToUTF8( sToken, nEncoding );
130                     else
131                         sToken = UTF8Converter::ConvertFromUTF8( sToken, nEncoding );
132                 }
133                 if ( i )
134                     sConverted += "\t";
135                 sConverted += sToken;
136             }
137             rLine = sConverted;
138         }
139         break;
140     }
141 }
142 
// Prints the usage text to stdout.
//
// The charset list names every charset that main() accepts, including
// MS_1257 and UTF8.
/*****************************************************************************/
void Help()
/*****************************************************************************/
{
    // One entry per output chunk; none of them contains a '%', so they can
    // be written verbatim with fputs().
    static const char * const aHelpText[] = {
        "\n",
        "gsiconv (c)1999 by StarOffice Entwicklungs GmbH\n",
        "===============================================\n",
        "\n",
        "gsiconv converts strings in GSI-Files (Gutschmitt Interface) from or to UTF-8\n",
        "\n",
        "Syntax: gsiconv (-t|-f langid charset)|(-p n) filename\n",
        "Switches: -t   => conversion from charset to UTF-8\n",
        "          -f   => conversion from UTF-8 to charset\n",
        "          -p n => creates several files with ca. n lines\n",
        "\n",
        "Allowed charsets:\n",
        "          MS_932  => Japanese\n",
        "          MS_936  => Chinese Simplified\n",
        "          MS_949  => Korean\n",
        "          MS_950  => Chinese Traditional\n",
        "          MS_1250 => East Europe\n",
        "          MS_1251 => Cyrillic\n",
        "          MS_1252 => West Europe\n",
        "          MS_1253 => Greek\n",
        "          MS_1254 => Turkish\n",
        "          MS_1255 => Hebrew\n",
        "          MS_1256 => Arabic\n",
        "          MS_1257 => Baltic\n",
        "          UTF8    => Unicode (UTF-8)\n",
        "\n",
        "Allowed langids:\n",
        "          1  => ENGLISH_US\n",
        "          3  => PORTUGUESE \n",
        "          4  => GERMAN_DE (new german style)\n",
        "          7  => RUSSIAN\n",
        "          30 => GREEK\n",
        "          31 => DUTCH\n",
        "          33 => FRENCH\n",
        "          34 => SPANISH\n",
        "          35 => FINNISH\n",
        "          36 => HUNGARIAN\n",
        "          39 => ITALIAN\n",
        "          42 => CZECH\n",
        "          44 => ENGLISH (UK)\n",
        "          45 => DANISH\n",
        "          46 => SWEDISH\n",
        "          47 => NORWEGIAN\n",
        "          49 => GERMAN (old german style)\n",
        "          55 => PORTUGUESE_BRAZILIAN\n",
        "          81 => JAPANESE\n",
        "          82 => KOREAN\n",
        "          86 => CHINESE_SIMPLIFIED\n",
        "          88 => CHINESE_TRADITIONAL\n",
        "          90 => TURKISH\n",
        "          96 => ARABIC\n",
        "          97 => HEBREW\n",
        "\n"
    };
    const unsigned nChunks = sizeof( aHelpText ) / sizeof( aHelpText[ 0 ] );

    for ( unsigned i = 0; i < nChunks; ++i )
        fputs( aHelpText[ i ], stdout );
}
199 
200 /*****************************************************************************/
201 #if defined(UNX) || defined(OS2)
202 int main( int argc, char *argv[] )
203 #else
204 int _cdecl main( int argc, char *argv[] )
205 #endif
206 /*****************************************************************************/
207 {
208     if (( argc != 5 ) && ( argc != 4 )) {
209         Help();
210         exit ( 0 );
211     }
212 
213     if ( argc == 4 ) {
214         if ( ByteString( argv[ 1 ] ) == "-p" ) {
215 
216             DirEntry aSource = DirEntry( String( argv[ 3 ], RTL_TEXTENCODING_ASCII_US ));
217             if ( !aSource.Exists()) {
218                 fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
219                 exit ( 2 );
220             }
221 
222             DirEntry aOutput( aSource );
223 
224             String sBase = aOutput.GetBase();
225             String sExt = aOutput.GetExtension();
226 
227             String sGSI( argv[ 3 ], RTL_TEXTENCODING_ASCII_US );
228             SvFileStream aGSI( sGSI, STREAM_STD_READ  );
229             if ( !aGSI.IsOpen()) {
230                 fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
231                 exit ( 3 );
232             }
233 
234             sal_uInt16 nFileType( GetGSIFileType( aGSI ));
235 
236             sal_uLong nMaxLines = (sal_uLong) ByteString( argv[ 2 ] ).ToInt64();
237             if ( !nMaxLines ) {
238                 fprintf( stderr, "\nERROR: Linecount must be at least 1!\n\n" );
239                 exit ( 3 );
240             }
241 
242             ByteString sGSILine;
243             ByteString sOldId;
244             sal_uLong nLine = 0;
245             sal_uLong nOutputFile = 1;
246 
247             String sOutput( sBase );
248             sOutput += String( "_", RTL_TEXTENCODING_ASCII_US );
249             sOutput += String::CreateFromInt64( nOutputFile );
250             if ( sExt.Len()) {
251                 sOutput += String( ".", RTL_TEXTENCODING_ASCII_US );
252                 sOutput += sExt;
253             }
254             nOutputFile ++;
255 
256             aOutput.SetName( sOutput );
257             SvFileStream aOutputStream( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
258 
259             while ( !aGSI.IsEof()) {
260 
261                 aGSI.ReadLine( sGSILine );
262                 ByteString sId( GetGSILineId( sGSILine, nFileType ));
263 
264                 nLine++;
265 
266                 if (( nLine >= nMaxLines ) && ( sId != sOldId )) {
267                     aOutputStream.Close();
268 
269                     ByteString sText( aOutput.GetFull(), gsl_getSystemTextEncoding());
270                     sText += " with ";
271                     sText += ByteString::CreateFromInt64( nLine );
272                     sText += " lines written.";
273 
274                     fprintf( stdout, "%s\n", sText.GetBuffer());
275                     String sOutput1( sBase );
276                     sOutput1 += String( "_", RTL_TEXTENCODING_ASCII_US );
277                     sOutput1 += String::CreateFromInt64( nOutputFile );
278                     if ( sExt.Len()) {
279                         sOutput1 += String( ".", RTL_TEXTENCODING_ASCII_US );
280                         sOutput1 += sExt;
281                     }
282                     nOutputFile ++;
283 
284                     aOutput.SetName( sOutput1 );
285 
286                     aOutputStream.Open( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
287                     nLine = 0;
288                 }
289 
290                 aOutputStream.WriteLine( sGSILine );
291 
292                 sOldId = sId;
293             }
294 
295             aGSI.Close();
296             aOutputStream.Close();
297 
298             ByteString sText( aOutput.GetFull(), RTL_TEXTENCODING_ASCII_US );
299             sText += " with ";
300             sText += ByteString::CreateFromInt64( nLine );
301             sText += " lines written.";
302         }
303         else {
304             Help();
305             exit( 1 );
306         }
307     }
308     else {
309         if ( ByteString( argv[ 1 ] ) == "-t" || ByteString( argv[ 1 ] ) == "-f" ) {
310             rtl_TextEncoding nEncoding;
311 
312             ByteString sCurLangId( argv[ 2 ] );
313 
314             ByteString sCharset( argv[ 3 ] );
315             sCharset.ToUpperAscii();
316 
317             if      ( sCharset == "MS_932" )    nEncoding = RTL_TEXTENCODING_MS_932;
318             else if ( sCharset == "MS_936" )    nEncoding = RTL_TEXTENCODING_MS_936;
319             else if ( sCharset == "MS_949" )    nEncoding = RTL_TEXTENCODING_MS_949;
320             else if ( sCharset == "MS_950" )    nEncoding = RTL_TEXTENCODING_MS_950;
321             else if ( sCharset == "MS_1250" )   nEncoding = RTL_TEXTENCODING_MS_1250;
322             else if ( sCharset == "MS_1251" )   nEncoding = RTL_TEXTENCODING_MS_1251;
323             else if ( sCharset == "MS_1252" )   nEncoding = RTL_TEXTENCODING_MS_1252;
324             else if ( sCharset == "MS_1253" )   nEncoding = RTL_TEXTENCODING_MS_1253;
325             else if ( sCharset == "MS_1254" )   nEncoding = RTL_TEXTENCODING_MS_1254;
326             else if ( sCharset == "MS_1255" )   nEncoding = RTL_TEXTENCODING_MS_1255;
327             else if ( sCharset == "MS_1256" )   nEncoding = RTL_TEXTENCODING_MS_1256;
328             else if ( sCharset == "MS_1257" )   nEncoding = RTL_TEXTENCODING_MS_1257;
329             else if ( sCharset == "UTF8" )      nEncoding = RTL_TEXTENCODING_UTF8;
330 
331             else {
332                 Help();
333                 exit ( 1 );
334             }
335 
336             DirEntry aSource = DirEntry( String( argv[ 4 ], RTL_TEXTENCODING_ASCII_US ));
337             if ( !aSource.Exists()) {
338                 fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
339                 exit ( 2 );
340             }
341 
342             String sGSI( argv[ 4 ], RTL_TEXTENCODING_ASCII_US );
343             SvFileStream aGSI( sGSI, STREAM_STD_READ );
344             if ( !aGSI.IsOpen()) {
345                 fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
346                 exit ( 3 );
347             }
348             sal_uInt16 nFileType( GetGSIFileType( aGSI ));
349 
350             ByteString sGSILine;
351             while ( !aGSI.IsEof()) {
352 
353                 aGSI.ReadLine( sGSILine );
354                 ByteString sLangId( GetGSILineLangId( sGSILine, nFileType ));
355                 if ( sLangId == sCurLangId )
356                     ConvertGSILine(( ByteString( argv[ 1 ] ) == "-t" ), sGSILine, nEncoding, nFileType );
357 
358                 fprintf( stdout, "%s\n", sGSILine.GetBuffer());
359             }
360 
361             aGSI.Close();
362         }
363         else {
364             Help();
365             exit( 1 );
366         }
367     }
368     return 0;
369 }
370