xref: /aoo42x/main/l10ntools/source/gsiconv.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_l10ntools.hxx"
30 #include <stdio.h>
31 #include <tools/fsys.hxx>
32 #include <tools/stream.hxx>
33 
34 // local includes
35 #include "utf8conv.hxx"
36 
37 #define GSI_FILE_UNKNOWN		0x0000
38 #define GSI_FILE_OLDSTYLE		0x0001
39 #define GSI_FILE_L10NFRAMEWORK	0x0002
40 
41 /*****************************************************************************/
42 sal_uInt16 GetGSIFileType( SvStream &rStream )
43 /*****************************************************************************/
44 {
45 	sal_uInt16 nFileType = GSI_FILE_UNKNOWN;
46 
47 	sal_uLong nPos( rStream.Tell());
48 	rStream.Seek( STREAM_SEEK_TO_BEGIN );
49 
50 	ByteString sLine;
51 	while( !rStream.IsEof() && !sLine.Len())
52 		rStream.ReadLine( sLine );
53 
54 	if( sLine.Len()) {
55 		if( sLine.Search( "($$)" ) != STRING_NOTFOUND )
56 			nFileType = GSI_FILE_OLDSTYLE;
57 		else
58 			nFileType = GSI_FILE_L10NFRAMEWORK;
59 	}
60 
61 	rStream.Seek( nPos );
62 
63 	return nFileType;
64 }
65 
66 /*****************************************************************************/
67 ByteString GetGSILineId( const ByteString &rLine, sal_uInt16 nFileType )
68 /*****************************************************************************/
69 {
70 	ByteString sId;
71 	switch ( nFileType ) {
72 		case GSI_FILE_OLDSTYLE:
73 			sId = rLine;
74 			sId.SearchAndReplaceAll( "($$)", "\t" );
75 			sId = sId.GetToken( 0, '\t' );
76   		break;
77 
78 		case GSI_FILE_L10NFRAMEWORK:
79 			sId = rLine.GetToken( 0, '\t' );
80 			sId += "\t";
81 			sId += rLine.GetToken( 1, '\t' );
82 			sId += "\t";
83 			sId += rLine.GetToken( 4, '\t' );
84 			sId += "\t";
85 			sId += rLine.GetToken( 5, '\t' );
86   		break;
87 	}
88 	return sId;
89 }
90 
91 /*****************************************************************************/
92 ByteString GetGSILineLangId( const ByteString &rLine, sal_uInt16 nFileType )
93 /*****************************************************************************/
94 {
95 	ByteString sLangId;
96 	switch ( nFileType ) {
97 		case GSI_FILE_OLDSTYLE:
98 			sLangId = rLine;
99 			sLangId.SearchAndReplaceAll( "($$)", "\t" );
100 			sLangId = sLangId.GetToken( 2, '\t' );
101   		break;
102 
103 		case GSI_FILE_L10NFRAMEWORK:
104 			sLangId = rLine.GetToken( 9, '\t' );
105   		break;
106 	}
107 	return sLangId;
108 }
109 
110 /*****************************************************************************/
111 void ConvertGSILine( sal_Bool bToUTF8, ByteString &rLine,
112 		rtl_TextEncoding nEncoding,	sal_uInt16 nFileType )
113 /*****************************************************************************/
114 {
115 	switch ( nFileType ) {
116 		case GSI_FILE_OLDSTYLE:
117 			if ( bToUTF8 )
118 				rLine = UTF8Converter::ConvertToUTF8( rLine, nEncoding );
119 			else
120 				rLine = UTF8Converter::ConvertFromUTF8( rLine, nEncoding );
121 		break;
122 
123 		case GSI_FILE_L10NFRAMEWORK: {
124 			ByteString sConverted;
125 			for ( sal_uInt16 i = 0; i < rLine.GetTokenCount( '\t' ); i++ ) {
126 				ByteString sToken = rLine.GetToken( i, '\t' );
127 				if (( i > 9 ) && ( i < 14 )) {
128 					if( bToUTF8 )
129 						sToken = UTF8Converter::ConvertToUTF8( sToken, nEncoding );
130 					else
131 						sToken = UTF8Converter::ConvertFromUTF8( sToken, nEncoding );
132 				}
133 				if ( i )
134 					sConverted += "\t";
135 				sConverted += sToken;
136 			}
137 			rLine = sConverted;
138 		}
139 		break;
140 	}
141 }
142 
/*****************************************************************************/
// Prints the command line usage of gsiconv to stdout.
//
// The charset list names every charset switch value that main() accepts,
// including MS_1257 and UTF8.
void Help()
/*****************************************************************************/
{
	// One entry per output line; entries are written verbatim, in order.
	static const char * const aUsage[] = {
		"\n",
		"gsiconv (c)1999 by StarOffice Entwicklungs GmbH\n",
		"===============================================\n",
		"\n",
		"gsiconv converts strings in GSI-Files (Gutschmitt Interface) from or to UTF-8\n",
		"\n",
		"Syntax: gsiconv (-t|-f langid charset)|(-p n) filename\n",
		"Switches: -t   => conversion from charset to UTF-8\n",
		"          -f   => conversion from UTF-8 to charset\n",
		"          -p n => creates several files with ca. n lines\n",
		"\n",
		"Allowed charsets:\n",
		"          MS_932  => Japanese\n",
		"          MS_936  => Chinese Simplified\n",
		"          MS_949  => Korean\n",
		"          MS_950  => Chinese Traditional\n",
		"          MS_1250 => East Europe\n",
		"          MS_1251 => Cyrillic\n",
		"          MS_1252 => West Europe\n",
		"          MS_1253 => Greek\n",
		"          MS_1254 => Turkish\n",
		"          MS_1255 => Hebrew\n",
		"          MS_1256 => Arabic\n",
		"          MS_1257 => Baltic\n",
		"          UTF8    => Unicode (UTF-8)\n",
		"\n",
		"Allowed langids:\n",
		"          1  => ENGLISH_US\n",
		"          3  => PORTUGUESE \n",
		"          4  => GERMAN_DE (new german style)\n",
		"          7  => RUSSIAN\n",
		"          30 => GREEK\n",
		"          31 => DUTCH\n",
		"          33 => FRENCH\n",
		"          34 => SPANISH\n",
		"          35 => FINNISH\n",
		"          36 => HUNGARIAN\n",
		"          39 => ITALIAN\n",
		"          42 => CZECH\n",
		"          44 => ENGLISH (UK)\n",
		"          45 => DANISH\n",
		"          46 => SWEDISH\n",
		"          47 => NORWEGIAN\n",
		"          49 => GERMAN (old german style)\n",
		"          55 => PORTUGUESE_BRAZILIAN\n",
		"          81 => JAPANESE\n",
		"          82 => KOREAN\n",
		"          86 => CHINESE_SIMPLIFIED\n",
		"          88 => CHINESE_TRADITIONAL\n",
		"          90 => TURKISH\n",
		"          96 => ARABIC\n",
		"          97 => HEBREW\n",
		"\n"
	};
	const unsigned int nUsageLines = sizeof( aUsage ) / sizeof( aUsage[ 0 ] );

	for ( unsigned int nLine = 0; nLine < nUsageLines; ++nLine )
		fputs( aUsage[ nLine ], stdout );
}
199 
200 /*****************************************************************************/
201 #if defined(UNX) || defined(OS2)
202 int main( int argc, char *argv[] )
203 #else
204 int _cdecl main( int argc, char *argv[] )
205 #endif
206 /*****************************************************************************/
207 {
208 	if (( argc != 5 ) && ( argc != 4 )) {
209 		Help();
210 		exit ( 0 );
211 	}
212 
213 	if ( argc == 4 ) {
214 		if ( ByteString( argv[ 1 ] ) == "-p" ) {
215 
216 			DirEntry aSource = DirEntry( String( argv[ 3 ], RTL_TEXTENCODING_ASCII_US ));
217 			if ( !aSource.Exists()) {
218 				fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
219 				exit ( 2 );
220 			}
221 
222 			DirEntry aOutput( aSource );
223 
224 			String sBase = aOutput.GetBase();
225 			String sExt = aOutput.GetExtension();
226 
227 			String sGSI( argv[ 3 ], RTL_TEXTENCODING_ASCII_US );
228 			SvFileStream aGSI( sGSI, STREAM_STD_READ  );
229 			if ( !aGSI.IsOpen()) {
230 				fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
231 				exit ( 3 );
232 			}
233 
234 			sal_uInt16 nFileType( GetGSIFileType( aGSI ));
235 
236 			sal_uLong nMaxLines = (sal_uLong) ByteString( argv[ 2 ] ).ToInt64();
237 			if ( !nMaxLines ) {
238 				fprintf( stderr, "\nERROR: Linecount must be at least 1!\n\n" );
239 				exit ( 3 );
240 			}
241 
242 			ByteString sGSILine;
243 			ByteString sOldId;
244 			sal_uLong nLine = 0;
245 			sal_uLong nOutputFile = 1;
246 
247 			String sOutput( sBase );
248 			sOutput += String( "_", RTL_TEXTENCODING_ASCII_US );
249 			sOutput += String::CreateFromInt64( nOutputFile );
250 			if ( sExt.Len()) {
251 				sOutput += String( ".", RTL_TEXTENCODING_ASCII_US );
252 				sOutput += sExt;
253 			}
254 			nOutputFile ++;
255 
256 			aOutput.SetName( sOutput );
257 			SvFileStream aOutputStream( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
258 
259 			while ( !aGSI.IsEof()) {
260 
261 				aGSI.ReadLine( sGSILine );
262 				ByteString sId( GetGSILineId( sGSILine, nFileType ));
263 
264 				nLine++;
265 
266 				if (( nLine >= nMaxLines ) && ( sId != sOldId )) {
267 					aOutputStream.Close();
268 
269 					ByteString sText( aOutput.GetFull(), gsl_getSystemTextEncoding());
270 					sText += " with ";
271 					sText += ByteString::CreateFromInt64( nLine );
272 					sText += " lines written.";
273 
274 					fprintf( stdout, "%s\n", sText.GetBuffer());
275 					String sOutput1( sBase );
276 					sOutput1 += String( "_", RTL_TEXTENCODING_ASCII_US );
277 					sOutput1 += String::CreateFromInt64( nOutputFile );
278 					if ( sExt.Len()) {
279 						sOutput1 += String( ".", RTL_TEXTENCODING_ASCII_US );
280 						sOutput1 += sExt;
281 					}
282 					nOutputFile ++;
283 
284 					aOutput.SetName( sOutput1 );
285 
286 					aOutputStream.Open( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
287 					nLine = 0;
288 				}
289 
290 				aOutputStream.WriteLine( sGSILine );
291 
292 				sOldId = sId;
293 			}
294 
295 			aGSI.Close();
296 			aOutputStream.Close();
297 
298 			ByteString sText( aOutput.GetFull(), RTL_TEXTENCODING_ASCII_US );
299 			sText += " with ";
300 			sText += ByteString::CreateFromInt64( nLine );
301 			sText += " lines written.";
302 		}
303 		else {
304 			Help();
305 			exit( 1 );
306 		}
307 	}
308 	else {
309 		if ( ByteString( argv[ 1 ] ) == "-t" || ByteString( argv[ 1 ] ) == "-f" ) {
310 			rtl_TextEncoding nEncoding;
311 
312 			ByteString sCurLangId( argv[ 2 ] );
313 
314 			ByteString sCharset( argv[ 3 ] );
315 			sCharset.ToUpperAscii();
316 
317 			if 		( sCharset == "MS_932" ) 	nEncoding = RTL_TEXTENCODING_MS_932;
318 			else if ( sCharset == "MS_936" ) 	nEncoding = RTL_TEXTENCODING_MS_936;
319 			else if ( sCharset == "MS_949" ) 	nEncoding = RTL_TEXTENCODING_MS_949;
320 			else if ( sCharset == "MS_950" ) 	nEncoding = RTL_TEXTENCODING_MS_950;
321 			else if ( sCharset == "MS_1250" ) 	nEncoding = RTL_TEXTENCODING_MS_1250;
322 			else if ( sCharset == "MS_1251" ) 	nEncoding = RTL_TEXTENCODING_MS_1251;
323 			else if ( sCharset == "MS_1252" ) 	nEncoding = RTL_TEXTENCODING_MS_1252;
324 			else if ( sCharset == "MS_1253" ) 	nEncoding = RTL_TEXTENCODING_MS_1253;
325 			else if ( sCharset == "MS_1254" ) 	nEncoding = RTL_TEXTENCODING_MS_1254;
326 			else if ( sCharset == "MS_1255" ) 	nEncoding = RTL_TEXTENCODING_MS_1255;
327 			else if ( sCharset == "MS_1256" ) 	nEncoding = RTL_TEXTENCODING_MS_1256;
328 			else if ( sCharset == "MS_1257" ) 	nEncoding = RTL_TEXTENCODING_MS_1257;
329 			else if ( sCharset == "UTF8" )		nEncoding = RTL_TEXTENCODING_UTF8;
330 
331 			else {
332 				Help();
333 				exit ( 1 );
334 			}
335 
336 			DirEntry aSource = DirEntry( String( argv[ 4 ], RTL_TEXTENCODING_ASCII_US ));
337 			if ( !aSource.Exists()) {
338 				fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
339 				exit ( 2 );
340 			}
341 
342 			String sGSI( argv[ 4 ], RTL_TEXTENCODING_ASCII_US );
343 			SvFileStream aGSI( sGSI, STREAM_STD_READ );
344 			if ( !aGSI.IsOpen()) {
345 				fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
346 				exit ( 3 );
347 			}
348 			sal_uInt16 nFileType( GetGSIFileType( aGSI ));
349 
350 			ByteString sGSILine;
351 			while ( !aGSI.IsEof()) {
352 
353 				aGSI.ReadLine( sGSILine );
354 				ByteString sLangId( GetGSILineLangId( sGSILine, nFileType ));
355 				if ( sLangId == sCurLangId )
356 					ConvertGSILine(( ByteString( argv[ 1 ] ) == "-t" ), sGSILine, nEncoding, nFileType );
357 
358 				fprintf( stdout, "%s\n", sGSILine.GetBuffer());
359 			}
360 
361 			aGSI.Close();
362 		}
363 		else {
364 			Help();
365 			exit( 1 );
366 		}
367 	}
368 	return 0;
369 }
370