xref: /trunk/main/l10ntools/source/gsiconv.cxx (revision 3cd96b95)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_l10ntools.hxx"
26 #include <stdio.h>
27 #include <tools/fsys.hxx>
28 #include <tools/stream.hxx>
29 
30 // local includes
31 #include "utf8conv.hxx"
32 
// File-format tags returned by GetGSIFileType() and consumed by the
// GetGSILine*() / ConvertGSILine() helpers in this file.
#define GSI_FILE_UNKNOWN        0x0000  // no non-empty line could be read
#define GSI_FILE_OLDSTYLE       0x0001  // first non-empty line contains "($$)"
#define GSI_FILE_L10NFRAMEWORK  0x0002  // any other non-empty first line
36 
37 /*****************************************************************************/
GetGSIFileType(SvStream & rStream)38 sal_uInt16 GetGSIFileType( SvStream &rStream )
39 /*****************************************************************************/
40 {
41 	sal_uInt16 nFileType = GSI_FILE_UNKNOWN;
42 
43 	sal_uLong nPos( rStream.Tell());
44 	rStream.Seek( STREAM_SEEK_TO_BEGIN );
45 
46 	ByteString sLine;
47 	while( !rStream.IsEof() && !sLine.Len())
48 		rStream.ReadLine( sLine );
49 
50 	if( sLine.Len()) {
51 		if( sLine.Search( "($$)" ) != STRING_NOTFOUND )
52 			nFileType = GSI_FILE_OLDSTYLE;
53 		else
54 			nFileType = GSI_FILE_L10NFRAMEWORK;
55 	}
56 
57 	rStream.Seek( nPos );
58 
59 	return nFileType;
60 }
61 
62 /*****************************************************************************/
GetGSILineId(const ByteString & rLine,sal_uInt16 nFileType)63 ByteString GetGSILineId( const ByteString &rLine, sal_uInt16 nFileType )
64 /*****************************************************************************/
65 {
66 	ByteString sId;
67 	switch ( nFileType ) {
68 		case GSI_FILE_OLDSTYLE:
69 			sId = rLine;
70 			sId.SearchAndReplaceAll( "($$)", "\t" );
71 			sId = sId.GetToken( 0, '\t' );
72   		break;
73 
74 		case GSI_FILE_L10NFRAMEWORK:
75 			sId = rLine.GetToken( 0, '\t' );
76 			sId += "\t";
77 			sId += rLine.GetToken( 1, '\t' );
78 			sId += "\t";
79 			sId += rLine.GetToken( 4, '\t' );
80 			sId += "\t";
81 			sId += rLine.GetToken( 5, '\t' );
82   		break;
83 	}
84 	return sId;
85 }
86 
87 /*****************************************************************************/
GetGSILineLangId(const ByteString & rLine,sal_uInt16 nFileType)88 ByteString GetGSILineLangId( const ByteString &rLine, sal_uInt16 nFileType )
89 /*****************************************************************************/
90 {
91 	ByteString sLangId;
92 	switch ( nFileType ) {
93 		case GSI_FILE_OLDSTYLE:
94 			sLangId = rLine;
95 			sLangId.SearchAndReplaceAll( "($$)", "\t" );
96 			sLangId = sLangId.GetToken( 2, '\t' );
97   		break;
98 
99 		case GSI_FILE_L10NFRAMEWORK:
100 			sLangId = rLine.GetToken( 9, '\t' );
101   		break;
102 	}
103 	return sLangId;
104 }
105 
106 /*****************************************************************************/
ConvertGSILine(sal_Bool bToUTF8,ByteString & rLine,rtl_TextEncoding nEncoding,sal_uInt16 nFileType)107 void ConvertGSILine( sal_Bool bToUTF8, ByteString &rLine,
108 		rtl_TextEncoding nEncoding,	sal_uInt16 nFileType )
109 /*****************************************************************************/
110 {
111 	switch ( nFileType ) {
112 		case GSI_FILE_OLDSTYLE:
113 			if ( bToUTF8 )
114 				rLine = UTF8Converter::ConvertToUTF8( rLine, nEncoding );
115 			else
116 				rLine = UTF8Converter::ConvertFromUTF8( rLine, nEncoding );
117 		break;
118 
119 		case GSI_FILE_L10NFRAMEWORK: {
120 			ByteString sConverted;
121 			for ( sal_uInt16 i = 0; i < rLine.GetTokenCount( '\t' ); i++ ) {
122 				ByteString sToken = rLine.GetToken( i, '\t' );
123 				if (( i > 9 ) && ( i < 14 )) {
124 					if( bToUTF8 )
125 						sToken = UTF8Converter::ConvertToUTF8( sToken, nEncoding );
126 					else
127 						sToken = UTF8Converter::ConvertFromUTF8( sToken, nEncoding );
128 				}
129 				if ( i )
130 					sConverted += "\t";
131 				sConverted += sToken;
132 			}
133 			rLine = sConverted;
134 		}
135 		break;
136 	}
137 }
138 
/*****************************************************************************/
/** Prints the usage text (syntax, switches, accepted charsets and language
    ids) to stdout.

    The charset list includes MS_1257 and UTF8, which main() in this file
    accepts alongside the other names.
*/
void Help()
/*****************************************************************************/
{
	// none of the lines contains a '%', so they are written verbatim
	static const char * const aHelpLines[] = {
		"\n",
		"gsiconv (c)1999 by StarOffice Entwicklungs GmbH\n",
		"===============================================\n",
		"\n",
		"gsiconv converts strings in GSI-Files (Gutschmitt Interface) from or to UTF-8\n",
		"\n",
		"Syntax: gsiconv (-t|-f langid charset)|(-p n) filename\n",
		"Switches: -t   => conversion from charset to UTF-8\n",
		"          -f   => conversion from UTF-8 to charset\n",
		"          -p n => creates several files with ca. n lines\n",
		"\n",
		"Allowed charsets:\n",
		"          MS_932  => Japanese\n",
		"          MS_936  => Chinese Simplified\n",
		"          MS_949  => Korean\n",
		"          MS_950  => Chinese Traditional\n",
		"          MS_1250 => East Europe\n",
		"          MS_1251 => Cyrillic\n",
		"          MS_1252 => West Europe\n",
		"          MS_1253 => Greek\n",
		"          MS_1254 => Turkish\n",
		"          MS_1255 => Hebrew\n",
		"          MS_1256 => Arabic\n",
		"          MS_1257 => Baltic\n",
		"          UTF8    => UTF-8\n",
		"\n",
		"Allowed langids:\n",
		"          1  => ENGLISH_US\n",
		"          3  => PORTUGUESE \n",
		"          4  => GERMAN_DE (new german style)\n",
		"          7  => RUSSIAN\n",
		"          30 => GREEK\n",
		"          31 => DUTCH\n",
		"          33 => FRENCH\n",
		"          34 => SPANISH\n",
		"          35 => FINNISH\n",
		"          36 => HUNGARIAN\n",
		"          39 => ITALIAN\n",
		"          42 => CZECH\n",
		"          44 => ENGLISH (UK)\n",
		"          45 => DANISH\n",
		"          46 => SWEDISH\n",
		"          47 => NORWEGIAN\n",
		"          49 => GERMAN (old german style)\n",
		"          55 => PORTUGUESE_BRAZILIAN\n",
		"          81 => JAPANESE\n",
		"          82 => KOREAN\n",
		"          86 => CHINESE_SIMPLIFIED\n",
		"          88 => CHINESE_TRADITIONAL\n",
		"          90 => TURKISH\n",
		"          96 => ARABIC\n",
		"          97 => HEBREW\n",
		"\n"
	};
	const unsigned int nHelpLines =
		sizeof( aHelpLines ) / sizeof( aHelpLines[ 0 ] );

	for ( unsigned int i = 0; i < nHelpLines; ++i )
		fputs( aHelpLines[ i ], stdout );
}
195 
196 /*****************************************************************************/
197 #if defined(UNX) || defined(OS2)
main(int argc,char * argv[])198 int main( int argc, char *argv[] )
199 #else
200 int _cdecl main( int argc, char *argv[] )
201 #endif
202 /*****************************************************************************/
203 {
204 	if (( argc != 5 ) && ( argc != 4 )) {
205 		Help();
206 		exit ( 0 );
207 	}
208 
209 	if ( argc == 4 ) {
210 		if ( ByteString( argv[ 1 ] ) == "-p" ) {
211 
212 			DirEntry aSource = DirEntry( String( argv[ 3 ], RTL_TEXTENCODING_ASCII_US ));
213 			if ( !aSource.Exists()) {
214 				fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
215 				exit ( 2 );
216 			}
217 
218 			DirEntry aOutput( aSource );
219 
220 			String sBase = aOutput.GetBase();
221 			String sExt = aOutput.GetExtension();
222 
223 			String sGSI( argv[ 3 ], RTL_TEXTENCODING_ASCII_US );
224 			SvFileStream aGSI( sGSI, STREAM_STD_READ  );
225 			if ( !aGSI.IsOpen()) {
226 				fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
227 				exit ( 3 );
228 			}
229 
230 			sal_uInt16 nFileType( GetGSIFileType( aGSI ));
231 
232 			sal_uLong nMaxLines = (sal_uLong) ByteString( argv[ 2 ] ).ToInt64();
233 			if ( !nMaxLines ) {
234 				fprintf( stderr, "\nERROR: Linecount must be at least 1!\n\n" );
235 				exit ( 3 );
236 			}
237 
238 			ByteString sGSILine;
239 			ByteString sOldId;
240 			sal_uLong nLine = 0;
241 			sal_uLong nOutputFile = 1;
242 
243 			String sOutput( sBase );
244 			sOutput += String( "_", RTL_TEXTENCODING_ASCII_US );
245 			sOutput += String::CreateFromInt64( nOutputFile );
246 			if ( sExt.Len()) {
247 				sOutput += String( ".", RTL_TEXTENCODING_ASCII_US );
248 				sOutput += sExt;
249 			}
250 			nOutputFile ++;
251 
252 			aOutput.SetName( sOutput );
253 			SvFileStream aOutputStream( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
254 
255 			while ( !aGSI.IsEof()) {
256 
257 				aGSI.ReadLine( sGSILine );
258 				ByteString sId( GetGSILineId( sGSILine, nFileType ));
259 
260 				nLine++;
261 
262 				if (( nLine >= nMaxLines ) && ( sId != sOldId )) {
263 					aOutputStream.Close();
264 
265 					ByteString sText( aOutput.GetFull(), gsl_getSystemTextEncoding());
266 					sText += " with ";
267 					sText += ByteString::CreateFromInt64( nLine );
268 					sText += " lines written.";
269 
270 					fprintf( stdout, "%s\n", sText.GetBuffer());
271 					String sOutput1( sBase );
272 					sOutput1 += String( "_", RTL_TEXTENCODING_ASCII_US );
273 					sOutput1 += String::CreateFromInt64( nOutputFile );
274 					if ( sExt.Len()) {
275 						sOutput1 += String( ".", RTL_TEXTENCODING_ASCII_US );
276 						sOutput1 += sExt;
277 					}
278 					nOutputFile ++;
279 
280 					aOutput.SetName( sOutput1 );
281 
282 					aOutputStream.Open( aOutput.GetFull(), STREAM_STD_WRITE | STREAM_TRUNC );
283 					nLine = 0;
284 				}
285 
286 				aOutputStream.WriteLine( sGSILine );
287 
288 				sOldId = sId;
289 			}
290 
291 			aGSI.Close();
292 			aOutputStream.Close();
293 
294 			ByteString sText( aOutput.GetFull(), RTL_TEXTENCODING_ASCII_US );
295 			sText += " with ";
296 			sText += ByteString::CreateFromInt64( nLine );
297 			sText += " lines written.";
298 		}
299 		else {
300 			Help();
301 			exit( 1 );
302 		}
303 	}
304 	else {
305 		if ( ByteString( argv[ 1 ] ) == "-t" || ByteString( argv[ 1 ] ) == "-f" ) {
306 			rtl_TextEncoding nEncoding;
307 
308 			ByteString sCurLangId( argv[ 2 ] );
309 
310 			ByteString sCharset( argv[ 3 ] );
311 			sCharset.ToUpperAscii();
312 
313 			if 		( sCharset == "MS_932" ) 	nEncoding = RTL_TEXTENCODING_MS_932;
314 			else if ( sCharset == "MS_936" ) 	nEncoding = RTL_TEXTENCODING_MS_936;
315 			else if ( sCharset == "MS_949" ) 	nEncoding = RTL_TEXTENCODING_MS_949;
316 			else if ( sCharset == "MS_950" ) 	nEncoding = RTL_TEXTENCODING_MS_950;
317 			else if ( sCharset == "MS_1250" ) 	nEncoding = RTL_TEXTENCODING_MS_1250;
318 			else if ( sCharset == "MS_1251" ) 	nEncoding = RTL_TEXTENCODING_MS_1251;
319 			else if ( sCharset == "MS_1252" ) 	nEncoding = RTL_TEXTENCODING_MS_1252;
320 			else if ( sCharset == "MS_1253" ) 	nEncoding = RTL_TEXTENCODING_MS_1253;
321 			else if ( sCharset == "MS_1254" ) 	nEncoding = RTL_TEXTENCODING_MS_1254;
322 			else if ( sCharset == "MS_1255" ) 	nEncoding = RTL_TEXTENCODING_MS_1255;
323 			else if ( sCharset == "MS_1256" ) 	nEncoding = RTL_TEXTENCODING_MS_1256;
324 			else if ( sCharset == "MS_1257" ) 	nEncoding = RTL_TEXTENCODING_MS_1257;
325 			else if ( sCharset == "UTF8" )		nEncoding = RTL_TEXTENCODING_UTF8;
326 
327 			else {
328 				Help();
329 				exit ( 1 );
330 			}
331 
332 			DirEntry aSource = DirEntry( String( argv[ 4 ], RTL_TEXTENCODING_ASCII_US ));
333 			if ( !aSource.Exists()) {
334 				fprintf( stderr, "\nERROR: GSI-File %s not found!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
335 				exit ( 2 );
336 			}
337 
338 			String sGSI( argv[ 4 ], RTL_TEXTENCODING_ASCII_US );
339 			SvFileStream aGSI( sGSI, STREAM_STD_READ );
340 			if ( !aGSI.IsOpen()) {
341 				fprintf( stderr, "\nERROR: Could not open GSI-File %s!\n\n", ByteString( argv[ 3 ] ).GetBuffer());
342 				exit ( 3 );
343 			}
344 			sal_uInt16 nFileType( GetGSIFileType( aGSI ));
345 
346 			ByteString sGSILine;
347 			while ( !aGSI.IsEof()) {
348 
349 				aGSI.ReadLine( sGSILine );
350 				ByteString sLangId( GetGSILineLangId( sGSILine, nFileType ));
351 				if ( sLangId == sCurLangId )
352 					ConvertGSILine(( ByteString( argv[ 1 ] ) == "-t" ), sGSILine, nEncoding, nFileType );
353 
354 				fprintf( stdout, "%s\n", sGSILine.GetBuffer());
355 			}
356 
357 			aGSI.Close();
358 		}
359 		else {
360 			Help();
361 			exit( 1 );
362 		}
363 	}
364 	return 0;
365 }
366