xref: /trunk/main/sax/source/expatwrap/xml2utf.cxx (revision f9b72d11)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 #include <string.h>
24 
25 #include <sal/types.h>
26 
27 #include <rtl/textenc.h>
28 #include <rtl/tencinfo.h>
29 
30 
31 #include <com/sun/star/io/XInputStream.hpp>
32 
33 using namespace rtl;
34 using namespace ::com::sun::star::uno;
35 using namespace ::com::sun::star::io;
36 
37 #include "xml2utf.hxx"
38 
39 namespace sax_expatwrap {
40 
readAndConvert(Sequence<sal_Int8> & seq,sal_Int32 nMaxToRead)41 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
42 	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
43 {
44 
45 	Sequence<sal_Int8> seqIn;
46 
47 	if( ! m_in.is() ) {
48 		throw NotConnectedException();
49 	}
50 	if( ! m_bStarted ) {
51 		nMaxToRead = Max( 512 , nMaxToRead );  	// it should be possible to find the encoding attribute
52 						     					// within the first 512 bytes == 128 chars in UCS-4
53 	}
54 
55 	sal_Int32 nRead;
56 	Sequence< sal_Int8 > seqStart;
57 	while( sal_True )
58 	{
59 		nRead = m_in->readSomeBytes( seq , nMaxToRead );
60 
61 		if( nRead + seqStart.getLength())
62 		{
63 			// if nRead is 0, the file is already eof.
64 			if( ! m_bStarted && nRead )
65 			{
66 				// ensure that enough data is available to parse encoding
67 				if( seqStart.getLength() )
68 				{
69 				  // prefix with what we had so far.
70 				  sal_Int32 nLength = seq.getLength();
71 				  seq.realloc( seqStart.getLength() + nLength );
72 
73 				  memmove (seq.getArray() + seqStart.getLength(),
74 					   seq.getConstArray(),
75 					   nLength);
76 				  memcpy  (seq.getArray(),
77 					   seqStart.getConstArray(),
78 					   seqStart.getLength());
79 				}
80 
81 				// autodetection with the first bytes
82 				if( ! isEncodingRecognizable( seq ) )
83 				{
84 				  // remember what we have so far.
85 				  seqStart = seq;
86 
87 				  // read more !
88 				  continue;
89 				}
90 				if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
91 					// initialize decoding
92 					initializeDecoding();
93 				}
94 				nRead = seq.getLength();
95 				seqStart = Sequence < sal_Int8 > ();
96 			}
97 
98 			// do the encoding
99 			if( m_pText2Unicode && m_pUnicode2Text &&
100 				m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
101 
102 				Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
103 				seq = m_pUnicode2Text->convert(	seqUnicode.getConstArray(),	seqUnicode.getLength() );
104 			}
105 
106 			if( ! m_bStarted )
107 			{
108 				// it must now be ensured, that no encoding attribute exist anymore
109 				// ( otherwise the expat-Parser will crash )
110 				// This must be done after decoding !
111 				// ( e.g. Files decoded in ucs-4 cannot be read properly )
112 				m_bStarted = sal_True;
113 				removeEncoding( seq );
114 			}
115 			nRead = seq.getLength();
116 		}
117 
118 		break;
119 	}
120 	return nRead;
121 }
122 
123 
~XMLFile2UTFConverter()124 XMLFile2UTFConverter::~XMLFile2UTFConverter()
125 {
126 	if( m_pText2Unicode )
127 		delete m_pText2Unicode;
128 	if( m_pUnicode2Text )
129 		delete m_pUnicode2Text;
130 }
131 
132 
removeEncoding(Sequence<sal_Int8> & seq)133 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
134 {
135 	const sal_Int8 *pSource = seq.getArray();
136 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
137 	{
138 
139 		// scan for encoding
140 		OString str( (sal_Char * ) pSource , seq.getLength() );
141 
142 		// cut sequence to first line break
143 		// find first line break;
144 		int nMax = str.indexOf( 10 );
145 		if( nMax >= 0 )
146 		{
147 			str = str.copy( 0 , nMax );
148 		}
149 
150 		int nFound = str.indexOf( " encoding" );
151 		if( nFound >= 0 ) {
152 			int nStop;
153 			int nStart = str.indexOf( "\"" , nFound );
154 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
155 			{
156 				nStart = str.indexOf( "'" , nFound );
157 				nStop  = str.indexOf( "'" , nStart +1 );
158 			}
159 			else
160 			{
161 				nStop  = str.indexOf( "\"" , nStart +1);
162 			}
163 
164 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
165 			{
166 				// remove encoding tag from file
167 				memmove(        &( seq.getArray()[nFound] ) ,
168 								&( seq.getArray()[nStop+1]) ,
169 								seq.getLength() - nStop -1);
170 				seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
171 //				str = String( (char * ) seq.getArray() , seq.getLen() );
172 			}
173 		}
174 	}
175 }
176 
177 // Checks, if enough data has been accumulated to recognize the encoding
isEncodingRecognizable(const Sequence<sal_Int8> & seq)178 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
179 {
180 	const sal_Int8 *pSource = seq.getConstArray();
181 	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
182 
183 	if( seq.getLength() < 8 ) {
184 		// no recognition possible, when less than 8 bytes are available
185 		return sal_False;
186 	}
187 
188 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
189 		// scan if the <?xml tag finishes within this buffer
190 		bCheckIfFirstClosingBracketExsists = sal_True;
191 	}
192 	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
193 			 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
194 	{
195 		// check for utf-16
196 		bCheckIfFirstClosingBracketExsists = sal_True;
197 	}
198 	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
199 		     ( '?' == pSource[5] || '?' == pSource[7] ) )
200 	{
201 		// check for
202 		bCheckIfFirstClosingBracketExsists = sal_True;
203 	}
204 
205 	if( bCheckIfFirstClosingBracketExsists )
206 	{
207 		for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
208 		{
209 			// whole <?xml tag is valid
210 			if( '>' == pSource[ i ] )
211 			{
212 				return sal_True;
213 			}
214 		}
215 		return sal_False;
216 	}
217 
218 	// No <? tag in front, no need for a bigger buffer
219 	return sal_True;
220 }
221 
scanForEncoding(Sequence<sal_Int8> & seq)222 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
223 {
224 	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
225 	sal_Bool bReturn = sal_True;
226 
227 	if( seq.getLength() < 4 ) {
228 		// no recognition possible, when less than 4 bytes are available
229 		return sal_False;
230 	}
231 
232 	// first level : detect possible file formats
233 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
234 
235 		// scan for encoding
236 		OString str( (const sal_Char *) pSource , seq.getLength() );
237 
238 		// cut sequence to first line break
239 		//find first line break;
240 		int nMax = str.indexOf( 10 );
241 		if( nMax >= 0 )
242 		{
243 			str = str.copy( 0 , nMax );
244 		}
245 
246 		int nFound = str.indexOf( " encoding" );
247 		if( nFound < str.getLength() ) {
248 			int nStop;
249 			int nStart = str.indexOf( "\"" , nFound );
250 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
251 			{
252 				nStart = str.indexOf( "'" , nFound );
253 				nStop  = str.indexOf( "'" , nStart +1 );
254 			}
255 			else
256 			{
257 				nStop  = str.indexOf( "\"" , nStart +1);
258 			}
259 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
260 			{
261 				// encoding found finally
262 				m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
263 			}
264 		}
265 	}
266 	else if( 0xFE == pSource[0] &&
267 	         0xFF == pSource[1] ) {
268 		// UTF-16 big endian
269 		// conversion is done so that encoding information can be easily extracted
270 		m_sEncoding = "utf-16";
271 	}
272 	else if( 0xFF == pSource[0] &&
273 	         0xFE == pSource[1] ) {
274 		// UTF-16 little endian
275 		// conversion is done so that encoding information can be easily extracted
276 		m_sEncoding = "utf-16";
277 	}
278 	else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
279 		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
280 		// The byte order mark is simply added
281 
282 		// simply add the byte order mark !
283 		seq.realloc( seq.getLength() + 2 );
284 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
285 		((sal_uInt8*)seq.getArray())[0] = 0xFE;
286 		((sal_uInt8*)seq.getArray())[1] = 0xFF;
287 
288 		m_sEncoding = "utf-16";
289 	}
290 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
291 		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
292 		// The byte order mark is simply added
293 
294 		seq.realloc( seq.getLength() + 2 );
295 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
296 		((sal_uInt8*)seq.getArray())[0] = 0xFF;
297 		((sal_uInt8*)seq.getArray())[1] = 0xFE;
298 
299 		m_sEncoding = "utf-16";
300 	}
301     else if( 0xEF == pSource[0] &&
302              0xBB == pSource[1] &&
303              0xBF == pSource[2] )
304     {
305         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
306         // The BOM is removed.
307         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
308         seq.realloc( seq.getLength() - 3 );
309         m_sEncoding = "utf-8";
310     }
311 	else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
312 		// UCS-4 big endian
313 		m_sEncoding = "ucs-4";
314 	}
315 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
316 		// UCS-4 little endian
317 		m_sEncoding = "ucs-4";
318 	}
319 	else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
320 	         0xa7 == static_cast<unsigned char> (pSource[2]) &&
321 	         0x94 == static_cast<unsigned char> (pSource[3]) ) {
322 		// EBCDIC
323 		bReturn = sal_False;   // must be extended
324 	}
325 	else {
326 		// other
327 		// UTF8 is directly recognized by the parser.
328 		bReturn = sal_False;
329 	}
330 
331 	return bReturn;
332 }
333 
initializeDecoding()334 void XMLFile2UTFConverter::initializeDecoding()
335 {
336 
337 	if( m_sEncoding.getLength() )
338 	{
339 		rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
340 		if( encoding != RTL_TEXTENCODING_UTF8 )
341 		{
342 			m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
343 			m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
344 		}
345 	}
346 }
347 
348 
349 //----------------------------------------------
350 //
351 // Text2UnicodeConverter
352 //
353 //----------------------------------------------
Text2UnicodeConverter(const OString & sEncoding)354 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
355 {
356 	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357 	if( RTL_TEXTENCODING_DONTKNOW == encoding )
358 	{
359 		m_bCanContinue = sal_False;
360 		m_bInitialized = sal_False;
361 	}
362 	else
363 	{
364 		init( encoding );
365 	}
366 }
367 
~Text2UnicodeConverter()368 Text2UnicodeConverter::~Text2UnicodeConverter()
369 {
370 	if( m_bInitialized )
371 	{
372 		rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373 		rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
374 	}
375 }
376 
init(rtl_TextEncoding encoding)377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
378 {
379 	m_bCanContinue = sal_True;
380 	m_bInitialized = sal_True;
381 
382 	m_convText2Unicode 	= rtl_createTextToUnicodeConverter(encoding);
383 	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384 	m_rtlEncoding = encoding;
385 }
386 
387 
convert(const Sequence<sal_Int8> & seqText)388 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
389 {
390 	sal_uInt32 uiInfo;
391 	sal_Size nSrcCvtBytes 	= 0;
392 	sal_Size nTargetCount 	= 0;
393 	sal_Size nSourceCount   = 0;
394 
395 	// the whole source size
396 	sal_Int32 	nSourceSize = seqText.getLength() + m_seqSource.getLength();
397 	Sequence<sal_Unicode> 	seqUnicode ( nSourceSize );
398 
399 	const sal_Int8 *pbSource = seqText.getConstArray();
400 	sal_Int8 *pbTempMem = 0;
401 
402 	if( m_seqSource.getLength() ) {
403 		// put old rest and new byte sequence into one array
404 		pbTempMem = new sal_Int8[ nSourceSize ];
405 		memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
406 		memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
407 		pbSource = pbTempMem;
408 
409 		// set to zero again
410 		m_seqSource = Sequence< sal_Int8 >();
411 	}
412 
413 	while( sal_True ) {
414 
415 		/* All invalid characters are transformed to the unicode undefined char */
416 		nTargetCount += 	rtl_convertTextToUnicode(
417 									m_convText2Unicode,
418 									m_contextText2Unicode,
419 									( const sal_Char * ) &( pbSource[nSourceCount] ),
420 									nSourceSize - nSourceCount ,
421 									&( seqUnicode.getArray()[ nTargetCount ] ),
422 									seqUnicode.getLength() - nTargetCount,
423 									RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
424 									RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
425 									RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
426 									&uiInfo,
427 									&nSrcCvtBytes );
428 		nSourceCount += nSrcCvtBytes;
429 
430 		if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
431 			// save necessary bytes for next conversion
432 			seqUnicode.realloc( seqUnicode.getLength() * 2 );
433 			continue;
434 		}
435 		break;
436 	}
437 	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
438 		m_seqSource.realloc( nSourceSize - nSourceCount );
439 		memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
440 	}
441 
442 
443 	if( pbTempMem ) {
444 		delete [] pbTempMem;
445 	}
446 
447 	// set to correct unicode size
448 	seqUnicode.realloc( nTargetCount );
449 
450 	return seqUnicode;
451 }
452 
453 
454 
455 //----------------------------------------------
456 //
457 // Unicode2TextConverter
458 //
459 //----------------------------------------------
Unicode2TextConverter(rtl_TextEncoding encoding)460 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
461 {
462 	init( encoding );
463 }
464 
465 
~Unicode2TextConverter()466 Unicode2TextConverter::~Unicode2TextConverter()
467 {
468 	if( m_bInitialized ) {
469 		rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
470 		rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
471 	}
472 }
473 
474 
convert(const sal_Unicode * puSource,sal_Int32 nSourceSize)475 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
476 {
477 	sal_Unicode *puTempMem = 0;
478 
479 	if( m_seqSource.getLength() ) {
480 		// For surrogates !
481 		// put old rest and new byte sequence into one array
482 		// In general when surrogates are used, they should be rarely
483 		// cut off between two convert()-calls. So this code is used
484 		// rarely and the extra copy is acceptable.
485 		puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
486 		memcpy( puTempMem ,
487 				m_seqSource.getConstArray() ,
488 				m_seqSource.getLength() * sizeof( sal_Unicode ) );
489 		memcpy(
490 			&(puTempMem[ m_seqSource.getLength() ]) ,
491 			puSource ,
492 			nSourceSize*sizeof( sal_Unicode ) );
493 		puSource = puTempMem;
494 		nSourceSize += m_seqSource.getLength();
495 
496 		m_seqSource = Sequence< sal_Unicode > ();
497 	}
498 
499 
500 	sal_Size nTargetCount = 0;
501 	sal_Size nSourceCount = 0;
502 
503 	sal_uInt32 uiInfo;
504 	sal_Size nSrcCvtChars;
505 
506 	// take nSourceSize * 3 as preference
507 	// this is an upper boundary for converting to utf8,
508 	// which most often used as the target.
509 	sal_Int32 nSeqSize =  nSourceSize * 3;
510 
511 	Sequence<sal_Int8> 	seqText( nSeqSize );
512 	sal_Char *pTarget = (sal_Char *) seqText.getArray();
513 	while( sal_True ) {
514 
515 		nTargetCount += rtl_convertUnicodeToText(
516 									m_convUnicode2Text,
517 									m_contextUnicode2Text,
518 									&( puSource[nSourceCount] ),
519 									nSourceSize - nSourceCount ,
520 									&( pTarget[nTargetCount] ),
521 									nSeqSize - nTargetCount,
522 									RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
523 									RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
524 									&uiInfo,
525 									&nSrcCvtChars);
526 		nSourceCount += nSrcCvtChars;
527 
528 		if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
529 			nSeqSize = nSeqSize *2;
530 			seqText.realloc( nSeqSize );  // double array size
531 			pTarget = ( sal_Char * ) seqText.getArray();
532 			continue;
533 		}
534 		break;
535 	}
536 
537 	// for surrogates
538 	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
539 		m_seqSource.realloc( nSourceSize - nSourceCount );
540 		memcpy( m_seqSource.getArray() ,
541 				&(puSource[nSourceCount]),
542 				(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
543 	}
544 
545 	if( puTempMem ) {
546 		delete [] puTempMem;
547 	}
548 
549 	// reduce the size of the buffer (fast, no copy necessary)
550 	seqText.realloc( nTargetCount );
551 
552 	return seqText;
553 }
554 
init(rtl_TextEncoding encoding)555 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
556 {
557 	m_bCanContinue = sal_True;
558 	m_bInitialized = sal_True;
559 
560 	m_convUnicode2Text 	= rtl_createUnicodeToTextConverter( encoding );
561 	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
562 	m_rtlEncoding = encoding;
563 };
564 
565 
566 }
567