1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23 #include <string.h>
24
25 #include <sal/types.h>
26
27 #include <rtl/textenc.h>
28 #include <rtl/tencinfo.h>
29
30
31 #include <com/sun/star/io/XInputStream.hpp>
32
33 using namespace rtl;
34 using namespace ::com::sun::star::uno;
35 using namespace ::com::sun::star::io;
36
37 #include "xml2utf.hxx"
38
39 namespace sax_expatwrap {
40
readAndConvert(Sequence<sal_Int8> & seq,sal_Int32 nMaxToRead)41 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
42 throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
43 {
44
45 Sequence<sal_Int8> seqIn;
46
47 if( ! m_in.is() ) {
48 throw NotConnectedException();
49 }
50 if( ! m_bStarted ) {
51 nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute
52 // within the first 512 bytes == 128 chars in UCS-4
53 }
54
55 sal_Int32 nRead;
56 Sequence< sal_Int8 > seqStart;
57 while( sal_True )
58 {
59 nRead = m_in->readSomeBytes( seq , nMaxToRead );
60
61 if( nRead + seqStart.getLength())
62 {
63 // if nRead is 0, the file is already eof.
64 if( ! m_bStarted && nRead )
65 {
66 // ensure that enough data is available to parse encoding
67 if( seqStart.getLength() )
68 {
69 // prefix with what we had so far.
70 sal_Int32 nLength = seq.getLength();
71 seq.realloc( seqStart.getLength() + nLength );
72
73 memmove (seq.getArray() + seqStart.getLength(),
74 seq.getConstArray(),
75 nLength);
76 memcpy (seq.getArray(),
77 seqStart.getConstArray(),
78 seqStart.getLength());
79 }
80
81 // autodetection with the first bytes
82 if( ! isEncodingRecognizable( seq ) )
83 {
84 // remember what we have so far.
85 seqStart = seq;
86
87 // read more !
88 continue;
89 }
90 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
91 // initialize decoding
92 initializeDecoding();
93 }
94 nRead = seq.getLength();
95 seqStart = Sequence < sal_Int8 > ();
96 }
97
98 // do the encoding
99 if( m_pText2Unicode && m_pUnicode2Text &&
100 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
101
102 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
103 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
104 }
105
106 if( ! m_bStarted )
107 {
108 // it must now be ensured, that no encoding attribute exist anymore
109 // ( otherwise the expat-Parser will crash )
110 // This must be done after decoding !
111 // ( e.g. Files decoded in ucs-4 cannot be read properly )
112 m_bStarted = sal_True;
113 removeEncoding( seq );
114 }
115 nRead = seq.getLength();
116 }
117
118 break;
119 }
120 return nRead;
121 }
122
123
~XMLFile2UTFConverter()124 XMLFile2UTFConverter::~XMLFile2UTFConverter()
125 {
126 if( m_pText2Unicode )
127 delete m_pText2Unicode;
128 if( m_pUnicode2Text )
129 delete m_pUnicode2Text;
130 }
131
132
removeEncoding(Sequence<sal_Int8> & seq)133 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
134 {
135 const sal_Int8 *pSource = seq.getArray();
136 if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
137 {
138
139 // scan for encoding
140 OString str( (sal_Char * ) pSource , seq.getLength() );
141
142 // cut sequence to first line break
143 // find first line break;
144 int nMax = str.indexOf( 10 );
145 if( nMax >= 0 )
146 {
147 str = str.copy( 0 , nMax );
148 }
149
150 int nFound = str.indexOf( " encoding" );
151 if( nFound >= 0 ) {
152 int nStop;
153 int nStart = str.indexOf( "\"" , nFound );
154 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
155 {
156 nStart = str.indexOf( "'" , nFound );
157 nStop = str.indexOf( "'" , nStart +1 );
158 }
159 else
160 {
161 nStop = str.indexOf( "\"" , nStart +1);
162 }
163
164 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
165 {
166 // remove encoding tag from file
167 memmove( &( seq.getArray()[nFound] ) ,
168 &( seq.getArray()[nStop+1]) ,
169 seq.getLength() - nStop -1);
170 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
171 // str = String( (char * ) seq.getArray() , seq.getLen() );
172 }
173 }
174 }
175 }
176
177 // Checks, if enough data has been accumulated to recognize the encoding
isEncodingRecognizable(const Sequence<sal_Int8> & seq)178 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
179 {
180 const sal_Int8 *pSource = seq.getConstArray();
181 sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
182
183 if( seq.getLength() < 8 ) {
184 // no recognition possible, when less than 8 bytes are available
185 return sal_False;
186 }
187
188 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
189 // scan if the <?xml tag finishes within this buffer
190 bCheckIfFirstClosingBracketExsists = sal_True;
191 }
192 else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
193 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
194 {
195 // check for utf-16
196 bCheckIfFirstClosingBracketExsists = sal_True;
197 }
198 else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
199 ( '?' == pSource[5] || '?' == pSource[7] ) )
200 {
201 // check for
202 bCheckIfFirstClosingBracketExsists = sal_True;
203 }
204
205 if( bCheckIfFirstClosingBracketExsists )
206 {
207 for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
208 {
209 // whole <?xml tag is valid
210 if( '>' == pSource[ i ] )
211 {
212 return sal_True;
213 }
214 }
215 return sal_False;
216 }
217
218 // No <? tag in front, no need for a bigger buffer
219 return sal_True;
220 }
221
scanForEncoding(Sequence<sal_Int8> & seq)222 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
223 {
224 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
225 sal_Bool bReturn = sal_True;
226
227 if( seq.getLength() < 4 ) {
228 // no recognition possible, when less than 4 bytes are available
229 return sal_False;
230 }
231
232 // first level : detect possible file formats
233 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
234
235 // scan for encoding
236 OString str( (const sal_Char *) pSource , seq.getLength() );
237
238 // cut sequence to first line break
239 //find first line break;
240 int nMax = str.indexOf( 10 );
241 if( nMax >= 0 )
242 {
243 str = str.copy( 0 , nMax );
244 }
245
246 int nFound = str.indexOf( " encoding" );
247 if( nFound < str.getLength() ) {
248 int nStop;
249 int nStart = str.indexOf( "\"" , nFound );
250 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
251 {
252 nStart = str.indexOf( "'" , nFound );
253 nStop = str.indexOf( "'" , nStart +1 );
254 }
255 else
256 {
257 nStop = str.indexOf( "\"" , nStart +1);
258 }
259 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
260 {
261 // encoding found finally
262 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
263 }
264 }
265 }
266 else if( 0xFE == pSource[0] &&
267 0xFF == pSource[1] ) {
268 // UTF-16 big endian
269 // conversion is done so that encoding information can be easily extracted
270 m_sEncoding = "utf-16";
271 }
272 else if( 0xFF == pSource[0] &&
273 0xFE == pSource[1] ) {
274 // UTF-16 little endian
275 // conversion is done so that encoding information can be easily extracted
276 m_sEncoding = "utf-16";
277 }
278 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
279 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
280 // The byte order mark is simply added
281
282 // simply add the byte order mark !
283 seq.realloc( seq.getLength() + 2 );
284 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
285 ((sal_uInt8*)seq.getArray())[0] = 0xFE;
286 ((sal_uInt8*)seq.getArray())[1] = 0xFF;
287
288 m_sEncoding = "utf-16";
289 }
290 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
291 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
292 // The byte order mark is simply added
293
294 seq.realloc( seq.getLength() + 2 );
295 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
296 ((sal_uInt8*)seq.getArray())[0] = 0xFF;
297 ((sal_uInt8*)seq.getArray())[1] = 0xFE;
298
299 m_sEncoding = "utf-16";
300 }
301 else if( 0xEF == pSource[0] &&
302 0xBB == pSource[1] &&
303 0xBF == pSource[2] )
304 {
305 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
306 // The BOM is removed.
307 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
308 seq.realloc( seq.getLength() - 3 );
309 m_sEncoding = "utf-8";
310 }
311 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
312 // UCS-4 big endian
313 m_sEncoding = "ucs-4";
314 }
315 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
316 // UCS-4 little endian
317 m_sEncoding = "ucs-4";
318 }
319 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
320 0xa7 == static_cast<unsigned char> (pSource[2]) &&
321 0x94 == static_cast<unsigned char> (pSource[3]) ) {
322 // EBCDIC
323 bReturn = sal_False; // must be extended
324 }
325 else {
326 // other
327 // UTF8 is directly recognized by the parser.
328 bReturn = sal_False;
329 }
330
331 return bReturn;
332 }
333
initializeDecoding()334 void XMLFile2UTFConverter::initializeDecoding()
335 {
336
337 if( m_sEncoding.getLength() )
338 {
339 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
340 if( encoding != RTL_TEXTENCODING_UTF8 )
341 {
342 m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
343 m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
344 }
345 }
346 }
347
348
349 //----------------------------------------------
350 //
351 // Text2UnicodeConverter
352 //
353 //----------------------------------------------
Text2UnicodeConverter(const OString & sEncoding)354 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
355 {
356 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357 if( RTL_TEXTENCODING_DONTKNOW == encoding )
358 {
359 m_bCanContinue = sal_False;
360 m_bInitialized = sal_False;
361 }
362 else
363 {
364 init( encoding );
365 }
366 }
367
~Text2UnicodeConverter()368 Text2UnicodeConverter::~Text2UnicodeConverter()
369 {
370 if( m_bInitialized )
371 {
372 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373 rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
374 }
375 }
376
init(rtl_TextEncoding encoding)377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
378 {
379 m_bCanContinue = sal_True;
380 m_bInitialized = sal_True;
381
382 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
383 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384 m_rtlEncoding = encoding;
385 }
386
387
// Converts the bytes in seqText to unicode.
//
// Bytes left over from the previous call (kept in m_seqSource) are put in
// front of seqText. Bytes that cannot be converted yet because the source
// ended too early are kept in m_seqSource for the next call.
// Returns the converted characters, sized to what was actually produced.
Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
{
    // the whole source size: old rest plus new bytes
    const sal_Int32 nRestSize = m_seqSource.getLength();
    const sal_Int32 nSourceSize = seqText.getLength() + nRestSize;
    Sequence<sal_Unicode> seqUnicode ( nSourceSize );

    const sal_Int8 *pInput = seqText.getConstArray();
    sal_Int8 *pJoined = 0;

    if( nRestSize ) {
        // put old rest and new byte sequence into one array
        pJoined = new sal_Int8[ nSourceSize ];
        memcpy( pJoined , m_seqSource.getConstArray() , nRestSize );
        memcpy( pJoined + nRestSize , seqText.getConstArray() , seqText.getLength() );
        pInput = pJoined;

        // the rest is consumed now
        m_seqSource = Sequence< sal_Int8 >();
    }

    sal_uInt32 uiInfo;
    sal_Size nConsumed = 0;   // source bytes converted so far
    sal_Size nProduced = 0;   // unicode characters written so far
    sal_Size nStepBytes = 0;  // source bytes converted by the last call
    sal_Bool bTargetTooSmall;
    do {
        /* All invalid characters are transformed to the unicode undefined char */
        const sal_Size nWritten = rtl_convertTextToUnicode(
            m_convText2Unicode,
            m_contextText2Unicode,
            ( const sal_Char * ) ( pInput + nConsumed ),
            nSourceSize - nConsumed ,
            seqUnicode.getArray() + nProduced,
            seqUnicode.getLength() - nProduced,
            RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
            RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
            RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
            &uiInfo,
            &nStepBytes );
        nProduced += nWritten;
        nConsumed += nStepBytes;

        bTargetTooSmall = ( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) != 0;
        if( bTargetTooSmall ) {
            // double the target buffer and carry on where the conversion stopped
            seqUnicode.realloc( seqUnicode.getLength() * 2 );
        }
    } while( bTargetTooSmall );

    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
        // save necessary bytes for next conversion
        m_seqSource.realloc( nSourceSize - nConsumed );
        memcpy( m_seqSource.getArray() , pInput + nConsumed , nSourceSize - nConsumed );
    }

    // delete [] on a null pointer does nothing
    delete [] pJoined;

    // set to correct unicode size
    seqUnicode.realloc( nProduced );

    return seqUnicode;
}
452
453
454
455 //----------------------------------------------
456 //
457 // Unicode2TextConverter
458 //
459 //----------------------------------------------
Unicode2TextConverter(rtl_TextEncoding encoding)460 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
461 {
462 init( encoding );
463 }
464
465
~Unicode2TextConverter()466 Unicode2TextConverter::~Unicode2TextConverter()
467 {
468 if( m_bInitialized ) {
469 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
470 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
471 }
472 }
473
474
convert(const sal_Unicode * puSource,sal_Int32 nSourceSize)475 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
476 {
477 sal_Unicode *puTempMem = 0;
478
479 if( m_seqSource.getLength() ) {
480 // For surrogates !
481 // put old rest and new byte sequence into one array
482 // In general when surrogates are used, they should be rarely
483 // cut off between two convert()-calls. So this code is used
484 // rarely and the extra copy is acceptable.
485 puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
486 memcpy( puTempMem ,
487 m_seqSource.getConstArray() ,
488 m_seqSource.getLength() * sizeof( sal_Unicode ) );
489 memcpy(
490 &(puTempMem[ m_seqSource.getLength() ]) ,
491 puSource ,
492 nSourceSize*sizeof( sal_Unicode ) );
493 puSource = puTempMem;
494 nSourceSize += m_seqSource.getLength();
495
496 m_seqSource = Sequence< sal_Unicode > ();
497 }
498
499
500 sal_Size nTargetCount = 0;
501 sal_Size nSourceCount = 0;
502
503 sal_uInt32 uiInfo;
504 sal_Size nSrcCvtChars;
505
506 // take nSourceSize * 3 as preference
507 // this is an upper boundary for converting to utf8,
508 // which most often used as the target.
509 sal_Int32 nSeqSize = nSourceSize * 3;
510
511 Sequence<sal_Int8> seqText( nSeqSize );
512 sal_Char *pTarget = (sal_Char *) seqText.getArray();
513 while( sal_True ) {
514
515 nTargetCount += rtl_convertUnicodeToText(
516 m_convUnicode2Text,
517 m_contextUnicode2Text,
518 &( puSource[nSourceCount] ),
519 nSourceSize - nSourceCount ,
520 &( pTarget[nTargetCount] ),
521 nSeqSize - nTargetCount,
522 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
523 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
524 &uiInfo,
525 &nSrcCvtChars);
526 nSourceCount += nSrcCvtChars;
527
528 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
529 nSeqSize = nSeqSize *2;
530 seqText.realloc( nSeqSize ); // double array size
531 pTarget = ( sal_Char * ) seqText.getArray();
532 continue;
533 }
534 break;
535 }
536
537 // for surrogates
538 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
539 m_seqSource.realloc( nSourceSize - nSourceCount );
540 memcpy( m_seqSource.getArray() ,
541 &(puSource[nSourceCount]),
542 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
543 }
544
545 if( puTempMem ) {
546 delete [] puTempMem;
547 }
548
549 // reduce the size of the buffer (fast, no copy necessary)
550 seqText.realloc( nTargetCount );
551
552 return seqText;
553 }
554
init(rtl_TextEncoding encoding)555 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
556 {
557 m_bCanContinue = sal_True;
558 m_bInitialized = sal_True;
559
560 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
561 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
562 m_rtlEncoding = encoding;
563 };
564
565
566 }
567