xref: /trunk/main/sax/source/expatwrap/xml2utf.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 #include <string.h>
28 
29 #include <sal/types.h>
30 
31 #include <rtl/textenc.h>
32 #include <rtl/tencinfo.h>
33 
34 
35 #include <com/sun/star/io/XInputStream.hpp>
36 
37 using namespace rtl;
38 using namespace ::com::sun::star::uno;
39 using namespace ::com::sun::star::io;
40 
41 #include "xml2utf.hxx"
42 
43 namespace sax_expatwrap {
44 
45 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
46     throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
47 {
48 
49     Sequence<sal_Int8> seqIn;
50 
51     if( ! m_in.is() ) {
52         throw NotConnectedException();
53     }
54     if( ! m_bStarted ) {
55         nMaxToRead = Max( 512 , nMaxToRead );   // it should be possible to find the encoding attribute
56                                                 // within the first 512 bytes == 128 chars in UCS-4
57     }
58 
59     sal_Int32 nRead;
60     Sequence< sal_Int8 > seqStart;
61     while( sal_True )
62     {
63         nRead = m_in->readSomeBytes( seq , nMaxToRead );
64 
65         if( nRead + seqStart.getLength())
66         {
67             // if nRead is 0, the file is already eof.
68             if( ! m_bStarted && nRead )
69             {
70                 // ensure that enough data is available to parse encoding
71                 if( seqStart.getLength() )
72                 {
73                   // prefix with what we had so far.
74                   sal_Int32 nLength = seq.getLength();
75                   seq.realloc( seqStart.getLength() + nLength );
76 
77                   memmove (seq.getArray() + seqStart.getLength(),
78                        seq.getConstArray(),
79                        nLength);
80                   memcpy  (seq.getArray(),
81                        seqStart.getConstArray(),
82                        seqStart.getLength());
83                 }
84 
85                 // autodetection with the first bytes
86                 if( ! isEncodingRecognizable( seq ) )
87                 {
88                   // remember what we have so far.
89                   seqStart = seq;
90 
91                   // read more !
92                   continue;
93                 }
94                 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
95                     // initialize decoding
96                     initializeDecoding();
97                 }
98                 nRead = seq.getLength();
99                 seqStart = Sequence < sal_Int8 > ();
100             }
101 
102             // do the encoding
103             if( m_pText2Unicode && m_pUnicode2Text &&
104                 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
105 
106                 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
107                 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
108             }
109 
110             if( ! m_bStarted )
111             {
112                 // it must now be ensured, that no encoding attribute exist anymore
113                 // ( otherwise the expat-Parser will crash )
114                 // This must be done after decoding !
115                 // ( e.g. Files decoded in ucs-4 cannot be read properly )
116                 m_bStarted = sal_True;
117                 removeEncoding( seq );
118             }
119             nRead = seq.getLength();
120         }
121 
122         break;
123     }
124     return nRead;
125 }
126 
127 
128 XMLFile2UTFConverter::~XMLFile2UTFConverter()
129 {
130     if( m_pText2Unicode )
131         delete m_pText2Unicode;
132     if( m_pUnicode2Text )
133         delete m_pUnicode2Text;
134 }
135 
136 
137 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
138 {
139     const sal_Int8 *pSource = seq.getArray();
140     if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
141     {
142 
143         // scan for encoding
144         OString str( (sal_Char * ) pSource , seq.getLength() );
145 
146         // cut sequence to first line break
147         // find first line break;
148         int nMax = str.indexOf( 10 );
149         if( nMax >= 0 )
150         {
151             str = str.copy( 0 , nMax );
152         }
153 
154         int nFound = str.indexOf( " encoding" );
155         if( nFound >= 0 ) {
156             int nStop;
157             int nStart = str.indexOf( "\"" , nFound );
158             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
159             {
160                 nStart = str.indexOf( "'" , nFound );
161                 nStop  = str.indexOf( "'" , nStart +1 );
162             }
163             else
164             {
165                 nStop  = str.indexOf( "\"" , nStart +1);
166             }
167 
168             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
169             {
170                 // remove encoding tag from file
171                 memmove(        &( seq.getArray()[nFound] ) ,
172                                 &( seq.getArray()[nStop+1]) ,
173                                 seq.getLength() - nStop -1);
174                 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
175 //              str = String( (char * ) seq.getArray() , seq.getLen() );
176             }
177         }
178     }
179 }
180 
181 // Checks, if enough data has been accumulated to recognize the encoding
182 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
183 {
184     const sal_Int8 *pSource = seq.getConstArray();
185     sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
186 
187     if( seq.getLength() < 8 ) {
188         // no recognition possible, when less than 8 bytes are available
189         return sal_False;
190     }
191 
192     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
193         // scan if the <?xml tag finishes within this buffer
194         bCheckIfFirstClosingBracketExsists = sal_True;
195     }
196     else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
197              ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
198     {
199         // check for utf-16
200         bCheckIfFirstClosingBracketExsists = sal_True;
201     }
202     else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
203              ( '?' == pSource[5] || '?' == pSource[7] ) )
204     {
205         // check for
206         bCheckIfFirstClosingBracketExsists = sal_True;
207     }
208 
209     if( bCheckIfFirstClosingBracketExsists )
210     {
211         for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
212         {
213             // whole <?xml tag is valid
214             if( '>' == pSource[ i ] )
215             {
216                 return sal_True;
217             }
218         }
219         return sal_False;
220     }
221 
222     // No <? tag in front, no need for a bigger buffer
223     return sal_True;
224 }
225 
226 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
227 {
228     const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
229     sal_Bool bReturn = sal_True;
230 
231     if( seq.getLength() < 4 ) {
232         // no recognition possible, when less than 4 bytes are available
233         return sal_False;
234     }
235 
236     // first level : detect possible file formats
237     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
238 
239         // scan for encoding
240         OString str( (const sal_Char *) pSource , seq.getLength() );
241 
242         // cut sequence to first line break
243         //find first line break;
244         int nMax = str.indexOf( 10 );
245         if( nMax >= 0 )
246         {
247             str = str.copy( 0 , nMax );
248         }
249 
250         int nFound = str.indexOf( " encoding" );
251         if( nFound < str.getLength() ) {
252             int nStop;
253             int nStart = str.indexOf( "\"" , nFound );
254             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
255             {
256                 nStart = str.indexOf( "'" , nFound );
257                 nStop  = str.indexOf( "'" , nStart +1 );
258             }
259             else
260             {
261                 nStop  = str.indexOf( "\"" , nStart +1);
262             }
263             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
264             {
265                 // encoding found finally
266                 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
267             }
268         }
269     }
270     else if( 0xFE == pSource[0] &&
271              0xFF == pSource[1] ) {
272         // UTF-16 big endian
273         // conversion is done so that encoding information can be easily extracted
274         m_sEncoding = "utf-16";
275     }
276     else if( 0xFF == pSource[0] &&
277              0xFE == pSource[1] ) {
278         // UTF-16 little endian
279         // conversion is done so that encoding information can be easily extracted
280         m_sEncoding = "utf-16";
281     }
282     else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
283         // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
284         // The byte order mark is simply added
285 
286         // simply add the byte order mark !
287         seq.realloc( seq.getLength() + 2 );
288         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
289         ((sal_uInt8*)seq.getArray())[0] = 0xFE;
290         ((sal_uInt8*)seq.getArray())[1] = 0xFF;
291 
292         m_sEncoding = "utf-16";
293     }
294     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
295         // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
296         // The byte order mark is simply added
297 
298         seq.realloc( seq.getLength() + 2 );
299         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
300         ((sal_uInt8*)seq.getArray())[0] = 0xFF;
301         ((sal_uInt8*)seq.getArray())[1] = 0xFE;
302 
303         m_sEncoding = "utf-16";
304     }
305     else if( 0xEF == pSource[0] &&
306              0xBB == pSource[1] &&
307              0xBF == pSource[2] )
308     {
309         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
310         // The BOM is removed.
311         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
312         seq.realloc( seq.getLength() - 3 );
313         m_sEncoding = "utf-8";
314     }
315     else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
316         // UCS-4 big endian
317         m_sEncoding = "ucs-4";
318     }
319     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
320         // UCS-4 little endian
321         m_sEncoding = "ucs-4";
322     }
323     else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
324              0xa7 == static_cast<unsigned char> (pSource[2]) &&
325              0x94 == static_cast<unsigned char> (pSource[3]) ) {
326         // EBCDIC
327         bReturn = sal_False;   // must be extended
328     }
329     else {
330         // other
331         // UTF8 is directly recognized by the parser.
332         bReturn = sal_False;
333     }
334 
335     return bReturn;
336 }
337 
338 void XMLFile2UTFConverter::initializeDecoding()
339 {
340 
341     if( m_sEncoding.getLength() )
342     {
343         rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
344         if( encoding != RTL_TEXTENCODING_UTF8 )
345         {
346             m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
347             m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
348         }
349     }
350 }
351 
352 
353 //----------------------------------------------
354 //
355 // Text2UnicodeConverter
356 //
357 //----------------------------------------------
358 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
359 {
360     rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
361     if( RTL_TEXTENCODING_DONTKNOW == encoding )
362     {
363         m_bCanContinue = sal_False;
364         m_bInitialized = sal_False;
365     }
366     else
367     {
368         init( encoding );
369     }
370 }
371 
372 Text2UnicodeConverter::~Text2UnicodeConverter()
373 {
374     if( m_bInitialized )
375     {
376         rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
377         rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
378     }
379 }
380 
381 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
382 {
383     m_bCanContinue = sal_True;
384     m_bInitialized = sal_True;
385 
386     m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
387     m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
388     m_rtlEncoding = encoding;
389 }
390 
391 
392 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
393 {
394     sal_uInt32 uiInfo;
395     sal_Size nSrcCvtBytes   = 0;
396     sal_Size nTargetCount   = 0;
397     sal_Size nSourceCount   = 0;
398 
399     // the whole source size
400     sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
401     Sequence<sal_Unicode>   seqUnicode ( nSourceSize );
402 
403     const sal_Int8 *pbSource = seqText.getConstArray();
404     sal_Int8 *pbTempMem = 0;
405 
406     if( m_seqSource.getLength() ) {
407         // put old rest and new byte sequence into one array
408         pbTempMem = new sal_Int8[ nSourceSize ];
409         memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
410         memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
411         pbSource = pbTempMem;
412 
413         // set to zero again
414         m_seqSource = Sequence< sal_Int8 >();
415     }
416 
417     while( sal_True ) {
418 
419         /* All invalid characters are transformed to the unicode undefined char */
420         nTargetCount +=     rtl_convertTextToUnicode(
421                                     m_convText2Unicode,
422                                     m_contextText2Unicode,
423                                     ( const sal_Char * ) &( pbSource[nSourceCount] ),
424                                     nSourceSize - nSourceCount ,
425                                     &( seqUnicode.getArray()[ nTargetCount ] ),
426                                     seqUnicode.getLength() - nTargetCount,
427                                     RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
428                                     RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
429                                     RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
430                                     &uiInfo,
431                                     &nSrcCvtBytes );
432         nSourceCount += nSrcCvtBytes;
433 
434         if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
435             // save necessary bytes for next conversion
436             seqUnicode.realloc( seqUnicode.getLength() * 2 );
437             continue;
438         }
439         break;
440     }
441     if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
442         m_seqSource.realloc( nSourceSize - nSourceCount );
443         memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
444     }
445 
446 
447     if( pbTempMem ) {
448         delete [] pbTempMem;
449     }
450 
451     // set to correct unicode size
452     seqUnicode.realloc( nTargetCount );
453 
454     return seqUnicode;
455 }
456 
457 
458 
459 //----------------------------------------------
460 //
461 // Unicode2TextConverter
462 //
463 //----------------------------------------------
464 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
465 {
466     init( encoding );
467 }
468 
469 
470 Unicode2TextConverter::~Unicode2TextConverter()
471 {
472     if( m_bInitialized ) {
473         rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
474         rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
475     }
476 }
477 
478 
479 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
480 {
481     sal_Unicode *puTempMem = 0;
482 
483     if( m_seqSource.getLength() ) {
484         // For surrogates !
485         // put old rest and new byte sequence into one array
486         // In general when surrogates are used, they should be rarely
487         // cut off between two convert()-calls. So this code is used
488         // rarely and the extra copy is acceptable.
489         puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
490         memcpy( puTempMem ,
491                 m_seqSource.getConstArray() ,
492                 m_seqSource.getLength() * sizeof( sal_Unicode ) );
493         memcpy(
494             &(puTempMem[ m_seqSource.getLength() ]) ,
495             puSource ,
496             nSourceSize*sizeof( sal_Unicode ) );
497         puSource = puTempMem;
498         nSourceSize += m_seqSource.getLength();
499 
500         m_seqSource = Sequence< sal_Unicode > ();
501     }
502 
503 
504     sal_Size nTargetCount = 0;
505     sal_Size nSourceCount = 0;
506 
507     sal_uInt32 uiInfo;
508     sal_Size nSrcCvtChars;
509 
510     // take nSourceSize * 3 as preference
511     // this is an upper boundary for converting to utf8,
512     // which most often used as the target.
513     sal_Int32 nSeqSize =  nSourceSize * 3;
514 
515     Sequence<sal_Int8>  seqText( nSeqSize );
516     sal_Char *pTarget = (sal_Char *) seqText.getArray();
517     while( sal_True ) {
518 
519         nTargetCount += rtl_convertUnicodeToText(
520                                     m_convUnicode2Text,
521                                     m_contextUnicode2Text,
522                                     &( puSource[nSourceCount] ),
523                                     nSourceSize - nSourceCount ,
524                                     &( pTarget[nTargetCount] ),
525                                     nSeqSize - nTargetCount,
526                                     RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
527                                     RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
528                                     &uiInfo,
529                                     &nSrcCvtChars);
530         nSourceCount += nSrcCvtChars;
531 
532         if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
533             nSeqSize = nSeqSize *2;
534             seqText.realloc( nSeqSize );  // double array size
535             pTarget = ( sal_Char * ) seqText.getArray();
536             continue;
537         }
538         break;
539     }
540 
541     // for surrogates
542     if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
543         m_seqSource.realloc( nSourceSize - nSourceCount );
544         memcpy( m_seqSource.getArray() ,
545                 &(puSource[nSourceCount]),
546                 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
547     }
548 
549     if( puTempMem ) {
550         delete [] puTempMem;
551     }
552 
553     // reduce the size of the buffer (fast, no copy necessary)
554     seqText.realloc( nTargetCount );
555 
556     return seqText;
557 }
558 
559 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
560 {
561     m_bCanContinue = sal_True;
562     m_bInitialized = sal_True;
563 
564     m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
565     m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
566     m_rtlEncoding = encoding;
567 };
568 
569 
570 }
571