source/expatwrap/xml2utf.cxx

*f9b72d11SAndrew Rist/**************************************************************
cdf0e10cSrcweir *
*f9b72d11SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
*f9b72d11SAndrew Rist * or more contributor license agreements.  See the NOTICE file
*f9b72d11SAndrew Rist * distributed with this work for additional information
*f9b72d11SAndrew Rist * regarding copyright ownership.  The ASF licenses this file
*f9b72d11SAndrew Rist * to you under the Apache License, Version 2.0 (the
*f9b72d11SAndrew Rist * "License"); you may not use this file except in compliance
*f9b72d11SAndrew Rist * with the License.  You may obtain a copy of the License at
cdf0e10cSrcweir *
*f9b72d11SAndrew Rist *   http://www.apache.org/licenses/LICENSE-2.0
cdf0e10cSrcweir *
*f9b72d11SAndrew Rist * Unless required by applicable law or agreed to in writing,
*f9b72d11SAndrew Rist * software distributed under the License is distributed on an
*f9b72d11SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
*f9b72d11SAndrew Rist * KIND, either express or implied.  See the License for the
*f9b72d11SAndrew Rist * specific language governing permissions and limitations
*f9b72d11SAndrew Rist * under the License.
cdf0e10cSrcweir *
*f9b72d11SAndrew Rist *************************************************************/
*f9b72d11SAndrew Rist
*f9b72d11SAndrew Rist
cdf0e10cSrcweir#include <string.h>
cdf0e10cSrcweir
cdf0e10cSrcweir#include <sal/types.h>
cdf0e10cSrcweir
cdf0e10cSrcweir#include <rtl/textenc.h>
cdf0e10cSrcweir#include <rtl/tencinfo.h>
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir#include <com/sun/star/io/XInputStream.hpp>
cdf0e10cSrcweir
cdf0e10cSrcweirusing namespace rtl;
cdf0e10cSrcweirusing namespace ::com::sun::star::uno;
cdf0e10cSrcweirusing namespace ::com::sun::star::io;
cdf0e10cSrcweir
cdf0e10cSrcweir#include "xml2utf.hxx"
cdf0e10cSrcweir
cdf0e10cSrcweirnamespace sax_expatwrap {
cdf0e10cSrcweir
cdf0e10cSrcweirsal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
cdf0e10cSrcweir    throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
cdf0e10cSrcweir{
cdf0e10cSrcweir
cdf0e10cSrcweir    Sequence<sal_Int8> seqIn;
cdf0e10cSrcweir
cdf0e10cSrcweir    if( ! m_in.is() ) {
cdf0e10cSrcweir        throw NotConnectedException();
cdf0e10cSrcweir    }
cdf0e10cSrcweir    if( ! m_bStarted ) {
cdf0e10cSrcweir        nMaxToRead = Max( 512 , nMaxToRead );   // it should be possible to find the encoding attribute
cdf0e10cSrcweir                                                // within the first 512 bytes == 128 chars in UCS-4
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    sal_Int32 nRead;
cdf0e10cSrcweir    Sequence< sal_Int8 > seqStart;
cdf0e10cSrcweir    while( sal_True )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        nRead = m_in->readSomeBytes( seq , nMaxToRead );
cdf0e10cSrcweir
cdf0e10cSrcweir        if( nRead + seqStart.getLength())
cdf0e10cSrcweir        {
cdf0e10cSrcweir            // if nRead is 0, the file is already eof.
cdf0e10cSrcweir            if( ! m_bStarted && nRead )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                // ensure that enough data is available to parse encoding
cdf0e10cSrcweir                if( seqStart.getLength() )
cdf0e10cSrcweir                {
cdf0e10cSrcweir                  // prefix with what we had so far.
cdf0e10cSrcweir                  sal_Int32 nLength = seq.getLength();
cdf0e10cSrcweir                  seq.realloc( seqStart.getLength() + nLength );
cdf0e10cSrcweir
cdf0e10cSrcweir                  memmove (seq.getArray() + seqStart.getLength(),
cdf0e10cSrcweir                       seq.getConstArray(),
cdf0e10cSrcweir                       nLength);
cdf0e10cSrcweir                  memcpy  (seq.getArray(),
cdf0e10cSrcweir                       seqStart.getConstArray(),
cdf0e10cSrcweir                       seqStart.getLength());
cdf0e10cSrcweir                }
cdf0e10cSrcweir
cdf0e10cSrcweir                // autodetection with the first bytes
cdf0e10cSrcweir                if( ! isEncodingRecognizable( seq ) )
cdf0e10cSrcweir                {
cdf0e10cSrcweir                  // remember what we have so far.
cdf0e10cSrcweir                  seqStart = seq;
cdf0e10cSrcweir
cdf0e10cSrcweir                  // read more !
cdf0e10cSrcweir                  continue;
cdf0e10cSrcweir                }
cdf0e10cSrcweir                if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
cdf0e10cSrcweir                    // initialize decoding
cdf0e10cSrcweir                    initializeDecoding();
cdf0e10cSrcweir                }
cdf0e10cSrcweir                nRead = seq.getLength();
cdf0e10cSrcweir                seqStart = Sequence < sal_Int8 > ();
cdf0e10cSrcweir            }
cdf0e10cSrcweir
cdf0e10cSrcweir            // do the encoding
cdf0e10cSrcweir            if( m_pText2Unicode && m_pUnicode2Text &&
cdf0e10cSrcweir                m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
cdf0e10cSrcweir
cdf0e10cSrcweir                Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
cdf0e10cSrcweir                seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
cdf0e10cSrcweir            }
cdf0e10cSrcweir
cdf0e10cSrcweir            if( ! m_bStarted )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                // it must now be ensured, that no encoding attribute exist anymore
cdf0e10cSrcweir                // ( otherwise the expat-Parser will crash )
cdf0e10cSrcweir                // This must be done after decoding !
cdf0e10cSrcweir                // ( e.g. Files decoded in ucs-4 cannot be read properly )
cdf0e10cSrcweir                m_bStarted = sal_True;
cdf0e10cSrcweir                removeEncoding( seq );
cdf0e10cSrcweir            }
cdf0e10cSrcweir            nRead = seq.getLength();
cdf0e10cSrcweir        }
cdf0e10cSrcweir
cdf0e10cSrcweir        break;
cdf0e10cSrcweir    }
cdf0e10cSrcweir    return nRead;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweirXMLFile2UTFConverter::~XMLFile2UTFConverter()
cdf0e10cSrcweir{
cdf0e10cSrcweir    if( m_pText2Unicode )
cdf0e10cSrcweir        delete m_pText2Unicode;
cdf0e10cSrcweir    if( m_pUnicode2Text )
cdf0e10cSrcweir        delete m_pUnicode2Text;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweirvoid XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
cdf0e10cSrcweir{
cdf0e10cSrcweir    const sal_Int8 *pSource = seq.getArray();
cdf0e10cSrcweir    if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
cdf0e10cSrcweir    {
cdf0e10cSrcweir
cdf0e10cSrcweir        // scan for encoding
cdf0e10cSrcweir        OString str( (sal_Char * ) pSource , seq.getLength() );
cdf0e10cSrcweir
cdf0e10cSrcweir        // cut sequence to first line break
cdf0e10cSrcweir        // find first line break;
cdf0e10cSrcweir        int nMax = str.indexOf( 10 );
cdf0e10cSrcweir        if( nMax >= 0 )
cdf0e10cSrcweir        {
cdf0e10cSrcweir            str = str.copy( 0 , nMax );
cdf0e10cSrcweir        }
cdf0e10cSrcweir
cdf0e10cSrcweir        int nFound = str.indexOf( " encoding" );
cdf0e10cSrcweir        if( nFound >= 0 ) {
cdf0e10cSrcweir            int nStop;
cdf0e10cSrcweir            int nStart = str.indexOf( "\"" , nFound );
cdf0e10cSrcweir            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                nStart = str.indexOf( "'" , nFound );
cdf0e10cSrcweir                nStop  = str.indexOf( "'" , nStart +1 );
cdf0e10cSrcweir            }
cdf0e10cSrcweir            else
cdf0e10cSrcweir            {
cdf0e10cSrcweir                nStop  = str.indexOf( "\"" , nStart +1);
cdf0e10cSrcweir            }
cdf0e10cSrcweir
cdf0e10cSrcweir            if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                // remove encoding tag from file
cdf0e10cSrcweir                memmove(        &( seq.getArray()[nFound] ) ,
cdf0e10cSrcweir                                &( seq.getArray()[nStop+1]) ,
cdf0e10cSrcweir                                seq.getLength() - nStop -1);
cdf0e10cSrcweir                seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
cdf0e10cSrcweir//              str = String( (char * ) seq.getArray() , seq.getLen() );
cdf0e10cSrcweir            }
cdf0e10cSrcweir        }
cdf0e10cSrcweir    }
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir// Checks, if enough data has been accumulated to recognize the encoding
cdf0e10cSrcweirsal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
cdf0e10cSrcweir{
cdf0e10cSrcweir    const sal_Int8 *pSource = seq.getConstArray();
cdf0e10cSrcweir    sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
cdf0e10cSrcweir
cdf0e10cSrcweir    if( seq.getLength() < 8 ) {
cdf0e10cSrcweir        // no recognition possible, when less than 8 bytes are available
cdf0e10cSrcweir        return sal_False;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
cdf0e10cSrcweir        // scan if the <?xml tag finishes within this buffer
cdf0e10cSrcweir        bCheckIfFirstClosingBracketExsists = sal_True;
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
cdf0e10cSrcweir             ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        // check for utf-16
cdf0e10cSrcweir        bCheckIfFirstClosingBracketExsists = sal_True;
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
cdf0e10cSrcweir             ( '?' == pSource[5] || '?' == pSource[7] ) )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        // check for
cdf0e10cSrcweir        bCheckIfFirstClosingBracketExsists = sal_True;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    if( bCheckIfFirstClosingBracketExsists )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
cdf0e10cSrcweir        {
cdf0e10cSrcweir            // whole <?xml tag is valid
cdf0e10cSrcweir            if( '>' == pSource[ i ] )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                return sal_True;
cdf0e10cSrcweir            }
cdf0e10cSrcweir        }
cdf0e10cSrcweir        return sal_False;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    // No <? tag in front, no need for a bigger buffer
cdf0e10cSrcweir    return sal_True;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweirsal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
cdf0e10cSrcweir{
cdf0e10cSrcweir    const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
cdf0e10cSrcweir    sal_Bool bReturn = sal_True;
cdf0e10cSrcweir
cdf0e10cSrcweir    if( seq.getLength() < 4 ) {
cdf0e10cSrcweir        // no recognition possible, when less than 4 bytes are available
cdf0e10cSrcweir        return sal_False;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    // first level : detect possible file formats
cdf0e10cSrcweir    if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
cdf0e10cSrcweir
cdf0e10cSrcweir        // scan for encoding
cdf0e10cSrcweir        OString str( (const sal_Char *) pSource , seq.getLength() );
cdf0e10cSrcweir
cdf0e10cSrcweir        // cut sequence to first line break
cdf0e10cSrcweir        //find first line break;
cdf0e10cSrcweir        int nMax = str.indexOf( 10 );
cdf0e10cSrcweir        if( nMax >= 0 )
cdf0e10cSrcweir        {
cdf0e10cSrcweir            str = str.copy( 0 , nMax );
cdf0e10cSrcweir        }
cdf0e10cSrcweir
cdf0e10cSrcweir        int nFound = str.indexOf( " encoding" );
cdf0e10cSrcweir        if( nFound < str.getLength() ) {
cdf0e10cSrcweir            int nStop;
cdf0e10cSrcweir            int nStart = str.indexOf( "\"" , nFound );
cdf0e10cSrcweir            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                nStart = str.indexOf( "'" , nFound );
cdf0e10cSrcweir                nStop  = str.indexOf( "'" , nStart +1 );
cdf0e10cSrcweir            }
cdf0e10cSrcweir            else
cdf0e10cSrcweir            {
cdf0e10cSrcweir                nStop  = str.indexOf( "\"" , nStart +1);
cdf0e10cSrcweir            }
cdf0e10cSrcweir            if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
cdf0e10cSrcweir            {
cdf0e10cSrcweir                // encoding found finally
cdf0e10cSrcweir                m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
cdf0e10cSrcweir            }
cdf0e10cSrcweir        }
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0xFE == pSource[0] &&
cdf0e10cSrcweir             0xFF == pSource[1] ) {
cdf0e10cSrcweir        // UTF-16 big endian
cdf0e10cSrcweir        // conversion is done so that encoding information can be easily extracted
cdf0e10cSrcweir        m_sEncoding = "utf-16";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0xFF == pSource[0] &&
cdf0e10cSrcweir             0xFE == pSource[1] ) {
cdf0e10cSrcweir        // UTF-16 little endian
cdf0e10cSrcweir        // conversion is done so that encoding information can be easily extracted
cdf0e10cSrcweir        m_sEncoding = "utf-16";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
cdf0e10cSrcweir        // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
cdf0e10cSrcweir        // The byte order mark is simply added
cdf0e10cSrcweir
cdf0e10cSrcweir        // simply add the byte order mark !
cdf0e10cSrcweir        seq.realloc( seq.getLength() + 2 );
cdf0e10cSrcweir        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
cdf0e10cSrcweir        ((sal_uInt8*)seq.getArray())[0] = 0xFE;
cdf0e10cSrcweir        ((sal_uInt8*)seq.getArray())[1] = 0xFF;
cdf0e10cSrcweir
cdf0e10cSrcweir        m_sEncoding = "utf-16";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
cdf0e10cSrcweir        // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
cdf0e10cSrcweir        // The byte order mark is simply added
cdf0e10cSrcweir
cdf0e10cSrcweir        seq.realloc( seq.getLength() + 2 );
cdf0e10cSrcweir        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
cdf0e10cSrcweir        ((sal_uInt8*)seq.getArray())[0] = 0xFF;
cdf0e10cSrcweir        ((sal_uInt8*)seq.getArray())[1] = 0xFE;
cdf0e10cSrcweir
cdf0e10cSrcweir        m_sEncoding = "utf-16";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0xEF == pSource[0] &&
cdf0e10cSrcweir             0xBB == pSource[1] &&
cdf0e10cSrcweir             0xBF == pSource[2] )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
cdf0e10cSrcweir        // The BOM is removed.
cdf0e10cSrcweir        memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
cdf0e10cSrcweir        seq.realloc( seq.getLength() - 3 );
cdf0e10cSrcweir        m_sEncoding = "utf-8";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
cdf0e10cSrcweir        // UCS-4 big endian
cdf0e10cSrcweir        m_sEncoding = "ucs-4";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
cdf0e10cSrcweir        // UCS-4 little endian
cdf0e10cSrcweir        m_sEncoding = "ucs-4";
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
cdf0e10cSrcweir             0xa7 == static_cast<unsigned char> (pSource[2]) &&
cdf0e10cSrcweir             0x94 == static_cast<unsigned char> (pSource[3]) ) {
cdf0e10cSrcweir        // EBCDIC
cdf0e10cSrcweir        bReturn = sal_False;   // must be extended
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else {
cdf0e10cSrcweir        // other
cdf0e10cSrcweir        // UTF8 is directly recognized by the parser.
cdf0e10cSrcweir        bReturn = sal_False;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    return bReturn;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweirvoid XMLFile2UTFConverter::initializeDecoding()
cdf0e10cSrcweir{
cdf0e10cSrcweir
cdf0e10cSrcweir    if( m_sEncoding.getLength() )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
cdf0e10cSrcweir        if( encoding != RTL_TEXTENCODING_UTF8 )
cdf0e10cSrcweir        {
cdf0e10cSrcweir            m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
cdf0e10cSrcweir            m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
cdf0e10cSrcweir        }
cdf0e10cSrcweir    }
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir//----------------------------------------------
cdf0e10cSrcweir//
cdf0e10cSrcweir// Text2UnicodeConverter
cdf0e10cSrcweir//
cdf0e10cSrcweir//----------------------------------------------
cdf0e10cSrcweirText2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
cdf0e10cSrcweir{
cdf0e10cSrcweir    rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
cdf0e10cSrcweir    if( RTL_TEXTENCODING_DONTKNOW == encoding )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        m_bCanContinue = sal_False;
cdf0e10cSrcweir        m_bInitialized = sal_False;
cdf0e10cSrcweir    }
cdf0e10cSrcweir    else
cdf0e10cSrcweir    {
cdf0e10cSrcweir        init( encoding );
cdf0e10cSrcweir    }
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweirText2UnicodeConverter::~Text2UnicodeConverter()
cdf0e10cSrcweir{
cdf0e10cSrcweir    if( m_bInitialized )
cdf0e10cSrcweir    {
cdf0e10cSrcweir        rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
cdf0e10cSrcweir        rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
cdf0e10cSrcweir    }
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweirvoid Text2UnicodeConverter::init( rtl_TextEncoding encoding )
cdf0e10cSrcweir{
cdf0e10cSrcweir    m_bCanContinue = sal_True;
cdf0e10cSrcweir    m_bInitialized = sal_True;
cdf0e10cSrcweir
cdf0e10cSrcweir    m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
cdf0e10cSrcweir    m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
cdf0e10cSrcweir    m_rtlEncoding = encoding;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweirSequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
cdf0e10cSrcweir{
cdf0e10cSrcweir    sal_uInt32 uiInfo;
cdf0e10cSrcweir    sal_Size nSrcCvtBytes   = 0;
cdf0e10cSrcweir    sal_Size nTargetCount   = 0;
cdf0e10cSrcweir    sal_Size nSourceCount   = 0;
cdf0e10cSrcweir
cdf0e10cSrcweir    // the whole source size
cdf0e10cSrcweir    sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
cdf0e10cSrcweir    Sequence<sal_Unicode>   seqUnicode ( nSourceSize );
cdf0e10cSrcweir
cdf0e10cSrcweir    const sal_Int8 *pbSource = seqText.getConstArray();
cdf0e10cSrcweir    sal_Int8 *pbTempMem = 0;
cdf0e10cSrcweir
cdf0e10cSrcweir    if( m_seqSource.getLength() ) {
cdf0e10cSrcweir        // put old rest and new byte sequence into one array
cdf0e10cSrcweir        pbTempMem = new sal_Int8[ nSourceSize ];
cdf0e10cSrcweir        memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
cdf0e10cSrcweir        memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
cdf0e10cSrcweir        pbSource = pbTempMem;
cdf0e10cSrcweir
cdf0e10cSrcweir        // set to zero again
cdf0e10cSrcweir        m_seqSource = Sequence< sal_Int8 >();
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    while( sal_True ) {
cdf0e10cSrcweir
cdf0e10cSrcweir        /* All invalid characters are transformed to the unicode undefined char */
cdf0e10cSrcweir        nTargetCount +=     rtl_convertTextToUnicode(
cdf0e10cSrcweir                                    m_convText2Unicode,
cdf0e10cSrcweir                                    m_contextText2Unicode,
cdf0e10cSrcweir                                    ( const sal_Char * ) &( pbSource[nSourceCount] ),
cdf0e10cSrcweir                                    nSourceSize - nSourceCount ,
cdf0e10cSrcweir                                    &( seqUnicode.getArray()[ nTargetCount ] ),
cdf0e10cSrcweir                                    seqUnicode.getLength() - nTargetCount,
cdf0e10cSrcweir                                    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
cdf0e10cSrcweir                                    RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
cdf0e10cSrcweir                                    RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
cdf0e10cSrcweir                                    &uiInfo,
cdf0e10cSrcweir                                    &nSrcCvtBytes );
cdf0e10cSrcweir        nSourceCount += nSrcCvtBytes;
cdf0e10cSrcweir
cdf0e10cSrcweir        if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
cdf0e10cSrcweir            // save necessary bytes for next conversion
cdf0e10cSrcweir            seqUnicode.realloc( seqUnicode.getLength() * 2 );
cdf0e10cSrcweir            continue;
cdf0e10cSrcweir        }
cdf0e10cSrcweir        break;
cdf0e10cSrcweir    }
cdf0e10cSrcweir    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
cdf0e10cSrcweir        m_seqSource.realloc( nSourceSize - nSourceCount );
cdf0e10cSrcweir        memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir    if( pbTempMem ) {
cdf0e10cSrcweir        delete [] pbTempMem;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    // set to correct unicode size
cdf0e10cSrcweir    seqUnicode.realloc( nTargetCount );
cdf0e10cSrcweir
cdf0e10cSrcweir    return seqUnicode;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir//----------------------------------------------
cdf0e10cSrcweir//
cdf0e10cSrcweir// Unicode2TextConverter
cdf0e10cSrcweir//
cdf0e10cSrcweir//----------------------------------------------
cdf0e10cSrcweirUnicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
cdf0e10cSrcweir{
cdf0e10cSrcweir    init( encoding );
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweirUnicode2TextConverter::~Unicode2TextConverter()
cdf0e10cSrcweir{
cdf0e10cSrcweir    if( m_bInitialized ) {
cdf0e10cSrcweir        rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
cdf0e10cSrcweir        rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
cdf0e10cSrcweir    }
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweirSequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
cdf0e10cSrcweir{
cdf0e10cSrcweir    sal_Unicode *puTempMem = 0;
cdf0e10cSrcweir
cdf0e10cSrcweir    if( m_seqSource.getLength() ) {
cdf0e10cSrcweir        // For surrogates !
cdf0e10cSrcweir        // put old rest and new byte sequence into one array
cdf0e10cSrcweir        // In general when surrogates are used, they should be rarely
cdf0e10cSrcweir        // cut off between two convert()-calls. So this code is used
cdf0e10cSrcweir        // rarely and the extra copy is acceptable.
cdf0e10cSrcweir        puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
cdf0e10cSrcweir        memcpy( puTempMem ,
cdf0e10cSrcweir                m_seqSource.getConstArray() ,
cdf0e10cSrcweir                m_seqSource.getLength() * sizeof( sal_Unicode ) );
cdf0e10cSrcweir        memcpy(
cdf0e10cSrcweir            &(puTempMem[ m_seqSource.getLength() ]) ,
cdf0e10cSrcweir            puSource ,
cdf0e10cSrcweir            nSourceSize*sizeof( sal_Unicode ) );
cdf0e10cSrcweir        puSource = puTempMem;
cdf0e10cSrcweir        nSourceSize += m_seqSource.getLength();
cdf0e10cSrcweir
cdf0e10cSrcweir        m_seqSource = Sequence< sal_Unicode > ();
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir    sal_Size nTargetCount = 0;
cdf0e10cSrcweir    sal_Size nSourceCount = 0;
cdf0e10cSrcweir
cdf0e10cSrcweir    sal_uInt32 uiInfo;
cdf0e10cSrcweir    sal_Size nSrcCvtChars;
cdf0e10cSrcweir
cdf0e10cSrcweir    // take nSourceSize * 3 as preference
cdf0e10cSrcweir    // this is an upper boundary for converting to utf8,
cdf0e10cSrcweir    // which most often used as the target.
cdf0e10cSrcweir    sal_Int32 nSeqSize =  nSourceSize * 3;
cdf0e10cSrcweir
cdf0e10cSrcweir    Sequence<sal_Int8>  seqText( nSeqSize );
cdf0e10cSrcweir    sal_Char *pTarget = (sal_Char *) seqText.getArray();
cdf0e10cSrcweir    while( sal_True ) {
cdf0e10cSrcweir
cdf0e10cSrcweir        nTargetCount += rtl_convertUnicodeToText(
cdf0e10cSrcweir                                    m_convUnicode2Text,
cdf0e10cSrcweir                                    m_contextUnicode2Text,
cdf0e10cSrcweir                                    &( puSource[nSourceCount] ),
cdf0e10cSrcweir                                    nSourceSize - nSourceCount ,
cdf0e10cSrcweir                                    &( pTarget[nTargetCount] ),
cdf0e10cSrcweir                                    nSeqSize - nTargetCount,
cdf0e10cSrcweir                                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
cdf0e10cSrcweir                                    RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
cdf0e10cSrcweir                                    &uiInfo,
cdf0e10cSrcweir                                    &nSrcCvtChars);
cdf0e10cSrcweir        nSourceCount += nSrcCvtChars;
cdf0e10cSrcweir
cdf0e10cSrcweir        if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
cdf0e10cSrcweir            nSeqSize = nSeqSize *2;
cdf0e10cSrcweir            seqText.realloc( nSeqSize );  // double array size
cdf0e10cSrcweir            pTarget = ( sal_Char * ) seqText.getArray();
cdf0e10cSrcweir            continue;
cdf0e10cSrcweir        }
cdf0e10cSrcweir        break;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    // for surrogates
cdf0e10cSrcweir    if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
cdf0e10cSrcweir        m_seqSource.realloc( nSourceSize - nSourceCount );
cdf0e10cSrcweir        memcpy( m_seqSource.getArray() ,
cdf0e10cSrcweir                &(puSource[nSourceCount]),
cdf0e10cSrcweir                (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    if( puTempMem ) {
cdf0e10cSrcweir        delete [] puTempMem;
cdf0e10cSrcweir    }
cdf0e10cSrcweir
cdf0e10cSrcweir    // reduce the size of the buffer (fast, no copy necessary)
cdf0e10cSrcweir    seqText.realloc( nTargetCount );
cdf0e10cSrcweir
cdf0e10cSrcweir    return seqText;
cdf0e10cSrcweir}
cdf0e10cSrcweir
cdf0e10cSrcweirvoid Unicode2TextConverter::init( rtl_TextEncoding encoding )
cdf0e10cSrcweir{
cdf0e10cSrcweir    m_bCanContinue = sal_True;
cdf0e10cSrcweir    m_bInitialized = sal_True;
cdf0e10cSrcweir
cdf0e10cSrcweir    m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
cdf0e10cSrcweir    m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
cdf0e10cSrcweir    m_rtlEncoding = encoding;
cdf0e10cSrcweir};
cdf0e10cSrcweir
cdf0e10cSrcweir
cdf0e10cSrcweir}