1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir // TODO: Woher? 29*cdf0e10cSrcweir #define Max( a, b ) (((a)>(b)) ? (a) : (b) ) 30*cdf0e10cSrcweir #define Min( a, b ) (((a)<(b)) ? (a) : (b) ) 31*cdf0e10cSrcweir 32*cdf0e10cSrcweir /* 33*cdf0e10cSrcweir * 34*cdf0e10cSrcweir * Text2UnicodeConverter 35*cdf0e10cSrcweir * 36*cdf0e10cSrcweir **/ 37*cdf0e10cSrcweir namespace sax_expatwrap { 38*cdf0e10cSrcweir 39*cdf0e10cSrcweir class Text2UnicodeConverter 40*cdf0e10cSrcweir { 41*cdf0e10cSrcweir 42*cdf0e10cSrcweir public: 43*cdf0e10cSrcweir Text2UnicodeConverter( const ::rtl::OString & sEncoding ); 44*cdf0e10cSrcweir ~Text2UnicodeConverter(); 45*cdf0e10cSrcweir 46*cdf0e10cSrcweir ::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & ); 47*cdf0e10cSrcweir sal_Bool canContinue() { return m_bCanContinue; } 48*cdf0e10cSrcweir 49*cdf0e10cSrcweir private: 50*cdf0e10cSrcweir void init( rtl_TextEncoding encoding ); 51*cdf0e10cSrcweir 52*cdf0e10cSrcweir rtl_TextToUnicodeConverter m_convText2Unicode; 53*cdf0e10cSrcweir rtl_TextToUnicodeContext m_contextText2Unicode; 54*cdf0e10cSrcweir sal_Bool m_bCanContinue; 55*cdf0e10cSrcweir sal_Bool m_bInitialized; 56*cdf0e10cSrcweir rtl_TextEncoding m_rtlEncoding; 57*cdf0e10cSrcweir ::com::sun::star::uno::Sequence<sal_Int8> m_seqSource; 58*cdf0e10cSrcweir }; 59*cdf0e10cSrcweir 60*cdf0e10cSrcweir /*---------------------------------------- 61*cdf0e10cSrcweir * 62*cdf0e10cSrcweir * Unicode2TextConverter 63*cdf0e10cSrcweir * 64*cdf0e10cSrcweir **-----------------------------------------*/ 65*cdf0e10cSrcweir class Unicode2TextConverter 66*cdf0e10cSrcweir { 67*cdf0e10cSrcweir public: 68*cdf0e10cSrcweir Unicode2TextConverter( rtl_TextEncoding encoding ); 69*cdf0e10cSrcweir ~Unicode2TextConverter(); 70*cdf0e10cSrcweir 71*cdf0e10cSrcweir inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s ) 72*cdf0e10cSrcweir { 73*cdf0e10cSrcweir return convert( s.getStr() , s.getLength() ); 74*cdf0e10cSrcweir } 75*cdf0e10cSrcweir ::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength ); 76*cdf0e10cSrcweir sal_Bool canContinue() { return m_bCanContinue; } 77*cdf0e10cSrcweir 78*cdf0e10cSrcweir private: 79*cdf0e10cSrcweir void init( rtl_TextEncoding encoding ); 80*cdf0e10cSrcweir 81*cdf0e10cSrcweir rtl_UnicodeToTextConverter m_convUnicode2Text; 82*cdf0e10cSrcweir rtl_UnicodeToTextContext m_contextUnicode2Text; 83*cdf0e10cSrcweir sal_Bool m_bCanContinue; 84*cdf0e10cSrcweir sal_Bool m_bInitialized; 85*cdf0e10cSrcweir rtl_TextEncoding m_rtlEncoding; 86*cdf0e10cSrcweir ::com::sun::star::uno::Sequence<sal_Unicode> m_seqSource; 87*cdf0e10cSrcweir }; 88*cdf0e10cSrcweir 89*cdf0e10cSrcweir 90*cdf0e10cSrcweir 91*cdf0e10cSrcweir /*---------------------------------------- 92*cdf0e10cSrcweir * 93*cdf0e10cSrcweir * XMLFile2UTFConverter 94*cdf0e10cSrcweir * 95*cdf0e10cSrcweir **-----------------------------------------*/ 96*cdf0e10cSrcweir class XMLFile2UTFConverter 97*cdf0e10cSrcweir { 98*cdf0e10cSrcweir public: 99*cdf0e10cSrcweir XMLFile2UTFConverter( ): 100*cdf0e10cSrcweir m_bStarted( sal_False ), 101*cdf0e10cSrcweir m_pText2Unicode( 0 ), 102*cdf0e10cSrcweir m_pUnicode2Text( 0 ) 103*cdf0e10cSrcweir {} 104*cdf0e10cSrcweir 105*cdf0e10cSrcweir ~XMLFile2UTFConverter(); 106*cdf0e10cSrcweir 107*cdf0e10cSrcweir void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; } 108*cdf0e10cSrcweir void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; } 109*cdf0e10cSrcweir 110*cdf0e10cSrcweir 111*cdf0e10cSrcweir 112*cdf0e10cSrcweir // @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There 113*cdf0e10cSrcweir // may be returned less or more bytes than ordered. 114*cdf0e10cSrcweir sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) 115*cdf0e10cSrcweir throw ( ::com::sun::star::io::IOException, 116*cdf0e10cSrcweir ::com::sun::star::io::NotConnectedException , 117*cdf0e10cSrcweir ::com::sun::star::io::BufferSizeExceededException , 118*cdf0e10cSrcweir ::com::sun::star::uno::RuntimeException ); 119*cdf0e10cSrcweir 120*cdf0e10cSrcweir private: 121*cdf0e10cSrcweir 122*cdf0e10cSrcweir // Called only on first Sequence of bytes. Tries to figure out file format and encoding information. 123*cdf0e10cSrcweir // @return TRUE, when encoding information could be retrieved 124*cdf0e10cSrcweir // @return FALSE, when no encoding information was found in file 125*cdf0e10cSrcweir sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq ); 126*cdf0e10cSrcweir 127*cdf0e10cSrcweir // Called only on first Sequence of bytes. Tries to figure out 128*cdf0e10cSrcweir // if enough data is available to scan encoding 129*cdf0e10cSrcweir // @return TRUE, when encoding is retrievable 130*cdf0e10cSrcweir // @return FALSE, when more data is needed 131*cdf0e10cSrcweir sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq ); 132*cdf0e10cSrcweir 133*cdf0e10cSrcweir // When encoding attribute is within the text (in the first line), it is removed. 134*cdf0e10cSrcweir void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq ); 135*cdf0e10cSrcweir 136*cdf0e10cSrcweir // Initializes decoding depending on m_sEncoding setting 137*cdf0e10cSrcweir void initializeDecoding(); 138*cdf0e10cSrcweir private: 139*cdf0e10cSrcweir ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > m_in; 140*cdf0e10cSrcweir 141*cdf0e10cSrcweir sal_Bool m_bStarted; 142*cdf0e10cSrcweir ::rtl::OString m_sEncoding; 143*cdf0e10cSrcweir 144*cdf0e10cSrcweir Text2UnicodeConverter *m_pText2Unicode; 145*cdf0e10cSrcweir Unicode2TextConverter *m_pUnicode2Text; 146*cdf0e10cSrcweir }; 147*cdf0e10cSrcweir } 148