xref: /aoo42x/main/sax/inc/xml2utf.hxx (revision 8d192041)
1*8d192041SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3*8d192041SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*8d192041SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*8d192041SAndrew Rist  * distributed with this work for additional information
6*8d192041SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*8d192041SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*8d192041SAndrew Rist  * "License"); you may not use this file except in compliance
9*8d192041SAndrew Rist  * with the License.  You may obtain a copy of the License at
10*8d192041SAndrew Rist  *
11*8d192041SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*8d192041SAndrew Rist  *
13*8d192041SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*8d192041SAndrew Rist  * software distributed under the License is distributed on an
15*8d192041SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*8d192041SAndrew Rist  * KIND, either express or implied.  See the License for the
17*8d192041SAndrew Rist  * specific language governing permissions and limitations
18*8d192041SAndrew Rist  * under the License.
19*8d192041SAndrew Rist  *
20*8d192041SAndrew Rist  *************************************************************/
21*8d192041SAndrew Rist 
22*8d192041SAndrew Rist 
23cdf0e10cSrcweir 
// TODO: Where should these helpers come from?
// Function-like macros yielding the larger / smaller of two expressions.
// Note: the selected argument is evaluated twice, so avoid arguments with
// side effects (e.g. Max( i++, j )).
#define Max( lhs, rhs )		( ( (lhs) > (rhs) ) ? (lhs) : (rhs) )
#define Min( lhs, rhs )		( ( (lhs) < (rhs) ) ? (lhs) : (rhs) )
27cdf0e10cSrcweir 
28cdf0e10cSrcweir /*
29cdf0e10cSrcweir *
30cdf0e10cSrcweir * Text2UnicodeConverter
31cdf0e10cSrcweir *
32cdf0e10cSrcweir **/
33cdf0e10cSrcweir namespace sax_expatwrap {
34cdf0e10cSrcweir 
35cdf0e10cSrcweir class Text2UnicodeConverter
36cdf0e10cSrcweir {
37cdf0e10cSrcweir 
38cdf0e10cSrcweir public:
39cdf0e10cSrcweir 	Text2UnicodeConverter( const ::rtl::OString & sEncoding );
40cdf0e10cSrcweir 	~Text2UnicodeConverter();
41cdf0e10cSrcweir 
42cdf0e10cSrcweir 	::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & );
canContinue()43cdf0e10cSrcweir 	sal_Bool canContinue() {  return m_bCanContinue; }
44cdf0e10cSrcweir 
45cdf0e10cSrcweir private:
46cdf0e10cSrcweir 	void init( rtl_TextEncoding encoding );
47cdf0e10cSrcweir 
48cdf0e10cSrcweir 	rtl_TextToUnicodeConverter 	m_convText2Unicode;
49cdf0e10cSrcweir 	rtl_TextToUnicodeContext   	m_contextText2Unicode;
50cdf0e10cSrcweir 	sal_Bool					m_bCanContinue;
51cdf0e10cSrcweir 	sal_Bool					m_bInitialized;
52cdf0e10cSrcweir 	rtl_TextEncoding			m_rtlEncoding;
53cdf0e10cSrcweir 	::com::sun::star::uno::Sequence<sal_Int8> m_seqSource;
54cdf0e10cSrcweir };
55cdf0e10cSrcweir 
56cdf0e10cSrcweir /*----------------------------------------
57cdf0e10cSrcweir *
58cdf0e10cSrcweir * Unicode2TextConverter
59cdf0e10cSrcweir *
60cdf0e10cSrcweir **-----------------------------------------*/
61cdf0e10cSrcweir class Unicode2TextConverter
62cdf0e10cSrcweir {
63cdf0e10cSrcweir public:
64cdf0e10cSrcweir 	Unicode2TextConverter( rtl_TextEncoding encoding );
65cdf0e10cSrcweir 	~Unicode2TextConverter();
66cdf0e10cSrcweir 
convert(const::rtl::OUString & s)67cdf0e10cSrcweir 	inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s )
68cdf0e10cSrcweir 		{
69cdf0e10cSrcweir 			return convert( s.getStr() , s.getLength() );
70cdf0e10cSrcweir 		}
71cdf0e10cSrcweir 	::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength );
canContinue()72cdf0e10cSrcweir 	sal_Bool canContinue() {  return m_bCanContinue; }
73cdf0e10cSrcweir 
74cdf0e10cSrcweir private:
75cdf0e10cSrcweir 	void init( rtl_TextEncoding encoding );
76cdf0e10cSrcweir 
77cdf0e10cSrcweir 	rtl_UnicodeToTextConverter 	m_convUnicode2Text;
78cdf0e10cSrcweir 	rtl_UnicodeToTextContext   	m_contextUnicode2Text;
79cdf0e10cSrcweir 	sal_Bool					m_bCanContinue;
80cdf0e10cSrcweir 	sal_Bool					m_bInitialized;
81cdf0e10cSrcweir 	rtl_TextEncoding			m_rtlEncoding;
82cdf0e10cSrcweir 	::com::sun::star::uno::Sequence<sal_Unicode>		m_seqSource;
83cdf0e10cSrcweir };
84cdf0e10cSrcweir 
85cdf0e10cSrcweir 
86cdf0e10cSrcweir 
87cdf0e10cSrcweir /*----------------------------------------
88cdf0e10cSrcweir *
89cdf0e10cSrcweir * XMLFile2UTFConverter
90cdf0e10cSrcweir *
91cdf0e10cSrcweir **-----------------------------------------*/
92cdf0e10cSrcweir class XMLFile2UTFConverter
93cdf0e10cSrcweir {
94cdf0e10cSrcweir public:
XMLFile2UTFConverter()95cdf0e10cSrcweir 	XMLFile2UTFConverter( ):
96cdf0e10cSrcweir 		m_bStarted( sal_False ),
97cdf0e10cSrcweir 		m_pText2Unicode( 0 ),
98cdf0e10cSrcweir 		m_pUnicode2Text( 0 )
99cdf0e10cSrcweir 		{}
100cdf0e10cSrcweir 
101cdf0e10cSrcweir 	~XMLFile2UTFConverter();
102cdf0e10cSrcweir 
setInputStream(::com::sun::star::uno::Reference<::com::sun::star::io::XInputStream> & r)103cdf0e10cSrcweir 	void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; }
setEncoding(const::rtl::OString & s)104cdf0e10cSrcweir 	void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; }
105cdf0e10cSrcweir 
106cdf0e10cSrcweir 
107cdf0e10cSrcweir 
108cdf0e10cSrcweir 	// @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There
109cdf0e10cSrcweir 	//                   may be returned less or more bytes than ordered.
110cdf0e10cSrcweir 	sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
111cdf0e10cSrcweir 		throw ( ::com::sun::star::io::IOException,
112cdf0e10cSrcweir 				::com::sun::star::io::NotConnectedException ,
113cdf0e10cSrcweir 				::com::sun::star::io::BufferSizeExceededException ,
114cdf0e10cSrcweir 				::com::sun::star::uno::RuntimeException );
115cdf0e10cSrcweir 
116cdf0e10cSrcweir private:
117cdf0e10cSrcweir 
118cdf0e10cSrcweir 	// Called only on first Sequence of bytes. Tries to figure out file format and encoding information.
119cdf0e10cSrcweir 	// @return TRUE, when encoding information could be retrieved
120cdf0e10cSrcweir 	// @return FALSE, when no encoding information was found in file
121cdf0e10cSrcweir 	sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
122cdf0e10cSrcweir 
123cdf0e10cSrcweir 	// Called only on first Sequence of bytes. Tries to figure out
124cdf0e10cSrcweir 	// if enough data is available to scan encoding
125cdf0e10cSrcweir 	// @return TRUE, when encoding is retrievable
126cdf0e10cSrcweir 	// @return FALSE, when more data is needed
127cdf0e10cSrcweir 	sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq );
128cdf0e10cSrcweir 
129cdf0e10cSrcweir 	// When encoding attribute is within the text (in the first line), it is removed.
130cdf0e10cSrcweir 	void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
131cdf0e10cSrcweir 
132cdf0e10cSrcweir 	// Initializes decoding depending on m_sEncoding setting
133cdf0e10cSrcweir 	void initializeDecoding();
134cdf0e10cSrcweir private:
135cdf0e10cSrcweir 	::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >  m_in;
136cdf0e10cSrcweir 
137cdf0e10cSrcweir 	sal_Bool m_bStarted;
138cdf0e10cSrcweir 	::rtl::OString m_sEncoding;
139cdf0e10cSrcweir 
140cdf0e10cSrcweir 	Text2UnicodeConverter *m_pText2Unicode;
141cdf0e10cSrcweir 	Unicode2TextConverter *m_pUnicode2Text;
142cdf0e10cSrcweir };
143cdf0e10cSrcweir }
144