xref: /trunk/main/sax/inc/xml2utf.hxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
// TODO: Where from? (translated from the original German "Woher?";
//       presumably asks which shared header should supply these helpers -
//       confirm before moving them.)
//
// Function-like maximum / minimum helpers.
//  - Each argument and the complete expansion are parenthesised, so operator
//    precedence at the call site cannot change the result.
//  - The selected argument is expanded twice (once in the comparison and once
//    as the result); do not pass expressions with side effects, e.g.
//    Max( i++, j ).
#define Max( a, b )     (((a)>(b)) ? (a) : (b) )
#define Min( a, b )     (((a)<(b)) ? (a) : (b) )
31*cdf0e10cSrcweir 
/*
*
* Text2UnicodeConverter
*
**/
37*cdf0e10cSrcweir namespace sax_expatwrap {
38*cdf0e10cSrcweir 
39*cdf0e10cSrcweir class Text2UnicodeConverter
40*cdf0e10cSrcweir {
41*cdf0e10cSrcweir 
42*cdf0e10cSrcweir public:
43*cdf0e10cSrcweir     Text2UnicodeConverter( const ::rtl::OString & sEncoding );
44*cdf0e10cSrcweir     ~Text2UnicodeConverter();
45*cdf0e10cSrcweir 
46*cdf0e10cSrcweir     ::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & );
47*cdf0e10cSrcweir     sal_Bool canContinue() {  return m_bCanContinue; }
48*cdf0e10cSrcweir 
49*cdf0e10cSrcweir private:
50*cdf0e10cSrcweir     void init( rtl_TextEncoding encoding );
51*cdf0e10cSrcweir 
52*cdf0e10cSrcweir     rtl_TextToUnicodeConverter  m_convText2Unicode;
53*cdf0e10cSrcweir     rtl_TextToUnicodeContext    m_contextText2Unicode;
54*cdf0e10cSrcweir     sal_Bool                    m_bCanContinue;
55*cdf0e10cSrcweir     sal_Bool                    m_bInitialized;
56*cdf0e10cSrcweir     rtl_TextEncoding            m_rtlEncoding;
57*cdf0e10cSrcweir     ::com::sun::star::uno::Sequence<sal_Int8> m_seqSource;
58*cdf0e10cSrcweir };
59*cdf0e10cSrcweir 
/*----------------------------------------
*
* Unicode2TextConverter
*
**-----------------------------------------*/
65*cdf0e10cSrcweir class Unicode2TextConverter
66*cdf0e10cSrcweir {
67*cdf0e10cSrcweir public:
68*cdf0e10cSrcweir     Unicode2TextConverter( rtl_TextEncoding encoding );
69*cdf0e10cSrcweir     ~Unicode2TextConverter();
70*cdf0e10cSrcweir 
71*cdf0e10cSrcweir     inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s )
72*cdf0e10cSrcweir         {
73*cdf0e10cSrcweir             return convert( s.getStr() , s.getLength() );
74*cdf0e10cSrcweir         }
75*cdf0e10cSrcweir     ::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength );
76*cdf0e10cSrcweir     sal_Bool canContinue() {  return m_bCanContinue; }
77*cdf0e10cSrcweir 
78*cdf0e10cSrcweir private:
79*cdf0e10cSrcweir     void init( rtl_TextEncoding encoding );
80*cdf0e10cSrcweir 
81*cdf0e10cSrcweir     rtl_UnicodeToTextConverter  m_convUnicode2Text;
82*cdf0e10cSrcweir     rtl_UnicodeToTextContext    m_contextUnicode2Text;
83*cdf0e10cSrcweir     sal_Bool                    m_bCanContinue;
84*cdf0e10cSrcweir     sal_Bool                    m_bInitialized;
85*cdf0e10cSrcweir     rtl_TextEncoding            m_rtlEncoding;
86*cdf0e10cSrcweir     ::com::sun::star::uno::Sequence<sal_Unicode>        m_seqSource;
87*cdf0e10cSrcweir };
88*cdf0e10cSrcweir 
89*cdf0e10cSrcweir 
90*cdf0e10cSrcweir 
/*----------------------------------------
*
* XMLFile2UTFConverter
*
**-----------------------------------------*/
96*cdf0e10cSrcweir class XMLFile2UTFConverter
97*cdf0e10cSrcweir {
98*cdf0e10cSrcweir public:
99*cdf0e10cSrcweir     XMLFile2UTFConverter( ):
100*cdf0e10cSrcweir         m_bStarted( sal_False ),
101*cdf0e10cSrcweir         m_pText2Unicode( 0 ),
102*cdf0e10cSrcweir         m_pUnicode2Text( 0 )
103*cdf0e10cSrcweir         {}
104*cdf0e10cSrcweir 
105*cdf0e10cSrcweir     ~XMLFile2UTFConverter();
106*cdf0e10cSrcweir 
107*cdf0e10cSrcweir     void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; }
108*cdf0e10cSrcweir     void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; }
109*cdf0e10cSrcweir 
110*cdf0e10cSrcweir 
111*cdf0e10cSrcweir 
112*cdf0e10cSrcweir     // @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There
113*cdf0e10cSrcweir     //                   may be returned less or more bytes than ordered.
114*cdf0e10cSrcweir     sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
115*cdf0e10cSrcweir         throw ( ::com::sun::star::io::IOException,
116*cdf0e10cSrcweir                 ::com::sun::star::io::NotConnectedException ,
117*cdf0e10cSrcweir                 ::com::sun::star::io::BufferSizeExceededException ,
118*cdf0e10cSrcweir                 ::com::sun::star::uno::RuntimeException );
119*cdf0e10cSrcweir 
120*cdf0e10cSrcweir private:
121*cdf0e10cSrcweir 
122*cdf0e10cSrcweir     // Called only on first Sequence of bytes. Tries to figure out file format and encoding information.
123*cdf0e10cSrcweir     // @return TRUE, when encoding information could be retrieved
124*cdf0e10cSrcweir     // @return FALSE, when no encoding information was found in file
125*cdf0e10cSrcweir     sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
126*cdf0e10cSrcweir 
127*cdf0e10cSrcweir     // Called only on first Sequence of bytes. Tries to figure out
128*cdf0e10cSrcweir     // if enough data is available to scan encoding
129*cdf0e10cSrcweir     // @return TRUE, when encoding is retrievable
130*cdf0e10cSrcweir     // @return FALSE, when more data is needed
131*cdf0e10cSrcweir     sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq );
132*cdf0e10cSrcweir 
133*cdf0e10cSrcweir     // When encoding attribute is within the text (in the first line), it is removed.
134*cdf0e10cSrcweir     void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
135*cdf0e10cSrcweir 
136*cdf0e10cSrcweir     // Initializes decoding depending on m_sEncoding setting
137*cdf0e10cSrcweir     void initializeDecoding();
138*cdf0e10cSrcweir private:
139*cdf0e10cSrcweir     ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >  m_in;
140*cdf0e10cSrcweir 
141*cdf0e10cSrcweir     sal_Bool m_bStarted;
142*cdf0e10cSrcweir     ::rtl::OString m_sEncoding;
143*cdf0e10cSrcweir 
144*cdf0e10cSrcweir     Text2UnicodeConverter *m_pText2Unicode;
145*cdf0e10cSrcweir     Unicode2TextConverter *m_pUnicode2Text;
146*cdf0e10cSrcweir };
147*cdf0e10cSrcweir }
148