xref: /trunk/main/sax/inc/xml2utf.hxx (revision 1ecadb572e7010ff3b3382ad9bf179dbc6efadbb)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
// TODO: where do these come from? (is there a shared definition to use instead?)
//
// Function-like minimum/maximum helpers. Because they are macros, each
// argument is evaluated once for the comparison and the selected one a second
// time for the result, so never pass expressions with side effects.
#define Min( a, b )     ( ((a) < (b)) ? (a) : (b) )
#define Max( a, b )     ( ((a) > (b)) ? (a) : (b) )
31 
32 /*
33 *
34 * Text2UnicodeConverter
35 *
36 **/
37 namespace sax_expatwrap {
38 
39 class Text2UnicodeConverter
40 {
41 
42 public:
43     Text2UnicodeConverter( const ::rtl::OString & sEncoding );
44     ~Text2UnicodeConverter();
45 
46     ::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & );
47     sal_Bool canContinue() {  return m_bCanContinue; }
48 
49 private:
50     void init( rtl_TextEncoding encoding );
51 
52     rtl_TextToUnicodeConverter  m_convText2Unicode;
53     rtl_TextToUnicodeContext    m_contextText2Unicode;
54     sal_Bool                    m_bCanContinue;
55     sal_Bool                    m_bInitialized;
56     rtl_TextEncoding            m_rtlEncoding;
57     ::com::sun::star::uno::Sequence<sal_Int8> m_seqSource;
58 };
59 
60 /*----------------------------------------
61 *
62 * Unicode2TextConverter
63 *
64 **-----------------------------------------*/
65 class Unicode2TextConverter
66 {
67 public:
68     Unicode2TextConverter( rtl_TextEncoding encoding );
69     ~Unicode2TextConverter();
70 
71     inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s )
72         {
73             return convert( s.getStr() , s.getLength() );
74         }
75     ::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength );
76     sal_Bool canContinue() {  return m_bCanContinue; }
77 
78 private:
79     void init( rtl_TextEncoding encoding );
80 
81     rtl_UnicodeToTextConverter  m_convUnicode2Text;
82     rtl_UnicodeToTextContext    m_contextUnicode2Text;
83     sal_Bool                    m_bCanContinue;
84     sal_Bool                    m_bInitialized;
85     rtl_TextEncoding            m_rtlEncoding;
86     ::com::sun::star::uno::Sequence<sal_Unicode>        m_seqSource;
87 };
88 
89 
90 
91 /*----------------------------------------
92 *
93 * XMLFile2UTFConverter
94 *
95 **-----------------------------------------*/
96 class XMLFile2UTFConverter
97 {
98 public:
99     XMLFile2UTFConverter( ):
100         m_bStarted( sal_False ),
101         m_pText2Unicode( 0 ),
102         m_pUnicode2Text( 0 )
103         {}
104 
105     ~XMLFile2UTFConverter();
106 
107     void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; }
108     void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; }
109 
110 
111 
112     // @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There
113     //                   may be returned less or more bytes than ordered.
114     sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
115         throw ( ::com::sun::star::io::IOException,
116                 ::com::sun::star::io::NotConnectedException ,
117                 ::com::sun::star::io::BufferSizeExceededException ,
118                 ::com::sun::star::uno::RuntimeException );
119 
120 private:
121 
122     // Called only on first Sequence of bytes. Tries to figure out file format and encoding information.
123     // @return TRUE, when encoding information could be retrieved
124     // @return FALSE, when no encoding information was found in file
125     sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
126 
127     // Called only on first Sequence of bytes. Tries to figure out
128     // if enough data is available to scan encoding
129     // @return TRUE, when encoding is retrievable
130     // @return FALSE, when more data is needed
131     sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq );
132 
133     // When encoding attribute is within the text (in the first line), it is removed.
134     void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
135 
136     // Initializes decoding depending on m_sEncoding setting
137     void initializeDecoding();
138 private:
139     ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >  m_in;
140 
141     sal_Bool m_bStarted;
142     ::rtl::OString m_sEncoding;
143 
144     Text2UnicodeConverter *m_pText2Unicode;
145     Unicode2TextConverter *m_pUnicode2Text;
146 };
147 }
148