1 /*************************************************************************
2 *
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
6 *
7 * OpenOffice.org - a multi-platform office productivity suite
8 *
9 * This file is part of OpenOffice.org.
10 *
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
14 *
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
20 *
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org.  If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
25 *
26 ************************************************************************/
27 
28 #ifndef INCLUDED_XMLREADER_XMLREADER_HXX
29 #define INCLUDED_XMLREADER_XMLREADER_HXX
30 
31 #include "sal/config.h"
32 
33 #include <stack>
34 #include <vector>
35 
36 #include "boost/noncopyable.hpp"
37 #include "com/sun/star/container/NoSuchElementException.hpp"
38 #include "com/sun/star/uno/RuntimeException.hpp"
39 #include "osl/file.h"
40 #include "rtl/ustring.hxx"
41 #include "sal/types.h"
42 #include "xmlreader/detail/xmlreaderdllapi.hxx"
43 #include "xmlreader/pad.hxx"
44 #include "xmlreader/span.hxx"
45 
46 namespace xmlreader {
47 
48 class OOO_DLLPUBLIC_XMLREADER XmlReader: private boost::noncopyable {
49 public:
50     explicit XmlReader(rtl::OUString const & fileUrl)
51         SAL_THROW((
52             com::sun::star::container::NoSuchElementException,
53             com::sun::star::uno::RuntimeException));
54 
55     ~XmlReader();
56 
57     enum { NAMESPACE_NONE = -2, NAMESPACE_UNKNOWN = -1, NAMESPACE_XML = 0 };
58 
59     enum Text { TEXT_NONE, TEXT_RAW, TEXT_NORMALIZED };
60 
61     enum Result { RESULT_BEGIN, RESULT_END, RESULT_TEXT, RESULT_DONE };
62 
63     int registerNamespaceIri(Span const & iri);
64 
65     // RESULT_BEGIN: data = localName, ns = ns
66     // RESULT_END: data, ns unused
67     // RESULT_TEXT: data = text, ns unused
68     Result nextItem(Text reportText, Span * data, int * nsId);
69 
70     bool nextAttribute(int * nsId, Span * localName);
71 
72     // the span returned by getAttributeValue is only valid until the next call
73     // to nextItem or getAttributeValue
74     Span getAttributeValue(bool fullyNormalize);
75 
76     int getNamespaceId(Span const & prefix) const;
77 
78     rtl::OUString getUrl() const;
79 
80 private:
81     typedef std::vector< Span > NamespaceIris;
82 
83     // If NamespaceData (and similarly ElementData and AttributeData) is made
84     // SAL_DLLPRIVATE, at least gcc 4.2.3 erroneously warns about
85     // "'xmlreader::XmlReader' declared with greater visibility than the type of
86     // its field 'xmlreader::XmlReader::namespaces_'" (and similarly for
87     // elements_ and attributes_):
88 
89     struct NamespaceData {
90         Span prefix;
91         int nsId;
92 
93         NamespaceData() {}
94 
95         NamespaceData(Span const & thePrefix, int theNsId):
96             prefix(thePrefix), nsId(theNsId) {}
97     };
98 
99     typedef std::vector< NamespaceData > NamespaceList;
100 
101     struct ElementData {
102         Span name;
103         NamespaceList::size_type inheritedNamespaces;
104         int defaultNamespaceId;
105 
106         ElementData(
107             Span const & theName,
108             NamespaceList::size_type theInheritedNamespaces,
109             int theDefaultNamespaceId):
110             name(theName), inheritedNamespaces(theInheritedNamespaces),
111             defaultNamespaceId(theDefaultNamespaceId)
112         {}
113     };
114 
115     typedef std::stack< ElementData > ElementStack;
116 
117     struct AttributeData {
118         char const * nameBegin;
119         char const * nameEnd;
120         char const * nameColon;
121         char const * valueBegin;
122         char const * valueEnd;
123 
124         AttributeData(
125             char const * theNameBegin, char const * theNameEnd,
126             char const * theNameColon, char const * theValueBegin,
127             char const * theValueEnd):
128             nameBegin(theNameBegin), nameEnd(theNameEnd),
129             nameColon(theNameColon), valueBegin(theValueBegin),
130             valueEnd(theValueEnd)
131         {}
132     };
133 
134     typedef std::vector< AttributeData > Attributes;
135 
136     enum State {
137         STATE_CONTENT, STATE_START_TAG, STATE_END_TAG, STATE_EMPTY_ELEMENT_TAG,
138         STATE_DONE };
139 
140     SAL_DLLPRIVATE inline char read() { return pos_ == end_ ? '\0' : *pos_++; }
141 
142     SAL_DLLPRIVATE inline char peek() { return pos_ == end_ ? '\0' : *pos_; }
143 
144     SAL_DLLPRIVATE void normalizeLineEnds(Span const & text);
145 
146     SAL_DLLPRIVATE void skipSpace();
147 
148     SAL_DLLPRIVATE bool skipComment();
149 
150     SAL_DLLPRIVATE void skipProcessingInstruction();
151 
152     SAL_DLLPRIVATE void skipDocumentTypeDeclaration();
153 
154     SAL_DLLPRIVATE Span scanCdataSection();
155 
156     SAL_DLLPRIVATE bool scanName(char const ** nameColon);
157 
158     SAL_DLLPRIVATE int scanNamespaceIri(
159         char const * begin, char const * end);
160 
161     SAL_DLLPRIVATE char const * handleReference(
162         char const * position, char const * end);
163 
164     SAL_DLLPRIVATE Span handleAttributeValue(
165         char const * begin, char const * end, bool fullyNormalize);
166 
167     SAL_DLLPRIVATE Result handleStartTag(int * nsId, Span * localName);
168 
169     SAL_DLLPRIVATE Result handleEndTag();
170 
171     SAL_DLLPRIVATE void handleElementEnd();
172 
173     SAL_DLLPRIVATE Result handleSkippedText(Span * data, int * nsId);
174 
175     SAL_DLLPRIVATE Result handleRawText(Span * text);
176 
177     SAL_DLLPRIVATE Result handleNormalizedText(Span * text);
178 
179     SAL_DLLPRIVATE int toNamespaceId(NamespaceIris::size_type pos);
180 
181     rtl::OUString fileUrl_;
182     oslFileHandle fileHandle_;
183     sal_uInt64 fileSize_;
184     void * fileAddress_;
185     NamespaceIris namespaceIris_;
186     NamespaceList namespaces_;
187     ElementStack elements_;
188     char const * pos_;
189     char const * end_;
190     State state_;
191     Attributes attributes_;
192     Attributes::iterator currentAttribute_;
193     bool firstAttribute_;
194     Pad pad_;
195 };
196 
197 }
198 
199 #endif
200