xref: /trunk/main/xmlreader/source/xmlreader.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir *
3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir *
5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir *
7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir *
9*cdf0e10cSrcweir * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir *
11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir *
15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir *
21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir *
26*cdf0e10cSrcweir ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir #include "precompiled_xmlreader.hxx"
29*cdf0e10cSrcweir #include "sal/config.h"
30*cdf0e10cSrcweir 
31*cdf0e10cSrcweir #include <climits>
32*cdf0e10cSrcweir #include <cstddef>
33*cdf0e10cSrcweir 
34*cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp"
35*cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx"
36*cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp"
37*cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp"
38*cdf0e10cSrcweir #include "osl/diagnose.h"
39*cdf0e10cSrcweir #include "osl/file.h"
40*cdf0e10cSrcweir #include "rtl/string.h"
41*cdf0e10cSrcweir #include "rtl/ustring.h"
42*cdf0e10cSrcweir #include "rtl/ustring.hxx"
43*cdf0e10cSrcweir #include "sal/types.h"
44*cdf0e10cSrcweir #include "xmlreader/pad.hxx"
45*cdf0e10cSrcweir #include "xmlreader/span.hxx"
46*cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx"
47*cdf0e10cSrcweir 
48*cdf0e10cSrcweir namespace xmlreader {
49*cdf0e10cSrcweir 
50*cdf0e10cSrcweir namespace {
51*cdf0e10cSrcweir 
52*cdf0e10cSrcweir namespace css = com::sun::star;
53*cdf0e10cSrcweir 
// Tells whether c is one of the four white space characters recognized by the
// reader: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
65*cdf0e10cSrcweir 
66*cdf0e10cSrcweir }
67*cdf0e10cSrcweir 
68*cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl)
69*cdf0e10cSrcweir     SAL_THROW((
70*cdf0e10cSrcweir         css::container::NoSuchElementException, css::uno::RuntimeException)):
71*cdf0e10cSrcweir     fileUrl_(fileUrl)
72*cdf0e10cSrcweir {
73*cdf0e10cSrcweir     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
74*cdf0e10cSrcweir     {
75*cdf0e10cSrcweir     case osl_File_E_None:
76*cdf0e10cSrcweir         break;
77*cdf0e10cSrcweir     case osl_File_E_NOENT:
78*cdf0e10cSrcweir         throw css::container::NoSuchElementException(
79*cdf0e10cSrcweir             fileUrl_, css::uno::Reference< css::uno::XInterface >());
80*cdf0e10cSrcweir     default:
81*cdf0e10cSrcweir         throw css::uno::RuntimeException(
82*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
83*cdf0e10cSrcweir              fileUrl_),
84*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
85*cdf0e10cSrcweir     }
86*cdf0e10cSrcweir     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
87*cdf0e10cSrcweir     if (e == osl_File_E_None) {
88*cdf0e10cSrcweir         e = osl_mapFile(
89*cdf0e10cSrcweir             fileHandle_, &fileAddress_, fileSize_, 0,
90*cdf0e10cSrcweir             osl_File_MapFlag_WillNeed);
91*cdf0e10cSrcweir     }
92*cdf0e10cSrcweir     if (e != osl_File_E_None) {
93*cdf0e10cSrcweir         e = osl_closeFile(fileHandle_);
94*cdf0e10cSrcweir         if (e != osl_File_E_None) {
95*cdf0e10cSrcweir             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
96*cdf0e10cSrcweir         }
97*cdf0e10cSrcweir         throw css::uno::RuntimeException(
98*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
99*cdf0e10cSrcweir              fileUrl_),
100*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
101*cdf0e10cSrcweir     }
102*cdf0e10cSrcweir     namespaceIris_.push_back(
103*cdf0e10cSrcweir         Span(
104*cdf0e10cSrcweir             RTL_CONSTASCII_STRINGPARAM(
105*cdf0e10cSrcweir                 "http://www.w3.org/XML/1998/namespace")));
106*cdf0e10cSrcweir     namespaces_.push_back(
107*cdf0e10cSrcweir         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
108*cdf0e10cSrcweir     pos_ = static_cast< char * >(fileAddress_);
109*cdf0e10cSrcweir     end_ = pos_ + fileSize_;
110*cdf0e10cSrcweir     state_ = STATE_CONTENT;
111*cdf0e10cSrcweir }
112*cdf0e10cSrcweir 
113*cdf0e10cSrcweir XmlReader::~XmlReader() {
114*cdf0e10cSrcweir     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
115*cdf0e10cSrcweir     if (e != osl_File_E_None) {
116*cdf0e10cSrcweir         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
117*cdf0e10cSrcweir     }
118*cdf0e10cSrcweir     e = osl_closeFile(fileHandle_);
119*cdf0e10cSrcweir     if (e != osl_File_E_None) {
120*cdf0e10cSrcweir         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
121*cdf0e10cSrcweir     }
122*cdf0e10cSrcweir }
123*cdf0e10cSrcweir 
124*cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) {
125*cdf0e10cSrcweir     int id = toNamespaceId(namespaceIris_.size());
126*cdf0e10cSrcweir     namespaceIris_.push_back(iri);
127*cdf0e10cSrcweir     if (iri.equals(
128*cdf0e10cSrcweir             Span(
129*cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM(
130*cdf0e10cSrcweir                     "http://www.w3.org/2001/XMLSchema-instance"))))
131*cdf0e10cSrcweir     {
132*cdf0e10cSrcweir         // Old user layer .xcu files used the xsi namespace prefix without
133*cdf0e10cSrcweir         // declaring a corresponding namespace binding, see issue 77174; reading
134*cdf0e10cSrcweir         // those files during migration would fail without this hack that can be
135*cdf0e10cSrcweir         // removed once migration is no longer relevant (see
136*cdf0e10cSrcweir         // configmgr::Components::parseModificationLayer):
137*cdf0e10cSrcweir         namespaces_.push_back(
138*cdf0e10cSrcweir             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
139*cdf0e10cSrcweir     }
140*cdf0e10cSrcweir     return id;
141*cdf0e10cSrcweir }
142*cdf0e10cSrcweir 
143*cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
144*cdf0e10cSrcweir {
145*cdf0e10cSrcweir     switch (state_) {
146*cdf0e10cSrcweir     case STATE_CONTENT:
147*cdf0e10cSrcweir         switch (reportText) {
148*cdf0e10cSrcweir         case TEXT_NONE:
149*cdf0e10cSrcweir             return handleSkippedText(data, nsId);
150*cdf0e10cSrcweir         case TEXT_RAW:
151*cdf0e10cSrcweir             return handleRawText(data);
152*cdf0e10cSrcweir         case TEXT_NORMALIZED:
153*cdf0e10cSrcweir             return handleNormalizedText(data);
154*cdf0e10cSrcweir         }
155*cdf0e10cSrcweir     case STATE_START_TAG:
156*cdf0e10cSrcweir         return handleStartTag(nsId, data);
157*cdf0e10cSrcweir     case STATE_END_TAG:
158*cdf0e10cSrcweir         return handleEndTag();
159*cdf0e10cSrcweir     case STATE_EMPTY_ELEMENT_TAG:
160*cdf0e10cSrcweir         handleElementEnd();
161*cdf0e10cSrcweir         return RESULT_END;
162*cdf0e10cSrcweir     default: // STATE_DONE
163*cdf0e10cSrcweir         return RESULT_DONE;
164*cdf0e10cSrcweir     }
165*cdf0e10cSrcweir }
166*cdf0e10cSrcweir 
167*cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) {
168*cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName != 0);
169*cdf0e10cSrcweir     if (firstAttribute_) {
170*cdf0e10cSrcweir         currentAttribute_ = attributes_.begin();
171*cdf0e10cSrcweir         firstAttribute_ = false;
172*cdf0e10cSrcweir     } else {
173*cdf0e10cSrcweir         ++currentAttribute_;
174*cdf0e10cSrcweir     }
175*cdf0e10cSrcweir     if (currentAttribute_ == attributes_.end()) {
176*cdf0e10cSrcweir         return false;
177*cdf0e10cSrcweir     }
178*cdf0e10cSrcweir     if (currentAttribute_->nameColon == 0) {
179*cdf0e10cSrcweir         *nsId = NAMESPACE_NONE;
180*cdf0e10cSrcweir         *localName = Span(
181*cdf0e10cSrcweir             currentAttribute_->nameBegin,
182*cdf0e10cSrcweir             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
183*cdf0e10cSrcweir     } else {
184*cdf0e10cSrcweir         *nsId = getNamespaceId(
185*cdf0e10cSrcweir             Span(
186*cdf0e10cSrcweir                 currentAttribute_->nameBegin,
187*cdf0e10cSrcweir                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
188*cdf0e10cSrcweir         *localName = Span(
189*cdf0e10cSrcweir             currentAttribute_->nameColon + 1,
190*cdf0e10cSrcweir             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
191*cdf0e10cSrcweir     }
192*cdf0e10cSrcweir     return true;
193*cdf0e10cSrcweir }
194*cdf0e10cSrcweir 
195*cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) {
196*cdf0e10cSrcweir     return handleAttributeValue(
197*cdf0e10cSrcweir         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
198*cdf0e10cSrcweir         fullyNormalize);
199*cdf0e10cSrcweir }
200*cdf0e10cSrcweir 
201*cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const {
202*cdf0e10cSrcweir     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
203*cdf0e10cSrcweir          i != namespaces_.rend(); ++i)
204*cdf0e10cSrcweir     {
205*cdf0e10cSrcweir         if (prefix.equals(i->prefix)) {
206*cdf0e10cSrcweir             return i->nsId;
207*cdf0e10cSrcweir         }
208*cdf0e10cSrcweir     }
209*cdf0e10cSrcweir     return NAMESPACE_UNKNOWN;
210*cdf0e10cSrcweir }
211*cdf0e10cSrcweir 
212*cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const {
213*cdf0e10cSrcweir     return fileUrl_;
214*cdf0e10cSrcweir }
215*cdf0e10cSrcweir 
216*cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) {
217*cdf0e10cSrcweir     char const * p = text.begin;
218*cdf0e10cSrcweir     sal_Int32 n = text.length;
219*cdf0e10cSrcweir     for (;;) {
220*cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
221*cdf0e10cSrcweir         if (i < 0) {
222*cdf0e10cSrcweir             break;
223*cdf0e10cSrcweir         }
224*cdf0e10cSrcweir         pad_.add(p, i);
225*cdf0e10cSrcweir         p += i + 1;
226*cdf0e10cSrcweir         n -= i + 1;
227*cdf0e10cSrcweir         if (n == 0 || *p != '\x0A') {
228*cdf0e10cSrcweir             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
229*cdf0e10cSrcweir         }
230*cdf0e10cSrcweir     }
231*cdf0e10cSrcweir     pad_.add(p, n);
232*cdf0e10cSrcweir }
233*cdf0e10cSrcweir 
234*cdf0e10cSrcweir void XmlReader::skipSpace() {
235*cdf0e10cSrcweir     while (isSpace(peek())) {
236*cdf0e10cSrcweir         ++pos_;
237*cdf0e10cSrcweir     }
238*cdf0e10cSrcweir }
239*cdf0e10cSrcweir 
240*cdf0e10cSrcweir bool XmlReader::skipComment() {
241*cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
242*cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
243*cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("--")) !=
244*cdf0e10cSrcweir         0)
245*cdf0e10cSrcweir     {
246*cdf0e10cSrcweir         return false;
247*cdf0e10cSrcweir     }
248*cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("--");
249*cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
250*cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
251*cdf0e10cSrcweir     if (i < 0) {
252*cdf0e10cSrcweir         throw css::uno::RuntimeException(
253*cdf0e10cSrcweir             (rtl::OUString(
254*cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
255*cdf0e10cSrcweir                     "premature end (within comment) of ")) +
256*cdf0e10cSrcweir              fileUrl_),
257*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
258*cdf0e10cSrcweir     }
259*cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("--");
260*cdf0e10cSrcweir     if (read() != '>') {
261*cdf0e10cSrcweir         throw css::uno::RuntimeException(
262*cdf0e10cSrcweir             (rtl::OUString(
263*cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
264*cdf0e10cSrcweir                     "illegal \"--\" within comment in ")) +
265*cdf0e10cSrcweir              fileUrl_),
266*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
267*cdf0e10cSrcweir     }
268*cdf0e10cSrcweir     return true;
269*cdf0e10cSrcweir }
270*cdf0e10cSrcweir 
271*cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() {
272*cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
273*cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
274*cdf0e10cSrcweir     if (i < 0) {
275*cdf0e10cSrcweir         throw css::uno::RuntimeException(
276*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
277*cdf0e10cSrcweir              fileUrl_),
278*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
279*cdf0e10cSrcweir     }
280*cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
281*cdf0e10cSrcweir }
282*cdf0e10cSrcweir 
283*cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() {
284*cdf0e10cSrcweir     // Neither is it checked that the doctypedecl is at the correct position in
285*cdf0e10cSrcweir     // the document, nor that it is well-formed:
286*cdf0e10cSrcweir     for (;;) {
287*cdf0e10cSrcweir         char c = read();
288*cdf0e10cSrcweir         switch (c) {
289*cdf0e10cSrcweir         case '\0': // i.e., EOF
290*cdf0e10cSrcweir             throw css::uno::RuntimeException(
291*cdf0e10cSrcweir                 (rtl::OUString(
292*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
293*cdf0e10cSrcweir                         "premature end (within DTD) of ")) +
294*cdf0e10cSrcweir                  fileUrl_),
295*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
296*cdf0e10cSrcweir         case '"':
297*cdf0e10cSrcweir         case '\'':
298*cdf0e10cSrcweir             {
299*cdf0e10cSrcweir                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
300*cdf0e10cSrcweir                     pos_, end_ - pos_, c);
301*cdf0e10cSrcweir                 if (i < 0) {
302*cdf0e10cSrcweir                     throw css::uno::RuntimeException(
303*cdf0e10cSrcweir                         (rtl::OUString(
304*cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
305*cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
306*cdf0e10cSrcweir                          fileUrl_),
307*cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
308*cdf0e10cSrcweir                 }
309*cdf0e10cSrcweir                 pos_ += i + 1;
310*cdf0e10cSrcweir             }
311*cdf0e10cSrcweir             break;
312*cdf0e10cSrcweir         case '>':
313*cdf0e10cSrcweir             return;
314*cdf0e10cSrcweir         case '[':
315*cdf0e10cSrcweir             for (;;) {
316*cdf0e10cSrcweir                 c = read();
317*cdf0e10cSrcweir                 switch (c) {
318*cdf0e10cSrcweir                 case '\0': // i.e., EOF
319*cdf0e10cSrcweir                     throw css::uno::RuntimeException(
320*cdf0e10cSrcweir                         (rtl::OUString(
321*cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
322*cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
323*cdf0e10cSrcweir                          fileUrl_),
324*cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
325*cdf0e10cSrcweir                 case '"':
326*cdf0e10cSrcweir                 case '\'':
327*cdf0e10cSrcweir                     {
328*cdf0e10cSrcweir                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
329*cdf0e10cSrcweir                             pos_, end_ - pos_, c);
330*cdf0e10cSrcweir                         if (i < 0) {
331*cdf0e10cSrcweir                             throw css::uno::RuntimeException(
332*cdf0e10cSrcweir                             (rtl::OUString(
333*cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
334*cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
335*cdf0e10cSrcweir                              fileUrl_),
336*cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
337*cdf0e10cSrcweir                         }
338*cdf0e10cSrcweir                         pos_ += i + 1;
339*cdf0e10cSrcweir                     }
340*cdf0e10cSrcweir                     break;
341*cdf0e10cSrcweir                 case '<':
342*cdf0e10cSrcweir                     switch (read()) {
343*cdf0e10cSrcweir                     case '\0': // i.e., EOF
344*cdf0e10cSrcweir                         throw css::uno::RuntimeException(
345*cdf0e10cSrcweir                             (rtl::OUString(
346*cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
347*cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
348*cdf0e10cSrcweir                              fileUrl_),
349*cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
350*cdf0e10cSrcweir                     case '!':
351*cdf0e10cSrcweir                         skipComment();
352*cdf0e10cSrcweir                         break;
353*cdf0e10cSrcweir                     case '?':
354*cdf0e10cSrcweir                         skipProcessingInstruction();
355*cdf0e10cSrcweir                         break;
356*cdf0e10cSrcweir                     default:
357*cdf0e10cSrcweir                         break;
358*cdf0e10cSrcweir                     }
359*cdf0e10cSrcweir                     break;
360*cdf0e10cSrcweir                 case ']':
361*cdf0e10cSrcweir                     skipSpace();
362*cdf0e10cSrcweir                     if (read() != '>') {
363*cdf0e10cSrcweir                         throw css::uno::RuntimeException(
364*cdf0e10cSrcweir                             (rtl::OUString(
365*cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
366*cdf0e10cSrcweir                                     "missing \">\" of DTD in ")) +
367*cdf0e10cSrcweir                              fileUrl_),
368*cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
369*cdf0e10cSrcweir                     }
370*cdf0e10cSrcweir                     return;
371*cdf0e10cSrcweir                 default:
372*cdf0e10cSrcweir                     break;
373*cdf0e10cSrcweir                 }
374*cdf0e10cSrcweir             }
375*cdf0e10cSrcweir         default:
376*cdf0e10cSrcweir             break;
377*cdf0e10cSrcweir         }
378*cdf0e10cSrcweir     }
379*cdf0e10cSrcweir }
380*cdf0e10cSrcweir 
381*cdf0e10cSrcweir Span XmlReader::scanCdataSection() {
382*cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
383*cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
384*cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
385*cdf0e10cSrcweir         0)
386*cdf0e10cSrcweir     {
387*cdf0e10cSrcweir         return Span();
388*cdf0e10cSrcweir     }
389*cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
390*cdf0e10cSrcweir     char const * begin = pos_;
391*cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
392*cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
393*cdf0e10cSrcweir     if (i < 0) {
394*cdf0e10cSrcweir         throw css::uno::RuntimeException(
395*cdf0e10cSrcweir             (rtl::OUString(
396*cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
397*cdf0e10cSrcweir                     "premature end (within CDATA section) of ")) +
398*cdf0e10cSrcweir              fileUrl_),
399*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
400*cdf0e10cSrcweir     }
401*cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
402*cdf0e10cSrcweir     return Span(begin, i);
403*cdf0e10cSrcweir }
404*cdf0e10cSrcweir 
405*cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) {
406*cdf0e10cSrcweir     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
407*cdf0e10cSrcweir     for (char const * begin = pos_;; ++pos_) {
408*cdf0e10cSrcweir         switch (peek()) {
409*cdf0e10cSrcweir         case '\0': // i.e., EOF
410*cdf0e10cSrcweir         case '\x09':
411*cdf0e10cSrcweir         case '\x0A':
412*cdf0e10cSrcweir         case '\x0D':
413*cdf0e10cSrcweir         case ' ':
414*cdf0e10cSrcweir         case '/':
415*cdf0e10cSrcweir         case '=':
416*cdf0e10cSrcweir         case '>':
417*cdf0e10cSrcweir             return pos_ != begin;
418*cdf0e10cSrcweir         case ':':
419*cdf0e10cSrcweir             *nameColon = pos_;
420*cdf0e10cSrcweir             break;
421*cdf0e10cSrcweir         default:
422*cdf0e10cSrcweir             break;
423*cdf0e10cSrcweir         }
424*cdf0e10cSrcweir     }
425*cdf0e10cSrcweir }
426*cdf0e10cSrcweir 
427*cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
428*cdf0e10cSrcweir     OSL_ASSERT(begin != 0 && begin <= end);
429*cdf0e10cSrcweir     Span iri(handleAttributeValue(begin, end, false));
430*cdf0e10cSrcweir     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
431*cdf0e10cSrcweir         if (namespaceIris_[i].equals(iri)) {
432*cdf0e10cSrcweir             return toNamespaceId(i);
433*cdf0e10cSrcweir         }
434*cdf0e10cSrcweir     }
435*cdf0e10cSrcweir     return XmlReader::NAMESPACE_UNKNOWN;
436*cdf0e10cSrcweir }
437*cdf0e10cSrcweir 
438*cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end)
439*cdf0e10cSrcweir {
440*cdf0e10cSrcweir     OSL_ASSERT(position != 0 && *position == '&' && position < end);
441*cdf0e10cSrcweir     ++position;
442*cdf0e10cSrcweir     if (*position == '#') {
443*cdf0e10cSrcweir         ++position;
444*cdf0e10cSrcweir         sal_Int32 val = 0;
445*cdf0e10cSrcweir         char const * p;
446*cdf0e10cSrcweir         if (*position == 'x') {
447*cdf0e10cSrcweir             ++position;
448*cdf0e10cSrcweir             p = position;
449*cdf0e10cSrcweir             for (;; ++position) {
450*cdf0e10cSrcweir                 char c = *position;
451*cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
452*cdf0e10cSrcweir                     val = 16 * val + (c - '0');
453*cdf0e10cSrcweir                 } else if (c >= 'A' && c <= 'F') {
454*cdf0e10cSrcweir                     val = 16 * val + (c - 'A') + 10;
455*cdf0e10cSrcweir                 } else if (c >= 'a' && c <= 'f') {
456*cdf0e10cSrcweir                     val = 16 * val + (c - 'a') + 10;
457*cdf0e10cSrcweir                 } else {
458*cdf0e10cSrcweir                     break;
459*cdf0e10cSrcweir                 }
460*cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
461*cdf0e10cSrcweir                     throw css::uno::RuntimeException(
462*cdf0e10cSrcweir                         (rtl::OUString(
463*cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
464*cdf0e10cSrcweir                                 "'&#x...' too large in ")) +
465*cdf0e10cSrcweir                          fileUrl_),
466*cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
467*cdf0e10cSrcweir                 }
468*cdf0e10cSrcweir             }
469*cdf0e10cSrcweir         } else {
470*cdf0e10cSrcweir             p = position;
471*cdf0e10cSrcweir             for (;; ++position) {
472*cdf0e10cSrcweir                 char c = *position;
473*cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
474*cdf0e10cSrcweir                     val = 10 * val + (c - '0');
475*cdf0e10cSrcweir                 } else {
476*cdf0e10cSrcweir                     break;
477*cdf0e10cSrcweir                 }
478*cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
479*cdf0e10cSrcweir                     throw css::uno::RuntimeException(
480*cdf0e10cSrcweir                         (rtl::OUString(
481*cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
482*cdf0e10cSrcweir                                 "'&#...' too large in ")) +
483*cdf0e10cSrcweir                          fileUrl_),
484*cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
485*cdf0e10cSrcweir                 }
486*cdf0e10cSrcweir             }
487*cdf0e10cSrcweir         }
488*cdf0e10cSrcweir         if (position == p || *position++ != ';') {
489*cdf0e10cSrcweir             throw css::uno::RuntimeException(
490*cdf0e10cSrcweir                 (rtl::OUString(
491*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
492*cdf0e10cSrcweir                  fileUrl_),
493*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
494*cdf0e10cSrcweir         }
495*cdf0e10cSrcweir         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
496*cdf0e10cSrcweir         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
497*cdf0e10cSrcweir             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
498*cdf0e10cSrcweir         {
499*cdf0e10cSrcweir             throw css::uno::RuntimeException(
500*cdf0e10cSrcweir                 (rtl::OUString(
501*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
502*cdf0e10cSrcweir                         "character reference denoting invalid character in ")) +
503*cdf0e10cSrcweir                  fileUrl_),
504*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
505*cdf0e10cSrcweir         }
506*cdf0e10cSrcweir         char buf[4];
507*cdf0e10cSrcweir         sal_Int32 len;
508*cdf0e10cSrcweir         if (val < 0x80) {
509*cdf0e10cSrcweir             buf[0] = static_cast< char >(val);
510*cdf0e10cSrcweir             len = 1;
511*cdf0e10cSrcweir         } else if (val < 0x800) {
512*cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 6) | 0xC0);
513*cdf0e10cSrcweir             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
514*cdf0e10cSrcweir             len = 2;
515*cdf0e10cSrcweir         } else if (val < 0x10000) {
516*cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 12) | 0xE0);
517*cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
518*cdf0e10cSrcweir             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
519*cdf0e10cSrcweir             len = 3;
520*cdf0e10cSrcweir         } else {
521*cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 18) | 0xF0);
522*cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
523*cdf0e10cSrcweir             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
524*cdf0e10cSrcweir             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
525*cdf0e10cSrcweir             len = 4;
526*cdf0e10cSrcweir         }
527*cdf0e10cSrcweir         pad_.addEphemeral(buf, len);
528*cdf0e10cSrcweir         return position;
529*cdf0e10cSrcweir     } else {
530*cdf0e10cSrcweir         struct EntityRef {
531*cdf0e10cSrcweir             char const * inBegin;
532*cdf0e10cSrcweir             sal_Int32 inLength;
533*cdf0e10cSrcweir             char const * outBegin;
534*cdf0e10cSrcweir             sal_Int32 outLength;
535*cdf0e10cSrcweir         };
536*cdf0e10cSrcweir         static EntityRef const refs[] = {
537*cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("amp;"),
538*cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("&") },
539*cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("lt;"),
540*cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("<") },
541*cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("gt;"),
542*cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM(">") },
543*cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("apos;"),
544*cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("'") },
545*cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("quot;"),
546*cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("\"") } };
547*cdf0e10cSrcweir         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
548*cdf0e10cSrcweir             if (rtl_str_shortenedCompare_WithLength(
549*cdf0e10cSrcweir                     position, end - position, refs[i].inBegin, refs[i].inLength,
550*cdf0e10cSrcweir                     refs[i].inLength) ==
551*cdf0e10cSrcweir                 0)
552*cdf0e10cSrcweir             {
553*cdf0e10cSrcweir                 position += refs[i].inLength;
554*cdf0e10cSrcweir                 pad_.add(refs[i].outBegin, refs[i].outLength);
555*cdf0e10cSrcweir                 return position;
556*cdf0e10cSrcweir             }
557*cdf0e10cSrcweir         }
558*cdf0e10cSrcweir         throw css::uno::RuntimeException(
559*cdf0e10cSrcweir             (rtl::OUString(
560*cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
561*cdf0e10cSrcweir              fileUrl_),
562*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
563*cdf0e10cSrcweir     }
564*cdf0e10cSrcweir }
565*cdf0e10cSrcweir 
566*cdf0e10cSrcweir Span XmlReader::handleAttributeValue(
567*cdf0e10cSrcweir     char const * begin, char const * end, bool fullyNormalize)
568*cdf0e10cSrcweir {
569*cdf0e10cSrcweir     pad_.clear();
570*cdf0e10cSrcweir     if (fullyNormalize) {
571*cdf0e10cSrcweir         while (begin != end && isSpace(*begin)) {
572*cdf0e10cSrcweir             ++begin;
573*cdf0e10cSrcweir         }
574*cdf0e10cSrcweir         while (end != begin && isSpace(end[-1])) {
575*cdf0e10cSrcweir             --end;
576*cdf0e10cSrcweir         }
577*cdf0e10cSrcweir         char const * p = begin;
578*cdf0e10cSrcweir         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
579*cdf0e10cSrcweir             // a single true space character can go into the current span,
580*cdf0e10cSrcweir             // everything else breaks the span
581*cdf0e10cSrcweir         Space space = SPACE_NONE;
582*cdf0e10cSrcweir         while (p != end) {
583*cdf0e10cSrcweir             switch (*p) {
584*cdf0e10cSrcweir             case '\x09':
585*cdf0e10cSrcweir             case '\x0A':
586*cdf0e10cSrcweir             case '\x0D':
587*cdf0e10cSrcweir                 switch (space) {
588*cdf0e10cSrcweir                 case SPACE_NONE:
589*cdf0e10cSrcweir                     pad_.add(begin, p - begin);
590*cdf0e10cSrcweir                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
591*cdf0e10cSrcweir                     space = SPACE_BREAK;
592*cdf0e10cSrcweir                     break;
593*cdf0e10cSrcweir                 case SPACE_SPAN:
594*cdf0e10cSrcweir                     pad_.add(begin, p - begin);
595*cdf0e10cSrcweir                     space = SPACE_BREAK;
596*cdf0e10cSrcweir                     break;
597*cdf0e10cSrcweir                 case SPACE_BREAK:
598*cdf0e10cSrcweir                     break;
599*cdf0e10cSrcweir                 }
600*cdf0e10cSrcweir                 begin = ++p;
601*cdf0e10cSrcweir                 break;
602*cdf0e10cSrcweir             case ' ':
603*cdf0e10cSrcweir                 switch (space) {
604*cdf0e10cSrcweir                 case SPACE_NONE:
605*cdf0e10cSrcweir                     ++p;
606*cdf0e10cSrcweir                     space = SPACE_SPAN;
607*cdf0e10cSrcweir                     break;
608*cdf0e10cSrcweir                 case SPACE_SPAN:
609*cdf0e10cSrcweir                     pad_.add(begin, p - begin);
610*cdf0e10cSrcweir                     begin = ++p;
611*cdf0e10cSrcweir                     space = SPACE_BREAK;
612*cdf0e10cSrcweir                     break;
613*cdf0e10cSrcweir                 case SPACE_BREAK:
614*cdf0e10cSrcweir                     begin = ++p;
615*cdf0e10cSrcweir                     break;
616*cdf0e10cSrcweir                 }
617*cdf0e10cSrcweir                 break;
618*cdf0e10cSrcweir             case '&':
619*cdf0e10cSrcweir                 pad_.add(begin, p - begin);
620*cdf0e10cSrcweir                 p = handleReference(p, end);
621*cdf0e10cSrcweir                 begin = p;
622*cdf0e10cSrcweir                 space = SPACE_NONE;
623*cdf0e10cSrcweir                 break;
624*cdf0e10cSrcweir             default:
625*cdf0e10cSrcweir                 ++p;
626*cdf0e10cSrcweir                 space = SPACE_NONE;
627*cdf0e10cSrcweir                 break;
628*cdf0e10cSrcweir             }
629*cdf0e10cSrcweir         }
630*cdf0e10cSrcweir         pad_.add(begin, p - begin);
631*cdf0e10cSrcweir     } else {
632*cdf0e10cSrcweir         char const * p = begin;
633*cdf0e10cSrcweir         while (p != end) {
634*cdf0e10cSrcweir             switch (*p) {
635*cdf0e10cSrcweir             case '\x09':
636*cdf0e10cSrcweir             case '\x0A':
637*cdf0e10cSrcweir                 pad_.add(begin, p - begin);
638*cdf0e10cSrcweir                 begin = ++p;
639*cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
640*cdf0e10cSrcweir                 break;
641*cdf0e10cSrcweir             case '\x0D':
642*cdf0e10cSrcweir                 pad_.add(begin, p - begin);
643*cdf0e10cSrcweir                 ++p;
644*cdf0e10cSrcweir                 if (peek() == '\x0A') {
645*cdf0e10cSrcweir                     ++p;
646*cdf0e10cSrcweir                 }
647*cdf0e10cSrcweir                 begin = p;
648*cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
649*cdf0e10cSrcweir                 break;
650*cdf0e10cSrcweir             case '&':
651*cdf0e10cSrcweir                 pad_.add(begin, p - begin);
652*cdf0e10cSrcweir                 p = handleReference(p, end);
653*cdf0e10cSrcweir                 begin = p;
654*cdf0e10cSrcweir                 break;
655*cdf0e10cSrcweir             default:
656*cdf0e10cSrcweir                 ++p;
657*cdf0e10cSrcweir                 break;
658*cdf0e10cSrcweir             }
659*cdf0e10cSrcweir         }
660*cdf0e10cSrcweir         pad_.add(begin, p - begin);
661*cdf0e10cSrcweir     }
662*cdf0e10cSrcweir     return pad_.get();
663*cdf0e10cSrcweir }
664*cdf0e10cSrcweir 
665*cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
666*cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName);
667*cdf0e10cSrcweir     char const * nameBegin = pos_;
668*cdf0e10cSrcweir     char const * nameColon = 0;
669*cdf0e10cSrcweir     if (!scanName(&nameColon)) {
670*cdf0e10cSrcweir         throw css::uno::RuntimeException(
671*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
672*cdf0e10cSrcweir              fileUrl_),
673*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
674*cdf0e10cSrcweir     }
675*cdf0e10cSrcweir     char const * nameEnd = pos_;
676*cdf0e10cSrcweir     NamespaceList::size_type inheritedNamespaces = namespaces_.size();
677*cdf0e10cSrcweir     bool hasDefaultNs = false;
678*cdf0e10cSrcweir     int defaultNsId = NAMESPACE_NONE;
679*cdf0e10cSrcweir     attributes_.clear();
680*cdf0e10cSrcweir     for (;;) {
681*cdf0e10cSrcweir         char const * p = pos_;
682*cdf0e10cSrcweir         skipSpace();
683*cdf0e10cSrcweir         if (peek() == '/' || peek() == '>') {
684*cdf0e10cSrcweir             break;
685*cdf0e10cSrcweir         }
686*cdf0e10cSrcweir         if (pos_ == p) {
687*cdf0e10cSrcweir             throw css::uno::RuntimeException(
688*cdf0e10cSrcweir                 (rtl::OUString(
689*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
690*cdf0e10cSrcweir                         "missing whitespace before attribute in ")) +
691*cdf0e10cSrcweir                  fileUrl_),
692*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
693*cdf0e10cSrcweir         }
694*cdf0e10cSrcweir         char const * attrNameBegin = pos_;
695*cdf0e10cSrcweir         char const * attrNameColon = 0;
696*cdf0e10cSrcweir         if (!scanName(&attrNameColon)) {
697*cdf0e10cSrcweir             throw css::uno::RuntimeException(
698*cdf0e10cSrcweir                 (rtl::OUString(
699*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
700*cdf0e10cSrcweir                  fileUrl_),
701*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
702*cdf0e10cSrcweir         }
703*cdf0e10cSrcweir         char const * attrNameEnd = pos_;
704*cdf0e10cSrcweir         skipSpace();
705*cdf0e10cSrcweir         if (read() != '=') {
706*cdf0e10cSrcweir             throw css::uno::RuntimeException(
707*cdf0e10cSrcweir                 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
708*cdf0e10cSrcweir                  fileUrl_),
709*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
710*cdf0e10cSrcweir         }
711*cdf0e10cSrcweir         skipSpace();
712*cdf0e10cSrcweir         char del = read();
713*cdf0e10cSrcweir         if (del != '\'' && del != '"') {
714*cdf0e10cSrcweir             throw css::uno::RuntimeException(
715*cdf0e10cSrcweir                 (rtl::OUString(
716*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
717*cdf0e10cSrcweir                  fileUrl_),
718*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
719*cdf0e10cSrcweir         }
720*cdf0e10cSrcweir         char const * valueBegin = pos_;
721*cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
722*cdf0e10cSrcweir         if (i < 0) {
723*cdf0e10cSrcweir             throw css::uno::RuntimeException(
724*cdf0e10cSrcweir                 (rtl::OUString(
725*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
726*cdf0e10cSrcweir                         "unterminated attribute value in ")) +
727*cdf0e10cSrcweir                  fileUrl_),
728*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
729*cdf0e10cSrcweir         }
730*cdf0e10cSrcweir         char const * valueEnd = pos_ + i;
731*cdf0e10cSrcweir         pos_ += i + 1;
732*cdf0e10cSrcweir         if (attrNameColon == 0 &&
733*cdf0e10cSrcweir             Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
734*cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM("xmlns")))
735*cdf0e10cSrcweir         {
736*cdf0e10cSrcweir             hasDefaultNs = true;
737*cdf0e10cSrcweir             defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
738*cdf0e10cSrcweir         } else if (attrNameColon != 0 &&
739*cdf0e10cSrcweir                    Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
740*cdf0e10cSrcweir                        RTL_CONSTASCII_STRINGPARAM("xmlns")))
741*cdf0e10cSrcweir         {
742*cdf0e10cSrcweir             namespaces_.push_back(
743*cdf0e10cSrcweir                 NamespaceData(
744*cdf0e10cSrcweir                     Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
745*cdf0e10cSrcweir                     scanNamespaceIri(valueBegin, valueEnd)));
746*cdf0e10cSrcweir         } else {
747*cdf0e10cSrcweir             attributes_.push_back(
748*cdf0e10cSrcweir                 AttributeData(
749*cdf0e10cSrcweir                     attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
750*cdf0e10cSrcweir                     valueEnd));
751*cdf0e10cSrcweir         }
752*cdf0e10cSrcweir     }
753*cdf0e10cSrcweir     if (!hasDefaultNs && !elements_.empty()) {
754*cdf0e10cSrcweir         defaultNsId = elements_.top().defaultNamespaceId;
755*cdf0e10cSrcweir     }
756*cdf0e10cSrcweir     firstAttribute_ = true;
757*cdf0e10cSrcweir     if (peek() == '/') {
758*cdf0e10cSrcweir         state_ = STATE_EMPTY_ELEMENT_TAG;
759*cdf0e10cSrcweir         ++pos_;
760*cdf0e10cSrcweir     } else {
761*cdf0e10cSrcweir         state_ = STATE_CONTENT;
762*cdf0e10cSrcweir     }
763*cdf0e10cSrcweir     if (peek() != '>') {
764*cdf0e10cSrcweir         throw css::uno::RuntimeException(
765*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
766*cdf0e10cSrcweir              fileUrl_),
767*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
768*cdf0e10cSrcweir     }
769*cdf0e10cSrcweir     ++pos_;
770*cdf0e10cSrcweir     elements_.push(
771*cdf0e10cSrcweir         ElementData(
772*cdf0e10cSrcweir             Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
773*cdf0e10cSrcweir             defaultNsId));
774*cdf0e10cSrcweir     if (nameColon == 0) {
775*cdf0e10cSrcweir         *nsId = defaultNsId;
776*cdf0e10cSrcweir         *localName = Span(nameBegin, nameEnd - nameBegin);
777*cdf0e10cSrcweir     } else {
778*cdf0e10cSrcweir         *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
779*cdf0e10cSrcweir         *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
780*cdf0e10cSrcweir     }
781*cdf0e10cSrcweir     return RESULT_BEGIN;
782*cdf0e10cSrcweir }
783*cdf0e10cSrcweir 
784*cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() {
785*cdf0e10cSrcweir     if (elements_.empty()) {
786*cdf0e10cSrcweir         throw css::uno::RuntimeException(
787*cdf0e10cSrcweir             (rtl::OUString(
788*cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
789*cdf0e10cSrcweir              fileUrl_),
790*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
791*cdf0e10cSrcweir     }
792*cdf0e10cSrcweir     char const * nameBegin = pos_;
793*cdf0e10cSrcweir     char const * nameColon = 0;
794*cdf0e10cSrcweir     if (!scanName(&nameColon) ||
795*cdf0e10cSrcweir         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
796*cdf0e10cSrcweir     {
797*cdf0e10cSrcweir         throw css::uno::RuntimeException(
798*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
799*cdf0e10cSrcweir              fileUrl_),
800*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
801*cdf0e10cSrcweir     }
802*cdf0e10cSrcweir     handleElementEnd();
803*cdf0e10cSrcweir     skipSpace();
804*cdf0e10cSrcweir     if (peek() != '>') {
805*cdf0e10cSrcweir         throw css::uno::RuntimeException(
806*cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
807*cdf0e10cSrcweir              fileUrl_),
808*cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
809*cdf0e10cSrcweir     }
810*cdf0e10cSrcweir     ++pos_;
811*cdf0e10cSrcweir     return RESULT_END;
812*cdf0e10cSrcweir }
813*cdf0e10cSrcweir 
814*cdf0e10cSrcweir void XmlReader::handleElementEnd() {
815*cdf0e10cSrcweir     OSL_ASSERT(!elements_.empty());
816*cdf0e10cSrcweir     namespaces_.resize(elements_.top().inheritedNamespaces);
817*cdf0e10cSrcweir     elements_.pop();
818*cdf0e10cSrcweir     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
819*cdf0e10cSrcweir }
820*cdf0e10cSrcweir 
821*cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
822*cdf0e10cSrcweir     for (;;) {
823*cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
824*cdf0e10cSrcweir         if (i < 0) {
825*cdf0e10cSrcweir             throw css::uno::RuntimeException(
826*cdf0e10cSrcweir                 (rtl::OUString(
827*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
828*cdf0e10cSrcweir                  fileUrl_),
829*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
830*cdf0e10cSrcweir         }
831*cdf0e10cSrcweir         pos_ += i + 1;
832*cdf0e10cSrcweir         switch (peek()) {
833*cdf0e10cSrcweir         case '!':
834*cdf0e10cSrcweir             ++pos_;
835*cdf0e10cSrcweir             if (!skipComment() && !scanCdataSection().is()) {
836*cdf0e10cSrcweir                 skipDocumentTypeDeclaration();
837*cdf0e10cSrcweir             }
838*cdf0e10cSrcweir             break;
839*cdf0e10cSrcweir         case '/':
840*cdf0e10cSrcweir             ++pos_;
841*cdf0e10cSrcweir             return handleEndTag();
842*cdf0e10cSrcweir         case '?':
843*cdf0e10cSrcweir             ++pos_;
844*cdf0e10cSrcweir             skipProcessingInstruction();
845*cdf0e10cSrcweir             break;
846*cdf0e10cSrcweir         default:
847*cdf0e10cSrcweir             return handleStartTag(nsId, data);
848*cdf0e10cSrcweir         }
849*cdf0e10cSrcweir     }
850*cdf0e10cSrcweir }
851*cdf0e10cSrcweir 
852*cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) {
853*cdf0e10cSrcweir     pad_.clear();
854*cdf0e10cSrcweir     for (char const * begin = pos_;;) {
855*cdf0e10cSrcweir         switch (peek()) {
856*cdf0e10cSrcweir         case '\0': // i.e., EOF
857*cdf0e10cSrcweir             throw css::uno::RuntimeException(
858*cdf0e10cSrcweir                 (rtl::OUString(
859*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
860*cdf0e10cSrcweir                  fileUrl_),
861*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
862*cdf0e10cSrcweir         case '\x0D':
863*cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
864*cdf0e10cSrcweir             ++pos_;
865*cdf0e10cSrcweir             if (peek() != '\x0A') {
866*cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
867*cdf0e10cSrcweir             }
868*cdf0e10cSrcweir             begin = pos_;
869*cdf0e10cSrcweir             break;
870*cdf0e10cSrcweir         case '&':
871*cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
872*cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
873*cdf0e10cSrcweir             begin = pos_;
874*cdf0e10cSrcweir             break;
875*cdf0e10cSrcweir         case '<':
876*cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
877*cdf0e10cSrcweir             ++pos_;
878*cdf0e10cSrcweir             switch (peek()) {
879*cdf0e10cSrcweir             case '!':
880*cdf0e10cSrcweir                 ++pos_;
881*cdf0e10cSrcweir                 if (!skipComment()) {
882*cdf0e10cSrcweir                     Span cdata(scanCdataSection());
883*cdf0e10cSrcweir                     if (cdata.is()) {
884*cdf0e10cSrcweir                         normalizeLineEnds(cdata);
885*cdf0e10cSrcweir                     } else {
886*cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
887*cdf0e10cSrcweir                     }
888*cdf0e10cSrcweir                 }
889*cdf0e10cSrcweir                 begin = pos_;
890*cdf0e10cSrcweir                 break;
891*cdf0e10cSrcweir             case '/':
892*cdf0e10cSrcweir                 *text = pad_.get();
893*cdf0e10cSrcweir                 ++pos_;
894*cdf0e10cSrcweir                 state_ = STATE_END_TAG;
895*cdf0e10cSrcweir                 return RESULT_TEXT;
896*cdf0e10cSrcweir             case '?':
897*cdf0e10cSrcweir                 ++pos_;
898*cdf0e10cSrcweir                 skipProcessingInstruction();
899*cdf0e10cSrcweir                 begin = pos_;
900*cdf0e10cSrcweir                 break;
901*cdf0e10cSrcweir             default:
902*cdf0e10cSrcweir                 *text = pad_.get();
903*cdf0e10cSrcweir                 state_ = STATE_START_TAG;
904*cdf0e10cSrcweir                 return RESULT_TEXT;
905*cdf0e10cSrcweir             }
906*cdf0e10cSrcweir             break;
907*cdf0e10cSrcweir         default:
908*cdf0e10cSrcweir             ++pos_;
909*cdf0e10cSrcweir             break;
910*cdf0e10cSrcweir         }
911*cdf0e10cSrcweir     }
912*cdf0e10cSrcweir }
913*cdf0e10cSrcweir 
914*cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
915*cdf0e10cSrcweir     pad_.clear();
916*cdf0e10cSrcweir     char const * flowBegin = pos_;
917*cdf0e10cSrcweir     char const * flowEnd = pos_;
918*cdf0e10cSrcweir     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
919*cdf0e10cSrcweir         // a single true space character can go into the current flow,
920*cdf0e10cSrcweir         // everything else breaks the flow
921*cdf0e10cSrcweir     Space space = SPACE_START;
922*cdf0e10cSrcweir     for (;;) {
923*cdf0e10cSrcweir         switch (peek()) {
924*cdf0e10cSrcweir         case '\0': // i.e., EOF
925*cdf0e10cSrcweir             throw css::uno::RuntimeException(
926*cdf0e10cSrcweir                 (rtl::OUString(
927*cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
928*cdf0e10cSrcweir                  fileUrl_),
929*cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
930*cdf0e10cSrcweir         case '\x09':
931*cdf0e10cSrcweir         case '\x0A':
932*cdf0e10cSrcweir         case '\x0D':
933*cdf0e10cSrcweir             switch (space) {
934*cdf0e10cSrcweir             case SPACE_START:
935*cdf0e10cSrcweir             case SPACE_BREAK:
936*cdf0e10cSrcweir                 break;
937*cdf0e10cSrcweir             case SPACE_NONE:
938*cdf0e10cSrcweir             case SPACE_SPAN:
939*cdf0e10cSrcweir                 space = SPACE_BREAK;
940*cdf0e10cSrcweir                 break;
941*cdf0e10cSrcweir             }
942*cdf0e10cSrcweir             ++pos_;
943*cdf0e10cSrcweir             break;
944*cdf0e10cSrcweir         case ' ':
945*cdf0e10cSrcweir             switch (space) {
946*cdf0e10cSrcweir             case SPACE_START:
947*cdf0e10cSrcweir             case SPACE_BREAK:
948*cdf0e10cSrcweir                 break;
949*cdf0e10cSrcweir             case SPACE_NONE:
950*cdf0e10cSrcweir                 space = SPACE_SPAN;
951*cdf0e10cSrcweir                 break;
952*cdf0e10cSrcweir             case SPACE_SPAN:
953*cdf0e10cSrcweir                 space = SPACE_BREAK;
954*cdf0e10cSrcweir                 break;
955*cdf0e10cSrcweir             }
956*cdf0e10cSrcweir             ++pos_;
957*cdf0e10cSrcweir             break;
958*cdf0e10cSrcweir         case '&':
959*cdf0e10cSrcweir             switch (space) {
960*cdf0e10cSrcweir             case SPACE_START:
961*cdf0e10cSrcweir                 break;
962*cdf0e10cSrcweir             case SPACE_NONE:
963*cdf0e10cSrcweir             case SPACE_SPAN:
964*cdf0e10cSrcweir                 pad_.add(flowBegin, pos_ - flowBegin);
965*cdf0e10cSrcweir                 break;
966*cdf0e10cSrcweir             case SPACE_BREAK:
967*cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
968*cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
969*cdf0e10cSrcweir                 break;
970*cdf0e10cSrcweir             }
971*cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
972*cdf0e10cSrcweir             flowBegin = pos_;
973*cdf0e10cSrcweir             flowEnd = pos_;
974*cdf0e10cSrcweir             space = SPACE_NONE;
975*cdf0e10cSrcweir             break;
976*cdf0e10cSrcweir         case '<':
977*cdf0e10cSrcweir             ++pos_;
978*cdf0e10cSrcweir             switch (peek()) {
979*cdf0e10cSrcweir             case '!':
980*cdf0e10cSrcweir                 ++pos_;
981*cdf0e10cSrcweir                 if (skipComment()) {
982*cdf0e10cSrcweir                     space = SPACE_BREAK;
983*cdf0e10cSrcweir                 } else {
984*cdf0e10cSrcweir                     Span cdata(scanCdataSection());
985*cdf0e10cSrcweir                     if (cdata.is()) {
986*cdf0e10cSrcweir                         // CDATA is not normalized (similar to character
987*cdf0e10cSrcweir                         // references; it keeps the code simple), but it might
988*cdf0e10cSrcweir                         // arguably be better to normalize it:
989*cdf0e10cSrcweir                         switch (space) {
990*cdf0e10cSrcweir                         case SPACE_START:
991*cdf0e10cSrcweir                             break;
992*cdf0e10cSrcweir                         case SPACE_NONE:
993*cdf0e10cSrcweir                         case SPACE_SPAN:
994*cdf0e10cSrcweir                             pad_.add(flowBegin, pos_ - flowBegin);
995*cdf0e10cSrcweir                             break;
996*cdf0e10cSrcweir                         case SPACE_BREAK:
997*cdf0e10cSrcweir                             pad_.add(flowBegin, flowEnd - flowBegin);
998*cdf0e10cSrcweir                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
999*cdf0e10cSrcweir                             break;
1000*cdf0e10cSrcweir                         }
1001*cdf0e10cSrcweir                         normalizeLineEnds(cdata);
1002*cdf0e10cSrcweir                         flowBegin = pos_;
1003*cdf0e10cSrcweir                         flowEnd = pos_;
1004*cdf0e10cSrcweir                         space = SPACE_NONE;
1005*cdf0e10cSrcweir                     } else {
1006*cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
1007*cdf0e10cSrcweir                     }
1008*cdf0e10cSrcweir                 }
1009*cdf0e10cSrcweir                 break;
1010*cdf0e10cSrcweir             case '/':
1011*cdf0e10cSrcweir                 ++pos_;
1012*cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1013*cdf0e10cSrcweir                 *text = pad_.get();
1014*cdf0e10cSrcweir                 state_ = STATE_END_TAG;
1015*cdf0e10cSrcweir                 return RESULT_TEXT;
1016*cdf0e10cSrcweir             case '?':
1017*cdf0e10cSrcweir                 ++pos_;
1018*cdf0e10cSrcweir                 skipProcessingInstruction();
1019*cdf0e10cSrcweir                 space = SPACE_BREAK;
1020*cdf0e10cSrcweir                 break;
1021*cdf0e10cSrcweir             default:
1022*cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1023*cdf0e10cSrcweir                 *text = pad_.get();
1024*cdf0e10cSrcweir                 state_ = STATE_START_TAG;
1025*cdf0e10cSrcweir                 return RESULT_TEXT;
1026*cdf0e10cSrcweir             }
1027*cdf0e10cSrcweir             break;
1028*cdf0e10cSrcweir         default:
1029*cdf0e10cSrcweir             switch (space) {
1030*cdf0e10cSrcweir             case SPACE_START:
1031*cdf0e10cSrcweir                 flowBegin = pos_;
1032*cdf0e10cSrcweir                 break;
1033*cdf0e10cSrcweir             case SPACE_NONE:
1034*cdf0e10cSrcweir             case SPACE_SPAN:
1035*cdf0e10cSrcweir                 break;
1036*cdf0e10cSrcweir             case SPACE_BREAK:
1037*cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1038*cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1039*cdf0e10cSrcweir                 flowBegin = pos_;
1040*cdf0e10cSrcweir                 break;
1041*cdf0e10cSrcweir             }
1042*cdf0e10cSrcweir             flowEnd = ++pos_;
1043*cdf0e10cSrcweir             space = SPACE_NONE;
1044*cdf0e10cSrcweir             break;
1045*cdf0e10cSrcweir         }
1046*cdf0e10cSrcweir     }
1047*cdf0e10cSrcweir }
1048*cdf0e10cSrcweir 
1049*cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1050*cdf0e10cSrcweir     OSL_ASSERT(pos <= INT_MAX);
1051*cdf0e10cSrcweir     return static_cast< int >(pos);
1052*cdf0e10cSrcweir }
1053*cdf0e10cSrcweir 
1054*cdf0e10cSrcweir }
1055