1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir #include "precompiled_xmlreader.hxx" 29*cdf0e10cSrcweir #include "sal/config.h" 30*cdf0e10cSrcweir 31*cdf0e10cSrcweir #include <climits> 32*cdf0e10cSrcweir #include <cstddef> 33*cdf0e10cSrcweir 34*cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp" 35*cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx" 36*cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp" 37*cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp" 38*cdf0e10cSrcweir #include "osl/diagnose.h" 39*cdf0e10cSrcweir #include "osl/file.h" 40*cdf0e10cSrcweir #include "rtl/string.h" 41*cdf0e10cSrcweir #include "rtl/ustring.h" 42*cdf0e10cSrcweir #include "rtl/ustring.hxx" 43*cdf0e10cSrcweir #include "sal/types.h" 44*cdf0e10cSrcweir #include "xmlreader/pad.hxx" 45*cdf0e10cSrcweir #include "xmlreader/span.hxx" 46*cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx" 47*cdf0e10cSrcweir 48*cdf0e10cSrcweir namespace xmlreader { 49*cdf0e10cSrcweir 50*cdf0e10cSrcweir namespace { 51*cdf0e10cSrcweir 52*cdf0e10cSrcweir namespace css = com::sun::star; 53*cdf0e10cSrcweir 54*cdf0e10cSrcweir bool isSpace(char c) { 55*cdf0e10cSrcweir switch (c) { 56*cdf0e10cSrcweir case '\x09': 57*cdf0e10cSrcweir case '\x0A': 58*cdf0e10cSrcweir case '\x0D': 59*cdf0e10cSrcweir case ' ': 60*cdf0e10cSrcweir return true; 61*cdf0e10cSrcweir default: 62*cdf0e10cSrcweir return false; 63*cdf0e10cSrcweir } 64*cdf0e10cSrcweir } 65*cdf0e10cSrcweir 66*cdf0e10cSrcweir } 67*cdf0e10cSrcweir 68*cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl) 69*cdf0e10cSrcweir SAL_THROW(( 70*cdf0e10cSrcweir css::container::NoSuchElementException, css::uno::RuntimeException)): 71*cdf0e10cSrcweir fileUrl_(fileUrl) 72*cdf0e10cSrcweir { 73*cdf0e10cSrcweir switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) 74*cdf0e10cSrcweir { 75*cdf0e10cSrcweir case osl_File_E_None: 76*cdf0e10cSrcweir break; 77*cdf0e10cSrcweir case osl_File_E_NOENT: 78*cdf0e10cSrcweir throw css::container::NoSuchElementException( 79*cdf0e10cSrcweir fileUrl_, css::uno::Reference< css::uno::XInterface >()); 80*cdf0e10cSrcweir default: 81*cdf0e10cSrcweir throw css::uno::RuntimeException( 82*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + 83*cdf0e10cSrcweir fileUrl_), 84*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 85*cdf0e10cSrcweir } 86*cdf0e10cSrcweir oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); 87*cdf0e10cSrcweir if (e == osl_File_E_None) { 88*cdf0e10cSrcweir e = osl_mapFile( 89*cdf0e10cSrcweir fileHandle_, &fileAddress_, fileSize_, 0, 90*cdf0e10cSrcweir osl_File_MapFlag_WillNeed); 91*cdf0e10cSrcweir } 92*cdf0e10cSrcweir if (e != osl_File_E_None) { 93*cdf0e10cSrcweir e = osl_closeFile(fileHandle_); 94*cdf0e10cSrcweir if (e != osl_File_E_None) { 95*cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 96*cdf0e10cSrcweir } 97*cdf0e10cSrcweir throw css::uno::RuntimeException( 98*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + 99*cdf0e10cSrcweir fileUrl_), 100*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 101*cdf0e10cSrcweir } 102*cdf0e10cSrcweir namespaceIris_.push_back( 103*cdf0e10cSrcweir Span( 104*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM( 105*cdf0e10cSrcweir "http://www.w3.org/XML/1998/namespace"))); 106*cdf0e10cSrcweir namespaces_.push_back( 107*cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); 108*cdf0e10cSrcweir pos_ = static_cast< char * >(fileAddress_); 109*cdf0e10cSrcweir end_ = pos_ + fileSize_; 110*cdf0e10cSrcweir state_ = STATE_CONTENT; 111*cdf0e10cSrcweir } 112*cdf0e10cSrcweir 113*cdf0e10cSrcweir XmlReader::~XmlReader() { 114*cdf0e10cSrcweir oslFileError e = osl_unmapFile(fileAddress_, fileSize_); 115*cdf0e10cSrcweir if (e != osl_File_E_None) { 116*cdf0e10cSrcweir OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); 117*cdf0e10cSrcweir } 118*cdf0e10cSrcweir e = osl_closeFile(fileHandle_); 119*cdf0e10cSrcweir if (e != osl_File_E_None) { 120*cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 121*cdf0e10cSrcweir } 122*cdf0e10cSrcweir } 123*cdf0e10cSrcweir 124*cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) { 125*cdf0e10cSrcweir int id = toNamespaceId(namespaceIris_.size()); 126*cdf0e10cSrcweir namespaceIris_.push_back(iri); 127*cdf0e10cSrcweir if (iri.equals( 128*cdf0e10cSrcweir Span( 129*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM( 130*cdf0e10cSrcweir "http://www.w3.org/2001/XMLSchema-instance")))) 131*cdf0e10cSrcweir { 132*cdf0e10cSrcweir // Old user layer .xcu files used the xsi namespace prefix without 133*cdf0e10cSrcweir // declaring a corresponding namespace binding, see issue 77174; reading 134*cdf0e10cSrcweir // those files during migration would fail without this hack that can be 135*cdf0e10cSrcweir // removed once migration is no longer relevant (see 136*cdf0e10cSrcweir // configmgr::Components::parseModificationLayer): 137*cdf0e10cSrcweir namespaces_.push_back( 138*cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); 139*cdf0e10cSrcweir } 140*cdf0e10cSrcweir return id; 141*cdf0e10cSrcweir } 142*cdf0e10cSrcweir 143*cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) 144*cdf0e10cSrcweir { 145*cdf0e10cSrcweir switch (state_) { 146*cdf0e10cSrcweir case STATE_CONTENT: 147*cdf0e10cSrcweir switch (reportText) { 148*cdf0e10cSrcweir case TEXT_NONE: 149*cdf0e10cSrcweir return handleSkippedText(data, nsId); 150*cdf0e10cSrcweir case TEXT_RAW: 151*cdf0e10cSrcweir return handleRawText(data); 152*cdf0e10cSrcweir case TEXT_NORMALIZED: 153*cdf0e10cSrcweir return handleNormalizedText(data); 154*cdf0e10cSrcweir } 155*cdf0e10cSrcweir case STATE_START_TAG: 156*cdf0e10cSrcweir return handleStartTag(nsId, data); 157*cdf0e10cSrcweir case STATE_END_TAG: 158*cdf0e10cSrcweir return handleEndTag(); 159*cdf0e10cSrcweir case STATE_EMPTY_ELEMENT_TAG: 160*cdf0e10cSrcweir handleElementEnd(); 161*cdf0e10cSrcweir return RESULT_END; 162*cdf0e10cSrcweir default: // STATE_DONE 163*cdf0e10cSrcweir return RESULT_DONE; 164*cdf0e10cSrcweir } 165*cdf0e10cSrcweir } 166*cdf0e10cSrcweir 167*cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) { 168*cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName != 0); 169*cdf0e10cSrcweir if (firstAttribute_) { 170*cdf0e10cSrcweir currentAttribute_ = attributes_.begin(); 171*cdf0e10cSrcweir firstAttribute_ = false; 172*cdf0e10cSrcweir } else { 173*cdf0e10cSrcweir ++currentAttribute_; 174*cdf0e10cSrcweir } 175*cdf0e10cSrcweir if (currentAttribute_ == attributes_.end()) { 176*cdf0e10cSrcweir return false; 177*cdf0e10cSrcweir } 178*cdf0e10cSrcweir if (currentAttribute_->nameColon == 0) { 179*cdf0e10cSrcweir *nsId = NAMESPACE_NONE; 180*cdf0e10cSrcweir *localName = Span( 181*cdf0e10cSrcweir currentAttribute_->nameBegin, 182*cdf0e10cSrcweir currentAttribute_->nameEnd - currentAttribute_->nameBegin); 183*cdf0e10cSrcweir } else { 184*cdf0e10cSrcweir *nsId = getNamespaceId( 185*cdf0e10cSrcweir Span( 186*cdf0e10cSrcweir currentAttribute_->nameBegin, 187*cdf0e10cSrcweir currentAttribute_->nameColon - currentAttribute_->nameBegin)); 188*cdf0e10cSrcweir *localName = Span( 189*cdf0e10cSrcweir currentAttribute_->nameColon + 1, 190*cdf0e10cSrcweir currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); 191*cdf0e10cSrcweir } 192*cdf0e10cSrcweir return true; 193*cdf0e10cSrcweir } 194*cdf0e10cSrcweir 195*cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) { 196*cdf0e10cSrcweir return handleAttributeValue( 197*cdf0e10cSrcweir currentAttribute_->valueBegin, currentAttribute_->valueEnd, 198*cdf0e10cSrcweir fullyNormalize); 199*cdf0e10cSrcweir } 200*cdf0e10cSrcweir 201*cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const { 202*cdf0e10cSrcweir for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); 203*cdf0e10cSrcweir i != namespaces_.rend(); ++i) 204*cdf0e10cSrcweir { 205*cdf0e10cSrcweir if (prefix.equals(i->prefix)) { 206*cdf0e10cSrcweir return i->nsId; 207*cdf0e10cSrcweir } 208*cdf0e10cSrcweir } 209*cdf0e10cSrcweir return NAMESPACE_UNKNOWN; 210*cdf0e10cSrcweir } 211*cdf0e10cSrcweir 212*cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const { 213*cdf0e10cSrcweir return fileUrl_; 214*cdf0e10cSrcweir } 215*cdf0e10cSrcweir 216*cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) { 217*cdf0e10cSrcweir char const * p = text.begin; 218*cdf0e10cSrcweir sal_Int32 n = text.length; 219*cdf0e10cSrcweir for (;;) { 220*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); 221*cdf0e10cSrcweir if (i < 0) { 222*cdf0e10cSrcweir break; 223*cdf0e10cSrcweir } 224*cdf0e10cSrcweir pad_.add(p, i); 225*cdf0e10cSrcweir p += i + 1; 226*cdf0e10cSrcweir n -= i + 1; 227*cdf0e10cSrcweir if (n == 0 || *p != '\x0A') { 228*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 229*cdf0e10cSrcweir } 230*cdf0e10cSrcweir } 231*cdf0e10cSrcweir pad_.add(p, n); 232*cdf0e10cSrcweir } 233*cdf0e10cSrcweir 234*cdf0e10cSrcweir void XmlReader::skipSpace() { 235*cdf0e10cSrcweir while (isSpace(peek())) { 236*cdf0e10cSrcweir ++pos_; 237*cdf0e10cSrcweir } 238*cdf0e10cSrcweir } 239*cdf0e10cSrcweir 240*cdf0e10cSrcweir bool XmlReader::skipComment() { 241*cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 242*cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), 243*cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("--")) != 244*cdf0e10cSrcweir 0) 245*cdf0e10cSrcweir { 246*cdf0e10cSrcweir return false; 247*cdf0e10cSrcweir } 248*cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("--"); 249*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 250*cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); 251*cdf0e10cSrcweir if (i < 0) { 252*cdf0e10cSrcweir throw css::uno::RuntimeException( 253*cdf0e10cSrcweir (rtl::OUString( 254*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 255*cdf0e10cSrcweir "premature end (within comment) of ")) + 256*cdf0e10cSrcweir fileUrl_), 257*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 258*cdf0e10cSrcweir } 259*cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("--"); 260*cdf0e10cSrcweir if (read() != '>') { 261*cdf0e10cSrcweir throw css::uno::RuntimeException( 262*cdf0e10cSrcweir (rtl::OUString( 263*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 264*cdf0e10cSrcweir "illegal \"--\" within comment in ")) + 265*cdf0e10cSrcweir fileUrl_), 266*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 267*cdf0e10cSrcweir } 268*cdf0e10cSrcweir return true; 269*cdf0e10cSrcweir } 270*cdf0e10cSrcweir 271*cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() { 272*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 273*cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); 274*cdf0e10cSrcweir if (i < 0) { 275*cdf0e10cSrcweir throw css::uno::RuntimeException( 276*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + 277*cdf0e10cSrcweir fileUrl_), 278*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 279*cdf0e10cSrcweir } 280*cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("?>"); 281*cdf0e10cSrcweir } 282*cdf0e10cSrcweir 283*cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() { 284*cdf0e10cSrcweir // Neither is it checked that the doctypedecl is at the correct position in 285*cdf0e10cSrcweir // the document, nor that it is well-formed: 286*cdf0e10cSrcweir for (;;) { 287*cdf0e10cSrcweir char c = read(); 288*cdf0e10cSrcweir switch (c) { 289*cdf0e10cSrcweir case '\0': // i.e., EOF 290*cdf0e10cSrcweir throw css::uno::RuntimeException( 291*cdf0e10cSrcweir (rtl::OUString( 292*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 293*cdf0e10cSrcweir "premature end (within DTD) of ")) + 294*cdf0e10cSrcweir fileUrl_), 295*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 296*cdf0e10cSrcweir case '"': 297*cdf0e10cSrcweir case '\'': 298*cdf0e10cSrcweir { 299*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength( 300*cdf0e10cSrcweir pos_, end_ - pos_, c); 301*cdf0e10cSrcweir if (i < 0) { 302*cdf0e10cSrcweir throw css::uno::RuntimeException( 303*cdf0e10cSrcweir (rtl::OUString( 304*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 305*cdf0e10cSrcweir "premature end (within DTD) of ")) + 306*cdf0e10cSrcweir fileUrl_), 307*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 308*cdf0e10cSrcweir } 309*cdf0e10cSrcweir pos_ += i + 1; 310*cdf0e10cSrcweir } 311*cdf0e10cSrcweir break; 312*cdf0e10cSrcweir case '>': 313*cdf0e10cSrcweir return; 314*cdf0e10cSrcweir case '[': 315*cdf0e10cSrcweir for (;;) { 316*cdf0e10cSrcweir c = read(); 317*cdf0e10cSrcweir switch (c) { 318*cdf0e10cSrcweir case '\0': // i.e., EOF 319*cdf0e10cSrcweir throw css::uno::RuntimeException( 320*cdf0e10cSrcweir (rtl::OUString( 321*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 322*cdf0e10cSrcweir "premature end (within DTD) of ")) + 323*cdf0e10cSrcweir fileUrl_), 324*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 325*cdf0e10cSrcweir case '"': 326*cdf0e10cSrcweir case '\'': 327*cdf0e10cSrcweir { 328*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength( 329*cdf0e10cSrcweir pos_, end_ - pos_, c); 330*cdf0e10cSrcweir if (i < 0) { 331*cdf0e10cSrcweir throw css::uno::RuntimeException( 332*cdf0e10cSrcweir (rtl::OUString( 333*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 334*cdf0e10cSrcweir "premature end (within DTD) of ")) + 335*cdf0e10cSrcweir fileUrl_), 336*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 337*cdf0e10cSrcweir } 338*cdf0e10cSrcweir pos_ += i + 1; 339*cdf0e10cSrcweir } 340*cdf0e10cSrcweir break; 341*cdf0e10cSrcweir case '<': 342*cdf0e10cSrcweir switch (read()) { 343*cdf0e10cSrcweir case '\0': // i.e., EOF 344*cdf0e10cSrcweir throw css::uno::RuntimeException( 345*cdf0e10cSrcweir (rtl::OUString( 346*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 347*cdf0e10cSrcweir "premature end (within DTD) of ")) + 348*cdf0e10cSrcweir fileUrl_), 349*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 350*cdf0e10cSrcweir case '!': 351*cdf0e10cSrcweir skipComment(); 352*cdf0e10cSrcweir break; 353*cdf0e10cSrcweir case '?': 354*cdf0e10cSrcweir skipProcessingInstruction(); 355*cdf0e10cSrcweir break; 356*cdf0e10cSrcweir default: 357*cdf0e10cSrcweir break; 358*cdf0e10cSrcweir } 359*cdf0e10cSrcweir break; 360*cdf0e10cSrcweir case ']': 361*cdf0e10cSrcweir skipSpace(); 362*cdf0e10cSrcweir if (read() != '>') { 363*cdf0e10cSrcweir throw css::uno::RuntimeException( 364*cdf0e10cSrcweir (rtl::OUString( 365*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 366*cdf0e10cSrcweir "missing \">\" of DTD in ")) + 367*cdf0e10cSrcweir fileUrl_), 368*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 369*cdf0e10cSrcweir } 370*cdf0e10cSrcweir return; 371*cdf0e10cSrcweir default: 372*cdf0e10cSrcweir break; 373*cdf0e10cSrcweir } 374*cdf0e10cSrcweir } 375*cdf0e10cSrcweir default: 376*cdf0e10cSrcweir break; 377*cdf0e10cSrcweir } 378*cdf0e10cSrcweir } 379*cdf0e10cSrcweir } 380*cdf0e10cSrcweir 381*cdf0e10cSrcweir Span XmlReader::scanCdataSection() { 382*cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 383*cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), 384*cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("[CDATA[")) != 385*cdf0e10cSrcweir 0) 386*cdf0e10cSrcweir { 387*cdf0e10cSrcweir return Span(); 388*cdf0e10cSrcweir } 389*cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); 390*cdf0e10cSrcweir char const * begin = pos_; 391*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 392*cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); 393*cdf0e10cSrcweir if (i < 0) { 394*cdf0e10cSrcweir throw css::uno::RuntimeException( 395*cdf0e10cSrcweir (rtl::OUString( 396*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 397*cdf0e10cSrcweir "premature end (within CDATA section) of ")) + 398*cdf0e10cSrcweir fileUrl_), 399*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 400*cdf0e10cSrcweir } 401*cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); 402*cdf0e10cSrcweir return Span(begin, i); 403*cdf0e10cSrcweir } 404*cdf0e10cSrcweir 405*cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) { 406*cdf0e10cSrcweir OSL_ASSERT(nameColon != 0 && *nameColon == 0); 407*cdf0e10cSrcweir for (char const * begin = pos_;; ++pos_) { 408*cdf0e10cSrcweir switch (peek()) { 409*cdf0e10cSrcweir case '\0': // i.e., EOF 410*cdf0e10cSrcweir case '\x09': 411*cdf0e10cSrcweir case '\x0A': 412*cdf0e10cSrcweir case '\x0D': 413*cdf0e10cSrcweir case ' ': 414*cdf0e10cSrcweir case '/': 415*cdf0e10cSrcweir case '=': 416*cdf0e10cSrcweir case '>': 417*cdf0e10cSrcweir return pos_ != begin; 418*cdf0e10cSrcweir case ':': 419*cdf0e10cSrcweir *nameColon = pos_; 420*cdf0e10cSrcweir break; 421*cdf0e10cSrcweir default: 422*cdf0e10cSrcweir break; 423*cdf0e10cSrcweir } 424*cdf0e10cSrcweir } 425*cdf0e10cSrcweir } 426*cdf0e10cSrcweir 427*cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) { 428*cdf0e10cSrcweir OSL_ASSERT(begin != 0 && begin <= end); 429*cdf0e10cSrcweir Span iri(handleAttributeValue(begin, end, false)); 430*cdf0e10cSrcweir for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { 431*cdf0e10cSrcweir if (namespaceIris_[i].equals(iri)) { 432*cdf0e10cSrcweir return toNamespaceId(i); 433*cdf0e10cSrcweir } 434*cdf0e10cSrcweir } 435*cdf0e10cSrcweir return XmlReader::NAMESPACE_UNKNOWN; 436*cdf0e10cSrcweir } 437*cdf0e10cSrcweir 438*cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end) 439*cdf0e10cSrcweir { 440*cdf0e10cSrcweir OSL_ASSERT(position != 0 && *position == '&' && position < end); 441*cdf0e10cSrcweir ++position; 442*cdf0e10cSrcweir if (*position == '#') { 443*cdf0e10cSrcweir ++position; 444*cdf0e10cSrcweir sal_Int32 val = 0; 445*cdf0e10cSrcweir char const * p; 446*cdf0e10cSrcweir if (*position == 'x') { 447*cdf0e10cSrcweir ++position; 448*cdf0e10cSrcweir p = position; 449*cdf0e10cSrcweir for (;; ++position) { 450*cdf0e10cSrcweir char c = *position; 451*cdf0e10cSrcweir if (c >= '0' && c <= '9') { 452*cdf0e10cSrcweir val = 16 * val + (c - '0'); 453*cdf0e10cSrcweir } else if (c >= 'A' && c <= 'F') { 454*cdf0e10cSrcweir val = 16 * val + (c - 'A') + 10; 455*cdf0e10cSrcweir } else if (c >= 'a' && c <= 'f') { 456*cdf0e10cSrcweir val = 16 * val + (c - 'a') + 10; 457*cdf0e10cSrcweir } else { 458*cdf0e10cSrcweir break; 459*cdf0e10cSrcweir } 460*cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow 461*cdf0e10cSrcweir throw css::uno::RuntimeException( 462*cdf0e10cSrcweir (rtl::OUString( 463*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 464*cdf0e10cSrcweir "'&#x...' too large in ")) + 465*cdf0e10cSrcweir fileUrl_), 466*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 467*cdf0e10cSrcweir } 468*cdf0e10cSrcweir } 469*cdf0e10cSrcweir } else { 470*cdf0e10cSrcweir p = position; 471*cdf0e10cSrcweir for (;; ++position) { 472*cdf0e10cSrcweir char c = *position; 473*cdf0e10cSrcweir if (c >= '0' && c <= '9') { 474*cdf0e10cSrcweir val = 10 * val + (c - '0'); 475*cdf0e10cSrcweir } else { 476*cdf0e10cSrcweir break; 477*cdf0e10cSrcweir } 478*cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow 479*cdf0e10cSrcweir throw css::uno::RuntimeException( 480*cdf0e10cSrcweir (rtl::OUString( 481*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 482*cdf0e10cSrcweir "'&#...' too large in ")) + 483*cdf0e10cSrcweir fileUrl_), 484*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 485*cdf0e10cSrcweir } 486*cdf0e10cSrcweir } 487*cdf0e10cSrcweir } 488*cdf0e10cSrcweir if (position == p || *position++ != ';') { 489*cdf0e10cSrcweir throw css::uno::RuntimeException( 490*cdf0e10cSrcweir (rtl::OUString( 491*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + 492*cdf0e10cSrcweir fileUrl_), 493*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 494*cdf0e10cSrcweir } 495*cdf0e10cSrcweir OSL_ASSERT(val >= 0 && val <= 0x10FFFF); 496*cdf0e10cSrcweir if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || 497*cdf0e10cSrcweir (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) 498*cdf0e10cSrcweir { 499*cdf0e10cSrcweir throw css::uno::RuntimeException( 500*cdf0e10cSrcweir (rtl::OUString( 501*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 502*cdf0e10cSrcweir "character reference denoting invalid character in ")) + 503*cdf0e10cSrcweir fileUrl_), 504*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 505*cdf0e10cSrcweir } 506*cdf0e10cSrcweir char buf[4]; 507*cdf0e10cSrcweir sal_Int32 len; 508*cdf0e10cSrcweir if (val < 0x80) { 509*cdf0e10cSrcweir buf[0] = static_cast< char >(val); 510*cdf0e10cSrcweir len = 1; 511*cdf0e10cSrcweir } else if (val < 0x800) { 512*cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 6) | 0xC0); 513*cdf0e10cSrcweir buf[1] = static_cast< char >((val & 0x3F) | 0x80); 514*cdf0e10cSrcweir len = 2; 515*cdf0e10cSrcweir } else if (val < 0x10000) { 516*cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 12) | 0xE0); 517*cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 518*cdf0e10cSrcweir buf[2] = static_cast< char >((val & 0x3F) | 0x80); 519*cdf0e10cSrcweir len = 3; 520*cdf0e10cSrcweir } else { 521*cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 18) | 0xF0); 522*cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); 523*cdf0e10cSrcweir buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 524*cdf0e10cSrcweir buf[3] = static_cast< char >((val & 0x3F) | 0x80); 525*cdf0e10cSrcweir len = 4; 526*cdf0e10cSrcweir } 527*cdf0e10cSrcweir pad_.addEphemeral(buf, len); 528*cdf0e10cSrcweir return position; 529*cdf0e10cSrcweir } else { 530*cdf0e10cSrcweir struct EntityRef { 531*cdf0e10cSrcweir char const * inBegin; 532*cdf0e10cSrcweir sal_Int32 inLength; 533*cdf0e10cSrcweir char const * outBegin; 534*cdf0e10cSrcweir sal_Int32 outLength; 535*cdf0e10cSrcweir }; 536*cdf0e10cSrcweir static EntityRef const refs[] = { 537*cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("amp;"), 538*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("&") }, 539*cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("lt;"), 540*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("<") }, 541*cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("gt;"), 542*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(">") }, 543*cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("apos;"), 544*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("'") }, 545*cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("quot;"), 546*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("\"") } }; 547*cdf0e10cSrcweir for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { 548*cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 549*cdf0e10cSrcweir position, end - position, refs[i].inBegin, refs[i].inLength, 550*cdf0e10cSrcweir refs[i].inLength) == 551*cdf0e10cSrcweir 0) 552*cdf0e10cSrcweir { 553*cdf0e10cSrcweir position += refs[i].inLength; 554*cdf0e10cSrcweir pad_.add(refs[i].outBegin, refs[i].outLength); 555*cdf0e10cSrcweir return position; 556*cdf0e10cSrcweir } 557*cdf0e10cSrcweir } 558*cdf0e10cSrcweir throw css::uno::RuntimeException( 559*cdf0e10cSrcweir (rtl::OUString( 560*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + 561*cdf0e10cSrcweir fileUrl_), 562*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 563*cdf0e10cSrcweir } 564*cdf0e10cSrcweir } 565*cdf0e10cSrcweir 566*cdf0e10cSrcweir Span XmlReader::handleAttributeValue( 567*cdf0e10cSrcweir char const * begin, char const * end, bool fullyNormalize) 568*cdf0e10cSrcweir { 569*cdf0e10cSrcweir pad_.clear(); 570*cdf0e10cSrcweir if (fullyNormalize) { 571*cdf0e10cSrcweir while (begin != end && isSpace(*begin)) { 572*cdf0e10cSrcweir ++begin; 573*cdf0e10cSrcweir } 574*cdf0e10cSrcweir while (end != begin && isSpace(end[-1])) { 575*cdf0e10cSrcweir --end; 576*cdf0e10cSrcweir } 577*cdf0e10cSrcweir char const * p = begin; 578*cdf0e10cSrcweir enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 579*cdf0e10cSrcweir // a single true space character can go into the current span, 580*cdf0e10cSrcweir // everything else breaks the span 581*cdf0e10cSrcweir Space space = SPACE_NONE; 582*cdf0e10cSrcweir while (p != end) { 583*cdf0e10cSrcweir switch (*p) { 584*cdf0e10cSrcweir case '\x09': 585*cdf0e10cSrcweir case '\x0A': 586*cdf0e10cSrcweir case '\x0D': 587*cdf0e10cSrcweir switch (space) { 588*cdf0e10cSrcweir case SPACE_NONE: 589*cdf0e10cSrcweir pad_.add(begin, p - begin); 590*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 591*cdf0e10cSrcweir space = SPACE_BREAK; 592*cdf0e10cSrcweir break; 593*cdf0e10cSrcweir case SPACE_SPAN: 594*cdf0e10cSrcweir pad_.add(begin, p - begin); 595*cdf0e10cSrcweir space = SPACE_BREAK; 596*cdf0e10cSrcweir break; 597*cdf0e10cSrcweir case SPACE_BREAK: 598*cdf0e10cSrcweir break; 599*cdf0e10cSrcweir } 600*cdf0e10cSrcweir begin = ++p; 601*cdf0e10cSrcweir break; 602*cdf0e10cSrcweir case ' ': 603*cdf0e10cSrcweir switch (space) { 604*cdf0e10cSrcweir case SPACE_NONE: 605*cdf0e10cSrcweir ++p; 606*cdf0e10cSrcweir space = SPACE_SPAN; 607*cdf0e10cSrcweir break; 608*cdf0e10cSrcweir case SPACE_SPAN: 609*cdf0e10cSrcweir pad_.add(begin, p - begin); 610*cdf0e10cSrcweir begin = ++p; 611*cdf0e10cSrcweir space = SPACE_BREAK; 612*cdf0e10cSrcweir break; 613*cdf0e10cSrcweir case SPACE_BREAK: 614*cdf0e10cSrcweir begin = ++p; 615*cdf0e10cSrcweir break; 616*cdf0e10cSrcweir } 617*cdf0e10cSrcweir break; 618*cdf0e10cSrcweir case '&': 619*cdf0e10cSrcweir pad_.add(begin, p - begin); 620*cdf0e10cSrcweir p = handleReference(p, end); 621*cdf0e10cSrcweir begin = p; 622*cdf0e10cSrcweir space = SPACE_NONE; 623*cdf0e10cSrcweir break; 624*cdf0e10cSrcweir default: 625*cdf0e10cSrcweir ++p; 626*cdf0e10cSrcweir space = SPACE_NONE; 627*cdf0e10cSrcweir break; 628*cdf0e10cSrcweir } 629*cdf0e10cSrcweir } 630*cdf0e10cSrcweir pad_.add(begin, p - begin); 631*cdf0e10cSrcweir } else { 632*cdf0e10cSrcweir char const * p = begin; 633*cdf0e10cSrcweir while (p != end) { 634*cdf0e10cSrcweir switch (*p) { 635*cdf0e10cSrcweir case '\x09': 636*cdf0e10cSrcweir case '\x0A': 637*cdf0e10cSrcweir pad_.add(begin, p - begin); 638*cdf0e10cSrcweir begin = ++p; 639*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 640*cdf0e10cSrcweir break; 641*cdf0e10cSrcweir case '\x0D': 642*cdf0e10cSrcweir pad_.add(begin, p - begin); 643*cdf0e10cSrcweir ++p; 644*cdf0e10cSrcweir if (peek() == '\x0A') { 645*cdf0e10cSrcweir ++p; 646*cdf0e10cSrcweir } 647*cdf0e10cSrcweir begin = p; 648*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 649*cdf0e10cSrcweir break; 650*cdf0e10cSrcweir case '&': 651*cdf0e10cSrcweir pad_.add(begin, p - begin); 652*cdf0e10cSrcweir p = handleReference(p, end); 653*cdf0e10cSrcweir begin = p; 654*cdf0e10cSrcweir break; 655*cdf0e10cSrcweir default: 656*cdf0e10cSrcweir ++p; 657*cdf0e10cSrcweir break; 658*cdf0e10cSrcweir } 659*cdf0e10cSrcweir } 660*cdf0e10cSrcweir pad_.add(begin, p - begin); 661*cdf0e10cSrcweir } 662*cdf0e10cSrcweir return pad_.get(); 663*cdf0e10cSrcweir } 664*cdf0e10cSrcweir 665*cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { 666*cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName); 667*cdf0e10cSrcweir char const * nameBegin = pos_; 668*cdf0e10cSrcweir char const * nameColon = 0; 669*cdf0e10cSrcweir if (!scanName(&nameColon)) { 670*cdf0e10cSrcweir throw css::uno::RuntimeException( 671*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + 672*cdf0e10cSrcweir fileUrl_), 673*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 674*cdf0e10cSrcweir } 675*cdf0e10cSrcweir char const * nameEnd = pos_; 676*cdf0e10cSrcweir NamespaceList::size_type inheritedNamespaces = namespaces_.size(); 677*cdf0e10cSrcweir bool hasDefaultNs = false; 678*cdf0e10cSrcweir int defaultNsId = NAMESPACE_NONE; 679*cdf0e10cSrcweir attributes_.clear(); 680*cdf0e10cSrcweir for (;;) { 681*cdf0e10cSrcweir char const * p = pos_; 682*cdf0e10cSrcweir skipSpace(); 683*cdf0e10cSrcweir if (peek() == '/' || peek() == '>') { 684*cdf0e10cSrcweir break; 685*cdf0e10cSrcweir } 686*cdf0e10cSrcweir if (pos_ == p) { 687*cdf0e10cSrcweir throw css::uno::RuntimeException( 688*cdf0e10cSrcweir (rtl::OUString( 689*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 690*cdf0e10cSrcweir "missing whitespace before attribute in ")) + 691*cdf0e10cSrcweir fileUrl_), 692*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 693*cdf0e10cSrcweir } 694*cdf0e10cSrcweir char const * attrNameBegin = pos_; 695*cdf0e10cSrcweir char const * attrNameColon = 0; 696*cdf0e10cSrcweir if (!scanName(&attrNameColon)) { 697*cdf0e10cSrcweir throw css::uno::RuntimeException( 698*cdf0e10cSrcweir (rtl::OUString( 699*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + 700*cdf0e10cSrcweir fileUrl_), 701*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 702*cdf0e10cSrcweir } 703*cdf0e10cSrcweir char const * attrNameEnd = pos_; 704*cdf0e10cSrcweir skipSpace(); 705*cdf0e10cSrcweir if (read() != '=') { 706*cdf0e10cSrcweir throw css::uno::RuntimeException( 707*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + 708*cdf0e10cSrcweir fileUrl_), 709*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 710*cdf0e10cSrcweir } 711*cdf0e10cSrcweir skipSpace(); 712*cdf0e10cSrcweir char del = read(); 713*cdf0e10cSrcweir if (del != '\'' && del != '"') { 714*cdf0e10cSrcweir throw css::uno::RuntimeException( 715*cdf0e10cSrcweir (rtl::OUString( 716*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + 717*cdf0e10cSrcweir fileUrl_), 718*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 719*cdf0e10cSrcweir } 720*cdf0e10cSrcweir char const * valueBegin = pos_; 721*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); 722*cdf0e10cSrcweir if (i < 0) { 723*cdf0e10cSrcweir throw css::uno::RuntimeException( 724*cdf0e10cSrcweir (rtl::OUString( 725*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 726*cdf0e10cSrcweir "unterminated attribute value in ")) + 727*cdf0e10cSrcweir fileUrl_), 728*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 729*cdf0e10cSrcweir } 730*cdf0e10cSrcweir char const * valueEnd = pos_ + i; 731*cdf0e10cSrcweir pos_ += i + 1; 732*cdf0e10cSrcweir if (attrNameColon == 0 && 733*cdf0e10cSrcweir Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( 734*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns"))) 735*cdf0e10cSrcweir { 736*cdf0e10cSrcweir hasDefaultNs = true; 737*cdf0e10cSrcweir defaultNsId = scanNamespaceIri(valueBegin, valueEnd); 738*cdf0e10cSrcweir } else if (attrNameColon != 0 && 739*cdf0e10cSrcweir Span(attrNameBegin, attrNameColon - attrNameBegin).equals( 740*cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns"))) 741*cdf0e10cSrcweir { 742*cdf0e10cSrcweir namespaces_.push_back( 743*cdf0e10cSrcweir NamespaceData( 744*cdf0e10cSrcweir Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), 745*cdf0e10cSrcweir scanNamespaceIri(valueBegin, valueEnd))); 746*cdf0e10cSrcweir } else { 747*cdf0e10cSrcweir attributes_.push_back( 748*cdf0e10cSrcweir AttributeData( 749*cdf0e10cSrcweir attrNameBegin, attrNameEnd, attrNameColon, valueBegin, 750*cdf0e10cSrcweir valueEnd)); 751*cdf0e10cSrcweir } 752*cdf0e10cSrcweir } 753*cdf0e10cSrcweir if (!hasDefaultNs && !elements_.empty()) { 754*cdf0e10cSrcweir defaultNsId = elements_.top().defaultNamespaceId; 755*cdf0e10cSrcweir } 756*cdf0e10cSrcweir firstAttribute_ = true; 757*cdf0e10cSrcweir if (peek() == '/') { 758*cdf0e10cSrcweir state_ = STATE_EMPTY_ELEMENT_TAG; 759*cdf0e10cSrcweir ++pos_; 760*cdf0e10cSrcweir } else { 761*cdf0e10cSrcweir state_ = STATE_CONTENT; 762*cdf0e10cSrcweir } 763*cdf0e10cSrcweir if (peek() != '>') { 764*cdf0e10cSrcweir throw css::uno::RuntimeException( 765*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 766*cdf0e10cSrcweir fileUrl_), 767*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 768*cdf0e10cSrcweir } 769*cdf0e10cSrcweir ++pos_; 770*cdf0e10cSrcweir elements_.push( 771*cdf0e10cSrcweir ElementData( 772*cdf0e10cSrcweir Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, 773*cdf0e10cSrcweir defaultNsId)); 774*cdf0e10cSrcweir if (nameColon == 0) { 775*cdf0e10cSrcweir *nsId = defaultNsId; 776*cdf0e10cSrcweir *localName = Span(nameBegin, nameEnd - nameBegin); 777*cdf0e10cSrcweir } else { 778*cdf0e10cSrcweir *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); 779*cdf0e10cSrcweir *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); 780*cdf0e10cSrcweir } 781*cdf0e10cSrcweir return RESULT_BEGIN; 782*cdf0e10cSrcweir } 783*cdf0e10cSrcweir 784*cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() { 785*cdf0e10cSrcweir if (elements_.empty()) { 786*cdf0e10cSrcweir throw css::uno::RuntimeException( 787*cdf0e10cSrcweir (rtl::OUString( 788*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + 789*cdf0e10cSrcweir fileUrl_), 790*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 791*cdf0e10cSrcweir } 792*cdf0e10cSrcweir char const * nameBegin = pos_; 793*cdf0e10cSrcweir char const * nameColon = 0; 794*cdf0e10cSrcweir if (!scanName(&nameColon) || 795*cdf0e10cSrcweir !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) 796*cdf0e10cSrcweir { 797*cdf0e10cSrcweir throw css::uno::RuntimeException( 798*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + 799*cdf0e10cSrcweir fileUrl_), 800*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 801*cdf0e10cSrcweir } 802*cdf0e10cSrcweir handleElementEnd(); 803*cdf0e10cSrcweir skipSpace(); 804*cdf0e10cSrcweir if (peek() != '>') { 805*cdf0e10cSrcweir throw css::uno::RuntimeException( 806*cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 807*cdf0e10cSrcweir fileUrl_), 808*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 809*cdf0e10cSrcweir } 810*cdf0e10cSrcweir ++pos_; 811*cdf0e10cSrcweir return RESULT_END; 812*cdf0e10cSrcweir } 813*cdf0e10cSrcweir 814*cdf0e10cSrcweir void XmlReader::handleElementEnd() { 815*cdf0e10cSrcweir OSL_ASSERT(!elements_.empty()); 816*cdf0e10cSrcweir namespaces_.resize(elements_.top().inheritedNamespaces); 817*cdf0e10cSrcweir elements_.pop(); 818*cdf0e10cSrcweir state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; 819*cdf0e10cSrcweir } 820*cdf0e10cSrcweir 821*cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { 822*cdf0e10cSrcweir for (;;) { 823*cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); 824*cdf0e10cSrcweir if (i < 0) { 825*cdf0e10cSrcweir throw css::uno::RuntimeException( 826*cdf0e10cSrcweir (rtl::OUString( 827*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 828*cdf0e10cSrcweir fileUrl_), 829*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 830*cdf0e10cSrcweir } 831*cdf0e10cSrcweir pos_ += i + 1; 832*cdf0e10cSrcweir switch (peek()) { 833*cdf0e10cSrcweir case '!': 834*cdf0e10cSrcweir ++pos_; 835*cdf0e10cSrcweir if (!skipComment() && !scanCdataSection().is()) { 836*cdf0e10cSrcweir skipDocumentTypeDeclaration(); 837*cdf0e10cSrcweir } 838*cdf0e10cSrcweir break; 839*cdf0e10cSrcweir case '/': 840*cdf0e10cSrcweir ++pos_; 841*cdf0e10cSrcweir return handleEndTag(); 842*cdf0e10cSrcweir case '?': 843*cdf0e10cSrcweir ++pos_; 844*cdf0e10cSrcweir skipProcessingInstruction(); 845*cdf0e10cSrcweir break; 846*cdf0e10cSrcweir default: 847*cdf0e10cSrcweir return handleStartTag(nsId, data); 848*cdf0e10cSrcweir } 849*cdf0e10cSrcweir } 850*cdf0e10cSrcweir } 851*cdf0e10cSrcweir 852*cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) { 853*cdf0e10cSrcweir pad_.clear(); 854*cdf0e10cSrcweir for (char const * begin = pos_;;) { 855*cdf0e10cSrcweir switch (peek()) { 856*cdf0e10cSrcweir case '\0': // i.e., EOF 857*cdf0e10cSrcweir throw css::uno::RuntimeException( 858*cdf0e10cSrcweir (rtl::OUString( 859*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 860*cdf0e10cSrcweir fileUrl_), 861*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 862*cdf0e10cSrcweir case '\x0D': 863*cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 864*cdf0e10cSrcweir ++pos_; 865*cdf0e10cSrcweir if (peek() != '\x0A') { 866*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 867*cdf0e10cSrcweir } 868*cdf0e10cSrcweir begin = pos_; 869*cdf0e10cSrcweir break; 870*cdf0e10cSrcweir case '&': 871*cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 872*cdf0e10cSrcweir pos_ = handleReference(pos_, end_); 873*cdf0e10cSrcweir begin = pos_; 874*cdf0e10cSrcweir break; 875*cdf0e10cSrcweir case '<': 876*cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 877*cdf0e10cSrcweir ++pos_; 878*cdf0e10cSrcweir switch (peek()) { 879*cdf0e10cSrcweir case '!': 880*cdf0e10cSrcweir ++pos_; 881*cdf0e10cSrcweir if (!skipComment()) { 882*cdf0e10cSrcweir Span cdata(scanCdataSection()); 883*cdf0e10cSrcweir if (cdata.is()) { 884*cdf0e10cSrcweir normalizeLineEnds(cdata); 885*cdf0e10cSrcweir } else { 886*cdf0e10cSrcweir skipDocumentTypeDeclaration(); 887*cdf0e10cSrcweir } 888*cdf0e10cSrcweir } 889*cdf0e10cSrcweir begin = pos_; 890*cdf0e10cSrcweir break; 891*cdf0e10cSrcweir case '/': 892*cdf0e10cSrcweir *text = pad_.get(); 893*cdf0e10cSrcweir ++pos_; 894*cdf0e10cSrcweir state_ = STATE_END_TAG; 895*cdf0e10cSrcweir return RESULT_TEXT; 896*cdf0e10cSrcweir case '?': 897*cdf0e10cSrcweir ++pos_; 898*cdf0e10cSrcweir skipProcessingInstruction(); 899*cdf0e10cSrcweir begin = pos_; 900*cdf0e10cSrcweir break; 901*cdf0e10cSrcweir default: 902*cdf0e10cSrcweir *text = pad_.get(); 903*cdf0e10cSrcweir state_ = STATE_START_TAG; 904*cdf0e10cSrcweir return RESULT_TEXT; 905*cdf0e10cSrcweir } 906*cdf0e10cSrcweir break; 907*cdf0e10cSrcweir default: 908*cdf0e10cSrcweir ++pos_; 909*cdf0e10cSrcweir break; 910*cdf0e10cSrcweir } 911*cdf0e10cSrcweir } 912*cdf0e10cSrcweir } 913*cdf0e10cSrcweir 914*cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) { 915*cdf0e10cSrcweir pad_.clear(); 916*cdf0e10cSrcweir char const * flowBegin = pos_; 917*cdf0e10cSrcweir char const * flowEnd = pos_; 918*cdf0e10cSrcweir enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 919*cdf0e10cSrcweir // a single true space character can go into the current flow, 920*cdf0e10cSrcweir // everything else breaks the flow 921*cdf0e10cSrcweir Space space = SPACE_START; 922*cdf0e10cSrcweir for (;;) { 923*cdf0e10cSrcweir switch (peek()) { 924*cdf0e10cSrcweir case '\0': // i.e., EOF 925*cdf0e10cSrcweir throw css::uno::RuntimeException( 926*cdf0e10cSrcweir (rtl::OUString( 927*cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 928*cdf0e10cSrcweir fileUrl_), 929*cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 930*cdf0e10cSrcweir case '\x09': 931*cdf0e10cSrcweir case '\x0A': 932*cdf0e10cSrcweir case '\x0D': 933*cdf0e10cSrcweir switch (space) { 934*cdf0e10cSrcweir case SPACE_START: 935*cdf0e10cSrcweir case SPACE_BREAK: 936*cdf0e10cSrcweir break; 937*cdf0e10cSrcweir case SPACE_NONE: 938*cdf0e10cSrcweir case SPACE_SPAN: 939*cdf0e10cSrcweir space = SPACE_BREAK; 940*cdf0e10cSrcweir break; 941*cdf0e10cSrcweir } 942*cdf0e10cSrcweir ++pos_; 943*cdf0e10cSrcweir break; 944*cdf0e10cSrcweir case ' ': 945*cdf0e10cSrcweir switch (space) { 946*cdf0e10cSrcweir case SPACE_START: 947*cdf0e10cSrcweir case SPACE_BREAK: 948*cdf0e10cSrcweir break; 949*cdf0e10cSrcweir case SPACE_NONE: 950*cdf0e10cSrcweir space = SPACE_SPAN; 951*cdf0e10cSrcweir break; 952*cdf0e10cSrcweir case SPACE_SPAN: 953*cdf0e10cSrcweir space = SPACE_BREAK; 954*cdf0e10cSrcweir break; 955*cdf0e10cSrcweir } 956*cdf0e10cSrcweir ++pos_; 957*cdf0e10cSrcweir break; 958*cdf0e10cSrcweir case '&': 959*cdf0e10cSrcweir switch (space) { 960*cdf0e10cSrcweir case SPACE_START: 961*cdf0e10cSrcweir break; 962*cdf0e10cSrcweir case SPACE_NONE: 963*cdf0e10cSrcweir case SPACE_SPAN: 964*cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin); 965*cdf0e10cSrcweir break; 966*cdf0e10cSrcweir case SPACE_BREAK: 967*cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 968*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 969*cdf0e10cSrcweir break; 970*cdf0e10cSrcweir } 971*cdf0e10cSrcweir pos_ = handleReference(pos_, end_); 972*cdf0e10cSrcweir flowBegin = pos_; 973*cdf0e10cSrcweir flowEnd = pos_; 974*cdf0e10cSrcweir space = SPACE_NONE; 975*cdf0e10cSrcweir break; 976*cdf0e10cSrcweir case '<': 977*cdf0e10cSrcweir ++pos_; 978*cdf0e10cSrcweir switch (peek()) { 979*cdf0e10cSrcweir case '!': 980*cdf0e10cSrcweir ++pos_; 981*cdf0e10cSrcweir if (skipComment()) { 982*cdf0e10cSrcweir space = SPACE_BREAK; 983*cdf0e10cSrcweir } else { 984*cdf0e10cSrcweir Span cdata(scanCdataSection()); 985*cdf0e10cSrcweir if (cdata.is()) { 986*cdf0e10cSrcweir // CDATA is not normalized (similar to character 987*cdf0e10cSrcweir // references; it keeps the code simple), but it might 988*cdf0e10cSrcweir // arguably be better to normalize it: 989*cdf0e10cSrcweir switch (space) { 990*cdf0e10cSrcweir case SPACE_START: 991*cdf0e10cSrcweir break; 992*cdf0e10cSrcweir case SPACE_NONE: 993*cdf0e10cSrcweir case SPACE_SPAN: 994*cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin); 995*cdf0e10cSrcweir break; 996*cdf0e10cSrcweir case SPACE_BREAK: 997*cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 998*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 999*cdf0e10cSrcweir break; 1000*cdf0e10cSrcweir } 1001*cdf0e10cSrcweir normalizeLineEnds(cdata); 1002*cdf0e10cSrcweir flowBegin = pos_; 1003*cdf0e10cSrcweir flowEnd = pos_; 1004*cdf0e10cSrcweir space = SPACE_NONE; 1005*cdf0e10cSrcweir } else { 1006*cdf0e10cSrcweir skipDocumentTypeDeclaration(); 1007*cdf0e10cSrcweir } 1008*cdf0e10cSrcweir } 1009*cdf0e10cSrcweir break; 1010*cdf0e10cSrcweir case '/': 1011*cdf0e10cSrcweir ++pos_; 1012*cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1013*cdf0e10cSrcweir *text = pad_.get(); 1014*cdf0e10cSrcweir state_ = STATE_END_TAG; 1015*cdf0e10cSrcweir return RESULT_TEXT; 1016*cdf0e10cSrcweir case '?': 1017*cdf0e10cSrcweir ++pos_; 1018*cdf0e10cSrcweir skipProcessingInstruction(); 1019*cdf0e10cSrcweir space = SPACE_BREAK; 1020*cdf0e10cSrcweir break; 1021*cdf0e10cSrcweir default: 1022*cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1023*cdf0e10cSrcweir *text = pad_.get(); 1024*cdf0e10cSrcweir state_ = STATE_START_TAG; 1025*cdf0e10cSrcweir return RESULT_TEXT; 1026*cdf0e10cSrcweir } 1027*cdf0e10cSrcweir break; 1028*cdf0e10cSrcweir default: 1029*cdf0e10cSrcweir switch (space) { 1030*cdf0e10cSrcweir case SPACE_START: 1031*cdf0e10cSrcweir flowBegin = pos_; 1032*cdf0e10cSrcweir break; 1033*cdf0e10cSrcweir case SPACE_NONE: 1034*cdf0e10cSrcweir case SPACE_SPAN: 1035*cdf0e10cSrcweir break; 1036*cdf0e10cSrcweir case SPACE_BREAK: 1037*cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1038*cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 1039*cdf0e10cSrcweir flowBegin = pos_; 1040*cdf0e10cSrcweir break; 1041*cdf0e10cSrcweir } 1042*cdf0e10cSrcweir flowEnd = ++pos_; 1043*cdf0e10cSrcweir space = SPACE_NONE; 1044*cdf0e10cSrcweir break; 1045*cdf0e10cSrcweir } 1046*cdf0e10cSrcweir } 1047*cdf0e10cSrcweir } 1048*cdf0e10cSrcweir 1049*cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { 1050*cdf0e10cSrcweir OSL_ASSERT(pos <= INT_MAX); 1051*cdf0e10cSrcweir return static_cast< int >(pos); 1052*cdf0e10cSrcweir } 1053*cdf0e10cSrcweir 1054*cdf0e10cSrcweir } 1055