1*b725e8ebSAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3*b725e8ebSAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4*b725e8ebSAndrew Rist * or more contributor license agreements. See the NOTICE file 5*b725e8ebSAndrew Rist * distributed with this work for additional information 6*b725e8ebSAndrew Rist * regarding copyright ownership. The ASF licenses this file 7*b725e8ebSAndrew Rist * to you under the Apache License, Version 2.0 (the 8*b725e8ebSAndrew Rist * "License"); you may not use this file except in compliance 9*b725e8ebSAndrew Rist * with the License. You may obtain a copy of the License at 10cdf0e10cSrcweir * 11*b725e8ebSAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12cdf0e10cSrcweir * 13*b725e8ebSAndrew Rist * Unless required by applicable law or agreed to in writing, 14*b725e8ebSAndrew Rist * software distributed under the License is distributed on an 15*b725e8ebSAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16*b725e8ebSAndrew Rist * KIND, either express or implied. See the License for the 17*b725e8ebSAndrew Rist * specific language governing permissions and limitations 18*b725e8ebSAndrew Rist * under the License. 19cdf0e10cSrcweir * 20*b725e8ebSAndrew Rist *************************************************************/ 21*b725e8ebSAndrew Rist 22*b725e8ebSAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir #include "precompiled_xmlreader.hxx" 25cdf0e10cSrcweir #include "sal/config.h" 26cdf0e10cSrcweir 27cdf0e10cSrcweir #include <climits> 28cdf0e10cSrcweir #include <cstddef> 29cdf0e10cSrcweir 30cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp" 31cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx" 32cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp" 33cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp" 34cdf0e10cSrcweir #include "osl/diagnose.h" 35cdf0e10cSrcweir #include "osl/file.h" 36cdf0e10cSrcweir #include "rtl/string.h" 37cdf0e10cSrcweir #include "rtl/ustring.h" 38cdf0e10cSrcweir #include "rtl/ustring.hxx" 39cdf0e10cSrcweir #include "sal/types.h" 40cdf0e10cSrcweir #include "xmlreader/pad.hxx" 41cdf0e10cSrcweir #include "xmlreader/span.hxx" 42cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx" 43cdf0e10cSrcweir 44cdf0e10cSrcweir namespace xmlreader { 45cdf0e10cSrcweir 46cdf0e10cSrcweir namespace { 47cdf0e10cSrcweir 48cdf0e10cSrcweir namespace css = com::sun::star; 49cdf0e10cSrcweir 50cdf0e10cSrcweir bool isSpace(char c) { 51cdf0e10cSrcweir switch (c) { 52cdf0e10cSrcweir case '\x09': 53cdf0e10cSrcweir case '\x0A': 54cdf0e10cSrcweir case '\x0D': 55cdf0e10cSrcweir case ' ': 56cdf0e10cSrcweir return true; 57cdf0e10cSrcweir default: 58cdf0e10cSrcweir return false; 59cdf0e10cSrcweir } 60cdf0e10cSrcweir } 61cdf0e10cSrcweir 62cdf0e10cSrcweir } 63cdf0e10cSrcweir 64cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl) 65cdf0e10cSrcweir SAL_THROW(( 66cdf0e10cSrcweir css::container::NoSuchElementException, css::uno::RuntimeException)): 67cdf0e10cSrcweir fileUrl_(fileUrl) 68cdf0e10cSrcweir { 69cdf0e10cSrcweir switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) 70cdf0e10cSrcweir { 71cdf0e10cSrcweir case osl_File_E_None: 72cdf0e10cSrcweir break; 73cdf0e10cSrcweir case osl_File_E_NOENT: 74cdf0e10cSrcweir throw css::container::NoSuchElementException( 75cdf0e10cSrcweir fileUrl_, css::uno::Reference< css::uno::XInterface >()); 76cdf0e10cSrcweir default: 77cdf0e10cSrcweir throw css::uno::RuntimeException( 78cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + 79cdf0e10cSrcweir fileUrl_), 80cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 81cdf0e10cSrcweir } 82cdf0e10cSrcweir oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); 83cdf0e10cSrcweir if (e == osl_File_E_None) { 84cdf0e10cSrcweir e = osl_mapFile( 85cdf0e10cSrcweir fileHandle_, &fileAddress_, fileSize_, 0, 86cdf0e10cSrcweir osl_File_MapFlag_WillNeed); 87cdf0e10cSrcweir } 88cdf0e10cSrcweir if (e != osl_File_E_None) { 89cdf0e10cSrcweir e = osl_closeFile(fileHandle_); 90cdf0e10cSrcweir if (e != osl_File_E_None) { 91cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 92cdf0e10cSrcweir } 93cdf0e10cSrcweir throw css::uno::RuntimeException( 94cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + 95cdf0e10cSrcweir fileUrl_), 96cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 97cdf0e10cSrcweir } 98cdf0e10cSrcweir namespaceIris_.push_back( 99cdf0e10cSrcweir Span( 100cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM( 101cdf0e10cSrcweir "http://www.w3.org/XML/1998/namespace"))); 102cdf0e10cSrcweir namespaces_.push_back( 103cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); 104cdf0e10cSrcweir pos_ = static_cast< char * >(fileAddress_); 105cdf0e10cSrcweir end_ = pos_ + fileSize_; 106cdf0e10cSrcweir state_ = STATE_CONTENT; 107cdf0e10cSrcweir } 108cdf0e10cSrcweir 109cdf0e10cSrcweir XmlReader::~XmlReader() { 110cdf0e10cSrcweir oslFileError e = osl_unmapFile(fileAddress_, fileSize_); 111cdf0e10cSrcweir if (e != osl_File_E_None) { 112cdf0e10cSrcweir OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); 113cdf0e10cSrcweir } 114cdf0e10cSrcweir e = osl_closeFile(fileHandle_); 115cdf0e10cSrcweir if (e != osl_File_E_None) { 116cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 117cdf0e10cSrcweir } 118cdf0e10cSrcweir } 119cdf0e10cSrcweir 120cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) { 121cdf0e10cSrcweir int id = toNamespaceId(namespaceIris_.size()); 122cdf0e10cSrcweir namespaceIris_.push_back(iri); 123cdf0e10cSrcweir if (iri.equals( 124cdf0e10cSrcweir Span( 125cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM( 126cdf0e10cSrcweir "http://www.w3.org/2001/XMLSchema-instance")))) 127cdf0e10cSrcweir { 128cdf0e10cSrcweir // Old user layer .xcu files used the xsi namespace prefix without 129cdf0e10cSrcweir // declaring a corresponding namespace binding, see issue 77174; reading 130cdf0e10cSrcweir // those files during migration would fail without this hack that can be 131cdf0e10cSrcweir // removed once migration is no longer relevant (see 132cdf0e10cSrcweir // configmgr::Components::parseModificationLayer): 133cdf0e10cSrcweir namespaces_.push_back( 134cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); 135cdf0e10cSrcweir } 136cdf0e10cSrcweir return id; 137cdf0e10cSrcweir } 138cdf0e10cSrcweir 139cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) 140cdf0e10cSrcweir { 141cdf0e10cSrcweir switch (state_) { 142cdf0e10cSrcweir case STATE_CONTENT: 143cdf0e10cSrcweir switch (reportText) { 144cdf0e10cSrcweir case TEXT_NONE: 145cdf0e10cSrcweir return handleSkippedText(data, nsId); 146cdf0e10cSrcweir case TEXT_RAW: 147cdf0e10cSrcweir return handleRawText(data); 148cdf0e10cSrcweir case TEXT_NORMALIZED: 149cdf0e10cSrcweir return handleNormalizedText(data); 150cdf0e10cSrcweir } 151cdf0e10cSrcweir case STATE_START_TAG: 152cdf0e10cSrcweir return handleStartTag(nsId, data); 153cdf0e10cSrcweir case STATE_END_TAG: 154cdf0e10cSrcweir return handleEndTag(); 155cdf0e10cSrcweir case STATE_EMPTY_ELEMENT_TAG: 156cdf0e10cSrcweir handleElementEnd(); 157cdf0e10cSrcweir return RESULT_END; 158cdf0e10cSrcweir default: // STATE_DONE 159cdf0e10cSrcweir return RESULT_DONE; 160cdf0e10cSrcweir } 161cdf0e10cSrcweir } 162cdf0e10cSrcweir 163cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) { 164cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName != 0); 165cdf0e10cSrcweir if (firstAttribute_) { 166cdf0e10cSrcweir currentAttribute_ = attributes_.begin(); 167cdf0e10cSrcweir firstAttribute_ = false; 168cdf0e10cSrcweir } else { 169cdf0e10cSrcweir ++currentAttribute_; 170cdf0e10cSrcweir } 171cdf0e10cSrcweir if (currentAttribute_ == attributes_.end()) { 172cdf0e10cSrcweir return false; 173cdf0e10cSrcweir } 174cdf0e10cSrcweir if (currentAttribute_->nameColon == 0) { 175cdf0e10cSrcweir *nsId = NAMESPACE_NONE; 176cdf0e10cSrcweir *localName = Span( 177cdf0e10cSrcweir currentAttribute_->nameBegin, 178cdf0e10cSrcweir currentAttribute_->nameEnd - currentAttribute_->nameBegin); 179cdf0e10cSrcweir } else { 180cdf0e10cSrcweir *nsId = getNamespaceId( 181cdf0e10cSrcweir Span( 182cdf0e10cSrcweir currentAttribute_->nameBegin, 183cdf0e10cSrcweir currentAttribute_->nameColon - currentAttribute_->nameBegin)); 184cdf0e10cSrcweir *localName = Span( 185cdf0e10cSrcweir currentAttribute_->nameColon + 1, 186cdf0e10cSrcweir currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); 187cdf0e10cSrcweir } 188cdf0e10cSrcweir return true; 189cdf0e10cSrcweir } 190cdf0e10cSrcweir 191cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) { 192cdf0e10cSrcweir return handleAttributeValue( 193cdf0e10cSrcweir currentAttribute_->valueBegin, currentAttribute_->valueEnd, 194cdf0e10cSrcweir fullyNormalize); 195cdf0e10cSrcweir } 196cdf0e10cSrcweir 197cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const { 198cdf0e10cSrcweir for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); 199cdf0e10cSrcweir i != namespaces_.rend(); ++i) 200cdf0e10cSrcweir { 201cdf0e10cSrcweir if (prefix.equals(i->prefix)) { 202cdf0e10cSrcweir return i->nsId; 203cdf0e10cSrcweir } 204cdf0e10cSrcweir } 205cdf0e10cSrcweir return NAMESPACE_UNKNOWN; 206cdf0e10cSrcweir } 207cdf0e10cSrcweir 208cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const { 209cdf0e10cSrcweir return fileUrl_; 210cdf0e10cSrcweir } 211cdf0e10cSrcweir 212cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) { 213cdf0e10cSrcweir char const * p = text.begin; 214cdf0e10cSrcweir sal_Int32 n = text.length; 215cdf0e10cSrcweir for (;;) { 216cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); 217cdf0e10cSrcweir if (i < 0) { 218cdf0e10cSrcweir break; 219cdf0e10cSrcweir } 220cdf0e10cSrcweir pad_.add(p, i); 221cdf0e10cSrcweir p += i + 1; 222cdf0e10cSrcweir n -= i + 1; 223cdf0e10cSrcweir if (n == 0 || *p != '\x0A') { 224cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 225cdf0e10cSrcweir } 226cdf0e10cSrcweir } 227cdf0e10cSrcweir pad_.add(p, n); 228cdf0e10cSrcweir } 229cdf0e10cSrcweir 230cdf0e10cSrcweir void XmlReader::skipSpace() { 231cdf0e10cSrcweir while (isSpace(peek())) { 232cdf0e10cSrcweir ++pos_; 233cdf0e10cSrcweir } 234cdf0e10cSrcweir } 235cdf0e10cSrcweir 236cdf0e10cSrcweir bool XmlReader::skipComment() { 237cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 238cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), 239cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("--")) != 240cdf0e10cSrcweir 0) 241cdf0e10cSrcweir { 242cdf0e10cSrcweir return false; 243cdf0e10cSrcweir } 244cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("--"); 245cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 246cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); 247cdf0e10cSrcweir if (i < 0) { 248cdf0e10cSrcweir throw css::uno::RuntimeException( 249cdf0e10cSrcweir (rtl::OUString( 250cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 251cdf0e10cSrcweir "premature end (within comment) of ")) + 252cdf0e10cSrcweir fileUrl_), 253cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 254cdf0e10cSrcweir } 255cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("--"); 256cdf0e10cSrcweir if (read() != '>') { 257cdf0e10cSrcweir throw css::uno::RuntimeException( 258cdf0e10cSrcweir (rtl::OUString( 259cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 260cdf0e10cSrcweir "illegal \"--\" within comment in ")) + 261cdf0e10cSrcweir fileUrl_), 262cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 263cdf0e10cSrcweir } 264cdf0e10cSrcweir return true; 265cdf0e10cSrcweir } 266cdf0e10cSrcweir 267cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() { 268cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 269cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); 270cdf0e10cSrcweir if (i < 0) { 271cdf0e10cSrcweir throw css::uno::RuntimeException( 272cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + 273cdf0e10cSrcweir fileUrl_), 274cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 275cdf0e10cSrcweir } 276cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("?>"); 277cdf0e10cSrcweir } 278cdf0e10cSrcweir 279cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() { 280cdf0e10cSrcweir // Neither is it checked that the doctypedecl is at the correct position in 281cdf0e10cSrcweir // the document, nor that it is well-formed: 282cdf0e10cSrcweir for (;;) { 283cdf0e10cSrcweir char c = read(); 284cdf0e10cSrcweir switch (c) { 285cdf0e10cSrcweir case '\0': // i.e., EOF 286cdf0e10cSrcweir throw css::uno::RuntimeException( 287cdf0e10cSrcweir (rtl::OUString( 288cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 289cdf0e10cSrcweir "premature end (within DTD) of ")) + 290cdf0e10cSrcweir fileUrl_), 291cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 292cdf0e10cSrcweir case '"': 293cdf0e10cSrcweir case '\'': 294cdf0e10cSrcweir { 295cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength( 296cdf0e10cSrcweir pos_, end_ - pos_, c); 297cdf0e10cSrcweir if (i < 0) { 298cdf0e10cSrcweir throw css::uno::RuntimeException( 299cdf0e10cSrcweir (rtl::OUString( 300cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 301cdf0e10cSrcweir "premature end (within DTD) of ")) + 302cdf0e10cSrcweir fileUrl_), 303cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 304cdf0e10cSrcweir } 305cdf0e10cSrcweir pos_ += i + 1; 306cdf0e10cSrcweir } 307cdf0e10cSrcweir break; 308cdf0e10cSrcweir case '>': 309cdf0e10cSrcweir return; 310cdf0e10cSrcweir case '[': 311cdf0e10cSrcweir for (;;) { 312cdf0e10cSrcweir c = read(); 313cdf0e10cSrcweir switch (c) { 314cdf0e10cSrcweir case '\0': // i.e., EOF 315cdf0e10cSrcweir throw css::uno::RuntimeException( 316cdf0e10cSrcweir (rtl::OUString( 317cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 318cdf0e10cSrcweir "premature end (within DTD) of ")) + 319cdf0e10cSrcweir fileUrl_), 320cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 321cdf0e10cSrcweir case '"': 322cdf0e10cSrcweir case '\'': 323cdf0e10cSrcweir { 324cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength( 325cdf0e10cSrcweir pos_, end_ - pos_, c); 326cdf0e10cSrcweir if (i < 0) { 327cdf0e10cSrcweir throw css::uno::RuntimeException( 328cdf0e10cSrcweir (rtl::OUString( 329cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 330cdf0e10cSrcweir "premature end (within DTD) of ")) + 331cdf0e10cSrcweir fileUrl_), 332cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 333cdf0e10cSrcweir } 334cdf0e10cSrcweir pos_ += i + 1; 335cdf0e10cSrcweir } 336cdf0e10cSrcweir break; 337cdf0e10cSrcweir case '<': 338cdf0e10cSrcweir switch (read()) { 339cdf0e10cSrcweir case '\0': // i.e., EOF 340cdf0e10cSrcweir throw css::uno::RuntimeException( 341cdf0e10cSrcweir (rtl::OUString( 342cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 343cdf0e10cSrcweir "premature end (within DTD) of ")) + 344cdf0e10cSrcweir fileUrl_), 345cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 346cdf0e10cSrcweir case '!': 347cdf0e10cSrcweir skipComment(); 348cdf0e10cSrcweir break; 349cdf0e10cSrcweir case '?': 350cdf0e10cSrcweir skipProcessingInstruction(); 351cdf0e10cSrcweir break; 352cdf0e10cSrcweir default: 353cdf0e10cSrcweir break; 354cdf0e10cSrcweir } 355cdf0e10cSrcweir break; 356cdf0e10cSrcweir case ']': 357cdf0e10cSrcweir skipSpace(); 358cdf0e10cSrcweir if (read() != '>') { 359cdf0e10cSrcweir throw css::uno::RuntimeException( 360cdf0e10cSrcweir (rtl::OUString( 361cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 362cdf0e10cSrcweir "missing \">\" of DTD in ")) + 363cdf0e10cSrcweir fileUrl_), 364cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 365cdf0e10cSrcweir } 366cdf0e10cSrcweir return; 367cdf0e10cSrcweir default: 368cdf0e10cSrcweir break; 369cdf0e10cSrcweir } 370cdf0e10cSrcweir } 371cdf0e10cSrcweir default: 372cdf0e10cSrcweir break; 373cdf0e10cSrcweir } 374cdf0e10cSrcweir } 375cdf0e10cSrcweir } 376cdf0e10cSrcweir 377cdf0e10cSrcweir Span XmlReader::scanCdataSection() { 378cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 379cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), 380cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("[CDATA[")) != 381cdf0e10cSrcweir 0) 382cdf0e10cSrcweir { 383cdf0e10cSrcweir return Span(); 384cdf0e10cSrcweir } 385cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); 386cdf0e10cSrcweir char const * begin = pos_; 387cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength( 388cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); 389cdf0e10cSrcweir if (i < 0) { 390cdf0e10cSrcweir throw css::uno::RuntimeException( 391cdf0e10cSrcweir (rtl::OUString( 392cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 393cdf0e10cSrcweir "premature end (within CDATA section) of ")) + 394cdf0e10cSrcweir fileUrl_), 395cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 396cdf0e10cSrcweir } 397cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); 398cdf0e10cSrcweir return Span(begin, i); 399cdf0e10cSrcweir } 400cdf0e10cSrcweir 401cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) { 402cdf0e10cSrcweir OSL_ASSERT(nameColon != 0 && *nameColon == 0); 403cdf0e10cSrcweir for (char const * begin = pos_;; ++pos_) { 404cdf0e10cSrcweir switch (peek()) { 405cdf0e10cSrcweir case '\0': // i.e., EOF 406cdf0e10cSrcweir case '\x09': 407cdf0e10cSrcweir case '\x0A': 408cdf0e10cSrcweir case '\x0D': 409cdf0e10cSrcweir case ' ': 410cdf0e10cSrcweir case '/': 411cdf0e10cSrcweir case '=': 412cdf0e10cSrcweir case '>': 413cdf0e10cSrcweir return pos_ != begin; 414cdf0e10cSrcweir case ':': 415cdf0e10cSrcweir *nameColon = pos_; 416cdf0e10cSrcweir break; 417cdf0e10cSrcweir default: 418cdf0e10cSrcweir break; 419cdf0e10cSrcweir } 420cdf0e10cSrcweir } 421cdf0e10cSrcweir } 422cdf0e10cSrcweir 423cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) { 424cdf0e10cSrcweir OSL_ASSERT(begin != 0 && begin <= end); 425cdf0e10cSrcweir Span iri(handleAttributeValue(begin, end, false)); 426cdf0e10cSrcweir for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { 427cdf0e10cSrcweir if (namespaceIris_[i].equals(iri)) { 428cdf0e10cSrcweir return toNamespaceId(i); 429cdf0e10cSrcweir } 430cdf0e10cSrcweir } 431cdf0e10cSrcweir return XmlReader::NAMESPACE_UNKNOWN; 432cdf0e10cSrcweir } 433cdf0e10cSrcweir 434cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end) 435cdf0e10cSrcweir { 436cdf0e10cSrcweir OSL_ASSERT(position != 0 && *position == '&' && position < end); 437cdf0e10cSrcweir ++position; 438cdf0e10cSrcweir if (*position == '#') { 439cdf0e10cSrcweir ++position; 440cdf0e10cSrcweir sal_Int32 val = 0; 441cdf0e10cSrcweir char const * p; 442cdf0e10cSrcweir if (*position == 'x') { 443cdf0e10cSrcweir ++position; 444cdf0e10cSrcweir p = position; 445cdf0e10cSrcweir for (;; ++position) { 446cdf0e10cSrcweir char c = *position; 447cdf0e10cSrcweir if (c >= '0' && c <= '9') { 448cdf0e10cSrcweir val = 16 * val + (c - '0'); 449cdf0e10cSrcweir } else if (c >= 'A' && c <= 'F') { 450cdf0e10cSrcweir val = 16 * val + (c - 'A') + 10; 451cdf0e10cSrcweir } else if (c >= 'a' && c <= 'f') { 452cdf0e10cSrcweir val = 16 * val + (c - 'a') + 10; 453cdf0e10cSrcweir } else { 454cdf0e10cSrcweir break; 455cdf0e10cSrcweir } 456cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow 457cdf0e10cSrcweir throw css::uno::RuntimeException( 458cdf0e10cSrcweir (rtl::OUString( 459cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 460cdf0e10cSrcweir "'&#x...' too large in ")) + 461cdf0e10cSrcweir fileUrl_), 462cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 463cdf0e10cSrcweir } 464cdf0e10cSrcweir } 465cdf0e10cSrcweir } else { 466cdf0e10cSrcweir p = position; 467cdf0e10cSrcweir for (;; ++position) { 468cdf0e10cSrcweir char c = *position; 469cdf0e10cSrcweir if (c >= '0' && c <= '9') { 470cdf0e10cSrcweir val = 10 * val + (c - '0'); 471cdf0e10cSrcweir } else { 472cdf0e10cSrcweir break; 473cdf0e10cSrcweir } 474cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow 475cdf0e10cSrcweir throw css::uno::RuntimeException( 476cdf0e10cSrcweir (rtl::OUString( 477cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 478cdf0e10cSrcweir "'&#...' too large in ")) + 479cdf0e10cSrcweir fileUrl_), 480cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 481cdf0e10cSrcweir } 482cdf0e10cSrcweir } 483cdf0e10cSrcweir } 484cdf0e10cSrcweir if (position == p || *position++ != ';') { 485cdf0e10cSrcweir throw css::uno::RuntimeException( 486cdf0e10cSrcweir (rtl::OUString( 487cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + 488cdf0e10cSrcweir fileUrl_), 489cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 490cdf0e10cSrcweir } 491cdf0e10cSrcweir OSL_ASSERT(val >= 0 && val <= 0x10FFFF); 492cdf0e10cSrcweir if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || 493cdf0e10cSrcweir (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) 494cdf0e10cSrcweir { 495cdf0e10cSrcweir throw css::uno::RuntimeException( 496cdf0e10cSrcweir (rtl::OUString( 497cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 498cdf0e10cSrcweir "character reference denoting invalid character in ")) + 499cdf0e10cSrcweir fileUrl_), 500cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 501cdf0e10cSrcweir } 502cdf0e10cSrcweir char buf[4]; 503cdf0e10cSrcweir sal_Int32 len; 504cdf0e10cSrcweir if (val < 0x80) { 505cdf0e10cSrcweir buf[0] = static_cast< char >(val); 506cdf0e10cSrcweir len = 1; 507cdf0e10cSrcweir } else if (val < 0x800) { 508cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 6) | 0xC0); 509cdf0e10cSrcweir buf[1] = static_cast< char >((val & 0x3F) | 0x80); 510cdf0e10cSrcweir len = 2; 511cdf0e10cSrcweir } else if (val < 0x10000) { 512cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 12) | 0xE0); 513cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 514cdf0e10cSrcweir buf[2] = static_cast< char >((val & 0x3F) | 0x80); 515cdf0e10cSrcweir len = 3; 516cdf0e10cSrcweir } else { 517cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 18) | 0xF0); 518cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); 519cdf0e10cSrcweir buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 520cdf0e10cSrcweir buf[3] = static_cast< char >((val & 0x3F) | 0x80); 521cdf0e10cSrcweir len = 4; 522cdf0e10cSrcweir } 523cdf0e10cSrcweir pad_.addEphemeral(buf, len); 524cdf0e10cSrcweir return position; 525cdf0e10cSrcweir } else { 526cdf0e10cSrcweir struct EntityRef { 527cdf0e10cSrcweir char const * inBegin; 528cdf0e10cSrcweir sal_Int32 inLength; 529cdf0e10cSrcweir char const * outBegin; 530cdf0e10cSrcweir sal_Int32 outLength; 531cdf0e10cSrcweir }; 532cdf0e10cSrcweir static EntityRef const refs[] = { 533cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("amp;"), 534cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("&") }, 535cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("lt;"), 536cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("<") }, 537cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("gt;"), 538cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(">") }, 539cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("apos;"), 540cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("'") }, 541cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("quot;"), 542cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("\"") } }; 543cdf0e10cSrcweir for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { 544cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength( 545cdf0e10cSrcweir position, end - position, refs[i].inBegin, refs[i].inLength, 546cdf0e10cSrcweir refs[i].inLength) == 547cdf0e10cSrcweir 0) 548cdf0e10cSrcweir { 549cdf0e10cSrcweir position += refs[i].inLength; 550cdf0e10cSrcweir pad_.add(refs[i].outBegin, refs[i].outLength); 551cdf0e10cSrcweir return position; 552cdf0e10cSrcweir } 553cdf0e10cSrcweir } 554cdf0e10cSrcweir throw css::uno::RuntimeException( 555cdf0e10cSrcweir (rtl::OUString( 556cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + 557cdf0e10cSrcweir fileUrl_), 558cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 559cdf0e10cSrcweir } 560cdf0e10cSrcweir } 561cdf0e10cSrcweir 562cdf0e10cSrcweir Span XmlReader::handleAttributeValue( 563cdf0e10cSrcweir char const * begin, char const * end, bool fullyNormalize) 564cdf0e10cSrcweir { 565cdf0e10cSrcweir pad_.clear(); 566cdf0e10cSrcweir if (fullyNormalize) { 567cdf0e10cSrcweir while (begin != end && isSpace(*begin)) { 568cdf0e10cSrcweir ++begin; 569cdf0e10cSrcweir } 570cdf0e10cSrcweir while (end != begin && isSpace(end[-1])) { 571cdf0e10cSrcweir --end; 572cdf0e10cSrcweir } 573cdf0e10cSrcweir char const * p = begin; 574cdf0e10cSrcweir enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 575cdf0e10cSrcweir // a single true space character can go into the current span, 576cdf0e10cSrcweir // everything else breaks the span 577cdf0e10cSrcweir Space space = SPACE_NONE; 578cdf0e10cSrcweir while (p != end) { 579cdf0e10cSrcweir switch (*p) { 580cdf0e10cSrcweir case '\x09': 581cdf0e10cSrcweir case '\x0A': 582cdf0e10cSrcweir case '\x0D': 583cdf0e10cSrcweir switch (space) { 584cdf0e10cSrcweir case SPACE_NONE: 585cdf0e10cSrcweir pad_.add(begin, p - begin); 586cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 587cdf0e10cSrcweir space = SPACE_BREAK; 588cdf0e10cSrcweir break; 589cdf0e10cSrcweir case SPACE_SPAN: 590cdf0e10cSrcweir pad_.add(begin, p - begin); 591cdf0e10cSrcweir space = SPACE_BREAK; 592cdf0e10cSrcweir break; 593cdf0e10cSrcweir case SPACE_BREAK: 594cdf0e10cSrcweir break; 595cdf0e10cSrcweir } 596cdf0e10cSrcweir begin = ++p; 597cdf0e10cSrcweir break; 598cdf0e10cSrcweir case ' ': 599cdf0e10cSrcweir switch (space) { 600cdf0e10cSrcweir case SPACE_NONE: 601cdf0e10cSrcweir ++p; 602cdf0e10cSrcweir space = SPACE_SPAN; 603cdf0e10cSrcweir break; 604cdf0e10cSrcweir case SPACE_SPAN: 605cdf0e10cSrcweir pad_.add(begin, p - begin); 606cdf0e10cSrcweir begin = ++p; 607cdf0e10cSrcweir space = SPACE_BREAK; 608cdf0e10cSrcweir break; 609cdf0e10cSrcweir case SPACE_BREAK: 610cdf0e10cSrcweir begin = ++p; 611cdf0e10cSrcweir break; 612cdf0e10cSrcweir } 613cdf0e10cSrcweir break; 614cdf0e10cSrcweir case '&': 615cdf0e10cSrcweir pad_.add(begin, p - begin); 616cdf0e10cSrcweir p = handleReference(p, end); 617cdf0e10cSrcweir begin = p; 618cdf0e10cSrcweir space = SPACE_NONE; 619cdf0e10cSrcweir break; 620cdf0e10cSrcweir default: 621cdf0e10cSrcweir ++p; 622cdf0e10cSrcweir space = SPACE_NONE; 623cdf0e10cSrcweir break; 624cdf0e10cSrcweir } 625cdf0e10cSrcweir } 626cdf0e10cSrcweir pad_.add(begin, p - begin); 627cdf0e10cSrcweir } else { 628cdf0e10cSrcweir char const * p = begin; 629cdf0e10cSrcweir while (p != end) { 630cdf0e10cSrcweir switch (*p) { 631cdf0e10cSrcweir case '\x09': 632cdf0e10cSrcweir case '\x0A': 633cdf0e10cSrcweir pad_.add(begin, p - begin); 634cdf0e10cSrcweir begin = ++p; 635cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 636cdf0e10cSrcweir break; 637cdf0e10cSrcweir case '\x0D': 638cdf0e10cSrcweir pad_.add(begin, p - begin); 639cdf0e10cSrcweir ++p; 640cdf0e10cSrcweir if (peek() == '\x0A') { 641cdf0e10cSrcweir ++p; 642cdf0e10cSrcweir } 643cdf0e10cSrcweir begin = p; 644cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 645cdf0e10cSrcweir break; 646cdf0e10cSrcweir case '&': 647cdf0e10cSrcweir pad_.add(begin, p - begin); 648cdf0e10cSrcweir p = handleReference(p, end); 649cdf0e10cSrcweir begin = p; 650cdf0e10cSrcweir break; 651cdf0e10cSrcweir default: 652cdf0e10cSrcweir ++p; 653cdf0e10cSrcweir break; 654cdf0e10cSrcweir } 655cdf0e10cSrcweir } 656cdf0e10cSrcweir pad_.add(begin, p - begin); 657cdf0e10cSrcweir } 658cdf0e10cSrcweir return pad_.get(); 659cdf0e10cSrcweir } 660cdf0e10cSrcweir 661cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { 662cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName); 663cdf0e10cSrcweir char const * nameBegin = pos_; 664cdf0e10cSrcweir char const * nameColon = 0; 665cdf0e10cSrcweir if (!scanName(&nameColon)) { 666cdf0e10cSrcweir throw css::uno::RuntimeException( 667cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + 668cdf0e10cSrcweir fileUrl_), 669cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 670cdf0e10cSrcweir } 671cdf0e10cSrcweir char const * nameEnd = pos_; 672cdf0e10cSrcweir NamespaceList::size_type inheritedNamespaces = namespaces_.size(); 673cdf0e10cSrcweir bool hasDefaultNs = false; 674cdf0e10cSrcweir int defaultNsId = NAMESPACE_NONE; 675cdf0e10cSrcweir attributes_.clear(); 676cdf0e10cSrcweir for (;;) { 677cdf0e10cSrcweir char const * p = pos_; 678cdf0e10cSrcweir skipSpace(); 679cdf0e10cSrcweir if (peek() == '/' || peek() == '>') { 680cdf0e10cSrcweir break; 681cdf0e10cSrcweir } 682cdf0e10cSrcweir if (pos_ == p) { 683cdf0e10cSrcweir throw css::uno::RuntimeException( 684cdf0e10cSrcweir (rtl::OUString( 685cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 686cdf0e10cSrcweir "missing whitespace before attribute in ")) + 687cdf0e10cSrcweir fileUrl_), 688cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 689cdf0e10cSrcweir } 690cdf0e10cSrcweir char const * attrNameBegin = pos_; 691cdf0e10cSrcweir char const * attrNameColon = 0; 692cdf0e10cSrcweir if (!scanName(&attrNameColon)) { 693cdf0e10cSrcweir throw css::uno::RuntimeException( 694cdf0e10cSrcweir (rtl::OUString( 695cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + 696cdf0e10cSrcweir fileUrl_), 697cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 698cdf0e10cSrcweir } 699cdf0e10cSrcweir char const * attrNameEnd = pos_; 700cdf0e10cSrcweir skipSpace(); 701cdf0e10cSrcweir if (read() != '=') { 702cdf0e10cSrcweir throw css::uno::RuntimeException( 703cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + 704cdf0e10cSrcweir fileUrl_), 705cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 706cdf0e10cSrcweir } 707cdf0e10cSrcweir skipSpace(); 708cdf0e10cSrcweir char del = read(); 709cdf0e10cSrcweir if (del != '\'' && del != '"') { 710cdf0e10cSrcweir throw css::uno::RuntimeException( 711cdf0e10cSrcweir (rtl::OUString( 712cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + 713cdf0e10cSrcweir fileUrl_), 714cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 715cdf0e10cSrcweir } 716cdf0e10cSrcweir char const * valueBegin = pos_; 717cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); 718cdf0e10cSrcweir if (i < 0) { 719cdf0e10cSrcweir throw css::uno::RuntimeException( 720cdf0e10cSrcweir (rtl::OUString( 721cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM( 722cdf0e10cSrcweir "unterminated attribute value in ")) + 723cdf0e10cSrcweir fileUrl_), 724cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 725cdf0e10cSrcweir } 726cdf0e10cSrcweir char const * valueEnd = pos_ + i; 727cdf0e10cSrcweir pos_ += i + 1; 728cdf0e10cSrcweir if (attrNameColon == 0 && 729cdf0e10cSrcweir Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( 730cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns"))) 731cdf0e10cSrcweir { 732cdf0e10cSrcweir hasDefaultNs = true; 733cdf0e10cSrcweir defaultNsId = scanNamespaceIri(valueBegin, valueEnd); 734cdf0e10cSrcweir } else if (attrNameColon != 0 && 735cdf0e10cSrcweir Span(attrNameBegin, attrNameColon - attrNameBegin).equals( 736cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns"))) 737cdf0e10cSrcweir { 738cdf0e10cSrcweir namespaces_.push_back( 739cdf0e10cSrcweir NamespaceData( 740cdf0e10cSrcweir Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), 741cdf0e10cSrcweir scanNamespaceIri(valueBegin, valueEnd))); 742cdf0e10cSrcweir } else { 743cdf0e10cSrcweir attributes_.push_back( 744cdf0e10cSrcweir AttributeData( 745cdf0e10cSrcweir attrNameBegin, attrNameEnd, attrNameColon, valueBegin, 746cdf0e10cSrcweir valueEnd)); 747cdf0e10cSrcweir } 748cdf0e10cSrcweir } 749cdf0e10cSrcweir if (!hasDefaultNs && !elements_.empty()) { 750cdf0e10cSrcweir defaultNsId = elements_.top().defaultNamespaceId; 751cdf0e10cSrcweir } 752cdf0e10cSrcweir firstAttribute_ = true; 753cdf0e10cSrcweir if (peek() == '/') { 754cdf0e10cSrcweir state_ = STATE_EMPTY_ELEMENT_TAG; 755cdf0e10cSrcweir ++pos_; 756cdf0e10cSrcweir } else { 757cdf0e10cSrcweir state_ = STATE_CONTENT; 758cdf0e10cSrcweir } 759cdf0e10cSrcweir if (peek() != '>') { 760cdf0e10cSrcweir throw css::uno::RuntimeException( 761cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 762cdf0e10cSrcweir fileUrl_), 763cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 764cdf0e10cSrcweir } 765cdf0e10cSrcweir ++pos_; 766cdf0e10cSrcweir elements_.push( 767cdf0e10cSrcweir ElementData( 768cdf0e10cSrcweir Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, 769cdf0e10cSrcweir defaultNsId)); 770cdf0e10cSrcweir if (nameColon == 0) { 771cdf0e10cSrcweir *nsId = defaultNsId; 772cdf0e10cSrcweir *localName = Span(nameBegin, nameEnd - nameBegin); 773cdf0e10cSrcweir } else { 774cdf0e10cSrcweir *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); 775cdf0e10cSrcweir *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); 776cdf0e10cSrcweir } 777cdf0e10cSrcweir return RESULT_BEGIN; 778cdf0e10cSrcweir } 779cdf0e10cSrcweir 780cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() { 781cdf0e10cSrcweir if (elements_.empty()) { 782cdf0e10cSrcweir throw css::uno::RuntimeException( 783cdf0e10cSrcweir (rtl::OUString( 784cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + 785cdf0e10cSrcweir fileUrl_), 786cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 787cdf0e10cSrcweir } 788cdf0e10cSrcweir char const * nameBegin = pos_; 789cdf0e10cSrcweir char const * nameColon = 0; 790cdf0e10cSrcweir if (!scanName(&nameColon) || 791cdf0e10cSrcweir !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) 792cdf0e10cSrcweir { 793cdf0e10cSrcweir throw css::uno::RuntimeException( 794cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + 795cdf0e10cSrcweir fileUrl_), 796cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 797cdf0e10cSrcweir } 798cdf0e10cSrcweir handleElementEnd(); 799cdf0e10cSrcweir skipSpace(); 800cdf0e10cSrcweir if (peek() != '>') { 801cdf0e10cSrcweir throw css::uno::RuntimeException( 802cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 803cdf0e10cSrcweir fileUrl_), 804cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 805cdf0e10cSrcweir } 806cdf0e10cSrcweir ++pos_; 807cdf0e10cSrcweir return RESULT_END; 808cdf0e10cSrcweir } 809cdf0e10cSrcweir 810cdf0e10cSrcweir void XmlReader::handleElementEnd() { 811cdf0e10cSrcweir OSL_ASSERT(!elements_.empty()); 812cdf0e10cSrcweir namespaces_.resize(elements_.top().inheritedNamespaces); 813cdf0e10cSrcweir elements_.pop(); 814cdf0e10cSrcweir state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; 815cdf0e10cSrcweir } 816cdf0e10cSrcweir 817cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { 818cdf0e10cSrcweir for (;;) { 819cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); 820cdf0e10cSrcweir if (i < 0) { 821cdf0e10cSrcweir throw css::uno::RuntimeException( 822cdf0e10cSrcweir (rtl::OUString( 823cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 824cdf0e10cSrcweir fileUrl_), 825cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 826cdf0e10cSrcweir } 827cdf0e10cSrcweir pos_ += i + 1; 828cdf0e10cSrcweir switch (peek()) { 829cdf0e10cSrcweir case '!': 830cdf0e10cSrcweir ++pos_; 831cdf0e10cSrcweir if (!skipComment() && !scanCdataSection().is()) { 832cdf0e10cSrcweir skipDocumentTypeDeclaration(); 833cdf0e10cSrcweir } 834cdf0e10cSrcweir break; 835cdf0e10cSrcweir case '/': 836cdf0e10cSrcweir ++pos_; 837cdf0e10cSrcweir return handleEndTag(); 838cdf0e10cSrcweir case '?': 839cdf0e10cSrcweir ++pos_; 840cdf0e10cSrcweir skipProcessingInstruction(); 841cdf0e10cSrcweir break; 842cdf0e10cSrcweir default: 843cdf0e10cSrcweir return handleStartTag(nsId, data); 844cdf0e10cSrcweir } 845cdf0e10cSrcweir } 846cdf0e10cSrcweir } 847cdf0e10cSrcweir 848cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) { 849cdf0e10cSrcweir pad_.clear(); 850cdf0e10cSrcweir for (char const * begin = pos_;;) { 851cdf0e10cSrcweir switch (peek()) { 852cdf0e10cSrcweir case '\0': // i.e., EOF 853cdf0e10cSrcweir throw css::uno::RuntimeException( 854cdf0e10cSrcweir (rtl::OUString( 855cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 856cdf0e10cSrcweir fileUrl_), 857cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 858cdf0e10cSrcweir case '\x0D': 859cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 860cdf0e10cSrcweir ++pos_; 861cdf0e10cSrcweir if (peek() != '\x0A') { 862cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 863cdf0e10cSrcweir } 864cdf0e10cSrcweir begin = pos_; 865cdf0e10cSrcweir break; 866cdf0e10cSrcweir case '&': 867cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 868cdf0e10cSrcweir pos_ = handleReference(pos_, end_); 869cdf0e10cSrcweir begin = pos_; 870cdf0e10cSrcweir break; 871cdf0e10cSrcweir case '<': 872cdf0e10cSrcweir pad_.add(begin, pos_ - begin); 873cdf0e10cSrcweir ++pos_; 874cdf0e10cSrcweir switch (peek()) { 875cdf0e10cSrcweir case '!': 876cdf0e10cSrcweir ++pos_; 877cdf0e10cSrcweir if (!skipComment()) { 878cdf0e10cSrcweir Span cdata(scanCdataSection()); 879cdf0e10cSrcweir if (cdata.is()) { 880cdf0e10cSrcweir normalizeLineEnds(cdata); 881cdf0e10cSrcweir } else { 882cdf0e10cSrcweir skipDocumentTypeDeclaration(); 883cdf0e10cSrcweir } 884cdf0e10cSrcweir } 885cdf0e10cSrcweir begin = pos_; 886cdf0e10cSrcweir break; 887cdf0e10cSrcweir case '/': 888cdf0e10cSrcweir *text = pad_.get(); 889cdf0e10cSrcweir ++pos_; 890cdf0e10cSrcweir state_ = STATE_END_TAG; 891cdf0e10cSrcweir return RESULT_TEXT; 892cdf0e10cSrcweir case '?': 893cdf0e10cSrcweir ++pos_; 894cdf0e10cSrcweir skipProcessingInstruction(); 895cdf0e10cSrcweir begin = pos_; 896cdf0e10cSrcweir break; 897cdf0e10cSrcweir default: 898cdf0e10cSrcweir *text = pad_.get(); 899cdf0e10cSrcweir state_ = STATE_START_TAG; 900cdf0e10cSrcweir return RESULT_TEXT; 901cdf0e10cSrcweir } 902cdf0e10cSrcweir break; 903cdf0e10cSrcweir default: 904cdf0e10cSrcweir ++pos_; 905cdf0e10cSrcweir break; 906cdf0e10cSrcweir } 907cdf0e10cSrcweir } 908cdf0e10cSrcweir } 909cdf0e10cSrcweir 910cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) { 911cdf0e10cSrcweir pad_.clear(); 912cdf0e10cSrcweir char const * flowBegin = pos_; 913cdf0e10cSrcweir char const * flowEnd = pos_; 914cdf0e10cSrcweir enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 915cdf0e10cSrcweir // a single true space character can go into the current flow, 916cdf0e10cSrcweir // everything else breaks the flow 917cdf0e10cSrcweir Space space = SPACE_START; 918cdf0e10cSrcweir for (;;) { 919cdf0e10cSrcweir switch (peek()) { 920cdf0e10cSrcweir case '\0': // i.e., EOF 921cdf0e10cSrcweir throw css::uno::RuntimeException( 922cdf0e10cSrcweir (rtl::OUString( 923cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 924cdf0e10cSrcweir fileUrl_), 925cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >()); 926cdf0e10cSrcweir case '\x09': 927cdf0e10cSrcweir case '\x0A': 928cdf0e10cSrcweir case '\x0D': 929cdf0e10cSrcweir switch (space) { 930cdf0e10cSrcweir case SPACE_START: 931cdf0e10cSrcweir case SPACE_BREAK: 932cdf0e10cSrcweir break; 933cdf0e10cSrcweir case SPACE_NONE: 934cdf0e10cSrcweir case SPACE_SPAN: 935cdf0e10cSrcweir space = SPACE_BREAK; 936cdf0e10cSrcweir break; 937cdf0e10cSrcweir } 938cdf0e10cSrcweir ++pos_; 939cdf0e10cSrcweir break; 940cdf0e10cSrcweir case ' ': 941cdf0e10cSrcweir switch (space) { 942cdf0e10cSrcweir case SPACE_START: 943cdf0e10cSrcweir case SPACE_BREAK: 944cdf0e10cSrcweir break; 945cdf0e10cSrcweir case SPACE_NONE: 946cdf0e10cSrcweir space = SPACE_SPAN; 947cdf0e10cSrcweir break; 948cdf0e10cSrcweir case SPACE_SPAN: 949cdf0e10cSrcweir space = SPACE_BREAK; 950cdf0e10cSrcweir break; 951cdf0e10cSrcweir } 952cdf0e10cSrcweir ++pos_; 953cdf0e10cSrcweir break; 954cdf0e10cSrcweir case '&': 955cdf0e10cSrcweir switch (space) { 956cdf0e10cSrcweir case SPACE_START: 957cdf0e10cSrcweir break; 958cdf0e10cSrcweir case SPACE_NONE: 959cdf0e10cSrcweir case SPACE_SPAN: 960cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin); 961cdf0e10cSrcweir break; 962cdf0e10cSrcweir case SPACE_BREAK: 963cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 964cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 965cdf0e10cSrcweir break; 966cdf0e10cSrcweir } 967cdf0e10cSrcweir pos_ = handleReference(pos_, end_); 968cdf0e10cSrcweir flowBegin = pos_; 969cdf0e10cSrcweir flowEnd = pos_; 970cdf0e10cSrcweir space = SPACE_NONE; 971cdf0e10cSrcweir break; 972cdf0e10cSrcweir case '<': 973cdf0e10cSrcweir ++pos_; 974cdf0e10cSrcweir switch (peek()) { 975cdf0e10cSrcweir case '!': 976cdf0e10cSrcweir ++pos_; 977cdf0e10cSrcweir if (skipComment()) { 978cdf0e10cSrcweir space = SPACE_BREAK; 979cdf0e10cSrcweir } else { 980cdf0e10cSrcweir Span cdata(scanCdataSection()); 981cdf0e10cSrcweir if (cdata.is()) { 982cdf0e10cSrcweir // CDATA is not normalized (similar to character 983cdf0e10cSrcweir // references; it keeps the code simple), but it might 984cdf0e10cSrcweir // arguably be better to normalize it: 985cdf0e10cSrcweir switch (space) { 986cdf0e10cSrcweir case SPACE_START: 987cdf0e10cSrcweir break; 988cdf0e10cSrcweir case SPACE_NONE: 989cdf0e10cSrcweir case SPACE_SPAN: 990cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin); 991cdf0e10cSrcweir break; 992cdf0e10cSrcweir case SPACE_BREAK: 993cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 994cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 995cdf0e10cSrcweir break; 996cdf0e10cSrcweir } 997cdf0e10cSrcweir normalizeLineEnds(cdata); 998cdf0e10cSrcweir flowBegin = pos_; 999cdf0e10cSrcweir flowEnd = pos_; 1000cdf0e10cSrcweir space = SPACE_NONE; 1001cdf0e10cSrcweir } else { 1002cdf0e10cSrcweir skipDocumentTypeDeclaration(); 1003cdf0e10cSrcweir } 1004cdf0e10cSrcweir } 1005cdf0e10cSrcweir break; 1006cdf0e10cSrcweir case '/': 1007cdf0e10cSrcweir ++pos_; 1008cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1009cdf0e10cSrcweir *text = pad_.get(); 1010cdf0e10cSrcweir state_ = STATE_END_TAG; 1011cdf0e10cSrcweir return RESULT_TEXT; 1012cdf0e10cSrcweir case '?': 1013cdf0e10cSrcweir ++pos_; 1014cdf0e10cSrcweir skipProcessingInstruction(); 1015cdf0e10cSrcweir space = SPACE_BREAK; 1016cdf0e10cSrcweir break; 1017cdf0e10cSrcweir default: 1018cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1019cdf0e10cSrcweir *text = pad_.get(); 1020cdf0e10cSrcweir state_ = STATE_START_TAG; 1021cdf0e10cSrcweir return RESULT_TEXT; 1022cdf0e10cSrcweir } 1023cdf0e10cSrcweir break; 1024cdf0e10cSrcweir default: 1025cdf0e10cSrcweir switch (space) { 1026cdf0e10cSrcweir case SPACE_START: 1027cdf0e10cSrcweir flowBegin = pos_; 1028cdf0e10cSrcweir break; 1029cdf0e10cSrcweir case SPACE_NONE: 1030cdf0e10cSrcweir case SPACE_SPAN: 1031cdf0e10cSrcweir break; 1032cdf0e10cSrcweir case SPACE_BREAK: 1033cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin); 1034cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 1035cdf0e10cSrcweir flowBegin = pos_; 1036cdf0e10cSrcweir break; 1037cdf0e10cSrcweir } 1038cdf0e10cSrcweir flowEnd = ++pos_; 1039cdf0e10cSrcweir space = SPACE_NONE; 1040cdf0e10cSrcweir break; 1041cdf0e10cSrcweir } 1042cdf0e10cSrcweir } 1043cdf0e10cSrcweir } 1044cdf0e10cSrcweir 1045cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { 1046cdf0e10cSrcweir OSL_ASSERT(pos <= INT_MAX); 1047cdf0e10cSrcweir return static_cast< int >(pos); 1048cdf0e10cSrcweir } 1049cdf0e10cSrcweir 1050cdf0e10cSrcweir } 1051