1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "sal/config.h" 25 26 #include <climits> 27 #include <cstddef> 28 29 #include "com/sun/star/container/NoSuchElementException.hpp" 30 #include "com/sun/star/uno/Reference.hxx" 31 #include "com/sun/star/uno/RuntimeException.hpp" 32 #include "com/sun/star/uno/XInterface.hpp" 33 #include "osl/diagnose.h" 34 #include "osl/file.h" 35 #include "rtl/string.h" 36 #include "rtl/ustring.h" 37 #include "rtl/ustring.hxx" 38 #include "sal/types.h" 39 #include "xmlreader/pad.hxx" 40 #include "xmlreader/span.hxx" 41 #include "xmlreader/xmlreader.hxx" 42 43 namespace xmlreader { 44 45 namespace { 46 47 namespace css = com::sun::star; 48 49 bool isSpace(char c) { 50 switch (c) { 51 case '\x09': 52 case '\x0A': 53 case '\x0D': 54 case ' ': 55 return true; 56 default: 57 return false; 58 } 59 } 60 61 } 62 63 XmlReader::XmlReader(rtl::OUString const & fileUrl) 64 SAL_THROW(( 65 css::container::NoSuchElementException, css::uno::RuntimeException)): 66 fileUrl_(fileUrl) 67 { 68 switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) 69 { 70 case osl_File_E_None: 71 break; 72 case osl_File_E_NOENT: 73 throw css::container::NoSuchElementException( 74 fileUrl_, css::uno::Reference< css::uno::XInterface >()); 75 default: 76 throw css::uno::RuntimeException( 77 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + 78 fileUrl_), 79 css::uno::Reference< css::uno::XInterface >()); 80 } 81 oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); 82 if (e == osl_File_E_None) { 83 e = osl_mapFile( 84 fileHandle_, &fileAddress_, fileSize_, 0, 85 osl_File_MapFlag_WillNeed); 86 } 87 if (e != osl_File_E_None) { 88 e = osl_closeFile(fileHandle_); 89 if (e != osl_File_E_None) { 90 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 91 } 92 throw css::uno::RuntimeException( 93 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + 94 fileUrl_), 95 css::uno::Reference< css::uno::XInterface >()); 96 } 97 namespaceIris_.push_back( 98 Span( 99 RTL_CONSTASCII_STRINGPARAM( 100 "http://www.w3.org/XML/1998/namespace"))); 101 namespaces_.push_back( 102 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); 103 pos_ = static_cast< char * >(fileAddress_); 104 end_ = pos_ + fileSize_; 105 state_ = STATE_CONTENT; 106 } 107 108 XmlReader::~XmlReader() { 109 oslFileError e = osl_unmapFile(fileAddress_, fileSize_); 110 if (e != osl_File_E_None) { 111 OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); 112 } 113 e = osl_closeFile(fileHandle_); 114 if (e != osl_File_E_None) { 115 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 116 } 117 } 118 119 int XmlReader::registerNamespaceIri(Span const & iri) { 120 int id = toNamespaceId(namespaceIris_.size()); 121 namespaceIris_.push_back(iri); 122 if (iri.equals( 123 Span( 124 RTL_CONSTASCII_STRINGPARAM( 125 "http://www.w3.org/2001/XMLSchema-instance")))) 126 { 127 // Old user layer .xcu files used the xsi namespace prefix without 128 // declaring a corresponding namespace binding, see issue 77174; reading 129 // those files during migration would fail without this hack that can be 130 // removed once migration is no longer relevant (see 131 // configmgr::Components::parseModificationLayer): 132 namespaces_.push_back( 133 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); 134 } 135 return id; 136 } 137 138 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) 139 { 140 switch (state_) { 141 case STATE_CONTENT: 142 switch (reportText) { 143 case TEXT_NONE: 144 return handleSkippedText(data, nsId); 145 case TEXT_RAW: 146 return handleRawText(data); 147 case TEXT_NORMALIZED: 148 return handleNormalizedText(data); 149 } 150 case STATE_START_TAG: 151 return handleStartTag(nsId, data); 152 case STATE_END_TAG: 153 return handleEndTag(); 154 case STATE_EMPTY_ELEMENT_TAG: 155 handleElementEnd(); 156 return RESULT_END; 157 default: // STATE_DONE 158 return RESULT_DONE; 159 } 160 } 161 162 bool XmlReader::nextAttribute(int * nsId, Span * localName) { 163 OSL_ASSERT(nsId != 0 && localName != 0); 164 if (firstAttribute_) { 165 currentAttribute_ = attributes_.begin(); 166 firstAttribute_ = false; 167 } else { 168 ++currentAttribute_; 169 } 170 if (currentAttribute_ == attributes_.end()) { 171 return false; 172 } 173 if (currentAttribute_->nameColon == 0) { 174 *nsId = NAMESPACE_NONE; 175 *localName = Span( 176 currentAttribute_->nameBegin, 177 currentAttribute_->nameEnd - currentAttribute_->nameBegin); 178 } else { 179 *nsId = getNamespaceId( 180 Span( 181 currentAttribute_->nameBegin, 182 currentAttribute_->nameColon - currentAttribute_->nameBegin)); 183 *localName = Span( 184 currentAttribute_->nameColon + 1, 185 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); 186 } 187 return true; 188 } 189 190 Span XmlReader::getAttributeValue(bool fullyNormalize) { 191 return handleAttributeValue( 192 currentAttribute_->valueBegin, currentAttribute_->valueEnd, 193 fullyNormalize); 194 } 195 196 int XmlReader::getNamespaceId(Span const & prefix) const { 197 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); 198 i != namespaces_.rend(); ++i) 199 { 200 if (prefix.equals(i->prefix)) { 201 return i->nsId; 202 } 203 } 204 return NAMESPACE_UNKNOWN; 205 } 206 207 rtl::OUString XmlReader::getUrl() const { 208 return fileUrl_; 209 } 210 211 void XmlReader::normalizeLineEnds(Span const & text) { 212 char const * p = text.begin; 213 sal_Int32 n = text.length; 214 for (;;) { 215 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); 216 if (i < 0) { 217 break; 218 } 219 pad_.add(p, i); 220 p += i + 1; 221 n -= i + 1; 222 if (n == 0 || *p != '\x0A') { 223 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 224 } 225 } 226 pad_.add(p, n); 227 } 228 229 void XmlReader::skipSpace() { 230 while (isSpace(peek())) { 231 ++pos_; 232 } 233 } 234 235 bool XmlReader::skipComment() { 236 if (rtl_str_shortenedCompare_WithLength( 237 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), 238 RTL_CONSTASCII_LENGTH("--")) != 239 0) 240 { 241 return false; 242 } 243 pos_ += RTL_CONSTASCII_LENGTH("--"); 244 sal_Int32 i = rtl_str_indexOfStr_WithLength( 245 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); 246 if (i < 0) { 247 throw css::uno::RuntimeException( 248 (rtl::OUString( 249 RTL_CONSTASCII_USTRINGPARAM( 250 "premature end (within comment) of ")) + 251 fileUrl_), 252 css::uno::Reference< css::uno::XInterface >()); 253 } 254 pos_ += i + RTL_CONSTASCII_LENGTH("--"); 255 if (read() != '>') { 256 throw css::uno::RuntimeException( 257 (rtl::OUString( 258 RTL_CONSTASCII_USTRINGPARAM( 259 "illegal \"--\" within comment in ")) + 260 fileUrl_), 261 css::uno::Reference< css::uno::XInterface >()); 262 } 263 return true; 264 } 265 266 void XmlReader::skipProcessingInstruction() { 267 sal_Int32 i = rtl_str_indexOfStr_WithLength( 268 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); 269 if (i < 0) { 270 throw css::uno::RuntimeException( 271 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + 272 fileUrl_), 273 css::uno::Reference< css::uno::XInterface >()); 274 } 275 pos_ += i + RTL_CONSTASCII_LENGTH("?>"); 276 } 277 278 void XmlReader::skipDocumentTypeDeclaration() { 279 // Neither is it checked that the doctypedecl is at the correct position in 280 // the document, nor that it is well-formed: 281 for (;;) { 282 char c = read(); 283 switch (c) { 284 case '\0': // i.e., EOF 285 throw css::uno::RuntimeException( 286 (rtl::OUString( 287 RTL_CONSTASCII_USTRINGPARAM( 288 "premature end (within DTD) of ")) + 289 fileUrl_), 290 css::uno::Reference< css::uno::XInterface >()); 291 case '"': 292 case '\'': 293 { 294 sal_Int32 i = rtl_str_indexOfChar_WithLength( 295 pos_, end_ - pos_, c); 296 if (i < 0) { 297 throw css::uno::RuntimeException( 298 (rtl::OUString( 299 RTL_CONSTASCII_USTRINGPARAM( 300 "premature end (within DTD) of ")) + 301 fileUrl_), 302 css::uno::Reference< css::uno::XInterface >()); 303 } 304 pos_ += i + 1; 305 } 306 break; 307 case '>': 308 return; 309 case '[': 310 for (;;) { 311 c = read(); 312 switch (c) { 313 case '\0': // i.e., EOF 314 throw css::uno::RuntimeException( 315 (rtl::OUString( 316 RTL_CONSTASCII_USTRINGPARAM( 317 "premature end (within DTD) of ")) + 318 fileUrl_), 319 css::uno::Reference< css::uno::XInterface >()); 320 case '"': 321 case '\'': 322 { 323 sal_Int32 i = rtl_str_indexOfChar_WithLength( 324 pos_, end_ - pos_, c); 325 if (i < 0) { 326 throw css::uno::RuntimeException( 327 (rtl::OUString( 328 RTL_CONSTASCII_USTRINGPARAM( 329 "premature end (within DTD) of ")) + 330 fileUrl_), 331 css::uno::Reference< css::uno::XInterface >()); 332 } 333 pos_ += i + 1; 334 } 335 break; 336 case '<': 337 switch (read()) { 338 case '\0': // i.e., EOF 339 throw css::uno::RuntimeException( 340 (rtl::OUString( 341 RTL_CONSTASCII_USTRINGPARAM( 342 "premature end (within DTD) of ")) + 343 fileUrl_), 344 css::uno::Reference< css::uno::XInterface >()); 345 case '!': 346 skipComment(); 347 break; 348 case '?': 349 skipProcessingInstruction(); 350 break; 351 default: 352 break; 353 } 354 break; 355 case ']': 356 skipSpace(); 357 if (read() != '>') { 358 throw css::uno::RuntimeException( 359 (rtl::OUString( 360 RTL_CONSTASCII_USTRINGPARAM( 361 "missing \">\" of DTD in ")) + 362 fileUrl_), 363 css::uno::Reference< css::uno::XInterface >()); 364 } 365 return; 366 default: 367 break; 368 } 369 } 370 default: 371 break; 372 } 373 } 374 } 375 376 Span XmlReader::scanCdataSection() { 377 if (rtl_str_shortenedCompare_WithLength( 378 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), 379 RTL_CONSTASCII_LENGTH("[CDATA[")) != 380 0) 381 { 382 return Span(); 383 } 384 pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); 385 char const * begin = pos_; 386 sal_Int32 i = rtl_str_indexOfStr_WithLength( 387 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); 388 if (i < 0) { 389 throw css::uno::RuntimeException( 390 (rtl::OUString( 391 RTL_CONSTASCII_USTRINGPARAM( 392 "premature end (within CDATA section) of ")) + 393 fileUrl_), 394 css::uno::Reference< css::uno::XInterface >()); 395 } 396 pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); 397 return Span(begin, i); 398 } 399 400 bool XmlReader::scanName(char const ** nameColon) { 401 OSL_ASSERT(nameColon != 0 && *nameColon == 0); 402 for (char const * begin = pos_;; ++pos_) { 403 switch (peek()) { 404 case '\0': // i.e., EOF 405 case '\x09': 406 case '\x0A': 407 case '\x0D': 408 case ' ': 409 case '/': 410 case '=': 411 case '>': 412 return pos_ != begin; 413 case ':': 414 *nameColon = pos_; 415 break; 416 default: 417 break; 418 } 419 } 420 } 421 422 int XmlReader::scanNamespaceIri(char const * begin, char const * end) { 423 OSL_ASSERT(begin != 0 && begin <= end); 424 Span iri(handleAttributeValue(begin, end, false)); 425 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { 426 if (namespaceIris_[i].equals(iri)) { 427 return toNamespaceId(i); 428 } 429 } 430 return XmlReader::NAMESPACE_UNKNOWN; 431 } 432 433 char const * XmlReader::handleReference(char const * position, char const * end) 434 { 435 OSL_ASSERT(position != 0 && *position == '&' && position < end); 436 ++position; 437 if (*position == '#') { 438 ++position; 439 sal_Int32 val = 0; 440 char const * p; 441 if (*position == 'x') { 442 ++position; 443 p = position; 444 for (;; ++position) { 445 char c = *position; 446 if (c >= '0' && c <= '9') { 447 val = 16 * val + (c - '0'); 448 } else if (c >= 'A' && c <= 'F') { 449 val = 16 * val + (c - 'A') + 10; 450 } else if (c >= 'a' && c <= 'f') { 451 val = 16 * val + (c - 'a') + 10; 452 } else { 453 break; 454 } 455 if (val > 0x10FFFF) { // avoid overflow 456 throw css::uno::RuntimeException( 457 (rtl::OUString( 458 RTL_CONSTASCII_USTRINGPARAM( 459 "'&#x...' too large in ")) + 460 fileUrl_), 461 css::uno::Reference< css::uno::XInterface >()); 462 } 463 } 464 } else { 465 p = position; 466 for (;; ++position) { 467 char c = *position; 468 if (c >= '0' && c <= '9') { 469 val = 10 * val + (c - '0'); 470 } else { 471 break; 472 } 473 if (val > 0x10FFFF) { // avoid overflow 474 throw css::uno::RuntimeException( 475 (rtl::OUString( 476 RTL_CONSTASCII_USTRINGPARAM( 477 "'&#...' too large in ")) + 478 fileUrl_), 479 css::uno::Reference< css::uno::XInterface >()); 480 } 481 } 482 } 483 if (position == p || *position++ != ';') { 484 throw css::uno::RuntimeException( 485 (rtl::OUString( 486 RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + 487 fileUrl_), 488 css::uno::Reference< css::uno::XInterface >()); 489 } 490 OSL_ASSERT(val >= 0 && val <= 0x10FFFF); 491 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || 492 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) 493 { 494 throw css::uno::RuntimeException( 495 (rtl::OUString( 496 RTL_CONSTASCII_USTRINGPARAM( 497 "character reference denoting invalid character in ")) + 498 fileUrl_), 499 css::uno::Reference< css::uno::XInterface >()); 500 } 501 char buf[4]; 502 sal_Int32 len; 503 if (val < 0x80) { 504 buf[0] = static_cast< char >(val); 505 len = 1; 506 } else if (val < 0x800) { 507 buf[0] = static_cast< char >((val >> 6) | 0xC0); 508 buf[1] = static_cast< char >((val & 0x3F) | 0x80); 509 len = 2; 510 } else if (val < 0x10000) { 511 buf[0] = static_cast< char >((val >> 12) | 0xE0); 512 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 513 buf[2] = static_cast< char >((val & 0x3F) | 0x80); 514 len = 3; 515 } else { 516 buf[0] = static_cast< char >((val >> 18) | 0xF0); 517 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); 518 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 519 buf[3] = static_cast< char >((val & 0x3F) | 0x80); 520 len = 4; 521 } 522 pad_.addEphemeral(buf, len); 523 return position; 524 } else { 525 struct EntityRef { 526 char const * inBegin; 527 sal_Int32 inLength; 528 char const * outBegin; 529 sal_Int32 outLength; 530 }; 531 static EntityRef const refs[] = { 532 { RTL_CONSTASCII_STRINGPARAM("amp;"), 533 RTL_CONSTASCII_STRINGPARAM("&") }, 534 { RTL_CONSTASCII_STRINGPARAM("lt;"), 535 RTL_CONSTASCII_STRINGPARAM("<") }, 536 { RTL_CONSTASCII_STRINGPARAM("gt;"), 537 RTL_CONSTASCII_STRINGPARAM(">") }, 538 { RTL_CONSTASCII_STRINGPARAM("apos;"), 539 RTL_CONSTASCII_STRINGPARAM("'") }, 540 { RTL_CONSTASCII_STRINGPARAM("quot;"), 541 RTL_CONSTASCII_STRINGPARAM("\"") } }; 542 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { 543 if (rtl_str_shortenedCompare_WithLength( 544 position, end - position, refs[i].inBegin, refs[i].inLength, 545 refs[i].inLength) == 546 0) 547 { 548 position += refs[i].inLength; 549 pad_.add(refs[i].outBegin, refs[i].outLength); 550 return position; 551 } 552 } 553 throw css::uno::RuntimeException( 554 (rtl::OUString( 555 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + 556 fileUrl_), 557 css::uno::Reference< css::uno::XInterface >()); 558 } 559 } 560 561 Span XmlReader::handleAttributeValue( 562 char const * begin, char const * end, bool fullyNormalize) 563 { 564 pad_.clear(); 565 if (fullyNormalize) { 566 while (begin != end && isSpace(*begin)) { 567 ++begin; 568 } 569 while (end != begin && isSpace(end[-1])) { 570 --end; 571 } 572 char const * p = begin; 573 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 574 // a single true space character can go into the current span, 575 // everything else breaks the span 576 Space space = SPACE_NONE; 577 while (p != end) { 578 switch (*p) { 579 case '\x09': 580 case '\x0A': 581 case '\x0D': 582 switch (space) { 583 case SPACE_NONE: 584 pad_.add(begin, p - begin); 585 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 586 space = SPACE_BREAK; 587 break; 588 case SPACE_SPAN: 589 pad_.add(begin, p - begin); 590 space = SPACE_BREAK; 591 break; 592 case SPACE_BREAK: 593 break; 594 } 595 begin = ++p; 596 break; 597 case ' ': 598 switch (space) { 599 case SPACE_NONE: 600 ++p; 601 space = SPACE_SPAN; 602 break; 603 case SPACE_SPAN: 604 pad_.add(begin, p - begin); 605 begin = ++p; 606 space = SPACE_BREAK; 607 break; 608 case SPACE_BREAK: 609 begin = ++p; 610 break; 611 } 612 break; 613 case '&': 614 pad_.add(begin, p - begin); 615 p = handleReference(p, end); 616 begin = p; 617 space = SPACE_NONE; 618 break; 619 default: 620 ++p; 621 space = SPACE_NONE; 622 break; 623 } 624 } 625 pad_.add(begin, p - begin); 626 } else { 627 char const * p = begin; 628 while (p != end) { 629 switch (*p) { 630 case '\x09': 631 case '\x0A': 632 pad_.add(begin, p - begin); 633 begin = ++p; 634 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 635 break; 636 case '\x0D': 637 pad_.add(begin, p - begin); 638 ++p; 639 if (peek() == '\x0A') { 640 ++p; 641 } 642 begin = p; 643 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 644 break; 645 case '&': 646 pad_.add(begin, p - begin); 647 p = handleReference(p, end); 648 begin = p; 649 break; 650 default: 651 ++p; 652 break; 653 } 654 } 655 pad_.add(begin, p - begin); 656 } 657 return pad_.get(); 658 } 659 660 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { 661 OSL_ASSERT(nsId != 0 && localName); 662 char const * nameBegin = pos_; 663 char const * nameColon = 0; 664 if (!scanName(&nameColon)) { 665 throw css::uno::RuntimeException( 666 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + 667 fileUrl_), 668 css::uno::Reference< css::uno::XInterface >()); 669 } 670 char const * nameEnd = pos_; 671 NamespaceList::size_type inheritedNamespaces = namespaces_.size(); 672 bool hasDefaultNs = false; 673 int defaultNsId = NAMESPACE_NONE; 674 attributes_.clear(); 675 for (;;) { 676 char const * p = pos_; 677 skipSpace(); 678 if (peek() == '/' || peek() == '>') { 679 break; 680 } 681 if (pos_ == p) { 682 throw css::uno::RuntimeException( 683 (rtl::OUString( 684 RTL_CONSTASCII_USTRINGPARAM( 685 "missing whitespace before attribute in ")) + 686 fileUrl_), 687 css::uno::Reference< css::uno::XInterface >()); 688 } 689 char const * attrNameBegin = pos_; 690 char const * attrNameColon = 0; 691 if (!scanName(&attrNameColon)) { 692 throw css::uno::RuntimeException( 693 (rtl::OUString( 694 RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + 695 fileUrl_), 696 css::uno::Reference< css::uno::XInterface >()); 697 } 698 char const * attrNameEnd = pos_; 699 skipSpace(); 700 if (read() != '=') { 701 throw css::uno::RuntimeException( 702 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + 703 fileUrl_), 704 css::uno::Reference< css::uno::XInterface >()); 705 } 706 skipSpace(); 707 char del = read(); 708 if (del != '\'' && del != '"') { 709 throw css::uno::RuntimeException( 710 (rtl::OUString( 711 RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + 712 fileUrl_), 713 css::uno::Reference< css::uno::XInterface >()); 714 } 715 char const * valueBegin = pos_; 716 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); 717 if (i < 0) { 718 throw css::uno::RuntimeException( 719 (rtl::OUString( 720 RTL_CONSTASCII_USTRINGPARAM( 721 "unterminated attribute value in ")) + 722 fileUrl_), 723 css::uno::Reference< css::uno::XInterface >()); 724 } 725 char const * valueEnd = pos_ + i; 726 pos_ += i + 1; 727 if (attrNameColon == 0 && 728 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( 729 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 730 { 731 hasDefaultNs = true; 732 defaultNsId = scanNamespaceIri(valueBegin, valueEnd); 733 } else if (attrNameColon != 0 && 734 Span(attrNameBegin, attrNameColon - attrNameBegin).equals( 735 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 736 { 737 namespaces_.push_back( 738 NamespaceData( 739 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), 740 scanNamespaceIri(valueBegin, valueEnd))); 741 } else { 742 attributes_.push_back( 743 AttributeData( 744 attrNameBegin, attrNameEnd, attrNameColon, valueBegin, 745 valueEnd)); 746 } 747 } 748 if (!hasDefaultNs && !elements_.empty()) { 749 defaultNsId = elements_.top().defaultNamespaceId; 750 } 751 firstAttribute_ = true; 752 if (peek() == '/') { 753 state_ = STATE_EMPTY_ELEMENT_TAG; 754 ++pos_; 755 } else { 756 state_ = STATE_CONTENT; 757 } 758 if (peek() != '>') { 759 throw css::uno::RuntimeException( 760 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 761 fileUrl_), 762 css::uno::Reference< css::uno::XInterface >()); 763 } 764 ++pos_; 765 elements_.push( 766 ElementData( 767 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, 768 defaultNsId)); 769 if (nameColon == 0) { 770 *nsId = defaultNsId; 771 *localName = Span(nameBegin, nameEnd - nameBegin); 772 } else { 773 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); 774 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); 775 } 776 return RESULT_BEGIN; 777 } 778 779 XmlReader::Result XmlReader::handleEndTag() { 780 if (elements_.empty()) { 781 throw css::uno::RuntimeException( 782 (rtl::OUString( 783 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + 784 fileUrl_), 785 css::uno::Reference< css::uno::XInterface >()); 786 } 787 char const * nameBegin = pos_; 788 char const * nameColon = 0; 789 if (!scanName(&nameColon) || 790 !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) 791 { 792 throw css::uno::RuntimeException( 793 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + 794 fileUrl_), 795 css::uno::Reference< css::uno::XInterface >()); 796 } 797 handleElementEnd(); 798 skipSpace(); 799 if (peek() != '>') { 800 throw css::uno::RuntimeException( 801 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 802 fileUrl_), 803 css::uno::Reference< css::uno::XInterface >()); 804 } 805 ++pos_; 806 return RESULT_END; 807 } 808 809 void XmlReader::handleElementEnd() { 810 OSL_ASSERT(!elements_.empty()); 811 namespaces_.resize(elements_.top().inheritedNamespaces); 812 elements_.pop(); 813 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; 814 } 815 816 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { 817 for (;;) { 818 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); 819 if (i < 0) { 820 throw css::uno::RuntimeException( 821 (rtl::OUString( 822 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 823 fileUrl_), 824 css::uno::Reference< css::uno::XInterface >()); 825 } 826 pos_ += i + 1; 827 switch (peek()) { 828 case '!': 829 ++pos_; 830 if (!skipComment() && !scanCdataSection().is()) { 831 skipDocumentTypeDeclaration(); 832 } 833 break; 834 case '/': 835 ++pos_; 836 return handleEndTag(); 837 case '?': 838 ++pos_; 839 skipProcessingInstruction(); 840 break; 841 default: 842 return handleStartTag(nsId, data); 843 } 844 } 845 } 846 847 XmlReader::Result XmlReader::handleRawText(Span * text) { 848 pad_.clear(); 849 for (char const * begin = pos_;;) { 850 switch (peek()) { 851 case '\0': // i.e., EOF 852 throw css::uno::RuntimeException( 853 (rtl::OUString( 854 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 855 fileUrl_), 856 css::uno::Reference< css::uno::XInterface >()); 857 case '\x0D': 858 pad_.add(begin, pos_ - begin); 859 ++pos_; 860 if (peek() != '\x0A') { 861 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 862 } 863 begin = pos_; 864 break; 865 case '&': 866 pad_.add(begin, pos_ - begin); 867 pos_ = handleReference(pos_, end_); 868 begin = pos_; 869 break; 870 case '<': 871 pad_.add(begin, pos_ - begin); 872 ++pos_; 873 switch (peek()) { 874 case '!': 875 ++pos_; 876 if (!skipComment()) { 877 Span cdata(scanCdataSection()); 878 if (cdata.is()) { 879 normalizeLineEnds(cdata); 880 } else { 881 skipDocumentTypeDeclaration(); 882 } 883 } 884 begin = pos_; 885 break; 886 case '/': 887 *text = pad_.get(); 888 ++pos_; 889 state_ = STATE_END_TAG; 890 return RESULT_TEXT; 891 case '?': 892 ++pos_; 893 skipProcessingInstruction(); 894 begin = pos_; 895 break; 896 default: 897 *text = pad_.get(); 898 state_ = STATE_START_TAG; 899 return RESULT_TEXT; 900 } 901 break; 902 default: 903 ++pos_; 904 break; 905 } 906 } 907 } 908 909 XmlReader::Result XmlReader::handleNormalizedText(Span * text) { 910 pad_.clear(); 911 char const * flowBegin = pos_; 912 char const * flowEnd = pos_; 913 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 914 // a single true space character can go into the current flow, 915 // everything else breaks the flow 916 Space space = SPACE_START; 917 for (;;) { 918 switch (peek()) { 919 case '\0': // i.e., EOF 920 throw css::uno::RuntimeException( 921 (rtl::OUString( 922 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 923 fileUrl_), 924 css::uno::Reference< css::uno::XInterface >()); 925 case '\x09': 926 case '\x0A': 927 case '\x0D': 928 switch (space) { 929 case SPACE_START: 930 case SPACE_BREAK: 931 break; 932 case SPACE_NONE: 933 case SPACE_SPAN: 934 space = SPACE_BREAK; 935 break; 936 } 937 ++pos_; 938 break; 939 case ' ': 940 switch (space) { 941 case SPACE_START: 942 case SPACE_BREAK: 943 break; 944 case SPACE_NONE: 945 space = SPACE_SPAN; 946 break; 947 case SPACE_SPAN: 948 space = SPACE_BREAK; 949 break; 950 } 951 ++pos_; 952 break; 953 case '&': 954 switch (space) { 955 case SPACE_START: 956 break; 957 case SPACE_NONE: 958 case SPACE_SPAN: 959 pad_.add(flowBegin, pos_ - flowBegin); 960 break; 961 case SPACE_BREAK: 962 pad_.add(flowBegin, flowEnd - flowBegin); 963 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 964 break; 965 } 966 pos_ = handleReference(pos_, end_); 967 flowBegin = pos_; 968 flowEnd = pos_; 969 space = SPACE_NONE; 970 break; 971 case '<': 972 ++pos_; 973 switch (peek()) { 974 case '!': 975 ++pos_; 976 if (skipComment()) { 977 space = SPACE_BREAK; 978 } else { 979 Span cdata(scanCdataSection()); 980 if (cdata.is()) { 981 // CDATA is not normalized (similar to character 982 // references; it keeps the code simple), but it might 983 // arguably be better to normalize it: 984 switch (space) { 985 case SPACE_START: 986 break; 987 case SPACE_NONE: 988 case SPACE_SPAN: 989 pad_.add(flowBegin, pos_ - flowBegin); 990 break; 991 case SPACE_BREAK: 992 pad_.add(flowBegin, flowEnd - flowBegin); 993 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 994 break; 995 } 996 normalizeLineEnds(cdata); 997 flowBegin = pos_; 998 flowEnd = pos_; 999 space = SPACE_NONE; 1000 } else { 1001 skipDocumentTypeDeclaration(); 1002 } 1003 } 1004 break; 1005 case '/': 1006 ++pos_; 1007 pad_.add(flowBegin, flowEnd - flowBegin); 1008 *text = pad_.get(); 1009 state_ = STATE_END_TAG; 1010 return RESULT_TEXT; 1011 case '?': 1012 ++pos_; 1013 skipProcessingInstruction(); 1014 space = SPACE_BREAK; 1015 break; 1016 default: 1017 pad_.add(flowBegin, flowEnd - flowBegin); 1018 *text = pad_.get(); 1019 state_ = STATE_START_TAG; 1020 return RESULT_TEXT; 1021 } 1022 break; 1023 default: 1024 switch (space) { 1025 case SPACE_START: 1026 flowBegin = pos_; 1027 break; 1028 case SPACE_NONE: 1029 case SPACE_SPAN: 1030 break; 1031 case SPACE_BREAK: 1032 pad_.add(flowBegin, flowEnd - flowBegin); 1033 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 1034 flowBegin = pos_; 1035 break; 1036 } 1037 flowEnd = ++pos_; 1038 space = SPACE_NONE; 1039 break; 1040 } 1041 } 1042 } 1043 1044 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { 1045 OSL_ASSERT(pos <= INT_MAX); 1046 return static_cast< int >(pos); 1047 } 1048 1049 } 1050