1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include "precompiled_xmlreader.hxx"
25 #include "sal/config.h"
26
27 #include <climits>
28 #include <cstddef>
29
30 #include "com/sun/star/container/NoSuchElementException.hpp"
31 #include "com/sun/star/uno/Reference.hxx"
32 #include "com/sun/star/uno/RuntimeException.hpp"
33 #include "com/sun/star/uno/XInterface.hpp"
34 #include "osl/diagnose.h"
35 #include "osl/file.h"
36 #include "rtl/string.h"
37 #include "rtl/ustring.h"
38 #include "rtl/ustring.hxx"
39 #include "sal/types.h"
40 #include "xmlreader/pad.hxx"
41 #include "xmlreader/span.hxx"
42 #include "xmlreader/xmlreader.hxx"
43
44 namespace xmlreader {
45
46 namespace {
47
48 namespace css = com::sun::star;
49
// Tells whether c is one of the four white space characters recognized by this
// reader: tab, line feed, carriage return, or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
61
62 }
63
XmlReader(rtl::OUString const & fileUrl)64 XmlReader::XmlReader(rtl::OUString const & fileUrl)
65 SAL_THROW((
66 css::container::NoSuchElementException, css::uno::RuntimeException)):
67 fileUrl_(fileUrl)
68 {
69 switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
70 {
71 case osl_File_E_None:
72 break;
73 case osl_File_E_NOENT:
74 throw css::container::NoSuchElementException(
75 fileUrl_, css::uno::Reference< css::uno::XInterface >());
76 default:
77 throw css::uno::RuntimeException(
78 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
79 fileUrl_),
80 css::uno::Reference< css::uno::XInterface >());
81 }
82 oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
83 if (e == osl_File_E_None) {
84 e = osl_mapFile(
85 fileHandle_, &fileAddress_, fileSize_, 0,
86 osl_File_MapFlag_WillNeed);
87 }
88 if (e != osl_File_E_None) {
89 e = osl_closeFile(fileHandle_);
90 if (e != osl_File_E_None) {
91 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
92 }
93 throw css::uno::RuntimeException(
94 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
95 fileUrl_),
96 css::uno::Reference< css::uno::XInterface >());
97 }
98 namespaceIris_.push_back(
99 Span(
100 RTL_CONSTASCII_STRINGPARAM(
101 "http://www.w3.org/XML/1998/namespace")));
102 namespaces_.push_back(
103 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
104 pos_ = static_cast< char * >(fileAddress_);
105 end_ = pos_ + fileSize_;
106 state_ = STATE_CONTENT;
107 }
108
~XmlReader()109 XmlReader::~XmlReader() {
110 oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
111 if (e != osl_File_E_None) {
112 OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
113 }
114 e = osl_closeFile(fileHandle_);
115 if (e != osl_File_E_None) {
116 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
117 }
118 }
119
registerNamespaceIri(Span const & iri)120 int XmlReader::registerNamespaceIri(Span const & iri) {
121 int id = toNamespaceId(namespaceIris_.size());
122 namespaceIris_.push_back(iri);
123 if (iri.equals(
124 Span(
125 RTL_CONSTASCII_STRINGPARAM(
126 "http://www.w3.org/2001/XMLSchema-instance"))))
127 {
128 // Old user layer .xcu files used the xsi namespace prefix without
129 // declaring a corresponding namespace binding, see issue 77174; reading
130 // those files during migration would fail without this hack that can be
131 // removed once migration is no longer relevant (see
132 // configmgr::Components::parseModificationLayer):
133 namespaces_.push_back(
134 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
135 }
136 return id;
137 }
138
nextItem(Text reportText,Span * data,int * nsId)139 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
140 {
141 switch (state_) {
142 case STATE_CONTENT:
143 switch (reportText) {
144 case TEXT_NONE:
145 return handleSkippedText(data, nsId);
146 case TEXT_RAW:
147 return handleRawText(data);
148 case TEXT_NORMALIZED:
149 return handleNormalizedText(data);
150 }
151 case STATE_START_TAG:
152 return handleStartTag(nsId, data);
153 case STATE_END_TAG:
154 return handleEndTag();
155 case STATE_EMPTY_ELEMENT_TAG:
156 handleElementEnd();
157 return RESULT_END;
158 default: // STATE_DONE
159 return RESULT_DONE;
160 }
161 }
162
nextAttribute(int * nsId,Span * localName)163 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
164 OSL_ASSERT(nsId != 0 && localName != 0);
165 if (firstAttribute_) {
166 currentAttribute_ = attributes_.begin();
167 firstAttribute_ = false;
168 } else {
169 ++currentAttribute_;
170 }
171 if (currentAttribute_ == attributes_.end()) {
172 return false;
173 }
174 if (currentAttribute_->nameColon == 0) {
175 *nsId = NAMESPACE_NONE;
176 *localName = Span(
177 currentAttribute_->nameBegin,
178 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
179 } else {
180 *nsId = getNamespaceId(
181 Span(
182 currentAttribute_->nameBegin,
183 currentAttribute_->nameColon - currentAttribute_->nameBegin));
184 *localName = Span(
185 currentAttribute_->nameColon + 1,
186 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
187 }
188 return true;
189 }
190
getAttributeValue(bool fullyNormalize)191 Span XmlReader::getAttributeValue(bool fullyNormalize) {
192 return handleAttributeValue(
193 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
194 fullyNormalize);
195 }
196
getNamespaceId(Span const & prefix) const197 int XmlReader::getNamespaceId(Span const & prefix) const {
198 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
199 i != namespaces_.rend(); ++i)
200 {
201 if (prefix.equals(i->prefix)) {
202 return i->nsId;
203 }
204 }
205 return NAMESPACE_UNKNOWN;
206 }
207
getUrl() const208 rtl::OUString XmlReader::getUrl() const {
209 return fileUrl_;
210 }
211
normalizeLineEnds(Span const & text)212 void XmlReader::normalizeLineEnds(Span const & text) {
213 char const * p = text.begin;
214 sal_Int32 n = text.length;
215 for (;;) {
216 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
217 if (i < 0) {
218 break;
219 }
220 pad_.add(p, i);
221 p += i + 1;
222 n -= i + 1;
223 if (n == 0 || *p != '\x0A') {
224 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
225 }
226 }
227 pad_.add(p, n);
228 }
229
skipSpace()230 void XmlReader::skipSpace() {
231 while (isSpace(peek())) {
232 ++pos_;
233 }
234 }
235
skipComment()236 bool XmlReader::skipComment() {
237 if (rtl_str_shortenedCompare_WithLength(
238 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
239 RTL_CONSTASCII_LENGTH("--")) !=
240 0)
241 {
242 return false;
243 }
244 pos_ += RTL_CONSTASCII_LENGTH("--");
245 sal_Int32 i = rtl_str_indexOfStr_WithLength(
246 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
247 if (i < 0) {
248 throw css::uno::RuntimeException(
249 (rtl::OUString(
250 RTL_CONSTASCII_USTRINGPARAM(
251 "premature end (within comment) of ")) +
252 fileUrl_),
253 css::uno::Reference< css::uno::XInterface >());
254 }
255 pos_ += i + RTL_CONSTASCII_LENGTH("--");
256 if (read() != '>') {
257 throw css::uno::RuntimeException(
258 (rtl::OUString(
259 RTL_CONSTASCII_USTRINGPARAM(
260 "illegal \"--\" within comment in ")) +
261 fileUrl_),
262 css::uno::Reference< css::uno::XInterface >());
263 }
264 return true;
265 }
266
skipProcessingInstruction()267 void XmlReader::skipProcessingInstruction() {
268 sal_Int32 i = rtl_str_indexOfStr_WithLength(
269 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
270 if (i < 0) {
271 throw css::uno::RuntimeException(
272 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
273 fileUrl_),
274 css::uno::Reference< css::uno::XInterface >());
275 }
276 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
277 }
278
skipDocumentTypeDeclaration()279 void XmlReader::skipDocumentTypeDeclaration() {
280 // Neither is it checked that the doctypedecl is at the correct position in
281 // the document, nor that it is well-formed:
282 for (;;) {
283 char c = read();
284 switch (c) {
285 case '\0': // i.e., EOF
286 throw css::uno::RuntimeException(
287 (rtl::OUString(
288 RTL_CONSTASCII_USTRINGPARAM(
289 "premature end (within DTD) of ")) +
290 fileUrl_),
291 css::uno::Reference< css::uno::XInterface >());
292 case '"':
293 case '\'':
294 {
295 sal_Int32 i = rtl_str_indexOfChar_WithLength(
296 pos_, end_ - pos_, c);
297 if (i < 0) {
298 throw css::uno::RuntimeException(
299 (rtl::OUString(
300 RTL_CONSTASCII_USTRINGPARAM(
301 "premature end (within DTD) of ")) +
302 fileUrl_),
303 css::uno::Reference< css::uno::XInterface >());
304 }
305 pos_ += i + 1;
306 }
307 break;
308 case '>':
309 return;
310 case '[':
311 for (;;) {
312 c = read();
313 switch (c) {
314 case '\0': // i.e., EOF
315 throw css::uno::RuntimeException(
316 (rtl::OUString(
317 RTL_CONSTASCII_USTRINGPARAM(
318 "premature end (within DTD) of ")) +
319 fileUrl_),
320 css::uno::Reference< css::uno::XInterface >());
321 case '"':
322 case '\'':
323 {
324 sal_Int32 i = rtl_str_indexOfChar_WithLength(
325 pos_, end_ - pos_, c);
326 if (i < 0) {
327 throw css::uno::RuntimeException(
328 (rtl::OUString(
329 RTL_CONSTASCII_USTRINGPARAM(
330 "premature end (within DTD) of ")) +
331 fileUrl_),
332 css::uno::Reference< css::uno::XInterface >());
333 }
334 pos_ += i + 1;
335 }
336 break;
337 case '<':
338 switch (read()) {
339 case '\0': // i.e., EOF
340 throw css::uno::RuntimeException(
341 (rtl::OUString(
342 RTL_CONSTASCII_USTRINGPARAM(
343 "premature end (within DTD) of ")) +
344 fileUrl_),
345 css::uno::Reference< css::uno::XInterface >());
346 case '!':
347 skipComment();
348 break;
349 case '?':
350 skipProcessingInstruction();
351 break;
352 default:
353 break;
354 }
355 break;
356 case ']':
357 skipSpace();
358 if (read() != '>') {
359 throw css::uno::RuntimeException(
360 (rtl::OUString(
361 RTL_CONSTASCII_USTRINGPARAM(
362 "missing \">\" of DTD in ")) +
363 fileUrl_),
364 css::uno::Reference< css::uno::XInterface >());
365 }
366 return;
367 default:
368 break;
369 }
370 }
371 default:
372 break;
373 }
374 }
375 }
376
scanCdataSection()377 Span XmlReader::scanCdataSection() {
378 if (rtl_str_shortenedCompare_WithLength(
379 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
380 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
381 0)
382 {
383 return Span();
384 }
385 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
386 char const * begin = pos_;
387 sal_Int32 i = rtl_str_indexOfStr_WithLength(
388 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
389 if (i < 0) {
390 throw css::uno::RuntimeException(
391 (rtl::OUString(
392 RTL_CONSTASCII_USTRINGPARAM(
393 "premature end (within CDATA section) of ")) +
394 fileUrl_),
395 css::uno::Reference< css::uno::XInterface >());
396 }
397 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
398 return Span(begin, i);
399 }
400
scanName(char const ** nameColon)401 bool XmlReader::scanName(char const ** nameColon) {
402 OSL_ASSERT(nameColon != 0 && *nameColon == 0);
403 for (char const * begin = pos_;; ++pos_) {
404 switch (peek()) {
405 case '\0': // i.e., EOF
406 case '\x09':
407 case '\x0A':
408 case '\x0D':
409 case ' ':
410 case '/':
411 case '=':
412 case '>':
413 return pos_ != begin;
414 case ':':
415 *nameColon = pos_;
416 break;
417 default:
418 break;
419 }
420 }
421 }
422
scanNamespaceIri(char const * begin,char const * end)423 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
424 OSL_ASSERT(begin != 0 && begin <= end);
425 Span iri(handleAttributeValue(begin, end, false));
426 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
427 if (namespaceIris_[i].equals(iri)) {
428 return toNamespaceId(i);
429 }
430 }
431 return XmlReader::NAMESPACE_UNKNOWN;
432 }
433
handleReference(char const * position,char const * end)434 char const * XmlReader::handleReference(char const * position, char const * end)
435 {
436 OSL_ASSERT(position != 0 && *position == '&' && position < end);
437 ++position;
438 if (*position == '#') {
439 ++position;
440 sal_Int32 val = 0;
441 char const * p;
442 if (*position == 'x') {
443 ++position;
444 p = position;
445 for (;; ++position) {
446 char c = *position;
447 if (c >= '0' && c <= '9') {
448 val = 16 * val + (c - '0');
449 } else if (c >= 'A' && c <= 'F') {
450 val = 16 * val + (c - 'A') + 10;
451 } else if (c >= 'a' && c <= 'f') {
452 val = 16 * val + (c - 'a') + 10;
453 } else {
454 break;
455 }
456 if (val > 0x10FFFF) { // avoid overflow
457 throw css::uno::RuntimeException(
458 (rtl::OUString(
459 RTL_CONSTASCII_USTRINGPARAM(
460 "'&#x...' too large in ")) +
461 fileUrl_),
462 css::uno::Reference< css::uno::XInterface >());
463 }
464 }
465 } else {
466 p = position;
467 for (;; ++position) {
468 char c = *position;
469 if (c >= '0' && c <= '9') {
470 val = 10 * val + (c - '0');
471 } else {
472 break;
473 }
474 if (val > 0x10FFFF) { // avoid overflow
475 throw css::uno::RuntimeException(
476 (rtl::OUString(
477 RTL_CONSTASCII_USTRINGPARAM(
478 "'&#...' too large in ")) +
479 fileUrl_),
480 css::uno::Reference< css::uno::XInterface >());
481 }
482 }
483 }
484 if (position == p || *position++ != ';') {
485 throw css::uno::RuntimeException(
486 (rtl::OUString(
487 RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
488 fileUrl_),
489 css::uno::Reference< css::uno::XInterface >());
490 }
491 OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
492 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
493 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
494 {
495 throw css::uno::RuntimeException(
496 (rtl::OUString(
497 RTL_CONSTASCII_USTRINGPARAM(
498 "character reference denoting invalid character in ")) +
499 fileUrl_),
500 css::uno::Reference< css::uno::XInterface >());
501 }
502 char buf[4];
503 sal_Int32 len;
504 if (val < 0x80) {
505 buf[0] = static_cast< char >(val);
506 len = 1;
507 } else if (val < 0x800) {
508 buf[0] = static_cast< char >((val >> 6) | 0xC0);
509 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
510 len = 2;
511 } else if (val < 0x10000) {
512 buf[0] = static_cast< char >((val >> 12) | 0xE0);
513 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
514 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
515 len = 3;
516 } else {
517 buf[0] = static_cast< char >((val >> 18) | 0xF0);
518 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
519 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
520 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
521 len = 4;
522 }
523 pad_.addEphemeral(buf, len);
524 return position;
525 } else {
526 struct EntityRef {
527 char const * inBegin;
528 sal_Int32 inLength;
529 char const * outBegin;
530 sal_Int32 outLength;
531 };
532 static EntityRef const refs[] = {
533 { RTL_CONSTASCII_STRINGPARAM("amp;"),
534 RTL_CONSTASCII_STRINGPARAM("&") },
535 { RTL_CONSTASCII_STRINGPARAM("lt;"),
536 RTL_CONSTASCII_STRINGPARAM("<") },
537 { RTL_CONSTASCII_STRINGPARAM("gt;"),
538 RTL_CONSTASCII_STRINGPARAM(">") },
539 { RTL_CONSTASCII_STRINGPARAM("apos;"),
540 RTL_CONSTASCII_STRINGPARAM("'") },
541 { RTL_CONSTASCII_STRINGPARAM("quot;"),
542 RTL_CONSTASCII_STRINGPARAM("\"") } };
543 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
544 if (rtl_str_shortenedCompare_WithLength(
545 position, end - position, refs[i].inBegin, refs[i].inLength,
546 refs[i].inLength) ==
547 0)
548 {
549 position += refs[i].inLength;
550 pad_.add(refs[i].outBegin, refs[i].outLength);
551 return position;
552 }
553 }
554 throw css::uno::RuntimeException(
555 (rtl::OUString(
556 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
557 fileUrl_),
558 css::uno::Reference< css::uno::XInterface >());
559 }
560 }
561
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)562 Span XmlReader::handleAttributeValue(
563 char const * begin, char const * end, bool fullyNormalize)
564 {
565 pad_.clear();
566 if (fullyNormalize) {
567 while (begin != end && isSpace(*begin)) {
568 ++begin;
569 }
570 while (end != begin && isSpace(end[-1])) {
571 --end;
572 }
573 char const * p = begin;
574 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
575 // a single true space character can go into the current span,
576 // everything else breaks the span
577 Space space = SPACE_NONE;
578 while (p != end) {
579 switch (*p) {
580 case '\x09':
581 case '\x0A':
582 case '\x0D':
583 switch (space) {
584 case SPACE_NONE:
585 pad_.add(begin, p - begin);
586 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
587 space = SPACE_BREAK;
588 break;
589 case SPACE_SPAN:
590 pad_.add(begin, p - begin);
591 space = SPACE_BREAK;
592 break;
593 case SPACE_BREAK:
594 break;
595 }
596 begin = ++p;
597 break;
598 case ' ':
599 switch (space) {
600 case SPACE_NONE:
601 ++p;
602 space = SPACE_SPAN;
603 break;
604 case SPACE_SPAN:
605 pad_.add(begin, p - begin);
606 begin = ++p;
607 space = SPACE_BREAK;
608 break;
609 case SPACE_BREAK:
610 begin = ++p;
611 break;
612 }
613 break;
614 case '&':
615 pad_.add(begin, p - begin);
616 p = handleReference(p, end);
617 begin = p;
618 space = SPACE_NONE;
619 break;
620 default:
621 ++p;
622 space = SPACE_NONE;
623 break;
624 }
625 }
626 pad_.add(begin, p - begin);
627 } else {
628 char const * p = begin;
629 while (p != end) {
630 switch (*p) {
631 case '\x09':
632 case '\x0A':
633 pad_.add(begin, p - begin);
634 begin = ++p;
635 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
636 break;
637 case '\x0D':
638 pad_.add(begin, p - begin);
639 ++p;
640 if (peek() == '\x0A') {
641 ++p;
642 }
643 begin = p;
644 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
645 break;
646 case '&':
647 pad_.add(begin, p - begin);
648 p = handleReference(p, end);
649 begin = p;
650 break;
651 default:
652 ++p;
653 break;
654 }
655 }
656 pad_.add(begin, p - begin);
657 }
658 return pad_.get();
659 }
660
handleStartTag(int * nsId,Span * localName)661 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
662 OSL_ASSERT(nsId != 0 && localName);
663 char const * nameBegin = pos_;
664 char const * nameColon = 0;
665 if (!scanName(&nameColon)) {
666 throw css::uno::RuntimeException(
667 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
668 fileUrl_),
669 css::uno::Reference< css::uno::XInterface >());
670 }
671 char const * nameEnd = pos_;
672 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
673 bool hasDefaultNs = false;
674 int defaultNsId = NAMESPACE_NONE;
675 attributes_.clear();
676 for (;;) {
677 char const * p = pos_;
678 skipSpace();
679 if (peek() == '/' || peek() == '>') {
680 break;
681 }
682 if (pos_ == p) {
683 throw css::uno::RuntimeException(
684 (rtl::OUString(
685 RTL_CONSTASCII_USTRINGPARAM(
686 "missing whitespace before attribute in ")) +
687 fileUrl_),
688 css::uno::Reference< css::uno::XInterface >());
689 }
690 char const * attrNameBegin = pos_;
691 char const * attrNameColon = 0;
692 if (!scanName(&attrNameColon)) {
693 throw css::uno::RuntimeException(
694 (rtl::OUString(
695 RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
696 fileUrl_),
697 css::uno::Reference< css::uno::XInterface >());
698 }
699 char const * attrNameEnd = pos_;
700 skipSpace();
701 if (read() != '=') {
702 throw css::uno::RuntimeException(
703 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
704 fileUrl_),
705 css::uno::Reference< css::uno::XInterface >());
706 }
707 skipSpace();
708 char del = read();
709 if (del != '\'' && del != '"') {
710 throw css::uno::RuntimeException(
711 (rtl::OUString(
712 RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
713 fileUrl_),
714 css::uno::Reference< css::uno::XInterface >());
715 }
716 char const * valueBegin = pos_;
717 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
718 if (i < 0) {
719 throw css::uno::RuntimeException(
720 (rtl::OUString(
721 RTL_CONSTASCII_USTRINGPARAM(
722 "unterminated attribute value in ")) +
723 fileUrl_),
724 css::uno::Reference< css::uno::XInterface >());
725 }
726 char const * valueEnd = pos_ + i;
727 pos_ += i + 1;
728 if (attrNameColon == 0 &&
729 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
730 RTL_CONSTASCII_STRINGPARAM("xmlns")))
731 {
732 hasDefaultNs = true;
733 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
734 } else if (attrNameColon != 0 &&
735 Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
736 RTL_CONSTASCII_STRINGPARAM("xmlns")))
737 {
738 namespaces_.push_back(
739 NamespaceData(
740 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
741 scanNamespaceIri(valueBegin, valueEnd)));
742 } else {
743 attributes_.push_back(
744 AttributeData(
745 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
746 valueEnd));
747 }
748 }
749 if (!hasDefaultNs && !elements_.empty()) {
750 defaultNsId = elements_.top().defaultNamespaceId;
751 }
752 firstAttribute_ = true;
753 if (peek() == '/') {
754 state_ = STATE_EMPTY_ELEMENT_TAG;
755 ++pos_;
756 } else {
757 state_ = STATE_CONTENT;
758 }
759 if (peek() != '>') {
760 throw css::uno::RuntimeException(
761 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
762 fileUrl_),
763 css::uno::Reference< css::uno::XInterface >());
764 }
765 ++pos_;
766 elements_.push(
767 ElementData(
768 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
769 defaultNsId));
770 if (nameColon == 0) {
771 *nsId = defaultNsId;
772 *localName = Span(nameBegin, nameEnd - nameBegin);
773 } else {
774 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
775 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
776 }
777 return RESULT_BEGIN;
778 }
779
handleEndTag()780 XmlReader::Result XmlReader::handleEndTag() {
781 if (elements_.empty()) {
782 throw css::uno::RuntimeException(
783 (rtl::OUString(
784 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
785 fileUrl_),
786 css::uno::Reference< css::uno::XInterface >());
787 }
788 char const * nameBegin = pos_;
789 char const * nameColon = 0;
790 if (!scanName(&nameColon) ||
791 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
792 {
793 throw css::uno::RuntimeException(
794 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
795 fileUrl_),
796 css::uno::Reference< css::uno::XInterface >());
797 }
798 handleElementEnd();
799 skipSpace();
800 if (peek() != '>') {
801 throw css::uno::RuntimeException(
802 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
803 fileUrl_),
804 css::uno::Reference< css::uno::XInterface >());
805 }
806 ++pos_;
807 return RESULT_END;
808 }
809
handleElementEnd()810 void XmlReader::handleElementEnd() {
811 OSL_ASSERT(!elements_.empty());
812 namespaces_.resize(elements_.top().inheritedNamespaces);
813 elements_.pop();
814 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
815 }
816
handleSkippedText(Span * data,int * nsId)817 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
818 for (;;) {
819 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
820 if (i < 0) {
821 throw css::uno::RuntimeException(
822 (rtl::OUString(
823 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
824 fileUrl_),
825 css::uno::Reference< css::uno::XInterface >());
826 }
827 pos_ += i + 1;
828 switch (peek()) {
829 case '!':
830 ++pos_;
831 if (!skipComment() && !scanCdataSection().is()) {
832 skipDocumentTypeDeclaration();
833 }
834 break;
835 case '/':
836 ++pos_;
837 return handleEndTag();
838 case '?':
839 ++pos_;
840 skipProcessingInstruction();
841 break;
842 default:
843 return handleStartTag(nsId, data);
844 }
845 }
846 }
847
handleRawText(Span * text)848 XmlReader::Result XmlReader::handleRawText(Span * text) {
849 pad_.clear();
850 for (char const * begin = pos_;;) {
851 switch (peek()) {
852 case '\0': // i.e., EOF
853 throw css::uno::RuntimeException(
854 (rtl::OUString(
855 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
856 fileUrl_),
857 css::uno::Reference< css::uno::XInterface >());
858 case '\x0D':
859 pad_.add(begin, pos_ - begin);
860 ++pos_;
861 if (peek() != '\x0A') {
862 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
863 }
864 begin = pos_;
865 break;
866 case '&':
867 pad_.add(begin, pos_ - begin);
868 pos_ = handleReference(pos_, end_);
869 begin = pos_;
870 break;
871 case '<':
872 pad_.add(begin, pos_ - begin);
873 ++pos_;
874 switch (peek()) {
875 case '!':
876 ++pos_;
877 if (!skipComment()) {
878 Span cdata(scanCdataSection());
879 if (cdata.is()) {
880 normalizeLineEnds(cdata);
881 } else {
882 skipDocumentTypeDeclaration();
883 }
884 }
885 begin = pos_;
886 break;
887 case '/':
888 *text = pad_.get();
889 ++pos_;
890 state_ = STATE_END_TAG;
891 return RESULT_TEXT;
892 case '?':
893 ++pos_;
894 skipProcessingInstruction();
895 begin = pos_;
896 break;
897 default:
898 *text = pad_.get();
899 state_ = STATE_START_TAG;
900 return RESULT_TEXT;
901 }
902 break;
903 default:
904 ++pos_;
905 break;
906 }
907 }
908 }
909
handleNormalizedText(Span * text)910 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
911 pad_.clear();
912 char const * flowBegin = pos_;
913 char const * flowEnd = pos_;
914 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
915 // a single true space character can go into the current flow,
916 // everything else breaks the flow
917 Space space = SPACE_START;
918 for (;;) {
919 switch (peek()) {
920 case '\0': // i.e., EOF
921 throw css::uno::RuntimeException(
922 (rtl::OUString(
923 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
924 fileUrl_),
925 css::uno::Reference< css::uno::XInterface >());
926 case '\x09':
927 case '\x0A':
928 case '\x0D':
929 switch (space) {
930 case SPACE_START:
931 case SPACE_BREAK:
932 break;
933 case SPACE_NONE:
934 case SPACE_SPAN:
935 space = SPACE_BREAK;
936 break;
937 }
938 ++pos_;
939 break;
940 case ' ':
941 switch (space) {
942 case SPACE_START:
943 case SPACE_BREAK:
944 break;
945 case SPACE_NONE:
946 space = SPACE_SPAN;
947 break;
948 case SPACE_SPAN:
949 space = SPACE_BREAK;
950 break;
951 }
952 ++pos_;
953 break;
954 case '&':
955 switch (space) {
956 case SPACE_START:
957 break;
958 case SPACE_NONE:
959 case SPACE_SPAN:
960 pad_.add(flowBegin, pos_ - flowBegin);
961 break;
962 case SPACE_BREAK:
963 pad_.add(flowBegin, flowEnd - flowBegin);
964 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
965 break;
966 }
967 pos_ = handleReference(pos_, end_);
968 flowBegin = pos_;
969 flowEnd = pos_;
970 space = SPACE_NONE;
971 break;
972 case '<':
973 ++pos_;
974 switch (peek()) {
975 case '!':
976 ++pos_;
977 if (skipComment()) {
978 space = SPACE_BREAK;
979 } else {
980 Span cdata(scanCdataSection());
981 if (cdata.is()) {
982 // CDATA is not normalized (similar to character
983 // references; it keeps the code simple), but it might
984 // arguably be better to normalize it:
985 switch (space) {
986 case SPACE_START:
987 break;
988 case SPACE_NONE:
989 case SPACE_SPAN:
990 pad_.add(flowBegin, pos_ - flowBegin);
991 break;
992 case SPACE_BREAK:
993 pad_.add(flowBegin, flowEnd - flowBegin);
994 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
995 break;
996 }
997 normalizeLineEnds(cdata);
998 flowBegin = pos_;
999 flowEnd = pos_;
1000 space = SPACE_NONE;
1001 } else {
1002 skipDocumentTypeDeclaration();
1003 }
1004 }
1005 break;
1006 case '/':
1007 ++pos_;
1008 pad_.add(flowBegin, flowEnd - flowBegin);
1009 *text = pad_.get();
1010 state_ = STATE_END_TAG;
1011 return RESULT_TEXT;
1012 case '?':
1013 ++pos_;
1014 skipProcessingInstruction();
1015 space = SPACE_BREAK;
1016 break;
1017 default:
1018 pad_.add(flowBegin, flowEnd - flowBegin);
1019 *text = pad_.get();
1020 state_ = STATE_START_TAG;
1021 return RESULT_TEXT;
1022 }
1023 break;
1024 default:
1025 switch (space) {
1026 case SPACE_START:
1027 flowBegin = pos_;
1028 break;
1029 case SPACE_NONE:
1030 case SPACE_SPAN:
1031 break;
1032 case SPACE_BREAK:
1033 pad_.add(flowBegin, flowEnd - flowBegin);
1034 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1035 flowBegin = pos_;
1036 break;
1037 }
1038 flowEnd = ++pos_;
1039 space = SPACE_NONE;
1040 break;
1041 }
1042 }
1043 }
1044
toNamespaceId(NamespaceIris::size_type pos)1045 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1046 OSL_ASSERT(pos <= INT_MAX);
1047 return static_cast< int >(pos);
1048 }
1049
1050 }
1051