xref: /trunk/main/xmlreader/source/xmlreader.cxx (revision b63233d8)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "sal/config.h"
25 
26 #include <climits>
27 #include <cstddef>
28 
29 #include "com/sun/star/container/NoSuchElementException.hpp"
30 #include "com/sun/star/uno/Reference.hxx"
31 #include "com/sun/star/uno/RuntimeException.hpp"
32 #include "com/sun/star/uno/XInterface.hpp"
33 #include "osl/diagnose.h"
34 #include "osl/file.h"
35 #include "rtl/string.h"
36 #include "rtl/ustring.h"
37 #include "rtl/ustring.hxx"
38 #include "sal/types.h"
39 #include "xmlreader/pad.hxx"
40 #include "xmlreader/span.hxx"
41 #include "xmlreader/xmlreader.hxx"
42 
43 namespace xmlreader {
44 
45 namespace {
46 
47 namespace css = com::sun::star;
48 
// Reports whether c is one of the four white space characters recognized by
// this reader: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
60 
61 }
62 
XmlReader(rtl::OUString const & fileUrl)63 XmlReader::XmlReader(rtl::OUString const & fileUrl)
64     SAL_THROW((
65         css::container::NoSuchElementException, css::uno::RuntimeException)):
66     fileUrl_(fileUrl)
67 {
68     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
69     {
70     case osl_File_E_None:
71         break;
72     case osl_File_E_NOENT:
73         throw css::container::NoSuchElementException(
74             fileUrl_, css::uno::Reference< css::uno::XInterface >());
75     default:
76         throw css::uno::RuntimeException(
77             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
78              fileUrl_),
79             css::uno::Reference< css::uno::XInterface >());
80     }
81     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
82     if (e == osl_File_E_None) {
83         e = osl_mapFile(
84             fileHandle_, &fileAddress_, fileSize_, 0,
85             osl_File_MapFlag_WillNeed);
86     }
87     if (e != osl_File_E_None) {
88         e = osl_closeFile(fileHandle_);
89         if (e != osl_File_E_None) {
90             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
91         }
92         throw css::uno::RuntimeException(
93             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
94              fileUrl_),
95             css::uno::Reference< css::uno::XInterface >());
96     }
97     namespaceIris_.push_back(
98         Span(
99             RTL_CONSTASCII_STRINGPARAM(
100                 "http://www.w3.org/XML/1998/namespace")));
101     namespaces_.push_back(
102         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
103     pos_ = static_cast< char * >(fileAddress_);
104     end_ = pos_ + fileSize_;
105     state_ = STATE_CONTENT;
106 }
107 
~XmlReader()108 XmlReader::~XmlReader() {
109     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
110     if (e != osl_File_E_None) {
111         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
112     }
113     e = osl_closeFile(fileHandle_);
114     if (e != osl_File_E_None) {
115         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
116     }
117 }
118 
registerNamespaceIri(Span const & iri)119 int XmlReader::registerNamespaceIri(Span const & iri) {
120     int id = toNamespaceId(namespaceIris_.size());
121     namespaceIris_.push_back(iri);
122     if (iri.equals(
123             Span(
124                 RTL_CONSTASCII_STRINGPARAM(
125                     "http://www.w3.org/2001/XMLSchema-instance"))))
126     {
127         // Old user layer .xcu files used the xsi namespace prefix without
128         // declaring a corresponding namespace binding, see issue 77174; reading
129         // those files during migration would fail without this hack that can be
130         // removed once migration is no longer relevant (see
131         // configmgr::Components::parseModificationLayer):
132         namespaces_.push_back(
133             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
134     }
135     return id;
136 }
137 
nextItem(Text reportText,Span * data,int * nsId)138 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
139 {
140     switch (state_) {
141     case STATE_CONTENT:
142         switch (reportText) {
143         case TEXT_NONE:
144             return handleSkippedText(data, nsId);
145         case TEXT_RAW:
146             return handleRawText(data);
147         case TEXT_NORMALIZED:
148             return handleNormalizedText(data);
149         }
150     case STATE_START_TAG:
151         return handleStartTag(nsId, data);
152     case STATE_END_TAG:
153         return handleEndTag();
154     case STATE_EMPTY_ELEMENT_TAG:
155         handleElementEnd();
156         return RESULT_END;
157     default: // STATE_DONE
158         return RESULT_DONE;
159     }
160 }
161 
nextAttribute(int * nsId,Span * localName)162 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
163     OSL_ASSERT(nsId != 0 && localName != 0);
164     if (firstAttribute_) {
165         currentAttribute_ = attributes_.begin();
166         firstAttribute_ = false;
167     } else {
168         ++currentAttribute_;
169     }
170     if (currentAttribute_ == attributes_.end()) {
171         return false;
172     }
173     if (currentAttribute_->nameColon == 0) {
174         *nsId = NAMESPACE_NONE;
175         *localName = Span(
176             currentAttribute_->nameBegin,
177             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
178     } else {
179         *nsId = getNamespaceId(
180             Span(
181                 currentAttribute_->nameBegin,
182                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
183         *localName = Span(
184             currentAttribute_->nameColon + 1,
185             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
186     }
187     return true;
188 }
189 
getAttributeValue(bool fullyNormalize)190 Span XmlReader::getAttributeValue(bool fullyNormalize) {
191     return handleAttributeValue(
192         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
193         fullyNormalize);
194 }
195 
getNamespaceId(Span const & prefix) const196 int XmlReader::getNamespaceId(Span const & prefix) const {
197     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
198          i != namespaces_.rend(); ++i)
199     {
200         if (prefix.equals(i->prefix)) {
201             return i->nsId;
202         }
203     }
204     return NAMESPACE_UNKNOWN;
205 }
206 
getUrl() const207 rtl::OUString XmlReader::getUrl() const {
208     return fileUrl_;
209 }
210 
normalizeLineEnds(Span const & text)211 void XmlReader::normalizeLineEnds(Span const & text) {
212     char const * p = text.begin;
213     sal_Int32 n = text.length;
214     for (;;) {
215         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
216         if (i < 0) {
217             break;
218         }
219         pad_.add(p, i);
220         p += i + 1;
221         n -= i + 1;
222         if (n == 0 || *p != '\x0A') {
223             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
224         }
225     }
226     pad_.add(p, n);
227 }
228 
skipSpace()229 void XmlReader::skipSpace() {
230     while (isSpace(peek())) {
231         ++pos_;
232     }
233 }
234 
skipComment()235 bool XmlReader::skipComment() {
236     if (rtl_str_shortenedCompare_WithLength(
237             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
238             RTL_CONSTASCII_LENGTH("--")) !=
239         0)
240     {
241         return false;
242     }
243     pos_ += RTL_CONSTASCII_LENGTH("--");
244     sal_Int32 i = rtl_str_indexOfStr_WithLength(
245         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
246     if (i < 0) {
247         throw css::uno::RuntimeException(
248             (rtl::OUString(
249                 RTL_CONSTASCII_USTRINGPARAM(
250                     "premature end (within comment) of ")) +
251              fileUrl_),
252             css::uno::Reference< css::uno::XInterface >());
253     }
254     pos_ += i + RTL_CONSTASCII_LENGTH("--");
255     if (read() != '>') {
256         throw css::uno::RuntimeException(
257             (rtl::OUString(
258                 RTL_CONSTASCII_USTRINGPARAM(
259                     "illegal \"--\" within comment in ")) +
260              fileUrl_),
261             css::uno::Reference< css::uno::XInterface >());
262     }
263     return true;
264 }
265 
skipProcessingInstruction()266 void XmlReader::skipProcessingInstruction() {
267     sal_Int32 i = rtl_str_indexOfStr_WithLength(
268         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
269     if (i < 0) {
270         throw css::uno::RuntimeException(
271             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
272              fileUrl_),
273             css::uno::Reference< css::uno::XInterface >());
274     }
275     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
276 }
277 
skipDocumentTypeDeclaration()278 void XmlReader::skipDocumentTypeDeclaration() {
279     // Neither is it checked that the doctypedecl is at the correct position in
280     // the document, nor that it is well-formed:
281     for (;;) {
282         char c = read();
283         switch (c) {
284         case '\0': // i.e., EOF
285             throw css::uno::RuntimeException(
286                 (rtl::OUString(
287                     RTL_CONSTASCII_USTRINGPARAM(
288                         "premature end (within DTD) of ")) +
289                  fileUrl_),
290                 css::uno::Reference< css::uno::XInterface >());
291         case '"':
292         case '\'':
293             {
294                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
295                     pos_, end_ - pos_, c);
296                 if (i < 0) {
297                     throw css::uno::RuntimeException(
298                         (rtl::OUString(
299                             RTL_CONSTASCII_USTRINGPARAM(
300                                 "premature end (within DTD) of ")) +
301                          fileUrl_),
302                         css::uno::Reference< css::uno::XInterface >());
303                 }
304                 pos_ += i + 1;
305             }
306             break;
307         case '>':
308             return;
309         case '[':
310             for (;;) {
311                 c = read();
312                 switch (c) {
313                 case '\0': // i.e., EOF
314                     throw css::uno::RuntimeException(
315                         (rtl::OUString(
316                             RTL_CONSTASCII_USTRINGPARAM(
317                                 "premature end (within DTD) of ")) +
318                          fileUrl_),
319                         css::uno::Reference< css::uno::XInterface >());
320                 case '"':
321                 case '\'':
322                     {
323                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
324                             pos_, end_ - pos_, c);
325                         if (i < 0) {
326                             throw css::uno::RuntimeException(
327                             (rtl::OUString(
328                                 RTL_CONSTASCII_USTRINGPARAM(
329                                     "premature end (within DTD) of ")) +
330                              fileUrl_),
331                             css::uno::Reference< css::uno::XInterface >());
332                         }
333                         pos_ += i + 1;
334                     }
335                     break;
336                 case '<':
337                     switch (read()) {
338                     case '\0': // i.e., EOF
339                         throw css::uno::RuntimeException(
340                             (rtl::OUString(
341                                 RTL_CONSTASCII_USTRINGPARAM(
342                                     "premature end (within DTD) of ")) +
343                              fileUrl_),
344                             css::uno::Reference< css::uno::XInterface >());
345                     case '!':
346                         skipComment();
347                         break;
348                     case '?':
349                         skipProcessingInstruction();
350                         break;
351                     default:
352                         break;
353                     }
354                     break;
355                 case ']':
356                     skipSpace();
357                     if (read() != '>') {
358                         throw css::uno::RuntimeException(
359                             (rtl::OUString(
360                                 RTL_CONSTASCII_USTRINGPARAM(
361                                     "missing \">\" of DTD in ")) +
362                              fileUrl_),
363                             css::uno::Reference< css::uno::XInterface >());
364                     }
365                     return;
366                 default:
367                     break;
368                 }
369             }
370         default:
371             break;
372         }
373     }
374 }
375 
scanCdataSection()376 Span XmlReader::scanCdataSection() {
377     if (rtl_str_shortenedCompare_WithLength(
378             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
379             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
380         0)
381     {
382         return Span();
383     }
384     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
385     char const * begin = pos_;
386     sal_Int32 i = rtl_str_indexOfStr_WithLength(
387         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
388     if (i < 0) {
389         throw css::uno::RuntimeException(
390             (rtl::OUString(
391                 RTL_CONSTASCII_USTRINGPARAM(
392                     "premature end (within CDATA section) of ")) +
393              fileUrl_),
394             css::uno::Reference< css::uno::XInterface >());
395     }
396     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
397     return Span(begin, i);
398 }
399 
scanName(char const ** nameColon)400 bool XmlReader::scanName(char const ** nameColon) {
401     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
402     for (char const * begin = pos_;; ++pos_) {
403         switch (peek()) {
404         case '\0': // i.e., EOF
405         case '\x09':
406         case '\x0A':
407         case '\x0D':
408         case ' ':
409         case '/':
410         case '=':
411         case '>':
412             return pos_ != begin;
413         case ':':
414             *nameColon = pos_;
415             break;
416         default:
417             break;
418         }
419     }
420 }
421 
scanNamespaceIri(char const * begin,char const * end)422 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
423     OSL_ASSERT(begin != 0 && begin <= end);
424     Span iri(handleAttributeValue(begin, end, false));
425     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
426         if (namespaceIris_[i].equals(iri)) {
427             return toNamespaceId(i);
428         }
429     }
430     return XmlReader::NAMESPACE_UNKNOWN;
431 }
432 
handleReference(char const * position,char const * end)433 char const * XmlReader::handleReference(char const * position, char const * end)
434 {
435     OSL_ASSERT(position != 0 && *position == '&' && position < end);
436     ++position;
437     if (*position == '#') {
438         ++position;
439         sal_Int32 val = 0;
440         char const * p;
441         if (*position == 'x') {
442             ++position;
443             p = position;
444             for (;; ++position) {
445                 char c = *position;
446                 if (c >= '0' && c <= '9') {
447                     val = 16 * val + (c - '0');
448                 } else if (c >= 'A' && c <= 'F') {
449                     val = 16 * val + (c - 'A') + 10;
450                 } else if (c >= 'a' && c <= 'f') {
451                     val = 16 * val + (c - 'a') + 10;
452                 } else {
453                     break;
454                 }
455                 if (val > 0x10FFFF) { // avoid overflow
456                     throw css::uno::RuntimeException(
457                         (rtl::OUString(
458                             RTL_CONSTASCII_USTRINGPARAM(
459                                 "'&#x...' too large in ")) +
460                          fileUrl_),
461                         css::uno::Reference< css::uno::XInterface >());
462                 }
463             }
464         } else {
465             p = position;
466             for (;; ++position) {
467                 char c = *position;
468                 if (c >= '0' && c <= '9') {
469                     val = 10 * val + (c - '0');
470                 } else {
471                     break;
472                 }
473                 if (val > 0x10FFFF) { // avoid overflow
474                     throw css::uno::RuntimeException(
475                         (rtl::OUString(
476                             RTL_CONSTASCII_USTRINGPARAM(
477                                 "'&#...' too large in ")) +
478                          fileUrl_),
479                         css::uno::Reference< css::uno::XInterface >());
480                 }
481             }
482         }
483         if (position == p || *position++ != ';') {
484             throw css::uno::RuntimeException(
485                 (rtl::OUString(
486                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
487                  fileUrl_),
488                 css::uno::Reference< css::uno::XInterface >());
489         }
490         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
491         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
492             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
493         {
494             throw css::uno::RuntimeException(
495                 (rtl::OUString(
496                     RTL_CONSTASCII_USTRINGPARAM(
497                         "character reference denoting invalid character in ")) +
498                  fileUrl_),
499                 css::uno::Reference< css::uno::XInterface >());
500         }
501         char buf[4];
502         sal_Int32 len;
503         if (val < 0x80) {
504             buf[0] = static_cast< char >(val);
505             len = 1;
506         } else if (val < 0x800) {
507             buf[0] = static_cast< char >((val >> 6) | 0xC0);
508             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
509             len = 2;
510         } else if (val < 0x10000) {
511             buf[0] = static_cast< char >((val >> 12) | 0xE0);
512             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
513             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
514             len = 3;
515         } else {
516             buf[0] = static_cast< char >((val >> 18) | 0xF0);
517             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
518             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
519             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
520             len = 4;
521         }
522         pad_.addEphemeral(buf, len);
523         return position;
524     } else {
525         struct EntityRef {
526             char const * inBegin;
527             sal_Int32 inLength;
528             char const * outBegin;
529             sal_Int32 outLength;
530         };
531         static EntityRef const refs[] = {
532             { RTL_CONSTASCII_STRINGPARAM("amp;"),
533               RTL_CONSTASCII_STRINGPARAM("&") },
534             { RTL_CONSTASCII_STRINGPARAM("lt;"),
535               RTL_CONSTASCII_STRINGPARAM("<") },
536             { RTL_CONSTASCII_STRINGPARAM("gt;"),
537               RTL_CONSTASCII_STRINGPARAM(">") },
538             { RTL_CONSTASCII_STRINGPARAM("apos;"),
539               RTL_CONSTASCII_STRINGPARAM("'") },
540             { RTL_CONSTASCII_STRINGPARAM("quot;"),
541               RTL_CONSTASCII_STRINGPARAM("\"") } };
542         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
543             if (rtl_str_shortenedCompare_WithLength(
544                     position, end - position, refs[i].inBegin, refs[i].inLength,
545                     refs[i].inLength) ==
546                 0)
547             {
548                 position += refs[i].inLength;
549                 pad_.add(refs[i].outBegin, refs[i].outLength);
550                 return position;
551             }
552         }
553         throw css::uno::RuntimeException(
554             (rtl::OUString(
555                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
556              fileUrl_),
557             css::uno::Reference< css::uno::XInterface >());
558     }
559 }
560 
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)561 Span XmlReader::handleAttributeValue(
562     char const * begin, char const * end, bool fullyNormalize)
563 {
564     pad_.clear();
565     if (fullyNormalize) {
566         while (begin != end && isSpace(*begin)) {
567             ++begin;
568         }
569         while (end != begin && isSpace(end[-1])) {
570             --end;
571         }
572         char const * p = begin;
573         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
574             // a single true space character can go into the current span,
575             // everything else breaks the span
576         Space space = SPACE_NONE;
577         while (p != end) {
578             switch (*p) {
579             case '\x09':
580             case '\x0A':
581             case '\x0D':
582                 switch (space) {
583                 case SPACE_NONE:
584                     pad_.add(begin, p - begin);
585                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
586                     space = SPACE_BREAK;
587                     break;
588                 case SPACE_SPAN:
589                     pad_.add(begin, p - begin);
590                     space = SPACE_BREAK;
591                     break;
592                 case SPACE_BREAK:
593                     break;
594                 }
595                 begin = ++p;
596                 break;
597             case ' ':
598                 switch (space) {
599                 case SPACE_NONE:
600                     ++p;
601                     space = SPACE_SPAN;
602                     break;
603                 case SPACE_SPAN:
604                     pad_.add(begin, p - begin);
605                     begin = ++p;
606                     space = SPACE_BREAK;
607                     break;
608                 case SPACE_BREAK:
609                     begin = ++p;
610                     break;
611                 }
612                 break;
613             case '&':
614                 pad_.add(begin, p - begin);
615                 p = handleReference(p, end);
616                 begin = p;
617                 space = SPACE_NONE;
618                 break;
619             default:
620                 ++p;
621                 space = SPACE_NONE;
622                 break;
623             }
624         }
625         pad_.add(begin, p - begin);
626     } else {
627         char const * p = begin;
628         while (p != end) {
629             switch (*p) {
630             case '\x09':
631             case '\x0A':
632                 pad_.add(begin, p - begin);
633                 begin = ++p;
634                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
635                 break;
636             case '\x0D':
637                 pad_.add(begin, p - begin);
638                 ++p;
639                 if (peek() == '\x0A') {
640                     ++p;
641                 }
642                 begin = p;
643                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
644                 break;
645             case '&':
646                 pad_.add(begin, p - begin);
647                 p = handleReference(p, end);
648                 begin = p;
649                 break;
650             default:
651                 ++p;
652                 break;
653             }
654         }
655         pad_.add(begin, p - begin);
656     }
657     return pad_.get();
658 }
659 
handleStartTag(int * nsId,Span * localName)660 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
661     OSL_ASSERT(nsId != 0 && localName);
662     char const * nameBegin = pos_;
663     char const * nameColon = 0;
664     if (!scanName(&nameColon)) {
665         throw css::uno::RuntimeException(
666             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
667              fileUrl_),
668             css::uno::Reference< css::uno::XInterface >());
669     }
670     char const * nameEnd = pos_;
671     NamespaceList::size_type inheritedNamespaces = namespaces_.size();
672     bool hasDefaultNs = false;
673     int defaultNsId = NAMESPACE_NONE;
674     attributes_.clear();
675     for (;;) {
676         char const * p = pos_;
677         skipSpace();
678         if (peek() == '/' || peek() == '>') {
679             break;
680         }
681         if (pos_ == p) {
682             throw css::uno::RuntimeException(
683                 (rtl::OUString(
684                     RTL_CONSTASCII_USTRINGPARAM(
685                         "missing whitespace before attribute in ")) +
686                  fileUrl_),
687                 css::uno::Reference< css::uno::XInterface >());
688         }
689         char const * attrNameBegin = pos_;
690         char const * attrNameColon = 0;
691         if (!scanName(&attrNameColon)) {
692             throw css::uno::RuntimeException(
693                 (rtl::OUString(
694                     RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
695                  fileUrl_),
696                 css::uno::Reference< css::uno::XInterface >());
697         }
698         char const * attrNameEnd = pos_;
699         skipSpace();
700         if (read() != '=') {
701             throw css::uno::RuntimeException(
702                 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
703                  fileUrl_),
704                 css::uno::Reference< css::uno::XInterface >());
705         }
706         skipSpace();
707         char del = read();
708         if (del != '\'' && del != '"') {
709             throw css::uno::RuntimeException(
710                 (rtl::OUString(
711                     RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
712                  fileUrl_),
713                 css::uno::Reference< css::uno::XInterface >());
714         }
715         char const * valueBegin = pos_;
716         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
717         if (i < 0) {
718             throw css::uno::RuntimeException(
719                 (rtl::OUString(
720                     RTL_CONSTASCII_USTRINGPARAM(
721                         "unterminated attribute value in ")) +
722                  fileUrl_),
723                 css::uno::Reference< css::uno::XInterface >());
724         }
725         char const * valueEnd = pos_ + i;
726         pos_ += i + 1;
727         if (attrNameColon == 0 &&
728             Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
729                 RTL_CONSTASCII_STRINGPARAM("xmlns")))
730         {
731             hasDefaultNs = true;
732             defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
733         } else if (attrNameColon != 0 &&
734                    Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
735                        RTL_CONSTASCII_STRINGPARAM("xmlns")))
736         {
737             namespaces_.push_back(
738                 NamespaceData(
739                     Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
740                     scanNamespaceIri(valueBegin, valueEnd)));
741         } else {
742             attributes_.push_back(
743                 AttributeData(
744                     attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
745                     valueEnd));
746         }
747     }
748     if (!hasDefaultNs && !elements_.empty()) {
749         defaultNsId = elements_.top().defaultNamespaceId;
750     }
751     firstAttribute_ = true;
752     if (peek() == '/') {
753         state_ = STATE_EMPTY_ELEMENT_TAG;
754         ++pos_;
755     } else {
756         state_ = STATE_CONTENT;
757     }
758     if (peek() != '>') {
759         throw css::uno::RuntimeException(
760             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
761              fileUrl_),
762             css::uno::Reference< css::uno::XInterface >());
763     }
764     ++pos_;
765     elements_.push(
766         ElementData(
767             Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
768             defaultNsId));
769     if (nameColon == 0) {
770         *nsId = defaultNsId;
771         *localName = Span(nameBegin, nameEnd - nameBegin);
772     } else {
773         *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
774         *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
775     }
776     return RESULT_BEGIN;
777 }
778 
handleEndTag()779 XmlReader::Result XmlReader::handleEndTag() {
780     if (elements_.empty()) {
781         throw css::uno::RuntimeException(
782             (rtl::OUString(
783                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
784              fileUrl_),
785             css::uno::Reference< css::uno::XInterface >());
786     }
787     char const * nameBegin = pos_;
788     char const * nameColon = 0;
789     if (!scanName(&nameColon) ||
790         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
791     {
792         throw css::uno::RuntimeException(
793             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
794              fileUrl_),
795             css::uno::Reference< css::uno::XInterface >());
796     }
797     handleElementEnd();
798     skipSpace();
799     if (peek() != '>') {
800         throw css::uno::RuntimeException(
801             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
802              fileUrl_),
803             css::uno::Reference< css::uno::XInterface >());
804     }
805     ++pos_;
806     return RESULT_END;
807 }
808 
handleElementEnd()809 void XmlReader::handleElementEnd() {
810     OSL_ASSERT(!elements_.empty());
811     namespaces_.resize(elements_.top().inheritedNamespaces);
812     elements_.pop();
813     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
814 }
815 
handleSkippedText(Span * data,int * nsId)816 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
817     for (;;) {
818         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
819         if (i < 0) {
820             throw css::uno::RuntimeException(
821                 (rtl::OUString(
822                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
823                  fileUrl_),
824                 css::uno::Reference< css::uno::XInterface >());
825         }
826         pos_ += i + 1;
827         switch (peek()) {
828         case '!':
829             ++pos_;
830             if (!skipComment() && !scanCdataSection().is()) {
831                 skipDocumentTypeDeclaration();
832             }
833             break;
834         case '/':
835             ++pos_;
836             return handleEndTag();
837         case '?':
838             ++pos_;
839             skipProcessingInstruction();
840             break;
841         default:
842             return handleStartTag(nsId, data);
843         }
844     }
845 }
846 
handleRawText(Span * text)847 XmlReader::Result XmlReader::handleRawText(Span * text) {
848     pad_.clear();
849     for (char const * begin = pos_;;) {
850         switch (peek()) {
851         case '\0': // i.e., EOF
852             throw css::uno::RuntimeException(
853                 (rtl::OUString(
854                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
855                  fileUrl_),
856                 css::uno::Reference< css::uno::XInterface >());
857         case '\x0D':
858             pad_.add(begin, pos_ - begin);
859             ++pos_;
860             if (peek() != '\x0A') {
861                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
862             }
863             begin = pos_;
864             break;
865         case '&':
866             pad_.add(begin, pos_ - begin);
867             pos_ = handleReference(pos_, end_);
868             begin = pos_;
869             break;
870         case '<':
871             pad_.add(begin, pos_ - begin);
872             ++pos_;
873             switch (peek()) {
874             case '!':
875                 ++pos_;
876                 if (!skipComment()) {
877                     Span cdata(scanCdataSection());
878                     if (cdata.is()) {
879                         normalizeLineEnds(cdata);
880                     } else {
881                         skipDocumentTypeDeclaration();
882                     }
883                 }
884                 begin = pos_;
885                 break;
886             case '/':
887                 *text = pad_.get();
888                 ++pos_;
889                 state_ = STATE_END_TAG;
890                 return RESULT_TEXT;
891             case '?':
892                 ++pos_;
893                 skipProcessingInstruction();
894                 begin = pos_;
895                 break;
896             default:
897                 *text = pad_.get();
898                 state_ = STATE_START_TAG;
899                 return RESULT_TEXT;
900             }
901             break;
902         default:
903             ++pos_;
904             break;
905         }
906     }
907 }
908 
handleNormalizedText(Span * text)909 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
910     pad_.clear();
911     char const * flowBegin = pos_;
912     char const * flowEnd = pos_;
913     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
914         // a single true space character can go into the current flow,
915         // everything else breaks the flow
916     Space space = SPACE_START;
917     for (;;) {
918         switch (peek()) {
919         case '\0': // i.e., EOF
920             throw css::uno::RuntimeException(
921                 (rtl::OUString(
922                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
923                  fileUrl_),
924                 css::uno::Reference< css::uno::XInterface >());
925         case '\x09':
926         case '\x0A':
927         case '\x0D':
928             switch (space) {
929             case SPACE_START:
930             case SPACE_BREAK:
931                 break;
932             case SPACE_NONE:
933             case SPACE_SPAN:
934                 space = SPACE_BREAK;
935                 break;
936             }
937             ++pos_;
938             break;
939         case ' ':
940             switch (space) {
941             case SPACE_START:
942             case SPACE_BREAK:
943                 break;
944             case SPACE_NONE:
945                 space = SPACE_SPAN;
946                 break;
947             case SPACE_SPAN:
948                 space = SPACE_BREAK;
949                 break;
950             }
951             ++pos_;
952             break;
953         case '&':
954             switch (space) {
955             case SPACE_START:
956                 break;
957             case SPACE_NONE:
958             case SPACE_SPAN:
959                 pad_.add(flowBegin, pos_ - flowBegin);
960                 break;
961             case SPACE_BREAK:
962                 pad_.add(flowBegin, flowEnd - flowBegin);
963                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
964                 break;
965             }
966             pos_ = handleReference(pos_, end_);
967             flowBegin = pos_;
968             flowEnd = pos_;
969             space = SPACE_NONE;
970             break;
971         case '<':
972             ++pos_;
973             switch (peek()) {
974             case '!':
975                 ++pos_;
976                 if (skipComment()) {
977                     space = SPACE_BREAK;
978                 } else {
979                     Span cdata(scanCdataSection());
980                     if (cdata.is()) {
981                         // CDATA is not normalized (similar to character
982                         // references; it keeps the code simple), but it might
983                         // arguably be better to normalize it:
984                         switch (space) {
985                         case SPACE_START:
986                             break;
987                         case SPACE_NONE:
988                         case SPACE_SPAN:
989                             pad_.add(flowBegin, pos_ - flowBegin);
990                             break;
991                         case SPACE_BREAK:
992                             pad_.add(flowBegin, flowEnd - flowBegin);
993                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
994                             break;
995                         }
996                         normalizeLineEnds(cdata);
997                         flowBegin = pos_;
998                         flowEnd = pos_;
999                         space = SPACE_NONE;
1000                     } else {
1001                         skipDocumentTypeDeclaration();
1002                     }
1003                 }
1004                 break;
1005             case '/':
1006                 ++pos_;
1007                 pad_.add(flowBegin, flowEnd - flowBegin);
1008                 *text = pad_.get();
1009                 state_ = STATE_END_TAG;
1010                 return RESULT_TEXT;
1011             case '?':
1012                 ++pos_;
1013                 skipProcessingInstruction();
1014                 space = SPACE_BREAK;
1015                 break;
1016             default:
1017                 pad_.add(flowBegin, flowEnd - flowBegin);
1018                 *text = pad_.get();
1019                 state_ = STATE_START_TAG;
1020                 return RESULT_TEXT;
1021             }
1022             break;
1023         default:
1024             switch (space) {
1025             case SPACE_START:
1026                 flowBegin = pos_;
1027                 break;
1028             case SPACE_NONE:
1029             case SPACE_SPAN:
1030                 break;
1031             case SPACE_BREAK:
1032                 pad_.add(flowBegin, flowEnd - flowBegin);
1033                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1034                 flowBegin = pos_;
1035                 break;
1036             }
1037             flowEnd = ++pos_;
1038             space = SPACE_NONE;
1039             break;
1040         }
1041     }
1042 }
1043 
toNamespaceId(NamespaceIris::size_type pos)1044 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1045     OSL_ASSERT(pos <= INT_MAX);
1046     return static_cast< int >(pos);
1047 }
1048 
1049 }
1050