xref: /aoo41x/main/xmlreader/source/xmlreader.cxx (revision b725e8eb)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "precompiled_xmlreader.hxx"
25 #include "sal/config.h"
26 
27 #include <climits>
28 #include <cstddef>
29 
30 #include "com/sun/star/container/NoSuchElementException.hpp"
31 #include "com/sun/star/uno/Reference.hxx"
32 #include "com/sun/star/uno/RuntimeException.hpp"
33 #include "com/sun/star/uno/XInterface.hpp"
34 #include "osl/diagnose.h"
35 #include "osl/file.h"
36 #include "rtl/string.h"
37 #include "rtl/ustring.h"
38 #include "rtl/ustring.hxx"
39 #include "sal/types.h"
40 #include "xmlreader/pad.hxx"
41 #include "xmlreader/span.hxx"
42 #include "xmlreader/xmlreader.hxx"
43 
44 namespace xmlreader {
45 
46 namespace {
47 
48 namespace css = com::sun::star;
49 
// Tells whether c is one of the four characters treated as whitespace here:
// tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == ' ' || c == '\x09' || c == '\x0A' || c == '\x0D';
}
61 
62 }
63 
XmlReader(rtl::OUString const & fileUrl)64 XmlReader::XmlReader(rtl::OUString const & fileUrl)
65     SAL_THROW((
66         css::container::NoSuchElementException, css::uno::RuntimeException)):
67     fileUrl_(fileUrl)
68 {
69     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
70     {
71     case osl_File_E_None:
72         break;
73     case osl_File_E_NOENT:
74         throw css::container::NoSuchElementException(
75             fileUrl_, css::uno::Reference< css::uno::XInterface >());
76     default:
77         throw css::uno::RuntimeException(
78             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
79              fileUrl_),
80             css::uno::Reference< css::uno::XInterface >());
81     }
82     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
83     if (e == osl_File_E_None) {
84         e = osl_mapFile(
85             fileHandle_, &fileAddress_, fileSize_, 0,
86             osl_File_MapFlag_WillNeed);
87     }
88     if (e != osl_File_E_None) {
89         e = osl_closeFile(fileHandle_);
90         if (e != osl_File_E_None) {
91             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
92         }
93         throw css::uno::RuntimeException(
94             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
95              fileUrl_),
96             css::uno::Reference< css::uno::XInterface >());
97     }
98     namespaceIris_.push_back(
99         Span(
100             RTL_CONSTASCII_STRINGPARAM(
101                 "http://www.w3.org/XML/1998/namespace")));
102     namespaces_.push_back(
103         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
104     pos_ = static_cast< char * >(fileAddress_);
105     end_ = pos_ + fileSize_;
106     state_ = STATE_CONTENT;
107 }
108 
~XmlReader()109 XmlReader::~XmlReader() {
110     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
111     if (e != osl_File_E_None) {
112         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
113     }
114     e = osl_closeFile(fileHandle_);
115     if (e != osl_File_E_None) {
116         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
117     }
118 }
119 
registerNamespaceIri(Span const & iri)120 int XmlReader::registerNamespaceIri(Span const & iri) {
121     int id = toNamespaceId(namespaceIris_.size());
122     namespaceIris_.push_back(iri);
123     if (iri.equals(
124             Span(
125                 RTL_CONSTASCII_STRINGPARAM(
126                     "http://www.w3.org/2001/XMLSchema-instance"))))
127     {
128         // Old user layer .xcu files used the xsi namespace prefix without
129         // declaring a corresponding namespace binding, see issue 77174; reading
130         // those files during migration would fail without this hack that can be
131         // removed once migration is no longer relevant (see
132         // configmgr::Components::parseModificationLayer):
133         namespaces_.push_back(
134             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
135     }
136     return id;
137 }
138 
nextItem(Text reportText,Span * data,int * nsId)139 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
140 {
141     switch (state_) {
142     case STATE_CONTENT:
143         switch (reportText) {
144         case TEXT_NONE:
145             return handleSkippedText(data, nsId);
146         case TEXT_RAW:
147             return handleRawText(data);
148         case TEXT_NORMALIZED:
149             return handleNormalizedText(data);
150         }
151     case STATE_START_TAG:
152         return handleStartTag(nsId, data);
153     case STATE_END_TAG:
154         return handleEndTag();
155     case STATE_EMPTY_ELEMENT_TAG:
156         handleElementEnd();
157         return RESULT_END;
158     default: // STATE_DONE
159         return RESULT_DONE;
160     }
161 }
162 
nextAttribute(int * nsId,Span * localName)163 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
164     OSL_ASSERT(nsId != 0 && localName != 0);
165     if (firstAttribute_) {
166         currentAttribute_ = attributes_.begin();
167         firstAttribute_ = false;
168     } else {
169         ++currentAttribute_;
170     }
171     if (currentAttribute_ == attributes_.end()) {
172         return false;
173     }
174     if (currentAttribute_->nameColon == 0) {
175         *nsId = NAMESPACE_NONE;
176         *localName = Span(
177             currentAttribute_->nameBegin,
178             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
179     } else {
180         *nsId = getNamespaceId(
181             Span(
182                 currentAttribute_->nameBegin,
183                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
184         *localName = Span(
185             currentAttribute_->nameColon + 1,
186             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
187     }
188     return true;
189 }
190 
getAttributeValue(bool fullyNormalize)191 Span XmlReader::getAttributeValue(bool fullyNormalize) {
192     return handleAttributeValue(
193         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
194         fullyNormalize);
195 }
196 
getNamespaceId(Span const & prefix) const197 int XmlReader::getNamespaceId(Span const & prefix) const {
198     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
199          i != namespaces_.rend(); ++i)
200     {
201         if (prefix.equals(i->prefix)) {
202             return i->nsId;
203         }
204     }
205     return NAMESPACE_UNKNOWN;
206 }
207 
// Returns the file URL this reader was constructed with.
rtl::OUString XmlReader::getUrl() const {
    return fileUrl_;
}
211 
normalizeLineEnds(Span const & text)212 void XmlReader::normalizeLineEnds(Span const & text) {
213     char const * p = text.begin;
214     sal_Int32 n = text.length;
215     for (;;) {
216         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
217         if (i < 0) {
218             break;
219         }
220         pad_.add(p, i);
221         p += i + 1;
222         n -= i + 1;
223         if (n == 0 || *p != '\x0A') {
224             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
225         }
226     }
227     pad_.add(p, n);
228 }
229 
skipSpace()230 void XmlReader::skipSpace() {
231     while (isSpace(peek())) {
232         ++pos_;
233     }
234 }
235 
skipComment()236 bool XmlReader::skipComment() {
237     if (rtl_str_shortenedCompare_WithLength(
238             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
239             RTL_CONSTASCII_LENGTH("--")) !=
240         0)
241     {
242         return false;
243     }
244     pos_ += RTL_CONSTASCII_LENGTH("--");
245     sal_Int32 i = rtl_str_indexOfStr_WithLength(
246         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
247     if (i < 0) {
248         throw css::uno::RuntimeException(
249             (rtl::OUString(
250                 RTL_CONSTASCII_USTRINGPARAM(
251                     "premature end (within comment) of ")) +
252              fileUrl_),
253             css::uno::Reference< css::uno::XInterface >());
254     }
255     pos_ += i + RTL_CONSTASCII_LENGTH("--");
256     if (read() != '>') {
257         throw css::uno::RuntimeException(
258             (rtl::OUString(
259                 RTL_CONSTASCII_USTRINGPARAM(
260                     "illegal \"--\" within comment in ")) +
261              fileUrl_),
262             css::uno::Reference< css::uno::XInterface >());
263     }
264     return true;
265 }
266 
skipProcessingInstruction()267 void XmlReader::skipProcessingInstruction() {
268     sal_Int32 i = rtl_str_indexOfStr_WithLength(
269         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
270     if (i < 0) {
271         throw css::uno::RuntimeException(
272             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
273              fileUrl_),
274             css::uno::Reference< css::uno::XInterface >());
275     }
276     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
277 }
278 
// Skips over a document type declaration, consuming input through its closing
// '>'.
//
// Quoted literals (delimited by '"' or '\'') are skipped as a whole, so
// special characters inside them are not interpreted.  A '[' starts the
// internal subset, which is scanned by the nested loop until the matching
// ']', after which optional whitespace and a mandatory '>' must follow.
//
// Throws css::uno::RuntimeException if the input ends before the declaration
// is complete, or if the ']' of the internal subset is not followed by '>'.
void XmlReader::skipDocumentTypeDeclaration() {
    // Neither is it checked that the doctypedecl is at the correct position in
    // the document, nor that it is well-formed:
    for (;;) {
        char c = read();
        switch (c) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "premature end (within DTD) of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '"':
        case '\'':
            {
                // Jump directly behind the matching closing quote character.
                sal_Int32 i = rtl_str_indexOfChar_WithLength(
                    pos_, end_ - pos_, c);
                if (i < 0) {
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                }
                pos_ += i + 1;
            }
            break;
        case '>':
            // End of a declaration without (or outside of) an internal subset.
            return;
        case '[':
            // Internal subset: scan until the closing ']'; every path out of
            // this nested loop either returns or throws.
            for (;;) {
                c = read();
                switch (c) {
                case '\0': // i.e., EOF
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                case '"':
                case '\'':
                    {
                        // Same quoted-literal skipping as in the outer loop.
                        sal_Int32 i = rtl_str_indexOfChar_WithLength(
                            pos_, end_ - pos_, c);
                        if (i < 0) {
                            throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "premature end (within DTD) of ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                        }
                        pos_ += i + 1;
                    }
                    break;
                case '<':
                    switch (read()) {
                    case '\0': // i.e., EOF
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "premature end (within DTD) of ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    case '!':
                        // The result is deliberately ignored: if this is not a
                        // comment, scanning simply continues character by
                        // character.
                        skipComment();
                        break;
                    case '?':
                        skipProcessingInstruction();
                        break;
                    default:
                        break;
                    }
                    break;
                case ']':
                    // End of the internal subset; only whitespace may precede
                    // the final '>'.
                    skipSpace();
                    if (read() != '>') {
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "missing \">\" of DTD in ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    }
                    return;
                default:
                    break;
                }
            }
        default:
            break;
        }
    }
}
376 
scanCdataSection()377 Span XmlReader::scanCdataSection() {
378     if (rtl_str_shortenedCompare_WithLength(
379             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
380             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
381         0)
382     {
383         return Span();
384     }
385     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
386     char const * begin = pos_;
387     sal_Int32 i = rtl_str_indexOfStr_WithLength(
388         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
389     if (i < 0) {
390         throw css::uno::RuntimeException(
391             (rtl::OUString(
392                 RTL_CONSTASCII_USTRINGPARAM(
393                     "premature end (within CDATA section) of ")) +
394              fileUrl_),
395             css::uno::Reference< css::uno::XInterface >());
396     }
397     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
398     return Span(begin, i);
399 }
400 
scanName(char const ** nameColon)401 bool XmlReader::scanName(char const ** nameColon) {
402     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
403     for (char const * begin = pos_;; ++pos_) {
404         switch (peek()) {
405         case '\0': // i.e., EOF
406         case '\x09':
407         case '\x0A':
408         case '\x0D':
409         case ' ':
410         case '/':
411         case '=':
412         case '>':
413             return pos_ != begin;
414         case ':':
415             *nameColon = pos_;
416             break;
417         default:
418             break;
419         }
420     }
421 }
422 
scanNamespaceIri(char const * begin,char const * end)423 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
424     OSL_ASSERT(begin != 0 && begin <= end);
425     Span iri(handleAttributeValue(begin, end, false));
426     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
427         if (namespaceIris_[i].equals(iri)) {
428             return toNamespaceId(i);
429         }
430     }
431     return XmlReader::NAMESPACE_UNKNOWN;
432 }
433 
handleReference(char const * position,char const * end)434 char const * XmlReader::handleReference(char const * position, char const * end)
435 {
436     OSL_ASSERT(position != 0 && *position == '&' && position < end);
437     ++position;
438     if (*position == '#') {
439         ++position;
440         sal_Int32 val = 0;
441         char const * p;
442         if (*position == 'x') {
443             ++position;
444             p = position;
445             for (;; ++position) {
446                 char c = *position;
447                 if (c >= '0' && c <= '9') {
448                     val = 16 * val + (c - '0');
449                 } else if (c >= 'A' && c <= 'F') {
450                     val = 16 * val + (c - 'A') + 10;
451                 } else if (c >= 'a' && c <= 'f') {
452                     val = 16 * val + (c - 'a') + 10;
453                 } else {
454                     break;
455                 }
456                 if (val > 0x10FFFF) { // avoid overflow
457                     throw css::uno::RuntimeException(
458                         (rtl::OUString(
459                             RTL_CONSTASCII_USTRINGPARAM(
460                                 "'&#x...' too large in ")) +
461                          fileUrl_),
462                         css::uno::Reference< css::uno::XInterface >());
463                 }
464             }
465         } else {
466             p = position;
467             for (;; ++position) {
468                 char c = *position;
469                 if (c >= '0' && c <= '9') {
470                     val = 10 * val + (c - '0');
471                 } else {
472                     break;
473                 }
474                 if (val > 0x10FFFF) { // avoid overflow
475                     throw css::uno::RuntimeException(
476                         (rtl::OUString(
477                             RTL_CONSTASCII_USTRINGPARAM(
478                                 "'&#...' too large in ")) +
479                          fileUrl_),
480                         css::uno::Reference< css::uno::XInterface >());
481                 }
482             }
483         }
484         if (position == p || *position++ != ';') {
485             throw css::uno::RuntimeException(
486                 (rtl::OUString(
487                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
488                  fileUrl_),
489                 css::uno::Reference< css::uno::XInterface >());
490         }
491         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
492         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
493             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
494         {
495             throw css::uno::RuntimeException(
496                 (rtl::OUString(
497                     RTL_CONSTASCII_USTRINGPARAM(
498                         "character reference denoting invalid character in ")) +
499                  fileUrl_),
500                 css::uno::Reference< css::uno::XInterface >());
501         }
502         char buf[4];
503         sal_Int32 len;
504         if (val < 0x80) {
505             buf[0] = static_cast< char >(val);
506             len = 1;
507         } else if (val < 0x800) {
508             buf[0] = static_cast< char >((val >> 6) | 0xC0);
509             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
510             len = 2;
511         } else if (val < 0x10000) {
512             buf[0] = static_cast< char >((val >> 12) | 0xE0);
513             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
514             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
515             len = 3;
516         } else {
517             buf[0] = static_cast< char >((val >> 18) | 0xF0);
518             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
519             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
520             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
521             len = 4;
522         }
523         pad_.addEphemeral(buf, len);
524         return position;
525     } else {
526         struct EntityRef {
527             char const * inBegin;
528             sal_Int32 inLength;
529             char const * outBegin;
530             sal_Int32 outLength;
531         };
532         static EntityRef const refs[] = {
533             { RTL_CONSTASCII_STRINGPARAM("amp;"),
534               RTL_CONSTASCII_STRINGPARAM("&") },
535             { RTL_CONSTASCII_STRINGPARAM("lt;"),
536               RTL_CONSTASCII_STRINGPARAM("<") },
537             { RTL_CONSTASCII_STRINGPARAM("gt;"),
538               RTL_CONSTASCII_STRINGPARAM(">") },
539             { RTL_CONSTASCII_STRINGPARAM("apos;"),
540               RTL_CONSTASCII_STRINGPARAM("'") },
541             { RTL_CONSTASCII_STRINGPARAM("quot;"),
542               RTL_CONSTASCII_STRINGPARAM("\"") } };
543         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
544             if (rtl_str_shortenedCompare_WithLength(
545                     position, end - position, refs[i].inBegin, refs[i].inLength,
546                     refs[i].inLength) ==
547                 0)
548             {
549                 position += refs[i].inLength;
550                 pad_.add(refs[i].outBegin, refs[i].outLength);
551                 return position;
552             }
553         }
554         throw css::uno::RuntimeException(
555             (rtl::OUString(
556                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
557              fileUrl_),
558             css::uno::Reference< css::uno::XInterface >());
559     }
560 }
561 
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)562 Span XmlReader::handleAttributeValue(
563     char const * begin, char const * end, bool fullyNormalize)
564 {
565     pad_.clear();
566     if (fullyNormalize) {
567         while (begin != end && isSpace(*begin)) {
568             ++begin;
569         }
570         while (end != begin && isSpace(end[-1])) {
571             --end;
572         }
573         char const * p = begin;
574         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
575             // a single true space character can go into the current span,
576             // everything else breaks the span
577         Space space = SPACE_NONE;
578         while (p != end) {
579             switch (*p) {
580             case '\x09':
581             case '\x0A':
582             case '\x0D':
583                 switch (space) {
584                 case SPACE_NONE:
585                     pad_.add(begin, p - begin);
586                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
587                     space = SPACE_BREAK;
588                     break;
589                 case SPACE_SPAN:
590                     pad_.add(begin, p - begin);
591                     space = SPACE_BREAK;
592                     break;
593                 case SPACE_BREAK:
594                     break;
595                 }
596                 begin = ++p;
597                 break;
598             case ' ':
599                 switch (space) {
600                 case SPACE_NONE:
601                     ++p;
602                     space = SPACE_SPAN;
603                     break;
604                 case SPACE_SPAN:
605                     pad_.add(begin, p - begin);
606                     begin = ++p;
607                     space = SPACE_BREAK;
608                     break;
609                 case SPACE_BREAK:
610                     begin = ++p;
611                     break;
612                 }
613                 break;
614             case '&':
615                 pad_.add(begin, p - begin);
616                 p = handleReference(p, end);
617                 begin = p;
618                 space = SPACE_NONE;
619                 break;
620             default:
621                 ++p;
622                 space = SPACE_NONE;
623                 break;
624             }
625         }
626         pad_.add(begin, p - begin);
627     } else {
628         char const * p = begin;
629         while (p != end) {
630             switch (*p) {
631             case '\x09':
632             case '\x0A':
633                 pad_.add(begin, p - begin);
634                 begin = ++p;
635                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
636                 break;
637             case '\x0D':
638                 pad_.add(begin, p - begin);
639                 ++p;
640                 if (peek() == '\x0A') {
641                     ++p;
642                 }
643                 begin = p;
644                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
645                 break;
646             case '&':
647                 pad_.add(begin, p - begin);
648                 p = handleReference(p, end);
649                 begin = p;
650                 break;
651             default:
652                 ++p;
653                 break;
654             }
655         }
656         pad_.add(begin, p - begin);
657     }
658     return pad_.get();
659 }
660 
// Parses a start tag or empty-element tag whose leading '<' has already been
// consumed, with pos_ at the first character of the tag name.
//
// Attributes are scanned one by one: a plain "xmlns" attribute sets the
// element's default namespace, an "xmlns:prefix" attribute adds a namespace
// binding, and every other attribute is recorded in attributes_ for later
// retrieval via nextAttribute.  The element is then pushed onto elements_, and
// state_ becomes STATE_EMPTY_ELEMENT_TAG for "<.../>" and STATE_CONTENT
// otherwise.
//
// On return, *nsId holds the element's namespace ID and *localName its local
// name.  Always returns RESULT_BEGIN.
//
// Throws css::uno::RuntimeException on any syntax error in the tag.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    OSL_ASSERT(nsId != 0 && localName);
    char const * nameBegin = pos_;
    char const * nameColon = 0;
    if (!scanName(&nameColon)) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * nameEnd = pos_;
    // Remember how many bindings existed before this element, so that
    // handleElementEnd can drop the ones added here:
    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
    bool hasDefaultNs = false;
    int defaultNsId = NAMESPACE_NONE;
    attributes_.clear();
    for (;;) {
        char const * p = pos_;
        skipSpace();
        if (peek() == '/' || peek() == '>') {
            break;
        }
        // An attribute must be separated from what precedes it by whitespace:
        if (pos_ == p) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "missing whitespace before attribute in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameBegin = pos_;
        char const * attrNameColon = 0;
        if (!scanName(&attrNameColon)) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameEnd = pos_;
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        skipSpace();
        // The value must be quoted with either ' or ":
        char del = read();
        if (del != '\'' && del != '"') {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * valueBegin = pos_;
        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
        if (i < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "unterminated attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        // valueEnd points at the closing delimiter; pos_ continues behind it:
        char const * valueEnd = pos_ + i;
        pos_ += i + 1;
        if (attrNameColon == 0 &&
            Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
                RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns="..." declares the default namespace:
            hasDefaultNs = true;
            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
        } else if (attrNameColon != 0 &&
                   Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
                       RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns:prefix="..." binds a prefix:
            namespaces_.push_back(
                NamespaceData(
                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
                    scanNamespaceIri(valueBegin, valueEnd)));
        } else {
            // Ordinary attribute; the raw value is processed only on demand:
            attributes_.push_back(
                AttributeData(
                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
                    valueEnd));
        }
    }
    // Without an own declaration, the default namespace is inherited from the
    // enclosing element (if any):
    if (!hasDefaultNs && !elements_.empty()) {
        defaultNsId = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() == '/') {
        state_ = STATE_EMPTY_ELEMENT_TAG;
        ++pos_;
    } else {
        state_ = STATE_CONTENT;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    ++pos_;
    elements_.push(
        ElementData(
            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
            defaultNsId));
    // The element name is resolved only now, after all namespace declarations
    // of this tag have been processed:
    if (nameColon == 0) {
        *nsId = defaultNsId;
        *localName = Span(nameBegin, nameEnd - nameBegin);
    } else {
        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
    }
    return RESULT_BEGIN;
}
779 
handleEndTag()780 XmlReader::Result XmlReader::handleEndTag() {
781     if (elements_.empty()) {
782         throw css::uno::RuntimeException(
783             (rtl::OUString(
784                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
785              fileUrl_),
786             css::uno::Reference< css::uno::XInterface >());
787     }
788     char const * nameBegin = pos_;
789     char const * nameColon = 0;
790     if (!scanName(&nameColon) ||
791         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
792     {
793         throw css::uno::RuntimeException(
794             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
795              fileUrl_),
796             css::uno::Reference< css::uno::XInterface >());
797     }
798     handleElementEnd();
799     skipSpace();
800     if (peek() != '>') {
801         throw css::uno::RuntimeException(
802             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
803              fileUrl_),
804             css::uno::Reference< css::uno::XInterface >());
805     }
806     ++pos_;
807     return RESULT_END;
808 }
809 
handleElementEnd()810 void XmlReader::handleElementEnd() {
811     OSL_ASSERT(!elements_.empty());
812     namespaces_.resize(elements_.top().inheritedNamespaces);
813     elements_.pop();
814     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
815 }
816 
handleSkippedText(Span * data,int * nsId)817 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
818     for (;;) {
819         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
820         if (i < 0) {
821             throw css::uno::RuntimeException(
822                 (rtl::OUString(
823                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
824                  fileUrl_),
825                 css::uno::Reference< css::uno::XInterface >());
826         }
827         pos_ += i + 1;
828         switch (peek()) {
829         case '!':
830             ++pos_;
831             if (!skipComment() && !scanCdataSection().is()) {
832                 skipDocumentTypeDeclaration();
833             }
834             break;
835         case '/':
836             ++pos_;
837             return handleEndTag();
838         case '?':
839             ++pos_;
840             skipProcessingInstruction();
841             break;
842         default:
843             return handleStartTag(nsId, data);
844         }
845     }
846 }
847 
handleRawText(Span * text)848 XmlReader::Result XmlReader::handleRawText(Span * text) {
849     pad_.clear();
850     for (char const * begin = pos_;;) {
851         switch (peek()) {
852         case '\0': // i.e., EOF
853             throw css::uno::RuntimeException(
854                 (rtl::OUString(
855                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
856                  fileUrl_),
857                 css::uno::Reference< css::uno::XInterface >());
858         case '\x0D':
859             pad_.add(begin, pos_ - begin);
860             ++pos_;
861             if (peek() != '\x0A') {
862                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
863             }
864             begin = pos_;
865             break;
866         case '&':
867             pad_.add(begin, pos_ - begin);
868             pos_ = handleReference(pos_, end_);
869             begin = pos_;
870             break;
871         case '<':
872             pad_.add(begin, pos_ - begin);
873             ++pos_;
874             switch (peek()) {
875             case '!':
876                 ++pos_;
877                 if (!skipComment()) {
878                     Span cdata(scanCdataSection());
879                     if (cdata.is()) {
880                         normalizeLineEnds(cdata);
881                     } else {
882                         skipDocumentTypeDeclaration();
883                     }
884                 }
885                 begin = pos_;
886                 break;
887             case '/':
888                 *text = pad_.get();
889                 ++pos_;
890                 state_ = STATE_END_TAG;
891                 return RESULT_TEXT;
892             case '?':
893                 ++pos_;
894                 skipProcessingInstruction();
895                 begin = pos_;
896                 break;
897             default:
898                 *text = pad_.get();
899                 state_ = STATE_START_TAG;
900                 return RESULT_TEXT;
901             }
902             break;
903         default:
904             ++pos_;
905             break;
906         }
907     }
908 }
909 
handleNormalizedText(Span * text)910 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
911     pad_.clear();
912     char const * flowBegin = pos_;
913     char const * flowEnd = pos_;
914     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
915         // a single true space character can go into the current flow,
916         // everything else breaks the flow
917     Space space = SPACE_START;
918     for (;;) {
919         switch (peek()) {
920         case '\0': // i.e., EOF
921             throw css::uno::RuntimeException(
922                 (rtl::OUString(
923                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
924                  fileUrl_),
925                 css::uno::Reference< css::uno::XInterface >());
926         case '\x09':
927         case '\x0A':
928         case '\x0D':
929             switch (space) {
930             case SPACE_START:
931             case SPACE_BREAK:
932                 break;
933             case SPACE_NONE:
934             case SPACE_SPAN:
935                 space = SPACE_BREAK;
936                 break;
937             }
938             ++pos_;
939             break;
940         case ' ':
941             switch (space) {
942             case SPACE_START:
943             case SPACE_BREAK:
944                 break;
945             case SPACE_NONE:
946                 space = SPACE_SPAN;
947                 break;
948             case SPACE_SPAN:
949                 space = SPACE_BREAK;
950                 break;
951             }
952             ++pos_;
953             break;
954         case '&':
955             switch (space) {
956             case SPACE_START:
957                 break;
958             case SPACE_NONE:
959             case SPACE_SPAN:
960                 pad_.add(flowBegin, pos_ - flowBegin);
961                 break;
962             case SPACE_BREAK:
963                 pad_.add(flowBegin, flowEnd - flowBegin);
964                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
965                 break;
966             }
967             pos_ = handleReference(pos_, end_);
968             flowBegin = pos_;
969             flowEnd = pos_;
970             space = SPACE_NONE;
971             break;
972         case '<':
973             ++pos_;
974             switch (peek()) {
975             case '!':
976                 ++pos_;
977                 if (skipComment()) {
978                     space = SPACE_BREAK;
979                 } else {
980                     Span cdata(scanCdataSection());
981                     if (cdata.is()) {
982                         // CDATA is not normalized (similar to character
983                         // references; it keeps the code simple), but it might
984                         // arguably be better to normalize it:
985                         switch (space) {
986                         case SPACE_START:
987                             break;
988                         case SPACE_NONE:
989                         case SPACE_SPAN:
990                             pad_.add(flowBegin, pos_ - flowBegin);
991                             break;
992                         case SPACE_BREAK:
993                             pad_.add(flowBegin, flowEnd - flowBegin);
994                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
995                             break;
996                         }
997                         normalizeLineEnds(cdata);
998                         flowBegin = pos_;
999                         flowEnd = pos_;
1000                         space = SPACE_NONE;
1001                     } else {
1002                         skipDocumentTypeDeclaration();
1003                     }
1004                 }
1005                 break;
1006             case '/':
1007                 ++pos_;
1008                 pad_.add(flowBegin, flowEnd - flowBegin);
1009                 *text = pad_.get();
1010                 state_ = STATE_END_TAG;
1011                 return RESULT_TEXT;
1012             case '?':
1013                 ++pos_;
1014                 skipProcessingInstruction();
1015                 space = SPACE_BREAK;
1016                 break;
1017             default:
1018                 pad_.add(flowBegin, flowEnd - flowBegin);
1019                 *text = pad_.get();
1020                 state_ = STATE_START_TAG;
1021                 return RESULT_TEXT;
1022             }
1023             break;
1024         default:
1025             switch (space) {
1026             case SPACE_START:
1027                 flowBegin = pos_;
1028                 break;
1029             case SPACE_NONE:
1030             case SPACE_SPAN:
1031                 break;
1032             case SPACE_BREAK:
1033                 pad_.add(flowBegin, flowEnd - flowBegin);
1034                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1035                 flowBegin = pos_;
1036                 break;
1037             }
1038             flowEnd = ++pos_;
1039             space = SPACE_NONE;
1040             break;
1041         }
1042     }
1043 }
1044 
toNamespaceId(NamespaceIris::size_type pos)1045 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1046     OSL_ASSERT(pos <= INT_MAX);
1047     return static_cast< int >(pos);
1048 }
1049 
1050 }
1051