1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #ifndef INCLUDED_XMLREADER_XMLREADER_HXX 29 #define INCLUDED_XMLREADER_XMLREADER_HXX 30 31 #include "sal/config.h" 32 33 #include <stack> 34 #include <vector> 35 36 #include "boost/noncopyable.hpp" 37 #include "com/sun/star/container/NoSuchElementException.hpp" 38 #include "com/sun/star/uno/RuntimeException.hpp" 39 #include "osl/file.h" 40 #include "rtl/ustring.hxx" 41 #include "sal/types.h" 42 #include "xmlreader/detail/xmlreaderdllapi.hxx" 43 #include "xmlreader/pad.hxx" 44 #include "xmlreader/span.hxx" 45 46 namespace xmlreader { 47 48 class OOO_DLLPUBLIC_XMLREADER XmlReader: private boost::noncopyable { 49 public: 50 explicit XmlReader(rtl::OUString const & fileUrl) 51 SAL_THROW(( 52 com::sun::star::container::NoSuchElementException, 53 com::sun::star::uno::RuntimeException)); 54 55 ~XmlReader(); 56 57 enum { NAMESPACE_NONE = -2, NAMESPACE_UNKNOWN = -1, NAMESPACE_XML = 0 }; 58 59 enum Text { TEXT_NONE, TEXT_RAW, TEXT_NORMALIZED }; 60 61 enum Result { RESULT_BEGIN, RESULT_END, RESULT_TEXT, RESULT_DONE }; 62 63 int registerNamespaceIri(Span const & iri); 64 65 // RESULT_BEGIN: data = localName, ns = ns 66 // RESULT_END: data, ns unused 67 // RESULT_TEXT: data = text, ns unused 68 Result nextItem(Text reportText, Span * data, int * nsId); 69 70 bool nextAttribute(int * nsId, Span * localName); 71 72 // the span returned by getAttributeValue is only valid until the next call 73 // to nextItem or getAttributeValue 74 Span getAttributeValue(bool fullyNormalize); 75 76 int getNamespaceId(Span const & prefix) const; 77 78 rtl::OUString getUrl() const; 79 80 private: 81 typedef std::vector< Span > NamespaceIris; 82 83 // If NamespaceData (and similarly ElementData and AttributeData) is made 84 // SAL_DLLPRIVATE, at least gcc 4.2.3 erroneously warns about 85 // "'xmlreader::XmlReader' declared with greater visibility than the type of 86 // its field 'xmlreader::XmlReader::namespaces_'" (and similarly for 87 // elements_ and attributes_): 88 89 struct NamespaceData { 90 Span prefix; 91 int nsId; 92 93 NamespaceData() {} 94 95 NamespaceData(Span const & thePrefix, int theNsId): 96 prefix(thePrefix), nsId(theNsId) {} 97 }; 98 99 typedef std::vector< NamespaceData > NamespaceList; 100 101 struct ElementData { 102 Span name; 103 NamespaceList::size_type inheritedNamespaces; 104 int defaultNamespaceId; 105 106 ElementData( 107 Span const & theName, 108 NamespaceList::size_type theInheritedNamespaces, 109 int theDefaultNamespaceId): 110 name(theName), inheritedNamespaces(theInheritedNamespaces), 111 defaultNamespaceId(theDefaultNamespaceId) 112 {} 113 }; 114 115 typedef std::stack< ElementData > ElementStack; 116 117 struct AttributeData { 118 char const * nameBegin; 119 char const * nameEnd; 120 char const * nameColon; 121 char const * valueBegin; 122 char const * valueEnd; 123 124 AttributeData( 125 char const * theNameBegin, char const * theNameEnd, 126 char const * theNameColon, char const * theValueBegin, 127 char const * theValueEnd): 128 nameBegin(theNameBegin), nameEnd(theNameEnd), 129 nameColon(theNameColon), valueBegin(theValueBegin), 130 valueEnd(theValueEnd) 131 {} 132 }; 133 134 typedef std::vector< AttributeData > Attributes; 135 136 enum State { 137 STATE_CONTENT, STATE_START_TAG, STATE_END_TAG, STATE_EMPTY_ELEMENT_TAG, 138 STATE_DONE }; 139 140 SAL_DLLPRIVATE inline char read() { return pos_ == end_ ? '\0' : *pos_++; } 141 142 SAL_DLLPRIVATE inline char peek() { return pos_ == end_ ? '\0' : *pos_; } 143 144 SAL_DLLPRIVATE void normalizeLineEnds(Span const & text); 145 146 SAL_DLLPRIVATE void skipSpace(); 147 148 SAL_DLLPRIVATE bool skipComment(); 149 150 SAL_DLLPRIVATE void skipProcessingInstruction(); 151 152 SAL_DLLPRIVATE void skipDocumentTypeDeclaration(); 153 154 SAL_DLLPRIVATE Span scanCdataSection(); 155 156 SAL_DLLPRIVATE bool scanName(char const ** nameColon); 157 158 SAL_DLLPRIVATE int scanNamespaceIri( 159 char const * begin, char const * end); 160 161 SAL_DLLPRIVATE char const * handleReference( 162 char const * position, char const * end); 163 164 SAL_DLLPRIVATE Span handleAttributeValue( 165 char const * begin, char const * end, bool fullyNormalize); 166 167 SAL_DLLPRIVATE Result handleStartTag(int * nsId, Span * localName); 168 169 SAL_DLLPRIVATE Result handleEndTag(); 170 171 SAL_DLLPRIVATE void handleElementEnd(); 172 173 SAL_DLLPRIVATE Result handleSkippedText(Span * data, int * nsId); 174 175 SAL_DLLPRIVATE Result handleRawText(Span * text); 176 177 SAL_DLLPRIVATE Result handleNormalizedText(Span * text); 178 179 SAL_DLLPRIVATE int toNamespaceId(NamespaceIris::size_type pos); 180 181 rtl::OUString fileUrl_; 182 oslFileHandle fileHandle_; 183 sal_uInt64 fileSize_; 184 void * fileAddress_; 185 NamespaceIris namespaceIris_; 186 NamespaceList namespaces_; 187 ElementStack elements_; 188 char const * pos_; 189 char const * end_; 190 State state_; 191 Attributes attributes_; 192 Attributes::iterator currentAttribute_; 193 bool firstAttribute_; 194 Pad pad_; 195 }; 196 197 } 198 199 #endif 200