1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "oox/vml/vmlinputstream.hxx" 25 26 #include <com/sun/star/io/XTextInputStream.hpp> 27 #include <map> 28 #include <string.h> 29 #include <rtl/strbuf.hxx> 30 #include "oox/helper/helper.hxx" 31 #include "oox/helper/textinputstream.hxx" 32 33 namespace oox { 34 namespace vml { 35 36 // ============================================================================ 37 38 using namespace ::com::sun::star::io; 39 using namespace ::com::sun::star::uno; 40 41 using ::rtl::OString; 42 using ::rtl::OStringBuffer; 43 44 // ============================================================================ 45 46 namespace { 47 48 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar ) 49 { 50 sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar ); 51 return (nIndex < 0) ? pcEnd : (pcBeg + nIndex); 52 } 53 54 inline bool lclIsWhiteSpace( sal_Char cChar ) 55 { 56 return cChar < 32; 57 } 58 59 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd ) 60 { 61 for( ; pcBeg < pcEnd; ++pcBeg ) 62 if( lclIsWhiteSpace( *pcBeg ) ) 63 return pcBeg; 64 return pcEnd; 65 } 66 67 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd ) 68 { 69 for( ; pcBeg < pcEnd; ++pcBeg ) 70 if( !lclIsWhiteSpace( *pcBeg ) ) 71 return pcBeg; 72 return pcEnd; 73 } 74 75 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd ) 76 { 77 while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) ) 78 --pcEnd; 79 return pcEnd; 80 } 81 82 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd ) 83 { 84 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) ); 85 } 86 87 // ---------------------------------------------------------------------------- 88 89 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd ) 90 { 91 /* Map attribute names to char-pointer of all attributes. This map is used 92 to find multiple occurences of attributes with the same name. The 93 mapped pointers are used as map key in the next map below. */ 94 typedef ::std::map< OString, const sal_Char* > AttributeNameMap; 95 AttributeNameMap aAttributeNames; 96 97 /* Map the char-pointers of all attributes to the full attribute definition 98 string. This preserves the original order of the used attributes. */ 99 typedef ::std::map< const sal_Char*, OString > AttributeDataMap; 100 AttributeDataMap aAttributes; 101 102 bool bOk = true; 103 const sal_Char* pcNameBeg = pcBeg; 104 while( bOk && (pcNameBeg < pcEnd) ) 105 { 106 // pcNameBeg points to begin of attribute name, find equality sign 107 const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' ); 108 if( (bOk = pcEqualSign < pcEnd) == true ) 109 { 110 // find end of attribute name (ignore whitespace between name and equality sign) 111 const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign ); 112 if( (bOk = pcNameBeg < pcNameEnd) == true ) 113 { 114 // find begin of attribute value (must be single or double quote) 115 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd ); 116 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true ) 117 { 118 // find end of attribute value (matching quote character) 119 const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg ); 120 if( (bOk = pcValueEnd < pcEnd) == true ) 121 { 122 ++pcValueEnd; 123 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) ); 124 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) ); 125 // search for an existing attribute with the same name 126 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName ); 127 // remove its definition from the data map 128 if( aIt != aAttributeNames.end() ) 129 aAttributes.erase( aIt->second ); 130 // insert the attribute into both maps 131 aAttributeNames[ aAttribName ] = pcNameBeg; 132 aAttributes[ pcNameBeg ] = aAttribData; 133 // continue with next attribute (skip whitespace after this attribute) 134 pcNameBeg = pcValueEnd; 135 if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) ) 136 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd ); 137 } 138 } 139 } 140 } 141 } 142 143 // if no error has occured, build the resulting attribute list 144 if( bOk ) 145 for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt ) 146 rBuffer.append( ' ' ).append( aIt->second ); 147 // on error, just append the complete passed string 148 else 149 lclAppendToBuffer( rBuffer, pcBeg, pcEnd ); 150 } 151 152 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement ) 153 { 154 // check that passed string starts and ends with the brackets of an XML element 155 sal_Int32 nElementLen = rElement.getLength(); 156 if( nElementLen == 0 ) 157 return; 158 159 const sal_Char* pcOpen = rElement.getStr(); 160 const sal_Char* pcClose = pcOpen + nElementLen - 1; 161 162 // no complete element found 163 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') ) 164 { 165 // just append all passed characters 166 rBuffer.append( rElement ); 167 } 168 169 // skip parser instructions: '<![...]>' 170 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') ) 171 { 172 // do nothing 173 } 174 175 // replace '<br>' element with newline 176 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) ) 177 { 178 rBuffer.append( '\n' ); 179 } 180 181 // check start elements and simple elements for repeated attributes 182 else if( pcOpen[ 1 ] != '/' ) 183 { 184 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>' 185 const sal_Char* pcContentBeg = pcOpen + 1; 186 bool bIsEmptyElement = pcClose[ -1 ] == '/'; 187 const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose; 188 // append opening bracket and element name to buffer 189 const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd ); 190 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace ); 191 // find begin of attributes, and process all attributes 192 const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd ); 193 if( pcAttribBeg < pcContentEnd ) 194 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd ); 195 // close the element 196 if( bIsEmptyElement ) 197 rBuffer.append( '/' ); 198 rBuffer.append( '>' ); 199 } 200 201 // append end elements without further processing 202 else 203 { 204 rBuffer.append( rElement ); 205 } 206 } 207 208 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars ) 209 { 210 /* MSO has a very weird way to store and handle whitespaces. The stream 211 may contain lots of spaces, tabs, and newlines which have to be handled 212 as single space character. This will be done in this function. 213 214 If the element text contains a literal line break, it will be stored as 215 <br> tag (without matching </br> element). This input stream wrapper 216 will replace this element with a literal LF character (see below). 217 218 A single space character for its own is stored as is. Example: The 219 element 220 <font> </font> 221 represents a single space character. The XML parser will ignore this 222 space character completely without issuing a 'characters' event. The 223 VML import filter implementation has to react on this case manually. 224 225 A single space character following another character is stored 226 literally and must not be stipped away here. Example: The element 227 <font>abc </font> 228 contains the three letters a, b, and c, followed by a space character. 229 230 Consecutive space characters, or a leading single space character, are 231 stored in a <span> element. If there are N space characters (N > 1), 232 then the <span> element contains exactly (N-1) NBSP (non-breaking 233 space) characters, followed by a regular space character. Examples: 234 The element 235 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font> 236 represents 4 consecutive space characters. Has to be handled by the 237 implementation. The element 238 <font><span style='mso-spacerun:yes'> abc</span></font> 239 represents a space characters followed by the letters a, b, c. These 240 strings have to be handled by the VML import filter implementation. 241 */ 242 243 // passed string ends with the leading opening bracket of an XML element 244 const sal_Char* pcBeg = rChars.getStr(); 245 const sal_Char* pcEnd = pcBeg + rChars.getLength(); 246 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<'); 247 if( bHasBracket ) --pcEnd; 248 249 // skip leading whitespace 250 const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd ); 251 while( pcContentsBeg < pcEnd ) 252 { 253 const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd ); 254 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg ); 255 if( pcWhitespaceBeg < pcEnd ) 256 rBuffer.append( ' ' ); 257 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd ); 258 } 259 260 return bHasBracket; 261 } 262 263 } // namespace 264 265 // ============================================================================ 266 267 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) : 268 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters 269 mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ), 270 maOpeningBracket( 1 ), 271 maClosingBracket( 1 ), 272 maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ), 273 maClosingCData( CREATE_OSTRING( "]]>" ) ), 274 mnBufferPos( 0 ) 275 { 276 maOpeningBracket[ 0 ] = '<'; 277 maClosingBracket[ 0 ] = '>'; 278 } 279 280 InputStream::~InputStream() 281 { 282 } 283 284 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead ) 285 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 286 { 287 if( nBytesToRead < 0 ) 288 throw IOException(); 289 290 rData.realloc( nBytesToRead ); 291 sal_Int8* pcDest = rData.getArray(); 292 sal_Int32 nRet = 0; 293 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() ) 294 { 295 updateBuffer(); 296 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos ); 297 if( nReadSize > 0 ) 298 { 299 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) ); 300 mnBufferPos += nReadSize; 301 nBytesToRead -= nReadSize; 302 nRet += nReadSize; 303 } 304 } 305 if( nRet < rData.getLength() ) 306 rData.realloc( nRet ); 307 return nRet; 308 } 309 310 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead ) 311 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 312 { 313 return readBytes( rData, nMaxBytesToRead ); 314 } 315 316 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip ) 317 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 318 { 319 if( nBytesToSkip < 0 ) 320 throw IOException(); 321 322 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() ) 323 { 324 updateBuffer(); 325 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos ); 326 mnBufferPos += nSkipSize; 327 nBytesToSkip -= nSkipSize; 328 } 329 } 330 331 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException) 332 { 333 updateBuffer(); 334 return maBuffer.getLength() - mnBufferPos; 335 } 336 337 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException) 338 { 339 mxTextStrm->closeInput(); 340 } 341 342 // private -------------------------------------------------------------------- 343 344 void InputStream::updateBuffer() throw (IOException, RuntimeException) 345 { 346 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() ) 347 { 348 // collect new contents in a string buffer 349 OStringBuffer aBuffer; 350 351 // read and process characters until the opening bracket of the next XML element 352 OString aChars = readToElementBegin(); 353 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars ); 354 355 // read and process characters until (and including) closing bracket (an XML element) 356 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" ); 357 if( bHasOpeningBracket && !mxTextStrm->isEOF() ) 358 { 359 // read the element text (add the leading opening bracket manually) 360 OString aElement = OString( '<' ) + readToElementEnd(); 361 // check for CDATA part, starting with '<![CDATA[' 362 if( aElement.match( maOpeningCData ) ) 363 { 364 // search the end tag ']]>' 365 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() ) 366 aElement += readToElementEnd(); 367 // copy the entire CDATA part 368 aBuffer.append( aElement ); 369 } 370 else 371 { 372 // no CDATA part - process the contents of the element 373 lclProcessElement( aBuffer, aElement ); 374 } 375 } 376 377 maBuffer = aBuffer.makeStringAndClear(); 378 mnBufferPos = 0; 379 } 380 } 381 382 OString InputStream::readToElementBegin() throw (IOException, RuntimeException) 383 { 384 return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 ); 385 } 386 387 OString InputStream::readToElementEnd() throw (IOException, RuntimeException) 388 { 389 OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 ); 390 OSL_ENSURE( (aText.getLength() > 0) && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" ); 391 return aText; 392 } 393 394 // ============================================================================ 395 396 } // namespace vml 397 } // namespave oox 398