1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include "oox/vml/vmlinputstream.hxx"
25
26 #include <com/sun/star/io/XTextInputStream.hpp>
27 #include <map>
28 #include <string.h>
29 #include <rtl/strbuf.hxx>
30 #include "oox/helper/helper.hxx"
31 #include "oox/helper/textinputstream.hxx"
32
33 namespace oox {
34 namespace vml {
35
36 // ============================================================================
37
38 using namespace ::com::sun::star::io;
39 using namespace ::com::sun::star::uno;
40
41 using ::rtl::OString;
42 using ::rtl::OStringBuffer;
43
44 // ============================================================================
45
46 namespace {
47
/** Searches the character range [pcBeg, pcEnd) for the first occurrence of
    the character cChar.

    @return  Pointer to the found character, or pcEnd if the search function
        reports a negative index.
 */
inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
{
    const sal_Int32 nLength = static_cast< sal_Int32 >( pcEnd - pcBeg );
    const sal_Int32 nFound = rtl_str_indexOfChar_WithLength( pcBeg, nLength, cChar );
    if( nFound < 0 )
        return pcEnd;
    return pcBeg + nFound;
}
53
lclIsWhiteSpace(sal_Char cChar)54 inline bool lclIsWhiteSpace( sal_Char cChar )
55 {
56 return cChar < 32;
57 }
58
/** Returns a pointer to the first character in [pcBeg, pcEnd) for which
    lclIsWhiteSpace() returns true, or pcEnd if there is no such character. */
const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcPos = pcBeg;
    while( (pcPos < pcEnd) && !lclIsWhiteSpace( *pcPos ) )
        ++pcPos;
    // always report pcEnd if nothing has been found inside the range
    return (pcPos < pcEnd) ? pcPos : pcEnd;
}
66
/** Returns a pointer to the first character in [pcBeg, pcEnd) for which
    lclIsWhiteSpace() returns false, or pcEnd if there is no such character. */
const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcPos = pcBeg;
    while( (pcPos < pcEnd) && lclIsWhiteSpace( *pcPos ) )
        ++pcPos;
    // always report pcEnd if nothing has been found inside the range
    return (pcPos < pcEnd) ? pcPos : pcEnd;
}
74
/** Strips trailing whitespace (according to lclIsWhiteSpace()) from the
    character range [pcBeg, pcEnd).

    @return  The new end position of the range; pcBeg is never passed.
 */
const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcTrimmedEnd = pcEnd;
    for( ; pcBeg < pcTrimmedEnd; --pcTrimmedEnd )
        if( !lclIsWhiteSpace( *(pcTrimmedEnd - 1) ) )
            break;
    return pcTrimmedEnd;
}
81
lclAppendToBuffer(OStringBuffer & rBuffer,const sal_Char * pcBeg,const sal_Char * pcEnd)82 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
83 {
84 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
85 }
86
87 // ----------------------------------------------------------------------------
88
lclProcessAttribs(OStringBuffer & rBuffer,const sal_Char * pcBeg,const sal_Char * pcEnd)89 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
90 {
91 /* Map attribute names to char-pointer of all attributes. This map is used
92 to find multiple occurrences of attributes with the same name. The
93 mapped pointers are used as map key in the next map below. */
94 typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
95 AttributeNameMap aAttributeNames;
96
97 /* Map the char-pointers of all attributes to the full attribute definition
98 string. This preserves the original order of the used attributes. */
99 typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
100 AttributeDataMap aAttributes;
101
102 bool bOk = true;
103 const sal_Char* pcNameBeg = pcBeg;
104 while( bOk && (pcNameBeg < pcEnd) )
105 {
106 // pcNameBeg points to begin of attribute name, find equality sign
107 const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
108 if( (bOk = pcEqualSign < pcEnd) == true )
109 {
110 // find end of attribute name (ignore whitespace between name and equality sign)
111 const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
112 if( (bOk = pcNameBeg < pcNameEnd) == true )
113 {
114 // find begin of attribute value (must be single or double quote)
115 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
116 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
117 {
118 // find end of attribute value (matching quote character)
119 const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
120 if( (bOk = pcValueEnd < pcEnd) == true )
121 {
122 ++pcValueEnd;
123 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
124 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
125 // search for an existing attribute with the same name
126 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
127 // remove its definition from the data map
128 if( aIt != aAttributeNames.end() )
129 aAttributes.erase( aIt->second );
130 // insert the attribute into both maps
131 aAttributeNames[ aAttribName ] = pcNameBeg;
132 aAttributes[ pcNameBeg ] = aAttribData;
133 // continue with next attribute (skip whitespace after this attribute)
134 pcNameBeg = pcValueEnd;
135 if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
136 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
137 }
138 }
139 }
140 }
141 }
142
143 // if no error has occurred, build the resulting attribute list
144 if( bOk )
145 for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
146 rBuffer.append( ' ' ).append( aIt->second );
147 // on error, just append the complete passed string
148 else
149 lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
150 }
151
lclProcessElement(OStringBuffer & rBuffer,const OString & rElement)152 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
153 {
154 // check that passed string starts and ends with the brackets of an XML element
155 sal_Int32 nElementLen = rElement.getLength();
156 if( nElementLen == 0 )
157 return;
158
159 const sal_Char* pcOpen = rElement.getStr();
160 const sal_Char* pcClose = pcOpen + nElementLen - 1;
161
162 // no complete element found
163 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
164 {
165 // just append all passed characters
166 rBuffer.append( rElement );
167 }
168
169 // skip parser instructions: '<![...]>'
170 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
171 {
172 // do nothing
173 }
174
175 // replace '<br>' element with newline
176 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
177 {
178 rBuffer.append( '\n' );
179 }
180
181 // check start elements and simple elements for repeated attributes
182 else if( pcOpen[ 1 ] != '/' )
183 {
184 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
185 const sal_Char* pcContentBeg = pcOpen + 1;
186 bool bIsEmptyElement = pcClose[ -1 ] == '/';
187 const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
188 // append opening bracket and element name to buffer
189 const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
190 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
191 // find begin of attributes, and process all attributes
192 const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
193 if( pcAttribBeg < pcContentEnd )
194 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
195 // close the element
196 if( bIsEmptyElement )
197 rBuffer.append( '/' );
198 rBuffer.append( '>' );
199 }
200
201 // append end elements without further processing
202 else
203 {
204 rBuffer.append( rElement );
205 }
206 }
207
lclProcessCharacters(OStringBuffer & rBuffer,const OString & rChars)208 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
209 {
210 /* MSO has a very weird way to store and handle whitespaces. The stream
211 may contain lots of spaces, tabs, and newlines which have to be handled
212 as single space character. This will be done in this function.
213
214 If the element text contains a literal line break, it will be stored as
215 <br> tag (without matching </br> element). This input stream wrapper
216 will replace this element with a literal LF character (see below).
217
218 A single space character for its own is stored as is. Example: The
219 element
220 <font> </font>
221 represents a single space character. The XML parser will ignore this
222 space character completely without issuing a 'characters' event. The
223 VML import filter implementation has to react on this case manually.
224
225 A single space character following another character is stored
226 literally and must not be stipped away here. Example: The element
227 <font>abc </font>
228 contains the three letters a, b, and c, followed by a space character.
229
230 Consecutive space characters, or a leading single space character, are
231 stored in a <span> element. If there are N space characters (N > 1),
232 then the <span> element contains exactly (N-1) NBSP (non-breaking
233 space) characters, followed by a regular space character. Examples:
234 The element
235 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
236 represents 4 consecutive space characters. Has to be handled by the
237 implementation. The element
238 <font><span style='mso-spacerun:yes'> abc</span></font>
239 represents a space characters followed by the letters a, b, c. These
240 strings have to be handled by the VML import filter implementation.
241 */
242
243 // passed string ends with the leading opening bracket of an XML element
244 const sal_Char* pcBeg = rChars.getStr();
245 const sal_Char* pcEnd = pcBeg + rChars.getLength();
246 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
247 if( bHasBracket ) --pcEnd;
248
249 // skip leading whitespace
250 const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
251 while( pcContentsBeg < pcEnd )
252 {
253 const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
254 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
255 if( pcWhitespaceBeg < pcEnd )
256 rBuffer.append( ' ' );
257 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
258 }
259
260 return bHasBracket;
261 }
262
263 } // namespace
264
265 // ============================================================================
266
/** Creates the text input stream wrapping the passed binary input stream, and
    initializes the delimiters and CDATA markers used while scanning the
    stream.

    @param rxContext  The component context passed to the text input stream
        factory function.
    @param rxInStrm  The binary input stream to be wrapped.
 */
InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
    // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
    mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
    maOpeningBracket( 1 ),
    maClosingBracket( 1 ),
    maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
    maClosingCData( CREATE_OSTRING( "]]>" ) ),
    mnBufferPos( 0 )
{
    // one-element delimiter sequences passed to readString() of the text stream
    maOpeningBracket[ 0 ] = '<';
    maClosingBracket[ 0 ] = '>';
}
279
/** Destructor; performs no explicit cleanup. */
InputStream::~InputStream()
{
}
283
readBytes(Sequence<sal_Int8> & rData,sal_Int32 nBytesToRead)284 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
285 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
286 {
287 if( nBytesToRead < 0 )
288 throw IOException();
289
290 rData.realloc( nBytesToRead );
291 sal_Int8* pcDest = rData.getArray();
292 sal_Int32 nRet = 0;
293 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
294 {
295 updateBuffer();
296 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
297 if( nReadSize > 0 )
298 {
299 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
300 mnBufferPos += nReadSize;
301 nBytesToRead -= nReadSize;
302 nRet += nReadSize;
303 }
304 }
305 if( nRet < rData.getLength() )
306 rData.realloc( nRet );
307 return nRet;
308 }
309
/** Reads up to nMaxBytesToRead bytes into rData.

    Simply forwards to readBytes() and returns its result.
 */
sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
        throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
{
    return readBytes( rData, nMaxBytesToRead );
}
315
skipBytes(sal_Int32 nBytesToSkip)316 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
317 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
318 {
319 if( nBytesToSkip < 0 )
320 throw IOException();
321
322 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
323 {
324 updateBuffer();
325 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
326 mnBufferPos += nSkipSize;
327 nBytesToSkip -= nSkipSize;
328 }
329 }
330
available()331 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException)
332 {
333 updateBuffer();
334 return maBuffer.getLength() - mnBufferPos;
335 }
336
/** Closes the wrapped text input stream. */
void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException)
{
    mxTextStrm->closeInput();
}
341
342 // private --------------------------------------------------------------------
343
updateBuffer()344 void InputStream::updateBuffer() throw (IOException, RuntimeException)
345 {
346 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
347 {
348 // collect new contents in a string buffer
349 OStringBuffer aBuffer;
350
351 // read and process characters until the opening bracket of the next XML element
352 OString aChars = readToElementBegin();
353 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
354
355 // read and process characters until (and including) closing bracket (an XML element)
356 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
357 if( bHasOpeningBracket && !mxTextStrm->isEOF() )
358 {
359 // read the element text (add the leading opening bracket manually)
360 OString aElement = OString( '<' ) + readToElementEnd();
361 // check for CDATA part, starting with '<![CDATA['
362 if( aElement.match( maOpeningCData ) )
363 {
364 // search the end tag ']]>'
365 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() )
366 aElement += readToElementEnd();
367 // copy the entire CDATA part
368 aBuffer.append( aElement );
369 }
370 else
371 {
372 // no CDATA part - process the contents of the element
373 lclProcessElement( aBuffer, aElement );
374 }
375 }
376
377 maBuffer = aBuffer.makeStringAndClear();
378 mnBufferPos = 0;
379 }
380 }
381
/** Reads text from the text stream using the opening bracket '<' as
    delimiter, and returns it converted to an ISO-8859-1 byte string.

    The caller expects the returned text to end with the delimiter, if one has
    been found (see lclProcessCharacters()).
 */
OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
{
    return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
}
386
/** Reads text from the text stream using the closing bracket '>' as
    delimiter, and returns it converted to an ISO-8859-1 byte string.

    The returned text is expected to end with the closing bracket. A violation
    is reported with an assertion only, the text is returned regardless.
 */
OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
{
    OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
    OSL_ENSURE( (aText.getLength() > 0) && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" );
    return aText;
}
393
394 // ============================================================================
395
396 } // namespace vml
} // namespace oox
398