xref: /trunk/main/oox/source/vml/vmlinputstream.cxx (revision a893be29)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "oox/vml/vmlinputstream.hxx"
25 
26 #include <com/sun/star/io/XTextInputStream.hpp>
27 #include <map>
28 #include <string.h>
29 #include <rtl/strbuf.hxx>
30 #include "oox/helper/helper.hxx"
31 #include "oox/helper/textinputstream.hxx"
32 
33 namespace oox {
34 namespace vml {
35 
36 // ============================================================================
37 
38 using namespace ::com::sun::star::io;
39 using namespace ::com::sun::star::uno;
40 
41 using ::rtl::OString;
42 using ::rtl::OStringBuffer;
43 
44 // ============================================================================
45 
46 namespace {
47 
/** Searches the character range [pcBeg, pcEnd) for the first occurrence of cChar.

    @return  Pointer to the first matching character, or pcEnd if
        rtl_str_indexOfChar_WithLength() reports no match (negative index).
 */
inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
{
    const sal_Int32 nLength = static_cast< sal_Int32 >( pcEnd - pcBeg );
    const sal_Int32 nFoundPos = rtl_str_indexOfChar_WithLength( pcBeg, nLength, cChar );
    if( nFoundPos >= 0 )
        return pcBeg + nFoundPos;
    return pcEnd;
}
53 
lclIsWhiteSpace(sal_Char cChar)54 inline bool lclIsWhiteSpace( sal_Char cChar )
55 {
56     return cChar < 32;
57 }
58 
/** Returns a pointer to the first character in [pcBeg, pcEnd) for which
    lclIsWhiteSpace() is true, or pcEnd if there is no such character. */
const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcCurr = pcBeg;
    while( (pcCurr < pcEnd) && !lclIsWhiteSpace( *pcCurr ) )
        ++pcCurr;
    // an empty or inverted range always results in pcEnd
    return (pcCurr < pcEnd) ? pcCurr : pcEnd;
}
66 
/** Returns a pointer to the first character in [pcBeg, pcEnd) for which
    lclIsWhiteSpace() is false, or pcEnd if the range consists of
    whitespace only. */
const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcCurr = pcBeg;
    while( (pcCurr < pcEnd) && lclIsWhiteSpace( *pcCurr ) )
        ++pcCurr;
    // an empty or inverted range always results in pcEnd
    return (pcCurr < pcEnd) ? pcCurr : pcEnd;
}
74 
/** Moves the end of the range [pcBeg, pcEnd) towards pcBeg as long as the
    last character is whitespace according to lclIsWhiteSpace().

    @return  The new end pointer; equals pcBeg if the entire range is
        whitespace.
 */
const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
{
    const sal_Char* pcTrimmedEnd = pcEnd;
    for( ; pcBeg < pcTrimmedEnd; --pcTrimmedEnd )
        if( !lclIsWhiteSpace( *(pcTrimmedEnd - 1) ) )
            break;
    return pcTrimmedEnd;
}
81 
lclAppendToBuffer(OStringBuffer & rBuffer,const sal_Char * pcBeg,const sal_Char * pcEnd)82 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
83 {
84     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
85 }
86 
87 // ----------------------------------------------------------------------------
88 
lclProcessAttribs(OStringBuffer & rBuffer,const sal_Char * pcBeg,const sal_Char * pcEnd)89 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
90 {
91     /*  Map attribute names to char-pointer of all attributes. This map is used
92         to find multiple occurrences of attributes with the same name. The
93         mapped pointers are used as map key in the next map below. */
94     typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
95     AttributeNameMap aAttributeNames;
96 
97     /*  Map the char-pointers of all attributes to the full attribute definition
98         string. This preserves the original order of the used attributes. */
99     typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
100     AttributeDataMap aAttributes;
101 
102     bool bOk = true;
103     const sal_Char* pcNameBeg = pcBeg;
104     while( bOk && (pcNameBeg < pcEnd) )
105     {
106         // pcNameBeg points to begin of attribute name, find equality sign
107         const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
108         if( (bOk = pcEqualSign < pcEnd) == true )
109         {
110             // find end of attribute name (ignore whitespace between name and equality sign)
111             const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
112             if( (bOk = pcNameBeg < pcNameEnd) == true )
113             {
114                 // find begin of attribute value (must be single or double quote)
115                 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
116                 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
117                 {
118                     // find end of attribute value (matching quote character)
119                     const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
120                     if( (bOk = pcValueEnd < pcEnd) == true )
121                     {
122                         ++pcValueEnd;
123                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
124                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
125                         // search for an existing attribute with the same name
126                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
127                         // remove its definition from the data map
128                         if( aIt != aAttributeNames.end() )
129                             aAttributes.erase( aIt->second );
130                         // insert the attribute into both maps
131                         aAttributeNames[ aAttribName ] = pcNameBeg;
132                         aAttributes[ pcNameBeg ] = aAttribData;
133                         // continue with next attribute (skip whitespace after this attribute)
134                         pcNameBeg = pcValueEnd;
135                         if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
136                             pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
137                     }
138                 }
139             }
140         }
141     }
142 
143     // if no error has occurred, build the resulting attribute list
144     if( bOk )
145         for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
146             rBuffer.append( ' ' ).append( aIt->second );
147     // on error, just append the complete passed string
148     else
149         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
150 }
151 
lclProcessElement(OStringBuffer & rBuffer,const OString & rElement)152 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
153 {
154     // check that passed string starts and ends with the brackets of an XML element
155     sal_Int32 nElementLen = rElement.getLength();
156     if( nElementLen == 0 )
157         return;
158 
159     const sal_Char* pcOpen = rElement.getStr();
160     const sal_Char* pcClose = pcOpen + nElementLen - 1;
161 
162     // no complete element found
163     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
164     {
165         // just append all passed characters
166         rBuffer.append( rElement );
167     }
168 
169     // skip parser instructions: '<![...]>'
170     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
171     {
172         // do nothing
173     }
174 
175     // replace '<br>' element with newline
176     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
177     {
178         rBuffer.append( '\n' );
179     }
180 
181     // check start elements and simple elements for repeated attributes
182     else if( pcOpen[ 1 ] != '/' )
183     {
184         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
185         const sal_Char* pcContentBeg = pcOpen + 1;
186         bool bIsEmptyElement = pcClose[ -1 ] == '/';
187         const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
188         // append opening bracket and element name to buffer
189         const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
190         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
191         // find begin of attributes, and process all attributes
192         const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
193         if( pcAttribBeg < pcContentEnd )
194             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
195         // close the element
196         if( bIsEmptyElement )
197             rBuffer.append( '/' );
198         rBuffer.append( '>' );
199     }
200 
201     // append end elements without further processing
202     else
203     {
204         rBuffer.append( rElement );
205     }
206 }
207 
lclProcessCharacters(OStringBuffer & rBuffer,const OString & rChars)208 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
209 {
210     /*  MSO has a very weird way to store and handle whitespaces. The stream
211         may contain lots of spaces, tabs, and newlines which have to be handled
212         as single space character. This will be done in this function.
213 
214         If the element text contains a literal line break, it will be stored as
215         <br> tag (without matching </br> element). This input stream wrapper
216         will replace this element with a literal LF character (see below).
217 
218         A single space character for its own is stored as is. Example: The
219         element
220             <font> </font>
221         represents a single space character. The XML parser will ignore this
222         space character completely without issuing a 'characters' event. The
223         VML import filter implementation has to react on this case manually.
224 
225         A single space character following another character is stored
226         literally and must not be stipped away here. Example: The element
227             <font>abc </font>
228         contains the three letters a, b, and c, followed by a space character.
229 
230         Consecutive space characters, or a leading single space character, are
231         stored in a <span> element. If there are N space characters (N > 1),
232         then the <span> element contains exactly (N-1) NBSP (non-breaking
233         space) characters, followed by a regular space character. Examples:
234         The element
235             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
236         represents 4 consecutive space characters. Has to be handled by the
237         implementation. The element
238             <font><span style='mso-spacerun:yes'> abc</span></font>
239         represents a space characters followed by the letters a, b, c. These
240         strings have to be handled by the VML import filter implementation.
241      */
242 
243     // passed string ends with the leading opening bracket of an XML element
244     const sal_Char* pcBeg = rChars.getStr();
245     const sal_Char* pcEnd = pcBeg + rChars.getLength();
246     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
247     if( bHasBracket ) --pcEnd;
248 
249     // skip leading whitespace
250     const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
251     while( pcContentsBeg < pcEnd )
252     {
253         const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
254         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
255         if( pcWhitespaceBeg < pcEnd )
256             rBuffer.append( ' ' );
257         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
258     }
259 
260     return bHasBracket;
261 }
262 
263 } // namespace
264 
265 // ============================================================================
266 
InputStream(const Reference<XComponentContext> & rxContext,const Reference<XInputStream> & rxInStrm)267 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
268     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
269     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
270     maOpeningBracket( 1 ),
271     maClosingBracket( 1 ),
272     maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
273     maClosingCData( CREATE_OSTRING( "]]>" ) ),
274     mnBufferPos( 0 )
275 {
276     maOpeningBracket[ 0 ] = '<';
277     maClosingBracket[ 0 ] = '>';
278 }
279 
~InputStream()280 InputStream::~InputStream()
281 {
282 }
283 
readBytes(Sequence<sal_Int8> & rData,sal_Int32 nBytesToRead)284 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
285         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
286 {
287     if( nBytesToRead < 0 )
288         throw IOException();
289 
290     rData.realloc( nBytesToRead );
291     sal_Int8* pcDest = rData.getArray();
292     sal_Int32 nRet = 0;
293     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
294     {
295         updateBuffer();
296         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
297         if( nReadSize > 0 )
298         {
299             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
300             mnBufferPos += nReadSize;
301             nBytesToRead -= nReadSize;
302             nRet += nReadSize;
303         }
304     }
305     if( nRet < rData.getLength() )
306         rData.realloc( nRet );
307     return nRet;
308 }
309 
readSomeBytes(Sequence<sal_Int8> & rData,sal_Int32 nMaxBytesToRead)310 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
311         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
312 {
313     return readBytes( rData, nMaxBytesToRead );
314 }
315 
skipBytes(sal_Int32 nBytesToSkip)316 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
317         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
318 {
319     if( nBytesToSkip < 0 )
320         throw IOException();
321 
322     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
323     {
324         updateBuffer();
325         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
326         mnBufferPos += nSkipSize;
327         nBytesToSkip -= nSkipSize;
328     }
329 }
330 
available()331 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException)
332 {
333     updateBuffer();
334     return maBuffer.getLength() - mnBufferPos;
335 }
336 
closeInput()337 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException)
338 {
339     mxTextStrm->closeInput();
340 }
341 
342 // private --------------------------------------------------------------------
343 
updateBuffer()344 void InputStream::updateBuffer() throw (IOException, RuntimeException)
345 {
346     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
347     {
348         // collect new contents in a string buffer
349         OStringBuffer aBuffer;
350 
351         // read and process characters until the opening bracket of the next XML element
352         OString aChars = readToElementBegin();
353         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
354 
355         // read and process characters until (and including) closing bracket (an XML element)
356         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
357         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
358         {
359             // read the element text (add the leading opening bracket manually)
360             OString aElement = OString( '<' ) + readToElementEnd();
361             // check for CDATA part, starting with '<![CDATA['
362             if( aElement.match( maOpeningCData ) )
363             {
364                 // search the end tag ']]>'
365                 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() )
366                     aElement += readToElementEnd();
367                 // copy the entire CDATA part
368                 aBuffer.append( aElement );
369             }
370             else
371             {
372                 // no CDATA part - process the contents of the element
373                 lclProcessElement( aBuffer, aElement );
374             }
375         }
376 
377         maBuffer = aBuffer.makeStringAndClear();
378         mnBufferPos = 0;
379     }
380 }
381 
readToElementBegin()382 OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
383 {
384     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
385 }
386 
readToElementEnd()387 OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
388 {
389     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
390     OSL_ENSURE( (aText.getLength() > 0) && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" );
391     return aText;
392 }
393 
394 // ============================================================================
395 
396 } // namespace vml
} // namespace oox
398