xref: /trunk/main/svtools/source/svrtf/parrtf.cxx (revision 914d351e5f5b84e4342a86d6ab8d4aca7308b9bd)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 #include <stdio.h> // for EOF
28 #include <rtl/tencinfo.h>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/rtfkeywd.hxx>
33 #include <svtools/parrtf.hxx>
34 
// Upper bound for the number of characters ScanText() collects in one call.
const int MAX_STRING_LEN = 1024;
// Size of the scratch buffer _GetNextToken() uses for control word names.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification for RTF control words and parameters.
// The argument is parenthesized so that an expression can be passed safely.
// It is still evaluated more than once, so do not pass expressions with
// side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
40 
41 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
42     : SvParser( rIn, nStackSize ),
43     eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default is ANSI code set
44     nUCharOverread( 1 )
45 {
46     // default is ANSI code set
47     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
48     bRTF_InTextRead = false;
49 }
50 
51 SvRTFParser::~SvRTFParser()
52 {
53 }
54 
55 
56 
57 
58 int SvRTFParser::_GetNextToken()
59 {
60     int nRet = 0;
61     do {
62         int bNextCh = true;
63         switch( nNextCh )
64         {
65         case '\\':
66             {
67                 // Steuerzeichen
68                 switch( nNextCh = GetNextChar() )
69                 {
70                 case '{':
71                 case '}':
72                 case '\\':
73                 case '+':       // habe ich in einem RTF-File gefunden
74                 case '~':       // nonbreaking space
75                 case '-':       // optional hyphen
76                 case '_':       // nonbreaking hyphen
77                 case '\'':      // HexValue
78                     nNextCh = '\\';
79                     rInput.SeekRel( -1 );
80                     ScanText();
81                     nRet = RTF_TEXTTOKEN;
82                     bNextCh = 0 == nNextCh;
83                     break;
84 
85                 case '*':       // ignoreflag
86                     nRet = RTF_IGNOREFLAG;
87                     break;
88                 case ':':       // subentry in an index entry
89                     nRet = RTF_SUBENTRYINDEX;
90                     break;
91                 case '|':       // formula-character
92                     nRet = RTF_FORMULA;
93                     break;
94 
95                 case 0x0a:
96                 case 0x0d:
97                     nRet = RTF_PAR;
98                     break;
99 
100                 default:
101                     if( RTF_ISALPHA( nNextCh ) )
102                     {
103                         aToken = '\\';
104                         {
105                             String aStrBuffer;
106                             sal_Unicode* pStr = aStrBuffer.AllocBuffer(
107                                                             MAX_TOKEN_LEN );
108                             xub_StrLen nStrLen = 0;
109                             do {
110                                 *(pStr + nStrLen++) = nNextCh;
111                                 if( MAX_TOKEN_LEN == nStrLen )
112                                 {
113                                     aToken += aStrBuffer;
114                                     aToken.GetBufferAccess(); // make unique string!
115                                     nStrLen = 0;
116                                 }
117                                 nNextCh = GetNextChar();
118                             } while( RTF_ISALPHA( nNextCh ) );
119                             if( nStrLen )
120                             {
121                                 aStrBuffer.ReleaseBufferAccess( nStrLen );
122                                 aToken += aStrBuffer;
123                             }
124                         }
125 
126                         // Minus fuer numerischen Parameter
127                         int bNegValue = false;
128                         if( '-' == nNextCh )
129                         {
130                             bNegValue = true;
131                             nNextCh = GetNextChar();
132                         }
133 
134                         // evt. Numerischer Parameter
135                         if( RTF_ISDIGIT( nNextCh ) )
136                         {
137                             nTokenValue = 0;
138                             do {
139                                 nTokenValue *= 10;
140                                 nTokenValue += nNextCh - '0';
141                                 nNextCh = GetNextChar();
142                             } while( RTF_ISDIGIT( nNextCh ) );
143                             if( bNegValue )
144                                 nTokenValue = -nTokenValue;
145                             bTokenHasValue=true;
146                         }
147                         else if( bNegValue ) // das Minus wieder zurueck
148                         {
149                             nNextCh = '-';
150                             rInput.SeekRel( -1 );
151                         }
152                         if( ' ' == nNextCh ) // Blank gehoert zum Token!
153                             nNextCh = GetNextChar();
154 
155                         // suche das Token in der Tabelle:
156                         if( 0 == (nRet = GetRTFToken( aToken )) )
157                             // Unknown Control
158                             nRet = RTF_UNKNOWNCONTROL;
159 
160                         // bug 76812 - unicode token handled as normal text
161                         bNextCh = false;
162                         switch( nRet )
163                         {
164                         case RTF_UC:
165                             if( 0 <= nTokenValue )
166                             {
167                                 nUCharOverread = (sal_uInt8)nTokenValue;
168 #if 1
169                                 // cmc: other ifdef breaks #i3584
170                                 aParserStates.top().
171                                     nUCharOverread = nUCharOverread;
172 #else
173                                 if( !nUCharOverread )
174                                     nUCharOverread = aParserStates.top().nUCharOverread;
175                                 else
176                                     aParserStates.top().
177                                         nUCharOverread = nUCharOverread;
178 #endif
179                             }
180                             aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text
181                             // read next token
182                             nRet = 0;
183                             break;
184 
185                         case RTF_UPR:
186                             if (!_inSkipGroup) {
187                             // UPR - overread the group with the ansi
188                             //       information
189                             while( '{' != _GetNextToken() )
190                                 ;
191                             SkipGroup();
192                             _GetNextToken(); // overread the last bracket
193                             nRet = 0;
194                             }
195                             break;
196 
197                         case RTF_U:
198                             if( !bRTF_InTextRead )
199                             {
200                                 nRet = RTF_TEXTTOKEN;
201                                 aToken = (sal_Unicode)nTokenValue;
202 
203                                 // overread the next n "RTF" characters. This
204                                 // can be also \{, \}, \'88
205                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
206                                 {
207                                     sal_Unicode cAnsi = nNextCh;
208                                     while( 0xD == cAnsi )
209                                         cAnsi = GetNextChar();
210                                     while( 0xA == cAnsi )
211                                         cAnsi = GetNextChar();
212 
213                                     if( '\\' == cAnsi &&
214                                         '\'' == ( cAnsi = GetNextChar() ))
215                                         // HexValue ueberlesen
216                                         cAnsi = GetHexValue();
217                                     nNextCh = GetNextChar();
218                                 }
219                                 ScanText();
220                                 bNextCh = 0 == nNextCh;
221                             }
222                             break;
223                         }
224                     }
225                     else if( SVPAR_PENDING != eState )
226                     {
227                         // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
228                         // eState = SVPAR_ERROR;
229                         bNextCh = false;
230                     }
231                     break;
232                 }
233             }
234             break;
235 
236         case sal_Unicode(EOF):
237             eState = SVPAR_ACCEPTED;
238             nRet = nNextCh;
239             break;
240 
241         case '{':
242             {
243                 if( 0 <= nOpenBrakets )
244                 {
245                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
246                     aParserStates.push( aState );
247                 }
248                 ++nOpenBrakets;
249                 DBG_ASSERT(
250                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
251                     "ParserStateStack unequal to bracket count" );
252                 nRet = nNextCh;
253             }
254             break;
255 
256         case '}':
257             --nOpenBrakets;
258             if( 0 <= nOpenBrakets )
259             {
260                 aParserStates.pop();
261                 if( !aParserStates.empty() )
262                 {
263                     const RtfParserState_Impl& rRPS =
264                             aParserStates.top();
265                     nUCharOverread = rRPS.nUCharOverread;
266                     SetSrcEncoding( rRPS.eCodeSet );
267                 }
268                 else
269                 {
270                     nUCharOverread = 1;
271                     SetSrcEncoding( GetCodeSet() );
272                 }
273             }
274             DBG_ASSERT(
275                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
276                 "ParserStateStack unequal to bracket count" );
277             nRet = nNextCh;
278             break;
279 
280         case 0x0d:
281         case 0x0a:
282             break;
283 
284         default:
285             // es folgt normaler Text
286             ScanText();
287             nRet = RTF_TEXTTOKEN;
288             bNextCh = 0 == nNextCh;
289             break;
290         }
291 
292         if( bNextCh )
293             nNextCh = GetNextChar();
294 
295     } while( !nRet && SVPAR_WORKING == eState );
296     return nRet;
297 }
298 
299 
300 sal_Unicode SvRTFParser::GetHexValue()
301 {
302     // Hex-Wert sammeln
303     register int n;
304     register sal_Unicode nHexVal = 0;
305 
306     for( n = 0; n < 2; ++n )
307     {
308         nHexVal *= 16;
309         nNextCh = GetNextChar();
310         if( nNextCh >= '0' && nNextCh <= '9' )
311             nHexVal += (nNextCh - 48);
312         else if( nNextCh >= 'a' && nNextCh <= 'f' )
313             nHexVal += (nNextCh - 87);
314         else if( nNextCh >= 'A' && nNextCh <= 'F' )
315             nHexVal += (nNextCh - 55);
316     }
317     return nHexVal;
318 }
319 
320 void SvRTFParser::ScanText( const sal_Unicode cBreak )
321 {
322     String aStrBuffer;
323     int bWeiter = true;
324     while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
325     {
326         int bNextCh = true;
327         switch( nNextCh )
328         {
329         case '\\':
330             {
331                 switch (nNextCh = GetNextChar())
332                 {
333                 case '\'':
334                     {
335 
336 #if 0
337                         // #i35653 patch from cmc
338                         ByteString aByteString(static_cast<char>(GetHexValue()));
339                         if (aByteString.Len())
340                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
341 #else
342                         ByteString aByteString;
343                         while (1)
344                         {
345                             aByteString.Append((char)GetHexValue());
346 
347                             bool bBreak = false;
348                             sal_Char nSlash = '\\';
349                             while (!bBreak)
350                             {
351                                 wchar_t __next=GetNextChar();
352                                 if (__next>0xFF) // fix for #i43933# and #i35653#
353                                 {
354                                     if (aByteString.Len())
355                                         aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
356                                     aStrBuffer.Append((sal_Unicode)__next);
357 
358                                     aByteString.Erase();
359                                     continue;
360                                 }
361                                 nSlash = (sal_Char)__next;
362                                 while (nSlash == 0xD || nSlash == 0xA)
363                                     nSlash = (sal_Char)GetNextChar();
364 
365                                 switch (nSlash)
366                                 {
367                                     case '{':
368                                     case '}':
369                                     case '\\':
370                                         bBreak = true;
371                                         break;
372                                     default:
373                                         aByteString.Append(nSlash);
374                                         break;
375                                 }
376                             }
377 
378                             nNextCh = GetNextChar();
379 
380                             if (nSlash != '\\' || nNextCh != '\'')
381                             {
382                                 rInput.SeekRel(-1);
383                                 nNextCh = nSlash;
384                                 break;
385                             }
386                         }
387 
388                         bNextCh = false;
389 
390                         if (aByteString.Len())
391                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
392 #endif
393                     }
394                     break;
395                 case '\\':
396                 case '}':
397                 case '{':
398                 case '+':       // habe ich in einem RTF-File gefunden
399                     aStrBuffer.Append(nNextCh);
400                     break;
401                 case '~':       // nonbreaking space
402                     aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
403                     break;
404                 case '-':       // optional hyphen
405                     aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
406                     break;
407                 case '_':       // nonbreaking hyphen
408                     aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
409                     break;
410 
411                 case 'u':
412                     // UNI-Code Zeichen lesen
413                     {
414                         nNextCh = GetNextChar();
415                         rInput.SeekRel( -2 );
416 
417                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
418                         {
419                             bRTF_InTextRead = true;
420 
421                             String sSave( aToken );
422                             nNextCh = '\\';
423                             #ifdef DBG_UTIL
424                             int nToken =
425                             #endif
426                                 _GetNextToken();
427                             DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
428                             // dont convert symbol chars
429                             aStrBuffer.Append(
430                                 static_cast< sal_Unicode >(nTokenValue));
431 
432                             // overread the next n "RTF" characters. This
433                             // can be also \{, \}, \'88
434                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
435                             {
436                                 sal_Unicode cAnsi = nNextCh;
437                                 while( 0xD == cAnsi )
438                                     cAnsi = GetNextChar();
439                                 while( 0xA == cAnsi )
440                                     cAnsi = GetNextChar();
441 
442                                 if( '\\' == cAnsi &&
443                                     '\'' == ( cAnsi = GetNextChar() ))
444                                     // HexValue ueberlesen
445                                     cAnsi = GetHexValue();
446                                 nNextCh = GetNextChar();
447                             }
448                             bNextCh = false;
449                             aToken = sSave;
450                             bRTF_InTextRead = false;
451                         }
452                         else
453                         {
454                             nNextCh = '\\';
455                             bWeiter = false; // Abbrechen, String zusammen
456                         }
457                     }
458                     break;
459 
460                 default:
461                     rInput.SeekRel( -1 );
462                     nNextCh = '\\';
463                     bWeiter = false; // Abbrechen, String zusammen
464                     break;
465                 }
466             }
467             break;
468 
469         case sal_Unicode(EOF):
470                 eState = SVPAR_ERROR;
471                 // weiter
472         case '{':
473         case '}':
474             bWeiter = false;
475             break;
476 
477         case 0x0a:
478         case 0x0d:
479             break;
480 
481         default:
482             if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
483                 bWeiter = false;
484             else
485             {
486                 do {
487                     // alle anderen Zeichen kommen in den Text
488                     aStrBuffer.Append(nNextCh);
489 
490                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
491                     {
492                         if (aStrBuffer.Len())
493                             aToken += aStrBuffer;
494                         return;
495                     }
496                 } while
497                 (
498                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
499                     (aStrBuffer.Len() < MAX_STRING_LEN)
500                 );
501                 bNextCh = false;
502             }
503         }
504 
505         if( bWeiter && bNextCh )
506             nNextCh = GetNextChar();
507     }
508 
509     if (aStrBuffer.Len())
510         aToken += aStrBuffer;
511 }
512 
513 
514 short SvRTFParser::_inSkipGroup=0;
515 
516 void SvRTFParser::SkipGroup()
517 {
518 short nBrackets=1;
519 if (_inSkipGroup>0)
520     return;
521 _inSkipGroup++;
522 #if 1 // #i16185# fecking \bin keyword
523     do
524     {
525         switch (nNextCh)
526         {
527             case '{':
528                 ++nBrackets;
529                 break;
530             case '}':
531                 if (!--nBrackets) {
532                     _inSkipGroup--;
533                     return;
534                 }
535                 break;
536         }
537         int nToken = _GetNextToken();
538         if (nToken == RTF_BIN)
539         {
540             rInput.SeekRel(-1);
541             rInput.SeekRel(nTokenValue);
542             nNextCh = GetNextChar();
543         }
544         while (nNextCh==0xa || nNextCh==0xd)
545         {
546             nNextCh = GetNextChar();
547         }
548     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
549 #else
550     sal_Unicode cPrev = 0;
551     do {
552         switch( nNextCh )
553         {
554         case '{':
555             if( '\\' != cPrev )
556                 ++nBrackets;
557             break;
558 
559         case '}':
560             if( '\\' != cPrev && !--nBrackets )
561                 return;
562             break;
563 
564         case '\\':
565             if( '\\' == cPrev )
566                 nNextCh = 0;
567             break;
568         }
569         cPrev = nNextCh;
570         nNextCh = GetNextChar();
571     } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
572 #endif
573 
574     if( SVPAR_PENDING != eState && '}' != nNextCh )
575         eState = SVPAR_ERROR;
576     _inSkipGroup--;
577 }
578 
579 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
580 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
581 void SvRTFParser::ReadOLEData()     { SkipGroup(); }
582 
583 
584 SvParserState SvRTFParser::CallParser()
585 {
586     sal_Char cFirstCh;
587     nNextChPos = rInput.Tell();
588     rInput >> cFirstCh; nNextCh = cFirstCh;
589     eState = SVPAR_WORKING;
590     nOpenBrakets = 0;
591     SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
592     eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // default ist ANSI-CodeSet
593 
594     // die 1. beiden Token muessen '{' und \\rtf sein !!
595     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
596     {
597         AddRef();
598         Continue( 0 );
599         if( SVPAR_PENDING != eState )
600             ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
601     }
602     else
603         eState = SVPAR_ERROR;
604 
605     return eState;
606 }
607 
608 void SvRTFParser::Continue( int nToken )
609 {
610 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
611 //              "Zeichensatz wurde geaendert." );
612 
613     if( !nToken )
614         nToken = GetNextToken();
615 
616     while( IsParserWorking() )
617     {
618         SaveState( nToken );
619         switch( nToken )
620         {
621         case '}':
622             if( nOpenBrakets )
623                 goto NEXTTOKEN;
624             eState = SVPAR_ACCEPTED;
625             break;
626 
627         case '{':
628             // eine unbekannte Gruppe ?
629             {
630                 if( RTF_IGNOREFLAG != GetNextToken() )
631                     nToken = SkipToken( -1 );
632                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
633                     nToken = SkipToken( -2 );
634                 else
635                 {
636                     // gleich herausfiltern
637                     ReadUnknownData();
638                     nToken = GetNextToken();
639                     if( '}' != nToken )
640                         eState = SVPAR_ERROR;
641                     break;      // auf zum naechsten Token!!
642                 }
643             }
644             goto NEXTTOKEN;
645 
646         case RTF_UNKNOWNCONTROL:
647             break;      // unbekannte Token ueberspringen
648         case RTF_NEXTTYPE:
649         case RTF_ANSITYPE:
650             SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
651             break;
652         case RTF_MACTYPE:
653             SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
654             break;
655         case RTF_PCTYPE:
656             SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
657             break;
658         case RTF_PCATYPE:
659             SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
660             break;
661         case RTF_ANSICPG:
662             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
663             SetEncoding(eCodeSet);
664             break;
665         default:
666 NEXTTOKEN:
667             NextToken( nToken );
668             break;
669         }
670         if( IsParserWorking() )
671             SaveState( 0 );         // bis hierhin abgearbeitet,
672                                     // weiter mit neuem Token!
673         nToken = GetNextToken();
674     }
675     if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
676         eState = SVPAR_ERROR;
677 }
678 
679 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
680 {
681     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
682         eEnc = GetCodeSet();
683 
684     if (!aParserStates.empty())
685         aParserStates.top().eCodeSet = eEnc;
686     SetSrcEncoding(eEnc);
687 }
688 
#ifdef USED
// Only compiled when USED is defined; both functions simply forward to
// the SvParser implementation.

void SvRTFParser::SaveState( int nToken )
{
    // delegate to the base class
    SvParser::SaveState( nToken );
}

void SvRTFParser::RestoreState()
{
    // delegate to the base class
    SvParser::RestoreState();
}

#endif
700 
701 /* vim: set noet sw=4 ts=4: */
702