xref: /trunk/main/svtools/source/svrtf/parrtf.cxx (revision 1ecadb572e7010ff3b3382ad9bf179dbc6efadbb)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_svtools.hxx"
30 
31 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
32 
33 #include <stdio.h>                      // for EOF
34 #include <rtl/tencinfo.h>
35 #include <tools/stream.hxx>
36 #include <tools/debug.hxx>
37 #include <svtools/rtftoken.h>
38 #include <svtools/rtfkeywd.hxx>
39 #include <svtools/parrtf.hxx>
40 
// Upper bound for the number of characters ScanText() collects in one go.
const int MAX_STRING_LEN = 1024;
// Size of the scratch buffer used while reading a control word name.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification used by the tokenizer.
// The argument is parenthesised so that expressions with low-precedence
// operators (e.g. a conditional expression) are classified correctly.
// Note: the argument is evaluated more than once - do not pass expressions
// with side effects.
#define RTF_ISDIGIT( c ) ((c) >= '0' && (c) <= '9')
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
46 
47 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
48     : SvParser( rIn, nStackSize ),
49     eUNICodeSet( RTL_TEXTENCODING_MS_1252 ),    // default ist ANSI-CodeSet
50     nUCharOverread( 1 )
51 {
52     // default ist ANSI-CodeSet
53     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
54     bRTF_InTextRead = false;
55 }
56 
57 SvRTFParser::~SvRTFParser()
58 {
59 }
60 
61 
62 
63 
64 int SvRTFParser::_GetNextToken()
65 {
66     int nRet = 0;
67     do {
68         int bNextCh = true;
69         switch( nNextCh )
70         {
71         case '\\':
72             {
73                 // Steuerzeichen
74                 switch( nNextCh = GetNextChar() )
75                 {
76                 case '{':
77                 case '}':
78                 case '\\':
79                 case '+':       // habe ich in einem RTF-File gefunden
80                 case '~':       // nonbreaking space
81                 case '-':       // optional hyphen
82                 case '_':       // nonbreaking hyphen
83                 case '\'':      // HexValue
84                     nNextCh = '\\';
85                     rInput.SeekRel( -1 );
86                     ScanText();
87                     nRet = RTF_TEXTTOKEN;
88                     bNextCh = 0 == nNextCh;
89                     break;
90 
91                 case '*':       // ignoreflag
92                     nRet = RTF_IGNOREFLAG;
93                     break;
94                 case ':':       // subentry in an index entry
95                     nRet = RTF_SUBENTRYINDEX;
96                     break;
97                 case '|':       // formula-charakter
98                     nRet = RTF_FORMULA;
99                     break;
100 
101                 case 0x0a:
102                 case 0x0d:
103                     nRet = RTF_PAR;
104                     break;
105 
106                 default:
107                     if( RTF_ISALPHA( nNextCh ) )
108                     {
109                         aToken = '\\';
110                         {
111                             String aStrBuffer;
112                             sal_Unicode* pStr = aStrBuffer.AllocBuffer(
113                                                             MAX_TOKEN_LEN );
114                             xub_StrLen nStrLen = 0;
115                             do {
116                                 *(pStr + nStrLen++) = nNextCh;
117                                 if( MAX_TOKEN_LEN == nStrLen )
118                                 {
119                                     aToken += aStrBuffer;
120                                     aToken.GetBufferAccess();  // make unique string!
121                                     nStrLen = 0;
122                                 }
123                                 nNextCh = GetNextChar();
124                             } while( RTF_ISALPHA( nNextCh ) );
125                             if( nStrLen )
126                             {
127                                 aStrBuffer.ReleaseBufferAccess( nStrLen );
128                                 aToken += aStrBuffer;
129                             }
130                         }
131 
132                         // Minus fuer numerischen Parameter
133                         int bNegValue = false;
134                         if( '-' == nNextCh )
135                         {
136                             bNegValue = true;
137                             nNextCh = GetNextChar();
138                         }
139 
140                         // evt. Numerischer Parameter
141                         if( RTF_ISDIGIT( nNextCh ) )
142                         {
143                             nTokenValue = 0;
144                             do {
145                                 nTokenValue *= 10;
146                                 nTokenValue += nNextCh - '0';
147                                 nNextCh = GetNextChar();
148                             } while( RTF_ISDIGIT( nNextCh ) );
149                             if( bNegValue )
150                                 nTokenValue = -nTokenValue;
151                             bTokenHasValue=true;
152                         }
153                         else if( bNegValue )        // das Minus wieder zurueck
154                         {
155                             nNextCh = '-';
156                             rInput.SeekRel( -1 );
157                         }
158                         if( ' ' == nNextCh )        // Blank gehoert zum Token!
159                             nNextCh = GetNextChar();
160 
161                         // suche das Token in der Tabelle:
162                         if( 0 == (nRet = GetRTFToken( aToken )) )
163                             // Unknown Control
164                             nRet = RTF_UNKNOWNCONTROL;
165 
166                         // bug 76812 - unicode token handled as normal text
167                         bNextCh = false;
168                         switch( nRet )
169                         {
170                         case RTF_UC:
171                             if( 0 <= nTokenValue )
172                             {
173                                 nUCharOverread = (sal_uInt8)nTokenValue;
174 #if 1
175                                 //cmc: other ifdef breaks #i3584
176                                 aParserStates.top().
177                                     nUCharOverread = nUCharOverread;
178 #else
179                                 if( !nUCharOverread )
180                                     nUCharOverread = aParserStates.top().nUCharOverread;
181                                 else
182                                     aParserStates.top().
183                                         nUCharOverread = nUCharOverread;
184 #endif
185                             }
186                             aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
187                             // read next token
188                             nRet = 0;
189                             break;
190 
191                         case RTF_UPR:
192                             if (!_inSkipGroup) {
193                             // UPR - overread the group with the ansi
194                             //       informations
195                             while( '{' != _GetNextToken() )
196                                 ;
197                             SkipGroup();
198                             _GetNextToken();  // overread the last bracket
199                             nRet = 0;
200                             }
201                             break;
202 
203                         case RTF_U:
204                             if( !bRTF_InTextRead )
205                             {
206                                 nRet = RTF_TEXTTOKEN;
207                                 aToken = (sal_Unicode)nTokenValue;
208 
209                                 // overread the next n "RTF" characters. This
210                                 // can be also \{, \}, \'88
211                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
212                                 {
213                                     sal_Unicode cAnsi = nNextCh;
214                                     while( 0xD == cAnsi )
215                                         cAnsi = GetNextChar();
216                                     while( 0xA == cAnsi )
217                                         cAnsi = GetNextChar();
218 
219                                     if( '\\' == cAnsi &&
220                                         '\'' == ( cAnsi = GetNextChar() ))
221                                         // HexValue ueberlesen
222                                         cAnsi = GetHexValue();
223                                     nNextCh = GetNextChar();
224                                 }
225                                 ScanText();
226                                 bNextCh = 0 == nNextCh;
227                             }
228                             break;
229                         }
230                     }
231                     else if( SVPAR_PENDING != eState )
232                     {
233                         // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
234                         // eState = SVPAR_ERROR;
235                         bNextCh = false;
236                     }
237                     break;
238                 }
239             }
240             break;
241 
242         case sal_Unicode(EOF):
243             eState = SVPAR_ACCEPTED;
244             nRet = nNextCh;
245             break;
246 
247         case '{':
248             {
249                 if( 0 <= nOpenBrakets )
250                 {
251                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
252                     aParserStates.push( aState );
253                 }
254                 ++nOpenBrakets;
255                 DBG_ASSERT(
256                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
257                     "ParserStateStack unequal to bracket count" );
258                 nRet = nNextCh;
259             }
260             break;
261 
262         case '}':
263             --nOpenBrakets;
264             if( 0 <= nOpenBrakets )
265             {
266                 aParserStates.pop();
267                 if( !aParserStates.empty() )
268                 {
269                     const RtfParserState_Impl& rRPS =
270                             aParserStates.top();
271                     nUCharOverread = rRPS.nUCharOverread;
272                     SetSrcEncoding( rRPS.eCodeSet );
273                 }
274                 else
275                 {
276                     nUCharOverread = 1;
277                     SetSrcEncoding( GetCodeSet() );
278                 }
279             }
280             DBG_ASSERT(
281                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
282                 "ParserStateStack unequal to bracket count" );
283             nRet = nNextCh;
284             break;
285 
286         case 0x0d:
287         case 0x0a:
288             break;
289 
290         default:
291             // es folgt normaler Text
292             ScanText();
293             nRet = RTF_TEXTTOKEN;
294             bNextCh = 0 == nNextCh;
295             break;
296         }
297 
298         if( bNextCh )
299             nNextCh = GetNextChar();
300 
301     } while( !nRet && SVPAR_WORKING == eState );
302     return nRet;
303 }
304 
305 
306 sal_Unicode SvRTFParser::GetHexValue()
307 {
308     // Hex-Wert sammeln
309     register int n;
310     register sal_Unicode nHexVal = 0;
311 
312     for( n = 0; n < 2; ++n )
313     {
314         nHexVal *= 16;
315         nNextCh = GetNextChar();
316         if( nNextCh >= '0' && nNextCh <= '9' )
317             nHexVal += (nNextCh - 48);
318         else if( nNextCh >= 'a' && nNextCh <= 'f' )
319             nHexVal += (nNextCh - 87);
320         else if( nNextCh >= 'A' && nNextCh <= 'F' )
321             nHexVal += (nNextCh - 55);
322     }
323     return nHexVal;
324 }
325 
326 void SvRTFParser::ScanText( const sal_Unicode cBreak )
327 {
328     String aStrBuffer;
329     int bWeiter = true;
330     while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
331     {
332         int bNextCh = true;
333         switch( nNextCh )
334         {
335         case '\\':
336             {
337                 switch (nNextCh = GetNextChar())
338                 {
339                 case '\'':
340                     {
341 
342 #if 0
343                         // #i35653 patch from cmc
344                         ByteString aByteString(static_cast<char>(GetHexValue()));
345                         if (aByteString.Len())
346                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
347 #else
348                         ByteString aByteString;
349                         while (1)
350                         {
351                             aByteString.Append((char)GetHexValue());
352 
353                             bool bBreak = false;
354                             sal_Char nSlash = '\\';
355                             while (!bBreak)
356                             {
357                                 wchar_t __next=GetNextChar();
358                                 if (__next>0xFF) // fix for #i43933# and #i35653#
359                                 {
360                                     if (aByteString.Len())
361                                         aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
362                                     aStrBuffer.Append((sal_Unicode)__next);
363 
364                                     aByteString.Erase();
365                                     continue;
366                                 }
367                                 nSlash = (sal_Char)__next;
368                                 while (nSlash == 0xD || nSlash == 0xA)
369                                     nSlash = (sal_Char)GetNextChar();
370 
371                                 switch (nSlash)
372                                 {
373                                     case '{':
374                                     case '}':
375                                     case '\\':
376                                         bBreak = true;
377                                         break;
378                                     default:
379                                         aByteString.Append(nSlash);
380                                         break;
381                                 }
382                             }
383 
384                             nNextCh = GetNextChar();
385 
386                             if (nSlash != '\\' || nNextCh != '\'')
387                             {
388                                 rInput.SeekRel(-1);
389                                 nNextCh = nSlash;
390                                 break;
391                             }
392                         }
393 
394                         bNextCh = false;
395 
396                         if (aByteString.Len())
397                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
398 #endif
399                     }
400                     break;
401                 case '\\':
402                 case '}':
403                 case '{':
404                 case '+':       // habe ich in einem RTF-File gefunden
405                     aStrBuffer.Append(nNextCh);
406                     break;
407                 case '~':       // nonbreaking space
408                     aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
409                     break;
410                 case '-':       // optional hyphen
411                     aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
412                     break;
413                 case '_':       // nonbreaking hyphen
414                     aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
415                     break;
416 
417                 case 'u':
418                     // UNI-Code Zeichen lesen
419                     {
420                         nNextCh = GetNextChar();
421                         rInput.SeekRel( -2 );
422 
423                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
424                         {
425                             bRTF_InTextRead = true;
426 
427                             String sSave( aToken );
428                             nNextCh = '\\';
429                             #ifdef DBG_UTIL
430                             int nToken =
431                             #endif
432                                 _GetNextToken();
433                             DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
434                             // dont convert symbol chars
435                             aStrBuffer.Append(
436                                 static_cast< sal_Unicode >(nTokenValue));
437 
438                             // overread the next n "RTF" characters. This
439                             // can be also \{, \}, \'88
440                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
441                             {
442                                 sal_Unicode cAnsi = nNextCh;
443                                 while( 0xD == cAnsi )
444                                     cAnsi = GetNextChar();
445                                 while( 0xA == cAnsi )
446                                     cAnsi = GetNextChar();
447 
448                                 if( '\\' == cAnsi &&
449                                     '\'' == ( cAnsi = GetNextChar() ))
450                                     // HexValue ueberlesen
451                                     cAnsi = GetHexValue();
452                                 nNextCh = GetNextChar();
453                             }
454                             bNextCh = false;
455                             aToken = sSave;
456                             bRTF_InTextRead = false;
457                         }
458                         else
459                         {
460                             nNextCh = '\\';
461                             bWeiter = false;        // Abbrechen, String zusammen
462                         }
463                     }
464                     break;
465 
466                 default:
467                     rInput.SeekRel( -1 );
468                     nNextCh = '\\';
469                     bWeiter = false;        // Abbrechen, String zusammen
470                     break;
471                 }
472             }
473             break;
474 
475         case sal_Unicode(EOF):
476                 eState = SVPAR_ERROR;
477                 // weiter
478         case '{':
479         case '}':
480             bWeiter = false;
481             break;
482 
483         case 0x0a:
484         case 0x0d:
485             break;
486 
487         default:
488             if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
489                 bWeiter = false;
490             else
491             {
492                 do {
493                     // alle anderen Zeichen kommen in den Text
494                     aStrBuffer.Append(nNextCh);
495 
496                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
497                     {
498                         if (aStrBuffer.Len())
499                             aToken += aStrBuffer;
500                         return;
501                     }
502                 } while
503                 (
504                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
505                     (aStrBuffer.Len() < MAX_STRING_LEN)
506                 );
507                 bNextCh = false;
508             }
509         }
510 
511         if( bWeiter && bNextCh )
512             nNextCh = GetNextChar();
513     }
514 
515     if (aStrBuffer.Len())
516         aToken += aStrBuffer;
517 }
518 
519 
520 short SvRTFParser::_inSkipGroup=0;
521 
522 void SvRTFParser::SkipGroup()
523 {
524 short nBrackets=1;
525 if (_inSkipGroup>0)
526     return;
527 _inSkipGroup++;
528 #if 1   //#i16185# fecking \bin keyword
529     do
530     {
531         switch (nNextCh)
532         {
533             case '{':
534                 ++nBrackets;
535                 break;
536             case '}':
537                 if (!--nBrackets) {
538                     _inSkipGroup--;
539                     return;
540                 }
541                 break;
542         }
543         int nToken = _GetNextToken();
544         if (nToken == RTF_BIN)
545         {
546             rInput.SeekRel(-1);
547             rInput.SeekRel(nTokenValue);
548             nNextCh = GetNextChar();
549         }
550         while (nNextCh==0xa || nNextCh==0xd)
551         {
552             nNextCh = GetNextChar();
553         }
554     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
555 #else
556     sal_Unicode cPrev = 0;
557     do {
558         switch( nNextCh )
559         {
560         case '{':
561             if( '\\' != cPrev )
562                 ++nBrackets;
563             break;
564 
565         case '}':
566             if( '\\' != cPrev && !--nBrackets )
567                 return;
568             break;
569 
570         case '\\':
571             if( '\\' == cPrev )
572                 nNextCh = 0;
573             break;
574         }
575         cPrev = nNextCh;
576         nNextCh = GetNextChar();
577     } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
578 #endif
579 
580     if( SVPAR_PENDING != eState && '}' != nNextCh )
581         eState = SVPAR_ERROR;
582     _inSkipGroup--;
583 }
584 
585 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
586 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
587 void SvRTFParser::ReadOLEData()     { SkipGroup(); }
588 
589 
590 SvParserState SvRTFParser::CallParser()
591 {
592     sal_Char cFirstCh;
593     nNextChPos = rInput.Tell();
594     rInput >> cFirstCh; nNextCh = cFirstCh;
595     eState = SVPAR_WORKING;
596     nOpenBrakets = 0;
597     SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
598     eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // default ist ANSI-CodeSet
599 
600     // die 1. beiden Token muessen '{' und \\rtf sein !!
601     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
602     {
603         AddRef();
604         Continue( 0 );
605         if( SVPAR_PENDING != eState )
606             ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
607     }
608     else
609         eState = SVPAR_ERROR;
610 
611     return eState;
612 }
613 
614 void SvRTFParser::Continue( int nToken )
615 {
616 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
617 //              "Zeichensatz wurde geaendert." );
618 
619     if( !nToken )
620         nToken = GetNextToken();
621 
622     while( IsParserWorking() )
623     {
624         SaveState( nToken );
625         switch( nToken )
626         {
627         case '}':
628             if( nOpenBrakets )
629                 goto NEXTTOKEN;
630             eState = SVPAR_ACCEPTED;
631             break;
632 
633         case '{':
634             // eine unbekannte Gruppe ?
635             {
636                 if( RTF_IGNOREFLAG != GetNextToken() )
637                     nToken = SkipToken( -1 );
638                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
639                     nToken = SkipToken( -2 );
640                 else
641                 {
642                     // gleich herausfiltern
643                     ReadUnknownData();
644                     nToken = GetNextToken();
645                     if( '}' != nToken )
646                         eState = SVPAR_ERROR;
647                     break;      // auf zum naechsten Token!!
648                 }
649             }
650             goto NEXTTOKEN;
651 
652         case RTF_UNKNOWNCONTROL:
653             break;      // unbekannte Token ueberspringen
654         case RTF_NEXTTYPE:
655         case RTF_ANSITYPE:
656             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
657             break;
658         case RTF_MACTYPE:
659             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
660             break;
661         case RTF_PCTYPE:
662             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
663             break;
664         case RTF_PCATYPE:
665             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
666             break;
667         case RTF_ANSICPG:
668             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
669             SetSrcEncoding(eCodeSet);
670             break;
671         default:
672 NEXTTOKEN:
673             NextToken( nToken );
674             break;
675         }
676         if( IsParserWorking() )
677             SaveState( 0 );         // bis hierhin abgearbeitet,
678                                     // weiter mit neuem Token!
679         nToken = GetNextToken();
680     }
681     if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
682         eState = SVPAR_ERROR;
683 }
684 
685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
686 {
687     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
688         eEnc = GetCodeSet();
689 
690     if (!aParserStates.empty())
691         aParserStates.top().eCodeSet = eEnc;
692     SetSrcEncoding(eEnc);
693 }
694 
#ifdef USED
// Compiled only when USED is defined: both functions simply forward to the
// SvParser implementation.

void SvRTFParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

void SvRTFParser::RestoreState()
{
    SvParser::RestoreState();
}

#endif
706 
707 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
708