1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_svtools.hxx" 26 27 #include <stdio.h> // for EOF 28 #include <rtl/tencinfo.h> 29 #include <tools/stream.hxx> 30 #include <tools/debug.hxx> 31 #include <svtools/rtftoken.h> 32 #include <svtools/rtfkeywd.hxx> 33 #include <svtools/parrtf.hxx> 34 35 const int MAX_STRING_LEN = 1024; 36 const int MAX_TOKEN_LEN = 128; 37 38 #define RTF_ISDIGIT( c ) (c >= '0' && c <= '9') 39 #define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 40 41 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize ) 42 : SvParser( rIn, nStackSize ), 43 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default is ANSI code set 44 nUCharOverread( 1 ) 45 { 46 // default is ANSI code set 47 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); 48 bRTF_InTextRead = false; 49 } 50 51 SvRTFParser::~SvRTFParser() 52 { 53 } 54 55 56 57 58 int SvRTFParser::_GetNextToken() 59 { 60 int nRet = 0; 61 do { 62 int bNextCh = true; 63 switch( nNextCh ) 64 { 65 case '\\': 66 { 67 // Steuerzeichen 68 switch( nNextCh = GetNextChar() ) 69 { 70 case '{': 71 case '}': 72 case '\\': 73 case '+': // habe ich in einem RTF-File gefunden 74 case '~': // nonbreaking space 75 case '-': // optional hyphen 76 case '_': // nonbreaking hyphen 77 case '\'': // HexValue 78 nNextCh = '\\'; 79 rInput.SeekRel( -1 ); 80 ScanText(); 81 nRet = RTF_TEXTTOKEN; 82 bNextCh = 0 == nNextCh; 83 break; 84 85 case '*': // ignoreflag 86 nRet = RTF_IGNOREFLAG; 87 break; 88 case ':': // subentry in an index entry 89 nRet = RTF_SUBENTRYINDEX; 90 break; 91 case '|': // formula-character 92 nRet = RTF_FORMULA; 93 break; 94 95 case 0x0a: 96 case 0x0d: 97 nRet = RTF_PAR; 98 break; 99 100 default: 101 if( RTF_ISALPHA( nNextCh ) ) 102 { 103 aToken = '\\'; 104 { 105 String aStrBuffer; 106 sal_Unicode* pStr = aStrBuffer.AllocBuffer( 107 MAX_TOKEN_LEN ); 108 xub_StrLen nStrLen = 0; 109 do { 110 *(pStr + nStrLen++) = nNextCh; 111 if( MAX_TOKEN_LEN == nStrLen ) 112 { 113 aToken += aStrBuffer; 114 aToken.GetBufferAccess(); // make unique string! 115 nStrLen = 0; 116 } 117 nNextCh = GetNextChar(); 118 } while( RTF_ISALPHA( nNextCh ) ); 119 if( nStrLen ) 120 { 121 aStrBuffer.ReleaseBufferAccess( nStrLen ); 122 aToken += aStrBuffer; 123 } 124 } 125 126 // Minus fuer numerischen Parameter 127 int bNegValue = false; 128 if( '-' == nNextCh ) 129 { 130 bNegValue = true; 131 nNextCh = GetNextChar(); 132 } 133 134 // evt. Numerischer Parameter 135 if( RTF_ISDIGIT( nNextCh ) ) 136 { 137 nTokenValue = 0; 138 do { 139 nTokenValue *= 10; 140 nTokenValue += nNextCh - '0'; 141 nNextCh = GetNextChar(); 142 } while( RTF_ISDIGIT( nNextCh ) ); 143 if( bNegValue ) 144 nTokenValue = -nTokenValue; 145 bTokenHasValue=true; 146 } 147 else if( bNegValue ) // das Minus wieder zurueck 148 { 149 nNextCh = '-'; 150 rInput.SeekRel( -1 ); 151 } 152 if( ' ' == nNextCh ) // Blank gehoert zum Token! 153 nNextCh = GetNextChar(); 154 155 // suche das Token in der Tabelle: 156 if( 0 == (nRet = GetRTFToken( aToken )) ) 157 // Unknown Control 158 nRet = RTF_UNKNOWNCONTROL; 159 160 // bug 76812 - unicode token handled as normal text 161 bNextCh = false; 162 switch( nRet ) 163 { 164 case RTF_UC: 165 if( 0 <= nTokenValue ) 166 { 167 nUCharOverread = (sal_uInt8)nTokenValue; 168 #if 1 169 // cmc: other ifdef breaks #i3584 170 aParserStates.top(). 171 nUCharOverread = nUCharOverread; 172 #else 173 if( !nUCharOverread ) 174 nUCharOverread = aParserStates.top().nUCharOverread; 175 else 176 aParserStates.top(). 177 nUCharOverread = nUCharOverread; 178 #endif 179 } 180 aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text 181 // read next token 182 nRet = 0; 183 break; 184 185 case RTF_UPR: 186 if (!_inSkipGroup) { 187 // UPR - overread the group with the ansi 188 // information 189 while( '{' != _GetNextToken() ) 190 ; 191 SkipGroup(); 192 _GetNextToken(); // overread the last bracket 193 nRet = 0; 194 } 195 break; 196 197 case RTF_U: 198 if( !bRTF_InTextRead ) 199 { 200 nRet = RTF_TEXTTOKEN; 201 aToken = (sal_Unicode)nTokenValue; 202 203 // overread the next n "RTF" characters. This 204 // can be also \{, \}, \'88 205 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 206 { 207 sal_Unicode cAnsi = nNextCh; 208 while( 0xD == cAnsi ) 209 cAnsi = GetNextChar(); 210 while( 0xA == cAnsi ) 211 cAnsi = GetNextChar(); 212 213 if( '\\' == cAnsi && 214 '\'' == ( cAnsi = GetNextChar() )) 215 // HexValue ueberlesen 216 cAnsi = GetHexValue(); 217 nNextCh = GetNextChar(); 218 } 219 ScanText(); 220 bNextCh = 0 == nNextCh; 221 } 222 break; 223 } 224 } 225 else if( SVPAR_PENDING != eState ) 226 { 227 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen 228 // eState = SVPAR_ERROR; 229 bNextCh = false; 230 } 231 break; 232 } 233 } 234 break; 235 236 case sal_Unicode(EOF): 237 eState = SVPAR_ACCEPTED; 238 nRet = nNextCh; 239 break; 240 241 case '{': 242 { 243 if( 0 <= nOpenBrakets ) 244 { 245 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() ); 246 aParserStates.push( aState ); 247 } 248 ++nOpenBrakets; 249 DBG_ASSERT( 250 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 251 "ParserStateStack unequal to bracket count" ); 252 nRet = nNextCh; 253 } 254 break; 255 256 case '}': 257 --nOpenBrakets; 258 if( 0 <= nOpenBrakets ) 259 { 260 aParserStates.pop(); 261 if( !aParserStates.empty() ) 262 { 263 const RtfParserState_Impl& rRPS = 264 aParserStates.top(); 265 nUCharOverread = rRPS.nUCharOverread; 266 SetSrcEncoding( rRPS.eCodeSet ); 267 } 268 else 269 { 270 nUCharOverread = 1; 271 SetSrcEncoding( GetCodeSet() ); 272 } 273 } 274 DBG_ASSERT( 275 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 276 "ParserStateStack unequal to bracket count" ); 277 nRet = nNextCh; 278 break; 279 280 case 0x0d: 281 case 0x0a: 282 break; 283 284 default: 285 // es folgt normaler Text 286 ScanText(); 287 nRet = RTF_TEXTTOKEN; 288 bNextCh = 0 == nNextCh; 289 break; 290 } 291 292 if( bNextCh ) 293 nNextCh = GetNextChar(); 294 295 } while( !nRet && SVPAR_WORKING == eState ); 296 return nRet; 297 } 298 299 300 sal_Unicode SvRTFParser::GetHexValue() 301 { 302 // Hex-Wert sammeln 303 register int n; 304 register sal_Unicode nHexVal = 0; 305 306 for( n = 0; n < 2; ++n ) 307 { 308 nHexVal *= 16; 309 nNextCh = GetNextChar(); 310 if( nNextCh >= '0' && nNextCh <= '9' ) 311 nHexVal += (nNextCh - 48); 312 else if( nNextCh >= 'a' && nNextCh <= 'f' ) 313 nHexVal += (nNextCh - 87); 314 else if( nNextCh >= 'A' && nNextCh <= 'F' ) 315 nHexVal += (nNextCh - 55); 316 } 317 return nHexVal; 318 } 319 320 void SvRTFParser::ScanText( const sal_Unicode cBreak ) 321 { 322 String aStrBuffer; 323 int bWeiter = true; 324 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN) 325 { 326 int bNextCh = true; 327 switch( nNextCh ) 328 { 329 case '\\': 330 { 331 switch (nNextCh = GetNextChar()) 332 { 333 case '\'': 334 { 335 336 #if 0 337 // #i35653 patch from cmc 338 ByteString aByteString(static_cast<char>(GetHexValue())); 339 if (aByteString.Len()) 340 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 341 #else 342 ByteString aByteString; 343 while (1) 344 { 345 aByteString.Append((char)GetHexValue()); 346 347 bool bBreak = false; 348 sal_Char nSlash = '\\'; 349 while (!bBreak) 350 { 351 wchar_t __next=GetNextChar(); 352 if (__next>0xFF) // fix for #i43933# and #i35653# 353 { 354 if (aByteString.Len()) 355 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 356 aStrBuffer.Append((sal_Unicode)__next); 357 358 aByteString.Erase(); 359 continue; 360 } 361 nSlash = (sal_Char)__next; 362 while (nSlash == 0xD || nSlash == 0xA) 363 nSlash = (sal_Char)GetNextChar(); 364 365 switch (nSlash) 366 { 367 case '{': 368 case '}': 369 case '\\': 370 bBreak = true; 371 break; 372 default: 373 aByteString.Append(nSlash); 374 break; 375 } 376 } 377 378 nNextCh = GetNextChar(); 379 380 if (nSlash != '\\' || nNextCh != '\'') 381 { 382 rInput.SeekRel(-1); 383 nNextCh = nSlash; 384 break; 385 } 386 } 387 388 bNextCh = false; 389 390 if (aByteString.Len()) 391 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 392 #endif 393 } 394 break; 395 case '\\': 396 case '}': 397 case '{': 398 case '+': // habe ich in einem RTF-File gefunden 399 aStrBuffer.Append(nNextCh); 400 break; 401 case '~': // nonbreaking space 402 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0)); 403 break; 404 case '-': // optional hyphen 405 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD)); 406 break; 407 case '_': // nonbreaking hyphen 408 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011)); 409 break; 410 411 case 'u': 412 // UNI-Code Zeichen lesen 413 { 414 nNextCh = GetNextChar(); 415 rInput.SeekRel( -2 ); 416 417 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) 418 { 419 bRTF_InTextRead = true; 420 421 String sSave( aToken ); 422 nNextCh = '\\'; 423 #ifdef DBG_UTIL 424 int nToken = 425 #endif 426 _GetNextToken(); 427 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); 428 // dont convert symbol chars 429 aStrBuffer.Append( 430 static_cast< sal_Unicode >(nTokenValue)); 431 432 // overread the next n "RTF" characters. This 433 // can be also \{, \}, \'88 434 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 435 { 436 sal_Unicode cAnsi = nNextCh; 437 while( 0xD == cAnsi ) 438 cAnsi = GetNextChar(); 439 while( 0xA == cAnsi ) 440 cAnsi = GetNextChar(); 441 442 if( '\\' == cAnsi && 443 '\'' == ( cAnsi = GetNextChar() )) 444 // HexValue ueberlesen 445 cAnsi = GetHexValue(); 446 nNextCh = GetNextChar(); 447 } 448 bNextCh = false; 449 aToken = sSave; 450 bRTF_InTextRead = false; 451 } 452 else 453 { 454 nNextCh = '\\'; 455 bWeiter = false; // Abbrechen, String zusammen 456 } 457 } 458 break; 459 460 default: 461 rInput.SeekRel( -1 ); 462 nNextCh = '\\'; 463 bWeiter = false; // Abbrechen, String zusammen 464 break; 465 } 466 } 467 break; 468 469 case sal_Unicode(EOF): 470 eState = SVPAR_ERROR; 471 // weiter 472 case '{': 473 case '}': 474 bWeiter = false; 475 break; 476 477 case 0x0a: 478 case 0x0d: 479 break; 480 481 default: 482 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN) 483 bWeiter = false; 484 else 485 { 486 do { 487 // alle anderen Zeichen kommen in den Text 488 aStrBuffer.Append(nNextCh); 489 490 if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) 491 { 492 if (aStrBuffer.Len()) 493 aToken += aStrBuffer; 494 return; 495 } 496 } while 497 ( 498 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) && 499 (aStrBuffer.Len() < MAX_STRING_LEN) 500 ); 501 bNextCh = false; 502 } 503 } 504 505 if( bWeiter && bNextCh ) 506 nNextCh = GetNextChar(); 507 } 508 509 if (aStrBuffer.Len()) 510 aToken += aStrBuffer; 511 } 512 513 514 short SvRTFParser::_inSkipGroup=0; 515 516 void SvRTFParser::SkipGroup() 517 { 518 short nBrackets=1; 519 if (_inSkipGroup>0) 520 return; 521 _inSkipGroup++; 522 #if 1 // #i16185# fecking \bin keyword 523 do 524 { 525 switch (nNextCh) 526 { 527 case '{': 528 ++nBrackets; 529 break; 530 case '}': 531 if (!--nBrackets) { 532 _inSkipGroup--; 533 return; 534 } 535 break; 536 } 537 int nToken = _GetNextToken(); 538 if (nToken == RTF_BIN) 539 { 540 rInput.SeekRel(-1); 541 rInput.SeekRel(nTokenValue); 542 nNextCh = GetNextChar(); 543 } 544 while (nNextCh==0xa || nNextCh==0xd) 545 { 546 nNextCh = GetNextChar(); 547 } 548 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking()); 549 #else 550 sal_Unicode cPrev = 0; 551 do { 552 switch( nNextCh ) 553 { 554 case '{': 555 if( '\\' != cPrev ) 556 ++nBrackets; 557 break; 558 559 case '}': 560 if( '\\' != cPrev && !--nBrackets ) 561 return; 562 break; 563 564 case '\\': 565 if( '\\' == cPrev ) 566 nNextCh = 0; 567 break; 568 } 569 cPrev = nNextCh; 570 nNextCh = GetNextChar(); 571 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); 572 #endif 573 574 if( SVPAR_PENDING != eState && '}' != nNextCh ) 575 eState = SVPAR_ERROR; 576 _inSkipGroup--; 577 } 578 579 void SvRTFParser::ReadUnknownData() { SkipGroup(); } 580 void SvRTFParser::ReadBitmapData() { SkipGroup(); } 581 void SvRTFParser::ReadOLEData() { SkipGroup(); } 582 583 584 SvParserState SvRTFParser::CallParser() 585 { 586 sal_Char cFirstCh; 587 nNextChPos = rInput.Tell(); 588 rInput >> cFirstCh; nNextCh = cFirstCh; 589 eState = SVPAR_WORKING; 590 nOpenBrakets = 0; 591 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 592 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet 593 594 // die 1. beiden Token muessen '{' und \\rtf sein !! 595 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) 596 { 597 AddRef(); 598 Continue( 0 ); 599 if( SVPAR_PENDING != eState ) 600 ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 601 } 602 else 603 eState = SVPAR_ERROR; 604 605 return eState; 606 } 607 608 void SvRTFParser::Continue( int nToken ) 609 { 610 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), 611 // "Zeichensatz wurde geaendert." ); 612 613 if( !nToken ) 614 nToken = GetNextToken(); 615 616 while( IsParserWorking() ) 617 { 618 SaveState( nToken ); 619 switch( nToken ) 620 { 621 case '}': 622 if( nOpenBrakets ) 623 goto NEXTTOKEN; 624 eState = SVPAR_ACCEPTED; 625 break; 626 627 case '{': 628 // eine unbekannte Gruppe ? 629 { 630 if( RTF_IGNOREFLAG != GetNextToken() ) 631 nToken = SkipToken( -1 ); 632 else if( RTF_UNKNOWNCONTROL != GetNextToken() ) 633 nToken = SkipToken( -2 ); 634 else 635 { 636 // gleich herausfiltern 637 ReadUnknownData(); 638 nToken = GetNextToken(); 639 if( '}' != nToken ) 640 eState = SVPAR_ERROR; 641 break; // auf zum naechsten Token!! 642 } 643 } 644 goto NEXTTOKEN; 645 646 case RTF_UNKNOWNCONTROL: 647 break; // unbekannte Token ueberspringen 648 case RTF_NEXTTYPE: 649 case RTF_ANSITYPE: 650 SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 651 break; 652 case RTF_MACTYPE: 653 SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN ); 654 break; 655 case RTF_PCTYPE: 656 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 ); 657 break; 658 case RTF_PCATYPE: 659 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 ); 660 break; 661 case RTF_ANSICPG: 662 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue); 663 SetEncoding(eCodeSet); 664 break; 665 default: 666 NEXTTOKEN: 667 NextToken( nToken ); 668 break; 669 } 670 if( IsParserWorking() ) 671 SaveState( 0 ); // bis hierhin abgearbeitet, 672 // weiter mit neuem Token! 673 nToken = GetNextToken(); 674 } 675 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) 676 eState = SVPAR_ERROR; 677 } 678 679 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc ) 680 { 681 if (eEnc == RTL_TEXTENCODING_DONTKNOW) 682 eEnc = GetCodeSet(); 683 684 if (!aParserStates.empty()) 685 aParserStates.top().eCodeSet = eEnc; 686 SetSrcEncoding(eEnc); 687 } 688 689 #ifdef USED 690 void SvRTFParser::SaveState( int nToken ) 691 { 692 SvParser::SaveState( nToken ); 693 } 694 695 void SvRTFParser::RestoreState() 696 { 697 SvParser::RestoreState(); 698 } 699 #endif 700 701 /* vim: set noet sw=4 ts=4: */ 702