1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_svtools.hxx" 26 27 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */ 28 29 #include <stdio.h> // for EOF 30 #include <rtl/tencinfo.h> 31 #include <tools/stream.hxx> 32 #include <tools/debug.hxx> 33 #include <svtools/rtftoken.h> 34 #include <svtools/rtfkeywd.hxx> 35 #include <svtools/parrtf.hxx> 36 37 const int MAX_STRING_LEN = 1024; 38 const int MAX_TOKEN_LEN = 128; 39 40 #define RTF_ISDIGIT( c ) (c >= '0' && c <= '9') 41 #define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 42 43 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize ) 44 : SvParser( rIn, nStackSize ), 45 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet 46 nUCharOverread( 1 ) 47 { 48 // default ist ANSI-CodeSet 49 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); 50 bRTF_InTextRead = false; 51 } 52 53 SvRTFParser::~SvRTFParser() 54 { 55 } 56 57 58 59 60 int SvRTFParser::_GetNextToken() 61 { 62 int nRet = 0; 63 do { 64 int bNextCh = true; 65 switch( nNextCh ) 66 { 67 case '\\': 68 { 69 // Steuerzeichen 70 switch( nNextCh = GetNextChar() ) 71 { 72 case '{': 73 case '}': 74 case '\\': 75 case '+': // habe ich in einem RTF-File gefunden 76 case '~': // nonbreaking space 77 case '-': // optional hyphen 78 case '_': // nonbreaking hyphen 79 case '\'': // HexValue 80 nNextCh = '\\'; 81 rInput.SeekRel( -1 ); 82 ScanText(); 83 nRet = RTF_TEXTTOKEN; 84 bNextCh = 0 == nNextCh; 85 break; 86 87 case '*': // ignoreflag 88 nRet = RTF_IGNOREFLAG; 89 break; 90 case ':': // subentry in an index entry 91 nRet = RTF_SUBENTRYINDEX; 92 break; 93 case '|': // formula-charakter 94 nRet = RTF_FORMULA; 95 break; 96 97 case 0x0a: 98 case 0x0d: 99 nRet = RTF_PAR; 100 break; 101 102 default: 103 if( RTF_ISALPHA( nNextCh ) ) 104 { 105 aToken = '\\'; 106 { 107 String aStrBuffer; 108 sal_Unicode* pStr = aStrBuffer.AllocBuffer( 109 MAX_TOKEN_LEN ); 110 xub_StrLen nStrLen = 0; 111 do { 112 *(pStr + nStrLen++) = nNextCh; 113 if( MAX_TOKEN_LEN == nStrLen ) 114 { 115 aToken += aStrBuffer; 116 aToken.GetBufferAccess(); // make unique string! 117 nStrLen = 0; 118 } 119 nNextCh = GetNextChar(); 120 } while( RTF_ISALPHA( nNextCh ) ); 121 if( nStrLen ) 122 { 123 aStrBuffer.ReleaseBufferAccess( nStrLen ); 124 aToken += aStrBuffer; 125 } 126 } 127 128 // Minus fuer numerischen Parameter 129 int bNegValue = false; 130 if( '-' == nNextCh ) 131 { 132 bNegValue = true; 133 nNextCh = GetNextChar(); 134 } 135 136 // evt. Numerischer Parameter 137 if( RTF_ISDIGIT( nNextCh ) ) 138 { 139 nTokenValue = 0; 140 do { 141 nTokenValue *= 10; 142 nTokenValue += nNextCh - '0'; 143 nNextCh = GetNextChar(); 144 } while( RTF_ISDIGIT( nNextCh ) ); 145 if( bNegValue ) 146 nTokenValue = -nTokenValue; 147 bTokenHasValue=true; 148 } 149 else if( bNegValue ) // das Minus wieder zurueck 150 { 151 nNextCh = '-'; 152 rInput.SeekRel( -1 ); 153 } 154 if( ' ' == nNextCh ) // Blank gehoert zum Token! 155 nNextCh = GetNextChar(); 156 157 // suche das Token in der Tabelle: 158 if( 0 == (nRet = GetRTFToken( aToken )) ) 159 // Unknown Control 160 nRet = RTF_UNKNOWNCONTROL; 161 162 // bug 76812 - unicode token handled as normal text 163 bNextCh = false; 164 switch( nRet ) 165 { 166 case RTF_UC: 167 if( 0 <= nTokenValue ) 168 { 169 nUCharOverread = (sal_uInt8)nTokenValue; 170 #if 1 171 //cmc: other ifdef breaks #i3584 172 aParserStates.top(). 173 nUCharOverread = nUCharOverread; 174 #else 175 if( !nUCharOverread ) 176 nUCharOverread = aParserStates.top().nUCharOverread; 177 else 178 aParserStates.top(). 179 nUCharOverread = nUCharOverread; 180 #endif 181 } 182 aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text 183 // read next token 184 nRet = 0; 185 break; 186 187 case RTF_UPR: 188 if (!_inSkipGroup) { 189 // UPR - overread the group with the ansi 190 // informations 191 while( '{' != _GetNextToken() ) 192 ; 193 SkipGroup(); 194 _GetNextToken(); // overread the last bracket 195 nRet = 0; 196 } 197 break; 198 199 case RTF_U: 200 if( !bRTF_InTextRead ) 201 { 202 nRet = RTF_TEXTTOKEN; 203 aToken = (sal_Unicode)nTokenValue; 204 205 // overread the next n "RTF" characters. This 206 // can be also \{, \}, \'88 207 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 208 { 209 sal_Unicode cAnsi = nNextCh; 210 while( 0xD == cAnsi ) 211 cAnsi = GetNextChar(); 212 while( 0xA == cAnsi ) 213 cAnsi = GetNextChar(); 214 215 if( '\\' == cAnsi && 216 '\'' == ( cAnsi = GetNextChar() )) 217 // HexValue ueberlesen 218 cAnsi = GetHexValue(); 219 nNextCh = GetNextChar(); 220 } 221 ScanText(); 222 bNextCh = 0 == nNextCh; 223 } 224 break; 225 } 226 } 227 else if( SVPAR_PENDING != eState ) 228 { 229 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen 230 // eState = SVPAR_ERROR; 231 bNextCh = false; 232 } 233 break; 234 } 235 } 236 break; 237 238 case sal_Unicode(EOF): 239 eState = SVPAR_ACCEPTED; 240 nRet = nNextCh; 241 break; 242 243 case '{': 244 { 245 if( 0 <= nOpenBrakets ) 246 { 247 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() ); 248 aParserStates.push( aState ); 249 } 250 ++nOpenBrakets; 251 DBG_ASSERT( 252 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 253 "ParserStateStack unequal to bracket count" ); 254 nRet = nNextCh; 255 } 256 break; 257 258 case '}': 259 --nOpenBrakets; 260 if( 0 <= nOpenBrakets ) 261 { 262 aParserStates.pop(); 263 if( !aParserStates.empty() ) 264 { 265 const RtfParserState_Impl& rRPS = 266 aParserStates.top(); 267 nUCharOverread = rRPS.nUCharOverread; 268 SetSrcEncoding( rRPS.eCodeSet ); 269 } 270 else 271 { 272 nUCharOverread = 1; 273 SetSrcEncoding( GetCodeSet() ); 274 } 275 } 276 DBG_ASSERT( 277 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 278 "ParserStateStack unequal to bracket count" ); 279 nRet = nNextCh; 280 break; 281 282 case 0x0d: 283 case 0x0a: 284 break; 285 286 default: 287 // es folgt normaler Text 288 ScanText(); 289 nRet = RTF_TEXTTOKEN; 290 bNextCh = 0 == nNextCh; 291 break; 292 } 293 294 if( bNextCh ) 295 nNextCh = GetNextChar(); 296 297 } while( !nRet && SVPAR_WORKING == eState ); 298 return nRet; 299 } 300 301 302 sal_Unicode SvRTFParser::GetHexValue() 303 { 304 // Hex-Wert sammeln 305 register int n; 306 register sal_Unicode nHexVal = 0; 307 308 for( n = 0; n < 2; ++n ) 309 { 310 nHexVal *= 16; 311 nNextCh = GetNextChar(); 312 if( nNextCh >= '0' && nNextCh <= '9' ) 313 nHexVal += (nNextCh - 48); 314 else if( nNextCh >= 'a' && nNextCh <= 'f' ) 315 nHexVal += (nNextCh - 87); 316 else if( nNextCh >= 'A' && nNextCh <= 'F' ) 317 nHexVal += (nNextCh - 55); 318 } 319 return nHexVal; 320 } 321 322 void SvRTFParser::ScanText( const sal_Unicode cBreak ) 323 { 324 String aStrBuffer; 325 int bWeiter = true; 326 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN) 327 { 328 int bNextCh = true; 329 switch( nNextCh ) 330 { 331 case '\\': 332 { 333 switch (nNextCh = GetNextChar()) 334 { 335 case '\'': 336 { 337 338 #if 0 339 // #i35653 patch from cmc 340 ByteString aByteString(static_cast<char>(GetHexValue())); 341 if (aByteString.Len()) 342 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 343 #else 344 ByteString aByteString; 345 while (1) 346 { 347 aByteString.Append((char)GetHexValue()); 348 349 bool bBreak = false; 350 sal_Char nSlash = '\\'; 351 while (!bBreak) 352 { 353 wchar_t __next=GetNextChar(); 354 if (__next>0xFF) // fix for #i43933# and #i35653# 355 { 356 if (aByteString.Len()) 357 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 358 aStrBuffer.Append((sal_Unicode)__next); 359 360 aByteString.Erase(); 361 continue; 362 } 363 nSlash = (sal_Char)__next; 364 while (nSlash == 0xD || nSlash == 0xA) 365 nSlash = (sal_Char)GetNextChar(); 366 367 switch (nSlash) 368 { 369 case '{': 370 case '}': 371 case '\\': 372 bBreak = true; 373 break; 374 default: 375 aByteString.Append(nSlash); 376 break; 377 } 378 } 379 380 nNextCh = GetNextChar(); 381 382 if (nSlash != '\\' || nNextCh != '\'') 383 { 384 rInput.SeekRel(-1); 385 nNextCh = nSlash; 386 break; 387 } 388 } 389 390 bNextCh = false; 391 392 if (aByteString.Len()) 393 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 394 #endif 395 } 396 break; 397 case '\\': 398 case '}': 399 case '{': 400 case '+': // habe ich in einem RTF-File gefunden 401 aStrBuffer.Append(nNextCh); 402 break; 403 case '~': // nonbreaking space 404 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0)); 405 break; 406 case '-': // optional hyphen 407 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD)); 408 break; 409 case '_': // nonbreaking hyphen 410 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011)); 411 break; 412 413 case 'u': 414 // UNI-Code Zeichen lesen 415 { 416 nNextCh = GetNextChar(); 417 rInput.SeekRel( -2 ); 418 419 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) 420 { 421 bRTF_InTextRead = true; 422 423 String sSave( aToken ); 424 nNextCh = '\\'; 425 #ifdef DBG_UTIL 426 int nToken = 427 #endif 428 _GetNextToken(); 429 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); 430 // dont convert symbol chars 431 aStrBuffer.Append( 432 static_cast< sal_Unicode >(nTokenValue)); 433 434 // overread the next n "RTF" characters. This 435 // can be also \{, \}, \'88 436 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 437 { 438 sal_Unicode cAnsi = nNextCh; 439 while( 0xD == cAnsi ) 440 cAnsi = GetNextChar(); 441 while( 0xA == cAnsi ) 442 cAnsi = GetNextChar(); 443 444 if( '\\' == cAnsi && 445 '\'' == ( cAnsi = GetNextChar() )) 446 // HexValue ueberlesen 447 cAnsi = GetHexValue(); 448 nNextCh = GetNextChar(); 449 } 450 bNextCh = false; 451 aToken = sSave; 452 bRTF_InTextRead = false; 453 } 454 else 455 { 456 nNextCh = '\\'; 457 bWeiter = false; // Abbrechen, String zusammen 458 } 459 } 460 break; 461 462 default: 463 rInput.SeekRel( -1 ); 464 nNextCh = '\\'; 465 bWeiter = false; // Abbrechen, String zusammen 466 break; 467 } 468 } 469 break; 470 471 case sal_Unicode(EOF): 472 eState = SVPAR_ERROR; 473 // weiter 474 case '{': 475 case '}': 476 bWeiter = false; 477 break; 478 479 case 0x0a: 480 case 0x0d: 481 break; 482 483 default: 484 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN) 485 bWeiter = false; 486 else 487 { 488 do { 489 // alle anderen Zeichen kommen in den Text 490 aStrBuffer.Append(nNextCh); 491 492 if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) 493 { 494 if (aStrBuffer.Len()) 495 aToken += aStrBuffer; 496 return; 497 } 498 } while 499 ( 500 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) && 501 (aStrBuffer.Len() < MAX_STRING_LEN) 502 ); 503 bNextCh = false; 504 } 505 } 506 507 if( bWeiter && bNextCh ) 508 nNextCh = GetNextChar(); 509 } 510 511 if (aStrBuffer.Len()) 512 aToken += aStrBuffer; 513 } 514 515 516 short SvRTFParser::_inSkipGroup=0; 517 518 void SvRTFParser::SkipGroup() 519 { 520 short nBrackets=1; 521 if (_inSkipGroup>0) 522 return; 523 _inSkipGroup++; 524 #if 1 //#i16185# fecking \bin keyword 525 do 526 { 527 switch (nNextCh) 528 { 529 case '{': 530 ++nBrackets; 531 break; 532 case '}': 533 if (!--nBrackets) { 534 _inSkipGroup--; 535 return; 536 } 537 break; 538 } 539 int nToken = _GetNextToken(); 540 if (nToken == RTF_BIN) 541 { 542 rInput.SeekRel(-1); 543 rInput.SeekRel(nTokenValue); 544 nNextCh = GetNextChar(); 545 } 546 while (nNextCh==0xa || nNextCh==0xd) 547 { 548 nNextCh = GetNextChar(); 549 } 550 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking()); 551 #else 552 sal_Unicode cPrev = 0; 553 do { 554 switch( nNextCh ) 555 { 556 case '{': 557 if( '\\' != cPrev ) 558 ++nBrackets; 559 break; 560 561 case '}': 562 if( '\\' != cPrev && !--nBrackets ) 563 return; 564 break; 565 566 case '\\': 567 if( '\\' == cPrev ) 568 nNextCh = 0; 569 break; 570 } 571 cPrev = nNextCh; 572 nNextCh = GetNextChar(); 573 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); 574 #endif 575 576 if( SVPAR_PENDING != eState && '}' != nNextCh ) 577 eState = SVPAR_ERROR; 578 _inSkipGroup--; 579 } 580 581 void SvRTFParser::ReadUnknownData() { SkipGroup(); } 582 void SvRTFParser::ReadBitmapData() { SkipGroup(); } 583 void SvRTFParser::ReadOLEData() { SkipGroup(); } 584 585 586 SvParserState SvRTFParser::CallParser() 587 { 588 sal_Char cFirstCh; 589 nNextChPos = rInput.Tell(); 590 rInput >> cFirstCh; nNextCh = cFirstCh; 591 eState = SVPAR_WORKING; 592 nOpenBrakets = 0; 593 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 594 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet 595 596 // die 1. beiden Token muessen '{' und \\rtf sein !! 597 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) 598 { 599 AddRef(); 600 Continue( 0 ); 601 if( SVPAR_PENDING != eState ) 602 ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 603 } 604 else 605 eState = SVPAR_ERROR; 606 607 return eState; 608 } 609 610 void SvRTFParser::Continue( int nToken ) 611 { 612 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), 613 // "Zeichensatz wurde geaendert." ); 614 615 if( !nToken ) 616 nToken = GetNextToken(); 617 618 while( IsParserWorking() ) 619 { 620 SaveState( nToken ); 621 switch( nToken ) 622 { 623 case '}': 624 if( nOpenBrakets ) 625 goto NEXTTOKEN; 626 eState = SVPAR_ACCEPTED; 627 break; 628 629 case '{': 630 // eine unbekannte Gruppe ? 631 { 632 if( RTF_IGNOREFLAG != GetNextToken() ) 633 nToken = SkipToken( -1 ); 634 else if( RTF_UNKNOWNCONTROL != GetNextToken() ) 635 nToken = SkipToken( -2 ); 636 else 637 { 638 // gleich herausfiltern 639 ReadUnknownData(); 640 nToken = GetNextToken(); 641 if( '}' != nToken ) 642 eState = SVPAR_ERROR; 643 break; // auf zum naechsten Token!! 644 } 645 } 646 goto NEXTTOKEN; 647 648 case RTF_UNKNOWNCONTROL: 649 break; // unbekannte Token ueberspringen 650 case RTF_NEXTTYPE: 651 case RTF_ANSITYPE: 652 SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 653 break; 654 case RTF_MACTYPE: 655 SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN ); 656 break; 657 case RTF_PCTYPE: 658 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 ); 659 break; 660 case RTF_PCATYPE: 661 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 ); 662 break; 663 case RTF_ANSICPG: 664 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue); 665 SetEncoding(eCodeSet); 666 break; 667 default: 668 NEXTTOKEN: 669 NextToken( nToken ); 670 break; 671 } 672 if( IsParserWorking() ) 673 SaveState( 0 ); // bis hierhin abgearbeitet, 674 // weiter mit neuem Token! 675 nToken = GetNextToken(); 676 } 677 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) 678 eState = SVPAR_ERROR; 679 } 680 681 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc ) 682 { 683 if (eEnc == RTL_TEXTENCODING_DONTKNOW) 684 eEnc = GetCodeSet(); 685 686 if (!aParserStates.empty()) 687 aParserStates.top().eCodeSet = eEnc; 688 SetSrcEncoding(eEnc); 689 } 690 691 #ifdef USED 692 void SvRTFParser::SaveState( int nToken ) 693 { 694 SvParser::SaveState( nToken ); 695 } 696 697 void SvRTFParser::RestoreState() 698 { 699 SvParser::RestoreState(); 700 } 701 #endif 702 703 /* vi:set tabstop=4 shiftwidth=4 expandtab: */ 704