1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_svtools.hxx" 30 31 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */ 32 33 #include <stdio.h> // for EOF 34 #include <rtl/tencinfo.h> 35 #include <tools/stream.hxx> 36 #include <tools/debug.hxx> 37 #include <svtools/rtftoken.h> 38 #include <svtools/rtfkeywd.hxx> 39 #include <svtools/parrtf.hxx> 40 41 const int MAX_STRING_LEN = 1024; 42 const int MAX_TOKEN_LEN = 128; 43 44 #define RTF_ISDIGIT( c ) (c >= '0' && c <= '9') 45 #define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 46 47 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize ) 48 : SvParser( rIn, nStackSize ), 49 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet 50 nUCharOverread( 1 ) 51 { 52 // default ist ANSI-CodeSet 53 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); 54 bRTF_InTextRead = false; 55 } 56 57 SvRTFParser::~SvRTFParser() 58 { 59 } 60 61 62 63 64 int SvRTFParser::_GetNextToken() 65 { 66 int nRet = 0; 67 do { 68 int bNextCh = true; 69 switch( nNextCh ) 70 { 71 case '\\': 72 { 73 // Steuerzeichen 74 switch( nNextCh = GetNextChar() ) 75 { 76 case '{': 77 case '}': 78 case '\\': 79 case '+': // habe ich in einem RTF-File gefunden 80 case '~': // nonbreaking space 81 case '-': // optional hyphen 82 case '_': // nonbreaking hyphen 83 case '\'': // HexValue 84 nNextCh = '\\'; 85 rInput.SeekRel( -1 ); 86 ScanText(); 87 nRet = RTF_TEXTTOKEN; 88 bNextCh = 0 == nNextCh; 89 break; 90 91 case '*': // ignoreflag 92 nRet = RTF_IGNOREFLAG; 93 break; 94 case ':': // subentry in an index entry 95 nRet = RTF_SUBENTRYINDEX; 96 break; 97 case '|': // formula-charakter 98 nRet = RTF_FORMULA; 99 break; 100 101 case 0x0a: 102 case 0x0d: 103 nRet = RTF_PAR; 104 break; 105 106 default: 107 if( RTF_ISALPHA( nNextCh ) ) 108 { 109 aToken = '\\'; 110 { 111 String aStrBuffer; 112 sal_Unicode* pStr = aStrBuffer.AllocBuffer( 113 MAX_TOKEN_LEN ); 114 xub_StrLen nStrLen = 0; 115 do { 116 *(pStr + nStrLen++) = nNextCh; 117 if( MAX_TOKEN_LEN == nStrLen ) 118 { 119 aToken += aStrBuffer; 120 aToken.GetBufferAccess(); // make unique string! 121 nStrLen = 0; 122 } 123 nNextCh = GetNextChar(); 124 } while( RTF_ISALPHA( nNextCh ) ); 125 if( nStrLen ) 126 { 127 aStrBuffer.ReleaseBufferAccess( nStrLen ); 128 aToken += aStrBuffer; 129 } 130 } 131 132 // Minus fuer numerischen Parameter 133 int bNegValue = false; 134 if( '-' == nNextCh ) 135 { 136 bNegValue = true; 137 nNextCh = GetNextChar(); 138 } 139 140 // evt. Numerischer Parameter 141 if( RTF_ISDIGIT( nNextCh ) ) 142 { 143 nTokenValue = 0; 144 do { 145 nTokenValue *= 10; 146 nTokenValue += nNextCh - '0'; 147 nNextCh = GetNextChar(); 148 } while( RTF_ISDIGIT( nNextCh ) ); 149 if( bNegValue ) 150 nTokenValue = -nTokenValue; 151 bTokenHasValue=true; 152 } 153 else if( bNegValue ) // das Minus wieder zurueck 154 { 155 nNextCh = '-'; 156 rInput.SeekRel( -1 ); 157 } 158 if( ' ' == nNextCh ) // Blank gehoert zum Token! 159 nNextCh = GetNextChar(); 160 161 // suche das Token in der Tabelle: 162 if( 0 == (nRet = GetRTFToken( aToken )) ) 163 // Unknown Control 164 nRet = RTF_UNKNOWNCONTROL; 165 166 // bug 76812 - unicode token handled as normal text 167 bNextCh = false; 168 switch( nRet ) 169 { 170 case RTF_UC: 171 if( 0 <= nTokenValue ) 172 { 173 nUCharOverread = (sal_uInt8)nTokenValue; 174 #if 1 175 //cmc: other ifdef breaks #i3584 176 aParserStates.top(). 177 nUCharOverread = nUCharOverread; 178 #else 179 if( !nUCharOverread ) 180 nUCharOverread = aParserStates.top().nUCharOverread; 181 else 182 aParserStates.top(). 183 nUCharOverread = nUCharOverread; 184 #endif 185 } 186 aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text 187 // read next token 188 nRet = 0; 189 break; 190 191 case RTF_UPR: 192 if (!_inSkipGroup) { 193 // UPR - overread the group with the ansi 194 // informations 195 while( '{' != _GetNextToken() ) 196 ; 197 SkipGroup(); 198 _GetNextToken(); // overread the last bracket 199 nRet = 0; 200 } 201 break; 202 203 case RTF_U: 204 if( !bRTF_InTextRead ) 205 { 206 nRet = RTF_TEXTTOKEN; 207 aToken = (sal_Unicode)nTokenValue; 208 209 // overread the next n "RTF" characters. This 210 // can be also \{, \}, \'88 211 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 212 { 213 sal_Unicode cAnsi = nNextCh; 214 while( 0xD == cAnsi ) 215 cAnsi = GetNextChar(); 216 while( 0xA == cAnsi ) 217 cAnsi = GetNextChar(); 218 219 if( '\\' == cAnsi && 220 '\'' == ( cAnsi = GetNextChar() )) 221 // HexValue ueberlesen 222 cAnsi = GetHexValue(); 223 nNextCh = GetNextChar(); 224 } 225 ScanText(); 226 bNextCh = 0 == nNextCh; 227 } 228 break; 229 } 230 } 231 else if( SVPAR_PENDING != eState ) 232 { 233 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen 234 // eState = SVPAR_ERROR; 235 bNextCh = false; 236 } 237 break; 238 } 239 } 240 break; 241 242 case sal_Unicode(EOF): 243 eState = SVPAR_ACCEPTED; 244 nRet = nNextCh; 245 break; 246 247 case '{': 248 { 249 if( 0 <= nOpenBrakets ) 250 { 251 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() ); 252 aParserStates.push( aState ); 253 } 254 ++nOpenBrakets; 255 DBG_ASSERT( 256 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 257 "ParserStateStack unequal to bracket count" ); 258 nRet = nNextCh; 259 } 260 break; 261 262 case '}': 263 --nOpenBrakets; 264 if( 0 <= nOpenBrakets ) 265 { 266 aParserStates.pop(); 267 if( !aParserStates.empty() ) 268 { 269 const RtfParserState_Impl& rRPS = 270 aParserStates.top(); 271 nUCharOverread = rRPS.nUCharOverread; 272 SetSrcEncoding( rRPS.eCodeSet ); 273 } 274 else 275 { 276 nUCharOverread = 1; 277 SetSrcEncoding( GetCodeSet() ); 278 } 279 } 280 DBG_ASSERT( 281 static_cast<size_t>(nOpenBrakets) == aParserStates.size(), 282 "ParserStateStack unequal to bracket count" ); 283 nRet = nNextCh; 284 break; 285 286 case 0x0d: 287 case 0x0a: 288 break; 289 290 default: 291 // es folgt normaler Text 292 ScanText(); 293 nRet = RTF_TEXTTOKEN; 294 bNextCh = 0 == nNextCh; 295 break; 296 } 297 298 if( bNextCh ) 299 nNextCh = GetNextChar(); 300 301 } while( !nRet && SVPAR_WORKING == eState ); 302 return nRet; 303 } 304 305 306 sal_Unicode SvRTFParser::GetHexValue() 307 { 308 // Hex-Wert sammeln 309 register int n; 310 register sal_Unicode nHexVal = 0; 311 312 for( n = 0; n < 2; ++n ) 313 { 314 nHexVal *= 16; 315 nNextCh = GetNextChar(); 316 if( nNextCh >= '0' && nNextCh <= '9' ) 317 nHexVal += (nNextCh - 48); 318 else if( nNextCh >= 'a' && nNextCh <= 'f' ) 319 nHexVal += (nNextCh - 87); 320 else if( nNextCh >= 'A' && nNextCh <= 'F' ) 321 nHexVal += (nNextCh - 55); 322 } 323 return nHexVal; 324 } 325 326 void SvRTFParser::ScanText( const sal_Unicode cBreak ) 327 { 328 String aStrBuffer; 329 int bWeiter = true; 330 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN) 331 { 332 int bNextCh = true; 333 switch( nNextCh ) 334 { 335 case '\\': 336 { 337 switch (nNextCh = GetNextChar()) 338 { 339 case '\'': 340 { 341 342 #if 0 343 // #i35653 patch from cmc 344 ByteString aByteString(static_cast<char>(GetHexValue())); 345 if (aByteString.Len()) 346 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 347 #else 348 ByteString aByteString; 349 while (1) 350 { 351 aByteString.Append((char)GetHexValue()); 352 353 bool bBreak = false; 354 sal_Char nSlash = '\\'; 355 while (!bBreak) 356 { 357 wchar_t __next=GetNextChar(); 358 if (__next>0xFF) // fix for #i43933# and #i35653# 359 { 360 if (aByteString.Len()) 361 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 362 aStrBuffer.Append((sal_Unicode)__next); 363 364 aByteString.Erase(); 365 continue; 366 } 367 nSlash = (sal_Char)__next; 368 while (nSlash == 0xD || nSlash == 0xA) 369 nSlash = (sal_Char)GetNextChar(); 370 371 switch (nSlash) 372 { 373 case '{': 374 case '}': 375 case '\\': 376 bBreak = true; 377 break; 378 default: 379 aByteString.Append(nSlash); 380 break; 381 } 382 } 383 384 nNextCh = GetNextChar(); 385 386 if (nSlash != '\\' || nNextCh != '\'') 387 { 388 rInput.SeekRel(-1); 389 nNextCh = nSlash; 390 break; 391 } 392 } 393 394 bNextCh = false; 395 396 if (aByteString.Len()) 397 aStrBuffer.Append(String(aByteString, GetSrcEncoding())); 398 #endif 399 } 400 break; 401 case '\\': 402 case '}': 403 case '{': 404 case '+': // habe ich in einem RTF-File gefunden 405 aStrBuffer.Append(nNextCh); 406 break; 407 case '~': // nonbreaking space 408 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0)); 409 break; 410 case '-': // optional hyphen 411 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD)); 412 break; 413 case '_': // nonbreaking hyphen 414 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011)); 415 break; 416 417 case 'u': 418 // UNI-Code Zeichen lesen 419 { 420 nNextCh = GetNextChar(); 421 rInput.SeekRel( -2 ); 422 423 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) 424 { 425 bRTF_InTextRead = true; 426 427 String sSave( aToken ); 428 nNextCh = '\\'; 429 #ifdef DBG_UTIL 430 int nToken = 431 #endif 432 _GetNextToken(); 433 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); 434 // dont convert symbol chars 435 aStrBuffer.Append( 436 static_cast< sal_Unicode >(nTokenValue)); 437 438 // overread the next n "RTF" characters. This 439 // can be also \{, \}, \'88 440 for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) 441 { 442 sal_Unicode cAnsi = nNextCh; 443 while( 0xD == cAnsi ) 444 cAnsi = GetNextChar(); 445 while( 0xA == cAnsi ) 446 cAnsi = GetNextChar(); 447 448 if( '\\' == cAnsi && 449 '\'' == ( cAnsi = GetNextChar() )) 450 // HexValue ueberlesen 451 cAnsi = GetHexValue(); 452 nNextCh = GetNextChar(); 453 } 454 bNextCh = false; 455 aToken = sSave; 456 bRTF_InTextRead = false; 457 } 458 else 459 { 460 nNextCh = '\\'; 461 bWeiter = false; // Abbrechen, String zusammen 462 } 463 } 464 break; 465 466 default: 467 rInput.SeekRel( -1 ); 468 nNextCh = '\\'; 469 bWeiter = false; // Abbrechen, String zusammen 470 break; 471 } 472 } 473 break; 474 475 case sal_Unicode(EOF): 476 eState = SVPAR_ERROR; 477 // weiter 478 case '{': 479 case '}': 480 bWeiter = false; 481 break; 482 483 case 0x0a: 484 case 0x0d: 485 break; 486 487 default: 488 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN) 489 bWeiter = false; 490 else 491 { 492 do { 493 // alle anderen Zeichen kommen in den Text 494 aStrBuffer.Append(nNextCh); 495 496 if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) 497 { 498 if (aStrBuffer.Len()) 499 aToken += aStrBuffer; 500 return; 501 } 502 } while 503 ( 504 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) && 505 (aStrBuffer.Len() < MAX_STRING_LEN) 506 ); 507 bNextCh = false; 508 } 509 } 510 511 if( bWeiter && bNextCh ) 512 nNextCh = GetNextChar(); 513 } 514 515 if (aStrBuffer.Len()) 516 aToken += aStrBuffer; 517 } 518 519 520 short SvRTFParser::_inSkipGroup=0; 521 522 void SvRTFParser::SkipGroup() 523 { 524 short nBrackets=1; 525 if (_inSkipGroup>0) 526 return; 527 _inSkipGroup++; 528 #if 1 //#i16185# fecking \bin keyword 529 do 530 { 531 switch (nNextCh) 532 { 533 case '{': 534 ++nBrackets; 535 break; 536 case '}': 537 if (!--nBrackets) { 538 _inSkipGroup--; 539 return; 540 } 541 break; 542 } 543 int nToken = _GetNextToken(); 544 if (nToken == RTF_BIN) 545 { 546 rInput.SeekRel(-1); 547 rInput.SeekRel(nTokenValue); 548 nNextCh = GetNextChar(); 549 } 550 while (nNextCh==0xa || nNextCh==0xd) 551 { 552 nNextCh = GetNextChar(); 553 } 554 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking()); 555 #else 556 sal_Unicode cPrev = 0; 557 do { 558 switch( nNextCh ) 559 { 560 case '{': 561 if( '\\' != cPrev ) 562 ++nBrackets; 563 break; 564 565 case '}': 566 if( '\\' != cPrev && !--nBrackets ) 567 return; 568 break; 569 570 case '\\': 571 if( '\\' == cPrev ) 572 nNextCh = 0; 573 break; 574 } 575 cPrev = nNextCh; 576 nNextCh = GetNextChar(); 577 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); 578 #endif 579 580 if( SVPAR_PENDING != eState && '}' != nNextCh ) 581 eState = SVPAR_ERROR; 582 _inSkipGroup--; 583 } 584 585 void SvRTFParser::ReadUnknownData() { SkipGroup(); } 586 void SvRTFParser::ReadBitmapData() { SkipGroup(); } 587 void SvRTFParser::ReadOLEData() { SkipGroup(); } 588 589 590 SvParserState SvRTFParser::CallParser() 591 { 592 sal_Char cFirstCh; 593 nNextChPos = rInput.Tell(); 594 rInput >> cFirstCh; nNextCh = cFirstCh; 595 eState = SVPAR_WORKING; 596 nOpenBrakets = 0; 597 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 598 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet 599 600 // die 1. beiden Token muessen '{' und \\rtf sein !! 601 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) 602 { 603 AddRef(); 604 Continue( 0 ); 605 if( SVPAR_PENDING != eState ) 606 ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 607 } 608 else 609 eState = SVPAR_ERROR; 610 611 return eState; 612 } 613 614 void SvRTFParser::Continue( int nToken ) 615 { 616 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), 617 // "Zeichensatz wurde geaendert." ); 618 619 if( !nToken ) 620 nToken = GetNextToken(); 621 622 while( IsParserWorking() ) 623 { 624 SaveState( nToken ); 625 switch( nToken ) 626 { 627 case '}': 628 if( nOpenBrakets ) 629 goto NEXTTOKEN; 630 eState = SVPAR_ACCEPTED; 631 break; 632 633 case '{': 634 // eine unbekannte Gruppe ? 635 { 636 if( RTF_IGNOREFLAG != GetNextToken() ) 637 nToken = SkipToken( -1 ); 638 else if( RTF_UNKNOWNCONTROL != GetNextToken() ) 639 nToken = SkipToken( -2 ); 640 else 641 { 642 // gleich herausfiltern 643 ReadUnknownData(); 644 nToken = GetNextToken(); 645 if( '}' != nToken ) 646 eState = SVPAR_ERROR; 647 break; // auf zum naechsten Token!! 648 } 649 } 650 goto NEXTTOKEN; 651 652 case RTF_UNKNOWNCONTROL: 653 break; // unbekannte Token ueberspringen 654 case RTF_NEXTTYPE: 655 case RTF_ANSITYPE: 656 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); 657 break; 658 case RTF_MACTYPE: 659 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN ); 660 break; 661 case RTF_PCTYPE: 662 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 ); 663 break; 664 case RTF_PCATYPE: 665 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 ); 666 break; 667 case RTF_ANSICPG: 668 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue); 669 SetSrcEncoding(eCodeSet); 670 break; 671 default: 672 NEXTTOKEN: 673 NextToken( nToken ); 674 break; 675 } 676 if( IsParserWorking() ) 677 SaveState( 0 ); // bis hierhin abgearbeitet, 678 // weiter mit neuem Token! 679 nToken = GetNextToken(); 680 } 681 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) 682 eState = SVPAR_ERROR; 683 } 684 685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc ) 686 { 687 if (eEnc == RTL_TEXTENCODING_DONTKNOW) 688 eEnc = GetCodeSet(); 689 690 if (!aParserStates.empty()) 691 aParserStates.top().eCodeSet = eEnc; 692 SetSrcEncoding(eEnc); 693 } 694 695 #ifdef USED 696 void SvRTFParser::SaveState( int nToken ) 697 { 698 SvParser::SaveState( nToken ); 699 } 700 701 void SvRTFParser::RestoreState() 702 { 703 SvParser::RestoreState(); 704 } 705 #endif 706 707 /* vi:set tabstop=4 shiftwidth=4 expandtab: */ 708