xref: /trunk/main/svtools/source/svrtf/parrtf.cxx (revision ce48dd1f)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 #include <stdio.h> // for EOF
28 #include <rtl/tencinfo.h>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/rtfkeywd.hxx>
33 #include <svtools/parrtf.hxx>
34 
// Upper bound on the number of characters ScanText() gathers into one
// text chunk before it hands the text back to the caller.
const int MAX_STRING_LEN = 1024;
// Size of the scratch buffer _GetNextToken() uses while collecting the
// letters of a control word.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification. The argument is fully parenthesised
// so that expression arguments (e.g. "c | 0") are classified correctly.
// Note: the argument is evaluated more than once - do not pass expressions
// with side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
40 
SvRTFParser(SvStream & rIn,sal_uInt8 nStackSize)41 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
42 	: SvParser( rIn, nStackSize ),
43 	eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default is ANSI code set
44 	nUCharOverread( 1 )
45 {
46 	// default is ANSI code set
47 	SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
48 	bRTF_InTextRead = false;
49 }
50 
~SvRTFParser()51 SvRTFParser::~SvRTFParser()
52 {
53 }
54 
55 
56 
57 
_GetNextToken()58 int SvRTFParser::_GetNextToken()
59 {
60 	int nRet = 0;
61 	do {
62 		int bNextCh = true;
63 		switch( nNextCh )
64 		{
65 		case '\\':
66 			{
67 				// Steuerzeichen
68 				switch( nNextCh = GetNextChar() )
69 				{
70 				case '{':
71 				case '}':
72 				case '\\':
73 				case '+':		// habe ich in einem RTF-File gefunden
74 				case '~':		// nonbreaking space
75 				case '-':		// optional hyphen
76 				case '_':		// nonbreaking hyphen
77 				case '\'':		// HexValue
78 					nNextCh = '\\';
79 					rInput.SeekRel( -1 );
80 					ScanText();
81 					nRet = RTF_TEXTTOKEN;
82 					bNextCh = 0 == nNextCh;
83 					break;
84 
85 				case '*':		// ignoreflag
86 					nRet = RTF_IGNOREFLAG;
87 					break;
88 				case ':':	 	// subentry in an index entry
89 					nRet = RTF_SUBENTRYINDEX;
90 					break;
91 				case '|':		// formula-character
92 					nRet = RTF_FORMULA;
93 					break;
94 
95 				case 0x0a:
96 				case 0x0d:
97 					nRet = RTF_PAR;
98 					break;
99 
100 				default:
101 					if( RTF_ISALPHA( nNextCh ) )
102 					{
103 						aToken = '\\';
104 						{
105 							String aStrBuffer;
106 							sal_Unicode* pStr = aStrBuffer.AllocBuffer(
107 															MAX_TOKEN_LEN );
108 							xub_StrLen nStrLen = 0;
109 							do {
110 								*(pStr + nStrLen++) = nNextCh;
111 								if( MAX_TOKEN_LEN == nStrLen )
112 								{
113 									aToken += aStrBuffer;
114 									aToken.GetBufferAccess(); // make unique string!
115 									nStrLen = 0;
116 								}
117 								nNextCh = GetNextChar();
118 							} while( RTF_ISALPHA( nNextCh ) );
119 							if( nStrLen )
120 							{
121 								aStrBuffer.ReleaseBufferAccess( nStrLen );
122 								aToken += aStrBuffer;
123 							}
124 						}
125 
126 						// Minus fuer numerischen Parameter
127 						int bNegValue = false;
128 						if( '-' == nNextCh )
129 						{
130 							bNegValue = true;
131 							nNextCh = GetNextChar();
132 						}
133 
134 						// evt. Numerischer Parameter
135 						if( RTF_ISDIGIT( nNextCh ) )
136 						{
137 							nTokenValue = 0;
138 							do {
139 								nTokenValue *= 10;
140 								nTokenValue += nNextCh - '0';
141 								nNextCh = GetNextChar();
142 							} while( RTF_ISDIGIT( nNextCh ) );
143 							if( bNegValue )
144 								nTokenValue = -nTokenValue;
145 							bTokenHasValue=true;
146 						}
147 						else if( bNegValue ) // das Minus wieder zurueck
148 						{
149 							nNextCh = '-';
150 							rInput.SeekRel( -1 );
151 						}
152 						if( ' ' == nNextCh ) // Blank gehoert zum Token!
153 							nNextCh = GetNextChar();
154 
155 						// suche das Token in der Tabelle:
156 						if( 0 == (nRet = GetRTFToken( aToken )) )
157 							// Unknown Control
158 							nRet = RTF_UNKNOWNCONTROL;
159 
160 						// bug 76812 - unicode token handled as normal text
161 						bNextCh = false;
162 						switch( nRet )
163 						{
164 						case RTF_UC:
165 							if( 0 <= nTokenValue )
166 							{
167 								nUCharOverread = (sal_uInt8)nTokenValue;
168 #if 1
169 								// cmc: other ifdef breaks #i3584
170 								aParserStates.top().
171 									nUCharOverread = nUCharOverread;
172 #else
173 								if( !nUCharOverread )
174 									nUCharOverread = aParserStates.top().nUCharOverread;
175 								else
176 									aParserStates.top().
177 										nUCharOverread = nUCharOverread;
178 #endif
179 							}
180 							aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text
181 							// read next token
182 							nRet = 0;
183 							break;
184 
185 						case RTF_UPR:
186 							if (!_inSkipGroup) {
187 							// UPR - overread the group with the ansi
188 							//       information
189 							while( '{' != _GetNextToken() )
190 								;
191 							SkipGroup();
192 							_GetNextToken(); // overread the last bracket
193 							nRet = 0;
194 							}
195 							break;
196 
197 						case RTF_U:
198 							if( !bRTF_InTextRead )
199 							{
200 								nRet = RTF_TEXTTOKEN;
201 								aToken = (sal_Unicode)nTokenValue;
202 
203 								// overread the next n "RTF" characters. This
204 								// can be also \{, \}, \'88
205 								for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
206 								{
207 									sal_Unicode cAnsi = nNextCh;
208 									while( 0xD == cAnsi )
209 										cAnsi = GetNextChar();
210 									while( 0xA == cAnsi )
211 										cAnsi = GetNextChar();
212 
213 									if( '\\' == cAnsi &&
214 										'\'' == ( cAnsi = GetNextChar() ))
215 										// HexValue ueberlesen
216 										cAnsi = GetHexValue();
217 									nNextCh = GetNextChar();
218 								}
219 								ScanText();
220 								bNextCh = 0 == nNextCh;
221 							}
222 							break;
223 						}
224 					}
225 					else if( SVPAR_PENDING != eState )
226 					{
227 						// Bug 34631 - "\ " ueberlesen - Blank als Zeichen
228 						// eState = SVPAR_ERROR;
229 						bNextCh = false;
230 					}
231 					break;
232 				}
233 			}
234 			break;
235 
236 		case sal_Unicode(EOF):
237 			eState = SVPAR_ACCEPTED;
238 			nRet = nNextCh;
239 			break;
240 
241 		case '{':
242 			{
243 				if( 0 <= nOpenBrakets )
244 				{
245 					RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
246 					aParserStates.push( aState );
247 				}
248 				++nOpenBrakets;
249 				DBG_ASSERT(
250 					static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
251 					"ParserStateStack unequal to bracket count" );
252 				nRet = nNextCh;
253 			}
254 			break;
255 
256 		case '}':
257 			--nOpenBrakets;
258 			if( 0 <= nOpenBrakets )
259 			{
260 				aParserStates.pop();
261 				if( !aParserStates.empty() )
262 				{
263 					const RtfParserState_Impl& rRPS =
264 							aParserStates.top();
265 					nUCharOverread = rRPS.nUCharOverread;
266 					SetSrcEncoding( rRPS.eCodeSet );
267 				}
268 				else
269 				{
270 					nUCharOverread = 1;
271 					SetSrcEncoding( GetCodeSet() );
272 				}
273 			}
274 			DBG_ASSERT(
275 				static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
276 				"ParserStateStack unequal to bracket count" );
277 			nRet = nNextCh;
278 			break;
279 
280 		case 0x0d:
281 		case 0x0a:
282 			break;
283 
284 		default:
285 			// es folgt normaler Text
286 			ScanText();
287 			nRet = RTF_TEXTTOKEN;
288 			bNextCh = 0 == nNextCh;
289 			break;
290 		}
291 
292 		if( bNextCh )
293 			nNextCh = GetNextChar();
294 
295 	} while( !nRet && SVPAR_WORKING == eState );
296 	return nRet;
297 }
298 
299 
GetHexValue()300 sal_Unicode SvRTFParser::GetHexValue()
301 {
302 	// Hex-Wert sammeln
303 	register int n;
304 	register sal_Unicode nHexVal = 0;
305 
306 	for( n = 0; n < 2; ++n )
307 	{
308 		nHexVal *= 16;
309 		nNextCh = GetNextChar();
310 		if( nNextCh >= '0' && nNextCh <= '9' )
311 			nHexVal += (nNextCh - 48);
312 		else if( nNextCh >= 'a' && nNextCh <= 'f' )
313 			nHexVal += (nNextCh - 87);
314 		else if( nNextCh >= 'A' && nNextCh <= 'F' )
315 			nHexVal += (nNextCh - 55);
316 	}
317 	return nHexVal;
318 }
319 
ScanText(const sal_Unicode cBreak)320 void SvRTFParser::ScanText( const sal_Unicode cBreak )
321 {
322 	String aStrBuffer;
323 	int bWeiter = true;
324 	while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
325 	{
326 		int bNextCh = true;
327 		switch( nNextCh )
328 		{
329 		case '\\':
330 			{
331 				switch (nNextCh = GetNextChar())
332 				{
333 				case '\'':
334 					{
335 
336 #if 0
337 						// #i35653 patch from cmc
338 						ByteString aByteString(static_cast<char>(GetHexValue()));
339 						if (aByteString.Len())
340 							aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
341 #else
342 						ByteString aByteString;
343 						while (1)
344 						{
345 							aByteString.Append((char)GetHexValue());
346 
347 							bool bBreak = false;
348 							sal_Char nSlash = '\\';
349 							while (!bBreak)
350 							{
351 								wchar_t __next=GetNextChar();
352 								if (__next>0xFF) // fix for #i43933# and #i35653#
353 								{
354 									if (aByteString.Len())
355 										aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
356 									aStrBuffer.Append((sal_Unicode)__next);
357 
358 									aByteString.Erase();
359 									continue;
360 								}
361 								nSlash = (sal_Char)__next;
362 								while (nSlash == 0xD || nSlash == 0xA)
363 									nSlash = (sal_Char)GetNextChar();
364 
365 								switch (nSlash)
366 								{
367 									case '{':
368 									case '}':
369 									case '\\':
370 										bBreak = true;
371 										break;
372 									default:
373 										aByteString.Append(nSlash);
374 										break;
375 								}
376 							}
377 
378 							nNextCh = GetNextChar();
379 
380 							if (nSlash != '\\' || nNextCh != '\'')
381 							{
382 								rInput.SeekRel(-1);
383 								nNextCh = nSlash;
384 								break;
385 							}
386 						}
387 
388 						bNextCh = false;
389 
390 						if (aByteString.Len())
391 							aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
392 #endif
393 					}
394 					break;
395 				case '\\':
396 				case '}':
397 				case '{':
398 				case '+':		// habe ich in einem RTF-File gefunden
399 					aStrBuffer.Append(nNextCh);
400 					break;
401 				case '~':		// nonbreaking space
402 					aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
403 					break;
404 				case '-':		// optional hyphen
405 					aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
406 					break;
407 				case '_':		// nonbreaking hyphen
408 					aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
409 					break;
410 
411 				case 'u':
412 					// UNI-Code Zeichen lesen
413 					{
414 						nNextCh = GetNextChar();
415 						rInput.SeekRel( -2 );
416 
417 						if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
418 						{
419 							bRTF_InTextRead = true;
420 
421 							String sSave( aToken );
422 							nNextCh = '\\';
423 							#ifdef DBG_UTIL
424 							int nToken =
425 							#endif
426 								_GetNextToken();
427 							DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
428 							// dont convert symbol chars
429 							aStrBuffer.Append(
430 								static_cast< sal_Unicode >(nTokenValue));
431 
432 							// overread the next n "RTF" characters. This
433 							// can be also \{, \}, \'88
434 							for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
435 							{
436 								sal_Unicode cAnsi = nNextCh;
437 								while( 0xD == cAnsi )
438 									cAnsi = GetNextChar();
439 								while( 0xA == cAnsi )
440 									cAnsi = GetNextChar();
441 
442 								if( '\\' == cAnsi &&
443 									'\'' == ( cAnsi = GetNextChar() ))
444 									// HexValue ueberlesen
445 									cAnsi = GetHexValue();
446 								nNextCh = GetNextChar();
447 							}
448 							bNextCh = false;
449 							aToken = sSave;
450 							bRTF_InTextRead = false;
451 						}
452 						else
453 						{
454 							nNextCh = '\\';
455 							bWeiter = false; // Abbrechen, String zusammen
456 						}
457 					}
458 					break;
459 
460 				default:
461 					rInput.SeekRel( -1 );
462 					nNextCh = '\\';
463 					bWeiter = false; // Abbrechen, String zusammen
464 					break;
465 				}
466 			}
467 			break;
468 
469 		case sal_Unicode(EOF):
470 				eState = SVPAR_ERROR;
471 				// weiter
472 		case '{':
473 		case '}':
474 			bWeiter = false;
475 			break;
476 
477 		case 0x0a:
478 		case 0x0d:
479 			break;
480 
481 		default:
482 			if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
483 				bWeiter = false;
484 			else
485 			{
486 				do {
487 					// alle anderen Zeichen kommen in den Text
488 					aStrBuffer.Append(nNextCh);
489 
490 					if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
491 					{
492 						if (aStrBuffer.Len())
493 							aToken += aStrBuffer;
494 						return;
495 					}
496 				} while
497 				(
498 					(RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
499 					(aStrBuffer.Len() < MAX_STRING_LEN)
500 				);
501 				bNextCh = false;
502 			}
503 		}
504 
505 		if( bWeiter && bNextCh )
506 			nNextCh = GetNextChar();
507 	}
508 
509 	if (aStrBuffer.Len())
510 		aToken += aStrBuffer;
511 }
512 
513 
514 short SvRTFParser::_inSkipGroup=0;
515 
SkipGroup()516 void SvRTFParser::SkipGroup()
517 {
518 short nBrackets=1;
519 if (_inSkipGroup>0)
520 	return;
521 _inSkipGroup++;
522 #if 1 // #i16185# fecking \bin keyword
523 	do
524 	{
525 		switch (nNextCh)
526 		{
527 			case '{':
528 				++nBrackets;
529 				break;
530 			case '}':
531 				if (!--nBrackets) {
532 					_inSkipGroup--;
533 					return;
534 				}
535 				break;
536 		}
537 		int nToken = _GetNextToken();
538 		if (nToken == RTF_BIN)
539 		{
540 			rInput.SeekRel(-1);
541 			rInput.SeekRel(nTokenValue);
542 			nNextCh = GetNextChar();
543 		}
544 		while (nNextCh==0xa || nNextCh==0xd)
545 		{
546 			nNextCh = GetNextChar();
547 		}
548 	} while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
549 #else
550 	sal_Unicode cPrev = 0;
551 	do {
552 		switch( nNextCh )
553 		{
554 		case '{':
555 			if( '\\' != cPrev )
556 				++nBrackets;
557 			break;
558 
559 		case '}':
560 			if( '\\' != cPrev && !--nBrackets )
561 				return;
562 			break;
563 
564 		case '\\':
565 			if( '\\' == cPrev )
566 				nNextCh = 0;
567 			break;
568 		}
569 		cPrev = nNextCh;
570 		nNextCh = GetNextChar();
571 	} while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
572 #endif
573 
574 	if( SVPAR_PENDING != eState && '}' != nNextCh )
575 		eState = SVPAR_ERROR;
576 	_inSkipGroup--;
577 }
578 
ReadUnknownData()579 void SvRTFParser::ReadUnknownData()	{ SkipGroup(); }
ReadBitmapData()580 void SvRTFParser::ReadBitmapData()	{ SkipGroup(); }
ReadOLEData()581 void SvRTFParser::ReadOLEData()		{ SkipGroup(); }
582 
583 
CallParser()584 SvParserState SvRTFParser::CallParser()
585 {
586 	sal_Char cFirstCh;
587 	nNextChPos = rInput.Tell();
588 	rInput >> cFirstCh; nNextCh = cFirstCh;
589 	eState = SVPAR_WORKING;
590 	nOpenBrakets = 0;
591 	SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
592 	eUNICodeSet = RTL_TEXTENCODING_MS_1252; 	// default ist ANSI-CodeSet
593 
594 	// die 1. beiden Token muessen '{' und \\rtf sein !!
595 	if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
596 	{
597 		AddRef();
598 		Continue( 0 );
599 		if( SVPAR_PENDING != eState )
600 			ReleaseRef();		// dann brauchen wir den Parser nicht mehr!
601 	}
602 	else
603 		eState = SVPAR_ERROR;
604 
605 	return eState;
606 }
607 
Continue(int nToken)608 void SvRTFParser::Continue( int nToken )
609 {
610 //	DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
611 //				"Zeichensatz wurde geaendert." );
612 
613 	if( !nToken )
614 		nToken = GetNextToken();
615 
616 	while( IsParserWorking() )
617 	{
618 		SaveState( nToken );
619 		switch( nToken )
620 		{
621 		case '}':
622 			if( nOpenBrakets )
623 				goto NEXTTOKEN;
624 			eState = SVPAR_ACCEPTED;
625 			break;
626 
627 		case '{':
628 			// eine unbekannte Gruppe ?
629 			{
630 				if( RTF_IGNOREFLAG != GetNextToken() )
631 					nToken = SkipToken( -1 );
632 				else if( RTF_UNKNOWNCONTROL != GetNextToken() )
633 					nToken = SkipToken( -2 );
634 				else
635 				{
636 					// gleich herausfiltern
637 					ReadUnknownData();
638 					nToken = GetNextToken();
639 					if( '}' != nToken )
640 						eState = SVPAR_ERROR;
641 					break;		// auf zum naechsten Token!!
642 				}
643 			}
644 			goto NEXTTOKEN;
645 
646 		case RTF_UNKNOWNCONTROL:
647 			break;		// unbekannte Token ueberspringen
648 		case RTF_NEXTTYPE:
649 		case RTF_ANSITYPE:
650 			SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
651 			break;
652 		case RTF_MACTYPE:
653 			SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
654 			break;
655 		case RTF_PCTYPE:
656 			SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
657 			break;
658 		case RTF_PCATYPE:
659 			SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
660 			break;
661 		case RTF_ANSICPG:
662 			eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
663 			SetEncoding(eCodeSet);
664 			break;
665 		default:
666 NEXTTOKEN:
667 			NextToken( nToken );
668 			break;
669 		}
670 		if( IsParserWorking() )
671 			SaveState( 0 );			// bis hierhin abgearbeitet,
672 									// weiter mit neuem Token!
673 		nToken = GetNextToken();
674 	}
675 	if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
676 		eState = SVPAR_ERROR;
677 }
678 
SetEncoding(rtl_TextEncoding eEnc)679 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
680 {
681 	if (eEnc == RTL_TEXTENCODING_DONTKNOW)
682 		eEnc = GetCodeSet();
683 
684 	if (!aParserStates.empty())
685 		aParserStates.top().eCodeSet = eEnc;
686 	SetSrcEncoding(eEnc);
687 }
688 
#ifdef USED
// Compiled only when USED is defined. Both functions simply forward to the
// SvParser implementation.

/** Forwards to SvParser::SaveState() with the given token. */
void SvRTFParser::SaveState( int nToken )
{
	SvParser::SaveState( nToken );
}

/** Forwards to SvParser::RestoreState(). */
void SvRTFParser::RestoreState()
{
	SvParser::RestoreState();
}
#endif
700 
701 /* vim: set noet sw=4 ts=4: */
702