1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 #include <svtools/syntaxhighlight.hxx>
28 
29 #include <unotools/charclass.hxx>
30 #include <tools/debug.hxx>
31 
32 
// ##########################################################################
// BASIC keywords recognised by the highlighter.
// ATTENTION: every entry has to be lowercase (identifiers are lowercased
// before the lookup) and the table has to stay in strcmp() order, because
// the lookup is done with bsearch().
// ##########################################################################
static const char* strListBasicKeyWords[] = {
	// a
	"access", "alias", "and", "any", "append", "as",
	// b
	"base", "binary", "boolean", "byref", "byte", "byval",
	// c
	"call", "case", "cdecl", "classmodule", "close", "compare",
	"compatible", "const", "currency",
	// d
	"date", "declare", "defbool", "defcur", "defdate", "defdbl",
	"deferr", "defint", "deflng", "defobj", "defsng", "defstr",
	"defvar", "dim", "do", "double",
	// e
	"each", "else", "elseif", "end", "end enum", "end function",
	"end if", "end select", "end sub", "end type", "endif", "enum",
	"eqv", "erase", "error", "exit", "explicit",
	// f
	"for", "function",
	// g
	"get", "global", "gosub", "goto",
	// i
	"if", "imp", "implements", "in", "input", "integer", "is",
	// l
	"let", "lib", "like", "line", "line input", "local",
	"lock", "long", "loop", "lprint", "lset",
	// m
	"mod",
	// n
	"name", "new", "next", "not",
	// o
	"object", "on", "open", "option", "optional", "or", "output",
	// p
	"preserve", "print", "private", "property", "public",
	// r
	"random", "read", "redim", "rem", "resume", "return", "rset",
	// s
	"select", "set", "shared", "single", "static", "step",
	"stop", "string", "sub", "system",
	// t
	"text", "then", "to", "type", "typeof",
	// u
	"until",
	// v
	"variant",
	// w
	"wend", "while", "with", "write",
	// x
	"xor"
};
162 
163 
// SQL keywords recognised by the highlighter.
// Same rules as for the BASIC table: lowercase entries only, kept in
// strcmp() order so that the bsearch() lookup works.
static const char* strListSqlKeyWords[] = {
	// a
	"all", "and", "any", "as", "asc", "avg",
	// b
	"between", "by",
	// c
	"cast", "corresponding", "count", "create", "cross",
	// d
	"delete", "desc", "distinct", "drop",
	// e
	"escape", "except", "exists",
	// f
	"false", "from", "full",
	// g
	"global", "group",
	// h
	"having",
	// i
	"in", "inner", "insert", "intersect", "into", "is",
	// j
	"join",
	// l
	"left", "like", "local",
	// m
	"match", "max", "min",
	// n
	"natural", "not", "null",
	// o
	"on", "or", "order", "outer",
	// r
	"right",
	// s
	"select", "set", "some", "sum",
	// t
	"table", "temporary", "true",
	// u
	"union", "unique", "unknown", "update", "using",
	// v
	"values",
	// w
	"where"
};
227 
228 
compare_strings(const void * arg1,const void * arg2)229 extern "C" int CDECL compare_strings( const void *arg1, const void *arg2 )
230 {
231 	return strcmp( (char *)arg1, *(char **)arg2 );
232 }
233 
234 
235 class LetterTable
236 {
237 	bool		IsLetterTab[256];
238 
239 public:
240 	LetterTable( void );
241 
isLetter(sal_Unicode c)242 	inline bool isLetter( sal_Unicode c )
243 	{
244 		bool bRet = (c < 256) ? IsLetterTab[c] : isLetterUnicode( c );
245 		return bRet;
246 	}
247 	bool isLetterUnicode( sal_Unicode c );
248 };
249 
250 class BasicSimpleCharClass
251 {
252 	static LetterTable aLetterTable;
253 
254 public:
isAlpha(sal_Unicode c,bool bCompatible)255 	static sal_Bool isAlpha( sal_Unicode c, bool bCompatible )
256 	{
257 		sal_Bool bRet = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
258 					|| (bCompatible && aLetterTable.isLetter( c ));
259 		return bRet;
260 	}
261 
isDigit(sal_Unicode c)262 	static sal_Bool isDigit( sal_Unicode c )
263 	{
264 		sal_Bool bRet = (c >= '0' && c <= '9');
265 		return bRet;
266 	}
267 
isAlphaNumeric(sal_Unicode c,bool bCompatible)268 	static sal_Bool isAlphaNumeric( sal_Unicode c, bool bCompatible )
269 	{
270 		sal_Bool bRet = isDigit( c ) || isAlpha( c, bCompatible );
271 		return bRet;
272 	}
273 };
274 
275 LetterTable BasicSimpleCharClass::aLetterTable;
276 
LetterTable(void)277 LetterTable::LetterTable( void )
278 {
279 	for( int i = 0 ; i < 256 ; ++i )
280 		IsLetterTab[i] = false;
281 
282 	IsLetterTab[0xC0] = true;	// ?, CAPITAL LETTER A WITH GRAVE ACCENT
283 	IsLetterTab[0xC1] = true;	// ?, CAPITAL LETTER A WITH ACUTE ACCENT
284 	IsLetterTab[0xC2] = true;	// ?, CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
285 	IsLetterTab[0xC3] = true;	// ?, CAPITAL LETTER A WITH TILDE
286 	IsLetterTab[0xC4] = true;	// ?, CAPITAL LETTER A WITH DIAERESIS
287 	IsLetterTab[0xC5] = true;	// ?, CAPITAL LETTER A WITH RING ABOVE
288 	IsLetterTab[0xC6] = true;	// ?, CAPITAL LIGATURE AE
289 	IsLetterTab[0xC7] = true;	// ?, CAPITAL LETTER C WITH CEDILLA
290 	IsLetterTab[0xC8] = true;	// ?, CAPITAL LETTER E WITH GRAVE ACCENT
291 	IsLetterTab[0xC9] = true;	// ?, CAPITAL LETTER E WITH ACUTE ACCENT
292 	IsLetterTab[0xCA] = true;	// ?, CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
293 	IsLetterTab[0xCB] = true;	// ?, CAPITAL LETTER E WITH DIAERESIS
294 	IsLetterTab[0xCC] = true;	// ?, CAPITAL LETTER I WITH GRAVE ACCENT
295 	IsLetterTab[0xCD] = true;	// ?, CAPITAL LETTER I WITH ACUTE ACCENT
296 	IsLetterTab[0xCE] = true;	// ?, CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
297 	IsLetterTab[0xCF] = true;	// ?, CAPITAL LETTER I WITH DIAERESIS
298 	IsLetterTab[0xD0] = true;	// ?, CAPITAL LETTER ETH
299 	IsLetterTab[0xD1] = true;	// ?, CAPITAL LETTER N WITH TILDE
300 	IsLetterTab[0xD2] = true;	// ?, CAPITAL LETTER O WITH GRAVE ACCENT
301 	IsLetterTab[0xD3] = true;	// ?, CAPITAL LETTER O WITH ACUTE ACCENT
302 	IsLetterTab[0xD4] = true;	// ?, CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
303 	IsLetterTab[0xD5] = true;	// ?, CAPITAL LETTER O WITH TILDE
304 	IsLetterTab[0xD6] = true;	// ?, CAPITAL LETTER O WITH DIAERESIS
305 	IsLetterTab[0xD8] = true;	// ?, CAPITAL LETTER O WITH STROKE
306 	IsLetterTab[0xD9] = true;	// ?, CAPITAL LETTER U WITH GRAVE ACCENT
307 	IsLetterTab[0xDA] = true;	// ?, CAPITAL LETTER U WITH ACUTE ACCENT
308 	IsLetterTab[0xDB] = true;	// ?, CAPITAL LETTER U WITH CIRCUMFLEX ACCENT
309 	IsLetterTab[0xDC] = true;	// ?, CAPITAL LETTER U WITH DIAERESIS
310 	IsLetterTab[0xDD] = true;	// ?, CAPITAL LETTER Y WITH ACUTE ACCENT
311 	IsLetterTab[0xDE] = true;	// ?, CAPITAL LETTER THORN
312 	IsLetterTab[0xDF] = true;	// ?, SMALL LETTER SHARP S
313 	IsLetterTab[0xE0] = true;	// ?, SMALL LETTER A WITH GRAVE ACCENT
314 	IsLetterTab[0xE1] = true;	// ?, SMALL LETTER A WITH ACUTE ACCENT
315 	IsLetterTab[0xE2] = true;	// ?, SMALL LETTER A WITH CIRCUMFLEX ACCENT
316 	IsLetterTab[0xE3] = true;	// ?, SMALL LETTER A WITH TILDE
317 	IsLetterTab[0xE4] = true;	// ?, SMALL LETTER A WITH DIAERESIS
318 	IsLetterTab[0xE5] = true;	// ?, SMALL LETTER A WITH RING ABOVE
319 	IsLetterTab[0xE6] = true;	// ?, SMALL LIGATURE AE
320 	IsLetterTab[0xE7] = true;	// ?, SMALL LETTER C WITH CEDILLA
321 	IsLetterTab[0xE8] = true;	// ?, SMALL LETTER E WITH GRAVE ACCENT
322 	IsLetterTab[0xE9] = true;	// ?, SMALL LETTER E WITH ACUTE ACCENT
323 	IsLetterTab[0xEA] = true;	// ?, SMALL LETTER E WITH CIRCUMFLEX ACCENT
324 	IsLetterTab[0xEB] = true;	// ?, SMALL LETTER E WITH DIAERESIS
325 	IsLetterTab[0xEC] = true;	// ?, SMALL LETTER I WITH GRAVE ACCENT
326 	IsLetterTab[0xED] = true;	// ?, SMALL LETTER I WITH ACUTE ACCENT
327 	IsLetterTab[0xEE] = true;	// ?, SMALL LETTER I WITH CIRCUMFLEX ACCENT
328 	IsLetterTab[0xEF] = true;	// ?, SMALL LETTER I WITH DIAERESIS
329 	IsLetterTab[0xF0] = true;	// ?, SMALL LETTER ETH
330 	IsLetterTab[0xF1] = true;	// ?, SMALL LETTER N WITH TILDE
331 	IsLetterTab[0xF2] = true;	// ?, SMALL LETTER O WITH GRAVE ACCENT
332 	IsLetterTab[0xF3] = true;	// ?, SMALL LETTER O WITH ACUTE ACCENT
333 	IsLetterTab[0xF4] = true;	// ?, SMALL LETTER O WITH CIRCUMFLEX ACCENT
334 	IsLetterTab[0xF5] = true;	// ?, SMALL LETTER O WITH TILDE
335 	IsLetterTab[0xF6] = true;	// ?, SMALL LETTER O WITH DIAERESIS
336 	IsLetterTab[0xF8] = true;	// ?, SMALL LETTER O WITH OBLIQUE BAR
337 	IsLetterTab[0xF9] = true;	// ?, SMALL LETTER U WITH GRAVE ACCENT
338 	IsLetterTab[0xFA] = true;	// ?, SMALL LETTER U WITH ACUTE ACCENT
339 	IsLetterTab[0xFB] = true;	// ?, SMALL LETTER U WITH CIRCUMFLEX ACCENT
340 	IsLetterTab[0xFC] = true;	// ?, SMALL LETTER U WITH DIAERESIS
341 	IsLetterTab[0xFD] = true;	// ?, SMALL LETTER Y WITH ACUTE ACCENT
342 	IsLetterTab[0xFE] = true;	// ?, SMALL LETTER THORN
343 	IsLetterTab[0xFF] = true;	// � , SMALL LETTER Y WITH DIAERESIS
344 }
345 
isLetterUnicode(sal_Unicode c)346 bool LetterTable::isLetterUnicode( sal_Unicode c )
347 {
348 	static CharClass* pCharClass = NULL;
349 	if( pCharClass == NULL )
350 		pCharClass = new CharClass( Application::GetSettings().GetLocale() );
351 	String aStr( c );
352 	bool bRet = pCharClass->isLetter( aStr, 0 );
353 	return bRet;
354 }
355 
356 // Hilfsfunktion: Zeichen-Flag Testen
testCharFlags(sal_Unicode c,sal_uInt16 nTestFlags)357 sal_Bool SimpleTokenizer_Impl::testCharFlags( sal_Unicode c, sal_uInt16 nTestFlags )
358 {
359 	bool bRet = false;
360 	if( c != 0 && c <= 255 )
361 	{
362 		bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
363 	}
364 	else if( c > 255 )
365 	{
366 		bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
367 			? BasicSimpleCharClass::isAlpha( c, true ) : false;
368 	}
369 	return bRet;
370 }
371 
setKeyWords(const char ** ppKeyWords,sal_uInt16 nCount)372 void SimpleTokenizer_Impl::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
373 {
374 	ppListKeyWords = ppKeyWords;
375 	nKeyWordCount = nCount;
376 }
377 
378 // Neues Token holen
getNextToken(TokenTypes & reType,const sal_Unicode * & rpStartPos,const sal_Unicode * & rpEndPos)379 sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
380 	/*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos )
381 {
382 	reType = TT_UNKNOWN;
383 
384 	// Position merken
385 	rpStartPos = mpActualPos;
386 
387 	// Zeichen untersuchen
388 	sal_Unicode c = peekChar();
389 	if( c == CHAR_EOF )
390 		return sal_False;
391 
392 	// Zeichen lesen
393 	getChar();
394 
395 	//*** Alle Moeglichkeiten durchgehen ***
396 	// Space?
397 	if ( (testCharFlags( c, CHAR_SPACE ) == sal_True) )
398 	{
399 		while( testCharFlags( peekChar(), CHAR_SPACE ) == sal_True )
400 			getChar();
401 
402 		reType = TT_WHITESPACE;
403 	}
404 
405 	// Identifier?
406 	else if ( (testCharFlags( c, CHAR_START_IDENTIFIER ) == sal_True) )
407 	{
408 		sal_Bool bIdentifierChar;
409 		do
410 		{
411 			// Naechstes Zeichen holen
412 			c = peekChar();
413 			bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
414 			if( bIdentifierChar )
415 				getChar();
416 		}
417 		while( bIdentifierChar );
418 
419 		reType = TT_IDENTIFIER;
420 
421 		// Schluesselwort-Tabelle
422 		if (ppListKeyWords != NULL)
423 		{
424 			int nCount = mpActualPos - rpStartPos;
425 
426 			// No keyword if string contains char > 255
427 			bool bCanBeKeyword = true;
428 			for( int i = 0 ; i < nCount ; i++ )
429 			{
430 				if( rpStartPos[i] > 255 )
431 				{
432 					bCanBeKeyword = false;
433 					break;
434 				}
435 			}
436 
437 			if( bCanBeKeyword )
438 			{
439 				String aKWString(rpStartPos, sal::static_int_cast< xub_StrLen >(nCount) );
440 				ByteString aByteStr( aKWString, RTL_TEXTENCODING_ASCII_US );
441 				aByteStr.ToLowerAscii();
442 				if ( bsearch( aByteStr.GetBuffer(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
443 																		compare_strings ) )
444 				{
445 					reType = TT_KEYWORDS;
446 
447 					if ( aByteStr.Equals( "rem" ) )
448 					{
449 						// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
450 						sal_Unicode cPeek = peekChar();
451 						while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
452 						{
453 							c = getChar();
454 							cPeek = peekChar();
455 						}
456 
457 						reType = TT_COMMENT;
458 					}
459 				}
460 			}
461 		}
462 	}
463 
464 	// Operator?
465 	// only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
466 	else if ( ( testCharFlags( c, CHAR_OPERATOR ) == sal_True ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
467 	{
468 		// paramters for SQL view
469 		if ( (c==':') || (c=='?'))
470 		{
471 			if (c!='?')
472 			{
473 				sal_Bool bIdentifierChar;
474 				do
475 				{
476 					// Naechstes Zeichen holen
477 					c = peekChar();
478 					bIdentifierChar =  BasicSimpleCharClass::isAlpha( c, true );
479 					if( bIdentifierChar )
480 						getChar();
481 				}
482 				while( bIdentifierChar );
483 			}
484 			reType = TT_PARAMETER;
485 		}
486 		else if( c=='-' )
487 		{
488 			sal_Unicode cPeekNext = peekChar();
489 			if (cPeekNext=='-')
490 			{
491 				// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
492 				while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
493 				{
494 					getChar();
495 					cPeekNext = peekChar();
496 				}
497 				reType = TT_COMMENT;
498 			}
499 		}
500        else if (c=='/')
501        {
502            sal_Unicode cPeekNext = peekChar();
503            if (cPeekNext=='/')
504            {
505                // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
506                while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
507                {
508                    getChar();
509                    cPeekNext = peekChar();
510                }
511                reType = TT_COMMENT;
512            }
513        }
514 		else
515 		{
516 			// Kommentar ?
517 			if ( c == '\'' )
518 			{
519 				c = getChar();	// '/' entfernen
520 
521 				// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
522 				sal_Unicode cPeek = c;
523 				while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
524 				{
525 					getChar();
526 					cPeek = peekChar();
527 				}
528 
529 				reType = TT_COMMENT;
530 			}
531 
532 			// Echter Operator, kann hier einfach behandelt werden,
533 			// da nicht der wirkliche Operator, wie z.B. += interessiert,
534 			// sondern nur die Tatsache, dass es sich um einen handelt.
535 			if( reType != TT_COMMENT )
536 			{
537 				reType = TT_OPERATOR;
538 			}
539 
540 		}
541 	}
542 
543 	// Objekt-Trenner? Muss vor Number abgehandelt werden
544 	else if( c == '.' && ( peekChar() < '0' || peekChar() > '9' ) )
545 	{
546 		reType = TT_OPERATOR;
547 	}
548 
549 	// Zahl?
550 	else if( testCharFlags( c, CHAR_START_NUMBER ) == sal_True )
551 	{
552 		reType = TT_NUMBER;
553 
554 		// Zahlensystem, 10 = normal, wird bei Oct/Hex geaendert
555 		int nRadix = 10;
556 
557 		// Ist es eine Hex- oder Oct-Zahl?
558 		if( c == '&' )
559 		{
560 			// Octal?
561 			if( peekChar() == 'o' || peekChar() == 'O' )
562 			{
563 				// o entfernen
564 				getChar();
565 				nRadix = 8; 	// Octal-Basis
566 
567 				// Alle Ziffern einlesen
568 				while( testCharFlags( peekChar(), CHAR_IN_OCT_NUMBER ) )
569 					c = getChar();
570 			}
571 			// Hex?
572 			else if( peekChar() == 'h' || peekChar() == 'H' )
573 			{
574 				// x entfernen
575 				getChar();
576 				nRadix = 16;	 // Hex-Basis
577 
578 				// Alle Ziffern einlesen und puffern
579 				while( testCharFlags( peekChar(), CHAR_IN_HEX_NUMBER ) )
580 					c = getChar();
581 			}
582 			else
583 			{
584 				reType = TT_OPERATOR;
585 			}
586 		}
587 
588 		// Wenn nicht Oct oder Hex als double ansehen
589 		if( reType == TT_NUMBER && nRadix == 10 )
590 		{
591 			// Flag, ob das letzte Zeichen ein Exponent war
592 			sal_Bool bAfterExpChar = sal_False;
593 
594 			// Alle Ziffern einlesen
595 			while( testCharFlags( peekChar(), CHAR_IN_NUMBER ) ||
596 					(bAfterExpChar && peekChar() == '+' ) ||
597 					(bAfterExpChar && peekChar() == '-' ) )
598 					// Nach Exponent auch +/- OK
599 			{
600 				c = getChar();					// Zeichen lesen
601 				bAfterExpChar = ( c == 'e' || c == 'E' );
602 			}
603 		}
604 
605 		// reType = TT_NUMBER;
606 	}
607 
608 	// String?
609 	else if( testCharFlags( c, CHAR_START_STRING ) == sal_True )
610 	{
611 		// Merken, welches Zeichen den String eroeffnet hat
612 		sal_Unicode cEndString = c;
613 		if( c == '[' )
614 			cEndString = ']';
615 
616 		// Alle Ziffern einlesen und puffern
617 		while( peekChar() != cEndString )
618 		{
619 			// #58846 EOF vor getChar() abfangen, damit EOF micht verloren geht
620 			if( peekChar() == CHAR_EOF )
621 			{
622 				// ERROR: unterminated string literal
623 				reType = TT_ERROR;
624 				break;
625 			}
626 			c = getChar();
627 			if( testCharFlags( c, CHAR_EOL ) == sal_True )
628 			{
629 				// ERROR: unterminated string literal
630 				reType = TT_ERROR;
631 				break;
632 			}
633 		}
634 
635 		//	Zeichen lesen
636 		if( reType != TT_ERROR )
637 		{
638 			getChar();
639 			if( cEndString == ']' )
640 				reType = TT_IDENTIFIER;
641 			else
642 				reType = TT_STRING;
643 		}
644 	}
645 
646 	// Zeilenende?
647 	else if( testCharFlags( c, CHAR_EOL ) == sal_True )
648 	{
649 		// Falls ein weiteres anderes EOL-Char folgt, weg damit
650 		sal_Unicode cNext = peekChar();
651 		if( cNext != c && testCharFlags( cNext, CHAR_EOL ) == sal_True )
652 			getChar();
653 
654 		// Positions-Daten auf Zeilen-Beginn setzen
655 		nCol = 0;
656 		nLine++;
657 
658 		reType = TT_EOL;
659 	}
660 
661 	// Alles andere bleibt TT_UNKNOWN
662 
663 
664 	// End-Position eintragen
665 	rpEndPos = mpActualPos;
666 	return sal_True;
667 }
668 
getTokStr(const sal_Unicode * pStartPos,const sal_Unicode * pEndPos)669 String SimpleTokenizer_Impl::getTokStr
670 	( /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
671 {
672 	return String( pStartPos, (sal_uInt16)( pEndPos - pStartPos ) );
673 }
674 
#ifdef DBG_UTIL
// TEST: builds a printable description of a token for debugging.
// The result is the token type name followed by a colon, then the token
// text from [pStartPos, pEndPos) (omitted for TT_EOL), then a newline.
// A type without a case below contributes no name.
String SimpleTokenizer_Impl::getFullTokenStr( /*out*/TokenTypes eType,
	/*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
{
	String aResult;
	switch( eType )
	{
		case TT_UNKNOWN:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_UNKNOWN:") );
			break;
		case TT_IDENTIFIER:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_IDENTIFIER:") );
			break;
		case TT_WHITESPACE:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_WHITESPACE:") );
			break;
		case TT_NUMBER:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_NUMBER:") );
			break;
		case TT_STRING:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_STRING:") );
			break;
		case TT_EOL:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_EOL:") );
			break;
		case TT_COMMENT:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_COMMENT:") );
			break;
		case TT_ERROR:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_ERROR:") );
			break;
		case TT_OPERATOR:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_OPERATOR:") );
			break;
		case TT_KEYWORDS:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_KEYWORD:") );
			break;
		case TT_PARAMETER:
			aResult = String( RTL_CONSTASCII_USTRINGPARAM("TT_PARAMETER:") );
			break;
	}

	// The text of an EOL token is not printed.
	if( eType != TT_EOL )
	{
		const sal_uInt16 nTokLen = (sal_uInt16)( pEndPos - pStartPos );
		aResult += String( pStartPos, nTokLen );
	}

	aResult += String( RTL_CONSTASCII_USTRINGPARAM("\n") );
	return aResult;
}
#endif
703 
SimpleTokenizer_Impl(HighlighterLanguage aLang)704 SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLanguage(aLang)
705 {
706 	memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
707 
708 	// Zeichen-Tabelle fuellen
709 	sal_uInt16 i;
710 
711 	// Zulaessige Zeichen fuer Identifier
712 	sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
713 	for( i = 'a' ; i <= 'z' ; i++ )
714 		aCharTypeTab[i] |= nHelpMask;
715 	for( i = 'A' ; i <= 'Z' ; i++ )
716 		aCharTypeTab[i] |= nHelpMask;
717 	// '_' extra eintragen
718 	aCharTypeTab[(int)'_'] |= nHelpMask;
719 	// AB 23.6.97: '$' ist auch erlaubt
720 	aCharTypeTab[(int)'$'] |= nHelpMask;
721 
722 	// Ziffern (Identifier und Number ist moeglich)
723 	nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
724 						 CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
725 	for( i = '0' ; i <= '9' ; i++ )
726 		aCharTypeTab[i] |= nHelpMask;
727 
728 	// e und E sowie . von Hand ergaenzen
729 	aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
730 	aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
731 	aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
732 	aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
733 
734 	// Hex-Ziffern
735 	for( i = 'a' ; i <= 'f' ; i++ )
736 		aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
737 	for( i = 'A' ; i <= 'F' ; i++ )
738 		aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
739 
740 	// Oct-Ziffern
741 	for( i = '0' ; i <= '7' ; i++ )
742 		aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
743 
744 	// String-Beginn/End-Zeichen
745 	aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
746 	aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
747 	aCharTypeTab[(int)'[']  |= CHAR_START_STRING;
748 	aCharTypeTab[(int)'`']  |= CHAR_START_STRING;
749 
750 	// Operator-Zeichen
751 	aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
752 	aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
753 	// aCharTypeTab[(int)'&'] |= CHAR_OPERATOR;		Removed because of #i14140
754 	aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
755 	aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
756 	aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
757 	aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
758 	aCharTypeTab[(int)','] |= CHAR_OPERATOR;
759 	aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
760 	aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
761 	aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
762 	aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
763 	aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
764 	aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
765 	aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
766 	aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
767 	aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
768 	aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
769 	aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
770 	aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
771 	// aCharTypeTab[(int)'['] |= CHAR_OPERATOR;		Removed because of #i17826
772 	aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
773 	aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
774 
775 	// Space
776 	aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
777 	aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
778 
779 	// Zeilen-Ende-Zeichen
780 	aCharTypeTab[(int)'\r'] |= CHAR_EOL;
781 	aCharTypeTab[(int)'\n'] |= CHAR_EOL;
782 
783 	ppListKeyWords = NULL;
784 }
785 
~SimpleTokenizer_Impl(void)786 SimpleTokenizer_Impl::~SimpleTokenizer_Impl( void )
787 {
788 }
789 
getSimpleTokenizer(void)790 SimpleTokenizer_Impl* getSimpleTokenizer( void )
791 {
792 	static SimpleTokenizer_Impl* pSimpleTokenizer = NULL;
793 	if( !pSimpleTokenizer )
794 		pSimpleTokenizer = new SimpleTokenizer_Impl();
795 	return pSimpleTokenizer;
796 }
797 
798 // Heraussuchen der jeweils naechsten Funktion aus einem JavaScript-Modul
parseLine(sal_uInt32 nParseLine,const String * aSource)799 sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String* aSource )
800 {
801 	// Position auf den Anfang des Source-Strings setzen
802 	mpStringBegin = mpActualPos = aSource->GetBuffer();
803 
804 	// Zeile und Spalte initialisieren
805 	nLine = nParseLine;
806 	nCol = 0L;
807 
808 	// Variablen fuer die Out-Parameter
809 	TokenTypes eType;
810 	const sal_Unicode* pStartPos;
811 	const sal_Unicode* pEndPos;
812 
813 	// Schleife ueber alle Tokens
814 	sal_uInt16 nTokenCount = 0;
815 	while( getNextToken( eType, pStartPos, pEndPos ) )
816 		nTokenCount++;
817 
818 	return nTokenCount;
819 }
820 
getHighlightPortions(sal_uInt32 nParseLine,const String & rLine,HighlightPortions & portions)821 void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const String& rLine,
822 													/*out*/HighlightPortions& portions  )
823 {
824 	// Position auf den Anfang des Source-Strings setzen
825 	mpStringBegin = mpActualPos = rLine.GetBuffer();
826 
827 	// Zeile und Spalte initialisieren
828 	nLine = nParseLine;
829 	nCol = 0L;
830 
831 	// Variablen fuer die Out-Parameter
832 	TokenTypes eType;
833 	const sal_Unicode* pStartPos;
834 	const sal_Unicode* pEndPos;
835 
836 	// Schleife ueber alle Tokens
837 	while( getNextToken( eType, pStartPos, pEndPos ) )
838 	{
839 		HighlightPortion portion;
840 
841 		portion.nBegin = (sal_uInt16)(pStartPos - mpStringBegin);
842 		portion.nEnd = (sal_uInt16)(pEndPos - mpStringBegin);
843 		portion.tokenType = eType;
844 
845         portions.push_back(portion);
846 	}
847 }
848 
849 
//////////////////////////////////////////////////////////////////////////
// Implementation of SyntaxHighlighter
852 
SyntaxHighlighter()853 SyntaxHighlighter::SyntaxHighlighter()
854 {
855 	m_pSimpleTokenizer = 0;
856 	m_pKeyWords = NULL;
857 	m_nKeyWordCount = 0;
858 }
859 
~SyntaxHighlighter()860 SyntaxHighlighter::~SyntaxHighlighter()
861 {
862 	delete m_pSimpleTokenizer;
863 	delete m_pKeyWords;
864 }
865 
initialize(HighlighterLanguage eLanguage_)866 void SyntaxHighlighter::initialize( HighlighterLanguage eLanguage_ )
867 {
868 	eLanguage = eLanguage_;
869 	delete m_pSimpleTokenizer;
870 	m_pSimpleTokenizer = new SimpleTokenizer_Impl(eLanguage);
871 
872 	switch (eLanguage)
873 	{
874 		case HIGHLIGHT_BASIC:
875 			m_pSimpleTokenizer->setKeyWords( strListBasicKeyWords,
876 											sizeof( strListBasicKeyWords ) / sizeof( char* ));
877 			break;
878 		case HIGHLIGHT_SQL:
879 			m_pSimpleTokenizer->setKeyWords( strListSqlKeyWords,
880 											sizeof( strListSqlKeyWords ) / sizeof( char* ));
881 			break;
882 		default:
883 			m_pSimpleTokenizer->setKeyWords( NULL, 0 );
884 	}
885 }
886 
notifyChange(sal_uInt32 nLine,sal_Int32 nLineCountDifference,const String * pChangedLines,sal_uInt32 nArrayLength)887 const Range SyntaxHighlighter::notifyChange( sal_uInt32 nLine, sal_Int32 nLineCountDifference,
888 								const String* pChangedLines, sal_uInt32 nArrayLength)
889 {
890     (void)nLineCountDifference;
891 
892 	for( sal_uInt32 i=0 ; i < nArrayLength ; i++ )
893 		m_pSimpleTokenizer->parseLine(nLine+i, &pChangedLines[i]);
894 
895 	return Range( nLine, nLine + nArrayLength-1 );
896 }
897 
getHighlightPortions(sal_uInt32 nLine,const String & rLine,HighlightPortions & portions)898 void SyntaxHighlighter::getHighlightPortions( sal_uInt32 nLine, const String& rLine,
899 											/*out*/HighlightPortions& portions )
900 {
901 	m_pSimpleTokenizer->getHighlightPortions( nLine, rLine, portions );
902 }
903