1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_svtools.hxx"
30 
31 #include <svtools/syntaxhighlight.hxx>
32 
33 #include <unotools/charclass.hxx>
34 #include <tools/debug.hxx>
35 
36 
// ##########################################################################
// ATTENTION: all of these words must be lower case and must stay sorted in
// strcmp() order, because the tables are searched with bsearch()
// ##########################################################################
// Keyword table handed to the tokenizer for HIGHLIGHT_BASIC.
// Entries are grouped by initial letter purely for readability; the
// element order itself is significant and must not be changed.
static const char* strListBasicKeyWords[] = {
	// a
	"access", "alias", "and", "any", "append", "as",
	// b
	"base", "binary", "boolean", "byref", "byte", "byval",
	// c
	"call", "case", "cdecl", "classmodule", "close", "compare",
	"compatible", "const", "currency",
	// d
	"date", "declare", "defbool", "defcur", "defdate", "defdbl",
	"deferr", "defint", "deflng", "defobj", "defsng", "defstr",
	"defvar", "dim", "do", "double",
	// e
	"each", "else", "elseif", "end", "end enum", "end function",
	"end if", "end select", "end sub", "end type", "endif", "enum",
	"eqv", "erase", "error", "exit", "explicit",
	// f
	"for", "function",
	// g
	"get", "global", "gosub", "goto",
	// i
	"if", "imp", "implements", "in", "input", "integer", "is",
	// l
	"let", "lib", "like", "line", "line input", "local", "lock",
	"long", "loop", "lprint", "lset",
	// m
	"mod",
	// n
	"name", "new", "next", "not",
	// o
	"object", "on", "open", "option", "optional", "or", "output",
	// p
	"preserve", "print", "private", "property", "public",
	// r
	"random", "read", "redim", "rem", "resume", "return", "rset",
	// s
	"select", "set", "shared", "single", "static", "step", "stop",
	"string", "sub", "system",
	// t
	"text", "then", "to", "type", "typeof",
	// u
	"until",
	// v
	"variant",
	// w
	"wend", "while", "with", "write",
	// x
	"xor"
};
166 
167 
// Keyword table handed to the tokenizer for HIGHLIGHT_SQL.
// Entries are grouped by initial letter purely for readability; the
// element order itself is significant and must not be changed.
static const char* strListSqlKeyWords[] = {
	// a
	"all", "and", "any", "as", "asc", "avg",
	// b
	"between", "by",
	// c
	"cast", "corresponding", "count", "create", "cross",
	// d
	"delete", "desc", "distinct", "drop",
	// e
	"escape", "except", "exists",
	// f
	"false", "from", "full",
	// g
	"global", "group",
	// h
	"having",
	// i
	"in", "inner", "insert", "intersect", "into", "is",
	// j
	"join",
	// l
	"left", "like", "local",
	// m
	"match", "max", "min",
	// n
	"natural", "not", "null",
	// o
	"on", "or", "order", "outer",
	// r
	"right",
	// s
	"select", "set", "some", "sum",
	// t
	"table", "temporary", "true",
	// u
	"union", "unique", "unknown", "update", "using",
	// v
	"values",
	// w
	"where"
};
231 
232 
233 extern "C" int CDECL compare_strings( const void *arg1, const void *arg2 )
234 {
235 	return strcmp( (char *)arg1, *(char **)arg2 );
236 }
237 
238 
239 class LetterTable
240 {
241 	bool		IsLetterTab[256];
242 
243 public:
244 	LetterTable( void );
245 
246 	inline bool isLetter( sal_Unicode c )
247 	{
248 		bool bRet = (c < 256) ? IsLetterTab[c] : isLetterUnicode( c );
249 		return bRet;
250 	}
251 	bool isLetterUnicode( sal_Unicode c );
252 };
253 
254 class BasicSimpleCharClass
255 {
256 	static LetterTable aLetterTable;
257 
258 public:
259 	static sal_Bool isAlpha( sal_Unicode c, bool bCompatible )
260 	{
261 		sal_Bool bRet = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
262 					|| (bCompatible && aLetterTable.isLetter( c ));
263 		return bRet;
264 	}
265 
266 	static sal_Bool isDigit( sal_Unicode c )
267 	{
268 		sal_Bool bRet = (c >= '0' && c <= '9');
269 		return bRet;
270 	}
271 
272 	static sal_Bool isAlphaNumeric( sal_Unicode c, bool bCompatible )
273 	{
274 		sal_Bool bRet = isDigit( c ) || isAlpha( c, bCompatible );
275 		return bRet;
276 	}
277 };
278 
279 LetterTable BasicSimpleCharClass::aLetterTable;
280 
281 LetterTable::LetterTable( void )
282 {
283 	for( int i = 0 ; i < 256 ; ++i )
284 		IsLetterTab[i] = false;
285 
286 	IsLetterTab[0xC0] = true;	// ?, CAPITAL LETTER A WITH GRAVE ACCENT
287 	IsLetterTab[0xC1] = true;	// ?, CAPITAL LETTER A WITH ACUTE ACCENT
288 	IsLetterTab[0xC2] = true;	// ?, CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
289 	IsLetterTab[0xC3] = true;	// ?, CAPITAL LETTER A WITH TILDE
290 	IsLetterTab[0xC4] = true;	// ?, CAPITAL LETTER A WITH DIAERESIS
291 	IsLetterTab[0xC5] = true;	// ?, CAPITAL LETTER A WITH RING ABOVE
292 	IsLetterTab[0xC6] = true;	// ?, CAPITAL LIGATURE AE
293 	IsLetterTab[0xC7] = true;	// ?, CAPITAL LETTER C WITH CEDILLA
294 	IsLetterTab[0xC8] = true;	// ?, CAPITAL LETTER E WITH GRAVE ACCENT
295 	IsLetterTab[0xC9] = true;	// ?, CAPITAL LETTER E WITH ACUTE ACCENT
296 	IsLetterTab[0xCA] = true;	// ?, CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
297 	IsLetterTab[0xCB] = true;	// ?, CAPITAL LETTER E WITH DIAERESIS
298 	IsLetterTab[0xCC] = true;	// ?, CAPITAL LETTER I WITH GRAVE ACCENT
299 	IsLetterTab[0xCD] = true;	// ?, CAPITAL LETTER I WITH ACUTE ACCENT
300 	IsLetterTab[0xCE] = true;	// ?, CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
301 	IsLetterTab[0xCF] = true;	// ?, CAPITAL LETTER I WITH DIAERESIS
302 	IsLetterTab[0xD0] = true;	// ?, CAPITAL LETTER ETH
303 	IsLetterTab[0xD1] = true;	// ?, CAPITAL LETTER N WITH TILDE
304 	IsLetterTab[0xD2] = true;	// ?, CAPITAL LETTER O WITH GRAVE ACCENT
305 	IsLetterTab[0xD3] = true;	// ?, CAPITAL LETTER O WITH ACUTE ACCENT
306 	IsLetterTab[0xD4] = true;	// ?, CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
307 	IsLetterTab[0xD5] = true;	// ?, CAPITAL LETTER O WITH TILDE
308 	IsLetterTab[0xD6] = true;	// ?, CAPITAL LETTER O WITH DIAERESIS
309 	IsLetterTab[0xD8] = true;	// ?, CAPITAL LETTER O WITH STROKE
310 	IsLetterTab[0xD9] = true;	// ?, CAPITAL LETTER U WITH GRAVE ACCENT
311 	IsLetterTab[0xDA] = true;	// ?, CAPITAL LETTER U WITH ACUTE ACCENT
312 	IsLetterTab[0xDB] = true;	// ?, CAPITAL LETTER U WITH CIRCUMFLEX ACCENT
313 	IsLetterTab[0xDC] = true;	// ?, CAPITAL LETTER U WITH DIAERESIS
314 	IsLetterTab[0xDD] = true;	// ?, CAPITAL LETTER Y WITH ACUTE ACCENT
315 	IsLetterTab[0xDE] = true;	// ?, CAPITAL LETTER THORN
316 	IsLetterTab[0xDF] = true;	// ?, SMALL LETTER SHARP S
317 	IsLetterTab[0xE0] = true;	// ?, SMALL LETTER A WITH GRAVE ACCENT
318 	IsLetterTab[0xE1] = true;	// ?, SMALL LETTER A WITH ACUTE ACCENT
319 	IsLetterTab[0xE2] = true;	// ?, SMALL LETTER A WITH CIRCUMFLEX ACCENT
320 	IsLetterTab[0xE3] = true;	// ?, SMALL LETTER A WITH TILDE
321 	IsLetterTab[0xE4] = true;	// ?, SMALL LETTER A WITH DIAERESIS
322 	IsLetterTab[0xE5] = true;	// ?, SMALL LETTER A WITH RING ABOVE
323 	IsLetterTab[0xE6] = true;	// ?, SMALL LIGATURE AE
324 	IsLetterTab[0xE7] = true;	// ?, SMALL LETTER C WITH CEDILLA
325 	IsLetterTab[0xE8] = true;	// ?, SMALL LETTER E WITH GRAVE ACCENT
326 	IsLetterTab[0xE9] = true;	// ?, SMALL LETTER E WITH ACUTE ACCENT
327 	IsLetterTab[0xEA] = true;	// ?, SMALL LETTER E WITH CIRCUMFLEX ACCENT
328 	IsLetterTab[0xEB] = true;	// ?, SMALL LETTER E WITH DIAERESIS
329 	IsLetterTab[0xEC] = true;	// ?, SMALL LETTER I WITH GRAVE ACCENT
330 	IsLetterTab[0xED] = true;	// ?, SMALL LETTER I WITH ACUTE ACCENT
331 	IsLetterTab[0xEE] = true;	// ?, SMALL LETTER I WITH CIRCUMFLEX ACCENT
332 	IsLetterTab[0xEF] = true;	// ?, SMALL LETTER I WITH DIAERESIS
333 	IsLetterTab[0xF0] = true;	// ?, SMALL LETTER ETH
334 	IsLetterTab[0xF1] = true;	// ?, SMALL LETTER N WITH TILDE
335 	IsLetterTab[0xF2] = true;	// ?, SMALL LETTER O WITH GRAVE ACCENT
336 	IsLetterTab[0xF3] = true;	// ?, SMALL LETTER O WITH ACUTE ACCENT
337 	IsLetterTab[0xF4] = true;	// ?, SMALL LETTER O WITH CIRCUMFLEX ACCENT
338 	IsLetterTab[0xF5] = true;	// ?, SMALL LETTER O WITH TILDE
339 	IsLetterTab[0xF6] = true;	// ?, SMALL LETTER O WITH DIAERESIS
340 	IsLetterTab[0xF8] = true;	// ?, SMALL LETTER O WITH OBLIQUE BAR
341 	IsLetterTab[0xF9] = true;	// ?, SMALL LETTER U WITH GRAVE ACCENT
342 	IsLetterTab[0xFA] = true;	// ?, SMALL LETTER U WITH ACUTE ACCENT
343 	IsLetterTab[0xFB] = true;	// ?, SMALL LETTER U WITH CIRCUMFLEX ACCENT
344 	IsLetterTab[0xFC] = true;	// ?, SMALL LETTER U WITH DIAERESIS
345 	IsLetterTab[0xFD] = true;	// ?, SMALL LETTER Y WITH ACUTE ACCENT
346 	IsLetterTab[0xFE] = true;	// ?, SMALL LETTER THORN
347 	IsLetterTab[0xFF] = true;	// � , SMALL LETTER Y WITH DIAERESIS
348 }
349 
350 bool LetterTable::isLetterUnicode( sal_Unicode c )
351 {
352 	static CharClass* pCharClass = NULL;
353 	if( pCharClass == NULL )
354 		pCharClass = new CharClass( Application::GetSettings().GetLocale() );
355 	String aStr( c );
356 	bool bRet = pCharClass->isLetter( aStr, 0 );
357 	return bRet;
358 }
359 
360 // Hilfsfunktion: Zeichen-Flag Testen
361 sal_Bool SimpleTokenizer_Impl::testCharFlags( sal_Unicode c, sal_uInt16 nTestFlags )
362 {
363 	bool bRet = false;
364 	if( c != 0 && c <= 255 )
365 	{
366 		bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
367 	}
368 	else if( c > 255 )
369 	{
370 		bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
371 			? BasicSimpleCharClass::isAlpha( c, true ) : false;
372 	}
373 	return bRet;
374 }
375 
376 void SimpleTokenizer_Impl::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
377 {
378 	ppListKeyWords = ppKeyWords;
379 	nKeyWordCount = nCount;
380 }
381 
382 // Neues Token holen
383 sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
384 	/*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos )
385 {
386 	reType = TT_UNKNOWN;
387 
388 	// Position merken
389 	rpStartPos = mpActualPos;
390 
391 	// Zeichen untersuchen
392 	sal_Unicode c = peekChar();
393 	if( c == CHAR_EOF )
394 		return sal_False;
395 
396 	// Zeichen lesen
397 	getChar();
398 
399 	//*** Alle Moeglichkeiten durchgehen ***
400 	// Space?
401 	if ( (testCharFlags( c, CHAR_SPACE ) == sal_True) )
402 	{
403 		while( testCharFlags( peekChar(), CHAR_SPACE ) == sal_True )
404 			getChar();
405 
406 		reType = TT_WHITESPACE;
407 	}
408 
409 	// Identifier?
410 	else if ( (testCharFlags( c, CHAR_START_IDENTIFIER ) == sal_True) )
411 	{
412 		sal_Bool bIdentifierChar;
413 		do
414 		{
415 			// Naechstes Zeichen holen
416 			c = peekChar();
417 			bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
418 			if( bIdentifierChar )
419 				getChar();
420 		}
421 		while( bIdentifierChar );
422 
423 		reType = TT_IDENTIFIER;
424 
425 		// Schluesselwort-Tabelle
426 		if (ppListKeyWords != NULL)
427 		{
428 			int nCount = mpActualPos - rpStartPos;
429 
430 			// No keyword if string contains char > 255
431 			bool bCanBeKeyword = true;
432 			for( int i = 0 ; i < nCount ; i++ )
433 			{
434 				if( rpStartPos[i] > 255 )
435 				{
436 					bCanBeKeyword = false;
437 					break;
438 				}
439 			}
440 
441 			if( bCanBeKeyword )
442 			{
443 				String aKWString(rpStartPos, sal::static_int_cast< xub_StrLen >(nCount) );
444 				ByteString aByteStr( aKWString, RTL_TEXTENCODING_ASCII_US );
445 				aByteStr.ToLowerAscii();
446 				if ( bsearch( aByteStr.GetBuffer(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
447 																		compare_strings ) )
448 				{
449 					reType = TT_KEYWORDS;
450 
451 					if ( aByteStr.Equals( "rem" ) )
452 					{
453 						// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
454 						sal_Unicode cPeek = peekChar();
455 						while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
456 						{
457 							c = getChar();
458 							cPeek = peekChar();
459 						}
460 
461 						reType = TT_COMMENT;
462 					}
463 				}
464 			}
465 		}
466 	}
467 
468 	// Operator?
469 	// only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
470 	else if ( ( testCharFlags( c, CHAR_OPERATOR ) == sal_True ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
471 	{
472 		// paramters for SQL view
473 		if ( (c==':') || (c=='?'))
474 		{
475 			if (c!='?')
476 			{
477 				sal_Bool bIdentifierChar;
478 				do
479 				{
480 					// Naechstes Zeichen holen
481 					c = peekChar();
482 					bIdentifierChar =  BasicSimpleCharClass::isAlpha( c, true );
483 					if( bIdentifierChar )
484 						getChar();
485 				}
486 				while( bIdentifierChar );
487 			}
488 			reType = TT_PARAMETER;
489 		}
490 		else if ((c=='-'))
491 		{
492 			sal_Unicode cPeekNext = peekChar();
493 			if (cPeekNext=='-')
494 			{
495 				// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
496 				while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
497 				{
498 					getChar();
499 					cPeekNext = peekChar();
500 				}
501 				reType = TT_COMMENT;
502 			}
503 		}
504        else if (c=='/')
505        {
506            sal_Unicode cPeekNext = peekChar();
507            if (cPeekNext=='/')
508            {
509                // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
510                while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
511                {
512                    getChar();
513                    cPeekNext = peekChar();
514                }
515                reType = TT_COMMENT;
516            }
517        }
518 		else
519 		{
520 			// Kommentar ?
521 			if ( c == '\'' )
522 			{
523 				c = getChar();	// '/' entfernen
524 
525 				// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
526 				sal_Unicode cPeek = c;
527 				while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
528 				{
529 					getChar();
530 					cPeek = peekChar();
531 				}
532 
533 				reType = TT_COMMENT;
534 			}
535 
536 			// Echter Operator, kann hier einfach behandelt werden,
537 			// da nicht der wirkliche Operator, wie z.B. += interessiert,
538 			// sondern nur die Tatsache, dass es sich um einen handelt.
539 			if( reType != TT_COMMENT )
540 			{
541 				reType = TT_OPERATOR;
542 			}
543 
544 		}
545 	}
546 
547 	// Objekt-Trenner? Muss vor Number abgehandelt werden
548 	else if( c == '.' && ( peekChar() < '0' || peekChar() > '9' ) )
549 	{
550 		reType = TT_OPERATOR;
551 	}
552 
553 	// Zahl?
554 	else if( testCharFlags( c, CHAR_START_NUMBER ) == sal_True )
555 	{
556 		reType = TT_NUMBER;
557 
558 		// Zahlensystem, 10 = normal, wird bei Oct/Hex geaendert
559 		int nRadix = 10;
560 
561 		// Ist es eine Hex- oder Oct-Zahl?
562 		if( c == '&' )
563 		{
564 			// Octal?
565 			if( peekChar() == 'o' || peekChar() == 'O' )
566 			{
567 				// o entfernen
568 				getChar();
569 				nRadix = 8; 	// Octal-Basis
570 
571 				// Alle Ziffern einlesen
572 				while( testCharFlags( peekChar(), CHAR_IN_OCT_NUMBER ) )
573 					c = getChar();
574 			}
575 			// Hex?
576 			else if( peekChar() == 'h' || peekChar() == 'H' )
577 			{
578 				// x entfernen
579 				getChar();
580 				nRadix = 16;	 // Hex-Basis
581 
582 				// Alle Ziffern einlesen und puffern
583 				while( testCharFlags( peekChar(), CHAR_IN_HEX_NUMBER ) )
584 					c = getChar();
585 			}
586 			else
587 			{
588 				reType = TT_OPERATOR;
589 			}
590 		}
591 
592 		// Wenn nicht Oct oder Hex als double ansehen
593 		if( reType == TT_NUMBER && nRadix == 10 )
594 		{
595 			// Flag, ob das letzte Zeichen ein Exponent war
596 			sal_Bool bAfterExpChar = sal_False;
597 
598 			// Alle Ziffern einlesen
599 			while( testCharFlags( peekChar(), CHAR_IN_NUMBER ) ||
600 					(bAfterExpChar && peekChar() == '+' ) ||
601 					(bAfterExpChar && peekChar() == '-' ) )
602 					// Nach Exponent auch +/- OK
603 			{
604 				c = getChar();					// Zeichen lesen
605 				bAfterExpChar = ( c == 'e' || c == 'E' );
606 			}
607 		}
608 
609 		// reType = TT_NUMBER;
610 	}
611 
612 	// String?
613 	else if( testCharFlags( c, CHAR_START_STRING ) == sal_True )
614 	{
615 		// Merken, welches Zeichen den String eroeffnet hat
616 		sal_Unicode cEndString = c;
617 		if( c == '[' )
618 			cEndString = ']';
619 
620 		// Alle Ziffern einlesen und puffern
621 		while( peekChar() != cEndString )
622 		{
623 			// #58846 EOF vor getChar() abfangen, damit EOF micht verloren geht
624 			if( peekChar() == CHAR_EOF )
625 			{
626 				// ERROR: unterminated string literal
627 				reType = TT_ERROR;
628 				break;
629 			}
630 			c = getChar();
631 			if( testCharFlags( c, CHAR_EOL ) == sal_True )
632 			{
633 				// ERROR: unterminated string literal
634 				reType = TT_ERROR;
635 				break;
636 			}
637 		}
638 
639 		//	Zeichen lesen
640 		if( reType != TT_ERROR )
641 		{
642 			getChar();
643 			if( cEndString == ']' )
644 				reType = TT_IDENTIFIER;
645 			else
646 				reType = TT_STRING;
647 		}
648 	}
649 
650 	// Zeilenende?
651 	else if( testCharFlags( c, CHAR_EOL ) == sal_True )
652 	{
653 		// Falls ein weiteres anderes EOL-Char folgt, weg damit
654 		sal_Unicode cNext = peekChar();
655 		if( cNext != c && testCharFlags( cNext, CHAR_EOL ) == sal_True )
656 			getChar();
657 
658 		// Positions-Daten auf Zeilen-Beginn setzen
659 		nCol = 0;
660 		nLine++;
661 
662 		reType = TT_EOL;
663 	}
664 
665 	// Alles andere bleibt TT_UNKNOWN
666 
667 
668 	// End-Position eintragen
669 	rpEndPos = mpActualPos;
670 	return sal_True;
671 }
672 
673 String SimpleTokenizer_Impl::getTokStr
674 	( /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
675 {
676 	return String( pStartPos, (sal_uInt16)( pEndPos - pStartPos ) );
677 }
678 
#ifdef DBG_UTIL
// Debug helper: format a token as "<type label><token text>\n".
// For TT_EOL only the label and the newline are emitted. A token type
// without a case below produces no label.
String SimpleTokenizer_Impl::getFullTokenStr( /*out*/TokenTypes eType,
	/*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
{
	String aDump;

	// Type label
	switch( eType )
	{
		case TT_UNKNOWN:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_UNKNOWN:") );
			break;
		case TT_IDENTIFIER:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_IDENTIFIER:") );
			break;
		case TT_WHITESPACE:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_WHITESPACE:") );
			break;
		case TT_NUMBER:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_NUMBER:") );
			break;
		case TT_STRING:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_STRING:") );
			break;
		case TT_EOL:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_EOL:") );
			break;
		case TT_COMMENT:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_COMMENT:") );
			break;
		case TT_ERROR:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_ERROR:") );
			break;
		case TT_OPERATOR:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_OPERATOR:") );
			break;
		case TT_KEYWORDS:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_KEYWORD:") );
			break;
		case TT_PARAMETER:
			aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_PARAMETER:") );
			break;
	}

	// Token text, except for line ends
	if( eType != TT_EOL )
	{
		const sal_uInt16 nTokenLen = (sal_uInt16)( pEndPos - pStartPos );
		aDump += String( pStartPos, nTokenLen );
	}

	aDump += String( RTL_CONSTASCII_USTRINGPARAM("\n") );
	return aDump;
}
#endif
707 
708 SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLanguage(aLang)
709 {
710 	memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
711 
712 	// Zeichen-Tabelle fuellen
713 	sal_uInt16 i;
714 
715 	// Zulaessige Zeichen fuer Identifier
716 	sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
717 	for( i = 'a' ; i <= 'z' ; i++ )
718 		aCharTypeTab[i] |= nHelpMask;
719 	for( i = 'A' ; i <= 'Z' ; i++ )
720 		aCharTypeTab[i] |= nHelpMask;
721 	// '_' extra eintragen
722 	aCharTypeTab[(int)'_'] |= nHelpMask;
723 	// AB 23.6.97: '$' ist auch erlaubt
724 	aCharTypeTab[(int)'$'] |= nHelpMask;
725 
726 	// Ziffern (Identifier und Number ist moeglich)
727 	nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
728 						 CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
729 	for( i = '0' ; i <= '9' ; i++ )
730 		aCharTypeTab[i] |= nHelpMask;
731 
732 	// e und E sowie . von Hand ergaenzen
733 	aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
734 	aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
735 	aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
736 	aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
737 
738 	// Hex-Ziffern
739 	for( i = 'a' ; i <= 'f' ; i++ )
740 		aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
741 	for( i = 'A' ; i <= 'F' ; i++ )
742 		aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
743 
744 	// Oct-Ziffern
745 	for( i = '0' ; i <= '7' ; i++ )
746 		aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
747 
748 	// String-Beginn/End-Zeichen
749 	aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
750 	aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
751 	aCharTypeTab[(int)'[']  |= CHAR_START_STRING;
752 	aCharTypeTab[(int)'`']  |= CHAR_START_STRING;
753 
754 	// Operator-Zeichen
755 	aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
756 	aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
757 	// aCharTypeTab[(int)'&'] |= CHAR_OPERATOR;		Removed because of #i14140
758 	aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
759 	aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
760 	aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
761 	aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
762 	aCharTypeTab[(int)','] |= CHAR_OPERATOR;
763 	aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
764 	aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
765 	aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
766 	aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
767 	aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
768 	aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
769 	aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
770 	aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
771 	aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
772 	aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
773 	aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
774 	aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
775 	// aCharTypeTab[(int)'['] |= CHAR_OPERATOR;		Removed because of #i17826
776 	aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
777 	aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
778 
779 	// Space
780 	aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
781 	aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
782 
783 	// Zeilen-Ende-Zeichen
784 	aCharTypeTab[(int)'\r'] |= CHAR_EOL;
785 	aCharTypeTab[(int)'\n'] |= CHAR_EOL;
786 
787 	ppListKeyWords = NULL;
788 }
789 
790 SimpleTokenizer_Impl::~SimpleTokenizer_Impl( void )
791 {
792 }
793 
794 SimpleTokenizer_Impl* getSimpleTokenizer( void )
795 {
796 	static SimpleTokenizer_Impl* pSimpleTokenizer = NULL;
797 	if( !pSimpleTokenizer )
798 		pSimpleTokenizer = new SimpleTokenizer_Impl();
799 	return pSimpleTokenizer;
800 }
801 
802 // Heraussuchen der jeweils naechsten Funktion aus einem JavaScript-Modul
803 sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String* aSource )
804 {
805 	// Position auf den Anfang des Source-Strings setzen
806 	mpStringBegin = mpActualPos = aSource->GetBuffer();
807 
808 	// Zeile und Spalte initialisieren
809 	nLine = nParseLine;
810 	nCol = 0L;
811 
812 	// Variablen fuer die Out-Parameter
813 	TokenTypes eType;
814 	const sal_Unicode* pStartPos;
815 	const sal_Unicode* pEndPos;
816 
817 	// Schleife ueber alle Tokens
818 	sal_uInt16 nTokenCount = 0;
819 	while( getNextToken( eType, pStartPos, pEndPos ) )
820 		nTokenCount++;
821 
822 	return nTokenCount;
823 }
824 
825 void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const String& rLine,
826 													/*out*/HighlightPortions& portions  )
827 {
828 	// Position auf den Anfang des Source-Strings setzen
829 	mpStringBegin = mpActualPos = rLine.GetBuffer();
830 
831 	// Zeile und Spalte initialisieren
832 	nLine = nParseLine;
833 	nCol = 0L;
834 
835 	// Variablen fuer die Out-Parameter
836 	TokenTypes eType;
837 	const sal_Unicode* pStartPos;
838 	const sal_Unicode* pEndPos;
839 
840 	// Schleife ueber alle Tokens
841 	while( getNextToken( eType, pStartPos, pEndPos ) )
842 	{
843 		HighlightPortion portion;
844 
845 		portion.nBegin = (sal_uInt16)(pStartPos - mpStringBegin);
846 		portion.nEnd = (sal_uInt16)(pEndPos - mpStringBegin);
847 		portion.tokenType = eType;
848 
849         portions.push_back(portion);
850 	}
851 }
852 
853 
//////////////////////////////////////////////////////////////////////////
// Implementation of SyntaxHighlighter
856 
857 SyntaxHighlighter::SyntaxHighlighter()
858 {
859 	m_pSimpleTokenizer = 0;
860 	m_pKeyWords = NULL;
861 	m_nKeyWordCount = 0;
862 }
863 
864 SyntaxHighlighter::~SyntaxHighlighter()
865 {
866 	delete m_pSimpleTokenizer;
867 	delete m_pKeyWords;
868 }
869 
870 void SyntaxHighlighter::initialize( HighlighterLanguage eLanguage_ )
871 {
872 	eLanguage = eLanguage_;
873 	delete m_pSimpleTokenizer;
874 	m_pSimpleTokenizer = new SimpleTokenizer_Impl(eLanguage);
875 
876 	switch (eLanguage)
877 	{
878 		case HIGHLIGHT_BASIC:
879 			m_pSimpleTokenizer->setKeyWords( strListBasicKeyWords,
880 											sizeof( strListBasicKeyWords ) / sizeof( char* ));
881 			break;
882 		case HIGHLIGHT_SQL:
883 			m_pSimpleTokenizer->setKeyWords( strListSqlKeyWords,
884 											sizeof( strListSqlKeyWords ) / sizeof( char* ));
885 			break;
886 		default:
887 			m_pSimpleTokenizer->setKeyWords( NULL, 0 );
888 	}
889 }
890 
891 const Range SyntaxHighlighter::notifyChange( sal_uInt32 nLine, sal_Int32 nLineCountDifference,
892 								const String* pChangedLines, sal_uInt32 nArrayLength)
893 {
894     (void)nLineCountDifference;
895 
896 	for( sal_uInt32 i=0 ; i < nArrayLength ; i++ )
897 		m_pSimpleTokenizer->parseLine(nLine+i, &pChangedLines[i]);
898 
899 	return Range( nLine, nLine + nArrayLength-1 );
900 }
901 
902 void SyntaxHighlighter::getHighlightPortions( sal_uInt32 nLine, const String& rLine,
903 											/*out*/HighlightPortions& portions )
904 {
905 	m_pSimpleTokenizer->getHighlightPortions( nLine, rLine, portions );
906 }
907