xref: /trunk/main/svtools/source/edit/syntaxhighlight.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_svtools.hxx"
30 
31 #include <svtools/syntaxhighlight.hxx>
32 
33 #include <unotools/charclass.hxx>
34 #include <tools/debug.hxx>
35 
36 
// ##########################################################################
// ATTENTION: all these words must be lowercase and listed in ascending
// strcmp() order, because the tables are searched with bsearch()
// ##########################################################################
// BASIC keyword table, one row per initial letter. It is handed to the
// tokenizer and searched with bsearch()/strcmp(), so new entries must be
// lowercase and inserted at their strcmp() position.
static const char* strListBasicKeyWords[] = {
    "access", "alias", "and", "any", "append", "as",
    "base", "binary", "boolean", "byref", "byte", "byval",
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "double",
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end select", "end sub", "end type", "endif", "enum", "eqv", "erase",
    "error", "exit", "explicit",
    "for", "function",
    "get", "global", "gosub", "goto",
    "if", "imp", "implements", "in", "input", "integer", "is",
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    "mod",
    "name", "new", "next", "not",
    "object", "on", "open", "option", "optional", "or", "output",
    "preserve", "print", "private", "property", "public",
    "random", "read", "redim", "rem", "resume", "return", "rset",
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    "text", "then", "to", "type", "typeof",
    "until",
    "variant",
    "wend", "while", "with", "write",
    "xor"
};
166 
167 
// SQL keyword table, one row per initial letter. Same rules as for the BASIC
// table: lowercase entries only, kept in ascending strcmp() order because the
// tokenizer looks words up with bsearch().
static const char* strListSqlKeyWords[] = {
    "all", "and", "any", "as", "asc", "avg",
    "between", "by",
    "cast", "corresponding", "count", "create", "cross",
    "delete", "desc", "distinct", "drop",
    "escape", "except", "exists",
    "false", "from", "full",
    "global", "group",
    "having",
    "in", "inner", "insert", "intersect", "into", "is",
    "join",
    "left", "like", "local",
    "match", "max", "min",
    "natural", "not", "null",
    "on", "or", "order", "outer",
    "right",
    "select", "set", "some", "sum",
    "table", "temporary", "true",
    "union", "unique", "unknown", "update", "using",
    "values",
    "where"
};
231 
232 
233 extern "C" int CDECL compare_strings( const void *arg1, const void *arg2 )
234 {
235     return strcmp( (char *)arg1, *(char **)arg2 );
236 }
237 
238 
239 class LetterTable
240 {
241     bool        IsLetterTab[256];
242 
243 public:
244     LetterTable( void );
245 
246     inline bool isLetter( sal_Unicode c )
247     {
248         bool bRet = (c < 256) ? IsLetterTab[c] : isLetterUnicode( c );
249         return bRet;
250     }
251     bool isLetterUnicode( sal_Unicode c );
252 };
253 
254 class BasicSimpleCharClass
255 {
256     static LetterTable aLetterTable;
257 
258 public:
259     static sal_Bool isAlpha( sal_Unicode c, bool bCompatible )
260     {
261         sal_Bool bRet = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
262                     || (bCompatible && aLetterTable.isLetter( c ));
263         return bRet;
264     }
265 
266     static sal_Bool isDigit( sal_Unicode c )
267     {
268         sal_Bool bRet = (c >= '0' && c <= '9');
269         return bRet;
270     }
271 
272     static sal_Bool isAlphaNumeric( sal_Unicode c, bool bCompatible )
273     {
274         sal_Bool bRet = isDigit( c ) || isAlpha( c, bCompatible );
275         return bRet;
276     }
277 };
278 
279 LetterTable BasicSimpleCharClass::aLetterTable;
280 
281 LetterTable::LetterTable( void )
282 {
283     for( int i = 0 ; i < 256 ; ++i )
284         IsLetterTab[i] = false;
285 
286     IsLetterTab[0xC0] = true;   // ?, CAPITAL LETTER A WITH GRAVE ACCENT
287     IsLetterTab[0xC1] = true;   // ?, CAPITAL LETTER A WITH ACUTE ACCENT
288     IsLetterTab[0xC2] = true;   // ?, CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
289     IsLetterTab[0xC3] = true;   // ?, CAPITAL LETTER A WITH TILDE
290     IsLetterTab[0xC4] = true;   // ?, CAPITAL LETTER A WITH DIAERESIS
291     IsLetterTab[0xC5] = true;   // ?, CAPITAL LETTER A WITH RING ABOVE
292     IsLetterTab[0xC6] = true;   // ?, CAPITAL LIGATURE AE
293     IsLetterTab[0xC7] = true;   // ?, CAPITAL LETTER C WITH CEDILLA
294     IsLetterTab[0xC8] = true;   // ?, CAPITAL LETTER E WITH GRAVE ACCENT
295     IsLetterTab[0xC9] = true;   // ?, CAPITAL LETTER E WITH ACUTE ACCENT
296     IsLetterTab[0xCA] = true;   // ?, CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
297     IsLetterTab[0xCB] = true;   // ?, CAPITAL LETTER E WITH DIAERESIS
298     IsLetterTab[0xCC] = true;   // ?, CAPITAL LETTER I WITH GRAVE ACCENT
299     IsLetterTab[0xCD] = true;   // ?, CAPITAL LETTER I WITH ACUTE ACCENT
300     IsLetterTab[0xCE] = true;   // ?, CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
301     IsLetterTab[0xCF] = true;   // ?, CAPITAL LETTER I WITH DIAERESIS
302     IsLetterTab[0xD0] = true;   // ?, CAPITAL LETTER ETH
303     IsLetterTab[0xD1] = true;   // ?, CAPITAL LETTER N WITH TILDE
304     IsLetterTab[0xD2] = true;   // ?, CAPITAL LETTER O WITH GRAVE ACCENT
305     IsLetterTab[0xD3] = true;   // ?, CAPITAL LETTER O WITH ACUTE ACCENT
306     IsLetterTab[0xD4] = true;   // ?, CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
307     IsLetterTab[0xD5] = true;   // ?, CAPITAL LETTER O WITH TILDE
308     IsLetterTab[0xD6] = true;   // ?, CAPITAL LETTER O WITH DIAERESIS
309     IsLetterTab[0xD8] = true;   // ?, CAPITAL LETTER O WITH STROKE
310     IsLetterTab[0xD9] = true;   // ?, CAPITAL LETTER U WITH GRAVE ACCENT
311     IsLetterTab[0xDA] = true;   // ?, CAPITAL LETTER U WITH ACUTE ACCENT
312     IsLetterTab[0xDB] = true;   // ?, CAPITAL LETTER U WITH CIRCUMFLEX ACCENT
313     IsLetterTab[0xDC] = true;   // ?, CAPITAL LETTER U WITH DIAERESIS
314     IsLetterTab[0xDD] = true;   // ?, CAPITAL LETTER Y WITH ACUTE ACCENT
315     IsLetterTab[0xDE] = true;   // ?, CAPITAL LETTER THORN
316     IsLetterTab[0xDF] = true;   // ?, SMALL LETTER SHARP S
317     IsLetterTab[0xE0] = true;   // ?, SMALL LETTER A WITH GRAVE ACCENT
318     IsLetterTab[0xE1] = true;   // ?, SMALL LETTER A WITH ACUTE ACCENT
319     IsLetterTab[0xE2] = true;   // ?, SMALL LETTER A WITH CIRCUMFLEX ACCENT
320     IsLetterTab[0xE3] = true;   // ?, SMALL LETTER A WITH TILDE
321     IsLetterTab[0xE4] = true;   // ?, SMALL LETTER A WITH DIAERESIS
322     IsLetterTab[0xE5] = true;   // ?, SMALL LETTER A WITH RING ABOVE
323     IsLetterTab[0xE6] = true;   // ?, SMALL LIGATURE AE
324     IsLetterTab[0xE7] = true;   // ?, SMALL LETTER C WITH CEDILLA
325     IsLetterTab[0xE8] = true;   // ?, SMALL LETTER E WITH GRAVE ACCENT
326     IsLetterTab[0xE9] = true;   // ?, SMALL LETTER E WITH ACUTE ACCENT
327     IsLetterTab[0xEA] = true;   // ?, SMALL LETTER E WITH CIRCUMFLEX ACCENT
328     IsLetterTab[0xEB] = true;   // ?, SMALL LETTER E WITH DIAERESIS
329     IsLetterTab[0xEC] = true;   // ?, SMALL LETTER I WITH GRAVE ACCENT
330     IsLetterTab[0xED] = true;   // ?, SMALL LETTER I WITH ACUTE ACCENT
331     IsLetterTab[0xEE] = true;   // ?, SMALL LETTER I WITH CIRCUMFLEX ACCENT
332     IsLetterTab[0xEF] = true;   // ?, SMALL LETTER I WITH DIAERESIS
333     IsLetterTab[0xF0] = true;   // ?, SMALL LETTER ETH
334     IsLetterTab[0xF1] = true;   // ?, SMALL LETTER N WITH TILDE
335     IsLetterTab[0xF2] = true;   // ?, SMALL LETTER O WITH GRAVE ACCENT
336     IsLetterTab[0xF3] = true;   // ?, SMALL LETTER O WITH ACUTE ACCENT
337     IsLetterTab[0xF4] = true;   // ?, SMALL LETTER O WITH CIRCUMFLEX ACCENT
338     IsLetterTab[0xF5] = true;   // ?, SMALL LETTER O WITH TILDE
339     IsLetterTab[0xF6] = true;   // ?, SMALL LETTER O WITH DIAERESIS
340     IsLetterTab[0xF8] = true;   // ?, SMALL LETTER O WITH OBLIQUE BAR
341     IsLetterTab[0xF9] = true;   // ?, SMALL LETTER U WITH GRAVE ACCENT
342     IsLetterTab[0xFA] = true;   // ?, SMALL LETTER U WITH ACUTE ACCENT
343     IsLetterTab[0xFB] = true;   // ?, SMALL LETTER U WITH CIRCUMFLEX ACCENT
344     IsLetterTab[0xFC] = true;   // ?, SMALL LETTER U WITH DIAERESIS
345     IsLetterTab[0xFD] = true;   // ?, SMALL LETTER Y WITH ACUTE ACCENT
346     IsLetterTab[0xFE] = true;   // ?, SMALL LETTER THORN
347     IsLetterTab[0xFF] = true;   // � , SMALL LETTER Y WITH DIAERESIS
348 }
349 
350 bool LetterTable::isLetterUnicode( sal_Unicode c )
351 {
352     static CharClass* pCharClass = NULL;
353     if( pCharClass == NULL )
354         pCharClass = new CharClass( Application::GetSettings().GetLocale() );
355     String aStr( c );
356     bool bRet = pCharClass->isLetter( aStr, 0 );
357     return bRet;
358 }
359 
360 // Hilfsfunktion: Zeichen-Flag Testen
361 sal_Bool SimpleTokenizer_Impl::testCharFlags( sal_Unicode c, sal_uInt16 nTestFlags )
362 {
363     bool bRet = false;
364     if( c != 0 && c <= 255 )
365     {
366         bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
367     }
368     else if( c > 255 )
369     {
370         bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
371             ? BasicSimpleCharClass::isAlpha( c, true ) : false;
372     }
373     return bRet;
374 }
375 
376 void SimpleTokenizer_Impl::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
377 {
378     ppListKeyWords = ppKeyWords;
379     nKeyWordCount = nCount;
380 }
381 
382 // Neues Token holen
383 sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
384     /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos )
385 {
386     reType = TT_UNKNOWN;
387 
388     // Position merken
389     rpStartPos = mpActualPos;
390 
391     // Zeichen untersuchen
392     sal_Unicode c = peekChar();
393     if( c == CHAR_EOF )
394         return sal_False;
395 
396     // Zeichen lesen
397     getChar();
398 
399     //*** Alle Moeglichkeiten durchgehen ***
400     // Space?
401     if ( (testCharFlags( c, CHAR_SPACE ) == sal_True) )
402     {
403         while( testCharFlags( peekChar(), CHAR_SPACE ) == sal_True )
404             getChar();
405 
406         reType = TT_WHITESPACE;
407     }
408 
409     // Identifier?
410     else if ( (testCharFlags( c, CHAR_START_IDENTIFIER ) == sal_True) )
411     {
412         sal_Bool bIdentifierChar;
413         do
414         {
415             // Naechstes Zeichen holen
416             c = peekChar();
417             bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
418             if( bIdentifierChar )
419                 getChar();
420         }
421         while( bIdentifierChar );
422 
423         reType = TT_IDENTIFIER;
424 
425         // Schluesselwort-Tabelle
426         if (ppListKeyWords != NULL)
427         {
428             int nCount = mpActualPos - rpStartPos;
429 
430             // No keyword if string contains char > 255
431             bool bCanBeKeyword = true;
432             for( int i = 0 ; i < nCount ; i++ )
433             {
434                 if( rpStartPos[i] > 255 )
435                 {
436                     bCanBeKeyword = false;
437                     break;
438                 }
439             }
440 
441             if( bCanBeKeyword )
442             {
443                 String aKWString(rpStartPos, sal::static_int_cast< xub_StrLen >(nCount) );
444                 ByteString aByteStr( aKWString, RTL_TEXTENCODING_ASCII_US );
445                 aByteStr.ToLowerAscii();
446                 if ( bsearch( aByteStr.GetBuffer(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
447                                                                         compare_strings ) )
448                 {
449                     reType = TT_KEYWORDS;
450 
451                     if ( aByteStr.Equals( "rem" ) )
452                     {
453                         // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
454                         sal_Unicode cPeek = peekChar();
455                         while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
456                         {
457                             c = getChar();
458                             cPeek = peekChar();
459                         }
460 
461                         reType = TT_COMMENT;
462                     }
463                 }
464             }
465         }
466     }
467 
468     // Operator?
469     // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
470     else if ( ( testCharFlags( c, CHAR_OPERATOR ) == sal_True ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
471     {
472         // paramters for SQL view
473         if ( (c==':') || (c=='?'))
474         {
475             if (c!='?')
476             {
477                 sal_Bool bIdentifierChar;
478                 do
479                 {
480                     // Naechstes Zeichen holen
481                     c = peekChar();
482                     bIdentifierChar =  BasicSimpleCharClass::isAlpha( c, true );
483                     if( bIdentifierChar )
484                         getChar();
485                 }
486                 while( bIdentifierChar );
487             }
488             reType = TT_PARAMETER;
489         }
490         else if ((c=='-'))
491         {
492             sal_Unicode cPeekNext = peekChar();
493             if (cPeekNext=='-')
494             {
495                 // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
496                 while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
497                 {
498                     getChar();
499                     cPeekNext = peekChar();
500                 }
501                 reType = TT_COMMENT;
502             }
503         }
504        else if (c=='/')
505        {
506            sal_Unicode cPeekNext = peekChar();
507            if (cPeekNext=='/')
508            {
509                // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
510                while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
511                {
512                    getChar();
513                    cPeekNext = peekChar();
514                }
515                reType = TT_COMMENT;
516            }
517        }
518         else
519         {
520             // Kommentar ?
521             if ( c == '\'' )
522             {
523                 c = getChar();  // '/' entfernen
524 
525                 // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
526                 sal_Unicode cPeek = c;
527                 while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
528                 {
529                     getChar();
530                     cPeek = peekChar();
531                 }
532 
533                 reType = TT_COMMENT;
534             }
535 
536             // Echter Operator, kann hier einfach behandelt werden,
537             // da nicht der wirkliche Operator, wie z.B. += interessiert,
538             // sondern nur die Tatsache, dass es sich um einen handelt.
539             if( reType != TT_COMMENT )
540             {
541                 reType = TT_OPERATOR;
542             }
543 
544         }
545     }
546 
547     // Objekt-Trenner? Muss vor Number abgehandelt werden
548     else if( c == '.' && ( peekChar() < '0' || peekChar() > '9' ) )
549     {
550         reType = TT_OPERATOR;
551     }
552 
553     // Zahl?
554     else if( testCharFlags( c, CHAR_START_NUMBER ) == sal_True )
555     {
556         reType = TT_NUMBER;
557 
558         // Zahlensystem, 10 = normal, wird bei Oct/Hex geaendert
559         int nRadix = 10;
560 
561         // Ist es eine Hex- oder Oct-Zahl?
562         if( c == '&' )
563         {
564             // Octal?
565             if( peekChar() == 'o' || peekChar() == 'O' )
566             {
567                 // o entfernen
568                 getChar();
569                 nRadix = 8;     // Octal-Basis
570 
571                 // Alle Ziffern einlesen
572                 while( testCharFlags( peekChar(), CHAR_IN_OCT_NUMBER ) )
573                     c = getChar();
574             }
575             // Hex?
576             else if( peekChar() == 'h' || peekChar() == 'H' )
577             {
578                 // x entfernen
579                 getChar();
580                 nRadix = 16;     // Hex-Basis
581 
582                 // Alle Ziffern einlesen und puffern
583                 while( testCharFlags( peekChar(), CHAR_IN_HEX_NUMBER ) )
584                     c = getChar();
585             }
586             else
587             {
588                 reType = TT_OPERATOR;
589             }
590         }
591 
592         // Wenn nicht Oct oder Hex als double ansehen
593         if( reType == TT_NUMBER && nRadix == 10 )
594         {
595             // Flag, ob das letzte Zeichen ein Exponent war
596             sal_Bool bAfterExpChar = sal_False;
597 
598             // Alle Ziffern einlesen
599             while( testCharFlags( peekChar(), CHAR_IN_NUMBER ) ||
600                     (bAfterExpChar && peekChar() == '+' ) ||
601                     (bAfterExpChar && peekChar() == '-' ) )
602                     // Nach Exponent auch +/- OK
603             {
604                 c = getChar();                  // Zeichen lesen
605                 bAfterExpChar = ( c == 'e' || c == 'E' );
606             }
607         }
608 
609         // reType = TT_NUMBER;
610     }
611 
612     // String?
613     else if( testCharFlags( c, CHAR_START_STRING ) == sal_True )
614     {
615         // Merken, welches Zeichen den String eroeffnet hat
616         sal_Unicode cEndString = c;
617         if( c == '[' )
618             cEndString = ']';
619 
620         // Alle Ziffern einlesen und puffern
621         while( peekChar() != cEndString )
622         {
623             // #58846 EOF vor getChar() abfangen, damit EOF micht verloren geht
624             if( peekChar() == CHAR_EOF )
625             {
626                 // ERROR: unterminated string literal
627                 reType = TT_ERROR;
628                 break;
629             }
630             c = getChar();
631             if( testCharFlags( c, CHAR_EOL ) == sal_True )
632             {
633                 // ERROR: unterminated string literal
634                 reType = TT_ERROR;
635                 break;
636             }
637         }
638 
639         //  Zeichen lesen
640         if( reType != TT_ERROR )
641         {
642             getChar();
643             if( cEndString == ']' )
644                 reType = TT_IDENTIFIER;
645             else
646                 reType = TT_STRING;
647         }
648     }
649 
650     // Zeilenende?
651     else if( testCharFlags( c, CHAR_EOL ) == sal_True )
652     {
653         // Falls ein weiteres anderes EOL-Char folgt, weg damit
654         sal_Unicode cNext = peekChar();
655         if( cNext != c && testCharFlags( cNext, CHAR_EOL ) == sal_True )
656             getChar();
657 
658         // Positions-Daten auf Zeilen-Beginn setzen
659         nCol = 0;
660         nLine++;
661 
662         reType = TT_EOL;
663     }
664 
665     // Alles andere bleibt TT_UNKNOWN
666 
667 
668     // End-Position eintragen
669     rpEndPos = mpActualPos;
670     return sal_True;
671 }
672 
673 String SimpleTokenizer_Impl::getTokStr
674     ( /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
675 {
676     return String( pStartPos, (sal_uInt16)( pEndPos - pStartPos ) );
677 }
678 
679 #ifdef DBG_UTIL
680 // TEST: Token ausgeben
681 String SimpleTokenizer_Impl::getFullTokenStr( /*out*/TokenTypes eType,
682     /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
683 {
684     String aOut;
685     switch( eType )
686     {
687         case TT_UNKNOWN:    aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_UNKNOWN:") ); break;
688         case TT_IDENTIFIER: aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_IDENTIFIER:") ); break;
689         case TT_WHITESPACE: aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_WHITESPACE:") ); break;
690         case TT_NUMBER:     aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_NUMBER:") ); break;
691         case TT_STRING:     aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_STRING:") ); break;
692         case TT_EOL:        aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_EOL:") ); break;
693         case TT_COMMENT:    aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_COMMENT:") ); break;
694         case TT_ERROR:      aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_ERROR:") ); break;
695         case TT_OPERATOR:   aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_OPERATOR:") ); break;
696         case TT_KEYWORDS:   aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_KEYWORD:") ); break;
697         case TT_PARAMETER:  aOut = String( RTL_CONSTASCII_USTRINGPARAM("TT_PARAMETER:") ); break;
698     }
699     if( eType != TT_EOL )
700     {
701         aOut += String( pStartPos, (sal_uInt16)( pEndPos - pStartPos ) );
702     }
703     aOut += String( RTL_CONSTASCII_USTRINGPARAM("\n") );
704     return aOut;
705 }
706 #endif
707 
708 SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLanguage(aLang)
709 {
710     memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
711 
712     // Zeichen-Tabelle fuellen
713     sal_uInt16 i;
714 
715     // Zulaessige Zeichen fuer Identifier
716     sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
717     for( i = 'a' ; i <= 'z' ; i++ )
718         aCharTypeTab[i] |= nHelpMask;
719     for( i = 'A' ; i <= 'Z' ; i++ )
720         aCharTypeTab[i] |= nHelpMask;
721     // '_' extra eintragen
722     aCharTypeTab[(int)'_'] |= nHelpMask;
723     // AB 23.6.97: '$' ist auch erlaubt
724     aCharTypeTab[(int)'$'] |= nHelpMask;
725 
726     // Ziffern (Identifier und Number ist moeglich)
727     nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
728                          CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
729     for( i = '0' ; i <= '9' ; i++ )
730         aCharTypeTab[i] |= nHelpMask;
731 
732     // e und E sowie . von Hand ergaenzen
733     aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
734     aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
735     aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
736     aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
737 
738     // Hex-Ziffern
739     for( i = 'a' ; i <= 'f' ; i++ )
740         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
741     for( i = 'A' ; i <= 'F' ; i++ )
742         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
743 
744     // Oct-Ziffern
745     for( i = '0' ; i <= '7' ; i++ )
746         aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
747 
748     // String-Beginn/End-Zeichen
749     aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
750     aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
751     aCharTypeTab[(int)'[']  |= CHAR_START_STRING;
752     aCharTypeTab[(int)'`']  |= CHAR_START_STRING;
753 
754     // Operator-Zeichen
755     aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
756     aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
757     // aCharTypeTab[(int)'&'] |= CHAR_OPERATOR;     Removed because of #i14140
758     aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
759     aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
760     aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
761     aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
762     aCharTypeTab[(int)','] |= CHAR_OPERATOR;
763     aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
764     aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
765     aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
766     aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
767     aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
768     aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
769     aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
770     aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
771     aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
772     aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
773     aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
774     aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
775     // aCharTypeTab[(int)'['] |= CHAR_OPERATOR;     Removed because of #i17826
776     aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
777     aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
778 
779     // Space
780     aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
781     aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
782 
783     // Zeilen-Ende-Zeichen
784     aCharTypeTab[(int)'\r'] |= CHAR_EOL;
785     aCharTypeTab[(int)'\n'] |= CHAR_EOL;
786 
787     ppListKeyWords = NULL;
788 }
789 
790 SimpleTokenizer_Impl::~SimpleTokenizer_Impl( void )
791 {
792 }
793 
794 SimpleTokenizer_Impl* getSimpleTokenizer( void )
795 {
796     static SimpleTokenizer_Impl* pSimpleTokenizer = NULL;
797     if( !pSimpleTokenizer )
798         pSimpleTokenizer = new SimpleTokenizer_Impl();
799     return pSimpleTokenizer;
800 }
801 
802 // Heraussuchen der jeweils naechsten Funktion aus einem JavaScript-Modul
803 sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String* aSource )
804 {
805     // Position auf den Anfang des Source-Strings setzen
806     mpStringBegin = mpActualPos = aSource->GetBuffer();
807 
808     // Zeile und Spalte initialisieren
809     nLine = nParseLine;
810     nCol = 0L;
811 
812     // Variablen fuer die Out-Parameter
813     TokenTypes eType;
814     const sal_Unicode* pStartPos;
815     const sal_Unicode* pEndPos;
816 
817     // Schleife ueber alle Tokens
818     sal_uInt16 nTokenCount = 0;
819     while( getNextToken( eType, pStartPos, pEndPos ) )
820         nTokenCount++;
821 
822     return nTokenCount;
823 }
824 
825 void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const String& rLine,
826                                                     /*out*/HighlightPortions& portions  )
827 {
828     // Position auf den Anfang des Source-Strings setzen
829     mpStringBegin = mpActualPos = rLine.GetBuffer();
830 
831     // Zeile und Spalte initialisieren
832     nLine = nParseLine;
833     nCol = 0L;
834 
835     // Variablen fuer die Out-Parameter
836     TokenTypes eType;
837     const sal_Unicode* pStartPos;
838     const sal_Unicode* pEndPos;
839 
840     // Schleife ueber alle Tokens
841     while( getNextToken( eType, pStartPos, pEndPos ) )
842     {
843         HighlightPortion portion;
844 
845         portion.nBegin = (sal_uInt16)(pStartPos - mpStringBegin);
846         portion.nEnd = (sal_uInt16)(pEndPos - mpStringBegin);
847         portion.tokenType = eType;
848 
849         portions.push_back(portion);
850     }
851 }
852 
853 
//////////////////////////////////////////////////////////////////////////
// Implementation of SyntaxHighlighter
856 
857 SyntaxHighlighter::SyntaxHighlighter()
858 {
859     m_pSimpleTokenizer = 0;
860     m_pKeyWords = NULL;
861     m_nKeyWordCount = 0;
862 }
863 
864 SyntaxHighlighter::~SyntaxHighlighter()
865 {
866     delete m_pSimpleTokenizer;
867     delete m_pKeyWords;
868 }
869 
870 void SyntaxHighlighter::initialize( HighlighterLanguage eLanguage_ )
871 {
872     eLanguage = eLanguage_;
873     delete m_pSimpleTokenizer;
874     m_pSimpleTokenizer = new SimpleTokenizer_Impl(eLanguage);
875 
876     switch (eLanguage)
877     {
878         case HIGHLIGHT_BASIC:
879             m_pSimpleTokenizer->setKeyWords( strListBasicKeyWords,
880                                             sizeof( strListBasicKeyWords ) / sizeof( char* ));
881             break;
882         case HIGHLIGHT_SQL:
883             m_pSimpleTokenizer->setKeyWords( strListSqlKeyWords,
884                                             sizeof( strListSqlKeyWords ) / sizeof( char* ));
885             break;
886         default:
887             m_pSimpleTokenizer->setKeyWords( NULL, 0 );
888     }
889 }
890 
891 const Range SyntaxHighlighter::notifyChange( sal_uInt32 nLine, sal_Int32 nLineCountDifference,
892                                 const String* pChangedLines, sal_uInt32 nArrayLength)
893 {
894     (void)nLineCountDifference;
895 
896     for( sal_uInt32 i=0 ; i < nArrayLength ; i++ )
897         m_pSimpleTokenizer->parseLine(nLine+i, &pChangedLines[i]);
898 
899     return Range( nLine, nLine + nArrayLength-1 );
900 }
901 
902 void SyntaxHighlighter::getHighlightPortions( sal_uInt32 nLine, const String& rLine,
903                                             /*out*/HighlightPortions& portions )
904 {
905     m_pSimpleTokenizer->getHighlightPortions( nLine, rLine, portions );
906 }
907