xref: /trunk/main/i18npool/source/characterclassification/cclass_unicode_parser.cxx (revision cf6516809c57e1bb0a940545cca99cdad54d4ce2)
1449ab281SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3449ab281SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4449ab281SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5449ab281SAndrew Rist  * distributed with this work for additional information
6449ab281SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7449ab281SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8449ab281SAndrew Rist  * "License"); you may not use this file except in compliance
9449ab281SAndrew Rist  * with the License.  You may obtain a copy of the License at
10cdf0e10cSrcweir  *
11449ab281SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12cdf0e10cSrcweir  *
13449ab281SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14449ab281SAndrew Rist  * software distributed under the License is distributed on an
15449ab281SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16449ab281SAndrew Rist  * KIND, either express or implied.  See the License for the
17449ab281SAndrew Rist  * specific language governing permissions and limitations
18449ab281SAndrew Rist  * under the License.
19cdf0e10cSrcweir  *
20449ab281SAndrew Rist  *************************************************************/
21449ab281SAndrew Rist 
22449ab281SAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_i18npool.hxx"
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include <cclass_unicode.hxx>
28cdf0e10cSrcweir #include <unicode/uchar.h>
29cdf0e10cSrcweir #include <rtl/math.hxx>
30cdf0e10cSrcweir #include <rtl/ustring.hxx>
31cdf0e10cSrcweir #include <com/sun/star/i18n/KParseTokens.hpp>
32cdf0e10cSrcweir #include <com/sun/star/i18n/KParseType.hpp>
33cdf0e10cSrcweir #include <com/sun/star/i18n/UnicodeType.hpp>
34cdf0e10cSrcweir #include <com/sun/star/i18n/XLocaleData.hpp>
35cdf0e10cSrcweir #include <com/sun/star/i18n/NativeNumberMode.hpp>
36cdf0e10cSrcweir 
37cdf0e10cSrcweir #include <string.h>     // memcpy()
38cdf0e10cSrcweir 
39cdf0e10cSrcweir using namespace ::com::sun::star::uno;
40cdf0e10cSrcweir using namespace ::com::sun::star::lang;
41cdf0e10cSrcweir using namespace ::rtl;
42cdf0e10cSrcweir 
43cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n {
44cdf0e10cSrcweir 
45cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_ILLEGAL       = 0x00000000;
46cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR          = 0x00000001;
47cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_BOOL = 0x00000002;
48cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_WORD = 0x00000004;
49cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_VALUE    = 0x00000008;
50cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_STRING   = 0x00000010;
51cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_DONTCARE= 0x00000020;
52cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_BOOL          = 0x00000040;
53cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD          = 0x00000080;
54cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD_SEP      = 0x00000100;
55cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE     = 0x00000200;
56cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SEP = 0x00000400;
57cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP = 0x00000800;
58cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SIGN    = 0x00001000;
59cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP_VALUE   = 0x00002000;
60cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_DIGIT   = 0x00004000;
61cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_NAME_SEP      = 0x20000000;
62cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_STRING_SEP    = 0x40000000;
63cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::TOKEN_EXCLUDED      = 0x80000000;
64cdf0e10cSrcweir 
65cdf0e10cSrcweir #define TOKEN_DIGIT_FLAGS (TOKEN_CHAR_VALUE | TOKEN_VALUE | TOKEN_VALUE_EXP | TOKEN_VALUE_EXP_VALUE | TOKEN_VALUE_DIGIT)
66cdf0e10cSrcweir 
67cdf0e10cSrcweir // Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
68cdf0e10cSrcweir 
69cdf0e10cSrcweir const sal_uInt8 cclass_Unicode::nDefCnt = 128;
70cdf0e10cSrcweir const UPT_FLAG_TYPE cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
71cdf0e10cSrcweir {
72cdf0e10cSrcweir // (...) == Calc formula compiler specific, commented out and modified
73cdf0e10cSrcweir 
74cdf0e10cSrcweir     /* \0 */    TOKEN_EXCLUDED,
75cdf0e10cSrcweir                 TOKEN_ILLEGAL,
76cdf0e10cSrcweir                 TOKEN_ILLEGAL,
77cdf0e10cSrcweir                 TOKEN_ILLEGAL,
78cdf0e10cSrcweir                 TOKEN_ILLEGAL,
79cdf0e10cSrcweir                 TOKEN_ILLEGAL,
80cdf0e10cSrcweir                 TOKEN_ILLEGAL,
81cdf0e10cSrcweir                 TOKEN_ILLEGAL,
82cdf0e10cSrcweir                 TOKEN_ILLEGAL,
83cdf0e10cSrcweir     /*  9 \t */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,     // (TOKEN_ILLEGAL)
84cdf0e10cSrcweir                 TOKEN_ILLEGAL,
85cdf0e10cSrcweir     /* 11 \v */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,     // (TOKEN_ILLEGAL)
86cdf0e10cSrcweir                 TOKEN_ILLEGAL,
87cdf0e10cSrcweir                 TOKEN_ILLEGAL,
88cdf0e10cSrcweir                 TOKEN_ILLEGAL,
89cdf0e10cSrcweir                 TOKEN_ILLEGAL,
90cdf0e10cSrcweir                 TOKEN_ILLEGAL,
91cdf0e10cSrcweir                 TOKEN_ILLEGAL,
92cdf0e10cSrcweir                 TOKEN_ILLEGAL,
93cdf0e10cSrcweir                 TOKEN_ILLEGAL,
94cdf0e10cSrcweir                 TOKEN_ILLEGAL,
95cdf0e10cSrcweir                 TOKEN_ILLEGAL,
96cdf0e10cSrcweir                 TOKEN_ILLEGAL,
97cdf0e10cSrcweir                 TOKEN_ILLEGAL,
98cdf0e10cSrcweir                 TOKEN_ILLEGAL,
99cdf0e10cSrcweir                 TOKEN_ILLEGAL,
100cdf0e10cSrcweir                 TOKEN_ILLEGAL,
101cdf0e10cSrcweir                 TOKEN_ILLEGAL,
102cdf0e10cSrcweir                 TOKEN_ILLEGAL,
103cdf0e10cSrcweir                 TOKEN_ILLEGAL,
104cdf0e10cSrcweir                 TOKEN_ILLEGAL,
105cdf0e10cSrcweir                 TOKEN_ILLEGAL,
106cdf0e10cSrcweir     /*  32   */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
107cdf0e10cSrcweir     /*  33 ! */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
108cdf0e10cSrcweir     /*  34 " */ TOKEN_CHAR_STRING | TOKEN_STRING_SEP,
109cdf0e10cSrcweir     /*  35 # */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD_SEP)
110cdf0e10cSrcweir     /*  36 $ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_WORD | TOKEN_WORD)
111cdf0e10cSrcweir     /*  37 % */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_VALUE)
112cdf0e10cSrcweir     /*  38 & */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
113cdf0e10cSrcweir     /*  39 ' */ TOKEN_NAME_SEP,
114cdf0e10cSrcweir     /*  40 ( */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
115cdf0e10cSrcweir     /*  41 ) */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
116cdf0e10cSrcweir     /*  42 * */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
117cdf0e10cSrcweir     /*  43 + */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
118cdf0e10cSrcweir     /*  44 , */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_VALUE | TOKEN_VALUE)
119cdf0e10cSrcweir     /*  45 - */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
120cdf0e10cSrcweir     /*  46 . */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD | TOKEN_CHAR_VALUE | TOKEN_VALUE)
121cdf0e10cSrcweir     /*  47 / */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
122cdf0e10cSrcweir     //for ( i = 48; i < 58; i++ )
123cdf0e10cSrcweir     /*  48 0 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
124cdf0e10cSrcweir     /*  49 1 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
125cdf0e10cSrcweir     /*  50 2 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
126cdf0e10cSrcweir     /*  51 3 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
127cdf0e10cSrcweir     /*  52 4 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
128cdf0e10cSrcweir     /*  53 5 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
129cdf0e10cSrcweir     /*  54 6 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
130cdf0e10cSrcweir     /*  55 7 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
131cdf0e10cSrcweir     /*  56 8 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
132cdf0e10cSrcweir     /*  57 9 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
133cdf0e10cSrcweir     /*  58 : */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD)
134cdf0e10cSrcweir     /*  59 ; */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
135cdf0e10cSrcweir     /*  60 < */ TOKEN_CHAR_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
136cdf0e10cSrcweir     /*  61 = */ TOKEN_CHAR | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
137cdf0e10cSrcweir     /*  62 > */ TOKEN_CHAR_BOOL | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
138cdf0e10cSrcweir     /*  63 ? */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_WORD | TOKEN_WORD)
139cdf0e10cSrcweir     /*  64 @ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
140cdf0e10cSrcweir     //for ( i = 65; i < 91; i++ )
141cdf0e10cSrcweir     /*  65 A */ TOKEN_CHAR_WORD | TOKEN_WORD,
142cdf0e10cSrcweir     /*  66 B */ TOKEN_CHAR_WORD | TOKEN_WORD,
143cdf0e10cSrcweir     /*  67 C */ TOKEN_CHAR_WORD | TOKEN_WORD,
144cdf0e10cSrcweir     /*  68 D */ TOKEN_CHAR_WORD | TOKEN_WORD,
145cdf0e10cSrcweir     /*  69 E */ TOKEN_CHAR_WORD | TOKEN_WORD,
146cdf0e10cSrcweir     /*  70 F */ TOKEN_CHAR_WORD | TOKEN_WORD,
147cdf0e10cSrcweir     /*  71 G */ TOKEN_CHAR_WORD | TOKEN_WORD,
148cdf0e10cSrcweir     /*  72 H */ TOKEN_CHAR_WORD | TOKEN_WORD,
149cdf0e10cSrcweir     /*  73 I */ TOKEN_CHAR_WORD | TOKEN_WORD,
150cdf0e10cSrcweir     /*  74 J */ TOKEN_CHAR_WORD | TOKEN_WORD,
151cdf0e10cSrcweir     /*  75 K */ TOKEN_CHAR_WORD | TOKEN_WORD,
152cdf0e10cSrcweir     /*  76 L */ TOKEN_CHAR_WORD | TOKEN_WORD,
153cdf0e10cSrcweir     /*  77 M */ TOKEN_CHAR_WORD | TOKEN_WORD,
154cdf0e10cSrcweir     /*  78 N */ TOKEN_CHAR_WORD | TOKEN_WORD,
155cdf0e10cSrcweir     /*  79 O */ TOKEN_CHAR_WORD | TOKEN_WORD,
156cdf0e10cSrcweir     /*  80 P */ TOKEN_CHAR_WORD | TOKEN_WORD,
157cdf0e10cSrcweir     /*  81 Q */ TOKEN_CHAR_WORD | TOKEN_WORD,
158cdf0e10cSrcweir     /*  82 R */ TOKEN_CHAR_WORD | TOKEN_WORD,
159cdf0e10cSrcweir     /*  83 S */ TOKEN_CHAR_WORD | TOKEN_WORD,
160cdf0e10cSrcweir     /*  84 T */ TOKEN_CHAR_WORD | TOKEN_WORD,
161cdf0e10cSrcweir     /*  85 U */ TOKEN_CHAR_WORD | TOKEN_WORD,
162cdf0e10cSrcweir     /*  86 V */ TOKEN_CHAR_WORD | TOKEN_WORD,
163cdf0e10cSrcweir     /*  87 W */ TOKEN_CHAR_WORD | TOKEN_WORD,
164cdf0e10cSrcweir     /*  88 X */ TOKEN_CHAR_WORD | TOKEN_WORD,
165cdf0e10cSrcweir     /*  89 Y */ TOKEN_CHAR_WORD | TOKEN_WORD,
166cdf0e10cSrcweir     /*  90 Z */ TOKEN_CHAR_WORD | TOKEN_WORD,
167cdf0e10cSrcweir     /*  91 [ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
168cdf0e10cSrcweir     /*  92 \ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
169cdf0e10cSrcweir     /*  93 ] */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
170cdf0e10cSrcweir     /*  94 ^ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
171cdf0e10cSrcweir     /*  95 _ */ TOKEN_CHAR_WORD | TOKEN_WORD,
172cdf0e10cSrcweir     /*  96 ` */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
173cdf0e10cSrcweir     //for ( i = 97; i < 123; i++ )
174cdf0e10cSrcweir     /*  97 a */ TOKEN_CHAR_WORD | TOKEN_WORD,
175cdf0e10cSrcweir     /*  98 b */ TOKEN_CHAR_WORD | TOKEN_WORD,
176cdf0e10cSrcweir     /*  99 c */ TOKEN_CHAR_WORD | TOKEN_WORD,
177cdf0e10cSrcweir     /* 100 d */ TOKEN_CHAR_WORD | TOKEN_WORD,
178cdf0e10cSrcweir     /* 101 e */ TOKEN_CHAR_WORD | TOKEN_WORD,
179cdf0e10cSrcweir     /* 102 f */ TOKEN_CHAR_WORD | TOKEN_WORD,
180cdf0e10cSrcweir     /* 103 g */ TOKEN_CHAR_WORD | TOKEN_WORD,
181cdf0e10cSrcweir     /* 104 h */ TOKEN_CHAR_WORD | TOKEN_WORD,
182cdf0e10cSrcweir     /* 105 i */ TOKEN_CHAR_WORD | TOKEN_WORD,
183cdf0e10cSrcweir     /* 106 j */ TOKEN_CHAR_WORD | TOKEN_WORD,
184cdf0e10cSrcweir     /* 107 k */ TOKEN_CHAR_WORD | TOKEN_WORD,
185cdf0e10cSrcweir     /* 108 l */ TOKEN_CHAR_WORD | TOKEN_WORD,
186cdf0e10cSrcweir     /* 109 m */ TOKEN_CHAR_WORD | TOKEN_WORD,
187cdf0e10cSrcweir     /* 110 n */ TOKEN_CHAR_WORD | TOKEN_WORD,
188cdf0e10cSrcweir     /* 111 o */ TOKEN_CHAR_WORD | TOKEN_WORD,
189cdf0e10cSrcweir     /* 112 p */ TOKEN_CHAR_WORD | TOKEN_WORD,
190cdf0e10cSrcweir     /* 113 q */ TOKEN_CHAR_WORD | TOKEN_WORD,
191cdf0e10cSrcweir     /* 114 r */ TOKEN_CHAR_WORD | TOKEN_WORD,
192cdf0e10cSrcweir     /* 115 s */ TOKEN_CHAR_WORD | TOKEN_WORD,
193cdf0e10cSrcweir     /* 116 t */ TOKEN_CHAR_WORD | TOKEN_WORD,
194cdf0e10cSrcweir     /* 117 u */ TOKEN_CHAR_WORD | TOKEN_WORD,
195cdf0e10cSrcweir     /* 118 v */ TOKEN_CHAR_WORD | TOKEN_WORD,
196cdf0e10cSrcweir     /* 119 w */ TOKEN_CHAR_WORD | TOKEN_WORD,
197cdf0e10cSrcweir     /* 120 x */ TOKEN_CHAR_WORD | TOKEN_WORD,
198cdf0e10cSrcweir     /* 121 y */ TOKEN_CHAR_WORD | TOKEN_WORD,
199cdf0e10cSrcweir     /* 122 z */ TOKEN_CHAR_WORD | TOKEN_WORD,
200cdf0e10cSrcweir     /* 123 { */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
201cdf0e10cSrcweir     /* 124 | */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
202cdf0e10cSrcweir     /* 125 } */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
203cdf0e10cSrcweir     /* 126 ~ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
204cdf0e10cSrcweir     /* 127   */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP   // (TOKEN_ILLEGAL // UNUSED)
205cdf0e10cSrcweir };
206cdf0e10cSrcweir 
207cdf0e10cSrcweir 
208cdf0e10cSrcweir const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
209cdf0e10cSrcweir {
210cdf0e10cSrcweir     /* \0 */    KParseTokens::ASC_OTHER,
211cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
212cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
213cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
214cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
215cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
216cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
217cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
218cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
219cdf0e10cSrcweir     /*  9 \t */ KParseTokens::ASC_CONTROL,
220cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
221cdf0e10cSrcweir     /* 11 \v */ KParseTokens::ASC_CONTROL,
222cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
223cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
224cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
225cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
226cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
227cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
228cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
229cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
230cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
231cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
232cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
233cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
234cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
235cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
236cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
237cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
238cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
239cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
240cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
241cdf0e10cSrcweir                 KParseTokens::ASC_CONTROL,
242cdf0e10cSrcweir     /*  32   */ KParseTokens::ASC_OTHER,
243cdf0e10cSrcweir     /*  33 ! */ KParseTokens::ASC_OTHER,
244cdf0e10cSrcweir     /*  34 " */ KParseTokens::ASC_OTHER,
245cdf0e10cSrcweir     /*  35 # */ KParseTokens::ASC_OTHER,
246cdf0e10cSrcweir     /*  36 $ */ KParseTokens::ASC_DOLLAR,
247cdf0e10cSrcweir     /*  37 % */ KParseTokens::ASC_OTHER,
248cdf0e10cSrcweir     /*  38 & */ KParseTokens::ASC_OTHER,
249cdf0e10cSrcweir     /*  39 ' */ KParseTokens::ASC_OTHER,
250cdf0e10cSrcweir     /*  40 ( */ KParseTokens::ASC_OTHER,
251cdf0e10cSrcweir     /*  41 ) */ KParseTokens::ASC_OTHER,
252cdf0e10cSrcweir     /*  42 * */ KParseTokens::ASC_OTHER,
253cdf0e10cSrcweir     /*  43 + */ KParseTokens::ASC_OTHER,
254cdf0e10cSrcweir     /*  44 , */ KParseTokens::ASC_OTHER,
255cdf0e10cSrcweir     /*  45 - */ KParseTokens::ASC_OTHER,
256cdf0e10cSrcweir     /*  46 . */ KParseTokens::ASC_DOT,
257cdf0e10cSrcweir     /*  47 / */ KParseTokens::ASC_OTHER,
258cdf0e10cSrcweir     //for ( i = 48; i < 58; i++ )
259cdf0e10cSrcweir     /*  48 0 */ KParseTokens::ASC_DIGIT,
260cdf0e10cSrcweir     /*  49 1 */ KParseTokens::ASC_DIGIT,
261cdf0e10cSrcweir     /*  50 2 */ KParseTokens::ASC_DIGIT,
262cdf0e10cSrcweir     /*  51 3 */ KParseTokens::ASC_DIGIT,
263cdf0e10cSrcweir     /*  52 4 */ KParseTokens::ASC_DIGIT,
264cdf0e10cSrcweir     /*  53 5 */ KParseTokens::ASC_DIGIT,
265cdf0e10cSrcweir     /*  54 6 */ KParseTokens::ASC_DIGIT,
266cdf0e10cSrcweir     /*  55 7 */ KParseTokens::ASC_DIGIT,
267cdf0e10cSrcweir     /*  56 8 */ KParseTokens::ASC_DIGIT,
268cdf0e10cSrcweir     /*  57 9 */ KParseTokens::ASC_DIGIT,
269cdf0e10cSrcweir     /*  58 : */ KParseTokens::ASC_COLON,
270cdf0e10cSrcweir     /*  59 ; */ KParseTokens::ASC_OTHER,
271cdf0e10cSrcweir     /*  60 < */ KParseTokens::ASC_OTHER,
272cdf0e10cSrcweir     /*  61 = */ KParseTokens::ASC_OTHER,
273cdf0e10cSrcweir     /*  62 > */ KParseTokens::ASC_OTHER,
274cdf0e10cSrcweir     /*  63 ? */ KParseTokens::ASC_OTHER,
275cdf0e10cSrcweir     /*  64 @ */ KParseTokens::ASC_OTHER,
276cdf0e10cSrcweir     //for ( i = 65; i < 91; i++ )
277cdf0e10cSrcweir     /*  65 A */ KParseTokens::ASC_UPALPHA,
278cdf0e10cSrcweir     /*  66 B */ KParseTokens::ASC_UPALPHA,
279cdf0e10cSrcweir     /*  67 C */ KParseTokens::ASC_UPALPHA,
280cdf0e10cSrcweir     /*  68 D */ KParseTokens::ASC_UPALPHA,
281cdf0e10cSrcweir     /*  69 E */ KParseTokens::ASC_UPALPHA,
282cdf0e10cSrcweir     /*  70 F */ KParseTokens::ASC_UPALPHA,
283cdf0e10cSrcweir     /*  71 G */ KParseTokens::ASC_UPALPHA,
284cdf0e10cSrcweir     /*  72 H */ KParseTokens::ASC_UPALPHA,
285cdf0e10cSrcweir     /*  73 I */ KParseTokens::ASC_UPALPHA,
286cdf0e10cSrcweir     /*  74 J */ KParseTokens::ASC_UPALPHA,
287cdf0e10cSrcweir     /*  75 K */ KParseTokens::ASC_UPALPHA,
288cdf0e10cSrcweir     /*  76 L */ KParseTokens::ASC_UPALPHA,
289cdf0e10cSrcweir     /*  77 M */ KParseTokens::ASC_UPALPHA,
290cdf0e10cSrcweir     /*  78 N */ KParseTokens::ASC_UPALPHA,
291cdf0e10cSrcweir     /*  79 O */ KParseTokens::ASC_UPALPHA,
292cdf0e10cSrcweir     /*  80 P */ KParseTokens::ASC_UPALPHA,
293cdf0e10cSrcweir     /*  81 Q */ KParseTokens::ASC_UPALPHA,
294cdf0e10cSrcweir     /*  82 R */ KParseTokens::ASC_UPALPHA,
295cdf0e10cSrcweir     /*  83 S */ KParseTokens::ASC_UPALPHA,
296cdf0e10cSrcweir     /*  84 T */ KParseTokens::ASC_UPALPHA,
297cdf0e10cSrcweir     /*  85 U */ KParseTokens::ASC_UPALPHA,
298cdf0e10cSrcweir     /*  86 V */ KParseTokens::ASC_UPALPHA,
299cdf0e10cSrcweir     /*  87 W */ KParseTokens::ASC_UPALPHA,
300cdf0e10cSrcweir     /*  88 X */ KParseTokens::ASC_UPALPHA,
301cdf0e10cSrcweir     /*  89 Y */ KParseTokens::ASC_UPALPHA,
302cdf0e10cSrcweir     /*  90 Z */ KParseTokens::ASC_UPALPHA,
303cdf0e10cSrcweir     /*  91 [ */ KParseTokens::ASC_OTHER,
304cdf0e10cSrcweir     /*  92 \ */ KParseTokens::ASC_OTHER,
305cdf0e10cSrcweir     /*  93 ] */ KParseTokens::ASC_OTHER,
306cdf0e10cSrcweir     /*  94 ^ */ KParseTokens::ASC_OTHER,
307cdf0e10cSrcweir     /*  95 _ */ KParseTokens::ASC_UNDERSCORE,
308cdf0e10cSrcweir     /*  96 ` */ KParseTokens::ASC_OTHER,
309cdf0e10cSrcweir     //for ( i = 97; i < 123; i++ )
310cdf0e10cSrcweir     /*  97 a */ KParseTokens::ASC_LOALPHA,
311cdf0e10cSrcweir     /*  98 b */ KParseTokens::ASC_LOALPHA,
312cdf0e10cSrcweir     /*  99 c */ KParseTokens::ASC_LOALPHA,
313cdf0e10cSrcweir     /* 100 d */ KParseTokens::ASC_LOALPHA,
314cdf0e10cSrcweir     /* 101 e */ KParseTokens::ASC_LOALPHA,
315cdf0e10cSrcweir     /* 102 f */ KParseTokens::ASC_LOALPHA,
316cdf0e10cSrcweir     /* 103 g */ KParseTokens::ASC_LOALPHA,
317cdf0e10cSrcweir     /* 104 h */ KParseTokens::ASC_LOALPHA,
318cdf0e10cSrcweir     /* 105 i */ KParseTokens::ASC_LOALPHA,
319cdf0e10cSrcweir     /* 106 j */ KParseTokens::ASC_LOALPHA,
320cdf0e10cSrcweir     /* 107 k */ KParseTokens::ASC_LOALPHA,
321cdf0e10cSrcweir     /* 108 l */ KParseTokens::ASC_LOALPHA,
322cdf0e10cSrcweir     /* 109 m */ KParseTokens::ASC_LOALPHA,
323cdf0e10cSrcweir     /* 110 n */ KParseTokens::ASC_LOALPHA,
324cdf0e10cSrcweir     /* 111 o */ KParseTokens::ASC_LOALPHA,
325cdf0e10cSrcweir     /* 112 p */ KParseTokens::ASC_LOALPHA,
326cdf0e10cSrcweir     /* 113 q */ KParseTokens::ASC_LOALPHA,
327cdf0e10cSrcweir     /* 114 r */ KParseTokens::ASC_LOALPHA,
328cdf0e10cSrcweir     /* 115 s */ KParseTokens::ASC_LOALPHA,
329cdf0e10cSrcweir     /* 116 t */ KParseTokens::ASC_LOALPHA,
330cdf0e10cSrcweir     /* 117 u */ KParseTokens::ASC_LOALPHA,
331cdf0e10cSrcweir     /* 118 v */ KParseTokens::ASC_LOALPHA,
332cdf0e10cSrcweir     /* 119 w */ KParseTokens::ASC_LOALPHA,
333cdf0e10cSrcweir     /* 120 x */ KParseTokens::ASC_LOALPHA,
334cdf0e10cSrcweir     /* 121 y */ KParseTokens::ASC_LOALPHA,
335cdf0e10cSrcweir     /* 122 z */ KParseTokens::ASC_LOALPHA,
336cdf0e10cSrcweir     /* 123 { */ KParseTokens::ASC_OTHER,
337cdf0e10cSrcweir     /* 124 | */ KParseTokens::ASC_OTHER,
338cdf0e10cSrcweir     /* 125 } */ KParseTokens::ASC_OTHER,
339cdf0e10cSrcweir     /* 126 ~ */ KParseTokens::ASC_OTHER,
340cdf0e10cSrcweir     /* 127   */ KParseTokens::ASC_OTHER
341cdf0e10cSrcweir };
342cdf0e10cSrcweir 
343cdf0e10cSrcweir 
344cdf0e10cSrcweir // static
/** Find the first occurrence of a character in a NUL-terminated string.

    @param pStr  string to search; may be NULL
    @param c     character to look for
    @return pointer to the first element of pStr equal to c, or NULL if pStr
            is NULL or c does not occur before the terminating NUL. The
            terminator itself is never matched.
 */
const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_Unicode c )
{
    if ( pStr )
    {
        for ( const sal_Unicode* pCur = pStr; *pCur; ++pCur )
        {
            if ( *pCur == c )
                return pCur;
        }
    }
    return NULL;
}
357cdf0e10cSrcweir 
358cdf0e10cSrcweir 
getParseTokensType(const sal_Unicode * aStr,sal_Int32 nPos)359cdf0e10cSrcweir sal_Int32 cclass_Unicode::getParseTokensType( const sal_Unicode* aStr, sal_Int32 nPos )
360cdf0e10cSrcweir {
361cdf0e10cSrcweir     sal_Unicode c = aStr[nPos];
362cdf0e10cSrcweir     if ( c < nDefCnt )
363cdf0e10cSrcweir         return pParseTokensType[ sal_uInt8(c) ];
364cdf0e10cSrcweir     else
365cdf0e10cSrcweir     {
366cdf0e10cSrcweir 
367cdf0e10cSrcweir         //! all KParseTokens::UNI_... must be matched
368cdf0e10cSrcweir         switch ( u_charType( (sal_uInt32) c ) )
369cdf0e10cSrcweir         {
370cdf0e10cSrcweir             case U_UPPERCASE_LETTER :
371cdf0e10cSrcweir                 return KParseTokens::UNI_UPALPHA;
372cdf0e10cSrcweir             case U_LOWERCASE_LETTER :
373cdf0e10cSrcweir                 return KParseTokens::UNI_LOALPHA;
374cdf0e10cSrcweir             case U_TITLECASE_LETTER :
375cdf0e10cSrcweir                 return KParseTokens::UNI_TITLE_ALPHA;
376cdf0e10cSrcweir             case U_MODIFIER_LETTER :
377cdf0e10cSrcweir                 return KParseTokens::UNI_MODIFIER_LETTER;
378cdf0e10cSrcweir             case U_OTHER_LETTER :
379cdf0e10cSrcweir                 // Non_Spacing_Mark could not be as leading character
380cdf0e10cSrcweir                 if (nPos == 0) break;
381cdf0e10cSrcweir                 // fall through, treat it as Other_Letter.
382cdf0e10cSrcweir             case U_NON_SPACING_MARK :
383cdf0e10cSrcweir                 return KParseTokens::UNI_OTHER_LETTER;
384cdf0e10cSrcweir             case U_DECIMAL_DIGIT_NUMBER :
385cdf0e10cSrcweir                 return KParseTokens::UNI_DIGIT;
386cdf0e10cSrcweir             case U_LETTER_NUMBER :
387cdf0e10cSrcweir                 return KParseTokens::UNI_LETTER_NUMBER;
388cdf0e10cSrcweir             case U_OTHER_NUMBER :
389cdf0e10cSrcweir                 return KParseTokens::UNI_OTHER_NUMBER;
390cdf0e10cSrcweir         }
391cdf0e10cSrcweir 
392cdf0e10cSrcweir         return KParseTokens::UNI_OTHER;
393cdf0e10cSrcweir     }
394cdf0e10cSrcweir }
395cdf0e10cSrcweir 
setupInternational(const Locale & rLocale)396cdf0e10cSrcweir sal_Bool cclass_Unicode::setupInternational( const Locale& rLocale )
397cdf0e10cSrcweir {
398cdf0e10cSrcweir     sal_Bool bChanged = (aParserLocale.Language != rLocale.Language
399cdf0e10cSrcweir         || aParserLocale.Country != rLocale.Country
400cdf0e10cSrcweir         || aParserLocale.Variant != rLocale.Variant);
401cdf0e10cSrcweir     if ( bChanged )
402cdf0e10cSrcweir     {
403cdf0e10cSrcweir         aParserLocale.Language = rLocale.Language;
404cdf0e10cSrcweir         aParserLocale.Country = rLocale.Country;
405cdf0e10cSrcweir         aParserLocale.Variant = rLocale.Variant;
406cdf0e10cSrcweir     }
407cdf0e10cSrcweir     if ( !xLocaleData.is() && xMSF.is() )
408cdf0e10cSrcweir     {
409cdf0e10cSrcweir         Reference <
410cdf0e10cSrcweir             XInterface > xI =
411cdf0e10cSrcweir             xMSF->createInstance( OUString(
412cdf0e10cSrcweir             RTL_CONSTASCII_USTRINGPARAM( "com.sun.star.i18n.LocaleData" ) ) );
413cdf0e10cSrcweir         if ( xI.is() )
414cdf0e10cSrcweir         {
415cdf0e10cSrcweir             Any x = xI->queryInterface( getCppuType((const Reference< XLocaleData>*)0) );
416cdf0e10cSrcweir             x >>= xLocaleData;
417cdf0e10cSrcweir         }
418cdf0e10cSrcweir     }
419cdf0e10cSrcweir     return bChanged;
420cdf0e10cSrcweir }
421cdf0e10cSrcweir 
422cdf0e10cSrcweir 
setupParserTable(const Locale & rLocale,sal_Int32 startCharTokenType,const OUString & userDefinedCharactersStart,sal_Int32 contCharTokenType,const OUString & userDefinedCharactersCont)423cdf0e10cSrcweir void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
424cdf0e10cSrcweir             const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
425cdf0e10cSrcweir             const OUString& userDefinedCharactersCont )
426cdf0e10cSrcweir {
427cdf0e10cSrcweir     bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
428cdf0e10cSrcweir         rLocale.Country == aParserLocale.Country &&
429cdf0e10cSrcweir         rLocale.Variant == aParserLocale.Variant);
430cdf0e10cSrcweir     if ( !pTable || !bIntlEqual ||
431cdf0e10cSrcweir             startCharTokenType != nStartTypes ||
432cdf0e10cSrcweir             contCharTokenType != nContTypes ||
433cdf0e10cSrcweir             userDefinedCharactersStart != aStartChars ||
434cdf0e10cSrcweir             userDefinedCharactersCont != aContChars )
435cdf0e10cSrcweir         initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
436cdf0e10cSrcweir             contCharTokenType, userDefinedCharactersCont );
437cdf0e10cSrcweir }
438cdf0e10cSrcweir 
439cdf0e10cSrcweir 
initParserTable(const Locale & rLocale,sal_Int32 startCharTokenType,const OUString & userDefinedCharactersStart,sal_Int32 contCharTokenType,const OUString & userDefinedCharactersCont)440cdf0e10cSrcweir void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
441cdf0e10cSrcweir             const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
442cdf0e10cSrcweir             const OUString& userDefinedCharactersCont )
443cdf0e10cSrcweir {
444cdf0e10cSrcweir     // (Re)Init
445cdf0e10cSrcweir     setupInternational( rLocale );
446cdf0e10cSrcweir     // Memory of pTable is reused.
447cdf0e10cSrcweir     if ( !pTable )
448cdf0e10cSrcweir         pTable = new UPT_FLAG_TYPE[nDefCnt];
449cdf0e10cSrcweir     memcpy( pTable, pDefaultParserTable, sizeof(UPT_FLAG_TYPE) * nDefCnt );
450cdf0e10cSrcweir     // Start and cont tables only need reallocation if different length.
451cdf0e10cSrcweir     if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
452cdf0e10cSrcweir     {
453cdf0e10cSrcweir         delete [] pStart;
454cdf0e10cSrcweir         pStart = NULL;
455cdf0e10cSrcweir     }
456cdf0e10cSrcweir     if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
457cdf0e10cSrcweir     {
458cdf0e10cSrcweir         delete [] pCont;
459cdf0e10cSrcweir         pCont = NULL;
460cdf0e10cSrcweir     }
461cdf0e10cSrcweir     nStartTypes = startCharTokenType;
462cdf0e10cSrcweir     nContTypes = contCharTokenType;
463cdf0e10cSrcweir     aStartChars = userDefinedCharactersStart;
464cdf0e10cSrcweir     aContChars = userDefinedCharactersCont;
465cdf0e10cSrcweir 
466cdf0e10cSrcweir     // specials
467cdf0e10cSrcweir     if( xLocaleData.is() )
468cdf0e10cSrcweir     {
469cdf0e10cSrcweir         LocaleDataItem aItem =
470cdf0e10cSrcweir             xLocaleData->getLocaleItem( aParserLocale );
471cdf0e10cSrcweir //!TODO: theoretically separators may be a string, adjustment would have to be
472cdf0e10cSrcweir //! done here and in parsing and in ::rtl::math::stringToDouble()
473cdf0e10cSrcweir         cGroupSep = aItem.thousandSeparator.getStr()[0];
474cdf0e10cSrcweir         cDecimalSep = aItem.decimalSeparator.getStr()[0];
475cdf0e10cSrcweir     }
476cdf0e10cSrcweir 
477cdf0e10cSrcweir     if ( cGroupSep < nDefCnt )
478cdf0e10cSrcweir         pTable[cGroupSep] |= TOKEN_VALUE;
479cdf0e10cSrcweir     if ( cDecimalSep < nDefCnt )
480cdf0e10cSrcweir         pTable[cDecimalSep] |= TOKEN_CHAR_VALUE | TOKEN_VALUE;
481cdf0e10cSrcweir 
482cdf0e10cSrcweir     // Modify characters according to KParseTokens definitions.
483cdf0e10cSrcweir     {
484cdf0e10cSrcweir         using namespace KParseTokens;
485cdf0e10cSrcweir         sal_uInt8 i;
486cdf0e10cSrcweir 
487cdf0e10cSrcweir         if ( !(nStartTypes & ASC_UPALPHA) )
488cdf0e10cSrcweir             for ( i = 65; i < 91; i++ )
489cdf0e10cSrcweir                 pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
490cdf0e10cSrcweir         if ( !(nContTypes & ASC_UPALPHA) )
491cdf0e10cSrcweir             for ( i = 65; i < 91; i++ )
492cdf0e10cSrcweir                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
493cdf0e10cSrcweir 
494cdf0e10cSrcweir         if ( !(nStartTypes & ASC_LOALPHA) )
495cdf0e10cSrcweir             for ( i = 97; i < 123; i++ )
496cdf0e10cSrcweir                 pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
497cdf0e10cSrcweir         if ( !(nContTypes & ASC_LOALPHA) )
498cdf0e10cSrcweir             for ( i = 97; i < 123; i++ )
499cdf0e10cSrcweir                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
500cdf0e10cSrcweir 
501cdf0e10cSrcweir         if ( nStartTypes & ASC_DIGIT )
502cdf0e10cSrcweir             for ( i = 48; i < 58; i++ )
503cdf0e10cSrcweir                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
504cdf0e10cSrcweir         if ( !(nContTypes & ASC_DIGIT) )
505cdf0e10cSrcweir             for ( i = 48; i < 58; i++ )
506cdf0e10cSrcweir                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
507cdf0e10cSrcweir 
508cdf0e10cSrcweir         if ( !(nStartTypes & ASC_UNDERSCORE) )
509cdf0e10cSrcweir             pTable[95] &= ~TOKEN_CHAR_WORD;     // not allowed as start character
510cdf0e10cSrcweir         if ( !(nContTypes & ASC_UNDERSCORE) )
511cdf0e10cSrcweir             pTable[95] &= ~TOKEN_WORD;          // not allowed as cont character
512cdf0e10cSrcweir 
513cdf0e10cSrcweir         if ( nStartTypes & ASC_DOLLAR )
514cdf0e10cSrcweir             pTable[36] |= TOKEN_CHAR_WORD;      // allowed as start character
515cdf0e10cSrcweir         if ( nContTypes & ASC_DOLLAR )
516cdf0e10cSrcweir             pTable[36] |= TOKEN_WORD;           // allowed as cont character
517cdf0e10cSrcweir 
518cdf0e10cSrcweir         if ( nStartTypes & ASC_DOT )
519cdf0e10cSrcweir             pTable[46] |= TOKEN_CHAR_WORD;      // allowed as start character
520cdf0e10cSrcweir         if ( nContTypes & ASC_DOT )
521cdf0e10cSrcweir             pTable[46] |= TOKEN_WORD;           // allowed as cont character
522cdf0e10cSrcweir 
523cdf0e10cSrcweir         if ( nStartTypes & ASC_COLON )
524cdf0e10cSrcweir             pTable[58] |= TOKEN_CHAR_WORD;      // allowed as start character
525cdf0e10cSrcweir         if ( nContTypes & ASC_COLON )
526cdf0e10cSrcweir             pTable[58] |= TOKEN_WORD;           // allowed as cont character
527cdf0e10cSrcweir 
528cdf0e10cSrcweir         if ( nStartTypes & ASC_CONTROL )
529cdf0e10cSrcweir             for ( i = 1; i < 32; i++ )
530cdf0e10cSrcweir                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
531cdf0e10cSrcweir         if ( nContTypes & ASC_CONTROL )
532cdf0e10cSrcweir             for ( i = 1; i < 32; i++ )
533cdf0e10cSrcweir                 pTable[i] |= TOKEN_WORD;        // allowed as cont character
534cdf0e10cSrcweir 
535cdf0e10cSrcweir         if ( nStartTypes & ASC_ANY_BUT_CONTROL )
536cdf0e10cSrcweir             for ( i = 32; i < nDefCnt; i++ )
537cdf0e10cSrcweir                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
538cdf0e10cSrcweir         if ( nContTypes & ASC_ANY_BUT_CONTROL )
539cdf0e10cSrcweir             for ( i = 32; i < nDefCnt; i++ )
540cdf0e10cSrcweir                 pTable[i] |= TOKEN_WORD;        // allowed as cont character
541cdf0e10cSrcweir 
542cdf0e10cSrcweir     }
543cdf0e10cSrcweir 
544cdf0e10cSrcweir     // Merge in (positively override with) user defined characters.
545cdf0e10cSrcweir     // StartChars
546cdf0e10cSrcweir     sal_Int32 nLen = aStartChars.getLength();
547cdf0e10cSrcweir     if ( nLen )
548cdf0e10cSrcweir     {
549cdf0e10cSrcweir         if ( !pStart )
550cdf0e10cSrcweir             pStart = new UPT_FLAG_TYPE[ nLen ];
551cdf0e10cSrcweir         const sal_Unicode* p = aStartChars.getStr();
552cdf0e10cSrcweir         for ( sal_Int32 j=0; j<nLen; j++, p++ )
553cdf0e10cSrcweir         {
554cdf0e10cSrcweir             pStart[j] = TOKEN_CHAR_WORD;
555cdf0e10cSrcweir             if ( *p < nDefCnt )
556cdf0e10cSrcweir                 pTable[*p] |= TOKEN_CHAR_WORD;
557cdf0e10cSrcweir         }
558cdf0e10cSrcweir     }
559cdf0e10cSrcweir     // ContChars
560cdf0e10cSrcweir     nLen = aContChars.getLength();
561cdf0e10cSrcweir     if ( nLen )
562cdf0e10cSrcweir     {
563cdf0e10cSrcweir         if ( !pCont )
564cdf0e10cSrcweir             pCont = new UPT_FLAG_TYPE[ nLen ];
565cdf0e10cSrcweir         const sal_Unicode* p = aContChars.getStr();
566cdf0e10cSrcweir         for ( sal_Int32 j=0; j<nLen; j++ )
567cdf0e10cSrcweir         {
568cdf0e10cSrcweir             pCont[j] = TOKEN_WORD;
569cdf0e10cSrcweir             if ( *p < nDefCnt )
570cdf0e10cSrcweir                 pTable[*p] |= TOKEN_WORD;
571cdf0e10cSrcweir         }
572cdf0e10cSrcweir     }
573cdf0e10cSrcweir }
574cdf0e10cSrcweir 
575cdf0e10cSrcweir 
destroyParserTable()576cdf0e10cSrcweir void cclass_Unicode::destroyParserTable()
577cdf0e10cSrcweir {
578cdf0e10cSrcweir     if ( pCont )
579cdf0e10cSrcweir         delete [] pCont;
580cdf0e10cSrcweir     if ( pStart )
581cdf0e10cSrcweir         delete [] pStart;
582cdf0e10cSrcweir     if ( pTable )
583cdf0e10cSrcweir         delete [] pTable;
584cdf0e10cSrcweir }
585cdf0e10cSrcweir 
586cdf0e10cSrcweir 
getFlags(const sal_Unicode * aStr,sal_Int32 nPos)587cdf0e10cSrcweir UPT_FLAG_TYPE cclass_Unicode::getFlags( const sal_Unicode* aStr, sal_Int32 nPos )
588cdf0e10cSrcweir {
589cdf0e10cSrcweir     UPT_FLAG_TYPE nMask;
590cdf0e10cSrcweir     sal_Unicode c = aStr[nPos];
591cdf0e10cSrcweir     if ( c < nDefCnt )
592cdf0e10cSrcweir         nMask = pTable[ sal_uInt8(c) ];
593cdf0e10cSrcweir     else
594cdf0e10cSrcweir         nMask = getFlagsExtended( aStr, nPos );
595cdf0e10cSrcweir     switch ( eState )
596cdf0e10cSrcweir     {
597cdf0e10cSrcweir         case ssGetChar :
598cdf0e10cSrcweir         case ssRewindFromValue :
599cdf0e10cSrcweir         case ssIgnoreLeadingInRewind :
600cdf0e10cSrcweir         case ssGetWordFirstChar :
601cdf0e10cSrcweir             if ( !(nMask & TOKEN_CHAR_WORD) )
602cdf0e10cSrcweir             {
603cdf0e10cSrcweir                 nMask |= getStartCharsFlags( c );
604cdf0e10cSrcweir                 if ( nMask & TOKEN_CHAR_WORD )
605cdf0e10cSrcweir                     nMask &= ~TOKEN_EXCLUDED;
606cdf0e10cSrcweir             }
607cdf0e10cSrcweir         break;
608cdf0e10cSrcweir         case ssGetValue :
609cdf0e10cSrcweir         case ssGetWord :
610cdf0e10cSrcweir             if ( !(nMask & TOKEN_WORD) )
611cdf0e10cSrcweir             {
612cdf0e10cSrcweir                 nMask |= getContCharsFlags( c );
613cdf0e10cSrcweir                 if ( nMask & TOKEN_WORD )
614cdf0e10cSrcweir                     nMask &= ~TOKEN_EXCLUDED;
615cdf0e10cSrcweir             }
616cdf0e10cSrcweir         break;
617cdf0e10cSrcweir         default:
618cdf0e10cSrcweir             ;   // other cases aren't needed, no compiler warning
619cdf0e10cSrcweir     }
620cdf0e10cSrcweir     return nMask;
621cdf0e10cSrcweir }
622cdf0e10cSrcweir 
623cdf0e10cSrcweir 
getFlagsExtended(const sal_Unicode * aStr,sal_Int32 nPos)624cdf0e10cSrcweir UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended( const sal_Unicode* aStr, sal_Int32 nPos )
625cdf0e10cSrcweir {
626cdf0e10cSrcweir     sal_Unicode c = aStr[nPos];
627cdf0e10cSrcweir     if ( c == cGroupSep )
628cdf0e10cSrcweir         return TOKEN_VALUE;
629cdf0e10cSrcweir     else if ( c == cDecimalSep )
630cdf0e10cSrcweir         return TOKEN_CHAR_VALUE | TOKEN_VALUE;
631cdf0e10cSrcweir     using namespace i18n;
632cdf0e10cSrcweir     bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
633cdf0e10cSrcweir             eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
634cdf0e10cSrcweir     sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
635cdf0e10cSrcweir 
636cdf0e10cSrcweir     //! all KParseTokens::UNI_... must be matched
637cdf0e10cSrcweir     switch ( u_charType( (sal_uInt32) c ) )
638cdf0e10cSrcweir     {
639cdf0e10cSrcweir         case U_UPPERCASE_LETTER :
640cdf0e10cSrcweir             return (nTypes & KParseTokens::UNI_UPALPHA) ?
641cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
642cdf0e10cSrcweir                 TOKEN_ILLEGAL;
643cdf0e10cSrcweir         case U_LOWERCASE_LETTER :
644cdf0e10cSrcweir             return (nTypes & KParseTokens::UNI_LOALPHA) ?
645cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
646cdf0e10cSrcweir                 TOKEN_ILLEGAL;
647cdf0e10cSrcweir         case U_TITLECASE_LETTER :
648cdf0e10cSrcweir             return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
649cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
650cdf0e10cSrcweir                 TOKEN_ILLEGAL;
651cdf0e10cSrcweir         case U_MODIFIER_LETTER :
652cdf0e10cSrcweir             return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
653cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
654cdf0e10cSrcweir                 TOKEN_ILLEGAL;
655cdf0e10cSrcweir         case U_NON_SPACING_MARK :
656cdf0e10cSrcweir         case U_COMBINING_SPACING_MARK :
657cdf0e10cSrcweir             // Non_Spacing_Mark can't be a leading character,
658cdf0e10cSrcweir             // nor can a spacing combining mark.
659cdf0e10cSrcweir             if (bStart)
660cdf0e10cSrcweir                 return TOKEN_ILLEGAL;
661cdf0e10cSrcweir             // fall through, treat it as Other_Letter.
662cdf0e10cSrcweir         case U_OTHER_LETTER :
663cdf0e10cSrcweir             return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
664cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
665cdf0e10cSrcweir                 TOKEN_ILLEGAL;
666cdf0e10cSrcweir         case U_DECIMAL_DIGIT_NUMBER :
667cdf0e10cSrcweir             return ((nTypes & KParseTokens::UNI_DIGIT) ?
668cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
669cdf0e10cSrcweir                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
670cdf0e10cSrcweir         case U_LETTER_NUMBER :
671cdf0e10cSrcweir             return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
672cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
673cdf0e10cSrcweir                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
674cdf0e10cSrcweir         case U_OTHER_NUMBER :
675cdf0e10cSrcweir             return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
676cdf0e10cSrcweir                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
677cdf0e10cSrcweir                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
678cdf0e10cSrcweir         case U_SPACE_SEPARATOR :
679cdf0e10cSrcweir             return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
680cdf0e10cSrcweir                 TOKEN_CHAR_DONTCARE : (bStart ? TOKEN_CHAR_WORD : (TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP) ));
681cdf0e10cSrcweir     }
682cdf0e10cSrcweir 
683cdf0e10cSrcweir     return TOKEN_ILLEGAL;
684cdf0e10cSrcweir }
685cdf0e10cSrcweir 
686cdf0e10cSrcweir 
getStartCharsFlags(sal_Unicode c)687cdf0e10cSrcweir UPT_FLAG_TYPE cclass_Unicode::getStartCharsFlags( sal_Unicode c )
688cdf0e10cSrcweir {
689cdf0e10cSrcweir     if ( pStart )
690cdf0e10cSrcweir     {
691cdf0e10cSrcweir         const sal_Unicode* pStr = aStartChars.getStr();
692cdf0e10cSrcweir         const sal_Unicode* p = StrChr( pStr, c );
693cdf0e10cSrcweir         if ( p )
694cdf0e10cSrcweir             return pStart[ p - pStr ];
695cdf0e10cSrcweir     }
696cdf0e10cSrcweir     return TOKEN_ILLEGAL;
697cdf0e10cSrcweir }
698cdf0e10cSrcweir 
699cdf0e10cSrcweir 
getContCharsFlags(sal_Unicode c)700cdf0e10cSrcweir UPT_FLAG_TYPE cclass_Unicode::getContCharsFlags( sal_Unicode c )
701cdf0e10cSrcweir {
702cdf0e10cSrcweir     if ( pCont )
703cdf0e10cSrcweir     {
704cdf0e10cSrcweir         const sal_Unicode* pStr = aContChars.getStr();
705cdf0e10cSrcweir         const sal_Unicode* p = StrChr( pStr, c );
706cdf0e10cSrcweir         if ( p )
707cdf0e10cSrcweir             return pCont[ p - pStr ];
708cdf0e10cSrcweir     }
709cdf0e10cSrcweir     return TOKEN_ILLEGAL;
710cdf0e10cSrcweir }
711cdf0e10cSrcweir 
712cdf0e10cSrcweir 
parseText(ParseResult & r,const OUString & rText,sal_Int32 nPos,sal_Int32 nTokenType)713cdf0e10cSrcweir void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
714cdf0e10cSrcweir {
715cdf0e10cSrcweir     using namespace i18n;
716cdf0e10cSrcweir     const sal_Unicode* const pTextStart = rText.getStr() + nPos;
717cdf0e10cSrcweir     eState = ssGetChar;
718cdf0e10cSrcweir 
719cdf0e10cSrcweir     //! All the variables below (plus ParseResult) have to be resetted on ssRewindFromValue!
720cdf0e10cSrcweir     const sal_Unicode* pSym = pTextStart;
721cdf0e10cSrcweir     const sal_Unicode* pSrc = pSym;
722cdf0e10cSrcweir     OUString aSymbol;
723cdf0e10cSrcweir     sal_Unicode c = *pSrc;
724cdf0e10cSrcweir     sal_Unicode cLast = 0;
725cdf0e10cSrcweir     int nDecSeps = 0;
726cdf0e10cSrcweir     bool bQuote = false;
727cdf0e10cSrcweir     bool bMightBeWord = true;
728cdf0e10cSrcweir     bool bMightBeWordLast = true;
729cdf0e10cSrcweir     //! All the variables above (plus ParseResult) have to be resetted on ssRewindFromValue!
730cdf0e10cSrcweir 
731cdf0e10cSrcweir     while ( (c != 0) && (eState != ssStop) )
732cdf0e10cSrcweir     {
733cdf0e10cSrcweir         UPT_FLAG_TYPE nMask = getFlags( pTextStart, pSrc - pTextStart );
734cdf0e10cSrcweir         if ( nMask & TOKEN_EXCLUDED )
735cdf0e10cSrcweir             eState = ssBounce;
736cdf0e10cSrcweir         if ( bMightBeWord )
737cdf0e10cSrcweir         {   // only relevant for ssGetValue fall back
738cdf0e10cSrcweir             if ( eState == ssGetChar || eState == ssRewindFromValue ||
739cdf0e10cSrcweir                     eState == ssIgnoreLeadingInRewind )
740cdf0e10cSrcweir                 bMightBeWord = ((nMask & TOKEN_CHAR_WORD) != 0);
741cdf0e10cSrcweir             else
742cdf0e10cSrcweir                 bMightBeWord = ((nMask & TOKEN_WORD) != 0);
743cdf0e10cSrcweir         }
744cdf0e10cSrcweir         sal_Int32 nParseTokensType = getParseTokensType( pTextStart, pSrc - pTextStart );
745cdf0e10cSrcweir         pSrc++;
746cdf0e10cSrcweir         switch (eState)
747cdf0e10cSrcweir         {
748cdf0e10cSrcweir             case ssGetChar :
749cdf0e10cSrcweir             case ssRewindFromValue :
750cdf0e10cSrcweir             case ssIgnoreLeadingInRewind :
751cdf0e10cSrcweir             {
752cdf0e10cSrcweir                 if ( (nMask & TOKEN_CHAR_VALUE) && eState != ssRewindFromValue
753cdf0e10cSrcweir                         && eState != ssIgnoreLeadingInRewind )
754cdf0e10cSrcweir                 {   //! must be first, may fall back to ssGetWord via bMightBeWord
755cdf0e10cSrcweir                     eState = ssGetValue;
756cdf0e10cSrcweir                     if ( nMask & TOKEN_VALUE_DIGIT )
757cdf0e10cSrcweir                     {
758cdf0e10cSrcweir                         if ( 128 <= c )
759cdf0e10cSrcweir                             r.TokenType = KParseType::UNI_NUMBER;
760cdf0e10cSrcweir                         else
761cdf0e10cSrcweir                             r.TokenType = KParseType::ASC_NUMBER;
762cdf0e10cSrcweir                     }
763cdf0e10cSrcweir                     else if ( c == cDecimalSep )
764cdf0e10cSrcweir                     {
765cdf0e10cSrcweir                         if ( *pSrc )
766cdf0e10cSrcweir                             ++nDecSeps;
767cdf0e10cSrcweir                         else
768cdf0e10cSrcweir                             eState = ssRewindFromValue;
769cdf0e10cSrcweir                             // retry for ONE_SINGLE_CHAR or others
770cdf0e10cSrcweir                     }
771cdf0e10cSrcweir                 }
772cdf0e10cSrcweir                 else if ( nMask & TOKEN_CHAR_WORD )
773cdf0e10cSrcweir                 {
774cdf0e10cSrcweir                     eState = ssGetWord;
775cdf0e10cSrcweir                     r.TokenType = KParseType::IDENTNAME;
776cdf0e10cSrcweir                 }
777cdf0e10cSrcweir                 else if ( nMask & TOKEN_NAME_SEP )
778cdf0e10cSrcweir                 {
779cdf0e10cSrcweir                     eState = ssGetWordFirstChar;
780cdf0e10cSrcweir                     bQuote = true;
781cdf0e10cSrcweir                     pSym++;
782cdf0e10cSrcweir                     nParseTokensType = 0;   // will be taken of first real character
783cdf0e10cSrcweir                     r.TokenType = KParseType::SINGLE_QUOTE_NAME;
784cdf0e10cSrcweir                 }
785cdf0e10cSrcweir                 else if ( nMask & TOKEN_CHAR_STRING )
786cdf0e10cSrcweir                 {
787cdf0e10cSrcweir                     eState = ssGetString;
788cdf0e10cSrcweir                     pSym++;
789cdf0e10cSrcweir                     nParseTokensType = 0;   // will be taken of first real character
790cdf0e10cSrcweir                     r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
791cdf0e10cSrcweir                 }
792cdf0e10cSrcweir                 else if ( nMask & TOKEN_CHAR_DONTCARE )
793cdf0e10cSrcweir                 {
794cdf0e10cSrcweir                     if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
795cdf0e10cSrcweir                     {
796cdf0e10cSrcweir                         if (eState == ssRewindFromValue)
797cdf0e10cSrcweir                             eState = ssIgnoreLeadingInRewind;
798cdf0e10cSrcweir                         r.LeadingWhiteSpace++;
799cdf0e10cSrcweir                         pSym++;
800cdf0e10cSrcweir                         nParseTokensType = 0;   // wait until real character
801cdf0e10cSrcweir                         bMightBeWord = true;
802cdf0e10cSrcweir                     }
803cdf0e10cSrcweir                     else
804cdf0e10cSrcweir                         eState = ssBounce;
805cdf0e10cSrcweir                 }
806cdf0e10cSrcweir                 else if ( nMask & TOKEN_CHAR_BOOL )
807cdf0e10cSrcweir                 {
808cdf0e10cSrcweir                     eState = ssGetBool;
809cdf0e10cSrcweir                     r.TokenType = KParseType::BOOLEAN;
810cdf0e10cSrcweir                 }
811cdf0e10cSrcweir                 else if ( nMask & TOKEN_CHAR )
812cdf0e10cSrcweir                 {   //! must be last
813cdf0e10cSrcweir                     eState = ssStop;
814cdf0e10cSrcweir                     r.TokenType = KParseType::ONE_SINGLE_CHAR;
815cdf0e10cSrcweir                 }
816cdf0e10cSrcweir                 else
817cdf0e10cSrcweir                     eState = ssBounce;      // not known
818cdf0e10cSrcweir             }
819cdf0e10cSrcweir             break;
820cdf0e10cSrcweir             case ssGetValue :
821cdf0e10cSrcweir             {
822cdf0e10cSrcweir                 if ( nMask & TOKEN_VALUE_DIGIT )
823cdf0e10cSrcweir                 {
824cdf0e10cSrcweir                     if ( 128 <= c )
825cdf0e10cSrcweir                         r.TokenType = KParseType::UNI_NUMBER;
826cdf0e10cSrcweir                     else if ( r.TokenType != KParseType::UNI_NUMBER )
827cdf0e10cSrcweir                         r.TokenType = KParseType::ASC_NUMBER;
828cdf0e10cSrcweir                 }
829cdf0e10cSrcweir                 if ( nMask & TOKEN_VALUE )
830cdf0e10cSrcweir                 {
831cdf0e10cSrcweir                     if ( c == cDecimalSep && ++nDecSeps > 1 )
832cdf0e10cSrcweir                     {
833cdf0e10cSrcweir                         if ( pSrc - pTextStart == 2 )
834cdf0e10cSrcweir                             eState = ssRewindFromValue;
835cdf0e10cSrcweir                             // consecutive separators
836cdf0e10cSrcweir                         else
837cdf0e10cSrcweir                             eState = ssStopBack;
838cdf0e10cSrcweir                     }
839cdf0e10cSrcweir                     // else keep it going
840cdf0e10cSrcweir                 }
841cdf0e10cSrcweir                 else if ( c == 'E' || c == 'e' )
842cdf0e10cSrcweir                 {
843cdf0e10cSrcweir                     UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
844cdf0e10cSrcweir                     if ( nNext & TOKEN_VALUE_EXP )
845cdf0e10cSrcweir                         ;   // keep it going
846cdf0e10cSrcweir                     else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
847cdf0e10cSrcweir                     {   // might be a numerical name (1.2efg)
848cdf0e10cSrcweir                         eState = ssGetWord;
849cdf0e10cSrcweir                         r.TokenType = KParseType::IDENTNAME;
850cdf0e10cSrcweir                     }
851cdf0e10cSrcweir                     else
852cdf0e10cSrcweir                         eState = ssStopBack;
853cdf0e10cSrcweir                 }
854cdf0e10cSrcweir                 else if ( nMask & TOKEN_VALUE_SIGN )
855cdf0e10cSrcweir                 {
856cdf0e10cSrcweir                     if ( (cLast == 'E') || (cLast == 'e') )
857cdf0e10cSrcweir                     {
858cdf0e10cSrcweir                         UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
859cdf0e10cSrcweir                         if ( nNext & TOKEN_VALUE_EXP_VALUE )
860cdf0e10cSrcweir                             ;   // keep it going
861cdf0e10cSrcweir                         else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
862cdf0e10cSrcweir                         {   // might be a numerical name (1.2e+fg)
863cdf0e10cSrcweir                             eState = ssGetWord;
864cdf0e10cSrcweir                             r.TokenType = KParseType::IDENTNAME;
865cdf0e10cSrcweir                         }
866cdf0e10cSrcweir                         else
867cdf0e10cSrcweir                             eState = ssStopBack;
868cdf0e10cSrcweir                     }
869cdf0e10cSrcweir                     else if ( bMightBeWord )
870cdf0e10cSrcweir                     {   // might be a numerical name (1.2+fg)
871cdf0e10cSrcweir                         eState = ssGetWord;
872cdf0e10cSrcweir                         r.TokenType = KParseType::IDENTNAME;
873cdf0e10cSrcweir                     }
874cdf0e10cSrcweir                     else
875cdf0e10cSrcweir                         eState = ssStopBack;
876cdf0e10cSrcweir                 }
877cdf0e10cSrcweir                 else if ( bMightBeWord && (nMask & TOKEN_WORD) )
878cdf0e10cSrcweir                 {   // might be a numerical name (1995.A1)
879cdf0e10cSrcweir                     eState = ssGetWord;
880cdf0e10cSrcweir                     r.TokenType = KParseType::IDENTNAME;
881cdf0e10cSrcweir                 }
882cdf0e10cSrcweir                 else
883cdf0e10cSrcweir                     eState = ssStopBack;
884cdf0e10cSrcweir             }
885cdf0e10cSrcweir             break;
886cdf0e10cSrcweir             case ssGetWordFirstChar :
887cdf0e10cSrcweir                 eState = ssGetWord;
888cdf0e10cSrcweir                 // fall thru
889cdf0e10cSrcweir             case ssGetWord :
890cdf0e10cSrcweir             {
891cdf0e10cSrcweir                 if ( nMask & TOKEN_WORD )
892cdf0e10cSrcweir                     ;   // keep it going
893cdf0e10cSrcweir                 else if ( nMask & TOKEN_NAME_SEP )
894cdf0e10cSrcweir                 {
895cdf0e10cSrcweir                     if ( bQuote )
896cdf0e10cSrcweir                     {
897cdf0e10cSrcweir                         if ( cLast == '\\' )
898cdf0e10cSrcweir                         {   // escaped
899cdf0e10cSrcweir                             aSymbol += OUString( pSym, pSrc - pSym - 2 );
900cdf0e10cSrcweir                             aSymbol += OUString( &c, 1);
901cdf0e10cSrcweir                         }
902cdf0e10cSrcweir                         else
903cdf0e10cSrcweir                         {
904cdf0e10cSrcweir                             eState = ssStop;
905cdf0e10cSrcweir                             aSymbol += OUString( pSym, pSrc - pSym - 1 );
906cdf0e10cSrcweir                         }
907cdf0e10cSrcweir                         pSym = pSrc;
908cdf0e10cSrcweir                     }
909cdf0e10cSrcweir                     else
910cdf0e10cSrcweir                         eState = ssStopBack;
911cdf0e10cSrcweir                 }
912cdf0e10cSrcweir                 else if ( bQuote )
913cdf0e10cSrcweir                     ;   // keep it going
914cdf0e10cSrcweir                 else
915cdf0e10cSrcweir                     eState = ssStopBack;
916cdf0e10cSrcweir             }
917cdf0e10cSrcweir             break;
918cdf0e10cSrcweir             case ssGetString :
919cdf0e10cSrcweir             {
920cdf0e10cSrcweir                 if ( nMask & TOKEN_STRING_SEP )
921cdf0e10cSrcweir                 {
922cdf0e10cSrcweir                     if ( cLast == '\\' )
923cdf0e10cSrcweir                     {   // escaped
924cdf0e10cSrcweir                         aSymbol += OUString( pSym, pSrc - pSym - 2 );
925cdf0e10cSrcweir                         aSymbol += OUString( &c, 1);
926cdf0e10cSrcweir                     }
927cdf0e10cSrcweir                     else if ( c == *pSrc &&
928cdf0e10cSrcweir                             !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
929cdf0e10cSrcweir                     {   // "" => literal " escaped
930cdf0e10cSrcweir                         aSymbol += OUString( pSym, pSrc - pSym );
931cdf0e10cSrcweir                         pSrc++;
932cdf0e10cSrcweir                     }
933cdf0e10cSrcweir                     else
934cdf0e10cSrcweir                     {
935cdf0e10cSrcweir                         eState = ssStop;
936cdf0e10cSrcweir                         aSymbol += OUString( pSym, pSrc - pSym - 1 );
937cdf0e10cSrcweir                     }
938cdf0e10cSrcweir                     pSym = pSrc;
939cdf0e10cSrcweir                 }
940cdf0e10cSrcweir             }
941cdf0e10cSrcweir             break;
942cdf0e10cSrcweir             case ssGetBool :
943cdf0e10cSrcweir             {
944cdf0e10cSrcweir                 if ( (nMask & TOKEN_BOOL) )
945cdf0e10cSrcweir                     eState = ssStop;    // maximum 2: <, >, <>, <=, >=
946cdf0e10cSrcweir                 else
947cdf0e10cSrcweir                     eState = ssStopBack;
948cdf0e10cSrcweir             }
949cdf0e10cSrcweir             break;
950cdf0e10cSrcweir             case ssStopBack :
951cdf0e10cSrcweir             case ssBounce :
952cdf0e10cSrcweir             case ssStop :
953cdf0e10cSrcweir                 ;   // nothing, no compiler warning
954cdf0e10cSrcweir             break;
955cdf0e10cSrcweir         }
956cdf0e10cSrcweir         if ( eState == ssRewindFromValue )
957cdf0e10cSrcweir         {
958cdf0e10cSrcweir             r = ParseResult();
959cdf0e10cSrcweir             pSym = pTextStart;
960cdf0e10cSrcweir             pSrc = pSym;
961cdf0e10cSrcweir             aSymbol = OUString();
962cdf0e10cSrcweir             c = *pSrc;
963cdf0e10cSrcweir             cLast = 0;
964cdf0e10cSrcweir             nDecSeps = 0;
965cdf0e10cSrcweir             bQuote = false;
966cdf0e10cSrcweir             bMightBeWord = true;
967cdf0e10cSrcweir             bMightBeWordLast = true;
968cdf0e10cSrcweir         }
969cdf0e10cSrcweir         else
970cdf0e10cSrcweir         {
971cdf0e10cSrcweir             if ( !(r.TokenType & nTokenType) )
972cdf0e10cSrcweir             {
973cdf0e10cSrcweir                 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
974cdf0e10cSrcweir                         && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
975cdf0e10cSrcweir                     ;   // keep a number that might be a word
976cdf0e10cSrcweir                 else if ( r.LeadingWhiteSpace == (pSrc - pTextStart) )
977cdf0e10cSrcweir                     ;   // keep ignored white space
978cdf0e10cSrcweir                 else if ( !r.TokenType && eState == ssGetValue && (nMask & TOKEN_VALUE_SEP) )
979cdf0e10cSrcweir                     ;   // keep uncertain value
980cdf0e10cSrcweir                 else
981cdf0e10cSrcweir                     eState = ssBounce;
982cdf0e10cSrcweir             }
983cdf0e10cSrcweir             if ( eState == ssBounce )
984cdf0e10cSrcweir             {
985cdf0e10cSrcweir                 r.TokenType = 0;
986cdf0e10cSrcweir                 eState = ssStopBack;
987cdf0e10cSrcweir             }
988cdf0e10cSrcweir             if ( eState == ssStopBack )
989cdf0e10cSrcweir             {   // put back
990cdf0e10cSrcweir                 pSrc--;
991cdf0e10cSrcweir                 bMightBeWord = bMightBeWordLast;
992cdf0e10cSrcweir                 eState = ssStop;
993cdf0e10cSrcweir             }
994cdf0e10cSrcweir             if ( eState != ssStop )
995cdf0e10cSrcweir             {
996cdf0e10cSrcweir                 if ( !r.StartFlags )
997cdf0e10cSrcweir                     r.StartFlags |= nParseTokensType;
998cdf0e10cSrcweir                 else
999cdf0e10cSrcweir                     r.ContFlags |= nParseTokensType;
1000cdf0e10cSrcweir             }
1001cdf0e10cSrcweir             bMightBeWordLast = bMightBeWord;
1002cdf0e10cSrcweir             cLast = c;
1003cdf0e10cSrcweir             c = *pSrc;
1004cdf0e10cSrcweir         }
1005cdf0e10cSrcweir     }
1006cdf0e10cSrcweir     // r.CharLen is the length in characters (not code points) of the parsed
1007cdf0e10cSrcweir     // token not including any leading white space, change this calculation if
1008cdf0e10cSrcweir     // multi-code-point Unicode characters are to be supported.
1009cdf0e10cSrcweir     r.CharLen = pSrc - pTextStart - r.LeadingWhiteSpace;
1010cdf0e10cSrcweir     r.EndPos = nPos + (pSrc - pTextStart);
1011cdf0e10cSrcweir     if ( r.TokenType & KParseType::ASC_NUMBER )
1012cdf0e10cSrcweir     {
1013cdf0e10cSrcweir         r.Value = rtl_math_uStringToDouble( pTextStart + r.LeadingWhiteSpace,
1014cdf0e10cSrcweir                 pTextStart + r.EndPos, cDecimalSep, cGroupSep, NULL, NULL );
1015cdf0e10cSrcweir         if ( bMightBeWord )
1016cdf0e10cSrcweir             r.TokenType |= KParseType::IDENTNAME;
1017cdf0e10cSrcweir     }
1018cdf0e10cSrcweir     else if ( r.TokenType & KParseType::UNI_NUMBER )
1019cdf0e10cSrcweir     {
1020cdf0e10cSrcweir         if ( !xNatNumSup.is() )
1021cdf0e10cSrcweir         {
1022cdf0e10cSrcweir #define NATIVENUMBERSUPPLIER_SERVICENAME "com.sun.star.i18n.NativeNumberSupplier"
1023cdf0e10cSrcweir             if ( xMSF.is() )
1024cdf0e10cSrcweir             {
1025cdf0e10cSrcweir                 xNatNumSup = Reference< XNativeNumberSupplier > (
1026cdf0e10cSrcweir                         xMSF->createInstance( OUString(
1027cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
1028cdf0e10cSrcweir                                     NATIVENUMBERSUPPLIER_SERVICENAME ) ) ),
1029cdf0e10cSrcweir                         UNO_QUERY );
1030cdf0e10cSrcweir             }
1031cdf0e10cSrcweir             if ( !xNatNumSup.is() )
1032cdf0e10cSrcweir             {
1033cdf0e10cSrcweir                 throw RuntimeException( OUString(
1034cdf0e10cSrcweir #ifdef DBG_UTIL
1035cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
1036*5e7dbebbSJohn Bampton                         "cclass_Unicode::parseText: can't instantiate "
1037cdf0e10cSrcweir                         NATIVENUMBERSUPPLIER_SERVICENAME )
1038cdf0e10cSrcweir #endif
1039cdf0e10cSrcweir                     ), *this );
1040cdf0e10cSrcweir             }
1041cdf0e10cSrcweir #undef NATIVENUMBERSUPPLIER_SERVICENAME
1042cdf0e10cSrcweir         }
1043cdf0e10cSrcweir         OUString aTmp( pTextStart + r.LeadingWhiteSpace, r.EndPos - nPos +
1044cdf0e10cSrcweir                 r.LeadingWhiteSpace );
1045cdf0e10cSrcweir         // transliterate to ASCII
1046cdf0e10cSrcweir         aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
1047cdf0e10cSrcweir                 NativeNumberMode::NATNUM0 );
1048cdf0e10cSrcweir         r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep, NULL, NULL );
1049cdf0e10cSrcweir         if ( bMightBeWord )
1050cdf0e10cSrcweir             r.TokenType |= KParseType::IDENTNAME;
1051cdf0e10cSrcweir     }
1052cdf0e10cSrcweir     else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1053cdf0e10cSrcweir     {
1054cdf0e10cSrcweir         if ( pSym < pSrc )
1055cdf0e10cSrcweir         {   //! open quote
1056cdf0e10cSrcweir             aSymbol += OUString( pSym, pSrc - pSym );
1057cdf0e10cSrcweir             r.TokenType |= KParseType::MISSING_QUOTE;
1058cdf0e10cSrcweir         }
1059cdf0e10cSrcweir         r.DequotedNameOrString = aSymbol;
1060cdf0e10cSrcweir     }
1061cdf0e10cSrcweir }
1062cdf0e10cSrcweir 
1063cdf0e10cSrcweir } } } }
1064