xref: /trunk/main/i18nutil/source/utility/unicode.cxx (revision 75272fef)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include <com/sun/star/i18n/UnicodeType.hpp>
25 #include <com/sun/star/i18n/KCharacterType.hpp>
26 #include <i18nutil/unicode.hxx>
27 #include "unicode_data.h"
28 
29 using namespace ::com::sun::star::i18n;
30 
31 static ScriptTypeList defaultTypeList[] = {
32     { UnicodeScript_kBasicLatin,
33       UnicodeScript_kBasicLatin,
34       UnicodeScript_kBasicLatin },      // 0,
35     { UnicodeScript_kLatin1Supplement,
36       UnicodeScript_kLatin1Supplement,
37       UnicodeScript_kLatin1Supplement },// 1,
38     { UnicodeScript_kLatinExtendedA,
39       UnicodeScript_kLatinExtendedA,
40       UnicodeScript_kLatinExtendedA }, // 2,
41     { UnicodeScript_kLatinExtendedB,
42       UnicodeScript_kLatinExtendedB,
43       UnicodeScript_kLatinExtendedB }, // 3,
44     { UnicodeScript_kIPAExtension,
45       UnicodeScript_kIPAExtension,
46       UnicodeScript_kIPAExtension }, // 4,
47     { UnicodeScript_kSpacingModifier,
48       UnicodeScript_kSpacingModifier,
49       UnicodeScript_kSpacingModifier }, // 5,
50     { UnicodeScript_kCombiningDiacritical,
51       UnicodeScript_kCombiningDiacritical,
52       UnicodeScript_kCombiningDiacritical }, // 6,
53     { UnicodeScript_kGreek,
54       UnicodeScript_kGreek,
55       UnicodeScript_kGreek }, // 7,
56     { UnicodeScript_kCyrillic,
57       UnicodeScript_kCyrillic,
58       UnicodeScript_kCyrillic }, // 8,
59     { UnicodeScript_kArmenian,
60       UnicodeScript_kArmenian,
61       UnicodeScript_kArmenian }, // 9,
62     { UnicodeScript_kHebrew,
63       UnicodeScript_kHebrew,
64       UnicodeScript_kHebrew }, // 10,
65     { UnicodeScript_kArabic,
66       UnicodeScript_kArabic,
67       UnicodeScript_kArabic }, // 11,
68     { UnicodeScript_kSyriac,
69       UnicodeScript_kSyriac,
70       UnicodeScript_kSyriac }, // 12,
71     { UnicodeScript_kThaana,
72       UnicodeScript_kThaana,
73       UnicodeScript_kThaana }, // 13,
74     { UnicodeScript_kDevanagari,
75       UnicodeScript_kDevanagari,
76       UnicodeScript_kDevanagari }, // 14,
77     { UnicodeScript_kBengali,
78       UnicodeScript_kBengali,
79       UnicodeScript_kBengali }, // 15,
80     { UnicodeScript_kGurmukhi,
81       UnicodeScript_kGurmukhi,
82       UnicodeScript_kGurmukhi }, // 16,
83     { UnicodeScript_kGujarati,
84       UnicodeScript_kGujarati,
85       UnicodeScript_kGujarati }, // 17,
86     { UnicodeScript_kOriya,
87       UnicodeScript_kOriya,
88       UnicodeScript_kOriya }, // 18,
89     { UnicodeScript_kTamil,
90       UnicodeScript_kTamil,
91       UnicodeScript_kTamil }, // 19,
92     { UnicodeScript_kTelugu,
93       UnicodeScript_kTelugu,
94       UnicodeScript_kTelugu }, // 20,
95     { UnicodeScript_kKannada,
96       UnicodeScript_kKannada,
97       UnicodeScript_kKannada }, // 21,
98     { UnicodeScript_kMalayalam,
99       UnicodeScript_kMalayalam,
100       UnicodeScript_kMalayalam }, // 22,
101     { UnicodeScript_kSinhala,
102       UnicodeScript_kSinhala,
103       UnicodeScript_kSinhala }, // 23,
104     { UnicodeScript_kThai,
105       UnicodeScript_kThai,
106       UnicodeScript_kThai }, // 24,
107     { UnicodeScript_kLao,
108       UnicodeScript_kLao,
109       UnicodeScript_kLao }, // 25,
110     { UnicodeScript_kTibetan,
111       UnicodeScript_kTibetan,
112       UnicodeScript_kTibetan }, // 26,
113     { UnicodeScript_kMyanmar,
114       UnicodeScript_kMyanmar,
115       UnicodeScript_kMyanmar }, // 27,
116     { UnicodeScript_kGeorgian,
117       UnicodeScript_kGeorgian,
118       UnicodeScript_kGeorgian }, // 28,
119     { UnicodeScript_kHangulJamo,
120       UnicodeScript_kHangulJamo,
121       UnicodeScript_kHangulJamo }, // 29,
122     { UnicodeScript_kEthiopic,
123       UnicodeScript_kEthiopic,
124       UnicodeScript_kEthiopic }, // 30,
125     { UnicodeScript_kCherokee,
126       UnicodeScript_kCherokee,
127       UnicodeScript_kCherokee }, // 31,
128     { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
129       UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
130       UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
131     { UnicodeScript_kOgham,
132       UnicodeScript_kOgham,
133       UnicodeScript_kOgham }, // 33,
134     { UnicodeScript_kRunic,
135       UnicodeScript_kRunic,
136       UnicodeScript_kRunic }, // 34,
137     { UnicodeScript_kKhmer,
138       UnicodeScript_kKhmer,
139       UnicodeScript_kKhmer }, // 35,
140     { UnicodeScript_kMongolian,
141       UnicodeScript_kMongolian,
142       UnicodeScript_kMongolian }, // 36,
143     { UnicodeScript_kLatinExtendedAdditional,
144       UnicodeScript_kLatinExtendedAdditional,
145       UnicodeScript_kLatinExtendedAdditional }, // 37,
146     { UnicodeScript_kGreekExtended,
147       UnicodeScript_kGreekExtended,
148       UnicodeScript_kGreekExtended }, // 38,
149     { UnicodeScript_kGeneralPunctuation,
150       UnicodeScript_kGeneralPunctuation,
151       UnicodeScript_kGeneralPunctuation }, // 39,
152     { UnicodeScript_kSuperSubScript,
153       UnicodeScript_kSuperSubScript,
154       UnicodeScript_kSuperSubScript }, // 40,
155     { UnicodeScript_kCurrencySymbolScript,
156       UnicodeScript_kCurrencySymbolScript,
157       UnicodeScript_kCurrencySymbolScript }, // 41,
158     { UnicodeScript_kSymbolCombiningMark,
159       UnicodeScript_kSymbolCombiningMark,
160       UnicodeScript_kSymbolCombiningMark }, // 42,
161     { UnicodeScript_kLetterlikeSymbol,
162       UnicodeScript_kLetterlikeSymbol,
163       UnicodeScript_kLetterlikeSymbol }, // 43,
164     { UnicodeScript_kNumberForm,
165       UnicodeScript_kNumberForm,
166       UnicodeScript_kNumberForm }, // 44,
167     { UnicodeScript_kArrow,
168       UnicodeScript_kArrow,
169       UnicodeScript_kArrow }, // 45,
170     { UnicodeScript_kMathOperator,
171       UnicodeScript_kMathOperator,
172       UnicodeScript_kMathOperator }, // 46,
173     { UnicodeScript_kMiscTechnical,
174       UnicodeScript_kMiscTechnical,
175       UnicodeScript_kMiscTechnical }, // 47,
176     { UnicodeScript_kControlPicture,
177       UnicodeScript_kControlPicture,
178       UnicodeScript_kControlPicture }, // 48,
179     { UnicodeScript_kOpticalCharacter,
180       UnicodeScript_kOpticalCharacter,
181       UnicodeScript_kOpticalCharacter }, // 49,
182     { UnicodeScript_kEnclosedAlphanumeric,
183       UnicodeScript_kEnclosedAlphanumeric,
184       UnicodeScript_kEnclosedAlphanumeric }, // 50,
185     { UnicodeScript_kBoxDrawing,
186       UnicodeScript_kBoxDrawing,
187       UnicodeScript_kBoxDrawing }, // 51,
188     { UnicodeScript_kBlockElement,
189       UnicodeScript_kBlockElement,
190       UnicodeScript_kBlockElement }, // 52,
191     { UnicodeScript_kGeometricShape,
192       UnicodeScript_kGeometricShape,
193       UnicodeScript_kGeometricShape }, // 53,
194     { UnicodeScript_kMiscSymbol,
195       UnicodeScript_kMiscSymbol,
196       UnicodeScript_kMiscSymbol }, // 54,
197     { UnicodeScript_kDingbat,
198       UnicodeScript_kDingbat,
199       UnicodeScript_kDingbat }, // 55,
200     { UnicodeScript_kBraillePatterns,
201       UnicodeScript_kBraillePatterns,
202       UnicodeScript_kBraillePatterns }, // 56,
203     { UnicodeScript_kCJKRadicalsSupplement,
204       UnicodeScript_kCJKRadicalsSupplement,
205       UnicodeScript_kCJKRadicalsSupplement }, // 57,
206     { UnicodeScript_kKangxiRadicals,
207       UnicodeScript_kKangxiRadicals,
208       UnicodeScript_kKangxiRadicals }, // 58,
209     { UnicodeScript_kIdeographicDescriptionCharacters,
210       UnicodeScript_kIdeographicDescriptionCharacters,
211       UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
212     { UnicodeScript_kCJKSymbolPunctuation,
213       UnicodeScript_kCJKSymbolPunctuation,
214       UnicodeScript_kCJKSymbolPunctuation }, // 60,
215     { UnicodeScript_kHiragana,
216       UnicodeScript_kHiragana,
217       UnicodeScript_kHiragana }, // 61,
218     { UnicodeScript_kKatakana,
219       UnicodeScript_kKatakana,
220       UnicodeScript_kKatakana }, // 62,
221     { UnicodeScript_kBopomofo,
222       UnicodeScript_kBopomofo,
223       UnicodeScript_kBopomofo }, // 63,
224     { UnicodeScript_kHangulCompatibilityJamo,
225       UnicodeScript_kHangulCompatibilityJamo,
226       UnicodeScript_kHangulCompatibilityJamo }, // 64,
227     { UnicodeScript_kKanbun,
228       UnicodeScript_kKanbun,
229       UnicodeScript_kKanbun }, // 65,
230     { UnicodeScript_kBopomofoExtended,
231       UnicodeScript_kBopomofoExtended,
232       UnicodeScript_kBopomofoExtended }, // 66,
233     { UnicodeScript_kEnclosedCJKLetterMonth,
234       UnicodeScript_kEnclosedCJKLetterMonth,
235       UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
236     { UnicodeScript_kCJKCompatibility,
237       UnicodeScript_kCJKCompatibility,
238       UnicodeScript_kCJKCompatibility }, // 68,
239     { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
240       UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
241       UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
242     { UnicodeScript_kCJKUnifiedIdeograph,
243       UnicodeScript_kCJKUnifiedIdeograph,
244       UnicodeScript_kCJKUnifiedIdeograph }, // 70,
245     { UnicodeScript_kYiSyllables,
246       UnicodeScript_kYiSyllables,
247       UnicodeScript_kYiSyllables }, // 71,
248     { UnicodeScript_kYiRadicals,
249       UnicodeScript_kYiRadicals,
250       UnicodeScript_kYiRadicals }, // 72,
251     { UnicodeScript_kHangulSyllable,
252       UnicodeScript_kHangulSyllable,
253       UnicodeScript_kHangulSyllable }, // 73,
254     { UnicodeScript_kHighSurrogate,
255       UnicodeScript_kHighSurrogate,
256       UnicodeScript_kHighSurrogate }, // 74,
257     { UnicodeScript_kHighPrivateUseSurrogate,
258       UnicodeScript_kHighPrivateUseSurrogate,
259       UnicodeScript_kHighPrivateUseSurrogate }, // 75,
260     { UnicodeScript_kLowSurrogate,
261       UnicodeScript_kLowSurrogate,
262       UnicodeScript_kLowSurrogate }, // 76,
263     { UnicodeScript_kPrivateUse,
264       UnicodeScript_kPrivateUse,
265       UnicodeScript_kPrivateUse }, // 77,
266     { UnicodeScript_kCJKCompatibilityIdeograph,
267       UnicodeScript_kCJKCompatibilityIdeograph,
268       UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
269     { UnicodeScript_kAlphabeticPresentation,
270       UnicodeScript_kAlphabeticPresentation,
271       UnicodeScript_kAlphabeticPresentation }, // 79,
272     { UnicodeScript_kArabicPresentationA,
273       UnicodeScript_kArabicPresentationA,
274       UnicodeScript_kArabicPresentationA }, // 80,
275     { UnicodeScript_kCombiningHalfMark,
276       UnicodeScript_kCombiningHalfMark,
277       UnicodeScript_kCombiningHalfMark }, // 81,
278     { UnicodeScript_kCJKCompatibilityForm,
279       UnicodeScript_kCJKCompatibilityForm,
280       UnicodeScript_kCJKCompatibilityForm }, // 82,
281     { UnicodeScript_kSmallFormVariant,
282       UnicodeScript_kSmallFormVariant,
283       UnicodeScript_kSmallFormVariant }, // 83,
284     { UnicodeScript_kArabicPresentationB,
285       UnicodeScript_kArabicPresentationB,
286       UnicodeScript_kArabicPresentationB }, // 84,
287     { UnicodeScript_kNoScript,
288       UnicodeScript_kNoScript,
289       UnicodeScript_kNoScript }, // 85,
290     { UnicodeScript_kHalfwidthFullwidthForm,
291       UnicodeScript_kHalfwidthFullwidthForm,
292       UnicodeScript_kHalfwidthFullwidthForm }, // 86,
293     { UnicodeScript_kScriptCount,
294       UnicodeScript_kScriptCount,
295       UnicodeScript_kNoScript } // 87,
296 };
297 
298 sal_Int16 SAL_CALL
getUnicodeScriptType(const sal_Unicode ch,ScriptTypeList * typeList,sal_Int16 unknownType)299 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
300 
301     if (!typeList) {
302         typeList = defaultTypeList;
303         unknownType = UnicodeScript_kNoScript;
304     }
305 
306     sal_Int16 i = 0, type = typeList[0].to;
307     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
308         type = typeList[++i].to;
309     }
310 
311     return (type < UnicodeScript_kScriptCount &&
312             ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
313             typeList[i].value : unknownType;
314 }
315 
316 sal_Bool SAL_CALL
isUnicodeScriptType(const sal_Unicode ch,sal_Int16 type)317 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
318     return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
319         ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
320 }
321 
322 sal_Unicode SAL_CALL
getUnicodeScriptStart(UnicodeScript type)323 unicode::getUnicodeScriptStart( UnicodeScript type) {
324     return UnicodeScriptType[type][UnicodeScriptTypeFrom];
325 }
326 
327 sal_Unicode SAL_CALL
getUnicodeScriptEnd(UnicodeScript type)328 unicode::getUnicodeScriptEnd( UnicodeScript type) {
329     return UnicodeScriptType[type][UnicodeScriptTypeTo];
330 }
331 
332 sal_Int16 SAL_CALL
getUnicodeType(const sal_Unicode ch)333 unicode::getUnicodeType( const sal_Unicode ch ) {
334     static sal_Unicode c = 0x00;
335     static sal_Int16 r = 0x00;
336 
337     if (ch == c) return r;
338     else c = ch;
339 
340     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
341     return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
342         UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
343 }
344 
345 sal_uInt8 SAL_CALL
getUnicodeDirection(const sal_Unicode ch)346 unicode::getUnicodeDirection( const sal_Unicode ch ) {
347     static sal_Unicode c = 0x00;
348     static sal_uInt8 r = 0x00;
349 
350     if (ch == c) return r;
351     else c = ch;
352 
353     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
354     return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
355         UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
356 
357 }
358 
// Category bit masks.
//
// bit(n) turns a UnicodeType category value n into a single-bit mask, so a
// set of categories can be tested with one AND.  Every macro body (and the
// argument of bit) is wrapped in parentheses so that a mask keeps its value
// when combined with other operators, e.g. `x & DIGITMASK`.
#define bit(name)   (1 << (name))

#define UPPERMASK   (bit(UnicodeType::UPPERCASE_LETTER))

#define LOWERMASK   (bit(UnicodeType::LOWERCASE_LETTER))

#define TITLEMASK   (bit(UnicodeType::TITLECASE_LETTER))

// All number categories, not only decimal digits.
#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
            bit(UnicodeType::LETTER_NUMBER)|\
            bit(UnicodeType::OTHER_NUMBER))

// All letter categories.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

// Letters, numbers and the three mark categories.
#define BASEMASK    (DIGITMASK|ALPHAMASK|\
            bit(UnicodeType::NON_SPACING_MARK)|\
            bit(UnicodeType::ENCLOSING_MARK)|\
            bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
            bit(UnicodeType::INITIAL_PUNCTUATION)|\
            bit(UnicodeType::FINAL_PUNCTUATION)|\
            bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
            bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
            bit(UnicodeType::CURRENCY_SYMBOL)|\
            bit(UnicodeType::MODIFIER_SYMBOL)|\
            bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// Line and paragraph separators count as both space and control.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
401 
402 #define IsType(func, mask)  \
403 sal_Bool SAL_CALL func( const sal_Unicode ch) {\
404     return (bit(getUnicodeType(ch)) & (mask)) != 0;\
405 }
406 
IsType(unicode::isUpper,UPPERMASK)407 IsType(unicode::isUpper, UPPERMASK)
408 IsType(unicode::isLower, LOWERMASK)
409 IsType(unicode::isTitle, DIGITMASK)
410 IsType(unicode::isControl, CONTROLMASK)
411 IsType(unicode::isPrint, PRINTMASK)
412 IsType(unicode::isAlpha, ALPHAMASK)
413 IsType(unicode::isDigit, DIGITMASK)
414 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
415 IsType(unicode::isSpace, SPACEMASK)
416 IsType(unicode::isBase, BASEMASK)
417 IsType(unicode::isPunctuation, PUNCTUATIONMASK)
418 
419 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
420             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
421 
422 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
423     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
424 }
425 
getCharType(const sal_Unicode ch)426 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
427 {
428     using namespace ::com::sun::star::i18n::KCharacterType;
429 
430     switch ( getUnicodeType( ch ) ) {
431     // Upper
432     case UnicodeType::UPPERCASE_LETTER :
433         return UPPER|LETTER|PRINTABLE|BASE_FORM;
434 
435     // Lower
436     case UnicodeType::LOWERCASE_LETTER :
437         return LOWER|LETTER|PRINTABLE|BASE_FORM;
438 
439     // Title
440     case UnicodeType::TITLECASE_LETTER :
441         return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
442 
443     // Letter
444     case UnicodeType::MODIFIER_LETTER :
445     case UnicodeType::OTHER_LETTER :
446         return LETTER|PRINTABLE|BASE_FORM;
447 
448     // Digit
449     case UnicodeType::DECIMAL_DIGIT_NUMBER:
450     case UnicodeType::LETTER_NUMBER:
451     case UnicodeType::OTHER_NUMBER:
452         return DIGIT|PRINTABLE|BASE_FORM;
453 
454     // Base
455     case UnicodeType::NON_SPACING_MARK:
456     case UnicodeType::ENCLOSING_MARK:
457     case UnicodeType::COMBINING_SPACING_MARK:
458         return BASE_FORM|PRINTABLE;
459 
460     // Print
461     case UnicodeType::SPACE_SEPARATOR:
462 
463     case UnicodeType::DASH_PUNCTUATION:
464     case UnicodeType::INITIAL_PUNCTUATION:
465     case UnicodeType::FINAL_PUNCTUATION:
466     case UnicodeType::CONNECTOR_PUNCTUATION:
467     case UnicodeType::OTHER_PUNCTUATION:
468 
469     case UnicodeType::MATH_SYMBOL:
470     case UnicodeType::CURRENCY_SYMBOL:
471     case UnicodeType::MODIFIER_SYMBOL:
472     case UnicodeType::OTHER_SYMBOL:
473         return PRINTABLE;
474 
475     // Control
476     case UnicodeType::CONTROL:
477     case UnicodeType::FORMAT:
478         return CONTROL;
479 
480     case UnicodeType::LINE_SEPARATOR:
481     case UnicodeType::PARAGRAPH_SEPARATOR:
482         return CONTROL|PRINTABLE;
483 
484     // for all others
485     default:
486         return 0;
487     }
488 }
489 
490 
491