1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include <com/sun/star/i18n/UnicodeType.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <i18nutil/unicode.hxx>
31 #include "unicode_data.h"
32 
33 using namespace ::com::sun::star::i18n;
34 
35 static ScriptTypeList defaultTypeList[] = {
36     { UnicodeScript_kBasicLatin,
37       UnicodeScript_kBasicLatin,
38       UnicodeScript_kBasicLatin },      // 0,
39     { UnicodeScript_kLatin1Supplement,
40       UnicodeScript_kLatin1Supplement,
41       UnicodeScript_kLatin1Supplement },// 1,
42     { UnicodeScript_kLatinExtendedA,
43       UnicodeScript_kLatinExtendedA,
44       UnicodeScript_kLatinExtendedA }, // 2,
45     { UnicodeScript_kLatinExtendedB,
46       UnicodeScript_kLatinExtendedB,
47       UnicodeScript_kLatinExtendedB }, // 3,
48     { UnicodeScript_kIPAExtension,
49       UnicodeScript_kIPAExtension,
50       UnicodeScript_kIPAExtension }, // 4,
51     { UnicodeScript_kSpacingModifier,
52       UnicodeScript_kSpacingModifier,
53       UnicodeScript_kSpacingModifier }, // 5,
54     { UnicodeScript_kCombiningDiacritical,
55       UnicodeScript_kCombiningDiacritical,
56       UnicodeScript_kCombiningDiacritical }, // 6,
57     { UnicodeScript_kGreek,
58       UnicodeScript_kGreek,
59       UnicodeScript_kGreek }, // 7,
60     { UnicodeScript_kCyrillic,
61       UnicodeScript_kCyrillic,
62       UnicodeScript_kCyrillic }, // 8,
63     { UnicodeScript_kArmenian,
64       UnicodeScript_kArmenian,
65       UnicodeScript_kArmenian }, // 9,
66     { UnicodeScript_kHebrew,
67       UnicodeScript_kHebrew,
68       UnicodeScript_kHebrew }, // 10,
69     { UnicodeScript_kArabic,
70       UnicodeScript_kArabic,
71       UnicodeScript_kArabic }, // 11,
72     { UnicodeScript_kSyriac,
73       UnicodeScript_kSyriac,
74       UnicodeScript_kSyriac }, // 12,
75     { UnicodeScript_kThaana,
76       UnicodeScript_kThaana,
77       UnicodeScript_kThaana }, // 13,
78     { UnicodeScript_kDevanagari,
79       UnicodeScript_kDevanagari,
80       UnicodeScript_kDevanagari }, // 14,
81     { UnicodeScript_kBengali,
82       UnicodeScript_kBengali,
83       UnicodeScript_kBengali }, // 15,
84     { UnicodeScript_kGurmukhi,
85       UnicodeScript_kGurmukhi,
86       UnicodeScript_kGurmukhi }, // 16,
87     { UnicodeScript_kGujarati,
88       UnicodeScript_kGujarati,
89       UnicodeScript_kGujarati }, // 17,
90     { UnicodeScript_kOriya,
91       UnicodeScript_kOriya,
92       UnicodeScript_kOriya }, // 18,
93     { UnicodeScript_kTamil,
94       UnicodeScript_kTamil,
95       UnicodeScript_kTamil }, // 19,
96     { UnicodeScript_kTelugu,
97       UnicodeScript_kTelugu,
98       UnicodeScript_kTelugu }, // 20,
99     { UnicodeScript_kKannada,
100       UnicodeScript_kKannada,
101       UnicodeScript_kKannada }, // 21,
102     { UnicodeScript_kMalayalam,
103       UnicodeScript_kMalayalam,
104       UnicodeScript_kMalayalam }, // 22,
105     { UnicodeScript_kSinhala,
106       UnicodeScript_kSinhala,
107       UnicodeScript_kSinhala }, // 23,
108     { UnicodeScript_kThai,
109       UnicodeScript_kThai,
110       UnicodeScript_kThai }, // 24,
111     { UnicodeScript_kLao,
112       UnicodeScript_kLao,
113       UnicodeScript_kLao }, // 25,
114     { UnicodeScript_kTibetan,
115       UnicodeScript_kTibetan,
116       UnicodeScript_kTibetan }, // 26,
117     { UnicodeScript_kMyanmar,
118       UnicodeScript_kMyanmar,
119       UnicodeScript_kMyanmar }, // 27,
120     { UnicodeScript_kGeorgian,
121       UnicodeScript_kGeorgian,
122       UnicodeScript_kGeorgian }, // 28,
123     { UnicodeScript_kHangulJamo,
124       UnicodeScript_kHangulJamo,
125       UnicodeScript_kHangulJamo }, // 29,
126     { UnicodeScript_kEthiopic,
127       UnicodeScript_kEthiopic,
128       UnicodeScript_kEthiopic }, // 30,
129     { UnicodeScript_kCherokee,
130       UnicodeScript_kCherokee,
131       UnicodeScript_kCherokee }, // 31,
132     { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
133       UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
134       UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
135     { UnicodeScript_kOgham,
136       UnicodeScript_kOgham,
137       UnicodeScript_kOgham }, // 33,
138     { UnicodeScript_kRunic,
139       UnicodeScript_kRunic,
140       UnicodeScript_kRunic }, // 34,
141     { UnicodeScript_kKhmer,
142       UnicodeScript_kKhmer,
143       UnicodeScript_kKhmer }, // 35,
144     { UnicodeScript_kMongolian,
145       UnicodeScript_kMongolian,
146       UnicodeScript_kMongolian }, // 36,
147     { UnicodeScript_kLatinExtendedAdditional,
148       UnicodeScript_kLatinExtendedAdditional,
149       UnicodeScript_kLatinExtendedAdditional }, // 37,
150     { UnicodeScript_kGreekExtended,
151       UnicodeScript_kGreekExtended,
152       UnicodeScript_kGreekExtended }, // 38,
153     { UnicodeScript_kGeneralPunctuation,
154       UnicodeScript_kGeneralPunctuation,
155       UnicodeScript_kGeneralPunctuation }, // 39,
156     { UnicodeScript_kSuperSubScript,
157       UnicodeScript_kSuperSubScript,
158       UnicodeScript_kSuperSubScript }, // 40,
159     { UnicodeScript_kCurrencySymbolScript,
160       UnicodeScript_kCurrencySymbolScript,
161       UnicodeScript_kCurrencySymbolScript }, // 41,
162     { UnicodeScript_kSymbolCombiningMark,
163       UnicodeScript_kSymbolCombiningMark,
164       UnicodeScript_kSymbolCombiningMark }, // 42,
165     { UnicodeScript_kLetterlikeSymbol,
166       UnicodeScript_kLetterlikeSymbol,
167       UnicodeScript_kLetterlikeSymbol }, // 43,
168     { UnicodeScript_kNumberForm,
169       UnicodeScript_kNumberForm,
170       UnicodeScript_kNumberForm }, // 44,
171     { UnicodeScript_kArrow,
172       UnicodeScript_kArrow,
173       UnicodeScript_kArrow }, // 45,
174     { UnicodeScript_kMathOperator,
175       UnicodeScript_kMathOperator,
176       UnicodeScript_kMathOperator }, // 46,
177     { UnicodeScript_kMiscTechnical,
178       UnicodeScript_kMiscTechnical,
179       UnicodeScript_kMiscTechnical }, // 47,
180     { UnicodeScript_kControlPicture,
181       UnicodeScript_kControlPicture,
182       UnicodeScript_kControlPicture }, // 48,
183     { UnicodeScript_kOpticalCharacter,
184       UnicodeScript_kOpticalCharacter,
185       UnicodeScript_kOpticalCharacter }, // 49,
186     { UnicodeScript_kEnclosedAlphanumeric,
187       UnicodeScript_kEnclosedAlphanumeric,
188       UnicodeScript_kEnclosedAlphanumeric }, // 50,
189     { UnicodeScript_kBoxDrawing,
190       UnicodeScript_kBoxDrawing,
191       UnicodeScript_kBoxDrawing }, // 51,
192     { UnicodeScript_kBlockElement,
193       UnicodeScript_kBlockElement,
194       UnicodeScript_kBlockElement }, // 52,
195     { UnicodeScript_kGeometricShape,
196       UnicodeScript_kGeometricShape,
197       UnicodeScript_kGeometricShape }, // 53,
198     { UnicodeScript_kMiscSymbol,
199       UnicodeScript_kMiscSymbol,
200       UnicodeScript_kMiscSymbol }, // 54,
201     { UnicodeScript_kDingbat,
202       UnicodeScript_kDingbat,
203       UnicodeScript_kDingbat }, // 55,
204     { UnicodeScript_kBraillePatterns,
205       UnicodeScript_kBraillePatterns,
206       UnicodeScript_kBraillePatterns }, // 56,
207     { UnicodeScript_kCJKRadicalsSupplement,
208       UnicodeScript_kCJKRadicalsSupplement,
209       UnicodeScript_kCJKRadicalsSupplement }, // 57,
210     { UnicodeScript_kKangxiRadicals,
211       UnicodeScript_kKangxiRadicals,
212       UnicodeScript_kKangxiRadicals }, // 58,
213     { UnicodeScript_kIdeographicDescriptionCharacters,
214       UnicodeScript_kIdeographicDescriptionCharacters,
215       UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
216     { UnicodeScript_kCJKSymbolPunctuation,
217       UnicodeScript_kCJKSymbolPunctuation,
218       UnicodeScript_kCJKSymbolPunctuation }, // 60,
219     { UnicodeScript_kHiragana,
220       UnicodeScript_kHiragana,
221       UnicodeScript_kHiragana }, // 61,
222     { UnicodeScript_kKatakana,
223       UnicodeScript_kKatakana,
224       UnicodeScript_kKatakana }, // 62,
225     { UnicodeScript_kBopomofo,
226       UnicodeScript_kBopomofo,
227       UnicodeScript_kBopomofo }, // 63,
228     { UnicodeScript_kHangulCompatibilityJamo,
229       UnicodeScript_kHangulCompatibilityJamo,
230       UnicodeScript_kHangulCompatibilityJamo }, // 64,
231     { UnicodeScript_kKanbun,
232       UnicodeScript_kKanbun,
233       UnicodeScript_kKanbun }, // 65,
234     { UnicodeScript_kBopomofoExtended,
235       UnicodeScript_kBopomofoExtended,
236       UnicodeScript_kBopomofoExtended }, // 66,
237     { UnicodeScript_kEnclosedCJKLetterMonth,
238       UnicodeScript_kEnclosedCJKLetterMonth,
239       UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
240     { UnicodeScript_kCJKCompatibility,
241       UnicodeScript_kCJKCompatibility,
242       UnicodeScript_kCJKCompatibility }, // 68,
243     { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
244       UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
245       UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
246     { UnicodeScript_kCJKUnifiedIdeograph,
247       UnicodeScript_kCJKUnifiedIdeograph,
248       UnicodeScript_kCJKUnifiedIdeograph }, // 70,
249     { UnicodeScript_kYiSyllables,
250       UnicodeScript_kYiSyllables,
251       UnicodeScript_kYiSyllables }, // 71,
252     { UnicodeScript_kYiRadicals,
253       UnicodeScript_kYiRadicals,
254       UnicodeScript_kYiRadicals }, // 72,
255     { UnicodeScript_kHangulSyllable,
256       UnicodeScript_kHangulSyllable,
257       UnicodeScript_kHangulSyllable }, // 73,
258     { UnicodeScript_kHighSurrogate,
259       UnicodeScript_kHighSurrogate,
260       UnicodeScript_kHighSurrogate }, // 74,
261     { UnicodeScript_kHighPrivateUseSurrogate,
262       UnicodeScript_kHighPrivateUseSurrogate,
263       UnicodeScript_kHighPrivateUseSurrogate }, // 75,
264     { UnicodeScript_kLowSurrogate,
265       UnicodeScript_kLowSurrogate,
266       UnicodeScript_kLowSurrogate }, // 76,
267     { UnicodeScript_kPrivateUse,
268       UnicodeScript_kPrivateUse,
269       UnicodeScript_kPrivateUse }, // 77,
270     { UnicodeScript_kCJKCompatibilityIdeograph,
271       UnicodeScript_kCJKCompatibilityIdeograph,
272       UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
273     { UnicodeScript_kAlphabeticPresentation,
274       UnicodeScript_kAlphabeticPresentation,
275       UnicodeScript_kAlphabeticPresentation }, // 79,
276     { UnicodeScript_kArabicPresentationA,
277       UnicodeScript_kArabicPresentationA,
278       UnicodeScript_kArabicPresentationA }, // 80,
279     { UnicodeScript_kCombiningHalfMark,
280       UnicodeScript_kCombiningHalfMark,
281       UnicodeScript_kCombiningHalfMark }, // 81,
282     { UnicodeScript_kCJKCompatibilityForm,
283       UnicodeScript_kCJKCompatibilityForm,
284       UnicodeScript_kCJKCompatibilityForm }, // 82,
285     { UnicodeScript_kSmallFormVariant,
286       UnicodeScript_kSmallFormVariant,
287       UnicodeScript_kSmallFormVariant }, // 83,
288     { UnicodeScript_kArabicPresentationB,
289       UnicodeScript_kArabicPresentationB,
290       UnicodeScript_kArabicPresentationB }, // 84,
291     { UnicodeScript_kNoScript,
292       UnicodeScript_kNoScript,
293       UnicodeScript_kNoScript }, // 85,
294     { UnicodeScript_kHalfwidthFullwidthForm,
295       UnicodeScript_kHalfwidthFullwidthForm,
296       UnicodeScript_kHalfwidthFullwidthForm }, // 86,
297     { UnicodeScript_kScriptCount,
298       UnicodeScript_kScriptCount,
299       UnicodeScript_kNoScript } // 87,
300 };
301 
302 sal_Int16 SAL_CALL
303 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
304 
305     if (!typeList) {
306         typeList = defaultTypeList;
307         unknownType = UnicodeScript_kNoScript;
308     }
309 
310     sal_Int16 i = 0, type = typeList[0].to;
311     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
312         type = typeList[++i].to;
313     }
314 
315     return (type < UnicodeScript_kScriptCount &&
316             ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
317             typeList[i].value : unknownType;
318 }
319 
320 sal_Bool SAL_CALL
321 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
322     return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
323         ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
324 }
325 
326 sal_Unicode SAL_CALL
327 unicode::getUnicodeScriptStart( UnicodeScript type) {
328     return UnicodeScriptType[type][UnicodeScriptTypeFrom];
329 }
330 
331 sal_Unicode SAL_CALL
332 unicode::getUnicodeScriptEnd( UnicodeScript type) {
333     return UnicodeScriptType[type][UnicodeScriptTypeTo];
334 }
335 
336 sal_Int16 SAL_CALL
337 unicode::getUnicodeType( const sal_Unicode ch ) {
338     static sal_Unicode c = 0x00;
339     static sal_Int16 r = 0x00;
340 
341     if (ch == c) return r;
342     else c = ch;
343 
344     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
345     return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
346         UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
347 }
348 
349 sal_uInt8 SAL_CALL
350 unicode::getUnicodeDirection( const sal_Unicode ch ) {
351     static sal_Unicode c = 0x00;
352     static sal_uInt8 r = 0x00;
353 
354     if (ch == c) return r;
355     else c = ch;
356 
357     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
358     return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
359         UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
360 
361 }
362 
// Bit-set helpers over UnicodeType values.
//
// bit(n) yields a word with only bit n set.  The literal is unsigned so that
// bit(0x1f) does not shift into the sign bit of a signed int, and the
// argument is parenthesised so that an expression can be passed safely.
#define bit(name)   (1U << (name))

// Every mask below is a fully parenthesised OR of bit() terms, so it can be
// combined with other operators (&, ==, ...) without precedence surprises.

#define UPPERMASK   (bit(UnicodeType::UPPERCASE_LETTER))

#define LOWERMASK   (bit(UnicodeType::LOWERCASE_LETTER))

#define TITLEMASK   (bit(UnicodeType::TITLECASE_LETTER))

// All three number categories.
#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
            bit(UnicodeType::LETTER_NUMBER)|\
            bit(UnicodeType::OTHER_NUMBER))

// Cased letters plus modifier and other letters.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

// Letters, numbers and the three mark categories.
#define BASEMASK    (DIGITMASK|ALPHAMASK|\
            bit(UnicodeType::NON_SPACING_MARK)|\
            bit(UnicodeType::ENCLOSING_MARK)|\
            bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

// NOTE(review): the UNO UnicodeType constant group also appears to define
// START_PUNCTUATION and END_PUNCTUATION; neither is part of this mask -
// confirm whether that is intentional.
#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
            bit(UnicodeType::INITIAL_PUNCTUATION)|\
            bit(UnicodeType::FINAL_PUNCTUATION)|\
            bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
            bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
            bit(UnicodeType::CURRENCY_SYMBOL)|\
            bit(UnicodeType::MODIFIER_SYMBOL)|\
            bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// LINE_SEPARATOR and PARAGRAPH_SEPARATOR are members of both SPACEMASK and
// CONTROLMASK.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
405 
// IsType(func, mask) defines the predicate `func`: it reports whether the
// bit for getUnicodeType(ch) is contained in `mask`.  `mask` is wrapped in
// parentheses here, so an unparenthesised OR expression may be passed.
#define IsType(func, mask)  \
sal_Bool SAL_CALL func( const sal_Unicode ch) {\
    if ((mask) & bit(getUnicodeType(ch)))\
        return true;\
    return false;\
}
410 
411 IsType(unicode::isUpper, UPPERMASK)
412 IsType(unicode::isLower, LOWERMASK)
413 IsType(unicode::isTitle, DIGITMASK)
414 IsType(unicode::isControl, CONTROLMASK)
415 IsType(unicode::isPrint, PRINTMASK)
416 IsType(unicode::isAlpha, ALPHAMASK)
417 IsType(unicode::isDigit, DIGITMASK)
418 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
419 IsType(unicode::isSpace, SPACEMASK)
420 IsType(unicode::isBase, BASEMASK)
421 IsType(unicode::isPunctuation, PUNCTUATIONMASK)
422 
423 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
424             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
425 
426 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
427     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
428 }
429 
430 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
431 {
432     using namespace ::com::sun::star::i18n::KCharacterType;
433 
434     switch ( getUnicodeType( ch ) ) {
435     // Upper
436     case UnicodeType::UPPERCASE_LETTER :
437         return UPPER|LETTER|PRINTABLE|BASE_FORM;
438 
439     // Lower
440     case UnicodeType::LOWERCASE_LETTER :
441         return LOWER|LETTER|PRINTABLE|BASE_FORM;
442 
443     // Title
444     case UnicodeType::TITLECASE_LETTER :
445         return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
446 
447     // Letter
448     case UnicodeType::MODIFIER_LETTER :
449     case UnicodeType::OTHER_LETTER :
450         return LETTER|PRINTABLE|BASE_FORM;
451 
452     // Digit
453     case UnicodeType::DECIMAL_DIGIT_NUMBER:
454     case UnicodeType::LETTER_NUMBER:
455     case UnicodeType::OTHER_NUMBER:
456         return DIGIT|PRINTABLE|BASE_FORM;
457 
458     // Base
459     case UnicodeType::NON_SPACING_MARK:
460     case UnicodeType::ENCLOSING_MARK:
461     case UnicodeType::COMBINING_SPACING_MARK:
462         return BASE_FORM|PRINTABLE;
463 
464     // Print
465     case UnicodeType::SPACE_SEPARATOR:
466 
467     case UnicodeType::DASH_PUNCTUATION:
468     case UnicodeType::INITIAL_PUNCTUATION:
469     case UnicodeType::FINAL_PUNCTUATION:
470     case UnicodeType::CONNECTOR_PUNCTUATION:
471     case UnicodeType::OTHER_PUNCTUATION:
472 
473     case UnicodeType::MATH_SYMBOL:
474     case UnicodeType::CURRENCY_SYMBOL:
475     case UnicodeType::MODIFIER_SYMBOL:
476     case UnicodeType::OTHER_SYMBOL:
477         return PRINTABLE;
478 
479     // Control
480     case UnicodeType::CONTROL:
481     case UnicodeType::FORMAT:
482         return CONTROL;
483 
484     case UnicodeType::LINE_SEPARATOR:
485     case UnicodeType::PARAGRAPH_SEPARATOR:
486         return CONTROL|PRINTABLE;
487 
488     // for all others
489     default:
490         return 0;
491     }
492 }
493 
494 
495