1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include <com/sun/star/i18n/UnicodeType.hpp> 29 #include <com/sun/star/i18n/KCharacterType.hpp> 30 #include <i18nutil/unicode.hxx> 31 #include "unicode_data.h" 32 33 using namespace ::com::sun::star::i18n; 34 35 static ScriptTypeList defaultTypeList[] = { 36 { UnicodeScript_kBasicLatin, 37 UnicodeScript_kBasicLatin, 38 UnicodeScript_kBasicLatin }, // 0, 39 { UnicodeScript_kLatin1Supplement, 40 UnicodeScript_kLatin1Supplement, 41 UnicodeScript_kLatin1Supplement },// 1, 42 { UnicodeScript_kLatinExtendedA, 43 UnicodeScript_kLatinExtendedA, 44 UnicodeScript_kLatinExtendedA }, // 2, 45 { UnicodeScript_kLatinExtendedB, 46 UnicodeScript_kLatinExtendedB, 47 UnicodeScript_kLatinExtendedB }, // 3, 48 { UnicodeScript_kIPAExtension, 49 UnicodeScript_kIPAExtension, 50 UnicodeScript_kIPAExtension }, // 4, 51 { UnicodeScript_kSpacingModifier, 52 UnicodeScript_kSpacingModifier, 53 UnicodeScript_kSpacingModifier }, // 5, 54 { UnicodeScript_kCombiningDiacritical, 55 UnicodeScript_kCombiningDiacritical, 56 UnicodeScript_kCombiningDiacritical }, // 6, 57 { UnicodeScript_kGreek, 58 UnicodeScript_kGreek, 59 UnicodeScript_kGreek }, // 7, 60 { UnicodeScript_kCyrillic, 61 UnicodeScript_kCyrillic, 62 UnicodeScript_kCyrillic }, // 8, 63 { UnicodeScript_kArmenian, 64 UnicodeScript_kArmenian, 65 UnicodeScript_kArmenian }, // 9, 66 { UnicodeScript_kHebrew, 67 UnicodeScript_kHebrew, 68 UnicodeScript_kHebrew }, // 10, 69 { UnicodeScript_kArabic, 70 UnicodeScript_kArabic, 71 UnicodeScript_kArabic }, // 11, 72 { UnicodeScript_kSyriac, 73 UnicodeScript_kSyriac, 74 UnicodeScript_kSyriac }, // 12, 75 { UnicodeScript_kThaana, 76 UnicodeScript_kThaana, 77 UnicodeScript_kThaana }, // 13, 78 { UnicodeScript_kDevanagari, 79 UnicodeScript_kDevanagari, 80 UnicodeScript_kDevanagari }, // 14, 81 { UnicodeScript_kBengali, 82 UnicodeScript_kBengali, 83 UnicodeScript_kBengali }, // 15, 84 { UnicodeScript_kGurmukhi, 85 UnicodeScript_kGurmukhi, 86 UnicodeScript_kGurmukhi }, // 16, 87 { UnicodeScript_kGujarati, 88 UnicodeScript_kGujarati, 89 UnicodeScript_kGujarati }, // 17, 90 { UnicodeScript_kOriya, 91 UnicodeScript_kOriya, 92 UnicodeScript_kOriya }, // 18, 93 { UnicodeScript_kTamil, 94 UnicodeScript_kTamil, 95 UnicodeScript_kTamil }, // 19, 96 { UnicodeScript_kTelugu, 97 UnicodeScript_kTelugu, 98 UnicodeScript_kTelugu }, // 20, 99 { UnicodeScript_kKannada, 100 UnicodeScript_kKannada, 101 UnicodeScript_kKannada }, // 21, 102 { UnicodeScript_kMalayalam, 103 UnicodeScript_kMalayalam, 104 UnicodeScript_kMalayalam }, // 22, 105 { UnicodeScript_kSinhala, 106 UnicodeScript_kSinhala, 107 UnicodeScript_kSinhala }, // 23, 108 { UnicodeScript_kThai, 109 UnicodeScript_kThai, 110 UnicodeScript_kThai }, // 24, 111 { UnicodeScript_kLao, 112 UnicodeScript_kLao, 113 UnicodeScript_kLao }, // 25, 114 { UnicodeScript_kTibetan, 115 UnicodeScript_kTibetan, 116 UnicodeScript_kTibetan }, // 26, 117 { UnicodeScript_kMyanmar, 118 UnicodeScript_kMyanmar, 119 UnicodeScript_kMyanmar }, // 27, 120 { UnicodeScript_kGeorgian, 121 UnicodeScript_kGeorgian, 122 UnicodeScript_kGeorgian }, // 28, 123 { UnicodeScript_kHangulJamo, 124 UnicodeScript_kHangulJamo, 125 UnicodeScript_kHangulJamo }, // 29, 126 { UnicodeScript_kEthiopic, 127 UnicodeScript_kEthiopic, 128 UnicodeScript_kEthiopic }, // 30, 129 { UnicodeScript_kCherokee, 130 UnicodeScript_kCherokee, 131 UnicodeScript_kCherokee }, // 31, 132 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics, 133 UnicodeScript_kUnifiedCanadianAboriginalSyllabics, 134 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32, 135 { UnicodeScript_kOgham, 136 UnicodeScript_kOgham, 137 UnicodeScript_kOgham }, // 33, 138 { UnicodeScript_kRunic, 139 UnicodeScript_kRunic, 140 UnicodeScript_kRunic }, // 34, 141 { UnicodeScript_kKhmer, 142 UnicodeScript_kKhmer, 143 UnicodeScript_kKhmer }, // 35, 144 { UnicodeScript_kMongolian, 145 UnicodeScript_kMongolian, 146 UnicodeScript_kMongolian }, // 36, 147 { UnicodeScript_kLatinExtendedAdditional, 148 UnicodeScript_kLatinExtendedAdditional, 149 UnicodeScript_kLatinExtendedAdditional }, // 37, 150 { UnicodeScript_kGreekExtended, 151 UnicodeScript_kGreekExtended, 152 UnicodeScript_kGreekExtended }, // 38, 153 { UnicodeScript_kGeneralPunctuation, 154 UnicodeScript_kGeneralPunctuation, 155 UnicodeScript_kGeneralPunctuation }, // 39, 156 { UnicodeScript_kSuperSubScript, 157 UnicodeScript_kSuperSubScript, 158 UnicodeScript_kSuperSubScript }, // 40, 159 { UnicodeScript_kCurrencySymbolScript, 160 UnicodeScript_kCurrencySymbolScript, 161 UnicodeScript_kCurrencySymbolScript }, // 41, 162 { UnicodeScript_kSymbolCombiningMark, 163 UnicodeScript_kSymbolCombiningMark, 164 UnicodeScript_kSymbolCombiningMark }, // 42, 165 { UnicodeScript_kLetterlikeSymbol, 166 UnicodeScript_kLetterlikeSymbol, 167 UnicodeScript_kLetterlikeSymbol }, // 43, 168 { UnicodeScript_kNumberForm, 169 UnicodeScript_kNumberForm, 170 UnicodeScript_kNumberForm }, // 44, 171 { UnicodeScript_kArrow, 172 UnicodeScript_kArrow, 173 UnicodeScript_kArrow }, // 45, 174 { UnicodeScript_kMathOperator, 175 UnicodeScript_kMathOperator, 176 UnicodeScript_kMathOperator }, // 46, 177 { UnicodeScript_kMiscTechnical, 178 UnicodeScript_kMiscTechnical, 179 UnicodeScript_kMiscTechnical }, // 47, 180 { UnicodeScript_kControlPicture, 181 UnicodeScript_kControlPicture, 182 UnicodeScript_kControlPicture }, // 48, 183 { UnicodeScript_kOpticalCharacter, 184 UnicodeScript_kOpticalCharacter, 185 UnicodeScript_kOpticalCharacter }, // 49, 186 { UnicodeScript_kEnclosedAlphanumeric, 187 UnicodeScript_kEnclosedAlphanumeric, 188 UnicodeScript_kEnclosedAlphanumeric }, // 50, 189 { UnicodeScript_kBoxDrawing, 190 UnicodeScript_kBoxDrawing, 191 UnicodeScript_kBoxDrawing }, // 51, 192 { UnicodeScript_kBlockElement, 193 UnicodeScript_kBlockElement, 194 UnicodeScript_kBlockElement }, // 52, 195 { UnicodeScript_kGeometricShape, 196 UnicodeScript_kGeometricShape, 197 UnicodeScript_kGeometricShape }, // 53, 198 { UnicodeScript_kMiscSymbol, 199 UnicodeScript_kMiscSymbol, 200 UnicodeScript_kMiscSymbol }, // 54, 201 { UnicodeScript_kDingbat, 202 UnicodeScript_kDingbat, 203 UnicodeScript_kDingbat }, // 55, 204 { UnicodeScript_kBraillePatterns, 205 UnicodeScript_kBraillePatterns, 206 UnicodeScript_kBraillePatterns }, // 56, 207 { UnicodeScript_kCJKRadicalsSupplement, 208 UnicodeScript_kCJKRadicalsSupplement, 209 UnicodeScript_kCJKRadicalsSupplement }, // 57, 210 { UnicodeScript_kKangxiRadicals, 211 UnicodeScript_kKangxiRadicals, 212 UnicodeScript_kKangxiRadicals }, // 58, 213 { UnicodeScript_kIdeographicDescriptionCharacters, 214 UnicodeScript_kIdeographicDescriptionCharacters, 215 UnicodeScript_kIdeographicDescriptionCharacters }, // 59, 216 { UnicodeScript_kCJKSymbolPunctuation, 217 UnicodeScript_kCJKSymbolPunctuation, 218 UnicodeScript_kCJKSymbolPunctuation }, // 60, 219 { UnicodeScript_kHiragana, 220 UnicodeScript_kHiragana, 221 UnicodeScript_kHiragana }, // 61, 222 { UnicodeScript_kKatakana, 223 UnicodeScript_kKatakana, 224 UnicodeScript_kKatakana }, // 62, 225 { UnicodeScript_kBopomofo, 226 UnicodeScript_kBopomofo, 227 UnicodeScript_kBopomofo }, // 63, 228 { UnicodeScript_kHangulCompatibilityJamo, 229 UnicodeScript_kHangulCompatibilityJamo, 230 UnicodeScript_kHangulCompatibilityJamo }, // 64, 231 { UnicodeScript_kKanbun, 232 UnicodeScript_kKanbun, 233 UnicodeScript_kKanbun }, // 65, 234 { UnicodeScript_kBopomofoExtended, 235 UnicodeScript_kBopomofoExtended, 236 UnicodeScript_kBopomofoExtended }, // 66, 237 { UnicodeScript_kEnclosedCJKLetterMonth, 238 UnicodeScript_kEnclosedCJKLetterMonth, 239 UnicodeScript_kEnclosedCJKLetterMonth }, // 67, 240 { UnicodeScript_kCJKCompatibility, 241 UnicodeScript_kCJKCompatibility, 242 UnicodeScript_kCJKCompatibility }, // 68, 243 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA, 244 UnicodeScript_k_CJKUnifiedIdeographsExtensionA, 245 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69, 246 { UnicodeScript_kCJKUnifiedIdeograph, 247 UnicodeScript_kCJKUnifiedIdeograph, 248 UnicodeScript_kCJKUnifiedIdeograph }, // 70, 249 { UnicodeScript_kYiSyllables, 250 UnicodeScript_kYiSyllables, 251 UnicodeScript_kYiSyllables }, // 71, 252 { UnicodeScript_kYiRadicals, 253 UnicodeScript_kYiRadicals, 254 UnicodeScript_kYiRadicals }, // 72, 255 { UnicodeScript_kHangulSyllable, 256 UnicodeScript_kHangulSyllable, 257 UnicodeScript_kHangulSyllable }, // 73, 258 { UnicodeScript_kHighSurrogate, 259 UnicodeScript_kHighSurrogate, 260 UnicodeScript_kHighSurrogate }, // 74, 261 { UnicodeScript_kHighPrivateUseSurrogate, 262 UnicodeScript_kHighPrivateUseSurrogate, 263 UnicodeScript_kHighPrivateUseSurrogate }, // 75, 264 { UnicodeScript_kLowSurrogate, 265 UnicodeScript_kLowSurrogate, 266 UnicodeScript_kLowSurrogate }, // 76, 267 { UnicodeScript_kPrivateUse, 268 UnicodeScript_kPrivateUse, 269 UnicodeScript_kPrivateUse }, // 77, 270 { UnicodeScript_kCJKCompatibilityIdeograph, 271 UnicodeScript_kCJKCompatibilityIdeograph, 272 UnicodeScript_kCJKCompatibilityIdeograph }, // 78, 273 { UnicodeScript_kAlphabeticPresentation, 274 UnicodeScript_kAlphabeticPresentation, 275 UnicodeScript_kAlphabeticPresentation }, // 79, 276 { UnicodeScript_kArabicPresentationA, 277 UnicodeScript_kArabicPresentationA, 278 UnicodeScript_kArabicPresentationA }, // 80, 279 { UnicodeScript_kCombiningHalfMark, 280 UnicodeScript_kCombiningHalfMark, 281 UnicodeScript_kCombiningHalfMark }, // 81, 282 { UnicodeScript_kCJKCompatibilityForm, 283 UnicodeScript_kCJKCompatibilityForm, 284 UnicodeScript_kCJKCompatibilityForm }, // 82, 285 { UnicodeScript_kSmallFormVariant, 286 UnicodeScript_kSmallFormVariant, 287 UnicodeScript_kSmallFormVariant }, // 83, 288 { UnicodeScript_kArabicPresentationB, 289 UnicodeScript_kArabicPresentationB, 290 UnicodeScript_kArabicPresentationB }, // 84, 291 { UnicodeScript_kNoScript, 292 UnicodeScript_kNoScript, 293 UnicodeScript_kNoScript }, // 85, 294 { UnicodeScript_kHalfwidthFullwidthForm, 295 UnicodeScript_kHalfwidthFullwidthForm, 296 UnicodeScript_kHalfwidthFullwidthForm }, // 86, 297 { UnicodeScript_kScriptCount, 298 UnicodeScript_kScriptCount, 299 UnicodeScript_kNoScript } // 87, 300 }; 301 302 sal_Int16 SAL_CALL 303 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) { 304 305 if (!typeList) { 306 typeList = defaultTypeList; 307 unknownType = UnicodeScript_kNoScript; 308 } 309 310 sal_Int16 i = 0, type = typeList[0].to; 311 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) { 312 type = typeList[++i].to; 313 } 314 315 return (type < UnicodeScript_kScriptCount && 316 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ? 317 typeList[i].value : unknownType; 318 } 319 320 sal_Bool SAL_CALL 321 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) { 322 return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] && 323 ch <= UnicodeScriptType[type][UnicodeScriptTypeTo]; 324 } 325 326 sal_Unicode SAL_CALL 327 unicode::getUnicodeScriptStart( UnicodeScript type) { 328 return UnicodeScriptType[type][UnicodeScriptTypeFrom]; 329 } 330 331 sal_Unicode SAL_CALL 332 unicode::getUnicodeScriptEnd( UnicodeScript type) { 333 return UnicodeScriptType[type][UnicodeScriptTypeTo]; 334 } 335 336 sal_Int16 SAL_CALL 337 unicode::getUnicodeType( const sal_Unicode ch ) { 338 static sal_Unicode c = 0x00; 339 static sal_Int16 r = 0x00; 340 341 if (ch == c) return r; 342 else c = ch; 343 344 sal_Int16 address = UnicodeTypeIndex[ch >> 8]; 345 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] : 346 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]); 347 } 348 349 sal_uInt8 SAL_CALL 350 unicode::getUnicodeDirection( const sal_Unicode ch ) { 351 static sal_Unicode c = 0x00; 352 static sal_uInt8 r = 0x00; 353 354 if (ch == c) return r; 355 else c = ch; 356 357 sal_Int16 address = UnicodeDirectionIndex[ch >> 8]; 358 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] : 359 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]); 360 361 } 362 363 #define bit(name) (1 << name) 364 365 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER) 366 367 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER) 368 369 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER) 370 371 #define DIGITMASK bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\ 372 bit(UnicodeType::LETTER_NUMBER)|\ 373 bit(UnicodeType::OTHER_NUMBER) 374 375 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\ 376 bit(UnicodeType::MODIFIER_LETTER)|\ 377 bit(UnicodeType::OTHER_LETTER) 378 379 #define BASEMASK DIGITMASK|ALPHAMASK|\ 380 bit(UnicodeType::NON_SPACING_MARK)|\ 381 bit(UnicodeType::ENCLOSING_MARK)|\ 382 bit(UnicodeType::COMBINING_SPACING_MARK) 383 384 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\ 385 bit(UnicodeType::LINE_SEPARATOR)|\ 386 bit(UnicodeType::PARAGRAPH_SEPARATOR) 387 388 #define PUNCTUATIONMASK bit(UnicodeType::DASH_PUNCTUATION)|\ 389 bit(UnicodeType::INITIAL_PUNCTUATION)|\ 390 bit(UnicodeType::FINAL_PUNCTUATION)|\ 391 bit(UnicodeType::CONNECTOR_PUNCTUATION)|\ 392 bit(UnicodeType::OTHER_PUNCTUATION) 393 394 #define SYMBOLMASK bit(UnicodeType::MATH_SYMBOL)|\ 395 bit(UnicodeType::CURRENCY_SYMBOL)|\ 396 bit(UnicodeType::MODIFIER_SYMBOL)|\ 397 bit(UnicodeType::OTHER_SYMBOL) 398 399 #define PRINTMASK BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK 400 401 #define CONTROLMASK bit(UnicodeType::CONTROL)|\ 402 bit(UnicodeType::FORMAT)|\ 403 bit(UnicodeType::LINE_SEPARATOR)|\ 404 bit(UnicodeType::PARAGRAPH_SEPARATOR) 405 406 #define IsType(func, mask) \ 407 sal_Bool SAL_CALL func( const sal_Unicode ch) {\ 408 return (bit(getUnicodeType(ch)) & (mask)) != 0;\ 409 } 410 411 IsType(unicode::isUpper, UPPERMASK) 412 IsType(unicode::isLower, LOWERMASK) 413 IsType(unicode::isTitle, DIGITMASK) 414 IsType(unicode::isControl, CONTROLMASK) 415 IsType(unicode::isPrint, PRINTMASK) 416 IsType(unicode::isAlpha, ALPHAMASK) 417 IsType(unicode::isDigit, DIGITMASK) 418 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK) 419 IsType(unicode::isSpace, SPACEMASK) 420 IsType(unicode::isBase, BASEMASK) 421 IsType(unicode::isPunctuation, PUNCTUATIONMASK) 422 423 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\ 424 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f) 425 426 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) { 427 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE))); 428 } 429 430 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch ) 431 { 432 using namespace ::com::sun::star::i18n::KCharacterType; 433 434 switch ( getUnicodeType( ch ) ) { 435 // Upper 436 case UnicodeType::UPPERCASE_LETTER : 437 return UPPER|LETTER|PRINTABLE|BASE_FORM; 438 439 // Lower 440 case UnicodeType::LOWERCASE_LETTER : 441 return LOWER|LETTER|PRINTABLE|BASE_FORM; 442 443 // Title 444 case UnicodeType::TITLECASE_LETTER : 445 return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM; 446 447 // Letter 448 case UnicodeType::MODIFIER_LETTER : 449 case UnicodeType::OTHER_LETTER : 450 return LETTER|PRINTABLE|BASE_FORM; 451 452 // Digit 453 case UnicodeType::DECIMAL_DIGIT_NUMBER: 454 case UnicodeType::LETTER_NUMBER: 455 case UnicodeType::OTHER_NUMBER: 456 return DIGIT|PRINTABLE|BASE_FORM; 457 458 // Base 459 case UnicodeType::NON_SPACING_MARK: 460 case UnicodeType::ENCLOSING_MARK: 461 case UnicodeType::COMBINING_SPACING_MARK: 462 return BASE_FORM|PRINTABLE; 463 464 // Print 465 case UnicodeType::SPACE_SEPARATOR: 466 467 case UnicodeType::DASH_PUNCTUATION: 468 case UnicodeType::INITIAL_PUNCTUATION: 469 case UnicodeType::FINAL_PUNCTUATION: 470 case UnicodeType::CONNECTOR_PUNCTUATION: 471 case UnicodeType::OTHER_PUNCTUATION: 472 473 case UnicodeType::MATH_SYMBOL: 474 case UnicodeType::CURRENCY_SYMBOL: 475 case UnicodeType::MODIFIER_SYMBOL: 476 case UnicodeType::OTHER_SYMBOL: 477 return PRINTABLE; 478 479 // Control 480 case UnicodeType::CONTROL: 481 case UnicodeType::FORMAT: 482 return CONTROL; 483 484 case UnicodeType::LINE_SEPARATOR: 485 case UnicodeType::PARAGRAPH_SEPARATOR: 486 return CONTROL|PRINTABLE; 487 488 // for all others 489 default: 490 return 0; 491 } 492 } 493 494 495