1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include <rtl/ustrbuf.hxx> 28 #include <i18nutil/casefolding.hxx> 29 #include <i18nutil/unicode.hxx> 30 31 #include <comphelper/processfactory.hxx> 32 #include <osl/diagnose.h> 33 34 #include <string.h> 35 36 #include "characterclassificationImpl.hxx" 37 #include "breakiteratorImpl.hxx" 38 39 #define TRANSLITERATION_ALL 40 #include "transliteration_body.hxx" 41 42 using namespace ::com::sun::star::uno; 43 using namespace ::com::sun::star::lang; 44 using namespace ::rtl; 45 46 #define A2OU(x) OUString::createFromAscii(x) 47 48 namespace com { namespace sun { namespace star { namespace i18n { 49 50 51 Transliteration_body::Transliteration_body() 52 { 53 nMappingType = 0; 54 transliterationName = "Transliteration_body"; 55 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; 56 } 57 58 sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException) 59 { 60 return TransliterationType::ONE_TO_ONE; 61 } 62 63 sal_Bool SAL_CALL Transliteration_body::equals( 64 const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, 65 const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) 66 throw(RuntimeException) 67 { 68 throw RuntimeException(); 69 } 70 71 Sequence< OUString > SAL_CALL 72 Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) 73 throw( RuntimeException) 74 { 75 Sequence< OUString > ostr(2); 76 ostr[0] = str1; 77 ostr[1] = str2; 78 return ostr; 79 } 80 81 82 static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) 83 { 84 sal_uInt8 nRes = nMappingType; 85 86 // take care of TOGGLE_CASE transliteration: 87 // nMappingType should not be a combination of flags, thuse we decide now 88 // which one to use. 89 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 90 { 91 const sal_Int16 nType = unicode::getUnicodeType( cChar ); 92 if (nType & 0x02 /* lower case*/) 93 nRes = MappingTypeLowerToUpper; 94 else 95 { 96 // should also work properly for non-upper characters like white spacs, numbers, ... 97 nRes = MappingTypeUpperToLower; 98 } 99 } 100 101 return nRes; 102 } 103 104 105 OUString SAL_CALL 106 Transliteration_body::transliterate( 107 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 108 Sequence< sal_Int32 >& offset) 109 throw(RuntimeException) 110 { 111 #if 0 112 /* Performance optimization: 113 * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method! 114 * getValue() needs about 15%, so there is equal balance if we trade the second 115 * (shrinking) realloc() for a getValue(). But if the caller initializes the 116 * sequence to nCount elements there isn't any change in size necessary in most 117 * cases (one-to-one mapping) and we gain 33%. 118 * 119 * Of that constellation the getValue() method takes 20% upon each call, so 40% 120 * for both. By remembering the first calls' results we could gain some extra 121 * percentage again, but unfortunately getValue() may return a reference to a 122 * static buffer, so we can't store the pointer directly but would have to 123 * copy-construct an array, which doesn't give us any advantage. 124 * 125 * Much more is accomplished by working directly on the sequence buffer 126 * returned by getArray() instead of using operator[] for each and every 127 * access. 128 * 129 * And while we're at it: now that we know the size in advance we don't need to 130 * copy the buffer anymore, just create the real string buffer and let the 131 * return value take ownership. 132 * 133 * All together these changes result in the new implementation needing only 62% 134 * of the time of the old implementation (in other words: that one was 1.61 135 * times slower ...) 136 */ 137 138 // Allocate the max possible buffer. Try to use stack instead of heap which 139 // would have to be reallocated most times anyway. 140 const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX; 141 sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL; 142 143 const sal_Unicode *in = inStr.getStr() + startPos; 144 145 if (nCount > 512) 146 out = aHeapBuf = (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode)); 147 148 if (useOffset) 149 offset.realloc(nCount * NMAPPINGMAX); 150 sal_Int32 j = 0; 151 for (sal_Int32 i = 0; i < nCount; i++) { 152 Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); 153 for (sal_Int32 k = 0; k < map.nmap; k++) { 154 if (useOffset) 155 offset[j] = i + startPos; 156 out[j++] = map.map[k]; 157 } 158 } 159 if (useOffset) 160 offset.realloc(j); 161 162 OUString r(out, j); 163 164 if (aHeapBuf) 165 free(aHeapBuf); 166 167 return r; 168 #else 169 const sal_Unicode *in = inStr.getStr() + startPos; 170 171 // Two different blocks to eliminate the if(useOffset) condition inside the 172 // inner k loop. Yes, on massive use even such small things do count. 173 if ( useOffset ) 174 { 175 sal_Int32 nOffCount = 0, i; 176 for (i = 0; i < nCount; i++) 177 { 178 // take care of TOGGLE_CASE transliteration: 179 sal_uInt8 nTmpMappingType = nMappingType; 180 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 181 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 182 183 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 184 nOffCount += map.nmap; 185 } 186 rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h 187 sal_Unicode* out = pStr->buffer; 188 189 if ( nOffCount != offset.getLength() ) 190 offset.realloc( nOffCount ); 191 192 sal_Int32 j = 0; 193 sal_Int32 * pArr = offset.getArray(); 194 for (i = 0; i < nCount; i++) 195 { 196 // take care of TOGGLE_CASE transliteration: 197 sal_uInt8 nTmpMappingType = nMappingType; 198 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 199 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 200 201 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 202 for (sal_Int32 k = 0; k < map.nmap; k++) 203 { 204 pArr[j] = i + startPos; 205 out[j++] = map.map[k]; 206 } 207 } 208 out[j] = 0; 209 210 return OUString( pStr, SAL_NO_ACQUIRE ); 211 } 212 else 213 { 214 // In the simple case of no offset sequence used we can eliminate the 215 // first getValue() loop. We could also assume that most calls result 216 // in identical string lengths, thus using a preallocated 217 // OUStringBuffer could be an easy way to assemble the return string 218 // without too much hassle. However, for single characters the 219 // OUStringBuffer::append() method is quite expensive compared to a 220 // simple array operation, so it pays here to copy the final result 221 // instead. 222 223 // Allocate the max possible buffer. Try to use stack instead of heap, 224 // which would have to be reallocated most times anyways. 225 const sal_Int32 nLocalBuf = 2048; 226 sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL; 227 if ( nCount > nLocalBuf ) 228 out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ]; 229 230 sal_Int32 j = 0; 231 for ( sal_Int32 i = 0; i < nCount; i++) 232 { 233 // take care of TOGGLE_CASE transliteration: 234 sal_uInt8 nTmpMappingType = nMappingType; 235 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 236 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 237 238 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 239 for (sal_Int32 k = 0; k < map.nmap; k++) 240 { 241 out[j++] = map.map[k]; 242 } 243 } 244 245 OUString aRet( out, j ); 246 if ( pHeapBuf ) 247 delete [] pHeapBuf; 248 return aRet; 249 } 250 #endif 251 } 252 253 OUString SAL_CALL 254 Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException) 255 { 256 const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); 257 rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap, 1 ); // our x_rtl_ustring.h 258 sal_Unicode* out = pStr->buffer; 259 sal_Int32 i; 260 261 for (i = 0; i < map.nmap; i++) 262 out[i] = map.map[i]; 263 out[i] = 0; 264 265 return OUString( pStr, SAL_NO_ACQUIRE ); 266 } 267 268 sal_Unicode SAL_CALL 269 Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException) 270 { 271 const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); 272 if (map.nmap > 1) 273 throw MultipleCharsOutputException(); 274 return map.map[0]; 275 } 276 277 OUString SAL_CALL 278 Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 279 Sequence< sal_Int32 >& offset) throw(RuntimeException) 280 { 281 return this->transliterate(inStr, startPos, nCount, offset); 282 } 283 284 Transliteration_casemapping::Transliteration_casemapping() 285 { 286 nMappingType = 0; 287 transliterationName = "casemapping(generic)"; 288 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; 289 } 290 291 void SAL_CALL 292 Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale ) 293 { 294 nMappingType = rMappingType; 295 aLocale = rLocale; 296 } 297 298 Transliteration_u2l::Transliteration_u2l() 299 { 300 nMappingType = MappingTypeUpperToLower; 301 transliterationName = "upper_to_lower(generic)"; 302 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; 303 } 304 305 Transliteration_l2u::Transliteration_l2u() 306 { 307 nMappingType = MappingTypeLowerToUpper; 308 transliterationName = "lower_to_upper(generic)"; 309 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; 310 } 311 312 Transliteration_togglecase::Transliteration_togglecase() 313 { 314 // usually nMappingType must NOT be a combiantion of different flages here, 315 // but we take care of that problem in Transliteration_body::transliterate above 316 // before that value is used. There we will decide which of both is to be used on 317 // a per character basis. 318 nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; 319 transliterationName = "toggle(generic)"; 320 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; 321 } 322 323 Transliteration_titlecase::Transliteration_titlecase() 324 { 325 nMappingType = MappingTypeToTitle; 326 transliterationName = "title(generic)"; 327 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; 328 } 329 330 #if 0 331 struct LigatureData 332 { 333 sal_uInt32 cChar; 334 sal_Char * pUtf8Text; 335 }; 336 337 // available Unicode ligatures: 338 // http://www.unicode.org/charts 339 // http://www.unicode.org/charts/PDF/UFB00.pdf 340 static LigatureData aLigatures[] = 341 { 342 { 0x0FB00, "ff" }, 343 { 0x0FB01, "fi" }, 344 { 0x0FB02, "fl" }, 345 { 0x0FB03, "ffi" }, 346 { 0x0FB04, "ffl" }, 347 { 0x0FB05, "ft" }, 348 { 0x0FB06, "st" }, 349 350 { 0x0FB13, "\xD5\xB4\xD5\xB6" }, // Armenian small men now 351 { 0x0FB14, "\xD5\xB4\xD5\xA5" }, // Armenian small men ech 352 { 0x0FB15, "\xD5\xB4\xD5\xAB" }, // Armenian small men ini 353 { 0x0FB16, "\xD5\xBE\xD5\xB6" }, // Armenian small vew now 354 { 0x0FB17, "\xD5\xB4\xD5\xAD" }, // Armenian small men xeh 355 { 0x00000, "" } 356 }; 357 358 static inline bool lcl_IsLigature( sal_uInt32 cChar ) 359 { 360 return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17); 361 } 362 363 static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar ) 364 { 365 rtl::OUString aRes; 366 if (lcl_IsLigature( cChar )) 367 { 368 LigatureData *pFound = NULL; 369 LigatureData *pData = aLigatures; 370 while (!pFound && pData->cChar != 0) 371 { 372 if (pData->cChar == cChar) 373 pFound = pData; 374 ++pData; 375 } 376 if (pFound) 377 aRes = rtl::OUString( pFound->pUtf8Text, strlen( pFound->pUtf8Text ), RTL_TEXTENCODING_UTF8 ); 378 } 379 else 380 aRes = rtl::OUString( &cChar, 1 ); 381 return aRes; 382 } 383 #endif // if 0 384 385 static rtl::OUString transliterate_titlecase_Impl( 386 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 387 const Locale &rLocale, 388 Sequence< sal_Int32 >& offset ) 389 throw(RuntimeException) 390 { 391 const OUString aText( inStr.copy( startPos, nCount ) ); 392 393 OUString aRes; 394 if (aText.getLength() > 0) 395 { 396 Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); 397 CharacterClassificationImpl aCharClassImpl( xMSF ); 398 399 // because aCharClassImpl.toTitle does not handle ligatures or � but will raise 400 // an exception we need to handle the first chara manually... 401 402 // we don't want to change surrogates by accident, thuse we use proper code point iteration 403 sal_Int32 nPos = 0; 404 sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos ); 405 OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) ); 406 // toUpper can be used to properly resolve ligatures and characters like � 407 aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); 408 // since toTitle will leave all-uppercase text unchanged we first need to 409 // use toLower to bring possible 2nd and following charas in lowercase 410 aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); 411 sal_Int32 nResolvedLen = aResolvedLigature.getLength(); 412 413 // now we can properly use toTitle to get the expected result for the resolved string. 414 // The rest of the text should just become lowercase. 415 aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ); 416 aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale ); 417 offset.realloc( aRes.getLength() ); 418 419 sal_Int32 *pOffset = offset.getArray(); 420 sal_Int32 nLen = offset.getLength(); 421 for (sal_Int32 i = 0; i < nLen; ++i) 422 { 423 sal_Int32 nIdx = 0; 424 if (i >= nResolvedLen) 425 nIdx = i - nResolvedLen + 1; 426 pOffset[i] = nIdx; 427 } 428 } 429 #if OSL_DEBUG_LEVEL > 1 430 const sal_Int32 *pCOffset = offset.getConstArray(); 431 (void) pCOffset; 432 #endif 433 434 return aRes; 435 } 436 437 438 // this function expects to be called on a word-by-word basis, 439 // namely that startPos points to the first char of the word 440 rtl::OUString SAL_CALL Transliteration_titlecase::transliterate( 441 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 442 Sequence< sal_Int32 >& offset ) 443 throw(RuntimeException) 444 { 445 return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); 446 } 447 448 449 Transliteration_sentencecase::Transliteration_sentencecase() 450 { 451 nMappingType = MappingTypeToTitle; // though only to be applied to the first word... 452 transliterationName = "sentence(generic)"; 453 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; 454 } 455 456 457 // this function expects to be called on a sentence-by-sentence basis, 458 // namely that startPos points to the first word (NOT first char!) in the sentence 459 rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( 460 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 461 Sequence< sal_Int32 >& offset ) 462 throw(RuntimeException) 463 { 464 return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); 465 } 466 467 468 } } } } 469 470