1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_i18npool.hxx" 30 31 #include <rtl/ustrbuf.hxx> 32 #include <i18nutil/casefolding.hxx> 33 #include <i18nutil/unicode.hxx> 34 35 #include <comphelper/processfactory.hxx> 36 #include <osl/diagnose.h> 37 38 #include <string.h> 39 40 #include "characterclassificationImpl.hxx" 41 #include "breakiteratorImpl.hxx" 42 43 #define TRANSLITERATION_ALL 44 #include "transliteration_body.hxx" 45 46 using namespace ::com::sun::star::uno; 47 using namespace ::com::sun::star::lang; 48 using namespace ::rtl; 49 50 #define A2OU(x) OUString::createFromAscii(x) 51 52 namespace com { namespace sun { namespace star { namespace i18n { 53 54 55 Transliteration_body::Transliteration_body() 56 { 57 nMappingType = 0; 58 transliterationName = "Transliteration_body"; 59 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; 60 } 61 62 sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException) 63 { 64 return TransliterationType::ONE_TO_ONE; 65 } 66 67 sal_Bool SAL_CALL Transliteration_body::equals( 68 const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, 69 const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) 70 throw(RuntimeException) 71 { 72 throw RuntimeException(); 73 } 74 75 Sequence< OUString > SAL_CALL 76 Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) 77 throw( RuntimeException) 78 { 79 Sequence< OUString > ostr(2); 80 ostr[0] = str1; 81 ostr[1] = str2; 82 return ostr; 83 } 84 85 86 static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) 87 { 88 sal_uInt8 nRes = nMappingType; 89 90 // take care of TOGGLE_CASE transliteration: 91 // nMappingType should not be a combination of flags, thuse we decide now 92 // which one to use. 93 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 94 { 95 const sal_Int16 nType = unicode::getUnicodeType( cChar ); 96 if (nType & 0x02 /* lower case*/) 97 nRes = MappingTypeLowerToUpper; 98 else 99 { 100 // should also work properly for non-upper characters like white spacs, numbers, ... 101 nRes = MappingTypeUpperToLower; 102 } 103 } 104 105 return nRes; 106 } 107 108 109 OUString SAL_CALL 110 Transliteration_body::transliterate( 111 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 112 Sequence< sal_Int32 >& offset) 113 throw(RuntimeException) 114 { 115 #if 0 116 /* Performance optimization: 117 * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method! 118 * getValue() needs about 15%, so there is equal balance if we trade the second 119 * (shrinking) realloc() for a getValue(). But if the caller initializes the 120 * sequence to nCount elements there isn't any change in size necessary in most 121 * cases (one-to-one mapping) and we gain 33%. 122 * 123 * Of that constellation the getValue() method takes 20% upon each call, so 40% 124 * for both. By remembering the first calls' results we could gain some extra 125 * percentage again, but unfortunately getValue() may return a reference to a 126 * static buffer, so we can't store the pointer directly but would have to 127 * copy-construct an array, which doesn't give us any advantage. 128 * 129 * Much more is accomplished by working directly on the sequence buffer 130 * returned by getArray() instead of using operator[] for each and every 131 * access. 132 * 133 * And while we're at it: now that we know the size in advance we don't need to 134 * copy the buffer anymore, just create the real string buffer and let the 135 * return value take ownership. 136 * 137 * All together these changes result in the new implementation needing only 62% 138 * of the time of the old implementation (in other words: that one was 1.61 139 * times slower ...) 140 */ 141 142 // Allocate the max possible buffer. Try to use stack instead of heap which 143 // would have to be reallocated most times anyway. 144 const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX; 145 sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL; 146 147 const sal_Unicode *in = inStr.getStr() + startPos; 148 149 if (nCount > 512) 150 out = aHeapBuf = (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode)); 151 152 if (useOffset) 153 offset.realloc(nCount * NMAPPINGMAX); 154 sal_Int32 j = 0; 155 for (sal_Int32 i = 0; i < nCount; i++) { 156 Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); 157 for (sal_Int32 k = 0; k < map.nmap; k++) { 158 if (useOffset) 159 offset[j] = i + startPos; 160 out[j++] = map.map[k]; 161 } 162 } 163 if (useOffset) 164 offset.realloc(j); 165 166 OUString r(out, j); 167 168 if (aHeapBuf) 169 free(aHeapBuf); 170 171 return r; 172 #else 173 const sal_Unicode *in = inStr.getStr() + startPos; 174 175 // Two different blocks to eliminate the if(useOffset) condition inside the 176 // inner k loop. Yes, on massive use even such small things do count. 177 if ( useOffset ) 178 { 179 sal_Int32 nOffCount = 0, i; 180 for (i = 0; i < nCount; i++) 181 { 182 // take care of TOGGLE_CASE transliteration: 183 sal_uInt8 nTmpMappingType = nMappingType; 184 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 185 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 186 187 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 188 nOffCount += map.nmap; 189 } 190 rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h 191 sal_Unicode* out = pStr->buffer; 192 193 if ( nOffCount != offset.getLength() ) 194 offset.realloc( nOffCount ); 195 196 sal_Int32 j = 0; 197 sal_Int32 * pArr = offset.getArray(); 198 for (i = 0; i < nCount; i++) 199 { 200 // take care of TOGGLE_CASE transliteration: 201 sal_uInt8 nTmpMappingType = nMappingType; 202 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 203 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 204 205 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 206 for (sal_Int32 k = 0; k < map.nmap; k++) 207 { 208 pArr[j] = i + startPos; 209 out[j++] = map.map[k]; 210 } 211 } 212 out[j] = 0; 213 214 return OUString( pStr, SAL_NO_ACQUIRE ); 215 } 216 else 217 { 218 // In the simple case of no offset sequence used we can eliminate the 219 // first getValue() loop. We could also assume that most calls result 220 // in identical string lengths, thus using a preallocated 221 // OUStringBuffer could be an easy way to assemble the return string 222 // without too much hassle. However, for single characters the 223 // OUStringBuffer::append() method is quite expensive compared to a 224 // simple array operation, so it pays here to copy the final result 225 // instead. 226 227 // Allocate the max possible buffer. Try to use stack instead of heap, 228 // which would have to be reallocated most times anyways. 229 const sal_Int32 nLocalBuf = 2048; 230 sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL; 231 if ( nCount > nLocalBuf ) 232 out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ]; 233 234 sal_Int32 j = 0; 235 for ( sal_Int32 i = 0; i < nCount; i++) 236 { 237 // take care of TOGGLE_CASE transliteration: 238 sal_uInt8 nTmpMappingType = nMappingType; 239 if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) 240 nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); 241 242 const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); 243 for (sal_Int32 k = 0; k < map.nmap; k++) 244 { 245 out[j++] = map.map[k]; 246 } 247 } 248 249 OUString aRet( out, j ); 250 if ( pHeapBuf ) 251 delete [] pHeapBuf; 252 return aRet; 253 } 254 #endif 255 } 256 257 OUString SAL_CALL 258 Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException) 259 { 260 const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); 261 rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap, 1 ); // our x_rtl_ustring.h 262 sal_Unicode* out = pStr->buffer; 263 sal_Int32 i; 264 265 for (i = 0; i < map.nmap; i++) 266 out[i] = map.map[i]; 267 out[i] = 0; 268 269 return OUString( pStr, SAL_NO_ACQUIRE ); 270 } 271 272 sal_Unicode SAL_CALL 273 Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException) 274 { 275 const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); 276 if (map.nmap > 1) 277 throw MultipleCharsOutputException(); 278 return map.map[0]; 279 } 280 281 OUString SAL_CALL 282 Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 283 Sequence< sal_Int32 >& offset) throw(RuntimeException) 284 { 285 return this->transliterate(inStr, startPos, nCount, offset); 286 } 287 288 Transliteration_casemapping::Transliteration_casemapping() 289 { 290 nMappingType = 0; 291 transliterationName = "casemapping(generic)"; 292 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; 293 } 294 295 void SAL_CALL 296 Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale ) 297 { 298 nMappingType = rMappingType; 299 aLocale = rLocale; 300 } 301 302 Transliteration_u2l::Transliteration_u2l() 303 { 304 nMappingType = MappingTypeUpperToLower; 305 transliterationName = "upper_to_lower(generic)"; 306 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; 307 } 308 309 Transliteration_l2u::Transliteration_l2u() 310 { 311 nMappingType = MappingTypeLowerToUpper; 312 transliterationName = "lower_to_upper(generic)"; 313 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; 314 } 315 316 Transliteration_togglecase::Transliteration_togglecase() 317 { 318 // usually nMappingType must NOT be a combiantion of different flages here, 319 // but we take care of that problem in Transliteration_body::transliterate above 320 // before that value is used. There we will decide which of both is to be used on 321 // a per character basis. 322 nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; 323 transliterationName = "toggle(generic)"; 324 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; 325 } 326 327 Transliteration_titlecase::Transliteration_titlecase() 328 { 329 nMappingType = MappingTypeToTitle; 330 transliterationName = "title(generic)"; 331 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; 332 } 333 334 #if 0 335 struct LigatureData 336 { 337 sal_uInt32 cChar; 338 sal_Char * pUtf8Text; 339 }; 340 341 // available Unicode ligatures: 342 // http://www.unicode.org/charts 343 // http://www.unicode.org/charts/PDF/UFB00.pdf 344 static LigatureData aLigatures[] = 345 { 346 { 0x0FB00, "ff" }, 347 { 0x0FB01, "fi" }, 348 { 0x0FB02, "fl" }, 349 { 0x0FB03, "ffi" }, 350 { 0x0FB04, "ffl" }, 351 { 0x0FB05, "ft" }, 352 { 0x0FB06, "st" }, 353 354 { 0x0FB13, "\xD5\xB4\xD5\xB6" }, // Armenian small men now 355 { 0x0FB14, "\xD5\xB4\xD5\xA5" }, // Armenian small men ech 356 { 0x0FB15, "\xD5\xB4\xD5\xAB" }, // Armenian small men ini 357 { 0x0FB16, "\xD5\xBE\xD5\xB6" }, // Armenian small vew now 358 { 0x0FB17, "\xD5\xB4\xD5\xAD" }, // Armenian small men xeh 359 { 0x00000, "" } 360 }; 361 362 static inline bool lcl_IsLigature( sal_uInt32 cChar ) 363 { 364 return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17); 365 } 366 367 static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar ) 368 { 369 rtl::OUString aRes; 370 if (lcl_IsLigature( cChar )) 371 { 372 LigatureData *pFound = NULL; 373 LigatureData *pData = aLigatures; 374 while (!pFound && pData->cChar != 0) 375 { 376 if (pData->cChar == cChar) 377 pFound = pData; 378 ++pData; 379 } 380 if (pFound) 381 aRes = rtl::OUString( pFound->pUtf8Text, strlen( pFound->pUtf8Text ), RTL_TEXTENCODING_UTF8 ); 382 } 383 else 384 aRes = rtl::OUString( &cChar, 1 ); 385 return aRes; 386 } 387 #endif // if 0 388 389 static rtl::OUString transliterate_titlecase_Impl( 390 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 391 const Locale &rLocale, 392 Sequence< sal_Int32 >& offset ) 393 throw(RuntimeException) 394 { 395 const OUString aText( inStr.copy( startPos, nCount ) ); 396 397 OUString aRes; 398 if (aText.getLength() > 0) 399 { 400 Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); 401 CharacterClassificationImpl aCharClassImpl( xMSF ); 402 403 // because aCharClassImpl.toTitle does not handle ligatures or � but will raise 404 // an exception we need to handle the first chara manually... 405 406 // we don't want to change surrogates by accident, thuse we use proper code point iteration 407 sal_Int32 nPos = 0; 408 sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos ); 409 OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) ); 410 // toUpper can be used to properly resolve ligatures and characters like � 411 aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); 412 // since toTitle will leave all-uppercase text unchanged we first need to 413 // use toLower to bring possible 2nd and following charas in lowercase 414 aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); 415 sal_Int32 nResolvedLen = aResolvedLigature.getLength(); 416 417 // now we can properly use toTitle to get the expected result for the resolved string. 418 // The rest of the text should just become lowercase. 419 aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ); 420 aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale ); 421 offset.realloc( aRes.getLength() ); 422 423 sal_Int32 *pOffset = offset.getArray(); 424 sal_Int32 nLen = offset.getLength(); 425 for (sal_Int32 i = 0; i < nLen; ++i) 426 { 427 sal_Int32 nIdx = 0; 428 if (i >= nResolvedLen) 429 nIdx = i - nResolvedLen + 1; 430 pOffset[i] = nIdx; 431 } 432 } 433 #if OSL_DEBUG_LEVEL > 1 434 const sal_Int32 *pCOffset = offset.getConstArray(); 435 (void) pCOffset; 436 #endif 437 438 return aRes; 439 } 440 441 442 // this function expects to be called on a word-by-word basis, 443 // namely that startPos points to the first char of the word 444 rtl::OUString SAL_CALL Transliteration_titlecase::transliterate( 445 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 446 Sequence< sal_Int32 >& offset ) 447 throw(RuntimeException) 448 { 449 return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); 450 } 451 452 453 Transliteration_sentencecase::Transliteration_sentencecase() 454 { 455 nMappingType = MappingTypeToTitle; // though only to be applied to the first word... 456 transliterationName = "sentence(generic)"; 457 implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; 458 } 459 460 461 // this function expects to be called on a sentence-by-sentence basis, 462 // namely that startPos points to the first word (NOT first char!) in the sentence 463 rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( 464 const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, 465 Sequence< sal_Int32 >& offset ) 466 throw(RuntimeException) 467 { 468 return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); 469 } 470 471 472 } } } } 473 474