1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_i18npool.hxx" 30 #include <breakiterator_unicode.hxx> 31 #include <localedata.hxx> 32 #include <unicode/uchar.h> 33 #include <unicode/locid.h> 34 #include <unicode/rbbi.h> 35 #include <unicode/udata.h> 36 #include <rtl/strbuf.hxx> 37 #include <rtl/ustring.hxx> 38 39 U_CDECL_BEGIN 40 extern const char OpenOffice_dat[]; 41 U_CDECL_END 42 43 using namespace ::com::sun::star; 44 using namespace ::com::sun::star::lang; 45 using namespace ::rtl; 46 47 namespace com { namespace sun { namespace star { namespace i18n { 48 49 #define ERROR ::com::sun::star::uno::RuntimeException() 50 51 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode"; 52 53 54 BreakIterator_Unicode::BreakIterator_Unicode() : 55 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name 56 wordRule( "word" ), 57 lineRule( "line" ), 58 result(), 59 character(), 60 word(), 61 sentence(), 62 line(), 63 icuBI( NULL ), 64 aLocale(), 65 aBreakType(), 66 aWordType() 67 { 68 } 69 70 71 BreakIterator_Unicode::~BreakIterator_Unicode() 72 { 73 if (icuBI && icuBI->aBreakIterator) { 74 delete icuBI->aBreakIterator; 75 icuBI->aBreakIterator=NULL; 76 } 77 if (character.aBreakIterator) delete character.aBreakIterator; 78 if (word.aBreakIterator) delete word.aBreakIterator; 79 if (sentence.aBreakIterator) delete sentence.aBreakIterator; 80 if (line.aBreakIterator) delete line.aBreakIterator; 81 } 82 83 /* 84 Wrapper class to provide public access to the RuleBasedBreakIterator's 85 setbreakType method. 86 */ 87 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator { 88 public: 89 inline void publicSetBreakType(int32_t type) { 90 setBreakType(type); 91 }; 92 OOoRuleBasedBreakIterator(UDataMemory* image, 93 UErrorCode &status) : 94 RuleBasedBreakIterator(image, status) { }; 95 96 }; 97 98 // loading ICU breakiterator on demand. 99 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale, 100 sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException) 101 { 102 sal_Bool newBreak = sal_False; 103 UErrorCode status = U_ZERO_ERROR; 104 sal_Int16 breakType = 0; 105 switch (rBreakType) { 106 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break; 107 case LOAD_WORD_BREAKITERATOR: icuBI=&word; 108 switch (rWordType) { 109 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break; 110 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break; 111 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break; 112 } 113 break; 114 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break; 115 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break; 116 } 117 if (!icuBI->aBreakIterator || rWordType != aWordType || 118 rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country || 119 rLocale.Variant != aLocale.Variant) { 120 if (icuBI->aBreakIterator) { 121 delete icuBI->aBreakIterator; 122 icuBI->aBreakIterator=NULL; 123 } 124 if (rule) { 125 uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale); 126 127 status = U_ZERO_ERROR; 128 udata_setAppData("OpenOffice", OpenOffice_dat, &status); 129 if ( !U_SUCCESS(status) ) throw ERROR; 130 131 OOoRuleBasedBreakIterator *rbi = NULL; 132 133 if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) { 134 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk", 135 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status); 136 } else { 137 status = U_ZERO_ERROR; 138 OStringBuffer aUDName(64); 139 aUDName.append(rule); 140 aUDName.append('_'); 141 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US)); 142 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status); 143 if( U_SUCCESS(status) ) 144 rbi = new OOoRuleBasedBreakIterator( pUData, status); 145 if (!U_SUCCESS(status) ) { 146 status = U_ZERO_ERROR; 147 pUData = udata_open("OpenOffice", "brk", rule, &status); 148 if( U_SUCCESS(status) ) 149 rbi = new OOoRuleBasedBreakIterator( pUData, status); 150 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL; 151 } 152 } 153 if (rbi) { 154 switch (rBreakType) { 155 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break; 156 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break; 157 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break; 158 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break; 159 } 160 icuBI->aBreakIterator = rbi; 161 } 162 } 163 164 if (!icuBI->aBreakIterator) { 165 icu::Locale icuLocale( 166 OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(), 167 OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(), 168 OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr()); 169 170 status = U_ZERO_ERROR; 171 switch (rBreakType) { 172 case LOAD_CHARACTER_BREAKITERATOR: 173 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status); 174 break; 175 case LOAD_WORD_BREAKITERATOR: 176 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status); 177 break; 178 case LOAD_SENTENCE_BREAKITERATOR: 179 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status); 180 break; 181 case LOAD_LINE_BREAKITERATOR: 182 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status); 183 break; 184 } 185 if ( !U_SUCCESS(status) ) { 186 icuBI->aBreakIterator=NULL; 187 throw ERROR; 188 } 189 } 190 if (icuBI->aBreakIterator) { 191 aLocale=rLocale; 192 aWordType=rWordType; 193 aBreakType=rBreakType; 194 newBreak=sal_True; 195 } else { 196 throw ERROR; 197 } 198 } 199 200 if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) { // UChar != sal_Unicode in MinGW 201 icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()); 202 icuBI->aBreakIterator->setText(icuBI->aICUText); 203 } 204 } 205 206 207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text, 208 sal_Int32 nStartPos, const lang::Locale &rLocale, 209 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 210 throw(uno::RuntimeException) 211 { 212 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode 213 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); 214 for (nDone = 0; nDone < nCount; nDone++) { 215 nStartPos = character.aBreakIterator->following(nStartPos); 216 if (nStartPos == BreakIterator::DONE) 217 return Text.getLength(); 218 } 219 } else { // for CHARACTER mode 220 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++) 221 Text.iterateCodePoints(&nStartPos, 1); 222 } 223 return nStartPos; 224 } 225 226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text, 227 sal_Int32 nStartPos, const lang::Locale& rLocale, 228 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 229 throw(uno::RuntimeException) 230 { 231 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode 232 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); 233 for (nDone = 0; nDone < nCount; nDone++) { 234 nStartPos = character.aBreakIterator->preceding(nStartPos); 235 if (nStartPos == BreakIterator::DONE) 236 return 0; 237 } 238 } else { // for BS to delete one char and CHARACTER mode. 239 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++) 240 Text.iterateCodePoints(&nStartPos, -1); 241 } 242 return nStartPos; 243 } 244 245 246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos, 247 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException) 248 { 249 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); 250 251 result.startPos = word.aBreakIterator->following(nStartPos); 252 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE ) 253 result.endPos = result.startPos; 254 else { 255 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || 256 rWordType == WordType::DICTIONARY_WORD ) && 257 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) ) 258 result.startPos = word.aBreakIterator->following(result.startPos); 259 260 result.endPos = word.aBreakIterator->following(result.startPos); 261 if(result.endPos == BreakIterator::DONE) 262 result.endPos = result.startPos; 263 } 264 return result; 265 } 266 267 268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos, 269 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException) 270 { 271 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); 272 273 result.startPos = word.aBreakIterator->preceding(nStartPos); 274 if( result.startPos < 0 || result.startPos == BreakIterator::DONE) 275 result.endPos = result.startPos; 276 else { 277 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || 278 rWordType == WordType::DICTIONARY_WORD) && 279 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) ) 280 result.startPos = word.aBreakIterator->preceding(result.startPos); 281 282 result.endPos = word.aBreakIterator->following(result.startPos); 283 if(result.endPos == BreakIterator::DONE) 284 result.endPos = result.startPos; 285 } 286 return result; 287 } 288 289 290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale, 291 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException) 292 { 293 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); 294 sal_Int32 len = Text.getLength(); 295 296 if(word.aBreakIterator->isBoundary(nPos)) { 297 result.startPos = result.endPos = nPos; 298 if((bDirection || nPos == 0) && nPos < len) //forward 299 result.endPos = word.aBreakIterator->following(nPos); 300 else 301 result.startPos = word.aBreakIterator->preceding(nPos); 302 } else { 303 if(nPos <= 0) { 304 result.startPos = 0; 305 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0; 306 } else if(nPos >= len) { 307 result.startPos = word.aBreakIterator->preceding(len); 308 result.endPos = len; 309 } else { 310 result.startPos = word.aBreakIterator->preceding(nPos); 311 result.endPos = word.aBreakIterator->following(nPos); 312 } 313 } 314 if (result.startPos == BreakIterator::DONE) 315 result.startPos = result.endPos; 316 else if (result.endPos == BreakIterator::DONE) 317 result.endPos = result.startPos; 318 319 return result; 320 } 321 322 323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos, 324 const lang::Locale &rLocale ) throw(uno::RuntimeException) 325 { 326 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); 327 328 sal_Int32 len = Text.getLength(); 329 if (len > 0 && nStartPos == len) 330 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence 331 if (!sentence.aBreakIterator->isBoundary(nStartPos)) 332 nStartPos = sentence.aBreakIterator->preceding(nStartPos); 333 334 // skip preceding space. 335 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1); 336 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1); 337 Text.iterateCodePoints(&nStartPos, -1); 338 339 return nStartPos; 340 } 341 342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos, 343 const lang::Locale &rLocale ) throw(uno::RuntimeException) 344 { 345 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); 346 347 sal_Int32 len = Text.getLength(); 348 if (len > 0 && nStartPos == len) 349 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence 350 nStartPos = sentence.aBreakIterator->following(nStartPos); 351 352 sal_Int32 nPos=nStartPos; 353 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos; 354 355 return nStartPos; 356 } 357 358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak( 359 const OUString& Text, sal_Int32 nStartPos, 360 const lang::Locale& rLocale, sal_Int32 nMinBreakPos, 361 const LineBreakHyphenationOptions& hOptions, 362 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException) 363 { 364 LineBreakResults lbr; 365 366 if (nStartPos >= Text.getLength()) { 367 lbr.breakIndex = Text.getLength(); 368 lbr.breakType = BreakType::WORDBOUNDARY; 369 return lbr; 370 } 371 372 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text); 373 374 sal_Bool GlueSpace=sal_True; 375 while (GlueSpace) { 376 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break 377 lbr.breakIndex = nStartPos; 378 lbr.breakType = BreakType::WORDBOUNDARY; 379 } else if (hOptions.rHyphenator.is()) { //Hyphenation break 380 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale, 381 WordType::DICTIONARY_WORD, false); 382 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord; 383 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos, 384 wBoundary.endPos - wBoundary.startPos), rLocale, 385 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions); 386 if (aHyphenatedWord.is()) { 387 lbr.rHyphenatedWord = aHyphenatedWord; 388 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos ) 389 lbr.breakIndex = -1; 390 else 391 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos(); 392 lbr.breakType = BreakType::HYPHENATION; 393 } else { 394 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos); 395 lbr.breakType = BreakType::WORDBOUNDARY;; 396 } 397 } else { //word boundary break 398 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos); 399 lbr.breakType = BreakType::WORDBOUNDARY; 400 } 401 402 #define WJ 0x2060 // Word Joiner 403 GlueSpace=sal_False; 404 if (lbr.breakType == BreakType::WORDBOUNDARY) { 405 nStartPos = lbr.breakIndex; 406 if (Text[nStartPos--] == WJ) 407 GlueSpace=sal_True; 408 while (nStartPos >= 0 && 409 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) { 410 if (Text[nStartPos--] == WJ) 411 GlueSpace=sal_True; 412 } 413 if (GlueSpace && nStartPos < 0) { 414 lbr.breakIndex = 0; 415 break; 416 } 417 } 418 } 419 420 return lbr; 421 } 422 423 424 425 OUString SAL_CALL 426 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException ) 427 { 428 return OUString::createFromAscii(cBreakIterator); 429 } 430 431 sal_Bool SAL_CALL 432 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException ) 433 { 434 return !rServiceName.compareToAscii(cBreakIterator); 435 } 436 437 uno::Sequence< OUString > SAL_CALL 438 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException ) 439 { 440 uno::Sequence< OUString > aRet(1); 441 aRet[0] = OUString::createFromAscii(cBreakIterator); 442 return aRet; 443 } 444 445 } } } } 446