1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 SIMPLE_TRANS_MASK = TransliterationModules_HIRAGANA_KATAKANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 68 // Above 2 transliteration is simple but need to take effect in 69 // complex transliteration 70 71 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 72 : xMSF( rxMSF ) 73 , pJumpTable( 0 ) 74 , pJumpTable2( 0 ) 75 , pRegexMatcher( NULL ) 76 , pWLD( 0 ) 77 { 78 SearchOptions aOpt; 79 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 80 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 81 //aOpt.Locale = ???; 82 setOptions( aOpt ); 83 } 84 85 TextSearch::~TextSearch() 86 { 87 delete pRegexMatcher; 88 delete pWLD; 89 delete pJumpTable; 90 delete pJumpTable2; 91 } 92 93 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 94 { 95 aSrchPara = rOptions; 96 97 delete pRegexMatcher, pRegexMatcher = NULL; 98 delete pWLD, pWLD = 0; 99 delete pJumpTable, pJumpTable = 0; 100 delete pJumpTable2, pJumpTable2 = 0; 101 102 // Create Transliteration class 103 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 104 { 105 if( !xTranslit.is() ) 106 { 107 Reference < XInterface > xI = xMSF->createInstance( 108 OUString::createFromAscii( 109 "com.sun.star.i18n.Transliteration")); 110 if ( xI.is() ) 111 xI->queryInterface( ::getCppuType( 112 (const Reference< XExtendedTransliteration >*)0)) 113 >>= xTranslit; 114 } 115 // Load transliteration module 116 if( xTranslit.is() ) 117 xTranslit->loadModule( 118 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 119 aSrchPara.Locale); 120 } 121 else if( xTranslit.is() ) 122 xTranslit = 0; 123 124 // Create Transliteration for 2<->1, 2<->2 transliteration 125 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 126 { 127 if( !xTranslit2.is() ) 128 { 129 Reference < XInterface > xI = xMSF->createInstance( 130 OUString::createFromAscii( 131 "com.sun.star.i18n.Transliteration")); 132 if ( xI.is() ) 133 xI->queryInterface( ::getCppuType( 134 (const Reference< XExtendedTransliteration >*)0)) 135 >>= xTranslit2; 136 } 137 // Load transliteration module 138 if( xTranslit2.is() ) 139 xTranslit2->loadModule( 140 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 141 aSrchPara.Locale); 142 } 143 144 if ( !xBreak.is() ) 145 { 146 Reference < XInterface > xI = xMSF->createInstance( 147 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 148 if( xI.is() ) 149 xI->queryInterface( ::getCppuType( 150 (const Reference< XBreakIterator >*)0)) 151 >>= xBreak; 152 } 153 154 sSrchStr = aSrchPara.searchString; 155 156 // use transliteration here 157 if ( xTranslit.is() && 158 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 159 sSrchStr = xTranslit->transliterateString2String( 160 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 161 162 if ( xTranslit2.is() && 163 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 164 sSrchStr2 = xTranslit2->transliterateString2String( 165 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 166 167 // When start or end of search string is a complex script type, we need to 168 // make sure the result boundary is not located in the middle of cell. 169 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 170 ScriptType::COMPLEX)); 171 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 172 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 173 174 switch( aSrchPara.algorithmType) 175 { 176 case SearchAlgorithms_REGEXP: 177 fnForward = &TextSearch::RESrchFrwrd; 178 fnBackward = &TextSearch::RESrchBkwrd; 179 RESrchPrepare( aSrchPara); 180 break; 181 182 case SearchAlgorithms_APPROXIMATE: 183 fnForward = &TextSearch::ApproxSrchFrwrd; 184 fnBackward = &TextSearch::ApproxSrchBkwrd; 185 186 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 187 aSrchPara.insertedChars, aSrchPara.deletedChars, 188 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 189 190 nLimit = pWLD->GetLimit(); 191 break; 192 193 default: 194 fnForward = &TextSearch::NSrchFrwrd; 195 fnBackward = &TextSearch::NSrchBkwrd; 196 break; 197 } 198 } 199 200 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 201 { 202 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 203 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 204 return nRet; 205 } 206 207 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 208 throw( RuntimeException ) 209 { 210 sal_Int32 nDone; 211 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 212 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 213 } 214 215 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 216 throw( RuntimeException ) 217 { 218 SearchResult sres; 219 220 OUString in_str(searchStr); 221 sal_Int32 newStartPos = startPos; 222 sal_Int32 newEndPos = endPos; 223 224 bUsePrimarySrchStr = true; 225 226 if ( xTranslit.is() ) 227 { 228 // apply normal transliteration (1<->1, 1<->0) 229 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 230 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 231 232 // JP 20.6.2001: also the start and end positions must be corrected! 233 if( startPos ) 234 newStartPos = FindPosInSeq_Impl( offset, startPos ); 235 236 if( endPos < searchStr.getLength() ) 237 newEndPos = FindPosInSeq_Impl( offset, endPos ); 238 else 239 newEndPos = in_str.getLength(); 240 241 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 242 243 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 244 { 245 if (sres.startOffset[k]) 246 sres.startOffset[k] = offset[sres.startOffset[k]]; 247 // JP 20.6.2001: end is ever exclusive and then don't return 248 // the position of the next character - return the 249 // next position behind the last found character! 250 // "a b c" find "b" must return 2,3 and not 2,4!!! 251 if (sres.endOffset[k]) 252 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 253 } 254 } 255 else 256 { 257 sres = (this->*fnForward)( in_str, startPos, endPos ); 258 } 259 260 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 261 { 262 SearchResult sres2; 263 264 in_str = OUString(searchStr); 265 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 266 267 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 268 269 if( startPos ) 270 startPos = FindPosInSeq_Impl( offset, startPos ); 271 272 if( endPos < searchStr.getLength() ) 273 endPos = FindPosInSeq_Impl( offset, endPos ); 274 else 275 endPos = in_str.getLength(); 276 277 bUsePrimarySrchStr = false; 278 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 279 280 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 281 { 282 if (sres2.startOffset[k]) 283 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 284 if (sres2.endOffset[k]) 285 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 286 } 287 288 // pick first and long one 289 if ( sres.subRegExpressions == 0) 290 return sres2; 291 if ( sres2.subRegExpressions == 1) 292 { 293 if ( sres.startOffset[0] > sres2.startOffset[0]) 294 return sres2; 295 else if ( sres.startOffset[0] == sres2.startOffset[0] && 296 sres.endOffset[0] < sres2.endOffset[0]) 297 return sres2; 298 } 299 } 300 301 return sres; 302 } 303 304 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 305 throw(RuntimeException) 306 { 307 SearchResult sres; 308 309 OUString in_str(searchStr); 310 sal_Int32 newStartPos = startPos; 311 sal_Int32 newEndPos = endPos; 312 313 bUsePrimarySrchStr = true; 314 315 if ( xTranslit.is() ) 316 { 317 // apply only simple 1<->1 transliteration here 318 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 319 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 320 321 // JP 20.6.2001: also the start and end positions must be corrected! 322 if( startPos < searchStr.getLength() ) 323 newStartPos = FindPosInSeq_Impl( offset, startPos ); 324 else 325 newStartPos = in_str.getLength(); 326 327 if( endPos ) 328 newEndPos = FindPosInSeq_Impl( offset, endPos ); 329 330 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 331 332 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 333 { 334 if (sres.startOffset[k]) 335 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 336 // JP 20.6.2001: end is ever exclusive and then don't return 337 // the position of the next character - return the 338 // next position behind the last found character! 339 // "a b c" find "b" must return 2,3 and not 2,4!!! 340 if (sres.endOffset[k]) 341 sres.endOffset[k] = offset[sres.endOffset[k]]; 342 } 343 } 344 else 345 { 346 sres = (this->*fnBackward)( in_str, startPos, endPos ); 347 } 348 349 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 350 { 351 SearchResult sres2; 352 353 in_str = OUString(searchStr); 354 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 355 356 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 357 358 if( startPos < searchStr.getLength() ) 359 startPos = FindPosInSeq_Impl( offset, startPos ); 360 else 361 startPos = in_str.getLength(); 362 363 if( endPos ) 364 endPos = FindPosInSeq_Impl( offset, endPos ); 365 366 bUsePrimarySrchStr = false; 367 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 368 369 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 370 { 371 if (sres2.startOffset[k]) 372 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 373 if (sres2.endOffset[k]) 374 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 375 } 376 377 // pick last and long one 378 if ( sres.subRegExpressions == 0 ) 379 return sres2; 380 if ( sres2.subRegExpressions == 1 ) 381 { 382 if ( sres.startOffset[0] < sres2.startOffset[0] ) 383 return sres2; 384 if ( sres.startOffset[0] == sres2.startOffset[0] && 385 sres.endOffset[0] > sres2.endOffset[0] ) 386 return sres2; 387 } 388 } 389 390 return sres; 391 } 392 393 //--------------------------------------------------------------------- 394 395 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 396 { 397 bool bRet = 1; 398 if( '\x7f' != rStr[nPos]) 399 { 400 if ( !xCharClass.is() ) 401 { 402 Reference < XInterface > xI = xMSF->createInstance( 403 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 404 if( xI.is() ) 405 xI->queryInterface( ::getCppuType( 406 (const Reference< XCharacterClassification >*)0)) 407 >>= xCharClass; 408 } 409 if ( xCharClass.is() ) 410 { 411 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 412 aSrchPara.Locale ); 413 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 414 KCharacterType::LETTER ) & nCType ) ) 415 bRet = 0; 416 } 417 } 418 return bRet; 419 } 420 421 // --------- helper methods for Boyer-Moore like text searching ---------- 422 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 423 424 void TextSearch::MakeForwardTab() 425 { 426 // create the jumptable for the search text 427 if( pJumpTable ) 428 { 429 if( bIsForwardTab ) 430 return ; // the jumpTable is ok 431 delete pJumpTable; 432 } 433 bIsForwardTab = true; 434 435 sal_Int32 n, nLen = sSrchStr.getLength(); 436 pJumpTable = new TextSearchJumpTable; 437 438 for( n = 0; n < nLen - 1; ++n ) 439 { 440 sal_Unicode cCh = sSrchStr[n]; 441 sal_Int32 nDiff = nLen - n - 1; 442 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 443 444 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 445 pJumpTable->insert( aEntry ); 446 if ( !aPair.second ) 447 (*(aPair.first)).second = nDiff; 448 } 449 } 450 451 void TextSearch::MakeForwardTab2() 452 { 453 // create the jumptable for the search text 454 if( pJumpTable2 ) 455 { 456 if( bIsForwardTab ) 457 return ; // the jumpTable is ok 458 delete pJumpTable2; 459 } 460 bIsForwardTab = true; 461 462 sal_Int32 n, nLen = sSrchStr2.getLength(); 463 pJumpTable2 = new TextSearchJumpTable; 464 465 for( n = 0; n < nLen - 1; ++n ) 466 { 467 sal_Unicode cCh = sSrchStr2[n]; 468 sal_Int32 nDiff = nLen - n - 1; 469 470 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 471 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 472 pJumpTable2->insert( aEntry ); 473 if ( !aPair.second ) 474 (*(aPair.first)).second = nDiff; 475 } 476 } 477 478 void TextSearch::MakeBackwardTab() 479 { 480 // create the jumptable for the search text 481 if( pJumpTable ) 482 { 483 if( !bIsForwardTab ) 484 return ; // the jumpTable is ok 485 delete pJumpTable; 486 } 487 bIsForwardTab = false; 488 489 sal_Int32 n, nLen = sSrchStr.getLength(); 490 pJumpTable = new TextSearchJumpTable; 491 492 for( n = nLen-1; n > 0; --n ) 493 { 494 sal_Unicode cCh = sSrchStr[n]; 495 TextSearchJumpTable::value_type aEntry( cCh, n ); 496 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 497 pJumpTable->insert( aEntry ); 498 if ( !aPair.second ) 499 (*(aPair.first)).second = n; 500 } 501 } 502 503 void TextSearch::MakeBackwardTab2() 504 { 505 // create the jumptable for the search text 506 if( pJumpTable2 ) 507 { 508 if( !bIsForwardTab ) 509 return ; // the jumpTable is ok 510 delete pJumpTable2; 511 } 512 bIsForwardTab = false; 513 514 sal_Int32 n, nLen = sSrchStr2.getLength(); 515 pJumpTable2 = new TextSearchJumpTable; 516 517 for( n = nLen-1; n > 0; --n ) 518 { 519 sal_Unicode cCh = sSrchStr2[n]; 520 TextSearchJumpTable::value_type aEntry( cCh, n ); 521 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 522 pJumpTable2->insert( aEntry ); 523 if ( !aPair.second ) 524 (*(aPair.first)).second = n; 525 } 526 } 527 528 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 529 { 530 TextSearchJumpTable *pJump; 531 OUString sSearchKey; 532 533 if ( bUsePrimarySrchStr ) { 534 pJump = pJumpTable; 535 sSearchKey = sSrchStr; 536 } else { 537 pJump = pJumpTable2; 538 sSearchKey = sSrchStr2; 539 } 540 541 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 542 if ( iLook == pJump->end() ) 543 return sSearchKey.getLength(); 544 return (*iLook).second; 545 } 546 547 548 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 549 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 550 throw(RuntimeException) 551 { 552 SearchResult aRet; 553 aRet.subRegExpressions = 0; 554 555 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 556 557 OUString aStr( searchStr ); 558 sal_Int32 nSuchIdx = aStr.getLength(); 559 sal_Int32 nEnde = endPos; 560 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 561 return aRet; 562 563 564 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 565 return aRet; 566 567 nEnde -= sSearchKey.getLength(); 568 569 if (bUsePrimarySrchStr) 570 MakeForwardTab(); // create the jumptable 571 else 572 MakeForwardTab2(); 573 574 for (sal_Int32 nCmpIdx = startPos; // start position for the search 575 nCmpIdx <= nEnde; 576 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 577 { 578 // if the match would be the completed cells, skip it. 579 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 580 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 581 continue; 582 583 nSuchIdx = sSearchKey.getLength() - 1; 584 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 585 { 586 if( nSuchIdx == 0 ) 587 { 588 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 589 { 590 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 591 bool bAtStart = !nCmpIdx; 592 bool bAtEnd = nFndEnd == endPos; 593 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 594 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 595 // * 1 -> only one word in the paragraph 596 // * 2 -> at begin of paragraph 597 // * 3 -> at end of paragraph 598 // * 4 -> inside the paragraph 599 if( !( ( bAtStart && bAtEnd ) || // 1 600 ( bAtStart && bDelimBehind ) || // 2 601 ( bAtEnd && bDelimBefore ) || // 3 602 ( bDelimBefore && bDelimBehind ))) // 4 603 break; 604 } 605 606 aRet.subRegExpressions = 1; 607 aRet.startOffset.realloc( 1 ); 608 aRet.startOffset[ 0 ] = nCmpIdx; 609 aRet.endOffset.realloc( 1 ); 610 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 611 612 return aRet; 613 } 614 else 615 nSuchIdx--; 616 } 617 } 618 return aRet; 619 } 620 621 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 622 throw(RuntimeException) 623 { 624 SearchResult aRet; 625 aRet.subRegExpressions = 0; 626 627 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 628 629 OUString aStr( searchStr ); 630 sal_Int32 nSuchIdx = aStr.getLength(); 631 sal_Int32 nEnde = endPos; 632 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 633 return aRet; 634 635 if (bUsePrimarySrchStr) 636 MakeBackwardTab(); // create the jumptable 637 else 638 MakeBackwardTab2(); 639 640 if( nEnde == nSuchIdx ) // end position for the search 641 nEnde = sSearchKey.getLength(); 642 else 643 nEnde += sSearchKey.getLength(); 644 645 sal_Int32 nCmpIdx = startPos; // start position for the search 646 647 while (nCmpIdx >= nEnde) 648 { 649 // if the match would be the completed cells, skip it. 650 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 651 sSearchKey.getLength() )) && (!checkCTLEnd || 652 isCellStart( aStr, nCmpIdx))) 653 { 654 nSuchIdx = 0; 655 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 656 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 657 nSuchIdx++; 658 if( nSuchIdx >= sSearchKey.getLength() ) 659 { 660 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 661 { 662 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 663 bool bAtStart = !nFndStt; 664 bool bAtEnd = nCmpIdx == startPos; 665 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 666 bool bDelimBefore = bAtStart || // begin of paragraph 667 IsDelimiter( aStr, nFndStt-1 ); 668 // * 1 -> only one word in the paragraph 669 // * 2 -> at begin of paragraph 670 // * 3 -> at end of paragraph 671 // * 4 -> inside the paragraph 672 if( ( bAtStart && bAtEnd ) || // 1 673 ( bAtStart && bDelimBehind ) || // 2 674 ( bAtEnd && bDelimBefore ) || // 3 675 ( bDelimBefore && bDelimBehind )) // 4 676 { 677 aRet.subRegExpressions = 1; 678 aRet.startOffset.realloc( 1 ); 679 aRet.startOffset[ 0 ] = nCmpIdx; 680 aRet.endOffset.realloc( 1 ); 681 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 682 return aRet; 683 } 684 } 685 else 686 { 687 aRet.subRegExpressions = 1; 688 aRet.startOffset.realloc( 1 ); 689 aRet.startOffset[ 0 ] = nCmpIdx; 690 aRet.endOffset.realloc( 1 ); 691 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 692 return aRet; 693 } 694 } 695 } 696 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 697 if( nCmpIdx < nSuchIdx ) 698 return aRet; 699 nCmpIdx -= nSuchIdx; 700 } 701 return aRet; 702 } 703 704 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 705 { 706 // select the transliterated pattern string 707 const OUString& rPatternStr = 708 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr 709 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 710 711 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 712 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 713 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 714 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 715 // REG_NOSUB is not used anywhere => not implemented 716 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 717 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 718 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it??? 719 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0) 720 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 721 UErrorCode nIcuErr = U_ZERO_ERROR; 722 // assumption: transliteration didn't mangle regexp control chars 723 IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength()); 724 #if 1 725 // for conveniance specific syntax elements of the old regex engine are emulated 726 // by using regular word boundary matching \b to replace \< and \> 727 static const IcuUniString aChevronPattern( "\\<|\\>", -1, IcuUniString::kInvariant); 728 static const IcuUniString aChevronReplace( "\\b", -1, IcuUniString::kInvariant); 729 static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr); 730 aChevronMatcher.reset( aIcuSearchPatStr); 731 aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr); 732 aChevronMatcher.reset(); 733 #endif 734 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 735 if( nIcuErr) 736 { delete pRegexMatcher; pRegexMatcher = NULL;} 737 } 738 739 //--------------------------------------------------------------------------- 740 741 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 742 sal_Int32 startPos, sal_Int32 endPos ) 743 throw(RuntimeException) 744 { 745 SearchResult aRet; 746 aRet.subRegExpressions = 0; 747 if( !pRegexMatcher) 748 return aRet; 749 750 if( endPos > searchStr.getLength()) 751 endPos = searchStr.getLength(); 752 753 // use the ICU RegexMatcher to find the matches 754 UErrorCode nIcuErr = U_ZERO_ERROR; 755 const IcuUniString aSearchTargetStr( searchStr.getStr(), endPos); 756 pRegexMatcher->reset( aSearchTargetStr); 757 // search until there is a valid match 758 for(;;) 759 { 760 if( !pRegexMatcher->find( startPos, nIcuErr)) 761 return aRet; 762 763 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 764 int nStartOfs = pRegexMatcher->start( nIcuErr); 765 int nEndOfs = pRegexMatcher->end( nIcuErr); 766 if( nStartOfs < nEndOfs) 767 break; 768 // try at next position if there was a zero-length match 769 if( ++startPos >= endPos) 770 return aRet; 771 } 772 773 // extract the result of the search 774 const int nGroupCount = pRegexMatcher->groupCount(); 775 aRet.subRegExpressions = nGroupCount + 1; 776 aRet.startOffset.realloc( aRet.subRegExpressions); 777 aRet.endOffset.realloc( aRet.subRegExpressions); 778 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 779 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 780 for( int i = 1; i <= nGroupCount; ++i) { 781 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 782 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 783 } 784 785 return aRet; 786 } 787 788 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 789 sal_Int32 startPos, sal_Int32 endPos ) 790 throw(RuntimeException) 791 { 792 // NOTE: for backwards search callers provide startPos/endPos inverted! 793 SearchResult aRet; 794 aRet.subRegExpressions = 0; 795 if( !pRegexMatcher) 796 return aRet; 797 798 if( startPos > searchStr.getLength()) 799 startPos = searchStr.getLength(); 800 801 // use the ICU RegexMatcher to find the matches 802 // TODO: use ICU's backward searching once it becomes available 803 // as its replacement using forward search is not as good as the real thing 804 UErrorCode nIcuErr = U_ZERO_ERROR; 805 const IcuUniString aSearchTargetStr( searchStr.getStr(), startPos); 806 pRegexMatcher->reset( aSearchTargetStr); 807 if( !pRegexMatcher->find( endPos, nIcuErr)) 808 return aRet; 809 810 // find the last match 811 int nLastPos = 0; 812 do { 813 nLastPos = pRegexMatcher->start( nIcuErr); 814 } while( pRegexMatcher->find( nLastPos + 1, nIcuErr)); 815 816 // find last match again to get its details 817 pRegexMatcher->find( nLastPos, nIcuErr); 818 819 // fill in the details of the last match 820 const int nGroupCount = pRegexMatcher->groupCount(); 821 aRet.subRegExpressions = nGroupCount + 1; 822 aRet.startOffset.realloc( aRet.subRegExpressions); 823 aRet.endOffset.realloc( aRet.subRegExpressions); 824 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 825 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 826 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 827 for( int i = 1; i <= nGroupCount; ++i) { 828 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 829 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 830 } 831 832 return aRet; 833 } 834 835 //--------------------------------------------------------------------------- 836 837 // search for words phonetically 838 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 839 sal_Int32 startPos, sal_Int32 endPos ) 840 throw(RuntimeException) 841 { 842 SearchResult aRet; 843 aRet.subRegExpressions = 0; 844 845 if( !xBreak.is() ) 846 return aRet; 847 848 OUString aWTemp( searchStr ); 849 850 register sal_Int32 nStt, nEnd; 851 852 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 853 aSrchPara.Locale, 854 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 855 856 do 857 { 858 if( aWBnd.startPos >= endPos ) 859 break; 860 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 861 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 862 863 if( nStt < nEnd && 864 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 865 { 866 aRet.subRegExpressions = 1; 867 aRet.startOffset.realloc( 1 ); 868 aRet.startOffset[ 0 ] = nStt; 869 aRet.endOffset.realloc( 1 ); 870 aRet.endOffset[ 0 ] = nEnd; 871 break; 872 } 873 874 nStt = nEnd - 1; 875 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 876 WordType::ANYWORD_IGNOREWHITESPACES); 877 } while( aWBnd.startPos != aWBnd.endPos || 878 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 879 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 880 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 881 // and nextWord() does also => don't loop forever. 882 return aRet; 883 } 884 885 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 886 sal_Int32 startPos, sal_Int32 endPos ) 887 throw(RuntimeException) 888 { 889 SearchResult aRet; 890 aRet.subRegExpressions = 0; 891 892 if( !xBreak.is() ) 893 return aRet; 894 895 OUString aWTemp( searchStr ); 896 897 register sal_Int32 nStt, nEnd; 898 899 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 900 aSrchPara.Locale, 901 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 902 903 do 904 { 905 if( aWBnd.endPos <= endPos ) 906 break; 907 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 908 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 909 910 if( nStt < nEnd && 911 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 912 { 913 aRet.subRegExpressions = 1; 914 aRet.startOffset.realloc( 1 ); 915 aRet.startOffset[ 0 ] = nEnd; 916 aRet.endOffset.realloc( 1 ); 917 aRet.endOffset[ 0 ] = nStt; 918 break; 919 } 920 if( !nStt ) 921 break; 922 923 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 924 WordType::ANYWORD_IGNOREWHITESPACES); 925 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 926 return aRet; 927 } 928 929 930 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 931 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 932 933 static OUString getServiceName_Static() 934 { 935 return OUString::createFromAscii( cSearchName ); 936 } 937 938 static OUString getImplementationName_Static() 939 { 940 return OUString::createFromAscii( cSearchImpl ); 941 } 942 943 OUString SAL_CALL 944 TextSearch::getImplementationName() 945 throw( RuntimeException ) 946 { 947 return getImplementationName_Static(); 948 } 949 950 sal_Bool SAL_CALL 951 TextSearch::supportsService(const OUString& rServiceName) 952 throw( RuntimeException ) 953 { 954 return !rServiceName.compareToAscii( cSearchName ); 955 } 956 957 Sequence< OUString > SAL_CALL 958 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 959 { 960 Sequence< OUString > aRet(1); 961 aRet[0] = getServiceName_Static(); 962 return aRet; 963 } 964 965 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 966 SAL_CALL TextSearch_CreateInstance( 967 const ::com::sun::star::uno::Reference< 968 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 969 { 970 return ::com::sun::star::uno::Reference< 971 ::com::sun::star::uno::XInterface >( 972 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 973 } 974 975 extern "C" 976 { 977 978 void SAL_CALL component_getImplementationEnvironment( 979 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 980 { 981 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 982 } 983 984 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 985 void* _pServiceManager, void* /*_pRegistryKey*/ ) 986 { 987 void* pRet = NULL; 988 989 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 990 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 991 ( _pServiceManager ); 992 ::com::sun::star::uno::Reference< 993 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 994 995 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 996 { 997 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 998 aServiceNames[0] = getServiceName_Static(); 999 xFactory = ::cppu::createSingleFactory( 1000 pServiceManager, getImplementationName_Static(), 1001 &TextSearch_CreateInstance, aServiceNames ); 1002 } 1003 1004 if ( xFactory.is() ) 1005 { 1006 xFactory->acquire(); 1007 pRet = xFactory.get(); 1008 } 1009 1010 return pRet; 1011 } 1012 1013 } // extern "C" 1014