1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 68 // Above 2 transliteration is simple but need to take effect in 69 // complex transliteration 70 71 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 72 : xMSF( rxMSF ) 73 , pJumpTable( 0 ) 74 , pJumpTable2( 0 ) 75 , pRegexMatcher( NULL ) 76 , pWLD( 0 ) 77 { 78 SearchOptions aOpt; 79 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 80 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 81 //aOpt.Locale = ???; 82 setOptions( aOpt ); 83 } 84 85 TextSearch::~TextSearch() 86 { 87 delete pRegexMatcher; 88 delete pWLD; 89 delete pJumpTable; 90 delete pJumpTable2; 91 } 92 93 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 94 { 95 aSrchPara = rOptions; 96 97 delete pRegexMatcher, pRegexMatcher = NULL; 98 delete pWLD, pWLD = 0; 99 delete pJumpTable, pJumpTable = 0; 100 delete pJumpTable2, pJumpTable2 = 0; 101 102 // Create Transliteration class 103 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 104 { 105 if( !xTranslit.is() ) 106 { 107 Reference < XInterface > xI = xMSF->createInstance( 108 OUString::createFromAscii( 109 "com.sun.star.i18n.Transliteration")); 110 if ( xI.is() ) 111 xI->queryInterface( ::getCppuType( 112 (const Reference< XExtendedTransliteration >*)0)) 113 >>= xTranslit; 114 } 115 // Load transliteration module 116 if( xTranslit.is() ) 117 xTranslit->loadModule( 118 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 119 aSrchPara.Locale); 120 } 121 else if( xTranslit.is() ) 122 xTranslit = 0; 123 124 // Create Transliteration for 2<->1, 2<->2 transliteration 125 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 126 { 127 if( !xTranslit2.is() ) 128 { 129 Reference < XInterface > xI = xMSF->createInstance( 130 OUString::createFromAscii( 131 "com.sun.star.i18n.Transliteration")); 132 if ( xI.is() ) 133 xI->queryInterface( ::getCppuType( 134 (const Reference< XExtendedTransliteration >*)0)) 135 >>= xTranslit2; 136 } 137 // Load transliteration module 138 if( xTranslit2.is() ) 139 xTranslit2->loadModule( 140 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 141 aSrchPara.Locale); 142 } 143 144 if ( !xBreak.is() ) 145 { 146 Reference < XInterface > xI = xMSF->createInstance( 147 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 148 if( xI.is() ) 149 xI->queryInterface( ::getCppuType( 150 (const Reference< XBreakIterator >*)0)) 151 >>= xBreak; 152 } 153 154 sSrchStr = aSrchPara.searchString; 155 156 // use transliteration here 157 if ( xTranslit.is() && 158 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 159 sSrchStr = xTranslit->transliterateString2String( 160 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 161 162 if ( xTranslit2.is() && 163 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 164 sSrchStr2 = xTranslit2->transliterateString2String( 165 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 166 167 // When start or end of search string is a complex script type, we need to 168 // make sure the result boundary is not located in the middle of cell. 169 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 170 ScriptType::COMPLEX)); 171 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 172 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 173 174 switch( aSrchPara.algorithmType) 175 { 176 case SearchAlgorithms_REGEXP: 177 fnForward = &TextSearch::RESrchFrwrd; 178 fnBackward = &TextSearch::RESrchBkwrd; 179 RESrchPrepare( aSrchPara); 180 break; 181 182 case SearchAlgorithms_APPROXIMATE: 183 fnForward = &TextSearch::ApproxSrchFrwrd; 184 fnBackward = &TextSearch::ApproxSrchBkwrd; 185 186 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 187 aSrchPara.insertedChars, aSrchPara.deletedChars, 188 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 189 190 nLimit = pWLD->GetLimit(); 191 break; 192 193 default: 194 fnForward = &TextSearch::NSrchFrwrd; 195 fnBackward = &TextSearch::NSrchBkwrd; 196 break; 197 } 198 } 199 200 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 201 { 202 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 203 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 204 return nRet; 205 } 206 207 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 208 throw( RuntimeException ) 209 { 210 sal_Int32 nDone; 211 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 212 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 213 } 214 215 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 216 throw( RuntimeException ) 217 { 218 SearchResult sres; 219 220 OUString in_str(searchStr); 221 sal_Int32 newStartPos = startPos; 222 sal_Int32 newEndPos = endPos; 223 224 bUsePrimarySrchStr = true; 225 226 if ( xTranslit.is() ) 227 { 228 // apply normal transliteration (1<->1, 1<->0) 229 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 230 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 231 232 // JP 20.6.2001: also the start and end positions must be corrected! 233 if( startPos ) 234 newStartPos = FindPosInSeq_Impl( offset, startPos ); 235 236 if( endPos < searchStr.getLength() ) 237 newEndPos = FindPosInSeq_Impl( offset, endPos ); 238 else 239 newEndPos = in_str.getLength(); 240 241 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 242 243 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 244 { 245 if (sres.startOffset[k]) 246 sres.startOffset[k] = offset[sres.startOffset[k]]; 247 // JP 20.6.2001: end is ever exclusive and then don't return 248 // the position of the next character - return the 249 // next position behind the last found character! 250 // "a b c" find "b" must return 2,3 and not 2,4!!! 251 if (sres.endOffset[k]) 252 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 253 } 254 } 255 else 256 { 257 sres = (this->*fnForward)( in_str, startPos, endPos ); 258 } 259 260 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 261 { 262 SearchResult sres2; 263 264 in_str = OUString(searchStr); 265 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 266 267 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 268 269 if( startPos ) 270 startPos = FindPosInSeq_Impl( offset, startPos ); 271 272 if( endPos < searchStr.getLength() ) 273 endPos = FindPosInSeq_Impl( offset, endPos ); 274 else 275 endPos = in_str.getLength(); 276 277 bUsePrimarySrchStr = false; 278 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 279 280 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 281 { 282 if (sres2.startOffset[k]) 283 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 284 if (sres2.endOffset[k]) 285 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 286 } 287 288 // pick first and long one 289 if ( sres.subRegExpressions == 0) 290 return sres2; 291 if ( sres2.subRegExpressions == 1) 292 { 293 if ( sres.startOffset[0] > sres2.startOffset[0]) 294 return sres2; 295 else if ( sres.startOffset[0] == sres2.startOffset[0] && 296 sres.endOffset[0] < sres2.endOffset[0]) 297 return sres2; 298 } 299 } 300 301 return sres; 302 } 303 304 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 305 throw(RuntimeException) 306 { 307 SearchResult sres; 308 309 OUString in_str(searchStr); 310 sal_Int32 newStartPos = startPos; 311 sal_Int32 newEndPos = endPos; 312 313 bUsePrimarySrchStr = true; 314 315 if ( xTranslit.is() ) 316 { 317 // apply only simple 1<->1 transliteration here 318 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 319 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 320 321 // JP 20.6.2001: also the start and end positions must be corrected! 322 if( startPos < searchStr.getLength() ) 323 newStartPos = FindPosInSeq_Impl( offset, startPos ); 324 else 325 newStartPos = in_str.getLength(); 326 327 if( endPos ) 328 newEndPos = FindPosInSeq_Impl( offset, endPos ); 329 330 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 331 332 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 333 { 334 if (sres.startOffset[k]) 335 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 336 // JP 20.6.2001: end is ever exclusive and then don't return 337 // the position of the next character - return the 338 // next position behind the last found character! 339 // "a b c" find "b" must return 2,3 and not 2,4!!! 340 if (sres.endOffset[k]) 341 sres.endOffset[k] = offset[sres.endOffset[k]]; 342 } 343 } 344 else 345 { 346 sres = (this->*fnBackward)( in_str, startPos, endPos ); 347 } 348 349 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 350 { 351 SearchResult sres2; 352 353 in_str = OUString(searchStr); 354 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 355 356 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 357 358 if( startPos < searchStr.getLength() ) 359 startPos = FindPosInSeq_Impl( offset, startPos ); 360 else 361 startPos = in_str.getLength(); 362 363 if( endPos ) 364 endPos = FindPosInSeq_Impl( offset, endPos ); 365 366 bUsePrimarySrchStr = false; 367 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 368 369 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 370 { 371 if (sres2.startOffset[k]) 372 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 373 if (sres2.endOffset[k]) 374 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 375 } 376 377 // pick last and long one 378 if ( sres.subRegExpressions == 0 ) 379 return sres2; 380 if ( sres2.subRegExpressions == 1 ) 381 { 382 if ( sres.startOffset[0] < sres2.startOffset[0] ) 383 return sres2; 384 if ( sres.startOffset[0] == sres2.startOffset[0] && 385 sres.endOffset[0] > sres2.endOffset[0] ) 386 return sres2; 387 } 388 } 389 390 return sres; 391 } 392 393 //--------------------------------------------------------------------- 394 395 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 396 { 397 bool bRet = 1; 398 if( '\x7f' != rStr[nPos]) 399 { 400 if ( !xCharClass.is() ) 401 { 402 Reference < XInterface > xI = xMSF->createInstance( 403 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 404 if( xI.is() ) 405 xI->queryInterface( ::getCppuType( 406 (const Reference< XCharacterClassification >*)0)) 407 >>= xCharClass; 408 } 409 if ( xCharClass.is() ) 410 { 411 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 412 aSrchPara.Locale ); 413 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 414 KCharacterType::LETTER ) & nCType ) ) 415 bRet = 0; 416 } 417 } 418 return bRet; 419 } 420 421 // --------- helper methods for Boyer-Moore like text searching ---------- 422 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 423 424 void TextSearch::MakeForwardTab() 425 { 426 // create the jumptable for the search text 427 if( pJumpTable ) 428 { 429 if( bIsForwardTab ) 430 return ; // the jumpTable is ok 431 delete pJumpTable; 432 } 433 bIsForwardTab = true; 434 435 sal_Int32 n, nLen = sSrchStr.getLength(); 436 pJumpTable = new TextSearchJumpTable; 437 438 for( n = 0; n < nLen - 1; ++n ) 439 { 440 sal_Unicode cCh = sSrchStr[n]; 441 sal_Int32 nDiff = nLen - n - 1; 442 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 443 444 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 445 pJumpTable->insert( aEntry ); 446 if ( !aPair.second ) 447 (*(aPair.first)).second = nDiff; 448 } 449 } 450 451 void TextSearch::MakeForwardTab2() 452 { 453 // create the jumptable for the search text 454 if( pJumpTable2 ) 455 { 456 if( bIsForwardTab ) 457 return ; // the jumpTable is ok 458 delete pJumpTable2; 459 } 460 bIsForwardTab = true; 461 462 sal_Int32 n, nLen = sSrchStr2.getLength(); 463 pJumpTable2 = new TextSearchJumpTable; 464 465 for( n = 0; n < nLen - 1; ++n ) 466 { 467 sal_Unicode cCh = sSrchStr2[n]; 468 sal_Int32 nDiff = nLen - n - 1; 469 470 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 471 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 472 pJumpTable2->insert( aEntry ); 473 if ( !aPair.second ) 474 (*(aPair.first)).second = nDiff; 475 } 476 } 477 478 void TextSearch::MakeBackwardTab() 479 { 480 // create the jumptable for the search text 481 if( pJumpTable ) 482 { 483 if( !bIsForwardTab ) 484 return ; // the jumpTable is ok 485 delete pJumpTable; 486 } 487 bIsForwardTab = false; 488 489 sal_Int32 n, nLen = sSrchStr.getLength(); 490 pJumpTable = new TextSearchJumpTable; 491 492 for( n = nLen-1; n > 0; --n ) 493 { 494 sal_Unicode cCh = sSrchStr[n]; 495 TextSearchJumpTable::value_type aEntry( cCh, n ); 496 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 497 pJumpTable->insert( aEntry ); 498 if ( !aPair.second ) 499 (*(aPair.first)).second = n; 500 } 501 } 502 503 void TextSearch::MakeBackwardTab2() 504 { 505 // create the jumptable for the search text 506 if( pJumpTable2 ) 507 { 508 if( !bIsForwardTab ) 509 return ; // the jumpTable is ok 510 delete pJumpTable2; 511 } 512 bIsForwardTab = false; 513 514 sal_Int32 n, nLen = sSrchStr2.getLength(); 515 pJumpTable2 = new TextSearchJumpTable; 516 517 for( n = nLen-1; n > 0; --n ) 518 { 519 sal_Unicode cCh = sSrchStr2[n]; 520 TextSearchJumpTable::value_type aEntry( cCh, n ); 521 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 522 pJumpTable2->insert( aEntry ); 523 if ( !aPair.second ) 524 (*(aPair.first)).second = n; 525 } 526 } 527 528 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 529 { 530 TextSearchJumpTable *pJump; 531 OUString sSearchKey; 532 533 if ( bUsePrimarySrchStr ) { 534 pJump = pJumpTable; 535 sSearchKey = sSrchStr; 536 } else { 537 pJump = pJumpTable2; 538 sSearchKey = sSrchStr2; 539 } 540 541 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 542 if ( iLook == pJump->end() ) 543 return sSearchKey.getLength(); 544 return (*iLook).second; 545 } 546 547 548 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 549 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 550 throw(RuntimeException) 551 { 552 SearchResult aRet; 553 aRet.subRegExpressions = 0; 554 555 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 556 557 OUString aStr( searchStr ); 558 sal_Int32 nSuchIdx = aStr.getLength(); 559 sal_Int32 nEnde = endPos; 560 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 561 return aRet; 562 563 564 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 565 return aRet; 566 567 nEnde -= sSearchKey.getLength(); 568 569 if (bUsePrimarySrchStr) 570 MakeForwardTab(); // create the jumptable 571 else 572 MakeForwardTab2(); 573 574 for (sal_Int32 nCmpIdx = startPos; // start position for the search 575 nCmpIdx <= nEnde; 576 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 577 { 578 // if the match would be the completed cells, skip it. 579 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 580 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 581 continue; 582 583 nSuchIdx = sSearchKey.getLength() - 1; 584 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 585 { 586 if( nSuchIdx == 0 ) 587 { 588 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 589 { 590 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 591 bool bAtStart = !nCmpIdx; 592 bool bAtEnd = nFndEnd == endPos; 593 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 594 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 595 // * 1 -> only one word in the paragraph 596 // * 2 -> at begin of paragraph 597 // * 3 -> at end of paragraph 598 // * 4 -> inside the paragraph 599 if( !( ( bAtStart && bAtEnd ) || // 1 600 ( bAtStart && bDelimBehind ) || // 2 601 ( bAtEnd && bDelimBefore ) || // 3 602 ( bDelimBefore && bDelimBehind ))) // 4 603 break; 604 } 605 606 aRet.subRegExpressions = 1; 607 aRet.startOffset.realloc( 1 ); 608 aRet.startOffset[ 0 ] = nCmpIdx; 609 aRet.endOffset.realloc( 1 ); 610 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 611 612 return aRet; 613 } 614 else 615 nSuchIdx--; 616 } 617 } 618 return aRet; 619 } 620 621 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 622 throw(RuntimeException) 623 { 624 SearchResult aRet; 625 aRet.subRegExpressions = 0; 626 627 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 628 629 OUString aStr( searchStr ); 630 sal_Int32 nSuchIdx = aStr.getLength(); 631 sal_Int32 nEnde = endPos; 632 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 633 return aRet; 634 635 if (bUsePrimarySrchStr) 636 MakeBackwardTab(); // create the jumptable 637 else 638 MakeBackwardTab2(); 639 640 if( nEnde == nSuchIdx ) // end position for the search 641 nEnde = sSearchKey.getLength(); 642 else 643 nEnde += sSearchKey.getLength(); 644 645 sal_Int32 nCmpIdx = startPos; // start position for the search 646 647 while (nCmpIdx >= nEnde) 648 { 649 // if the match would be the completed cells, skip it. 650 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 651 sSearchKey.getLength() )) && (!checkCTLEnd || 652 isCellStart( aStr, nCmpIdx))) 653 { 654 nSuchIdx = 0; 655 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 656 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 657 nSuchIdx++; 658 if( nSuchIdx >= sSearchKey.getLength() ) 659 { 660 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 661 { 662 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 663 bool bAtStart = !nFndStt; 664 bool bAtEnd = nCmpIdx == startPos; 665 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 666 bool bDelimBefore = bAtStart || // begin of paragraph 667 IsDelimiter( aStr, nFndStt-1 ); 668 // * 1 -> only one word in the paragraph 669 // * 2 -> at begin of paragraph 670 // * 3 -> at end of paragraph 671 // * 4 -> inside the paragraph 672 if( ( bAtStart && bAtEnd ) || // 1 673 ( bAtStart && bDelimBehind ) || // 2 674 ( bAtEnd && bDelimBefore ) || // 3 675 ( bDelimBefore && bDelimBehind )) // 4 676 { 677 aRet.subRegExpressions = 1; 678 aRet.startOffset.realloc( 1 ); 679 aRet.startOffset[ 0 ] = nCmpIdx; 680 aRet.endOffset.realloc( 1 ); 681 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 682 return aRet; 683 } 684 } 685 else 686 { 687 aRet.subRegExpressions = 1; 688 aRet.startOffset.realloc( 1 ); 689 aRet.startOffset[ 0 ] = nCmpIdx; 690 aRet.endOffset.realloc( 1 ); 691 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 692 return aRet; 693 } 694 } 695 } 696 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 697 if( nCmpIdx < nSuchIdx ) 698 return aRet; 699 nCmpIdx -= nSuchIdx; 700 } 701 return aRet; 702 } 703 704 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 705 { 706 // select the transliterated pattern string 707 const OUString& rPatternStr = 708 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr 709 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 710 711 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 712 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 713 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 714 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 715 // REG_NOSUB is not used anywhere => not implemented 716 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 717 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 718 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it??? 719 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0) 720 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 721 UErrorCode nIcuErr = U_ZERO_ERROR; 722 // assumption: transliteration didn't mangle regexp control chars 723 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 724 #ifndef DISABLE_WORDBOUND_EMULATION 725 // for conveniance specific syntax elements of the old regex engine are emulated 726 // - by replacing \< with "word-break followed by a look-ahead word-char" 727 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 728 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 729 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 730 aChevronMatcherB.reset( aIcuSearchPatStr); 731 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 732 aChevronMatcherB.reset(); 733 // - by replacing \> with "look-behind word-char followed by a word-break" 734 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 735 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 736 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 737 aChevronMatcherE.reset( aIcuSearchPatStr); 738 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 739 aChevronMatcherE.reset(); 740 #endif 741 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 742 if( nIcuErr) 743 { delete pRegexMatcher; pRegexMatcher = NULL;} 744 } 745 746 //--------------------------------------------------------------------------- 747 748 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 749 sal_Int32 startPos, sal_Int32 endPos ) 750 throw(RuntimeException) 751 { 752 SearchResult aRet; 753 aRet.subRegExpressions = 0; 754 if( !pRegexMatcher) 755 return aRet; 756 757 if( endPos > searchStr.getLength()) 758 endPos = searchStr.getLength(); 759 760 // use the ICU RegexMatcher to find the matches 761 UErrorCode nIcuErr = U_ZERO_ERROR; 762 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 763 pRegexMatcher->reset( aSearchTargetStr); 764 // search until there is a valid match 765 for(;;) 766 { 767 if( !pRegexMatcher->find( startPos, nIcuErr)) 768 return aRet; 769 770 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 771 int nStartOfs = pRegexMatcher->start( nIcuErr); 772 int nEndOfs = pRegexMatcher->end( nIcuErr); 773 if( nStartOfs < nEndOfs) 774 break; 775 // try at next position if there was a zero-length match 776 if( ++startPos >= endPos) 777 return aRet; 778 } 779 780 // extract the result of the search 781 const int nGroupCount = pRegexMatcher->groupCount(); 782 aRet.subRegExpressions = nGroupCount + 1; 783 aRet.startOffset.realloc( aRet.subRegExpressions); 784 aRet.endOffset.realloc( aRet.subRegExpressions); 785 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 786 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 787 for( int i = 1; i <= nGroupCount; ++i) { 788 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 789 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 790 } 791 792 return aRet; 793 } 794 795 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 796 sal_Int32 startPos, sal_Int32 endPos ) 797 throw(RuntimeException) 798 { 799 // NOTE: for backwards search callers provide startPos/endPos inverted! 800 SearchResult aRet; 801 aRet.subRegExpressions = 0; 802 if( !pRegexMatcher) 803 return aRet; 804 805 if( startPos > searchStr.getLength()) 806 startPos = searchStr.getLength(); 807 808 // use the ICU RegexMatcher to find the matches 809 // TODO: use ICU's backward searching once it becomes available 810 // as its replacement using forward search is not as good as the real thing 811 UErrorCode nIcuErr = U_ZERO_ERROR; 812 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 813 pRegexMatcher->reset( aSearchTargetStr); 814 if( !pRegexMatcher->find( endPos, nIcuErr)) 815 return aRet; 816 817 // find the last match 818 int nLastPos = 0; 819 int nFoundEnd = 0; 820 do { 821 nLastPos = pRegexMatcher->start( nIcuErr); 822 nFoundEnd = pRegexMatcher->end( nIcuErr); 823 if( nFoundEnd >= startPos) 824 break; 825 if( nFoundEnd == nLastPos) 826 ++nFoundEnd; 827 } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 828 829 // find last match again to get its details 830 pRegexMatcher->find( nLastPos, nIcuErr); 831 832 // fill in the details of the last match 833 const int nGroupCount = pRegexMatcher->groupCount(); 834 aRet.subRegExpressions = nGroupCount + 1; 835 aRet.startOffset.realloc( aRet.subRegExpressions); 836 aRet.endOffset.realloc( aRet.subRegExpressions); 837 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 838 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 839 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 840 for( int i = 1; i <= nGroupCount; ++i) { 841 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 842 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 843 } 844 845 return aRet; 846 } 847 848 //--------------------------------------------------------------------------- 849 850 // search for words phonetically 851 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 852 sal_Int32 startPos, sal_Int32 endPos ) 853 throw(RuntimeException) 854 { 855 SearchResult aRet; 856 aRet.subRegExpressions = 0; 857 858 if( !xBreak.is() ) 859 return aRet; 860 861 OUString aWTemp( searchStr ); 862 863 register sal_Int32 nStt, nEnd; 864 865 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 866 aSrchPara.Locale, 867 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 868 869 do 870 { 871 if( aWBnd.startPos >= endPos ) 872 break; 873 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 874 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 875 876 if( nStt < nEnd && 877 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 878 { 879 aRet.subRegExpressions = 1; 880 aRet.startOffset.realloc( 1 ); 881 aRet.startOffset[ 0 ] = nStt; 882 aRet.endOffset.realloc( 1 ); 883 aRet.endOffset[ 0 ] = nEnd; 884 break; 885 } 886 887 nStt = nEnd - 1; 888 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 889 WordType::ANYWORD_IGNOREWHITESPACES); 890 } while( aWBnd.startPos != aWBnd.endPos || 891 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 892 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 893 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 894 // and nextWord() does also => don't loop forever. 895 return aRet; 896 } 897 898 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 899 sal_Int32 startPos, sal_Int32 endPos ) 900 throw(RuntimeException) 901 { 902 SearchResult aRet; 903 aRet.subRegExpressions = 0; 904 905 if( !xBreak.is() ) 906 return aRet; 907 908 OUString aWTemp( searchStr ); 909 910 register sal_Int32 nStt, nEnd; 911 912 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 913 aSrchPara.Locale, 914 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 915 916 do 917 { 918 if( aWBnd.endPos <= endPos ) 919 break; 920 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 921 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 922 923 if( nStt < nEnd && 924 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 925 { 926 aRet.subRegExpressions = 1; 927 aRet.startOffset.realloc( 1 ); 928 aRet.startOffset[ 0 ] = nEnd; 929 aRet.endOffset.realloc( 1 ); 930 aRet.endOffset[ 0 ] = nStt; 931 break; 932 } 933 if( !nStt ) 934 break; 935 936 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 937 WordType::ANYWORD_IGNOREWHITESPACES); 938 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 939 return aRet; 940 } 941 942 943 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 944 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 945 946 static OUString getServiceName_Static() 947 { 948 return OUString::createFromAscii( cSearchName ); 949 } 950 951 static OUString getImplementationName_Static() 952 { 953 return OUString::createFromAscii( cSearchImpl ); 954 } 955 956 OUString SAL_CALL 957 TextSearch::getImplementationName() 958 throw( RuntimeException ) 959 { 960 return getImplementationName_Static(); 961 } 962 963 sal_Bool SAL_CALL 964 TextSearch::supportsService(const OUString& rServiceName) 965 throw( RuntimeException ) 966 { 967 return !rServiceName.compareToAscii( cSearchName ); 968 } 969 970 Sequence< OUString > SAL_CALL 971 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 972 { 973 Sequence< OUString > aRet(1); 974 aRet[0] = getServiceName_Static(); 975 return aRet; 976 } 977 978 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 979 SAL_CALL TextSearch_CreateInstance( 980 const ::com::sun::star::uno::Reference< 981 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 982 { 983 return ::com::sun::star::uno::Reference< 984 ::com::sun::star::uno::XInterface >( 985 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 986 } 987 988 extern "C" 989 { 990 991 void SAL_CALL component_getImplementationEnvironment( 992 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 993 { 994 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 995 } 996 997 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 998 void* _pServiceManager, void* /*_pRegistryKey*/ ) 999 { 1000 void* pRet = NULL; 1001 1002 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1003 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1004 ( _pServiceManager ); 1005 ::com::sun::star::uno::Reference< 1006 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1007 1008 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1009 { 1010 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1011 aServiceNames[0] = getServiceName_Static(); 1012 xFactory = ::cppu::createSingleFactory( 1013 pServiceManager, getImplementationName_Static(), 1014 &TextSearch_CreateInstance, aServiceNames ); 1015 } 1016 1017 if ( xFactory.is() ) 1018 { 1019 xFactory->acquire(); 1020 pRet = xFactory.get(); 1021 } 1022 1023 return pRet; 1024 } 1025 1026 } // extern "C" 1027