1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK; 68 static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 69 // Above 2 transliteration is simple but need to take effect in 70 // complex transliteration 71 72 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 73 : xMSF( rxMSF ) 74 , pJumpTable( 0 ) 75 , pJumpTable2( 0 ) 76 , pRegexMatcher( NULL ) 77 , pWLD( 0 ) 78 { 79 SearchOptions aOpt; 80 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 81 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 82 //aOpt.Locale = ???; 83 setOptions( aOpt ); 84 } 85 86 TextSearch::~TextSearch() 87 { 88 delete pRegexMatcher; 89 delete pWLD; 90 delete pJumpTable; 91 delete pJumpTable2; 92 } 93 94 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 95 { 96 aSrchPara = rOptions; 97 98 delete pRegexMatcher, pRegexMatcher = NULL; 99 delete pWLD, pWLD = 0; 100 delete pJumpTable, pJumpTable = 0; 101 delete pJumpTable2, pJumpTable2 = 0; 102 103 // Create Transliteration class 104 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 105 { 106 if( !xTranslit.is() ) 107 { 108 Reference < XInterface > xI = xMSF->createInstance( 109 OUString::createFromAscii( 110 "com.sun.star.i18n.Transliteration")); 111 if ( xI.is() ) 112 xI->queryInterface( ::getCppuType( 113 (const Reference< XExtendedTransliteration >*)0)) 114 >>= xTranslit; 115 } 116 // Load transliteration module 117 if( xTranslit.is() ) 118 xTranslit->loadModule( 119 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 120 aSrchPara.Locale); 121 } 122 else if( xTranslit.is() ) 123 xTranslit = 0; 124 125 // Create Transliteration for 2<->1, 2<->2 transliteration 126 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 127 { 128 if( !xTranslit2.is() ) 129 { 130 Reference < XInterface > xI = xMSF->createInstance( 131 OUString::createFromAscii( 132 "com.sun.star.i18n.Transliteration")); 133 if ( xI.is() ) 134 xI->queryInterface( ::getCppuType( 135 (const Reference< XExtendedTransliteration >*)0)) 136 >>= xTranslit2; 137 } 138 // Load transliteration module 139 if( xTranslit2.is() ) 140 xTranslit2->loadModule( 141 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 142 aSrchPara.Locale); 143 } 144 145 if ( !xBreak.is() ) 146 { 147 Reference < XInterface > xI = xMSF->createInstance( 148 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 149 if( xI.is() ) 150 xI->queryInterface( ::getCppuType( 151 (const Reference< XBreakIterator >*)0)) 152 >>= xBreak; 153 } 154 155 sSrchStr = aSrchPara.searchString; 156 157 // use transliteration here 158 if ( xTranslit.is() && 159 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 160 sSrchStr = xTranslit->transliterateString2String( 161 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 162 163 if ( xTranslit2.is() && 164 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 165 sSrchStr2 = xTranslit2->transliterateString2String( 166 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 167 168 // When start or end of search string is a complex script type, we need to 169 // make sure the result boundary is not located in the middle of cell. 170 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 171 ScriptType::COMPLEX)); 172 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 173 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 174 175 switch( aSrchPara.algorithmType) 176 { 177 case SearchAlgorithms_REGEXP: 178 fnForward = &TextSearch::RESrchFrwrd; 179 fnBackward = &TextSearch::RESrchBkwrd; 180 RESrchPrepare( aSrchPara); 181 break; 182 183 case SearchAlgorithms_APPROXIMATE: 184 fnForward = &TextSearch::ApproxSrchFrwrd; 185 fnBackward = &TextSearch::ApproxSrchBkwrd; 186 187 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 188 aSrchPara.insertedChars, aSrchPara.deletedChars, 189 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 190 191 nLimit = pWLD->GetLimit(); 192 break; 193 194 default: 195 fnForward = &TextSearch::NSrchFrwrd; 196 fnBackward = &TextSearch::NSrchBkwrd; 197 break; 198 } 199 } 200 201 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 202 { 203 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 204 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 205 return nRet; 206 } 207 208 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 209 throw( RuntimeException ) 210 { 211 sal_Int32 nDone; 212 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 213 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 214 } 215 216 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 217 throw( RuntimeException ) 218 { 219 SearchResult sres; 220 221 OUString in_str(searchStr); 222 sal_Int32 newStartPos = startPos; 223 sal_Int32 newEndPos = endPos; 224 225 bUsePrimarySrchStr = true; 226 227 if ( xTranslit.is() ) 228 { 229 // apply normal transliteration (1<->1, 1<->0) 230 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 231 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 232 233 // JP 20.6.2001: also the start and end positions must be corrected! 234 if( startPos ) 235 newStartPos = FindPosInSeq_Impl( offset, startPos ); 236 237 if( endPos < searchStr.getLength() ) 238 newEndPos = FindPosInSeq_Impl( offset, endPos ); 239 else 240 newEndPos = in_str.getLength(); 241 242 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 243 244 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 245 { 246 if (sres.startOffset[k]) 247 sres.startOffset[k] = offset[sres.startOffset[k]]; 248 // JP 20.6.2001: end is ever exclusive and then don't return 249 // the position of the next character - return the 250 // next position behind the last found character! 251 // "a b c" find "b" must return 2,3 and not 2,4!!! 252 if (sres.endOffset[k]) 253 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 254 } 255 } 256 else 257 { 258 sres = (this->*fnForward)( in_str, startPos, endPos ); 259 } 260 261 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 262 { 263 SearchResult sres2; 264 265 in_str = OUString(searchStr); 266 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 267 268 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 269 270 if( startPos ) 271 startPos = FindPosInSeq_Impl( offset, startPos ); 272 273 if( endPos < searchStr.getLength() ) 274 endPos = FindPosInSeq_Impl( offset, endPos ); 275 else 276 endPos = in_str.getLength(); 277 278 bUsePrimarySrchStr = false; 279 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 280 281 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 282 { 283 if (sres2.startOffset[k]) 284 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 285 if (sres2.endOffset[k]) 286 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 287 } 288 289 // pick first and long one 290 if ( sres.subRegExpressions == 0) 291 return sres2; 292 if ( sres2.subRegExpressions == 1) 293 { 294 if ( sres.startOffset[0] > sres2.startOffset[0]) 295 return sres2; 296 else if ( sres.startOffset[0] == sres2.startOffset[0] && 297 sres.endOffset[0] < sres2.endOffset[0]) 298 return sres2; 299 } 300 } 301 302 return sres; 303 } 304 305 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 306 throw(RuntimeException) 307 { 308 SearchResult sres; 309 310 OUString in_str(searchStr); 311 sal_Int32 newStartPos = startPos; 312 sal_Int32 newEndPos = endPos; 313 314 bUsePrimarySrchStr = true; 315 316 if ( xTranslit.is() ) 317 { 318 // apply only simple 1<->1 transliteration here 319 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 320 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 321 322 // JP 20.6.2001: also the start and end positions must be corrected! 323 if( startPos < searchStr.getLength() ) 324 newStartPos = FindPosInSeq_Impl( offset, startPos ); 325 else 326 newStartPos = in_str.getLength(); 327 328 if( endPos ) 329 newEndPos = FindPosInSeq_Impl( offset, endPos ); 330 331 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 332 333 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 334 { 335 if (sres.startOffset[k]) 336 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 337 // JP 20.6.2001: end is ever exclusive and then don't return 338 // the position of the next character - return the 339 // next position behind the last found character! 340 // "a b c" find "b" must return 2,3 and not 2,4!!! 341 if (sres.endOffset[k]) 342 sres.endOffset[k] = offset[sres.endOffset[k]]; 343 } 344 } 345 else 346 { 347 sres = (this->*fnBackward)( in_str, startPos, endPos ); 348 } 349 350 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 351 { 352 SearchResult sres2; 353 354 in_str = OUString(searchStr); 355 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 356 357 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 358 359 if( startPos < searchStr.getLength() ) 360 startPos = FindPosInSeq_Impl( offset, startPos ); 361 else 362 startPos = in_str.getLength(); 363 364 if( endPos ) 365 endPos = FindPosInSeq_Impl( offset, endPos ); 366 367 bUsePrimarySrchStr = false; 368 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 369 370 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 371 { 372 if (sres2.startOffset[k]) 373 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 374 if (sres2.endOffset[k]) 375 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 376 } 377 378 // pick last and long one 379 if ( sres.subRegExpressions == 0 ) 380 return sres2; 381 if ( sres2.subRegExpressions == 1 ) 382 { 383 if ( sres.startOffset[0] < sres2.startOffset[0] ) 384 return sres2; 385 if ( sres.startOffset[0] == sres2.startOffset[0] && 386 sres.endOffset[0] > sres2.endOffset[0] ) 387 return sres2; 388 } 389 } 390 391 return sres; 392 } 393 394 //--------------------------------------------------------------------- 395 396 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 397 { 398 bool bRet = 1; 399 if( '\x7f' != rStr[nPos]) 400 { 401 if ( !xCharClass.is() ) 402 { 403 Reference < XInterface > xI = xMSF->createInstance( 404 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 405 if( xI.is() ) 406 xI->queryInterface( ::getCppuType( 407 (const Reference< XCharacterClassification >*)0)) 408 >>= xCharClass; 409 } 410 if ( xCharClass.is() ) 411 { 412 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 413 aSrchPara.Locale ); 414 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 415 KCharacterType::LETTER ) & nCType ) ) 416 bRet = 0; 417 } 418 } 419 return bRet; 420 } 421 422 // --------- helper methods for Boyer-Moore like text searching ---------- 423 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 424 425 void TextSearch::MakeForwardTab() 426 { 427 // create the jumptable for the search text 428 if( pJumpTable ) 429 { 430 if( bIsForwardTab ) 431 return ; // the jumpTable is ok 432 delete pJumpTable; 433 } 434 bIsForwardTab = true; 435 436 sal_Int32 n, nLen = sSrchStr.getLength(); 437 pJumpTable = new TextSearchJumpTable; 438 439 for( n = 0; n < nLen - 1; ++n ) 440 { 441 sal_Unicode cCh = sSrchStr[n]; 442 sal_Int32 nDiff = nLen - n - 1; 443 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 444 445 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 446 pJumpTable->insert( aEntry ); 447 if ( !aPair.second ) 448 (*(aPair.first)).second = nDiff; 449 } 450 } 451 452 void TextSearch::MakeForwardTab2() 453 { 454 // create the jumptable for the search text 455 if( pJumpTable2 ) 456 { 457 if( bIsForwardTab ) 458 return ; // the jumpTable is ok 459 delete pJumpTable2; 460 } 461 bIsForwardTab = true; 462 463 sal_Int32 n, nLen = sSrchStr2.getLength(); 464 pJumpTable2 = new TextSearchJumpTable; 465 466 for( n = 0; n < nLen - 1; ++n ) 467 { 468 sal_Unicode cCh = sSrchStr2[n]; 469 sal_Int32 nDiff = nLen - n - 1; 470 471 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 472 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 473 pJumpTable2->insert( aEntry ); 474 if ( !aPair.second ) 475 (*(aPair.first)).second = nDiff; 476 } 477 } 478 479 void TextSearch::MakeBackwardTab() 480 { 481 // create the jumptable for the search text 482 if( pJumpTable ) 483 { 484 if( !bIsForwardTab ) 485 return ; // the jumpTable is ok 486 delete pJumpTable; 487 } 488 bIsForwardTab = false; 489 490 sal_Int32 n, nLen = sSrchStr.getLength(); 491 pJumpTable = new TextSearchJumpTable; 492 493 for( n = nLen-1; n > 0; --n ) 494 { 495 sal_Unicode cCh = sSrchStr[n]; 496 TextSearchJumpTable::value_type aEntry( cCh, n ); 497 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 498 pJumpTable->insert( aEntry ); 499 if ( !aPair.second ) 500 (*(aPair.first)).second = n; 501 } 502 } 503 504 void TextSearch::MakeBackwardTab2() 505 { 506 // create the jumptable for the search text 507 if( pJumpTable2 ) 508 { 509 if( !bIsForwardTab ) 510 return ; // the jumpTable is ok 511 delete pJumpTable2; 512 } 513 bIsForwardTab = false; 514 515 sal_Int32 n, nLen = sSrchStr2.getLength(); 516 pJumpTable2 = new TextSearchJumpTable; 517 518 for( n = nLen-1; n > 0; --n ) 519 { 520 sal_Unicode cCh = sSrchStr2[n]; 521 TextSearchJumpTable::value_type aEntry( cCh, n ); 522 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 523 pJumpTable2->insert( aEntry ); 524 if ( !aPair.second ) 525 (*(aPair.first)).second = n; 526 } 527 } 528 529 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 530 { 531 TextSearchJumpTable *pJump; 532 OUString sSearchKey; 533 534 if ( bUsePrimarySrchStr ) { 535 pJump = pJumpTable; 536 sSearchKey = sSrchStr; 537 } else { 538 pJump = pJumpTable2; 539 sSearchKey = sSrchStr2; 540 } 541 542 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 543 if ( iLook == pJump->end() ) 544 return sSearchKey.getLength(); 545 return (*iLook).second; 546 } 547 548 549 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 550 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 551 throw(RuntimeException) 552 { 553 SearchResult aRet; 554 aRet.subRegExpressions = 0; 555 556 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 557 558 OUString aStr( searchStr ); 559 sal_Int32 nSuchIdx = aStr.getLength(); 560 sal_Int32 nEnde = endPos; 561 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 562 return aRet; 563 564 565 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 566 return aRet; 567 568 nEnde -= sSearchKey.getLength(); 569 570 if (bUsePrimarySrchStr) 571 MakeForwardTab(); // create the jumptable 572 else 573 MakeForwardTab2(); 574 575 for (sal_Int32 nCmpIdx = startPos; // start position for the search 576 nCmpIdx <= nEnde; 577 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 578 { 579 // if the match would be the completed cells, skip it. 580 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 581 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 582 continue; 583 584 nSuchIdx = sSearchKey.getLength() - 1; 585 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 586 { 587 if( nSuchIdx == 0 ) 588 { 589 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 590 { 591 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 592 bool bAtStart = !nCmpIdx; 593 bool bAtEnd = nFndEnd == endPos; 594 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 595 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 596 // * 1 -> only one word in the paragraph 597 // * 2 -> at begin of paragraph 598 // * 3 -> at end of paragraph 599 // * 4 -> inside the paragraph 600 if( !( ( bAtStart && bAtEnd ) || // 1 601 ( bAtStart && bDelimBehind ) || // 2 602 ( bAtEnd && bDelimBefore ) || // 3 603 ( bDelimBefore && bDelimBehind ))) // 4 604 break; 605 } 606 607 aRet.subRegExpressions = 1; 608 aRet.startOffset.realloc( 1 ); 609 aRet.startOffset[ 0 ] = nCmpIdx; 610 aRet.endOffset.realloc( 1 ); 611 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 612 613 return aRet; 614 } 615 else 616 nSuchIdx--; 617 } 618 } 619 return aRet; 620 } 621 622 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 623 throw(RuntimeException) 624 { 625 SearchResult aRet; 626 aRet.subRegExpressions = 0; 627 628 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 629 630 OUString aStr( searchStr ); 631 sal_Int32 nSuchIdx = aStr.getLength(); 632 sal_Int32 nEnde = endPos; 633 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 634 return aRet; 635 636 if (bUsePrimarySrchStr) 637 MakeBackwardTab(); // create the jumptable 638 else 639 MakeBackwardTab2(); 640 641 if( nEnde == nSuchIdx ) // end position for the search 642 nEnde = sSearchKey.getLength(); 643 else 644 nEnde += sSearchKey.getLength(); 645 646 sal_Int32 nCmpIdx = startPos; // start position for the search 647 648 while (nCmpIdx >= nEnde) 649 { 650 // if the match would be the completed cells, skip it. 651 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 652 sSearchKey.getLength() )) && (!checkCTLEnd || 653 isCellStart( aStr, nCmpIdx))) 654 { 655 nSuchIdx = 0; 656 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 657 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 658 nSuchIdx++; 659 if( nSuchIdx >= sSearchKey.getLength() ) 660 { 661 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 662 { 663 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 664 bool bAtStart = !nFndStt; 665 bool bAtEnd = nCmpIdx == startPos; 666 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 667 bool bDelimBefore = bAtStart || // begin of paragraph 668 IsDelimiter( aStr, nFndStt-1 ); 669 // * 1 -> only one word in the paragraph 670 // * 2 -> at begin of paragraph 671 // * 3 -> at end of paragraph 672 // * 4 -> inside the paragraph 673 if( ( bAtStart && bAtEnd ) || // 1 674 ( bAtStart && bDelimBehind ) || // 2 675 ( bAtEnd && bDelimBefore ) || // 3 676 ( bDelimBefore && bDelimBehind )) // 4 677 { 678 aRet.subRegExpressions = 1; 679 aRet.startOffset.realloc( 1 ); 680 aRet.startOffset[ 0 ] = nCmpIdx; 681 aRet.endOffset.realloc( 1 ); 682 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 683 return aRet; 684 } 685 } 686 else 687 { 688 aRet.subRegExpressions = 1; 689 aRet.startOffset.realloc( 1 ); 690 aRet.startOffset[ 0 ] = nCmpIdx; 691 aRet.endOffset.realloc( 1 ); 692 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 693 return aRet; 694 } 695 } 696 } 697 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 698 if( nCmpIdx < nSuchIdx ) 699 return aRet; 700 nCmpIdx -= nSuchIdx; 701 } 702 return aRet; 703 } 704 705 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 706 { 707 // select the transliterated pattern string 708 const OUString& rPatternStr = 709 (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr 710 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 711 712 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 713 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 714 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 715 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 716 // REG_NOSUB is not used anywhere => not implemented 717 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 718 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 719 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO 720 // probably because the transliteration flag IGNORE_CASE handles it as well. 721 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 722 || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) 723 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 724 UErrorCode nIcuErr = U_ZERO_ERROR; 725 // assumption: transliteration didn't mangle regexp control chars 726 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 727 #ifndef DISABLE_WORDBOUND_EMULATION 728 // for conveniance specific syntax elements of the old regex engine are emulated 729 // - by replacing \< with "word-break followed by a look-ahead word-char" 730 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 731 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 732 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 733 aChevronMatcherB.reset( aIcuSearchPatStr); 734 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 735 aChevronMatcherB.reset(); 736 // - by replacing \> with "look-behind word-char followed by a word-break" 737 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 738 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 739 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 740 aChevronMatcherE.reset( aIcuSearchPatStr); 741 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 742 aChevronMatcherE.reset(); 743 #endif 744 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 745 if( nIcuErr) 746 { delete pRegexMatcher; pRegexMatcher = NULL;} 747 } 748 749 //--------------------------------------------------------------------------- 750 751 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 752 sal_Int32 startPos, sal_Int32 endPos ) 753 throw(RuntimeException) 754 { 755 SearchResult aRet; 756 aRet.subRegExpressions = 0; 757 if( !pRegexMatcher) 758 return aRet; 759 760 if( endPos > searchStr.getLength()) 761 endPos = searchStr.getLength(); 762 763 // use the ICU RegexMatcher to find the matches 764 UErrorCode nIcuErr = U_ZERO_ERROR; 765 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 766 pRegexMatcher->reset( aSearchTargetStr); 767 // search until there is a valid match 768 for(;;) 769 { 770 if( !pRegexMatcher->find( startPos, nIcuErr)) 771 return aRet; 772 773 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 774 int nStartOfs = pRegexMatcher->start( nIcuErr); 775 int nEndOfs = pRegexMatcher->end( nIcuErr); 776 if( nStartOfs < nEndOfs) 777 break; 778 // try at next position if there was a zero-length match 779 if( ++startPos >= endPos) 780 return aRet; 781 } 782 783 // extract the result of the search 784 const int nGroupCount = pRegexMatcher->groupCount(); 785 aRet.subRegExpressions = nGroupCount + 1; 786 aRet.startOffset.realloc( aRet.subRegExpressions); 787 aRet.endOffset.realloc( aRet.subRegExpressions); 788 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 789 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 790 for( int i = 1; i <= nGroupCount; ++i) { 791 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 792 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 793 } 794 795 return aRet; 796 } 797 798 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 799 sal_Int32 startPos, sal_Int32 endPos ) 800 throw(RuntimeException) 801 { 802 // NOTE: for backwards search callers provide startPos/endPos inverted! 803 SearchResult aRet; 804 aRet.subRegExpressions = 0; 805 if( !pRegexMatcher) 806 return aRet; 807 808 if( startPos > searchStr.getLength()) 809 startPos = searchStr.getLength(); 810 811 // use the ICU RegexMatcher to find the matches 812 // TODO: use ICU's backward searching once it becomes available 813 // as its replacement using forward search is not as good as the real thing 814 UErrorCode nIcuErr = U_ZERO_ERROR; 815 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 816 pRegexMatcher->reset( aSearchTargetStr); 817 if( !pRegexMatcher->find( endPos, nIcuErr)) 818 return aRet; 819 820 // find the last match 821 int nLastPos = 0; 822 int nFoundEnd = 0; 823 do { 824 nLastPos = pRegexMatcher->start( nIcuErr); 825 nFoundEnd = pRegexMatcher->end( nIcuErr); 826 if( nFoundEnd >= startPos) 827 break; 828 if( nFoundEnd == nLastPos) 829 ++nFoundEnd; 830 } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 831 832 // find last match again to get its details 833 pRegexMatcher->find( nLastPos, nIcuErr); 834 835 // fill in the details of the last match 836 const int nGroupCount = pRegexMatcher->groupCount(); 837 aRet.subRegExpressions = nGroupCount + 1; 838 aRet.startOffset.realloc( aRet.subRegExpressions); 839 aRet.endOffset.realloc( aRet.subRegExpressions); 840 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 841 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 842 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 843 for( int i = 1; i <= nGroupCount; ++i) { 844 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 845 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 846 } 847 848 return aRet; 849 } 850 851 //--------------------------------------------------------------------------- 852 853 // search for words phonetically 854 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 855 sal_Int32 startPos, sal_Int32 endPos ) 856 throw(RuntimeException) 857 { 858 SearchResult aRet; 859 aRet.subRegExpressions = 0; 860 861 if( !xBreak.is() ) 862 return aRet; 863 864 OUString aWTemp( searchStr ); 865 866 register sal_Int32 nStt, nEnd; 867 868 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 869 aSrchPara.Locale, 870 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 871 872 do 873 { 874 if( aWBnd.startPos >= endPos ) 875 break; 876 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 877 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 878 879 if( nStt < nEnd && 880 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 881 { 882 aRet.subRegExpressions = 1; 883 aRet.startOffset.realloc( 1 ); 884 aRet.startOffset[ 0 ] = nStt; 885 aRet.endOffset.realloc( 1 ); 886 aRet.endOffset[ 0 ] = nEnd; 887 break; 888 } 889 890 nStt = nEnd - 1; 891 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 892 WordType::ANYWORD_IGNOREWHITESPACES); 893 } while( aWBnd.startPos != aWBnd.endPos || 894 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 895 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 896 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 897 // and nextWord() does also => don't loop forever. 898 return aRet; 899 } 900 901 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 902 sal_Int32 startPos, sal_Int32 endPos ) 903 throw(RuntimeException) 904 { 905 SearchResult aRet; 906 aRet.subRegExpressions = 0; 907 908 if( !xBreak.is() ) 909 return aRet; 910 911 OUString aWTemp( searchStr ); 912 913 register sal_Int32 nStt, nEnd; 914 915 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 916 aSrchPara.Locale, 917 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 918 919 do 920 { 921 if( aWBnd.endPos <= endPos ) 922 break; 923 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 924 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 925 926 if( nStt < nEnd && 927 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 928 { 929 aRet.subRegExpressions = 1; 930 aRet.startOffset.realloc( 1 ); 931 aRet.startOffset[ 0 ] = nEnd; 932 aRet.endOffset.realloc( 1 ); 933 aRet.endOffset[ 0 ] = nStt; 934 break; 935 } 936 if( !nStt ) 937 break; 938 939 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 940 WordType::ANYWORD_IGNOREWHITESPACES); 941 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 942 return aRet; 943 } 944 945 946 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 947 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 948 949 static OUString getServiceName_Static() 950 { 951 return OUString::createFromAscii( cSearchName ); 952 } 953 954 static OUString getImplementationName_Static() 955 { 956 return OUString::createFromAscii( cSearchImpl ); 957 } 958 959 OUString SAL_CALL 960 TextSearch::getImplementationName() 961 throw( RuntimeException ) 962 { 963 return getImplementationName_Static(); 964 } 965 966 sal_Bool SAL_CALL 967 TextSearch::supportsService(const OUString& rServiceName) 968 throw( RuntimeException ) 969 { 970 return !rServiceName.compareToAscii( cSearchName ); 971 } 972 973 Sequence< OUString > SAL_CALL 974 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 975 { 976 Sequence< OUString > aRet(1); 977 aRet[0] = getServiceName_Static(); 978 return aRet; 979 } 980 981 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 982 SAL_CALL TextSearch_CreateInstance( 983 const ::com::sun::star::uno::Reference< 984 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 985 { 986 return ::com::sun::star::uno::Reference< 987 ::com::sun::star::uno::XInterface >( 988 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 989 } 990 991 extern "C" 992 { 993 994 void SAL_CALL component_getImplementationEnvironment( 995 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 996 { 997 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 998 } 999 1000 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 1001 void* _pServiceManager, void* /*_pRegistryKey*/ ) 1002 { 1003 void* pRet = NULL; 1004 1005 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1006 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1007 ( _pServiceManager ); 1008 ::com::sun::star::uno::Reference< 1009 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1010 1011 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1012 { 1013 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1014 aServiceNames[0] = getServiceName_Static(); 1015 xFactory = ::cppu::createSingleFactory( 1016 pServiceManager, getImplementationName_Static(), 1017 &TextSearch_CreateInstance, aServiceNames ); 1018 } 1019 1020 if ( xFactory.is() ) 1021 { 1022 xFactory->acquire(); 1023 pRet = xFactory.get(); 1024 } 1025 1026 return pRet; 1027 } 1028 1029 } // extern "C" 1030