1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 SIMPLE_TRANS_MASK = TransliterationModules_HIRAGANA_KATAKANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 68 // Above 2 transliteration is simple but need to take effect in 69 // complex transliteration 70 71 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 72 : xMSF( rxMSF ) 73 , pJumpTable( 0 ) 74 , pJumpTable2( 0 ) 75 , pRegexMatcher( NULL ) 76 , pWLD( 0 ) 77 { 78 SearchOptions aOpt; 79 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 80 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 81 //aOpt.Locale = ???; 82 setOptions( aOpt ); 83 } 84 85 TextSearch::~TextSearch() 86 { 87 delete pRegexMatcher; 88 delete pWLD; 89 delete pJumpTable; 90 delete pJumpTable2; 91 } 92 93 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 94 { 95 aSrchPara = rOptions; 96 97 delete pRegexMatcher, pRegexMatcher = NULL; 98 delete pWLD, pWLD = 0; 99 delete pJumpTable, pJumpTable = 0; 100 delete pJumpTable2, pJumpTable2 = 0; 101 102 // Create Transliteration class 103 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 104 { 105 if( !xTranslit.is() ) 106 { 107 Reference < XInterface > xI = xMSF->createInstance( 108 OUString::createFromAscii( 109 "com.sun.star.i18n.Transliteration")); 110 if ( xI.is() ) 111 xI->queryInterface( ::getCppuType( 112 (const Reference< XExtendedTransliteration >*)0)) 113 >>= xTranslit; 114 } 115 // Load transliteration module 116 if( xTranslit.is() ) 117 xTranslit->loadModule( 118 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 119 aSrchPara.Locale); 120 } 121 else if( xTranslit.is() ) 122 xTranslit = 0; 123 124 // Create Transliteration for 2<->1, 2<->2 transliteration 125 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 126 { 127 if( !xTranslit2.is() ) 128 { 129 Reference < XInterface > xI = xMSF->createInstance( 130 OUString::createFromAscii( 131 "com.sun.star.i18n.Transliteration")); 132 if ( xI.is() ) 133 xI->queryInterface( ::getCppuType( 134 (const Reference< XExtendedTransliteration >*)0)) 135 >>= xTranslit2; 136 } 137 // Load transliteration module 138 if( xTranslit2.is() ) 139 xTranslit2->loadModule( 140 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 141 aSrchPara.Locale); 142 } 143 144 if ( !xBreak.is() ) 145 { 146 Reference < XInterface > xI = xMSF->createInstance( 147 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 148 if( xI.is() ) 149 xI->queryInterface( ::getCppuType( 150 (const Reference< XBreakIterator >*)0)) 151 >>= xBreak; 152 } 153 154 sSrchStr = aSrchPara.searchString; 155 156 // use transliteration here 157 if ( xTranslit.is() && 158 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 159 sSrchStr = xTranslit->transliterateString2String( 160 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 161 162 if ( xTranslit2.is() && 163 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 164 sSrchStr2 = xTranslit2->transliterateString2String( 165 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 166 167 // When start or end of search string is a complex script type, we need to 168 // make sure the result boundary is not located in the middle of cell. 169 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 170 ScriptType::COMPLEX)); 171 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 172 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 173 174 switch( aSrchPara.algorithmType) 175 { 176 case SearchAlgorithms_REGEXP: 177 fnForward = &TextSearch::RESrchFrwrd; 178 fnBackward = &TextSearch::RESrchBkwrd; 179 RESrchPrepare( aSrchPara); 180 break; 181 182 case SearchAlgorithms_APPROXIMATE: 183 fnForward = &TextSearch::ApproxSrchFrwrd; 184 fnBackward = &TextSearch::ApproxSrchBkwrd; 185 186 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 187 aSrchPara.insertedChars, aSrchPara.deletedChars, 188 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 189 190 nLimit = pWLD->GetLimit(); 191 break; 192 193 default: 194 fnForward = &TextSearch::NSrchFrwrd; 195 fnBackward = &TextSearch::NSrchBkwrd; 196 break; 197 } 198 } 199 200 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 201 { 202 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 203 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 204 return nRet; 205 } 206 207 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 208 throw( RuntimeException ) 209 { 210 sal_Int32 nDone; 211 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 212 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 213 } 214 215 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 216 throw( RuntimeException ) 217 { 218 SearchResult sres; 219 220 OUString in_str(searchStr); 221 sal_Int32 newStartPos = startPos; 222 sal_Int32 newEndPos = endPos; 223 224 bUsePrimarySrchStr = true; 225 226 if ( xTranslit.is() ) 227 { 228 // apply normal transliteration (1<->1, 1<->0) 229 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 230 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 231 232 // JP 20.6.2001: also the start and end positions must be corrected! 233 if( startPos ) 234 newStartPos = FindPosInSeq_Impl( offset, startPos ); 235 236 if( endPos < searchStr.getLength() ) 237 newEndPos = FindPosInSeq_Impl( offset, endPos ); 238 else 239 newEndPos = in_str.getLength(); 240 241 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 242 243 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 244 { 245 if (sres.startOffset[k]) 246 sres.startOffset[k] = offset[sres.startOffset[k]]; 247 // JP 20.6.2001: end is ever exclusive and then don't return 248 // the position of the next character - return the 249 // next position behind the last found character! 250 // "a b c" find "b" must return 2,3 and not 2,4!!! 251 if (sres.endOffset[k]) 252 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 253 } 254 } 255 else 256 { 257 sres = (this->*fnForward)( in_str, startPos, endPos ); 258 } 259 260 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 261 { 262 SearchResult sres2; 263 264 in_str = OUString(searchStr); 265 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 266 267 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 268 269 if( startPos ) 270 startPos = FindPosInSeq_Impl( offset, startPos ); 271 272 if( endPos < searchStr.getLength() ) 273 endPos = FindPosInSeq_Impl( offset, endPos ); 274 else 275 endPos = in_str.getLength(); 276 277 bUsePrimarySrchStr = false; 278 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 279 280 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 281 { 282 if (sres2.startOffset[k]) 283 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 284 if (sres2.endOffset[k]) 285 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 286 } 287 288 // pick first and long one 289 if ( sres.subRegExpressions == 0) 290 return sres2; 291 if ( sres2.subRegExpressions == 1) 292 { 293 if ( sres.startOffset[0] > sres2.startOffset[0]) 294 return sres2; 295 else if ( sres.startOffset[0] == sres2.startOffset[0] && 296 sres.endOffset[0] < sres2.endOffset[0]) 297 return sres2; 298 } 299 } 300 301 return sres; 302 } 303 304 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 305 throw(RuntimeException) 306 { 307 SearchResult sres; 308 309 OUString in_str(searchStr); 310 sal_Int32 newStartPos = startPos; 311 sal_Int32 newEndPos = endPos; 312 313 bUsePrimarySrchStr = true; 314 315 if ( xTranslit.is() ) 316 { 317 // apply only simple 1<->1 transliteration here 318 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 319 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 320 321 // JP 20.6.2001: also the start and end positions must be corrected! 322 if( startPos < searchStr.getLength() ) 323 newStartPos = FindPosInSeq_Impl( offset, startPos ); 324 else 325 newStartPos = in_str.getLength(); 326 327 if( endPos ) 328 newEndPos = FindPosInSeq_Impl( offset, endPos ); 329 330 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 331 332 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 333 { 334 if (sres.startOffset[k]) 335 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 336 // JP 20.6.2001: end is ever exclusive and then don't return 337 // the position of the next character - return the 338 // next position behind the last found character! 339 // "a b c" find "b" must return 2,3 and not 2,4!!! 340 if (sres.endOffset[k]) 341 sres.endOffset[k] = offset[sres.endOffset[k]]; 342 } 343 } 344 else 345 { 346 sres = (this->*fnBackward)( in_str, startPos, endPos ); 347 } 348 349 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 350 { 351 SearchResult sres2; 352 353 in_str = OUString(searchStr); 354 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 355 356 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 357 358 if( startPos < searchStr.getLength() ) 359 startPos = FindPosInSeq_Impl( offset, startPos ); 360 else 361 startPos = in_str.getLength(); 362 363 if( endPos ) 364 endPos = FindPosInSeq_Impl( offset, endPos ); 365 366 bUsePrimarySrchStr = false; 367 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 368 369 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 370 { 371 if (sres2.startOffset[k]) 372 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 373 if (sres2.endOffset[k]) 374 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 375 } 376 377 // pick last and long one 378 if ( sres.subRegExpressions == 0 ) 379 return sres2; 380 if ( sres2.subRegExpressions == 1 ) 381 { 382 if ( sres.startOffset[0] < sres2.startOffset[0] ) 383 return sres2; 384 if ( sres.startOffset[0] == sres2.startOffset[0] && 385 sres.endOffset[0] > sres2.endOffset[0] ) 386 return sres2; 387 } 388 } 389 390 return sres; 391 } 392 393 //--------------------------------------------------------------------- 394 395 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 396 { 397 bool bRet = 1; 398 if( '\x7f' != rStr[nPos]) 399 { 400 if ( !xCharClass.is() ) 401 { 402 Reference < XInterface > xI = xMSF->createInstance( 403 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 404 if( xI.is() ) 405 xI->queryInterface( ::getCppuType( 406 (const Reference< XCharacterClassification >*)0)) 407 >>= xCharClass; 408 } 409 if ( xCharClass.is() ) 410 { 411 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 412 aSrchPara.Locale ); 413 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 414 KCharacterType::LETTER ) & nCType ) ) 415 bRet = 0; 416 } 417 } 418 return bRet; 419 } 420 421 // --------- helper methods for Boyer-Moore like text searching ---------- 422 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 423 424 void TextSearch::MakeForwardTab() 425 { 426 // create the jumptable for the search text 427 if( pJumpTable ) 428 { 429 if( bIsForwardTab ) 430 return ; // the jumpTable is ok 431 delete pJumpTable; 432 } 433 bIsForwardTab = true; 434 435 sal_Int32 n, nLen = sSrchStr.getLength(); 436 pJumpTable = new TextSearchJumpTable; 437 438 for( n = 0; n < nLen - 1; ++n ) 439 { 440 sal_Unicode cCh = sSrchStr[n]; 441 sal_Int32 nDiff = nLen - n - 1; 442 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 443 444 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 445 pJumpTable->insert( aEntry ); 446 if ( !aPair.second ) 447 (*(aPair.first)).second = nDiff; 448 } 449 } 450 451 void TextSearch::MakeForwardTab2() 452 { 453 // create the jumptable for the search text 454 if( pJumpTable2 ) 455 { 456 if( bIsForwardTab ) 457 return ; // the jumpTable is ok 458 delete pJumpTable2; 459 } 460 bIsForwardTab = true; 461 462 sal_Int32 n, nLen = sSrchStr2.getLength(); 463 pJumpTable2 = new TextSearchJumpTable; 464 465 for( n = 0; n < nLen - 1; ++n ) 466 { 467 sal_Unicode cCh = sSrchStr2[n]; 468 sal_Int32 nDiff = nLen - n - 1; 469 470 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 471 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 472 pJumpTable2->insert( aEntry ); 473 if ( !aPair.second ) 474 (*(aPair.first)).second = nDiff; 475 } 476 } 477 478 void TextSearch::MakeBackwardTab() 479 { 480 // create the jumptable for the search text 481 if( pJumpTable ) 482 { 483 if( !bIsForwardTab ) 484 return ; // the jumpTable is ok 485 delete pJumpTable; 486 } 487 bIsForwardTab = false; 488 489 sal_Int32 n, nLen = sSrchStr.getLength(); 490 pJumpTable = new TextSearchJumpTable; 491 492 for( n = nLen-1; n > 0; --n ) 493 { 494 sal_Unicode cCh = sSrchStr[n]; 495 TextSearchJumpTable::value_type aEntry( cCh, n ); 496 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 497 pJumpTable->insert( aEntry ); 498 if ( !aPair.second ) 499 (*(aPair.first)).second = n; 500 } 501 } 502 503 void TextSearch::MakeBackwardTab2() 504 { 505 // create the jumptable for the search text 506 if( pJumpTable2 ) 507 { 508 if( !bIsForwardTab ) 509 return ; // the jumpTable is ok 510 delete pJumpTable2; 511 } 512 bIsForwardTab = false; 513 514 sal_Int32 n, nLen = sSrchStr2.getLength(); 515 pJumpTable2 = new TextSearchJumpTable; 516 517 for( n = nLen-1; n > 0; --n ) 518 { 519 sal_Unicode cCh = sSrchStr2[n]; 520 TextSearchJumpTable::value_type aEntry( cCh, n ); 521 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 522 pJumpTable2->insert( aEntry ); 523 if ( !aPair.second ) 524 (*(aPair.first)).second = n; 525 } 526 } 527 528 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 529 { 530 TextSearchJumpTable *pJump; 531 OUString sSearchKey; 532 533 if ( bUsePrimarySrchStr ) { 534 pJump = pJumpTable; 535 sSearchKey = sSrchStr; 536 } else { 537 pJump = pJumpTable2; 538 sSearchKey = sSrchStr2; 539 } 540 541 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 542 if ( iLook == pJump->end() ) 543 return sSearchKey.getLength(); 544 return (*iLook).second; 545 } 546 547 548 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 549 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 550 throw(RuntimeException) 551 { 552 SearchResult aRet; 553 aRet.subRegExpressions = 0; 554 555 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 556 557 OUString aStr( searchStr ); 558 sal_Int32 nSuchIdx = aStr.getLength(); 559 sal_Int32 nEnde = endPos; 560 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 561 return aRet; 562 563 564 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 565 return aRet; 566 567 nEnde -= sSearchKey.getLength(); 568 569 if (bUsePrimarySrchStr) 570 MakeForwardTab(); // create the jumptable 571 else 572 MakeForwardTab2(); 573 574 for (sal_Int32 nCmpIdx = startPos; // start position for the search 575 nCmpIdx <= nEnde; 576 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 577 { 578 // if the match would be the completed cells, skip it. 579 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 580 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 581 continue; 582 583 nSuchIdx = sSearchKey.getLength() - 1; 584 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 585 { 586 if( nSuchIdx == 0 ) 587 { 588 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 589 { 590 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 591 bool bAtStart = !nCmpIdx; 592 bool bAtEnd = nFndEnd == endPos; 593 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 594 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 595 // * 1 -> only one word in the paragraph 596 // * 2 -> at begin of paragraph 597 // * 3 -> at end of paragraph 598 // * 4 -> inside the paragraph 599 if( !( ( bAtStart && bAtEnd ) || // 1 600 ( bAtStart && bDelimBehind ) || // 2 601 ( bAtEnd && bDelimBefore ) || // 3 602 ( bDelimBefore && bDelimBehind ))) // 4 603 break; 604 } 605 606 aRet.subRegExpressions = 1; 607 aRet.startOffset.realloc( 1 ); 608 aRet.startOffset[ 0 ] = nCmpIdx; 609 aRet.endOffset.realloc( 1 ); 610 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 611 612 return aRet; 613 } 614 else 615 nSuchIdx--; 616 } 617 } 618 return aRet; 619 } 620 621 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 622 throw(RuntimeException) 623 { 624 SearchResult aRet; 625 aRet.subRegExpressions = 0; 626 627 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 628 629 OUString aStr( searchStr ); 630 sal_Int32 nSuchIdx = aStr.getLength(); 631 sal_Int32 nEnde = endPos; 632 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 633 return aRet; 634 635 if (bUsePrimarySrchStr) 636 MakeBackwardTab(); // create the jumptable 637 else 638 MakeBackwardTab2(); 639 640 if( nEnde == nSuchIdx ) // end position for the search 641 nEnde = sSearchKey.getLength(); 642 else 643 nEnde += sSearchKey.getLength(); 644 645 sal_Int32 nCmpIdx = startPos; // start position for the search 646 647 while (nCmpIdx >= nEnde) 648 { 649 // if the match would be the completed cells, skip it. 650 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 651 sSearchKey.getLength() )) && (!checkCTLEnd || 652 isCellStart( aStr, nCmpIdx))) 653 { 654 nSuchIdx = 0; 655 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 656 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 657 nSuchIdx++; 658 if( nSuchIdx >= sSearchKey.getLength() ) 659 { 660 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 661 { 662 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 663 bool bAtStart = !nFndStt; 664 bool bAtEnd = nCmpIdx == startPos; 665 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 666 bool bDelimBefore = bAtStart || // begin of paragraph 667 IsDelimiter( aStr, nFndStt-1 ); 668 // * 1 -> only one word in the paragraph 669 // * 2 -> at begin of paragraph 670 // * 3 -> at end of paragraph 671 // * 4 -> inside the paragraph 672 if( ( bAtStart && bAtEnd ) || // 1 673 ( bAtStart && bDelimBehind ) || // 2 674 ( bAtEnd && bDelimBefore ) || // 3 675 ( bDelimBefore && bDelimBehind )) // 4 676 { 677 aRet.subRegExpressions = 1; 678 aRet.startOffset.realloc( 1 ); 679 aRet.startOffset[ 0 ] = nCmpIdx; 680 aRet.endOffset.realloc( 1 ); 681 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 682 return aRet; 683 } 684 } 685 else 686 { 687 aRet.subRegExpressions = 1; 688 aRet.startOffset.realloc( 1 ); 689 aRet.startOffset[ 0 ] = nCmpIdx; 690 aRet.endOffset.realloc( 1 ); 691 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 692 return aRet; 693 } 694 } 695 } 696 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 697 if( nCmpIdx < nSuchIdx ) 698 return aRet; 699 nCmpIdx -= nSuchIdx; 700 } 701 return aRet; 702 } 703 704 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 705 { 706 // select the transliterated pattern string 707 const OUString& rPatternStr = 708 (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr 709 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 710 711 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 712 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 713 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 714 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 715 // REG_NOSUB is not used anywhere => not implemented 716 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 717 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 718 // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it??? 719 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0) 720 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 721 UErrorCode nIcuErr = U_ZERO_ERROR; 722 // assumption: transliteration didn't mangle regexp control chars 723 #ifdef OS2 724 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 725 #else 726 IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength()); 727 #endif 728 #if 1 729 // for conveniance specific syntax elements of the old regex engine are emulated 730 // by using regular word boundary matching \b to replace \< and \> 731 static const IcuUniString aChevronPattern( "\\<|\\>", -1, IcuUniString::kInvariant); 732 static const IcuUniString aChevronReplace( "\\b", -1, IcuUniString::kInvariant); 733 static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr); 734 aChevronMatcher.reset( aIcuSearchPatStr); 735 aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr); 736 aChevronMatcher.reset(); 737 #endif 738 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 739 if( nIcuErr) 740 { delete pRegexMatcher; pRegexMatcher = NULL;} 741 } 742 743 //--------------------------------------------------------------------------- 744 745 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 746 sal_Int32 startPos, sal_Int32 endPos ) 747 throw(RuntimeException) 748 { 749 SearchResult aRet; 750 aRet.subRegExpressions = 0; 751 if( !pRegexMatcher) 752 return aRet; 753 754 if( endPos > searchStr.getLength()) 755 endPos = searchStr.getLength(); 756 757 // use the ICU RegexMatcher to find the matches 758 UErrorCode nIcuErr = U_ZERO_ERROR; 759 #ifdef OS2 760 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 761 #else 762 const IcuUniString aSearchTargetStr( searchStr.getStr(), endPos); 763 #endif 764 pRegexMatcher->reset( aSearchTargetStr); 765 // search until there is a valid match 766 for(;;) 767 { 768 if( !pRegexMatcher->find( startPos, nIcuErr)) 769 return aRet; 770 771 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 772 int nStartOfs = pRegexMatcher->start( nIcuErr); 773 int nEndOfs = pRegexMatcher->end( nIcuErr); 774 if( nStartOfs < nEndOfs) 775 break; 776 // try at next position if there was a zero-length match 777 if( ++startPos >= endPos) 778 return aRet; 779 } 780 781 // extract the result of the search 782 const int nGroupCount = pRegexMatcher->groupCount(); 783 aRet.subRegExpressions = nGroupCount + 1; 784 aRet.startOffset.realloc( aRet.subRegExpressions); 785 aRet.endOffset.realloc( aRet.subRegExpressions); 786 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 787 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 788 for( int i = 1; i <= nGroupCount; ++i) { 789 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 790 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 791 } 792 793 return aRet; 794 } 795 796 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 797 sal_Int32 startPos, sal_Int32 endPos ) 798 throw(RuntimeException) 799 { 800 // NOTE: for backwards search callers provide startPos/endPos inverted! 801 SearchResult aRet; 802 aRet.subRegExpressions = 0; 803 if( !pRegexMatcher) 804 return aRet; 805 806 if( startPos > searchStr.getLength()) 807 startPos = searchStr.getLength(); 808 809 // use the ICU RegexMatcher to find the matches 810 // TODO: use ICU's backward searching once it becomes available 811 // as its replacement using forward search is not as good as the real thing 812 UErrorCode nIcuErr = U_ZERO_ERROR; 813 #ifdef OS2 814 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 815 #else 816 const IcuUniString aSearchTargetStr( searchStr.getStr(), startPos); 817 #endif 818 pRegexMatcher->reset( aSearchTargetStr); 819 if( !pRegexMatcher->find( endPos, nIcuErr)) 820 return aRet; 821 822 // find the last match 823 int nLastPos = 0; 824 do { 825 nLastPos = pRegexMatcher->start( nIcuErr); 826 } while( pRegexMatcher->find( nLastPos + 1, nIcuErr)); 827 828 // find last match again to get its details 829 pRegexMatcher->find( nLastPos, nIcuErr); 830 831 // fill in the details of the last match 832 const int nGroupCount = pRegexMatcher->groupCount(); 833 aRet.subRegExpressions = nGroupCount + 1; 834 aRet.startOffset.realloc( aRet.subRegExpressions); 835 aRet.endOffset.realloc( aRet.subRegExpressions); 836 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 837 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 838 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 839 for( int i = 1; i <= nGroupCount; ++i) { 840 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 841 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 842 } 843 844 return aRet; 845 } 846 847 //--------------------------------------------------------------------------- 848 849 // search for words phonetically 850 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 851 sal_Int32 startPos, sal_Int32 endPos ) 852 throw(RuntimeException) 853 { 854 SearchResult aRet; 855 aRet.subRegExpressions = 0; 856 857 if( !xBreak.is() ) 858 return aRet; 859 860 OUString aWTemp( searchStr ); 861 862 register sal_Int32 nStt, nEnd; 863 864 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 865 aSrchPara.Locale, 866 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 867 868 do 869 { 870 if( aWBnd.startPos >= endPos ) 871 break; 872 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 873 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 874 875 if( nStt < nEnd && 876 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 877 { 878 aRet.subRegExpressions = 1; 879 aRet.startOffset.realloc( 1 ); 880 aRet.startOffset[ 0 ] = nStt; 881 aRet.endOffset.realloc( 1 ); 882 aRet.endOffset[ 0 ] = nEnd; 883 break; 884 } 885 886 nStt = nEnd - 1; 887 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 888 WordType::ANYWORD_IGNOREWHITESPACES); 889 } while( aWBnd.startPos != aWBnd.endPos || 890 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 891 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 892 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 893 // and nextWord() does also => don't loop forever. 894 return aRet; 895 } 896 897 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 898 sal_Int32 startPos, sal_Int32 endPos ) 899 throw(RuntimeException) 900 { 901 SearchResult aRet; 902 aRet.subRegExpressions = 0; 903 904 if( !xBreak.is() ) 905 return aRet; 906 907 OUString aWTemp( searchStr ); 908 909 register sal_Int32 nStt, nEnd; 910 911 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 912 aSrchPara.Locale, 913 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 914 915 do 916 { 917 if( aWBnd.endPos <= endPos ) 918 break; 919 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 920 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 921 922 if( nStt < nEnd && 923 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 924 { 925 aRet.subRegExpressions = 1; 926 aRet.startOffset.realloc( 1 ); 927 aRet.startOffset[ 0 ] = nEnd; 928 aRet.endOffset.realloc( 1 ); 929 aRet.endOffset[ 0 ] = nStt; 930 break; 931 } 932 if( !nStt ) 933 break; 934 935 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 936 WordType::ANYWORD_IGNOREWHITESPACES); 937 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 938 return aRet; 939 } 940 941 942 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 943 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 944 945 static OUString getServiceName_Static() 946 { 947 return OUString::createFromAscii( cSearchName ); 948 } 949 950 static OUString getImplementationName_Static() 951 { 952 return OUString::createFromAscii( cSearchImpl ); 953 } 954 955 OUString SAL_CALL 956 TextSearch::getImplementationName() 957 throw( RuntimeException ) 958 { 959 return getImplementationName_Static(); 960 } 961 962 sal_Bool SAL_CALL 963 TextSearch::supportsService(const OUString& rServiceName) 964 throw( RuntimeException ) 965 { 966 return !rServiceName.compareToAscii( cSearchName ); 967 } 968 969 Sequence< OUString > SAL_CALL 970 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 971 { 972 Sequence< OUString > aRet(1); 973 aRet[0] = getServiceName_Static(); 974 return aRet; 975 } 976 977 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 978 SAL_CALL TextSearch_CreateInstance( 979 const ::com::sun::star::uno::Reference< 980 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 981 { 982 return ::com::sun::star::uno::Reference< 983 ::com::sun::star::uno::XInterface >( 984 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 985 } 986 987 extern "C" 988 { 989 990 void SAL_CALL component_getImplementationEnvironment( 991 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 992 { 993 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 994 } 995 996 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 997 void* _pServiceManager, void* /*_pRegistryKey*/ ) 998 { 999 void* pRet = NULL; 1000 1001 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1002 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1003 ( _pServiceManager ); 1004 ::com::sun::star::uno::Reference< 1005 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1006 1007 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1008 { 1009 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1010 aServiceNames[0] = getServiceName_Static(); 1011 xFactory = ::cppu::createSingleFactory( 1012 pServiceManager, getImplementationName_Static(), 1013 &TextSearch_CreateInstance, aServiceNames ); 1014 } 1015 1016 if ( xFactory.is() ) 1017 { 1018 xFactory->acquire(); 1019 pRet = xFactory.get(); 1020 } 1021 1022 return pRet; 1023 } 1024 1025 } // extern "C" 1026