1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 #include "textsearch.hxx" 28 #include "levdis.hxx" 29 #include <com/sun/star/lang/Locale.hpp> 30 #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31 #include <comphelper/processfactory.hxx> 32 #include <com/sun/star/i18n/UnicodeType.hpp> 33 #include <com/sun/star/util/SearchFlags.hpp> 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <com/sun/star/i18n/ScriptType.hpp> 36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37 #include <com/sun/star/i18n/KCharacterType.hpp> 38 #include <com/sun/star/registry/XRegistryKey.hpp> 39 #include <cppuhelper/factory.hxx> 40 #include <cppuhelper/weak.hxx> 41 42 #ifdef _MSC_VER 43 // get rid of that dumb compiler warning 44 // identifier was truncated to '255' characters in the debug information 45 // for STL template usage, if .pdb files are to be created 46 #pragma warning( disable: 4786 ) 47 #endif 48 49 #include <string.h> 50 51 using namespace ::com::sun::star::util; 52 using namespace ::com::sun::star::uno; 53 using namespace ::com::sun::star::lang; 54 using namespace ::com::sun::star::i18n; 55 using namespace ::rtl; 56 57 static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58 TransliterationModules_ignoreBaFa_ja_JP | 59 TransliterationModules_ignoreIterationMark_ja_JP | 60 TransliterationModules_ignoreTiJi_ja_JP | 61 TransliterationModules_ignoreHyuByu_ja_JP | 62 TransliterationModules_ignoreSeZe_ja_JP | 63 TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64 TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65 TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK; 68 static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 69 // Above 2 transliteration is simple but need to take effect in 70 // complex transliteration 71 72 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 73 : xMSF( rxMSF ) 74 , pJumpTable( 0 ) 75 , pJumpTable2( 0 ) 76 , pRegexMatcher( NULL ) 77 , pWLD( 0 ) 78 { 79 SearchOptions aOpt; 80 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 81 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 82 //aOpt.Locale = ???; 83 setOptions( aOpt ); 84 } 85 86 TextSearch::~TextSearch() 87 { 88 delete pRegexMatcher; 89 delete pWLD; 90 delete pJumpTable; 91 delete pJumpTable2; 92 } 93 94 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 95 { 96 aSrchPara = rOptions; 97 98 delete pRegexMatcher, pRegexMatcher = NULL; 99 delete pWLD, pWLD = 0; 100 delete pJumpTable, pJumpTable = 0; 101 delete pJumpTable2, pJumpTable2 = 0; 102 103 // Create Transliteration class 104 if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 105 { 106 if( !xTranslit.is() ) 107 { 108 Reference < XInterface > xI = xMSF->createInstance( 109 OUString::createFromAscii( 110 "com.sun.star.i18n.Transliteration")); 111 if ( xI.is() ) 112 xI->queryInterface( ::getCppuType( 113 (const Reference< XExtendedTransliteration >*)0)) 114 >>= xTranslit; 115 } 116 // Load transliteration module 117 if( xTranslit.is() ) 118 xTranslit->loadModule( 119 (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 120 aSrchPara.Locale); 121 } 122 else if( xTranslit.is() ) 123 xTranslit = 0; 124 125 // Create Transliteration for 2<->1, 2<->2 transliteration 126 if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 127 { 128 if( !xTranslit2.is() ) 129 { 130 Reference < XInterface > xI = xMSF->createInstance( 131 OUString::createFromAscii( 132 "com.sun.star.i18n.Transliteration")); 133 if ( xI.is() ) 134 xI->queryInterface( ::getCppuType( 135 (const Reference< XExtendedTransliteration >*)0)) 136 >>= xTranslit2; 137 } 138 // Load transliteration module 139 if( xTranslit2.is() ) 140 xTranslit2->loadModule( 141 (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 142 aSrchPara.Locale); 143 } 144 145 if ( !xBreak.is() ) 146 { 147 Reference < XInterface > xI = xMSF->createInstance( 148 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 149 if( xI.is() ) 150 xI->queryInterface( ::getCppuType( 151 (const Reference< XBreakIterator >*)0)) 152 >>= xBreak; 153 } 154 155 sSrchStr = aSrchPara.searchString; 156 157 // use transliteration here 158 if ( xTranslit.is() && 159 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 160 sSrchStr = xTranslit->transliterateString2String( 161 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 162 163 if ( xTranslit2.is() && 164 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 165 sSrchStr2 = xTranslit2->transliterateString2String( 166 aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 167 168 // When start or end of search string is a complex script type, we need to 169 // make sure the result boundary is not located in the middle of cell. 170 checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 171 ScriptType::COMPLEX)); 172 checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 173 sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 174 175 switch( aSrchPara.algorithmType) 176 { 177 case SearchAlgorithms_REGEXP: 178 fnForward = &TextSearch::RESrchFrwrd; 179 fnBackward = &TextSearch::RESrchBkwrd; 180 RESrchPrepare( aSrchPara); 181 break; 182 183 case SearchAlgorithms_APPROXIMATE: 184 fnForward = &TextSearch::ApproxSrchFrwrd; 185 fnBackward = &TextSearch::ApproxSrchBkwrd; 186 187 pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 188 aSrchPara.insertedChars, aSrchPara.deletedChars, 189 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 190 191 nLimit = pWLD->GetLimit(); 192 break; 193 194 default: 195 fnForward = &TextSearch::NSrchFrwrd; 196 fnBackward = &TextSearch::NSrchBkwrd; 197 break; 198 } 199 } 200 201 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 202 { 203 sal_Int32 nRet = 0, nEnd = rOff.getLength(); 204 while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 205 return nRet; 206 } 207 208 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 209 throw( RuntimeException ) 210 { 211 sal_Int32 nDone; 212 return nPos == xBreak->previousCharacters(searchStr, nPos+1, 213 aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 214 } 215 216 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 217 throw( RuntimeException ) 218 { 219 SearchResult sres; 220 221 OUString in_str(searchStr); 222 sal_Int32 newStartPos = startPos; 223 sal_Int32 newEndPos = endPos; 224 225 bUsePrimarySrchStr = true; 226 227 if ( xTranslit.is() ) 228 { 229 // apply normal transliteration (1<->1, 1<->0) 230 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 231 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 232 233 // JP 20.6.2001: also the start and end positions must be corrected! 234 if( startPos ) 235 newStartPos = FindPosInSeq_Impl( offset, startPos ); 236 237 if( endPos < searchStr.getLength() ) 238 newEndPos = FindPosInSeq_Impl( offset, endPos ); 239 else 240 newEndPos = in_str.getLength(); 241 242 sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 243 244 sal_Int32 nOffsetLength = offset.getLength(); 245 sal_Int32 nStartOffset = 0; 246 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 247 { 248 nStartOffset = sres.startOffset[k]; 249 if ( nStartOffset ) 250 { 251 if ( nStartOffset < nOffsetLength ) 252 sres.startOffset[k] = offset[nStartOffset]; 253 else 254 sres.startOffset[k] = offset[offset.getLength()-1] +1; 255 } 256 // JP 20.6.2001: end is ever exclusive and then don't return 257 // the position of the next character - return the 258 // next position behind the last found character! 259 // "a b c" find "b" must return 2,3 and not 2,4!!! 260 if (sres.endOffset[k]) 261 sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 262 } 263 } 264 else 265 { 266 sres = (this->*fnForward)( in_str, startPos, endPos ); 267 } 268 269 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 270 { 271 SearchResult sres2; 272 273 in_str = OUString(searchStr); 274 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 275 276 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 277 278 if( startPos ) 279 startPos = FindPosInSeq_Impl( offset, startPos ); 280 281 if( endPos < searchStr.getLength() ) 282 endPos = FindPosInSeq_Impl( offset, endPos ); 283 else 284 endPos = in_str.getLength(); 285 286 bUsePrimarySrchStr = false; 287 sres2 = (this->*fnForward)( in_str, startPos, endPos ); 288 289 for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 290 { 291 if (sres2.startOffset[k]) 292 sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 293 if (sres2.endOffset[k]) 294 sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 295 } 296 297 // pick first and long one 298 if ( sres.subRegExpressions == 0) 299 return sres2; 300 if ( sres2.subRegExpressions == 1) 301 { 302 if ( sres.startOffset[0] > sres2.startOffset[0]) 303 return sres2; 304 else if ( sres.startOffset[0] == sres2.startOffset[0] && 305 sres.endOffset[0] < sres2.endOffset[0]) 306 return sres2; 307 } 308 } 309 310 return sres; 311 } 312 313 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 314 throw(RuntimeException) 315 { 316 SearchResult sres; 317 318 OUString in_str(searchStr); 319 sal_Int32 newStartPos = startPos; 320 sal_Int32 newEndPos = endPos; 321 322 bUsePrimarySrchStr = true; 323 324 if ( xTranslit.is() ) 325 { 326 // apply only simple 1<->1 transliteration here 327 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 328 in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 329 330 // JP 20.6.2001: also the start and end positions must be corrected! 331 if( startPos < searchStr.getLength() ) 332 newStartPos = FindPosInSeq_Impl( offset, startPos ); 333 else 334 newStartPos = in_str.getLength(); 335 336 if( endPos ) 337 newEndPos = FindPosInSeq_Impl( offset, endPos ); 338 339 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 340 341 sal_Int32 nOffsetLength = offset.getLength(); 342 sal_Int32 nEndOffset = 0; 343 for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 344 { 345 if (sres.startOffset[k]) 346 sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 347 // JP 20.6.2001: end is ever exclusive and then don't return 348 // the position of the next character - return the 349 // next position behind the last found character! 350 // "a b c" find "b" must return 2,3 and not 2,4!!! 351 nEndOffset = sres.endOffset[k]; 352 if ( nEndOffset ) 353 { 354 if ( nEndOffset < nOffsetLength ) 355 sres.endOffset[k] = offset[nEndOffset]; 356 else 357 sres.endOffset[k] = offset[offset.getLength()-1] +1; 358 } 359 } 360 } 361 else 362 { 363 sres = (this->*fnBackward)( in_str, startPos, endPos ); 364 } 365 366 if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 367 { 368 SearchResult sres2; 369 370 in_str = OUString(searchStr); 371 com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 372 373 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 374 375 if( startPos < searchStr.getLength() ) 376 startPos = FindPosInSeq_Impl( offset, startPos ); 377 else 378 startPos = in_str.getLength(); 379 380 if( endPos ) 381 endPos = FindPosInSeq_Impl( offset, endPos ); 382 383 bUsePrimarySrchStr = false; 384 sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 385 386 for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 387 { 388 if (sres2.startOffset[k]) 389 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 390 if (sres2.endOffset[k]) 391 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 392 } 393 394 // pick last and long one 395 if ( sres.subRegExpressions == 0 ) 396 return sres2; 397 if ( sres2.subRegExpressions == 1 ) 398 { 399 if ( sres.startOffset[0] < sres2.startOffset[0] ) 400 return sres2; 401 if ( sres.startOffset[0] == sres2.startOffset[0] && 402 sres.endOffset[0] > sres2.endOffset[0] ) 403 return sres2; 404 } 405 } 406 407 return sres; 408 } 409 410 //--------------------------------------------------------------------- 411 412 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 413 { 414 bool bRet = 1; 415 if( '\x7f' != rStr[nPos]) 416 { 417 if ( !xCharClass.is() ) 418 { 419 Reference < XInterface > xI = xMSF->createInstance( 420 OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 421 if( xI.is() ) 422 xI->queryInterface( ::getCppuType( 423 (const Reference< XCharacterClassification >*)0)) 424 >>= xCharClass; 425 } 426 if ( xCharClass.is() ) 427 { 428 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 429 aSrchPara.Locale ); 430 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 431 KCharacterType::LETTER ) & nCType ) ) 432 bRet = 0; 433 } 434 } 435 return bRet; 436 } 437 438 // --------- helper methods for Boyer-Moore like text searching ---------- 439 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 440 441 void TextSearch::MakeForwardTab() 442 { 443 // create the jumptable for the search text 444 if( pJumpTable ) 445 { 446 if( bIsForwardTab ) 447 return ; // the jumpTable is ok 448 delete pJumpTable; 449 } 450 bIsForwardTab = true; 451 452 sal_Int32 n, nLen = sSrchStr.getLength(); 453 pJumpTable = new TextSearchJumpTable; 454 455 for( n = 0; n < nLen - 1; ++n ) 456 { 457 sal_Unicode cCh = sSrchStr[n]; 458 sal_Int32 nDiff = nLen - n - 1; 459 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 460 461 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 462 pJumpTable->insert( aEntry ); 463 if ( !aPair.second ) 464 (*(aPair.first)).second = nDiff; 465 } 466 } 467 468 void TextSearch::MakeForwardTab2() 469 { 470 // create the jumptable for the search text 471 if( pJumpTable2 ) 472 { 473 if( bIsForwardTab ) 474 return ; // the jumpTable is ok 475 delete pJumpTable2; 476 } 477 bIsForwardTab = true; 478 479 sal_Int32 n, nLen = sSrchStr2.getLength(); 480 pJumpTable2 = new TextSearchJumpTable; 481 482 for( n = 0; n < nLen - 1; ++n ) 483 { 484 sal_Unicode cCh = sSrchStr2[n]; 485 sal_Int32 nDiff = nLen - n - 1; 486 487 TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 488 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 489 pJumpTable2->insert( aEntry ); 490 if ( !aPair.second ) 491 (*(aPair.first)).second = nDiff; 492 } 493 } 494 495 void TextSearch::MakeBackwardTab() 496 { 497 // create the jumptable for the search text 498 if( pJumpTable ) 499 { 500 if( !bIsForwardTab ) 501 return ; // the jumpTable is ok 502 delete pJumpTable; 503 } 504 bIsForwardTab = false; 505 506 sal_Int32 n, nLen = sSrchStr.getLength(); 507 pJumpTable = new TextSearchJumpTable; 508 509 for( n = nLen-1; n > 0; --n ) 510 { 511 sal_Unicode cCh = sSrchStr[n]; 512 TextSearchJumpTable::value_type aEntry( cCh, n ); 513 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 514 pJumpTable->insert( aEntry ); 515 if ( !aPair.second ) 516 (*(aPair.first)).second = n; 517 } 518 } 519 520 void TextSearch::MakeBackwardTab2() 521 { 522 // create the jumptable for the search text 523 if( pJumpTable2 ) 524 { 525 if( !bIsForwardTab ) 526 return ; // the jumpTable is ok 527 delete pJumpTable2; 528 } 529 bIsForwardTab = false; 530 531 sal_Int32 n, nLen = sSrchStr2.getLength(); 532 pJumpTable2 = new TextSearchJumpTable; 533 534 for( n = nLen-1; n > 0; --n ) 535 { 536 sal_Unicode cCh = sSrchStr2[n]; 537 TextSearchJumpTable::value_type aEntry( cCh, n ); 538 ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 539 pJumpTable2->insert( aEntry ); 540 if ( !aPair.second ) 541 (*(aPair.first)).second = n; 542 } 543 } 544 545 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 546 { 547 TextSearchJumpTable *pJump; 548 OUString sSearchKey; 549 550 if ( bUsePrimarySrchStr ) { 551 pJump = pJumpTable; 552 sSearchKey = sSrchStr; 553 } else { 554 pJump = pJumpTable2; 555 sSearchKey = sSrchStr2; 556 } 557 558 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 559 if ( iLook == pJump->end() ) 560 return sSearchKey.getLength(); 561 return (*iLook).second; 562 } 563 564 565 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 566 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 567 throw(RuntimeException) 568 { 569 SearchResult aRet; 570 aRet.subRegExpressions = 0; 571 572 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 573 574 OUString aStr( searchStr ); 575 sal_Int32 nSuchIdx = aStr.getLength(); 576 sal_Int32 nEnde = endPos; 577 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 578 return aRet; 579 580 581 if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 582 return aRet; 583 584 nEnde -= sSearchKey.getLength(); 585 586 if (bUsePrimarySrchStr) 587 MakeForwardTab(); // create the jumptable 588 else 589 MakeForwardTab2(); 590 591 for (sal_Int32 nCmpIdx = startPos; // start position for the search 592 nCmpIdx <= nEnde; 593 nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 594 { 595 // if the match would be the completed cells, skip it. 596 if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 597 && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 598 continue; 599 600 nSuchIdx = sSearchKey.getLength() - 1; 601 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 602 { 603 if( nSuchIdx == 0 ) 604 { 605 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 606 { 607 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 608 bool bAtStart = !nCmpIdx; 609 bool bAtEnd = nFndEnd == endPos; 610 bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 611 bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 612 // * 1 -> only one word in the paragraph 613 // * 2 -> at begin of paragraph 614 // * 3 -> at end of paragraph 615 // * 4 -> inside the paragraph 616 if( !( ( bAtStart && bAtEnd ) || // 1 617 ( bAtStart && bDelimBehind ) || // 2 618 ( bAtEnd && bDelimBefore ) || // 3 619 ( bDelimBefore && bDelimBehind ))) // 4 620 break; 621 } 622 623 aRet.subRegExpressions = 1; 624 aRet.startOffset.realloc( 1 ); 625 aRet.startOffset[ 0 ] = nCmpIdx; 626 aRet.endOffset.realloc( 1 ); 627 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 628 629 return aRet; 630 } 631 else 632 nSuchIdx--; 633 } 634 } 635 return aRet; 636 } 637 638 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 639 throw(RuntimeException) 640 { 641 SearchResult aRet; 642 aRet.subRegExpressions = 0; 643 644 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 645 646 OUString aStr( searchStr ); 647 sal_Int32 nSuchIdx = aStr.getLength(); 648 sal_Int32 nEnde = endPos; 649 if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 650 return aRet; 651 652 if (bUsePrimarySrchStr) 653 MakeBackwardTab(); // create the jumptable 654 else 655 MakeBackwardTab2(); 656 657 if( nEnde == nSuchIdx ) // end position for the search 658 nEnde = sSearchKey.getLength(); 659 else 660 nEnde += sSearchKey.getLength(); 661 662 sal_Int32 nCmpIdx = startPos; // start position for the search 663 664 while (nCmpIdx >= nEnde) 665 { 666 // if the match would be the completed cells, skip it. 667 if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 668 sSearchKey.getLength() )) && (!checkCTLEnd || 669 isCellStart( aStr, nCmpIdx))) 670 { 671 nSuchIdx = 0; 672 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 673 aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 674 nSuchIdx++; 675 if( nSuchIdx >= sSearchKey.getLength() ) 676 { 677 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 678 { 679 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 680 bool bAtStart = !nFndStt; 681 bool bAtEnd = nCmpIdx == startPos; 682 bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 683 bool bDelimBefore = bAtStart || // begin of paragraph 684 IsDelimiter( aStr, nFndStt-1 ); 685 // * 1 -> only one word in the paragraph 686 // * 2 -> at begin of paragraph 687 // * 3 -> at end of paragraph 688 // * 4 -> inside the paragraph 689 if( ( bAtStart && bAtEnd ) || // 1 690 ( bAtStart && bDelimBehind ) || // 2 691 ( bAtEnd && bDelimBefore ) || // 3 692 ( bDelimBefore && bDelimBehind )) // 4 693 { 694 aRet.subRegExpressions = 1; 695 aRet.startOffset.realloc( 1 ); 696 aRet.startOffset[ 0 ] = nCmpIdx; 697 aRet.endOffset.realloc( 1 ); 698 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 699 return aRet; 700 } 701 } 702 else 703 { 704 aRet.subRegExpressions = 1; 705 aRet.startOffset.realloc( 1 ); 706 aRet.startOffset[ 0 ] = nCmpIdx; 707 aRet.endOffset.realloc( 1 ); 708 aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 709 return aRet; 710 } 711 } 712 } 713 nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 714 if( nCmpIdx < nSuchIdx ) 715 return aRet; 716 nCmpIdx -= nSuchIdx; 717 } 718 return aRet; 719 } 720 721 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 722 { 723 // select the transliterated pattern string 724 const OUString& rPatternStr = 725 (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr 726 : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 727 728 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 729 // map com::sun::star::util::SearchFlags to ICU uregex.h flags 730 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 731 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 732 // REG_NOSUB is not used anywhere => not implemented 733 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 734 // LEV_RELAXED is only used for SearchAlgorithm==Approximate 735 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO 736 // probably because the transliteration flag IGNORE_CASE handles it as well. 737 if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 738 || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) 739 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 740 UErrorCode nIcuErr = U_ZERO_ERROR; 741 // assumption: transliteration didn't mangle regexp control chars 742 IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 743 #ifndef DISABLE_WORDBOUND_EMULATION 744 // for conveniance specific syntax elements of the old regex engine are emulated 745 // - by replacing \< with "word-break followed by a look-ahead word-char" 746 static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 747 static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 748 static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 749 aChevronMatcherB.reset( aIcuSearchPatStr); 750 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 751 aChevronMatcherB.reset(); 752 // - by replacing \> with "look-behind word-char followed by a word-break" 753 static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 754 static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 755 static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 756 aChevronMatcherE.reset( aIcuSearchPatStr); 757 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 758 aChevronMatcherE.reset(); 759 #endif 760 pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 761 if( nIcuErr) 762 { delete pRegexMatcher; pRegexMatcher = NULL;} 763 } 764 765 //--------------------------------------------------------------------------- 766 767 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 768 sal_Int32 startPos, sal_Int32 endPos ) 769 throw(RuntimeException) 770 { 771 SearchResult aRet; 772 aRet.subRegExpressions = 0; 773 if( !pRegexMatcher) 774 return aRet; 775 776 if( endPos > searchStr.getLength()) 777 endPos = searchStr.getLength(); 778 779 // use the ICU RegexMatcher to find the matches 780 UErrorCode nIcuErr = U_ZERO_ERROR; 781 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 782 pRegexMatcher->reset( aSearchTargetStr); 783 // search until there is a valid match 784 for(;;) 785 { 786 if( !pRegexMatcher->find( startPos, nIcuErr)) 787 return aRet; 788 789 // #i118887# ignore zero-length matches e.g. "a*" in "bc" 790 int nStartOfs = pRegexMatcher->start( nIcuErr); 791 int nEndOfs = pRegexMatcher->end( nIcuErr); 792 if( nStartOfs < nEndOfs) 793 break; 794 // try at next position if there was a zero-length match 795 if( ++startPos >= endPos) 796 return aRet; 797 } 798 799 // extract the result of the search 800 const int nGroupCount = pRegexMatcher->groupCount(); 801 aRet.subRegExpressions = nGroupCount + 1; 802 aRet.startOffset.realloc( aRet.subRegExpressions); 803 aRet.endOffset.realloc( aRet.subRegExpressions); 804 aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 805 aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 806 for( int i = 1; i <= nGroupCount; ++i) { 807 aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 808 aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 809 } 810 811 return aRet; 812 } 813 814 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 815 sal_Int32 startPos, sal_Int32 endPos ) 816 throw(RuntimeException) 817 { 818 // NOTE: for backwards search callers provide startPos/endPos inverted! 819 SearchResult aRet; 820 aRet.subRegExpressions = 0; 821 if( !pRegexMatcher) 822 return aRet; 823 824 if( startPos > searchStr.getLength()) 825 startPos = searchStr.getLength(); 826 827 // use the ICU RegexMatcher to find the matches 828 // TODO: use ICU's backward searching once it becomes available 829 // as its replacement using forward search is not as good as the real thing 830 UErrorCode nIcuErr = U_ZERO_ERROR; 831 const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 832 pRegexMatcher->reset( aSearchTargetStr); 833 if( !pRegexMatcher->find( endPos, nIcuErr)) 834 return aRet; 835 836 // find the last match 837 int nLastPos = 0; 838 int nFoundEnd = 0; 839 do { 840 nLastPos = pRegexMatcher->start( nIcuErr); 841 nFoundEnd = pRegexMatcher->end( nIcuErr); 842 if( nFoundEnd >= startPos) 843 break; 844 if( nFoundEnd == nLastPos) 845 ++nFoundEnd; 846 } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 847 848 // find last match again to get its details 849 pRegexMatcher->find( nLastPos, nIcuErr); 850 851 // fill in the details of the last match 852 const int nGroupCount = pRegexMatcher->groupCount(); 853 aRet.subRegExpressions = nGroupCount + 1; 854 aRet.startOffset.realloc( aRet.subRegExpressions); 855 aRet.endOffset.realloc( aRet.subRegExpressions); 856 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 857 aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 858 aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 859 for( int i = 1; i <= nGroupCount; ++i) { 860 aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 861 aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 862 } 863 864 return aRet; 865 } 866 867 //--------------------------------------------------------------------------- 868 869 // search for words phonetically 870 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 871 sal_Int32 startPos, sal_Int32 endPos ) 872 throw(RuntimeException) 873 { 874 SearchResult aRet; 875 aRet.subRegExpressions = 0; 876 877 if( !xBreak.is() ) 878 return aRet; 879 880 OUString aWTemp( searchStr ); 881 882 register sal_Int32 nStt, nEnd; 883 884 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 885 aSrchPara.Locale, 886 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 887 888 do 889 { 890 if( aWBnd.startPos >= endPos ) 891 break; 892 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 893 nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 894 895 if( nStt < nEnd && 896 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 897 { 898 aRet.subRegExpressions = 1; 899 aRet.startOffset.realloc( 1 ); 900 aRet.startOffset[ 0 ] = nStt; 901 aRet.endOffset.realloc( 1 ); 902 aRet.endOffset[ 0 ] = nEnd; 903 break; 904 } 905 906 nStt = nEnd - 1; 907 aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 908 WordType::ANYWORD_IGNOREWHITESPACES); 909 } while( aWBnd.startPos != aWBnd.endPos || 910 (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 911 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 912 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 913 // and nextWord() does also => don't loop forever. 914 return aRet; 915 } 916 917 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 918 sal_Int32 startPos, sal_Int32 endPos ) 919 throw(RuntimeException) 920 { 921 SearchResult aRet; 922 aRet.subRegExpressions = 0; 923 924 if( !xBreak.is() ) 925 return aRet; 926 927 OUString aWTemp( searchStr ); 928 929 register sal_Int32 nStt, nEnd; 930 931 Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 932 aSrchPara.Locale, 933 WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 934 935 do 936 { 937 if( aWBnd.endPos <= endPos ) 938 break; 939 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 940 nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 941 942 if( nStt < nEnd && 943 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 944 { 945 aRet.subRegExpressions = 1; 946 aRet.startOffset.realloc( 1 ); 947 aRet.startOffset[ 0 ] = nEnd; 948 aRet.endOffset.realloc( 1 ); 949 aRet.endOffset[ 0 ] = nStt; 950 break; 951 } 952 if( !nStt ) 953 break; 954 955 aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 956 WordType::ANYWORD_IGNOREWHITESPACES); 957 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 958 return aRet; 959 } 960 961 962 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 963 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 964 965 static OUString getServiceName_Static() 966 { 967 return OUString::createFromAscii( cSearchName ); 968 } 969 970 static OUString getImplementationName_Static() 971 { 972 return OUString::createFromAscii( cSearchImpl ); 973 } 974 975 OUString SAL_CALL 976 TextSearch::getImplementationName() 977 throw( RuntimeException ) 978 { 979 return getImplementationName_Static(); 980 } 981 982 sal_Bool SAL_CALL 983 TextSearch::supportsService(const OUString& rServiceName) 984 throw( RuntimeException ) 985 { 986 return !rServiceName.compareToAscii( cSearchName ); 987 } 988 989 Sequence< OUString > SAL_CALL 990 TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 991 { 992 Sequence< OUString > aRet(1); 993 aRet[0] = getServiceName_Static(); 994 return aRet; 995 } 996 997 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 998 SAL_CALL TextSearch_CreateInstance( 999 const ::com::sun::star::uno::Reference< 1000 ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 1001 { 1002 return ::com::sun::star::uno::Reference< 1003 ::com::sun::star::uno::XInterface >( 1004 (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 1005 } 1006 1007 extern "C" 1008 { 1009 1010 void SAL_CALL component_getImplementationEnvironment( 1011 const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 1012 { 1013 *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 1014 } 1015 1016 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 1017 void* _pServiceManager, void* /*_pRegistryKey*/ ) 1018 { 1019 void* pRet = NULL; 1020 1021 ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1022 reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1023 ( _pServiceManager ); 1024 ::com::sun::star::uno::Reference< 1025 ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1026 1027 if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1028 { 1029 ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1030 aServiceNames[0] = getServiceName_Static(); 1031 xFactory = ::cppu::createSingleFactory( 1032 pServiceManager, getImplementationName_Static(), 1033 &TextSearch_CreateInstance, aServiceNames ); 1034 } 1035 1036 if ( xFactory.is() ) 1037 { 1038 xFactory->acquire(); 1039 pRet = xFactory.get(); 1040 } 1041 1042 return pRet; 1043 } 1044 1045 } // extern "C" 1046