1449ab281SAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3449ab281SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4449ab281SAndrew Rist * or more contributor license agreements. See the NOTICE file 5449ab281SAndrew Rist * distributed with this work for additional information 6449ab281SAndrew Rist * regarding copyright ownership. The ASF licenses this file 7449ab281SAndrew Rist * to you under the Apache License, Version 2.0 (the 8449ab281SAndrew Rist * "License"); you may not use this file except in compliance 9449ab281SAndrew Rist * with the License. You may obtain a copy of the License at 10449ab281SAndrew Rist * 11449ab281SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12449ab281SAndrew Rist * 13449ab281SAndrew Rist * Unless required by applicable law or agreed to in writing, 14449ab281SAndrew Rist * software distributed under the License is distributed on an 15449ab281SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16449ab281SAndrew Rist * KIND, either express or implied. See the License for the 17449ab281SAndrew Rist * specific language governing permissions and limitations 18449ab281SAndrew Rist * under the License. 19449ab281SAndrew Rist * 20449ab281SAndrew Rist *************************************************************/ 21449ab281SAndrew Rist 22449ab281SAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 25cdf0e10cSrcweir #include "precompiled_i18npool.hxx" 26cdf0e10cSrcweir 27cdf0e10cSrcweir #include "textsearch.hxx" 28cdf0e10cSrcweir #include "levdis.hxx" 29cdf0e10cSrcweir #include <com/sun/star/lang/Locale.hpp> 30cdf0e10cSrcweir #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31cdf0e10cSrcweir #include <comphelper/processfactory.hxx> 32cdf0e10cSrcweir #include <com/sun/star/i18n/UnicodeType.hpp> 33cdf0e10cSrcweir #include <com/sun/star/util/SearchFlags.hpp> 34cdf0e10cSrcweir #include <com/sun/star/i18n/WordType.hpp> 35cdf0e10cSrcweir #include <com/sun/star/i18n/ScriptType.hpp> 36cdf0e10cSrcweir #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37cdf0e10cSrcweir #include <com/sun/star/i18n/KCharacterType.hpp> 38cdf0e10cSrcweir #include <com/sun/star/registry/XRegistryKey.hpp> 39cdf0e10cSrcweir #include <cppuhelper/factory.hxx> 40cdf0e10cSrcweir #include <cppuhelper/weak.hxx> 41cdf0e10cSrcweir 42cdf0e10cSrcweir #ifdef _MSC_VER 43cdf0e10cSrcweir // get rid of that dumb compiler warning 44cdf0e10cSrcweir // identifier was truncated to '255' characters in the debug information 45cdf0e10cSrcweir // for STL template usage, if .pdb files are to be created 46cdf0e10cSrcweir #pragma warning( disable: 4786 ) 47cdf0e10cSrcweir #endif 48cdf0e10cSrcweir 49cdf0e10cSrcweir #include <string.h> 50cdf0e10cSrcweir 51cdf0e10cSrcweir using namespace ::com::sun::star::util; 52cdf0e10cSrcweir using namespace ::com::sun::star::uno; 53cdf0e10cSrcweir using namespace ::com::sun::star::lang; 54cdf0e10cSrcweir using namespace ::com::sun::star::i18n; 55cdf0e10cSrcweir using namespace ::rtl; 56cdf0e10cSrcweir 57cdf0e10cSrcweir static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58cdf0e10cSrcweir TransliterationModules_ignoreBaFa_ja_JP | 59cdf0e10cSrcweir TransliterationModules_ignoreIterationMark_ja_JP | 60cdf0e10cSrcweir TransliterationModules_ignoreTiJi_ja_JP | 61cdf0e10cSrcweir TransliterationModules_ignoreHyuByu_ja_JP | 62cdf0e10cSrcweir TransliterationModules_ignoreSeZe_ja_JP | 63cdf0e10cSrcweir TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64cdf0e10cSrcweir TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65cdf0e10cSrcweir TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66cc450e3aSHerbert Dürr static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67*e2630f2cSHerbert Dürr static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK; 68*e2630f2cSHerbert Dürr static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 69cdf0e10cSrcweir // Above 2 transliteration is simple but need to take effect in 70cdf0e10cSrcweir // complex transliteration 71cdf0e10cSrcweir 72cdf0e10cSrcweir TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 73cdf0e10cSrcweir : xMSF( rxMSF ) 74cdf0e10cSrcweir , pJumpTable( 0 ) 75cdf0e10cSrcweir , pJumpTable2( 0 ) 76cc450e3aSHerbert Dürr , pRegexMatcher( NULL ) 77cdf0e10cSrcweir , pWLD( 0 ) 78cdf0e10cSrcweir { 79cdf0e10cSrcweir SearchOptions aOpt; 80cdf0e10cSrcweir aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 81cdf0e10cSrcweir aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 82cdf0e10cSrcweir //aOpt.Locale = ???; 83cdf0e10cSrcweir setOptions( aOpt ); 84cdf0e10cSrcweir } 85cdf0e10cSrcweir 86cdf0e10cSrcweir TextSearch::~TextSearch() 87cdf0e10cSrcweir { 88cc450e3aSHerbert Dürr delete pRegexMatcher; 89cdf0e10cSrcweir delete pWLD; 90cdf0e10cSrcweir delete pJumpTable; 91cdf0e10cSrcweir delete pJumpTable2; 92cdf0e10cSrcweir } 93cdf0e10cSrcweir 94cdf0e10cSrcweir void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 95cdf0e10cSrcweir { 96cdf0e10cSrcweir aSrchPara = rOptions; 97cdf0e10cSrcweir 98cc450e3aSHerbert Dürr delete pRegexMatcher, pRegexMatcher = NULL; 99cdf0e10cSrcweir delete pWLD, pWLD = 0; 100cdf0e10cSrcweir delete pJumpTable, pJumpTable = 0; 101cdf0e10cSrcweir delete pJumpTable2, pJumpTable2 = 0; 102cdf0e10cSrcweir 103cdf0e10cSrcweir // Create Transliteration class 104cdf0e10cSrcweir if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 105cdf0e10cSrcweir { 106cdf0e10cSrcweir if( !xTranslit.is() ) 107cdf0e10cSrcweir { 108cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 109cdf0e10cSrcweir OUString::createFromAscii( 110cdf0e10cSrcweir "com.sun.star.i18n.Transliteration")); 111cdf0e10cSrcweir if ( xI.is() ) 112cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 113cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0)) 114cdf0e10cSrcweir >>= xTranslit; 115cdf0e10cSrcweir } 116cdf0e10cSrcweir // Load transliteration module 117cdf0e10cSrcweir if( xTranslit.is() ) 118cdf0e10cSrcweir xTranslit->loadModule( 119cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 120cdf0e10cSrcweir aSrchPara.Locale); 121cdf0e10cSrcweir } 122cdf0e10cSrcweir else if( xTranslit.is() ) 123cdf0e10cSrcweir xTranslit = 0; 124cdf0e10cSrcweir 125cdf0e10cSrcweir // Create Transliteration for 2<->1, 2<->2 transliteration 126cdf0e10cSrcweir if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 127cdf0e10cSrcweir { 128cdf0e10cSrcweir if( !xTranslit2.is() ) 129cdf0e10cSrcweir { 130cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 131cdf0e10cSrcweir OUString::createFromAscii( 132cdf0e10cSrcweir "com.sun.star.i18n.Transliteration")); 133cdf0e10cSrcweir if ( xI.is() ) 134cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 135cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0)) 136cdf0e10cSrcweir >>= xTranslit2; 137cdf0e10cSrcweir } 138cdf0e10cSrcweir // Load transliteration module 139cdf0e10cSrcweir if( xTranslit2.is() ) 140cdf0e10cSrcweir xTranslit2->loadModule( 141cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 142cdf0e10cSrcweir aSrchPara.Locale); 143cdf0e10cSrcweir } 144cdf0e10cSrcweir 145cdf0e10cSrcweir if ( !xBreak.is() ) 146cdf0e10cSrcweir { 147cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 148cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 149cdf0e10cSrcweir if( xI.is() ) 150cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 151cdf0e10cSrcweir (const Reference< XBreakIterator >*)0)) 152cdf0e10cSrcweir >>= xBreak; 153cdf0e10cSrcweir } 154cdf0e10cSrcweir 155cdf0e10cSrcweir sSrchStr = aSrchPara.searchString; 156cdf0e10cSrcweir 157cc450e3aSHerbert Dürr // use transliteration here 158cc450e3aSHerbert Dürr if ( xTranslit.is() && 159cdf0e10cSrcweir aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 160cdf0e10cSrcweir sSrchStr = xTranslit->transliterateString2String( 161cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 162cdf0e10cSrcweir 163cc450e3aSHerbert Dürr if ( xTranslit2.is() && 164cdf0e10cSrcweir aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 165cdf0e10cSrcweir sSrchStr2 = xTranslit2->transliterateString2String( 166cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 167cdf0e10cSrcweir 168cdf0e10cSrcweir // When start or end of search string is a complex script type, we need to 169cdf0e10cSrcweir // make sure the result boundary is not located in the middle of cell. 170cdf0e10cSrcweir checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 171cdf0e10cSrcweir ScriptType::COMPLEX)); 172cdf0e10cSrcweir checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 173cdf0e10cSrcweir sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 174cdf0e10cSrcweir 175cc450e3aSHerbert Dürr switch( aSrchPara.algorithmType) 176cdf0e10cSrcweir { 177cc450e3aSHerbert Dürr case SearchAlgorithms_REGEXP: 178cc450e3aSHerbert Dürr fnForward = &TextSearch::RESrchFrwrd; 179cc450e3aSHerbert Dürr fnBackward = &TextSearch::RESrchBkwrd; 1807f9f793fSHerbert Dürr RESrchPrepare( aSrchPara); 1817f9f793fSHerbert Dürr break; 182cc450e3aSHerbert Dürr 183cc450e3aSHerbert Dürr case SearchAlgorithms_APPROXIMATE: 184cdf0e10cSrcweir fnForward = &TextSearch::ApproxSrchFrwrd; 185cdf0e10cSrcweir fnBackward = &TextSearch::ApproxSrchBkwrd; 186cdf0e10cSrcweir 187cdf0e10cSrcweir pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 188cdf0e10cSrcweir aSrchPara.insertedChars, aSrchPara.deletedChars, 189cdf0e10cSrcweir 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 190cdf0e10cSrcweir 191cdf0e10cSrcweir nLimit = pWLD->GetLimit(); 192cc450e3aSHerbert Dürr break; 193cc450e3aSHerbert Dürr 194cc450e3aSHerbert Dürr default: 195cdf0e10cSrcweir fnForward = &TextSearch::NSrchFrwrd; 196cdf0e10cSrcweir fnBackward = &TextSearch::NSrchBkwrd; 197cc450e3aSHerbert Dürr break; 198cdf0e10cSrcweir } 199cdf0e10cSrcweir } 200cdf0e10cSrcweir 201cdf0e10cSrcweir sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 202cdf0e10cSrcweir { 203cdf0e10cSrcweir sal_Int32 nRet = 0, nEnd = rOff.getLength(); 204cdf0e10cSrcweir while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 205cdf0e10cSrcweir return nRet; 206cdf0e10cSrcweir } 207cdf0e10cSrcweir 208cdf0e10cSrcweir sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 209cdf0e10cSrcweir throw( RuntimeException ) 210cdf0e10cSrcweir { 211cdf0e10cSrcweir sal_Int32 nDone; 212cdf0e10cSrcweir return nPos == xBreak->previousCharacters(searchStr, nPos+1, 213cdf0e10cSrcweir aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 214cdf0e10cSrcweir } 215cdf0e10cSrcweir 216cdf0e10cSrcweir SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 217cdf0e10cSrcweir throw( RuntimeException ) 218cdf0e10cSrcweir { 219cdf0e10cSrcweir SearchResult sres; 220cdf0e10cSrcweir 221cdf0e10cSrcweir OUString in_str(searchStr); 222cdf0e10cSrcweir sal_Int32 newStartPos = startPos; 223cdf0e10cSrcweir sal_Int32 newEndPos = endPos; 224cdf0e10cSrcweir 225cdf0e10cSrcweir bUsePrimarySrchStr = true; 226cdf0e10cSrcweir 227cdf0e10cSrcweir if ( xTranslit.is() ) 228cdf0e10cSrcweir { 229cdf0e10cSrcweir // apply normal transliteration (1<->1, 1<->0) 230cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 231cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 232cdf0e10cSrcweir 233cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected! 234cdf0e10cSrcweir if( startPos ) 235cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos ); 236cdf0e10cSrcweir 237cdf0e10cSrcweir if( endPos < searchStr.getLength() ) 238cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos ); 239cdf0e10cSrcweir else 240cdf0e10cSrcweir newEndPos = in_str.getLength(); 241cdf0e10cSrcweir 242cdf0e10cSrcweir sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 243cdf0e10cSrcweir 244cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 245cdf0e10cSrcweir { 246cdf0e10cSrcweir if (sres.startOffset[k]) 247cdf0e10cSrcweir sres.startOffset[k] = offset[sres.startOffset[k]]; 248cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return 249cdf0e10cSrcweir // the position of the next character - return the 250cdf0e10cSrcweir // next position behind the last found character! 251cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!! 252cdf0e10cSrcweir if (sres.endOffset[k]) 253cdf0e10cSrcweir sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 254cdf0e10cSrcweir } 255cdf0e10cSrcweir } 256cdf0e10cSrcweir else 257cdf0e10cSrcweir { 258cdf0e10cSrcweir sres = (this->*fnForward)( in_str, startPos, endPos ); 259cdf0e10cSrcweir } 260cdf0e10cSrcweir 261cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 262cdf0e10cSrcweir { 263cdf0e10cSrcweir SearchResult sres2; 264cdf0e10cSrcweir 265cdf0e10cSrcweir in_str = OUString(searchStr); 266cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 267cdf0e10cSrcweir 268cdf0e10cSrcweir in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 269cdf0e10cSrcweir 270cdf0e10cSrcweir if( startPos ) 271cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos ); 272cdf0e10cSrcweir 273cdf0e10cSrcweir if( endPos < searchStr.getLength() ) 274cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos ); 275cdf0e10cSrcweir else 276cdf0e10cSrcweir endPos = in_str.getLength(); 277cdf0e10cSrcweir 278cdf0e10cSrcweir bUsePrimarySrchStr = false; 279cdf0e10cSrcweir sres2 = (this->*fnForward)( in_str, startPos, endPos ); 280cdf0e10cSrcweir 281cdf0e10cSrcweir for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 282cdf0e10cSrcweir { 283cdf0e10cSrcweir if (sres2.startOffset[k]) 284cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 285cdf0e10cSrcweir if (sres2.endOffset[k]) 286cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 287cdf0e10cSrcweir } 288cdf0e10cSrcweir 289cdf0e10cSrcweir // pick first and long one 290cdf0e10cSrcweir if ( sres.subRegExpressions == 0) 291cdf0e10cSrcweir return sres2; 292cdf0e10cSrcweir if ( sres2.subRegExpressions == 1) 293cdf0e10cSrcweir { 294cdf0e10cSrcweir if ( sres.startOffset[0] > sres2.startOffset[0]) 295cdf0e10cSrcweir return sres2; 296cdf0e10cSrcweir else if ( sres.startOffset[0] == sres2.startOffset[0] && 297cdf0e10cSrcweir sres.endOffset[0] < sres2.endOffset[0]) 298cdf0e10cSrcweir return sres2; 299cdf0e10cSrcweir } 300cdf0e10cSrcweir } 301cdf0e10cSrcweir 302cdf0e10cSrcweir return sres; 303cdf0e10cSrcweir } 304cdf0e10cSrcweir 305cdf0e10cSrcweir SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 306cdf0e10cSrcweir throw(RuntimeException) 307cdf0e10cSrcweir { 308cdf0e10cSrcweir SearchResult sres; 309cdf0e10cSrcweir 310cdf0e10cSrcweir OUString in_str(searchStr); 311cdf0e10cSrcweir sal_Int32 newStartPos = startPos; 312cdf0e10cSrcweir sal_Int32 newEndPos = endPos; 313cdf0e10cSrcweir 314cdf0e10cSrcweir bUsePrimarySrchStr = true; 315cdf0e10cSrcweir 316cdf0e10cSrcweir if ( xTranslit.is() ) 317cdf0e10cSrcweir { 318cdf0e10cSrcweir // apply only simple 1<->1 transliteration here 319cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 320cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 321cdf0e10cSrcweir 322cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected! 323cdf0e10cSrcweir if( startPos < searchStr.getLength() ) 324cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos ); 325cdf0e10cSrcweir else 326cdf0e10cSrcweir newStartPos = in_str.getLength(); 327cdf0e10cSrcweir 328cdf0e10cSrcweir if( endPos ) 329cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos ); 330cdf0e10cSrcweir 331cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 332cdf0e10cSrcweir 333cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 334cdf0e10cSrcweir { 335cdf0e10cSrcweir if (sres.startOffset[k]) 336cdf0e10cSrcweir sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 337cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return 338cdf0e10cSrcweir // the position of the next character - return the 339cdf0e10cSrcweir // next position behind the last found character! 340cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!! 341cdf0e10cSrcweir if (sres.endOffset[k]) 342cdf0e10cSrcweir sres.endOffset[k] = offset[sres.endOffset[k]]; 343cdf0e10cSrcweir } 344cdf0e10cSrcweir } 345cdf0e10cSrcweir else 346cdf0e10cSrcweir { 347cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, startPos, endPos ); 348cdf0e10cSrcweir } 349cdf0e10cSrcweir 350cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 351cdf0e10cSrcweir { 352cdf0e10cSrcweir SearchResult sres2; 353cdf0e10cSrcweir 354cdf0e10cSrcweir in_str = OUString(searchStr); 355cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 356cdf0e10cSrcweir 357cdf0e10cSrcweir in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 358cdf0e10cSrcweir 359cdf0e10cSrcweir if( startPos < searchStr.getLength() ) 360cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos ); 361cdf0e10cSrcweir else 362cdf0e10cSrcweir startPos = in_str.getLength(); 363cdf0e10cSrcweir 364cdf0e10cSrcweir if( endPos ) 365cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos ); 366cdf0e10cSrcweir 367cdf0e10cSrcweir bUsePrimarySrchStr = false; 368cdf0e10cSrcweir sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 369cdf0e10cSrcweir 370cdf0e10cSrcweir for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 371cdf0e10cSrcweir { 372cdf0e10cSrcweir if (sres2.startOffset[k]) 373cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 374cdf0e10cSrcweir if (sres2.endOffset[k]) 375cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 376cdf0e10cSrcweir } 377cdf0e10cSrcweir 378cdf0e10cSrcweir // pick last and long one 379cdf0e10cSrcweir if ( sres.subRegExpressions == 0 ) 380cdf0e10cSrcweir return sres2; 381cdf0e10cSrcweir if ( sres2.subRegExpressions == 1 ) 382cdf0e10cSrcweir { 383cdf0e10cSrcweir if ( sres.startOffset[0] < sres2.startOffset[0] ) 384cdf0e10cSrcweir return sres2; 385cdf0e10cSrcweir if ( sres.startOffset[0] == sres2.startOffset[0] && 386cdf0e10cSrcweir sres.endOffset[0] > sres2.endOffset[0] ) 387cdf0e10cSrcweir return sres2; 388cdf0e10cSrcweir } 389cdf0e10cSrcweir } 390cdf0e10cSrcweir 391cdf0e10cSrcweir return sres; 392cdf0e10cSrcweir } 393cdf0e10cSrcweir 394cc450e3aSHerbert Dürr //--------------------------------------------------------------------- 395cdf0e10cSrcweir 396cdf0e10cSrcweir bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 397cdf0e10cSrcweir { 398cdf0e10cSrcweir bool bRet = 1; 399cdf0e10cSrcweir if( '\x7f' != rStr[nPos]) 400cdf0e10cSrcweir { 401cdf0e10cSrcweir if ( !xCharClass.is() ) 402cdf0e10cSrcweir { 403cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 404cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 405cdf0e10cSrcweir if( xI.is() ) 406cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 407cdf0e10cSrcweir (const Reference< XCharacterClassification >*)0)) 408cdf0e10cSrcweir >>= xCharClass; 409cdf0e10cSrcweir } 410cdf0e10cSrcweir if ( xCharClass.is() ) 411cdf0e10cSrcweir { 412cdf0e10cSrcweir sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 413cdf0e10cSrcweir aSrchPara.Locale ); 414cdf0e10cSrcweir if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 415cdf0e10cSrcweir KCharacterType::LETTER ) & nCType ) ) 416cdf0e10cSrcweir bRet = 0; 417cdf0e10cSrcweir } 418cdf0e10cSrcweir } 419cdf0e10cSrcweir return bRet; 420cdf0e10cSrcweir } 421cdf0e10cSrcweir 422cc450e3aSHerbert Dürr // --------- helper methods for Boyer-Moore like text searching ---------- 423cc450e3aSHerbert Dürr // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 424cdf0e10cSrcweir 425cdf0e10cSrcweir void TextSearch::MakeForwardTab() 426cdf0e10cSrcweir { 427cdf0e10cSrcweir // create the jumptable for the search text 428cdf0e10cSrcweir if( pJumpTable ) 429cdf0e10cSrcweir { 430cdf0e10cSrcweir if( bIsForwardTab ) 431cdf0e10cSrcweir return ; // the jumpTable is ok 432cdf0e10cSrcweir delete pJumpTable; 433cdf0e10cSrcweir } 434cdf0e10cSrcweir bIsForwardTab = true; 435cdf0e10cSrcweir 436cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength(); 437cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable; 438cdf0e10cSrcweir 439cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n ) 440cdf0e10cSrcweir { 441cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n]; 442cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1; 443cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 444cdf0e10cSrcweir 445cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 446cdf0e10cSrcweir pJumpTable->insert( aEntry ); 447cdf0e10cSrcweir if ( !aPair.second ) 448cdf0e10cSrcweir (*(aPair.first)).second = nDiff; 449cdf0e10cSrcweir } 450cdf0e10cSrcweir } 451cdf0e10cSrcweir 452cdf0e10cSrcweir void TextSearch::MakeForwardTab2() 453cdf0e10cSrcweir { 454cdf0e10cSrcweir // create the jumptable for the search text 455cdf0e10cSrcweir if( pJumpTable2 ) 456cdf0e10cSrcweir { 457cdf0e10cSrcweir if( bIsForwardTab ) 458cdf0e10cSrcweir return ; // the jumpTable is ok 459cdf0e10cSrcweir delete pJumpTable2; 460cdf0e10cSrcweir } 461cdf0e10cSrcweir bIsForwardTab = true; 462cdf0e10cSrcweir 463cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength(); 464cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable; 465cdf0e10cSrcweir 466cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n ) 467cdf0e10cSrcweir { 468cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n]; 469cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1; 470cdf0e10cSrcweir 471cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 472cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 473cdf0e10cSrcweir pJumpTable2->insert( aEntry ); 474cdf0e10cSrcweir if ( !aPair.second ) 475cdf0e10cSrcweir (*(aPair.first)).second = nDiff; 476cdf0e10cSrcweir } 477cdf0e10cSrcweir } 478cdf0e10cSrcweir 479cdf0e10cSrcweir void TextSearch::MakeBackwardTab() 480cdf0e10cSrcweir { 481cdf0e10cSrcweir // create the jumptable for the search text 482cdf0e10cSrcweir if( pJumpTable ) 483cdf0e10cSrcweir { 484cdf0e10cSrcweir if( !bIsForwardTab ) 485cdf0e10cSrcweir return ; // the jumpTable is ok 486cdf0e10cSrcweir delete pJumpTable; 487cdf0e10cSrcweir } 488cdf0e10cSrcweir bIsForwardTab = false; 489cdf0e10cSrcweir 490cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength(); 491cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable; 492cdf0e10cSrcweir 493cdf0e10cSrcweir for( n = nLen-1; n > 0; --n ) 494cdf0e10cSrcweir { 495cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n]; 496cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n ); 497cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 498cdf0e10cSrcweir pJumpTable->insert( aEntry ); 499cdf0e10cSrcweir if ( !aPair.second ) 500cdf0e10cSrcweir (*(aPair.first)).second = n; 501cdf0e10cSrcweir } 502cdf0e10cSrcweir } 503cdf0e10cSrcweir 504cdf0e10cSrcweir void TextSearch::MakeBackwardTab2() 505cdf0e10cSrcweir { 506cdf0e10cSrcweir // create the jumptable for the search text 507cdf0e10cSrcweir if( pJumpTable2 ) 508cdf0e10cSrcweir { 509cdf0e10cSrcweir if( !bIsForwardTab ) 510cdf0e10cSrcweir return ; // the jumpTable is ok 511cdf0e10cSrcweir delete pJumpTable2; 512cdf0e10cSrcweir } 513cdf0e10cSrcweir bIsForwardTab = false; 514cdf0e10cSrcweir 515cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength(); 516cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable; 517cdf0e10cSrcweir 518cdf0e10cSrcweir for( n = nLen-1; n > 0; --n ) 519cdf0e10cSrcweir { 520cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n]; 521cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n ); 522cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 523cdf0e10cSrcweir pJumpTable2->insert( aEntry ); 524cdf0e10cSrcweir if ( !aPair.second ) 525cdf0e10cSrcweir (*(aPair.first)).second = n; 526cdf0e10cSrcweir } 527cdf0e10cSrcweir } 528cdf0e10cSrcweir 529cdf0e10cSrcweir sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 530cdf0e10cSrcweir { 531cdf0e10cSrcweir TextSearchJumpTable *pJump; 532cdf0e10cSrcweir OUString sSearchKey; 533cdf0e10cSrcweir 534cdf0e10cSrcweir if ( bUsePrimarySrchStr ) { 535cdf0e10cSrcweir pJump = pJumpTable; 536cdf0e10cSrcweir sSearchKey = sSrchStr; 537cdf0e10cSrcweir } else { 538cdf0e10cSrcweir pJump = pJumpTable2; 539cdf0e10cSrcweir sSearchKey = sSrchStr2; 540cdf0e10cSrcweir } 541cdf0e10cSrcweir 542cdf0e10cSrcweir TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 543cdf0e10cSrcweir if ( iLook == pJump->end() ) 544cdf0e10cSrcweir return sSearchKey.getLength(); 545cdf0e10cSrcweir return (*iLook).second; 546cdf0e10cSrcweir } 547cdf0e10cSrcweir 548cdf0e10cSrcweir 549cdf0e10cSrcweir // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 550cdf0e10cSrcweir SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 551cdf0e10cSrcweir throw(RuntimeException) 552cdf0e10cSrcweir { 553cdf0e10cSrcweir SearchResult aRet; 554cdf0e10cSrcweir aRet.subRegExpressions = 0; 555cdf0e10cSrcweir 556cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 557cdf0e10cSrcweir 558cdf0e10cSrcweir OUString aStr( searchStr ); 559cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength(); 560cdf0e10cSrcweir sal_Int32 nEnde = endPos; 561cdf0e10cSrcweir if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 562cdf0e10cSrcweir return aRet; 563cdf0e10cSrcweir 564cdf0e10cSrcweir 565cdf0e10cSrcweir if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 566cdf0e10cSrcweir return aRet; 567cdf0e10cSrcweir 568cdf0e10cSrcweir nEnde -= sSearchKey.getLength(); 569cdf0e10cSrcweir 570cdf0e10cSrcweir if (bUsePrimarySrchStr) 571cdf0e10cSrcweir MakeForwardTab(); // create the jumptable 572cdf0e10cSrcweir else 573cdf0e10cSrcweir MakeForwardTab2(); 574cdf0e10cSrcweir 575cdf0e10cSrcweir for (sal_Int32 nCmpIdx = startPos; // start position for the search 576cdf0e10cSrcweir nCmpIdx <= nEnde; 577cdf0e10cSrcweir nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 578cdf0e10cSrcweir { 579cdf0e10cSrcweir // if the match would be the completed cells, skip it. 580cdf0e10cSrcweir if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 581cdf0e10cSrcweir && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 582cdf0e10cSrcweir continue; 583cdf0e10cSrcweir 584cdf0e10cSrcweir nSuchIdx = sSearchKey.getLength() - 1; 585cdf0e10cSrcweir while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 586cdf0e10cSrcweir { 587cdf0e10cSrcweir if( nSuchIdx == 0 ) 588cdf0e10cSrcweir { 589cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 590cdf0e10cSrcweir { 591cdf0e10cSrcweir sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 592cdf0e10cSrcweir bool bAtStart = !nCmpIdx; 593cdf0e10cSrcweir bool bAtEnd = nFndEnd == endPos; 594cdf0e10cSrcweir bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 595cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 596cdf0e10cSrcweir // * 1 -> only one word in the paragraph 597cdf0e10cSrcweir // * 2 -> at begin of paragraph 598cdf0e10cSrcweir // * 3 -> at end of paragraph 599cdf0e10cSrcweir // * 4 -> inside the paragraph 600cdf0e10cSrcweir if( !( ( bAtStart && bAtEnd ) || // 1 601cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2 602cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3 603cdf0e10cSrcweir ( bDelimBefore && bDelimBehind ))) // 4 604cdf0e10cSrcweir break; 605cdf0e10cSrcweir } 606cdf0e10cSrcweir 607cdf0e10cSrcweir aRet.subRegExpressions = 1; 608cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 609cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 610cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 611cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 612cdf0e10cSrcweir 613cdf0e10cSrcweir return aRet; 614cdf0e10cSrcweir } 615cdf0e10cSrcweir else 616cdf0e10cSrcweir nSuchIdx--; 617cdf0e10cSrcweir } 618cdf0e10cSrcweir } 619cdf0e10cSrcweir return aRet; 620cdf0e10cSrcweir } 621cdf0e10cSrcweir 622cdf0e10cSrcweir SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 623cdf0e10cSrcweir throw(RuntimeException) 624cdf0e10cSrcweir { 625cdf0e10cSrcweir SearchResult aRet; 626cdf0e10cSrcweir aRet.subRegExpressions = 0; 627cdf0e10cSrcweir 628cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 629cdf0e10cSrcweir 630cdf0e10cSrcweir OUString aStr( searchStr ); 631cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength(); 632cdf0e10cSrcweir sal_Int32 nEnde = endPos; 633cdf0e10cSrcweir if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 634cdf0e10cSrcweir return aRet; 635cdf0e10cSrcweir 636cdf0e10cSrcweir if (bUsePrimarySrchStr) 637cdf0e10cSrcweir MakeBackwardTab(); // create the jumptable 638cdf0e10cSrcweir else 639cdf0e10cSrcweir MakeBackwardTab2(); 640cdf0e10cSrcweir 641cdf0e10cSrcweir if( nEnde == nSuchIdx ) // end position for the search 642cdf0e10cSrcweir nEnde = sSearchKey.getLength(); 643cdf0e10cSrcweir else 644cdf0e10cSrcweir nEnde += sSearchKey.getLength(); 645cdf0e10cSrcweir 646cdf0e10cSrcweir sal_Int32 nCmpIdx = startPos; // start position for the search 647cdf0e10cSrcweir 648cdf0e10cSrcweir while (nCmpIdx >= nEnde) 649cdf0e10cSrcweir { 650cdf0e10cSrcweir // if the match would be the completed cells, skip it. 651cdf0e10cSrcweir if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 652cdf0e10cSrcweir sSearchKey.getLength() )) && (!checkCTLEnd || 653cdf0e10cSrcweir isCellStart( aStr, nCmpIdx))) 654cdf0e10cSrcweir { 655cdf0e10cSrcweir nSuchIdx = 0; 656cdf0e10cSrcweir while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 657cdf0e10cSrcweir aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 658cdf0e10cSrcweir nSuchIdx++; 659cdf0e10cSrcweir if( nSuchIdx >= sSearchKey.getLength() ) 660cdf0e10cSrcweir { 661cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 662cdf0e10cSrcweir { 663cdf0e10cSrcweir sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 664cdf0e10cSrcweir bool bAtStart = !nFndStt; 665cdf0e10cSrcweir bool bAtEnd = nCmpIdx == startPos; 666cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 667cdf0e10cSrcweir bool bDelimBefore = bAtStart || // begin of paragraph 668cdf0e10cSrcweir IsDelimiter( aStr, nFndStt-1 ); 669cdf0e10cSrcweir // * 1 -> only one word in the paragraph 670cdf0e10cSrcweir // * 2 -> at begin of paragraph 671cdf0e10cSrcweir // * 3 -> at end of paragraph 672cdf0e10cSrcweir // * 4 -> inside the paragraph 673cdf0e10cSrcweir if( ( bAtStart && bAtEnd ) || // 1 674cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2 675cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3 676cdf0e10cSrcweir ( bDelimBefore && bDelimBehind )) // 4 677cdf0e10cSrcweir { 678cdf0e10cSrcweir aRet.subRegExpressions = 1; 679cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 680cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 681cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 682cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 683cdf0e10cSrcweir return aRet; 684cdf0e10cSrcweir } 685cdf0e10cSrcweir } 686cdf0e10cSrcweir else 687cdf0e10cSrcweir { 688cdf0e10cSrcweir aRet.subRegExpressions = 1; 689cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 690cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 691cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 692cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 693cdf0e10cSrcweir return aRet; 694cdf0e10cSrcweir } 695cdf0e10cSrcweir } 696cdf0e10cSrcweir } 697cdf0e10cSrcweir nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 698cdf0e10cSrcweir if( nCmpIdx < nSuchIdx ) 699cdf0e10cSrcweir return aRet; 700cdf0e10cSrcweir nCmpIdx -= nSuchIdx; 701cdf0e10cSrcweir } 702cdf0e10cSrcweir return aRet; 703cdf0e10cSrcweir } 704cdf0e10cSrcweir 7057f9f793fSHerbert Dürr void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 7067f9f793fSHerbert Dürr { 7077f9f793fSHerbert Dürr // select the transliterated pattern string 7087f9f793fSHerbert Dürr const OUString& rPatternStr = 709*e2630f2cSHerbert Dürr (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr 7107f9f793fSHerbert Dürr : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 7117f9f793fSHerbert Dürr 7127c5e76a7SHerbert Dürr sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 7137f9f793fSHerbert Dürr // map com::sun::star::util::SearchFlags to ICU uregex.h flags 7147f9f793fSHerbert Dürr // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 7157f9f793fSHerbert Dürr // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 7167f9f793fSHerbert Dürr // REG_NOSUB is not used anywhere => not implemented 7177f9f793fSHerbert Dürr // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 7187f9f793fSHerbert Dürr // LEV_RELAXED is only used for SearchAlgorithm==Approximate 71922c9c6f7SHerbert Dürr // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO 72022c9c6f7SHerbert Dürr // probably because the transliteration flag IGNORE_CASE handles it as well. 72122c9c6f7SHerbert Dürr if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 72222c9c6f7SHerbert Dürr || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) 7237f9f793fSHerbert Dürr nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 7247f9f793fSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 7257f9f793fSHerbert Dürr // assumption: transliteration didn't mangle regexp control chars 72603c97e34SYuri Dario IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 727ee131020SHerbert Dürr #ifndef DISABLE_WORDBOUND_EMULATION 7287f9f793fSHerbert Dürr // for conveniance specific syntax elements of the old regex engine are emulated 7296a7366bcSHerbert Dürr // - by replacing \< with "word-break followed by a look-ahead word-char" 7306a7366bcSHerbert Dürr static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 7316a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 7326a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 7336a7366bcSHerbert Dürr aChevronMatcherB.reset( aIcuSearchPatStr); 7346a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 7356a7366bcSHerbert Dürr aChevronMatcherB.reset(); 7366a7366bcSHerbert Dürr // - by replacing \> with "look-behind word-char followed by a word-break" 7376a7366bcSHerbert Dürr static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 7386a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 7396a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 7406a7366bcSHerbert Dürr aChevronMatcherE.reset( aIcuSearchPatStr); 7416a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 7426a7366bcSHerbert Dürr aChevronMatcherE.reset(); 7437f9f793fSHerbert Dürr #endif 7447f9f793fSHerbert Dürr pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 7457f9f793fSHerbert Dürr if( nIcuErr) 7467f9f793fSHerbert Dürr { delete pRegexMatcher; pRegexMatcher = NULL;} 7477f9f793fSHerbert Dürr } 7487f9f793fSHerbert Dürr 749cdf0e10cSrcweir //--------------------------------------------------------------------------- 750cdf0e10cSrcweir 751cdf0e10cSrcweir SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 752cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 753cdf0e10cSrcweir throw(RuntimeException) 754cdf0e10cSrcweir { 755cc450e3aSHerbert Dürr SearchResult aRet; 756cc450e3aSHerbert Dürr aRet.subRegExpressions = 0; 757cc450e3aSHerbert Dürr if( !pRegexMatcher) 758cc450e3aSHerbert Dürr return aRet; 75919ee98b9SHerbert Dürr 760cc450e3aSHerbert Dürr if( endPos > searchStr.getLength()) 761cc450e3aSHerbert Dürr endPos = searchStr.getLength(); 762cc450e3aSHerbert Dürr 763cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches 764cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 76519716b0aSHerbert Dürr const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 766cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr); 76716b8677bSHerbert Dürr // search until there is a valid match 76816b8677bSHerbert Dürr for(;;) 76916b8677bSHerbert Dürr { 77016b8677bSHerbert Dürr if( !pRegexMatcher->find( startPos, nIcuErr)) 77116b8677bSHerbert Dürr return aRet; 77216b8677bSHerbert Dürr 77316b8677bSHerbert Dürr // #i118887# ignore zero-length matches e.g. "a*" in "bc" 77416b8677bSHerbert Dürr int nStartOfs = pRegexMatcher->start( nIcuErr); 77516b8677bSHerbert Dürr int nEndOfs = pRegexMatcher->end( nIcuErr); 77616b8677bSHerbert Dürr if( nStartOfs < nEndOfs) 77716b8677bSHerbert Dürr break; 77816b8677bSHerbert Dürr // try at next position if there was a zero-length match 77916b8677bSHerbert Dürr if( ++startPos >= endPos) 78016b8677bSHerbert Dürr return aRet; 78116b8677bSHerbert Dürr } 782cc450e3aSHerbert Dürr 78316b8677bSHerbert Dürr // extract the result of the search 7840c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount(); 7850c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1; 786cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions); 787cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions); 788cc450e3aSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 789cc450e3aSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 7900c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) { 7910c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 7920c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 7930c7ce76dSHerbert Dürr } 794cc450e3aSHerbert Dürr 795cc450e3aSHerbert Dürr return aRet; 796cdf0e10cSrcweir } 797cdf0e10cSrcweir 798cdf0e10cSrcweir SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 799cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 800cdf0e10cSrcweir throw(RuntimeException) 801cdf0e10cSrcweir { 802cc450e3aSHerbert Dürr // NOTE: for backwards search callers provide startPos/endPos inverted! 803cc450e3aSHerbert Dürr SearchResult aRet; 804cc450e3aSHerbert Dürr aRet.subRegExpressions = 0; 805cc450e3aSHerbert Dürr if( !pRegexMatcher) 806cc450e3aSHerbert Dürr return aRet; 80719ee98b9SHerbert Dürr 808cc450e3aSHerbert Dürr if( startPos > searchStr.getLength()) 809cc450e3aSHerbert Dürr startPos = searchStr.getLength(); 810cc450e3aSHerbert Dürr 811cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches 812cc450e3aSHerbert Dürr // TODO: use ICU's backward searching once it becomes available 8130c7ce76dSHerbert Dürr // as its replacement using forward search is not as good as the real thing 814cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 81503c97e34SYuri Dario const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 816cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr); 817cc450e3aSHerbert Dürr if( !pRegexMatcher->find( endPos, nIcuErr)) 818cc450e3aSHerbert Dürr return aRet; 819cc450e3aSHerbert Dürr 8200c7ce76dSHerbert Dürr // find the last match 8210c7ce76dSHerbert Dürr int nLastPos = 0; 8222c1e93e7SHerbert Dürr int nFoundEnd = 0; 8230c7ce76dSHerbert Dürr do { 8240c7ce76dSHerbert Dürr nLastPos = pRegexMatcher->start( nIcuErr); 8252c1e93e7SHerbert Dürr nFoundEnd = pRegexMatcher->end( nIcuErr); 8262c1e93e7SHerbert Dürr if( nFoundEnd >= startPos) 8272c1e93e7SHerbert Dürr break; 8282c1e93e7SHerbert Dürr if( nFoundEnd == nLastPos) 8292c1e93e7SHerbert Dürr ++nFoundEnd; 8302c1e93e7SHerbert Dürr } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 8310c7ce76dSHerbert Dürr 8320c7ce76dSHerbert Dürr // find last match again to get its details 8330c7ce76dSHerbert Dürr pRegexMatcher->find( nLastPos, nIcuErr); 8340c7ce76dSHerbert Dürr 8350c7ce76dSHerbert Dürr // fill in the details of the last match 8360c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount(); 8370c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1; 838cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions); 839cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions); 8400c7ce76dSHerbert Dürr // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 8410c7ce76dSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 8420c7ce76dSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 8430c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) { 8440c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 8450c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 8460c7ce76dSHerbert Dürr } 847cc450e3aSHerbert Dürr 848cc450e3aSHerbert Dürr return aRet; 849cdf0e10cSrcweir } 850cdf0e10cSrcweir 851cc450e3aSHerbert Dürr //--------------------------------------------------------------------------- 852cc450e3aSHerbert Dürr 853cc450e3aSHerbert Dürr // search for words phonetically 854cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 855cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 856cdf0e10cSrcweir throw(RuntimeException) 857cdf0e10cSrcweir { 858cdf0e10cSrcweir SearchResult aRet; 859cdf0e10cSrcweir aRet.subRegExpressions = 0; 860cdf0e10cSrcweir 861cdf0e10cSrcweir if( !xBreak.is() ) 862cdf0e10cSrcweir return aRet; 863cdf0e10cSrcweir 864cdf0e10cSrcweir OUString aWTemp( searchStr ); 865cdf0e10cSrcweir 866cdf0e10cSrcweir register sal_Int32 nStt, nEnd; 867cdf0e10cSrcweir 868cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 869cdf0e10cSrcweir aSrchPara.Locale, 870cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 871cdf0e10cSrcweir 872cdf0e10cSrcweir do 873cdf0e10cSrcweir { 874cdf0e10cSrcweir if( aWBnd.startPos >= endPos ) 875cdf0e10cSrcweir break; 876cdf0e10cSrcweir nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 877cdf0e10cSrcweir nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 878cdf0e10cSrcweir 879cdf0e10cSrcweir if( nStt < nEnd && 880cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 881cdf0e10cSrcweir { 882cdf0e10cSrcweir aRet.subRegExpressions = 1; 883cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 884cdf0e10cSrcweir aRet.startOffset[ 0 ] = nStt; 885cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 886cdf0e10cSrcweir aRet.endOffset[ 0 ] = nEnd; 887cdf0e10cSrcweir break; 888cdf0e10cSrcweir } 889cdf0e10cSrcweir 890cdf0e10cSrcweir nStt = nEnd - 1; 891cdf0e10cSrcweir aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 892cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES); 893cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos || 894cdf0e10cSrcweir (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 895cdf0e10cSrcweir // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 896cdf0e10cSrcweir // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 897cdf0e10cSrcweir // and nextWord() does also => don't loop forever. 898cdf0e10cSrcweir return aRet; 899cdf0e10cSrcweir } 900cdf0e10cSrcweir 901cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 902cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 903cdf0e10cSrcweir throw(RuntimeException) 904cdf0e10cSrcweir { 905cdf0e10cSrcweir SearchResult aRet; 906cdf0e10cSrcweir aRet.subRegExpressions = 0; 907cdf0e10cSrcweir 908cdf0e10cSrcweir if( !xBreak.is() ) 909cdf0e10cSrcweir return aRet; 910cdf0e10cSrcweir 911cdf0e10cSrcweir OUString aWTemp( searchStr ); 912cdf0e10cSrcweir 913cdf0e10cSrcweir register sal_Int32 nStt, nEnd; 914cdf0e10cSrcweir 915cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 916cdf0e10cSrcweir aSrchPara.Locale, 917cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 918cdf0e10cSrcweir 919cdf0e10cSrcweir do 920cdf0e10cSrcweir { 921cdf0e10cSrcweir if( aWBnd.endPos <= endPos ) 922cdf0e10cSrcweir break; 923cdf0e10cSrcweir nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 924cdf0e10cSrcweir nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 925cdf0e10cSrcweir 926cdf0e10cSrcweir if( nStt < nEnd && 927cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 928cdf0e10cSrcweir { 929cdf0e10cSrcweir aRet.subRegExpressions = 1; 930cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 931cdf0e10cSrcweir aRet.startOffset[ 0 ] = nEnd; 932cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 933cdf0e10cSrcweir aRet.endOffset[ 0 ] = nStt; 934cdf0e10cSrcweir break; 935cdf0e10cSrcweir } 936cdf0e10cSrcweir if( !nStt ) 937cdf0e10cSrcweir break; 938cdf0e10cSrcweir 939cdf0e10cSrcweir aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 940cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES); 941cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 942cdf0e10cSrcweir return aRet; 943cdf0e10cSrcweir } 944cdf0e10cSrcweir 945cdf0e10cSrcweir 946cdf0e10cSrcweir static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 947cdf0e10cSrcweir static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 948cdf0e10cSrcweir 949cdf0e10cSrcweir static OUString getServiceName_Static() 950cdf0e10cSrcweir { 951cdf0e10cSrcweir return OUString::createFromAscii( cSearchName ); 952cdf0e10cSrcweir } 953cdf0e10cSrcweir 954cdf0e10cSrcweir static OUString getImplementationName_Static() 955cdf0e10cSrcweir { 956cdf0e10cSrcweir return OUString::createFromAscii( cSearchImpl ); 957cdf0e10cSrcweir } 958cdf0e10cSrcweir 959cdf0e10cSrcweir OUString SAL_CALL 960cdf0e10cSrcweir TextSearch::getImplementationName() 961cdf0e10cSrcweir throw( RuntimeException ) 962cdf0e10cSrcweir { 963cdf0e10cSrcweir return getImplementationName_Static(); 964cdf0e10cSrcweir } 965cdf0e10cSrcweir 966cdf0e10cSrcweir sal_Bool SAL_CALL 967cdf0e10cSrcweir TextSearch::supportsService(const OUString& rServiceName) 968cdf0e10cSrcweir throw( RuntimeException ) 969cdf0e10cSrcweir { 970cdf0e10cSrcweir return !rServiceName.compareToAscii( cSearchName ); 971cdf0e10cSrcweir } 972cdf0e10cSrcweir 973cdf0e10cSrcweir Sequence< OUString > SAL_CALL 974cdf0e10cSrcweir TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 975cdf0e10cSrcweir { 976cdf0e10cSrcweir Sequence< OUString > aRet(1); 977cdf0e10cSrcweir aRet[0] = getServiceName_Static(); 978cdf0e10cSrcweir return aRet; 979cdf0e10cSrcweir } 980cdf0e10cSrcweir 981cdf0e10cSrcweir ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 982cdf0e10cSrcweir SAL_CALL TextSearch_CreateInstance( 983cdf0e10cSrcweir const ::com::sun::star::uno::Reference< 984cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 985cdf0e10cSrcweir { 986cdf0e10cSrcweir return ::com::sun::star::uno::Reference< 987cdf0e10cSrcweir ::com::sun::star::uno::XInterface >( 988cdf0e10cSrcweir (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 989cdf0e10cSrcweir } 990cdf0e10cSrcweir 991cdf0e10cSrcweir extern "C" 992cdf0e10cSrcweir { 993cdf0e10cSrcweir 994cdf0e10cSrcweir void SAL_CALL component_getImplementationEnvironment( 995cdf0e10cSrcweir const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 996cdf0e10cSrcweir { 997cdf0e10cSrcweir *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 998cdf0e10cSrcweir } 999cdf0e10cSrcweir 1000cdf0e10cSrcweir void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 1001cdf0e10cSrcweir void* _pServiceManager, void* /*_pRegistryKey*/ ) 1002cdf0e10cSrcweir { 1003cdf0e10cSrcweir void* pRet = NULL; 1004cdf0e10cSrcweir 1005cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1006cdf0e10cSrcweir reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1007cdf0e10cSrcweir ( _pServiceManager ); 1008cdf0e10cSrcweir ::com::sun::star::uno::Reference< 1009cdf0e10cSrcweir ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1010cdf0e10cSrcweir 1011cdf0e10cSrcweir if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1012cdf0e10cSrcweir { 1013cdf0e10cSrcweir ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1014cdf0e10cSrcweir aServiceNames[0] = getServiceName_Static(); 1015cdf0e10cSrcweir xFactory = ::cppu::createSingleFactory( 1016cdf0e10cSrcweir pServiceManager, getImplementationName_Static(), 1017cdf0e10cSrcweir &TextSearch_CreateInstance, aServiceNames ); 1018cdf0e10cSrcweir } 1019cdf0e10cSrcweir 1020cdf0e10cSrcweir if ( xFactory.is() ) 1021cdf0e10cSrcweir { 1022cdf0e10cSrcweir xFactory->acquire(); 1023cdf0e10cSrcweir pRet = xFactory.get(); 1024cdf0e10cSrcweir } 1025cdf0e10cSrcweir 1026cdf0e10cSrcweir return pRet; 1027cdf0e10cSrcweir } 1028cdf0e10cSrcweir 1029cdf0e10cSrcweir } // extern "C" 1030