1449ab281SAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3449ab281SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4449ab281SAndrew Rist * or more contributor license agreements. See the NOTICE file 5449ab281SAndrew Rist * distributed with this work for additional information 6449ab281SAndrew Rist * regarding copyright ownership. The ASF licenses this file 7449ab281SAndrew Rist * to you under the Apache License, Version 2.0 (the 8449ab281SAndrew Rist * "License"); you may not use this file except in compliance 9449ab281SAndrew Rist * with the License. You may obtain a copy of the License at 10449ab281SAndrew Rist * 11449ab281SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12449ab281SAndrew Rist * 13449ab281SAndrew Rist * Unless required by applicable law or agreed to in writing, 14449ab281SAndrew Rist * software distributed under the License is distributed on an 15449ab281SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16449ab281SAndrew Rist * KIND, either express or implied. See the License for the 17449ab281SAndrew Rist * specific language governing permissions and limitations 18449ab281SAndrew Rist * under the License. 19449ab281SAndrew Rist * 20449ab281SAndrew Rist *************************************************************/ 21449ab281SAndrew Rist 22449ab281SAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 25cdf0e10cSrcweir #include "precompiled_i18npool.hxx" 26cdf0e10cSrcweir 27cdf0e10cSrcweir #include "textsearch.hxx" 28cdf0e10cSrcweir #include "levdis.hxx" 29cdf0e10cSrcweir #include <com/sun/star/lang/Locale.hpp> 30cdf0e10cSrcweir #include <com/sun/star/lang/XMultiServiceFactory.hpp> 31cdf0e10cSrcweir #include <comphelper/processfactory.hxx> 32cdf0e10cSrcweir #include <com/sun/star/i18n/UnicodeType.hpp> 33cdf0e10cSrcweir #include <com/sun/star/util/SearchFlags.hpp> 34cdf0e10cSrcweir #include <com/sun/star/i18n/WordType.hpp> 35cdf0e10cSrcweir #include <com/sun/star/i18n/ScriptType.hpp> 36cdf0e10cSrcweir #include <com/sun/star/i18n/CharacterIteratorMode.hpp> 37cdf0e10cSrcweir #include <com/sun/star/i18n/KCharacterType.hpp> 38cdf0e10cSrcweir #include <com/sun/star/registry/XRegistryKey.hpp> 39cdf0e10cSrcweir #include <cppuhelper/factory.hxx> 40cdf0e10cSrcweir #include <cppuhelper/weak.hxx> 41cdf0e10cSrcweir 42cdf0e10cSrcweir #ifdef _MSC_VER 43cdf0e10cSrcweir // get rid of that dumb compiler warning 44cdf0e10cSrcweir // identifier was truncated to '255' characters in the debug information 45cdf0e10cSrcweir // for STL template usage, if .pdb files are to be created 46cdf0e10cSrcweir #pragma warning( disable: 4786 ) 47cdf0e10cSrcweir #endif 48cdf0e10cSrcweir 49cdf0e10cSrcweir #include <string.h> 50cdf0e10cSrcweir 51cdf0e10cSrcweir using namespace ::com::sun::star::util; 52cdf0e10cSrcweir using namespace ::com::sun::star::uno; 53cdf0e10cSrcweir using namespace ::com::sun::star::lang; 54cdf0e10cSrcweir using namespace ::com::sun::star::i18n; 55cdf0e10cSrcweir using namespace ::rtl; 56cdf0e10cSrcweir 57cdf0e10cSrcweir static sal_Int32 COMPLEX_TRANS_MASK_TMP = 58cdf0e10cSrcweir TransliterationModules_ignoreBaFa_ja_JP | 59cdf0e10cSrcweir TransliterationModules_ignoreIterationMark_ja_JP | 60cdf0e10cSrcweir TransliterationModules_ignoreTiJi_ja_JP | 61cdf0e10cSrcweir TransliterationModules_ignoreHyuByu_ja_JP | 62cdf0e10cSrcweir TransliterationModules_ignoreSeZe_ja_JP | 63cdf0e10cSrcweir TransliterationModules_ignoreIandEfollowedByYa_ja_JP | 64cdf0e10cSrcweir TransliterationModules_ignoreKiKuFollowedBySa_ja_JP | 65cdf0e10cSrcweir TransliterationModules_ignoreProlongedSoundMark_ja_JP; 66cc450e3aSHerbert Dürr static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH; 67e2630f2cSHerbert Dürr static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK; 68e2630f2cSHerbert Dürr static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE); 69cdf0e10cSrcweir // Above 2 transliteration is simple but need to take effect in 70cdf0e10cSrcweir // complex transliteration 71cdf0e10cSrcweir 72cdf0e10cSrcweir TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) 73cdf0e10cSrcweir : xMSF( rxMSF ) 74cdf0e10cSrcweir , pJumpTable( 0 ) 75cdf0e10cSrcweir , pJumpTable2( 0 ) 76cc450e3aSHerbert Dürr , pRegexMatcher( NULL ) 77cdf0e10cSrcweir , pWLD( 0 ) 78cdf0e10cSrcweir { 79cdf0e10cSrcweir SearchOptions aOpt; 80cdf0e10cSrcweir aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; 81cdf0e10cSrcweir aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; 82cdf0e10cSrcweir //aOpt.Locale = ???; 83cdf0e10cSrcweir setOptions( aOpt ); 84cdf0e10cSrcweir } 85cdf0e10cSrcweir 86cdf0e10cSrcweir TextSearch::~TextSearch() 87cdf0e10cSrcweir { 88cc450e3aSHerbert Dürr delete pRegexMatcher; 89cdf0e10cSrcweir delete pWLD; 90cdf0e10cSrcweir delete pJumpTable; 91cdf0e10cSrcweir delete pJumpTable2; 92cdf0e10cSrcweir } 93cdf0e10cSrcweir 94cdf0e10cSrcweir void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException ) 95cdf0e10cSrcweir { 96cdf0e10cSrcweir aSrchPara = rOptions; 97cdf0e10cSrcweir 98cc450e3aSHerbert Dürr delete pRegexMatcher, pRegexMatcher = NULL; 99cdf0e10cSrcweir delete pWLD, pWLD = 0; 100cdf0e10cSrcweir delete pJumpTable, pJumpTable = 0; 101cdf0e10cSrcweir delete pJumpTable2, pJumpTable2 = 0; 102cdf0e10cSrcweir 103cdf0e10cSrcweir // Create Transliteration class 104cdf0e10cSrcweir if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 105cdf0e10cSrcweir { 106cdf0e10cSrcweir if( !xTranslit.is() ) 107cdf0e10cSrcweir { 108cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 109cdf0e10cSrcweir OUString::createFromAscii( 110cdf0e10cSrcweir "com.sun.star.i18n.Transliteration")); 111cdf0e10cSrcweir if ( xI.is() ) 112cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 113cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0)) 114cdf0e10cSrcweir >>= xTranslit; 115cdf0e10cSrcweir } 116cdf0e10cSrcweir // Load transliteration module 117cdf0e10cSrcweir if( xTranslit.is() ) 118cdf0e10cSrcweir xTranslit->loadModule( 119cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ), 120cdf0e10cSrcweir aSrchPara.Locale); 121cdf0e10cSrcweir } 122cdf0e10cSrcweir else if( xTranslit.is() ) 123cdf0e10cSrcweir xTranslit = 0; 124cdf0e10cSrcweir 125cdf0e10cSrcweir // Create Transliteration for 2<->1, 2<->2 transliteration 126cdf0e10cSrcweir if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 127cdf0e10cSrcweir { 128cdf0e10cSrcweir if( !xTranslit2.is() ) 129cdf0e10cSrcweir { 130cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 131cdf0e10cSrcweir OUString::createFromAscii( 132cdf0e10cSrcweir "com.sun.star.i18n.Transliteration")); 133cdf0e10cSrcweir if ( xI.is() ) 134cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 135cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0)) 136cdf0e10cSrcweir >>= xTranslit2; 137cdf0e10cSrcweir } 138cdf0e10cSrcweir // Load transliteration module 139cdf0e10cSrcweir if( xTranslit2.is() ) 140cdf0e10cSrcweir xTranslit2->loadModule( 141cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ), 142cdf0e10cSrcweir aSrchPara.Locale); 143cdf0e10cSrcweir } 144cdf0e10cSrcweir 145cdf0e10cSrcweir if ( !xBreak.is() ) 146cdf0e10cSrcweir { 147cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 148cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.BreakIterator")); 149cdf0e10cSrcweir if( xI.is() ) 150cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 151cdf0e10cSrcweir (const Reference< XBreakIterator >*)0)) 152cdf0e10cSrcweir >>= xBreak; 153cdf0e10cSrcweir } 154cdf0e10cSrcweir 155cdf0e10cSrcweir sSrchStr = aSrchPara.searchString; 156cdf0e10cSrcweir 157cc450e3aSHerbert Dürr // use transliteration here 158cc450e3aSHerbert Dürr if ( xTranslit.is() && 159cdf0e10cSrcweir aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ) 160cdf0e10cSrcweir sSrchStr = xTranslit->transliterateString2String( 161cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 162cdf0e10cSrcweir 163cc450e3aSHerbert Dürr if ( xTranslit2.is() && 164cdf0e10cSrcweir aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ) 165cdf0e10cSrcweir sSrchStr2 = xTranslit2->transliterateString2String( 166cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); 167cdf0e10cSrcweir 168cdf0e10cSrcweir // When start or end of search string is a complex script type, we need to 169cdf0e10cSrcweir // make sure the result boundary is not located in the middle of cell. 170cdf0e10cSrcweir checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) == 171cdf0e10cSrcweir ScriptType::COMPLEX)); 172cdf0e10cSrcweir checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 173cdf0e10cSrcweir sSrchStr.getLength()-1) == ScriptType::COMPLEX)); 174cdf0e10cSrcweir 175cc450e3aSHerbert Dürr switch( aSrchPara.algorithmType) 176cdf0e10cSrcweir { 177cc450e3aSHerbert Dürr case SearchAlgorithms_REGEXP: 178cc450e3aSHerbert Dürr fnForward = &TextSearch::RESrchFrwrd; 179cc450e3aSHerbert Dürr fnBackward = &TextSearch::RESrchBkwrd; 1807f9f793fSHerbert Dürr RESrchPrepare( aSrchPara); 1817f9f793fSHerbert Dürr break; 182cc450e3aSHerbert Dürr 183cc450e3aSHerbert Dürr case SearchAlgorithms_APPROXIMATE: 184cdf0e10cSrcweir fnForward = &TextSearch::ApproxSrchFrwrd; 185cdf0e10cSrcweir fnBackward = &TextSearch::ApproxSrchBkwrd; 186cdf0e10cSrcweir 187cdf0e10cSrcweir pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, 188cdf0e10cSrcweir aSrchPara.insertedChars, aSrchPara.deletedChars, 189cdf0e10cSrcweir 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ); 190cdf0e10cSrcweir 191cdf0e10cSrcweir nLimit = pWLD->GetLimit(); 192cc450e3aSHerbert Dürr break; 193cc450e3aSHerbert Dürr 194cc450e3aSHerbert Dürr default: 195cdf0e10cSrcweir fnForward = &TextSearch::NSrchFrwrd; 196cdf0e10cSrcweir fnBackward = &TextSearch::NSrchBkwrd; 197cc450e3aSHerbert Dürr break; 198cdf0e10cSrcweir } 199cdf0e10cSrcweir } 200cdf0e10cSrcweir 201cdf0e10cSrcweir sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) 202cdf0e10cSrcweir { 203cdf0e10cSrcweir sal_Int32 nRet = 0, nEnd = rOff.getLength(); 204cdf0e10cSrcweir while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet; 205cdf0e10cSrcweir return nRet; 206cdf0e10cSrcweir } 207cdf0e10cSrcweir 208cdf0e10cSrcweir sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos) 209cdf0e10cSrcweir throw( RuntimeException ) 210cdf0e10cSrcweir { 211cdf0e10cSrcweir sal_Int32 nDone; 212cdf0e10cSrcweir return nPos == xBreak->previousCharacters(searchStr, nPos+1, 213cdf0e10cSrcweir aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone); 214cdf0e10cSrcweir } 215cdf0e10cSrcweir 216cdf0e10cSrcweir SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 217cdf0e10cSrcweir throw( RuntimeException ) 218cdf0e10cSrcweir { 219cdf0e10cSrcweir SearchResult sres; 220cdf0e10cSrcweir 221cdf0e10cSrcweir OUString in_str(searchStr); 222cdf0e10cSrcweir sal_Int32 newStartPos = startPos; 223cdf0e10cSrcweir sal_Int32 newEndPos = endPos; 224cdf0e10cSrcweir 225cdf0e10cSrcweir bUsePrimarySrchStr = true; 226cdf0e10cSrcweir 227cdf0e10cSrcweir if ( xTranslit.is() ) 228cdf0e10cSrcweir { 229cdf0e10cSrcweir // apply normal transliteration (1<->1, 1<->0) 230cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 231cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 232cdf0e10cSrcweir 233cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected! 234cdf0e10cSrcweir if( startPos ) 235cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos ); 236cdf0e10cSrcweir 237cdf0e10cSrcweir if( endPos < searchStr.getLength() ) 238cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos ); 239cdf0e10cSrcweir else 240cdf0e10cSrcweir newEndPos = in_str.getLength(); 241cdf0e10cSrcweir 242cdf0e10cSrcweir sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); 243cdf0e10cSrcweir 244d5645047STsutomu Uchino sal_Int32 nOffsetLength = offset.getLength(); 245d5645047STsutomu Uchino sal_Int32 nStartOffset = 0; 246cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 247cdf0e10cSrcweir { 248d5645047STsutomu Uchino nStartOffset = sres.startOffset[k]; 249d5645047STsutomu Uchino if ( nStartOffset ) 250d5645047STsutomu Uchino { 251d5645047STsutomu Uchino if ( nStartOffset < nOffsetLength ) 252d5645047STsutomu Uchino sres.startOffset[k] = offset[nStartOffset]; 253d5645047STsutomu Uchino else 254d5645047STsutomu Uchino sres.startOffset[k] = offset[offset.getLength()-1] +1; 255d5645047STsutomu Uchino } 256cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return 257cdf0e10cSrcweir // the position of the next character - return the 258cdf0e10cSrcweir // next position behind the last found character! 259cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!! 260cdf0e10cSrcweir if (sres.endOffset[k]) 261cdf0e10cSrcweir sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1; 262cdf0e10cSrcweir } 263cdf0e10cSrcweir } 264cdf0e10cSrcweir else 265cdf0e10cSrcweir { 266cdf0e10cSrcweir sres = (this->*fnForward)( in_str, startPos, endPos ); 267cdf0e10cSrcweir } 268cdf0e10cSrcweir 269cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP) 270cdf0e10cSrcweir { 271cdf0e10cSrcweir SearchResult sres2; 272cdf0e10cSrcweir 273cdf0e10cSrcweir in_str = OUString(searchStr); 274cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 275cdf0e10cSrcweir 276cdf0e10cSrcweir in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); 277cdf0e10cSrcweir 278cdf0e10cSrcweir if( startPos ) 279cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos ); 280cdf0e10cSrcweir 281cdf0e10cSrcweir if( endPos < searchStr.getLength() ) 282cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos ); 283cdf0e10cSrcweir else 284cdf0e10cSrcweir endPos = in_str.getLength(); 285cdf0e10cSrcweir 286cdf0e10cSrcweir bUsePrimarySrchStr = false; 287cdf0e10cSrcweir sres2 = (this->*fnForward)( in_str, startPos, endPos ); 288cdf0e10cSrcweir 289cdf0e10cSrcweir for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) 290cdf0e10cSrcweir { 291cdf0e10cSrcweir if (sres2.startOffset[k]) 292cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1; 293cdf0e10cSrcweir if (sres2.endOffset[k]) 294cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1; 295cdf0e10cSrcweir } 296cdf0e10cSrcweir 297cdf0e10cSrcweir // pick first and long one 298cdf0e10cSrcweir if ( sres.subRegExpressions == 0) 299cdf0e10cSrcweir return sres2; 300cdf0e10cSrcweir if ( sres2.subRegExpressions == 1) 301cdf0e10cSrcweir { 302cdf0e10cSrcweir if ( sres.startOffset[0] > sres2.startOffset[0]) 303cdf0e10cSrcweir return sres2; 304cdf0e10cSrcweir else if ( sres.startOffset[0] == sres2.startOffset[0] && 305cdf0e10cSrcweir sres.endOffset[0] < sres2.endOffset[0]) 306cdf0e10cSrcweir return sres2; 307cdf0e10cSrcweir } 308cdf0e10cSrcweir } 309cdf0e10cSrcweir 310cdf0e10cSrcweir return sres; 311cdf0e10cSrcweir } 312cdf0e10cSrcweir 313cdf0e10cSrcweir SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 314cdf0e10cSrcweir throw(RuntimeException) 315cdf0e10cSrcweir { 316cdf0e10cSrcweir SearchResult sres; 317cdf0e10cSrcweir 318cdf0e10cSrcweir OUString in_str(searchStr); 319cdf0e10cSrcweir sal_Int32 newStartPos = startPos; 320cdf0e10cSrcweir sal_Int32 newEndPos = endPos; 321cdf0e10cSrcweir 322cdf0e10cSrcweir bUsePrimarySrchStr = true; 323cdf0e10cSrcweir 324cdf0e10cSrcweir if ( xTranslit.is() ) 325cdf0e10cSrcweir { 326cdf0e10cSrcweir // apply only simple 1<->1 transliteration here 327cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 328cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset ); 329cdf0e10cSrcweir 330cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected! 331cdf0e10cSrcweir if( startPos < searchStr.getLength() ) 332cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos ); 333cdf0e10cSrcweir else 334cdf0e10cSrcweir newStartPos = in_str.getLength(); 335cdf0e10cSrcweir 336cdf0e10cSrcweir if( endPos ) 337cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos ); 338cdf0e10cSrcweir 339cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); 340cdf0e10cSrcweir 341d5645047STsutomu Uchino sal_Int32 nOffsetLength = offset.getLength(); 342d5645047STsutomu Uchino sal_Int32 nEndOffset = 0; 343cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ ) 344cdf0e10cSrcweir { 345cdf0e10cSrcweir if (sres.startOffset[k]) 346cdf0e10cSrcweir sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1; 347cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return 348cdf0e10cSrcweir // the position of the next character - return the 349cdf0e10cSrcweir // next position behind the last found character! 350cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!! 351d5645047STsutomu Uchino nEndOffset = sres.endOffset[k]; 352d5645047STsutomu Uchino if ( nEndOffset ) 353d5645047STsutomu Uchino { 354d5645047STsutomu Uchino if ( nEndOffset < nOffsetLength ) 355d5645047STsutomu Uchino sres.endOffset[k] = offset[nEndOffset]; 356d5645047STsutomu Uchino else 357d5645047STsutomu Uchino sres.endOffset[k] = offset[offset.getLength()-1] +1; 358d5645047STsutomu Uchino } 359cdf0e10cSrcweir } 360cdf0e10cSrcweir } 361cdf0e10cSrcweir else 362cdf0e10cSrcweir { 363cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, startPos, endPos ); 364cdf0e10cSrcweir } 365cdf0e10cSrcweir 366cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP ) 367cdf0e10cSrcweir { 368cdf0e10cSrcweir SearchResult sres2; 369cdf0e10cSrcweir 370cdf0e10cSrcweir in_str = OUString(searchStr); 371cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength()); 372cdf0e10cSrcweir 373cdf0e10cSrcweir in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); 374cdf0e10cSrcweir 375cdf0e10cSrcweir if( startPos < searchStr.getLength() ) 376cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos ); 377cdf0e10cSrcweir else 378cdf0e10cSrcweir startPos = in_str.getLength(); 379cdf0e10cSrcweir 380cdf0e10cSrcweir if( endPos ) 381cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos ); 382cdf0e10cSrcweir 383cdf0e10cSrcweir bUsePrimarySrchStr = false; 384cdf0e10cSrcweir sres2 = (this->*fnBackward)( in_str, startPos, endPos ); 385cdf0e10cSrcweir 386cdf0e10cSrcweir for( int k = 0; k < sres2.startOffset.getLength(); k++ ) 387cdf0e10cSrcweir { 388cdf0e10cSrcweir if (sres2.startOffset[k]) 389cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1; 390cdf0e10cSrcweir if (sres2.endOffset[k]) 391cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1; 392cdf0e10cSrcweir } 393cdf0e10cSrcweir 394cdf0e10cSrcweir // pick last and long one 395cdf0e10cSrcweir if ( sres.subRegExpressions == 0 ) 396cdf0e10cSrcweir return sres2; 397cdf0e10cSrcweir if ( sres2.subRegExpressions == 1 ) 398cdf0e10cSrcweir { 399cdf0e10cSrcweir if ( sres.startOffset[0] < sres2.startOffset[0] ) 400cdf0e10cSrcweir return sres2; 401cdf0e10cSrcweir if ( sres.startOffset[0] == sres2.startOffset[0] && 402cdf0e10cSrcweir sres.endOffset[0] > sres2.endOffset[0] ) 403cdf0e10cSrcweir return sres2; 404cdf0e10cSrcweir } 405cdf0e10cSrcweir } 406cdf0e10cSrcweir 407cdf0e10cSrcweir return sres; 408cdf0e10cSrcweir } 409cdf0e10cSrcweir 410cc450e3aSHerbert Dürr //--------------------------------------------------------------------- 411cdf0e10cSrcweir 412cdf0e10cSrcweir bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const 413cdf0e10cSrcweir { 414cdf0e10cSrcweir bool bRet = 1; 415cdf0e10cSrcweir if( '\x7f' != rStr[nPos]) 416cdf0e10cSrcweir { 417cdf0e10cSrcweir if ( !xCharClass.is() ) 418cdf0e10cSrcweir { 419cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance( 420cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification")); 421cdf0e10cSrcweir if( xI.is() ) 422cdf0e10cSrcweir xI->queryInterface( ::getCppuType( 423cdf0e10cSrcweir (const Reference< XCharacterClassification >*)0)) 424cdf0e10cSrcweir >>= xCharClass; 425cdf0e10cSrcweir } 426cdf0e10cSrcweir if ( xCharClass.is() ) 427cdf0e10cSrcweir { 428cdf0e10cSrcweir sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, 429cdf0e10cSrcweir aSrchPara.Locale ); 430cdf0e10cSrcweir if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | 431cdf0e10cSrcweir KCharacterType::LETTER ) & nCType ) ) 432cdf0e10cSrcweir bRet = 0; 433cdf0e10cSrcweir } 434cdf0e10cSrcweir } 435cdf0e10cSrcweir return bRet; 436cdf0e10cSrcweir } 437cdf0e10cSrcweir 438cc450e3aSHerbert Dürr // --------- helper methods for Boyer-Moore like text searching ---------- 439cc450e3aSHerbert Dürr // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available 440cdf0e10cSrcweir 441cdf0e10cSrcweir void TextSearch::MakeForwardTab() 442cdf0e10cSrcweir { 443cdf0e10cSrcweir // create the jumptable for the search text 444cdf0e10cSrcweir if( pJumpTable ) 445cdf0e10cSrcweir { 446cdf0e10cSrcweir if( bIsForwardTab ) 447cdf0e10cSrcweir return ; // the jumpTable is ok 448cdf0e10cSrcweir delete pJumpTable; 449cdf0e10cSrcweir } 450cdf0e10cSrcweir bIsForwardTab = true; 451cdf0e10cSrcweir 452cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength(); 453cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable; 454cdf0e10cSrcweir 455cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n ) 456cdf0e10cSrcweir { 457cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n]; 458cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1; 459cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 460cdf0e10cSrcweir 461cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 462cdf0e10cSrcweir pJumpTable->insert( aEntry ); 463cdf0e10cSrcweir if ( !aPair.second ) 464cdf0e10cSrcweir (*(aPair.first)).second = nDiff; 465cdf0e10cSrcweir } 466cdf0e10cSrcweir } 467cdf0e10cSrcweir 468cdf0e10cSrcweir void TextSearch::MakeForwardTab2() 469cdf0e10cSrcweir { 470cdf0e10cSrcweir // create the jumptable for the search text 471cdf0e10cSrcweir if( pJumpTable2 ) 472cdf0e10cSrcweir { 473cdf0e10cSrcweir if( bIsForwardTab ) 474cdf0e10cSrcweir return ; // the jumpTable is ok 475cdf0e10cSrcweir delete pJumpTable2; 476cdf0e10cSrcweir } 477cdf0e10cSrcweir bIsForwardTab = true; 478cdf0e10cSrcweir 479cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength(); 480cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable; 481cdf0e10cSrcweir 482cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n ) 483cdf0e10cSrcweir { 484cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n]; 485cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1; 486cdf0e10cSrcweir 487cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff ); 488cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 489cdf0e10cSrcweir pJumpTable2->insert( aEntry ); 490cdf0e10cSrcweir if ( !aPair.second ) 491cdf0e10cSrcweir (*(aPair.first)).second = nDiff; 492cdf0e10cSrcweir } 493cdf0e10cSrcweir } 494cdf0e10cSrcweir 495cdf0e10cSrcweir void TextSearch::MakeBackwardTab() 496cdf0e10cSrcweir { 497cdf0e10cSrcweir // create the jumptable for the search text 498cdf0e10cSrcweir if( pJumpTable ) 499cdf0e10cSrcweir { 500cdf0e10cSrcweir if( !bIsForwardTab ) 501cdf0e10cSrcweir return ; // the jumpTable is ok 502cdf0e10cSrcweir delete pJumpTable; 503cdf0e10cSrcweir } 504cdf0e10cSrcweir bIsForwardTab = false; 505cdf0e10cSrcweir 506cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength(); 507cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable; 508cdf0e10cSrcweir 509cdf0e10cSrcweir for( n = nLen-1; n > 0; --n ) 510cdf0e10cSrcweir { 511cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n]; 512cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n ); 513cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 514cdf0e10cSrcweir pJumpTable->insert( aEntry ); 515cdf0e10cSrcweir if ( !aPair.second ) 516cdf0e10cSrcweir (*(aPair.first)).second = n; 517cdf0e10cSrcweir } 518cdf0e10cSrcweir } 519cdf0e10cSrcweir 520cdf0e10cSrcweir void TextSearch::MakeBackwardTab2() 521cdf0e10cSrcweir { 522cdf0e10cSrcweir // create the jumptable for the search text 523cdf0e10cSrcweir if( pJumpTable2 ) 524cdf0e10cSrcweir { 525cdf0e10cSrcweir if( !bIsForwardTab ) 526cdf0e10cSrcweir return ; // the jumpTable is ok 527cdf0e10cSrcweir delete pJumpTable2; 528cdf0e10cSrcweir } 529cdf0e10cSrcweir bIsForwardTab = false; 530cdf0e10cSrcweir 531cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength(); 532cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable; 533cdf0e10cSrcweir 534cdf0e10cSrcweir for( n = nLen-1; n > 0; --n ) 535cdf0e10cSrcweir { 536cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n]; 537cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n ); 538cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair = 539cdf0e10cSrcweir pJumpTable2->insert( aEntry ); 540cdf0e10cSrcweir if ( !aPair.second ) 541cdf0e10cSrcweir (*(aPair.first)).second = n; 542cdf0e10cSrcweir } 543cdf0e10cSrcweir } 544cdf0e10cSrcweir 545cdf0e10cSrcweir sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const 546cdf0e10cSrcweir { 547cdf0e10cSrcweir TextSearchJumpTable *pJump; 548cdf0e10cSrcweir OUString sSearchKey; 549cdf0e10cSrcweir 550cdf0e10cSrcweir if ( bUsePrimarySrchStr ) { 551cdf0e10cSrcweir pJump = pJumpTable; 552cdf0e10cSrcweir sSearchKey = sSrchStr; 553cdf0e10cSrcweir } else { 554cdf0e10cSrcweir pJump = pJumpTable2; 555cdf0e10cSrcweir sSearchKey = sSrchStr2; 556cdf0e10cSrcweir } 557cdf0e10cSrcweir 558cdf0e10cSrcweir TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); 559cdf0e10cSrcweir if ( iLook == pJump->end() ) 560cdf0e10cSrcweir return sSearchKey.getLength(); 561cdf0e10cSrcweir return (*iLook).second; 562cdf0e10cSrcweir } 563cdf0e10cSrcweir 564cdf0e10cSrcweir 565cdf0e10cSrcweir // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#) 566cdf0e10cSrcweir SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 567cdf0e10cSrcweir throw(RuntimeException) 568cdf0e10cSrcweir { 569cdf0e10cSrcweir SearchResult aRet; 570cdf0e10cSrcweir aRet.subRegExpressions = 0; 571cdf0e10cSrcweir 572cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 573cdf0e10cSrcweir 574cdf0e10cSrcweir OUString aStr( searchStr ); 575cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength(); 576cdf0e10cSrcweir sal_Int32 nEnde = endPos; 577cdf0e10cSrcweir if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) 578cdf0e10cSrcweir return aRet; 579cdf0e10cSrcweir 580cdf0e10cSrcweir 581cdf0e10cSrcweir if( nEnde < sSearchKey.getLength() ) // position inside the search region ? 582cdf0e10cSrcweir return aRet; 583cdf0e10cSrcweir 584cdf0e10cSrcweir nEnde -= sSearchKey.getLength(); 585cdf0e10cSrcweir 586cdf0e10cSrcweir if (bUsePrimarySrchStr) 587cdf0e10cSrcweir MakeForwardTab(); // create the jumptable 588cdf0e10cSrcweir else 589cdf0e10cSrcweir MakeForwardTab2(); 590cdf0e10cSrcweir 591cdf0e10cSrcweir for (sal_Int32 nCmpIdx = startPos; // start position for the search 592cdf0e10cSrcweir nCmpIdx <= nEnde; 593cdf0e10cSrcweir nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1])) 594cdf0e10cSrcweir { 595cdf0e10cSrcweir // if the match would be the completed cells, skip it. 596cdf0e10cSrcweir if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd 597cdf0e10cSrcweir && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) ) 598cdf0e10cSrcweir continue; 599cdf0e10cSrcweir 600cdf0e10cSrcweir nSuchIdx = sSearchKey.getLength() - 1; 601cdf0e10cSrcweir while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx]) 602cdf0e10cSrcweir { 603cdf0e10cSrcweir if( nSuchIdx == 0 ) 604cdf0e10cSrcweir { 605cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 606cdf0e10cSrcweir { 607cdf0e10cSrcweir sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); 608cdf0e10cSrcweir bool bAtStart = !nCmpIdx; 609cdf0e10cSrcweir bool bAtEnd = nFndEnd == endPos; 610cdf0e10cSrcweir bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 ); 611cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nFndEnd ); 612cdf0e10cSrcweir // * 1 -> only one word in the paragraph 613cdf0e10cSrcweir // * 2 -> at begin of paragraph 614cdf0e10cSrcweir // * 3 -> at end of paragraph 615cdf0e10cSrcweir // * 4 -> inside the paragraph 616cdf0e10cSrcweir if( !( ( bAtStart && bAtEnd ) || // 1 617cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2 618cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3 619cdf0e10cSrcweir ( bDelimBefore && bDelimBehind ))) // 4 620cdf0e10cSrcweir break; 621cdf0e10cSrcweir } 622cdf0e10cSrcweir 623cdf0e10cSrcweir aRet.subRegExpressions = 1; 624cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 625cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 626cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 627cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength(); 628cdf0e10cSrcweir 629cdf0e10cSrcweir return aRet; 630cdf0e10cSrcweir } 631cdf0e10cSrcweir else 632cdf0e10cSrcweir nSuchIdx--; 633cdf0e10cSrcweir } 634cdf0e10cSrcweir } 635cdf0e10cSrcweir return aRet; 636cdf0e10cSrcweir } 637cdf0e10cSrcweir 638cdf0e10cSrcweir SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) 639cdf0e10cSrcweir throw(RuntimeException) 640cdf0e10cSrcweir { 641cdf0e10cSrcweir SearchResult aRet; 642cdf0e10cSrcweir aRet.subRegExpressions = 0; 643cdf0e10cSrcweir 644cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; 645cdf0e10cSrcweir 646cdf0e10cSrcweir OUString aStr( searchStr ); 647cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength(); 648cdf0e10cSrcweir sal_Int32 nEnde = endPos; 649cdf0e10cSrcweir if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx) 650cdf0e10cSrcweir return aRet; 651cdf0e10cSrcweir 652cdf0e10cSrcweir if (bUsePrimarySrchStr) 653cdf0e10cSrcweir MakeBackwardTab(); // create the jumptable 654cdf0e10cSrcweir else 655cdf0e10cSrcweir MakeBackwardTab2(); 656cdf0e10cSrcweir 657cdf0e10cSrcweir if( nEnde == nSuchIdx ) // end position for the search 658cdf0e10cSrcweir nEnde = sSearchKey.getLength(); 659cdf0e10cSrcweir else 660cdf0e10cSrcweir nEnde += sSearchKey.getLength(); 661cdf0e10cSrcweir 662cdf0e10cSrcweir sal_Int32 nCmpIdx = startPos; // start position for the search 663cdf0e10cSrcweir 664cdf0e10cSrcweir while (nCmpIdx >= nEnde) 665cdf0e10cSrcweir { 666cdf0e10cSrcweir // if the match would be the completed cells, skip it. 667cdf0e10cSrcweir if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx - 668cdf0e10cSrcweir sSearchKey.getLength() )) && (!checkCTLEnd || 669cdf0e10cSrcweir isCellStart( aStr, nCmpIdx))) 670cdf0e10cSrcweir { 671cdf0e10cSrcweir nSuchIdx = 0; 672cdf0e10cSrcweir while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == 673cdf0e10cSrcweir aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) 674cdf0e10cSrcweir nSuchIdx++; 675cdf0e10cSrcweir if( nSuchIdx >= sSearchKey.getLength() ) 676cdf0e10cSrcweir { 677cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) 678cdf0e10cSrcweir { 679cdf0e10cSrcweir sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); 680cdf0e10cSrcweir bool bAtStart = !nFndStt; 681cdf0e10cSrcweir bool bAtEnd = nCmpIdx == startPos; 682cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nCmpIdx ); 683cdf0e10cSrcweir bool bDelimBefore = bAtStart || // begin of paragraph 684cdf0e10cSrcweir IsDelimiter( aStr, nFndStt-1 ); 685cdf0e10cSrcweir // * 1 -> only one word in the paragraph 686cdf0e10cSrcweir // * 2 -> at begin of paragraph 687cdf0e10cSrcweir // * 3 -> at end of paragraph 688cdf0e10cSrcweir // * 4 -> inside the paragraph 689cdf0e10cSrcweir if( ( bAtStart && bAtEnd ) || // 1 690cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2 691cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3 692cdf0e10cSrcweir ( bDelimBefore && bDelimBehind )) // 4 693cdf0e10cSrcweir { 694cdf0e10cSrcweir aRet.subRegExpressions = 1; 695cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 696cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 697cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 698cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 699cdf0e10cSrcweir return aRet; 700cdf0e10cSrcweir } 701cdf0e10cSrcweir } 702cdf0e10cSrcweir else 703cdf0e10cSrcweir { 704cdf0e10cSrcweir aRet.subRegExpressions = 1; 705cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 706cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx; 707cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 708cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength(); 709cdf0e10cSrcweir return aRet; 710cdf0e10cSrcweir } 711cdf0e10cSrcweir } 712cdf0e10cSrcweir } 713cdf0e10cSrcweir nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] ); 714cdf0e10cSrcweir if( nCmpIdx < nSuchIdx ) 715cdf0e10cSrcweir return aRet; 716cdf0e10cSrcweir nCmpIdx -= nSuchIdx; 717cdf0e10cSrcweir } 718cdf0e10cSrcweir return aRet; 719cdf0e10cSrcweir } 720cdf0e10cSrcweir 7217f9f793fSHerbert Dürr void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions) 7227f9f793fSHerbert Dürr { 7237f9f793fSHerbert Dürr // select the transliterated pattern string 7247f9f793fSHerbert Dürr const OUString& rPatternStr = 725e2630f2cSHerbert Dürr (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr 7267f9f793fSHerbert Dürr : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString); 7277f9f793fSHerbert Dürr 7287c5e76a7SHerbert Dürr sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability 7297f9f793fSHerbert Dürr // map com::sun::star::util::SearchFlags to ICU uregex.h flags 7307f9f793fSHerbert Dürr // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE 7317f9f793fSHerbert Dürr // REG_NEWLINE is neither properly defined nor used anywhere => not implemented 7327f9f793fSHerbert Dürr // REG_NOSUB is not used anywhere => not implemented 7337f9f793fSHerbert Dürr // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute 7347f9f793fSHerbert Dürr // LEV_RELAXED is only used for SearchAlgorithm==Approximate 73522c9c6f7SHerbert Dürr // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO 73622c9c6f7SHerbert Dürr // probably because the transliteration flag IGNORE_CASE handles it as well. 73722c9c6f7SHerbert Dürr if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0 73822c9c6f7SHerbert Dürr || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0) 7397f9f793fSHerbert Dürr nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; 7407f9f793fSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 7417f9f793fSHerbert Dürr // assumption: transliteration didn't mangle regexp control chars 74203c97e34SYuri Dario IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); 743ee131020SHerbert Dürr #ifndef DISABLE_WORDBOUND_EMULATION 7447f9f793fSHerbert Dürr // for conveniance specific syntax elements of the old regex engine are emulated 7456a7366bcSHerbert Dürr // - by replacing \< with "word-break followed by a look-ahead word-char" 7466a7366bcSHerbert Dürr static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); 7476a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); 7486a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); 7496a7366bcSHerbert Dürr aChevronMatcherB.reset( aIcuSearchPatStr); 7506a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); 7516a7366bcSHerbert Dürr aChevronMatcherB.reset(); 7526a7366bcSHerbert Dürr // - by replacing \> with "look-behind word-char followed by a word-break" 7536a7366bcSHerbert Dürr static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); 7546a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); 7556a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); 7566a7366bcSHerbert Dürr aChevronMatcherE.reset( aIcuSearchPatStr); 7576a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); 7586a7366bcSHerbert Dürr aChevronMatcherE.reset(); 7597f9f793fSHerbert Dürr #endif 7607f9f793fSHerbert Dürr pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); 7617f9f793fSHerbert Dürr if( nIcuErr) 7627f9f793fSHerbert Dürr { delete pRegexMatcher; pRegexMatcher = NULL;} 7637f9f793fSHerbert Dürr } 7647f9f793fSHerbert Dürr 765cdf0e10cSrcweir //--------------------------------------------------------------------------- 766cdf0e10cSrcweir 767cdf0e10cSrcweir SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, 768cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 769cdf0e10cSrcweir throw(RuntimeException) 770cdf0e10cSrcweir { 771cc450e3aSHerbert Dürr SearchResult aRet; 772cc450e3aSHerbert Dürr aRet.subRegExpressions = 0; 773cc450e3aSHerbert Dürr if( !pRegexMatcher) 774cc450e3aSHerbert Dürr return aRet; 77519ee98b9SHerbert Dürr 776cc450e3aSHerbert Dürr if( endPos > searchStr.getLength()) 777cc450e3aSHerbert Dürr endPos = searchStr.getLength(); 778cc450e3aSHerbert Dürr 779cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches 780cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 78119716b0aSHerbert Dürr const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); 782cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr); 78316b8677bSHerbert Dürr // search until there is a valid match 78416b8677bSHerbert Dürr for(;;) 78516b8677bSHerbert Dürr { 78616b8677bSHerbert Dürr if( !pRegexMatcher->find( startPos, nIcuErr)) 78716b8677bSHerbert Dürr return aRet; 78816b8677bSHerbert Dürr 78916b8677bSHerbert Dürr // #i118887# ignore zero-length matches e.g. "a*" in "bc" 79016b8677bSHerbert Dürr int nStartOfs = pRegexMatcher->start( nIcuErr); 79116b8677bSHerbert Dürr int nEndOfs = pRegexMatcher->end( nIcuErr); 79216b8677bSHerbert Dürr if( nStartOfs < nEndOfs) 79316b8677bSHerbert Dürr break; 79416b8677bSHerbert Dürr // try at next position if there was a zero-length match 79516b8677bSHerbert Dürr if( ++startPos >= endPos) 79616b8677bSHerbert Dürr return aRet; 79716b8677bSHerbert Dürr } 798cc450e3aSHerbert Dürr 79916b8677bSHerbert Dürr // extract the result of the search 8000c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount(); 8010c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1; 802cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions); 803cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions); 804cc450e3aSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); 805cc450e3aSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); 8060c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) { 8070c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); 8080c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); 8090c7ce76dSHerbert Dürr } 810cc450e3aSHerbert Dürr 811cc450e3aSHerbert Dürr return aRet; 812cdf0e10cSrcweir } 813cdf0e10cSrcweir 814cdf0e10cSrcweir SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, 815cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 816cdf0e10cSrcweir throw(RuntimeException) 817cdf0e10cSrcweir { 818cc450e3aSHerbert Dürr // NOTE: for backwards search callers provide startPos/endPos inverted! 819cc450e3aSHerbert Dürr SearchResult aRet; 820cc450e3aSHerbert Dürr aRet.subRegExpressions = 0; 821cc450e3aSHerbert Dürr if( !pRegexMatcher) 822cc450e3aSHerbert Dürr return aRet; 82319ee98b9SHerbert Dürr 824cc450e3aSHerbert Dürr if( startPos > searchStr.getLength()) 825cc450e3aSHerbert Dürr startPos = searchStr.getLength(); 826cc450e3aSHerbert Dürr 827cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches 828cc450e3aSHerbert Dürr // TODO: use ICU's backward searching once it becomes available 8290c7ce76dSHerbert Dürr // as its replacement using forward search is not as good as the real thing 830cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR; 83103c97e34SYuri Dario const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); 832cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr); 833cc450e3aSHerbert Dürr if( !pRegexMatcher->find( endPos, nIcuErr)) 834cc450e3aSHerbert Dürr return aRet; 835cc450e3aSHerbert Dürr 8360c7ce76dSHerbert Dürr // find the last match 8370c7ce76dSHerbert Dürr int nLastPos = 0; 8382c1e93e7SHerbert Dürr int nFoundEnd = 0; 8390c7ce76dSHerbert Dürr do { 8400c7ce76dSHerbert Dürr nLastPos = pRegexMatcher->start( nIcuErr); 8412c1e93e7SHerbert Dürr nFoundEnd = pRegexMatcher->end( nIcuErr); 8422c1e93e7SHerbert Dürr if( nFoundEnd >= startPos) 8432c1e93e7SHerbert Dürr break; 8442c1e93e7SHerbert Dürr if( nFoundEnd == nLastPos) 8452c1e93e7SHerbert Dürr ++nFoundEnd; 8462c1e93e7SHerbert Dürr } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); 8470c7ce76dSHerbert Dürr 8480c7ce76dSHerbert Dürr // find last match again to get its details 8490c7ce76dSHerbert Dürr pRegexMatcher->find( nLastPos, nIcuErr); 8500c7ce76dSHerbert Dürr 8510c7ce76dSHerbert Dürr // fill in the details of the last match 8520c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount(); 8530c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1; 854cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions); 855cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions); 8560c7ce76dSHerbert Dürr // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! 8570c7ce76dSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); 8580c7ce76dSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); 8590c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) { 8600c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); 8610c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); 8620c7ce76dSHerbert Dürr } 863cc450e3aSHerbert Dürr 864cc450e3aSHerbert Dürr return aRet; 865cdf0e10cSrcweir } 866cdf0e10cSrcweir 867cc450e3aSHerbert Dürr //--------------------------------------------------------------------------- 868cc450e3aSHerbert Dürr 869cc450e3aSHerbert Dürr // search for words phonetically 870cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, 871cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 872cdf0e10cSrcweir throw(RuntimeException) 873cdf0e10cSrcweir { 874cdf0e10cSrcweir SearchResult aRet; 875cdf0e10cSrcweir aRet.subRegExpressions = 0; 876cdf0e10cSrcweir 877cdf0e10cSrcweir if( !xBreak.is() ) 878cdf0e10cSrcweir return aRet; 879cdf0e10cSrcweir 880cdf0e10cSrcweir OUString aWTemp( searchStr ); 881cdf0e10cSrcweir 882*c1e8cc3aSDon Lewis sal_Int32 nStt, nEnd; 883cdf0e10cSrcweir 884cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 885cdf0e10cSrcweir aSrchPara.Locale, 886cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 887cdf0e10cSrcweir 888cdf0e10cSrcweir do 889cdf0e10cSrcweir { 890cdf0e10cSrcweir if( aWBnd.startPos >= endPos ) 891cdf0e10cSrcweir break; 892cdf0e10cSrcweir nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; 893cdf0e10cSrcweir nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos; 894cdf0e10cSrcweir 895cdf0e10cSrcweir if( nStt < nEnd && 896cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 897cdf0e10cSrcweir { 898cdf0e10cSrcweir aRet.subRegExpressions = 1; 899cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 900cdf0e10cSrcweir aRet.startOffset[ 0 ] = nStt; 901cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 902cdf0e10cSrcweir aRet.endOffset[ 0 ] = nEnd; 903cdf0e10cSrcweir break; 904cdf0e10cSrcweir } 905cdf0e10cSrcweir 906cdf0e10cSrcweir nStt = nEnd - 1; 907cdf0e10cSrcweir aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale, 908cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES); 909cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos || 910cdf0e10cSrcweir (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) ); 911cdf0e10cSrcweir // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only 912cdf0e10cSrcweir // whitespace) in searchStr, getWordBoundary() returned startPos,startPos 913cdf0e10cSrcweir // and nextWord() does also => don't loop forever. 914cdf0e10cSrcweir return aRet; 915cdf0e10cSrcweir } 916cdf0e10cSrcweir 917cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, 918cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos ) 919cdf0e10cSrcweir throw(RuntimeException) 920cdf0e10cSrcweir { 921cdf0e10cSrcweir SearchResult aRet; 922cdf0e10cSrcweir aRet.subRegExpressions = 0; 923cdf0e10cSrcweir 924cdf0e10cSrcweir if( !xBreak.is() ) 925cdf0e10cSrcweir return aRet; 926cdf0e10cSrcweir 927cdf0e10cSrcweir OUString aWTemp( searchStr ); 928cdf0e10cSrcweir 929*c1e8cc3aSDon Lewis sal_Int32 nStt, nEnd; 930cdf0e10cSrcweir 931cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos, 932cdf0e10cSrcweir aSrchPara.Locale, 933cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True ); 934cdf0e10cSrcweir 935cdf0e10cSrcweir do 936cdf0e10cSrcweir { 937cdf0e10cSrcweir if( aWBnd.endPos <= endPos ) 938cdf0e10cSrcweir break; 939cdf0e10cSrcweir nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; 940cdf0e10cSrcweir nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos; 941cdf0e10cSrcweir 942cdf0e10cSrcweir if( nStt < nEnd && 943cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit ) 944cdf0e10cSrcweir { 945cdf0e10cSrcweir aRet.subRegExpressions = 1; 946cdf0e10cSrcweir aRet.startOffset.realloc( 1 ); 947cdf0e10cSrcweir aRet.startOffset[ 0 ] = nEnd; 948cdf0e10cSrcweir aRet.endOffset.realloc( 1 ); 949cdf0e10cSrcweir aRet.endOffset[ 0 ] = nStt; 950cdf0e10cSrcweir break; 951cdf0e10cSrcweir } 952cdf0e10cSrcweir if( !nStt ) 953cdf0e10cSrcweir break; 954cdf0e10cSrcweir 955cdf0e10cSrcweir aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale, 956cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES); 957cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() ); 958cdf0e10cSrcweir return aRet; 959cdf0e10cSrcweir } 960cdf0e10cSrcweir 961cdf0e10cSrcweir 962cdf0e10cSrcweir static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch"; 963cdf0e10cSrcweir static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n"; 964cdf0e10cSrcweir 965cdf0e10cSrcweir static OUString getServiceName_Static() 966cdf0e10cSrcweir { 967cdf0e10cSrcweir return OUString::createFromAscii( cSearchName ); 968cdf0e10cSrcweir } 969cdf0e10cSrcweir 970cdf0e10cSrcweir static OUString getImplementationName_Static() 971cdf0e10cSrcweir { 972cdf0e10cSrcweir return OUString::createFromAscii( cSearchImpl ); 973cdf0e10cSrcweir } 974cdf0e10cSrcweir 975cdf0e10cSrcweir OUString SAL_CALL 976cdf0e10cSrcweir TextSearch::getImplementationName() 977cdf0e10cSrcweir throw( RuntimeException ) 978cdf0e10cSrcweir { 979cdf0e10cSrcweir return getImplementationName_Static(); 980cdf0e10cSrcweir } 981cdf0e10cSrcweir 982cdf0e10cSrcweir sal_Bool SAL_CALL 983cdf0e10cSrcweir TextSearch::supportsService(const OUString& rServiceName) 984cdf0e10cSrcweir throw( RuntimeException ) 985cdf0e10cSrcweir { 986cdf0e10cSrcweir return !rServiceName.compareToAscii( cSearchName ); 987cdf0e10cSrcweir } 988cdf0e10cSrcweir 989cdf0e10cSrcweir Sequence< OUString > SAL_CALL 990cdf0e10cSrcweir TextSearch::getSupportedServiceNames(void) throw( RuntimeException ) 991cdf0e10cSrcweir { 992cdf0e10cSrcweir Sequence< OUString > aRet(1); 993cdf0e10cSrcweir aRet[0] = getServiceName_Static(); 994cdf0e10cSrcweir return aRet; 995cdf0e10cSrcweir } 996cdf0e10cSrcweir 997cdf0e10cSrcweir ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface > 998cdf0e10cSrcweir SAL_CALL TextSearch_CreateInstance( 999cdf0e10cSrcweir const ::com::sun::star::uno::Reference< 1000cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory >& rxMSF ) 1001cdf0e10cSrcweir { 1002cdf0e10cSrcweir return ::com::sun::star::uno::Reference< 1003cdf0e10cSrcweir ::com::sun::star::uno::XInterface >( 1004cdf0e10cSrcweir (::cppu::OWeakObject*) new TextSearch( rxMSF ) ); 1005cdf0e10cSrcweir } 1006cdf0e10cSrcweir 1007cdf0e10cSrcweir extern "C" 1008cdf0e10cSrcweir { 1009cdf0e10cSrcweir 1010cdf0e10cSrcweir void SAL_CALL component_getImplementationEnvironment( 1011cdf0e10cSrcweir const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ ) 1012cdf0e10cSrcweir { 1013cdf0e10cSrcweir *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME; 1014cdf0e10cSrcweir } 1015cdf0e10cSrcweir 1016cdf0e10cSrcweir void* SAL_CALL component_getFactory( const sal_Char* sImplementationName, 1017cdf0e10cSrcweir void* _pServiceManager, void* /*_pRegistryKey*/ ) 1018cdf0e10cSrcweir { 1019cdf0e10cSrcweir void* pRet = NULL; 1020cdf0e10cSrcweir 1021cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory* pServiceManager = 1022cdf0e10cSrcweir reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* > 1023cdf0e10cSrcweir ( _pServiceManager ); 1024cdf0e10cSrcweir ::com::sun::star::uno::Reference< 1025cdf0e10cSrcweir ::com::sun::star::lang::XSingleServiceFactory > xFactory; 1026cdf0e10cSrcweir 1027cdf0e10cSrcweir if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) ) 1028cdf0e10cSrcweir { 1029cdf0e10cSrcweir ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1); 1030cdf0e10cSrcweir aServiceNames[0] = getServiceName_Static(); 1031cdf0e10cSrcweir xFactory = ::cppu::createSingleFactory( 1032cdf0e10cSrcweir pServiceManager, getImplementationName_Static(), 1033cdf0e10cSrcweir &TextSearch_CreateInstance, aServiceNames ); 1034cdf0e10cSrcweir } 1035cdf0e10cSrcweir 1036cdf0e10cSrcweir if ( xFactory.is() ) 1037cdf0e10cSrcweir { 1038cdf0e10cSrcweir xFactory->acquire(); 1039cdf0e10cSrcweir pRet = xFactory.get(); 1040cdf0e10cSrcweir } 1041cdf0e10cSrcweir 1042cdf0e10cSrcweir return pRet; 1043cdf0e10cSrcweir } 1044cdf0e10cSrcweir 1045cdf0e10cSrcweir } // extern "C" 1046