/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/
21449ab281SAndrew Rist
22449ab281SAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25*52d905a6SJim Jagielski #include "precompiled_search.hxx"
26cdf0e10cSrcweir
27cdf0e10cSrcweir #include "textsearch.hxx"
28cdf0e10cSrcweir #include "levdis.hxx"
29cdf0e10cSrcweir #include <com/sun/star/lang/Locale.hpp>
30cdf0e10cSrcweir #include <com/sun/star/lang/XMultiServiceFactory.hpp>
31cdf0e10cSrcweir #include <comphelper/processfactory.hxx>
32cdf0e10cSrcweir #include <com/sun/star/i18n/UnicodeType.hpp>
33cdf0e10cSrcweir #include <com/sun/star/util/SearchFlags.hpp>
34cdf0e10cSrcweir #include <com/sun/star/i18n/WordType.hpp>
35cdf0e10cSrcweir #include <com/sun/star/i18n/ScriptType.hpp>
36cdf0e10cSrcweir #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
37cdf0e10cSrcweir #include <com/sun/star/i18n/KCharacterType.hpp>
38cdf0e10cSrcweir #include <com/sun/star/registry/XRegistryKey.hpp>
39cdf0e10cSrcweir #include <cppuhelper/factory.hxx>
40cdf0e10cSrcweir #include <cppuhelper/weak.hxx>
41cdf0e10cSrcweir
42cdf0e10cSrcweir #ifdef _MSC_VER
43cdf0e10cSrcweir // get rid of that dumb compiler warning
44cdf0e10cSrcweir // identifier was truncated to '255' characters in the debug information
45cdf0e10cSrcweir // for STL template usage, if .pdb files are to be created
46cdf0e10cSrcweir #pragma warning( disable: 4786 )
47cdf0e10cSrcweir #endif
48cdf0e10cSrcweir
49cdf0e10cSrcweir #include <string.h>
50cdf0e10cSrcweir
51cdf0e10cSrcweir using namespace ::com::sun::star::util;
52cdf0e10cSrcweir using namespace ::com::sun::star::uno;
53cdf0e10cSrcweir using namespace ::com::sun::star::lang;
54cdf0e10cSrcweir using namespace ::com::sun::star::i18n;
55cdf0e10cSrcweir using namespace ::rtl;
56cdf0e10cSrcweir
57cdf0e10cSrcweir static sal_Int32 COMPLEX_TRANS_MASK_TMP =
58cdf0e10cSrcweir TransliterationModules_ignoreBaFa_ja_JP |
59cdf0e10cSrcweir TransliterationModules_ignoreIterationMark_ja_JP |
60cdf0e10cSrcweir TransliterationModules_ignoreTiJi_ja_JP |
61cdf0e10cSrcweir TransliterationModules_ignoreHyuByu_ja_JP |
62cdf0e10cSrcweir TransliterationModules_ignoreSeZe_ja_JP |
63cdf0e10cSrcweir TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
64cdf0e10cSrcweir TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
65cdf0e10cSrcweir TransliterationModules_ignoreProlongedSoundMark_ja_JP;
66cc450e3aSHerbert Dürr static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
67e2630f2cSHerbert Dürr static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
68e2630f2cSHerbert Dürr static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
69cdf0e10cSrcweir // Above 2 transliteration is simple but need to take effect in
70cdf0e10cSrcweir // complex transliteration
71cdf0e10cSrcweir
TextSearch(const Reference<XMultiServiceFactory> & rxMSF)72cdf0e10cSrcweir TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
73cdf0e10cSrcweir : xMSF( rxMSF )
74cdf0e10cSrcweir , pJumpTable( 0 )
75cdf0e10cSrcweir , pJumpTable2( 0 )
76cc450e3aSHerbert Dürr , pRegexMatcher( NULL )
77cdf0e10cSrcweir , pWLD( 0 )
78cdf0e10cSrcweir {
79cdf0e10cSrcweir SearchOptions aOpt;
80cdf0e10cSrcweir aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
81cdf0e10cSrcweir aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
82cdf0e10cSrcweir //aOpt.Locale = ???;
83cdf0e10cSrcweir setOptions( aOpt );
84cdf0e10cSrcweir }
85cdf0e10cSrcweir
~TextSearch()86cdf0e10cSrcweir TextSearch::~TextSearch()
87cdf0e10cSrcweir {
88cc450e3aSHerbert Dürr delete pRegexMatcher;
89cdf0e10cSrcweir delete pWLD;
90cdf0e10cSrcweir delete pJumpTable;
91cdf0e10cSrcweir delete pJumpTable2;
92cdf0e10cSrcweir }
93cdf0e10cSrcweir
setOptions(const SearchOptions & rOptions)94cdf0e10cSrcweir void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
95cdf0e10cSrcweir {
96cdf0e10cSrcweir aSrchPara = rOptions;
97cdf0e10cSrcweir
98cc450e3aSHerbert Dürr delete pRegexMatcher, pRegexMatcher = NULL;
99cdf0e10cSrcweir delete pWLD, pWLD = 0;
100cdf0e10cSrcweir delete pJumpTable, pJumpTable = 0;
101cdf0e10cSrcweir delete pJumpTable2, pJumpTable2 = 0;
102cdf0e10cSrcweir
103cdf0e10cSrcweir // Create Transliteration class
104cdf0e10cSrcweir if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
105cdf0e10cSrcweir {
106cdf0e10cSrcweir if( !xTranslit.is() )
107cdf0e10cSrcweir {
108cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance(
109cdf0e10cSrcweir OUString::createFromAscii(
110cdf0e10cSrcweir "com.sun.star.i18n.Transliteration"));
111cdf0e10cSrcweir if ( xI.is() )
112cdf0e10cSrcweir xI->queryInterface( ::getCppuType(
113cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0))
114cdf0e10cSrcweir >>= xTranslit;
115cdf0e10cSrcweir }
116cdf0e10cSrcweir // Load transliteration module
117cdf0e10cSrcweir if( xTranslit.is() )
118cdf0e10cSrcweir xTranslit->loadModule(
119cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
120cdf0e10cSrcweir aSrchPara.Locale);
121cdf0e10cSrcweir }
122cdf0e10cSrcweir else if( xTranslit.is() )
123cdf0e10cSrcweir xTranslit = 0;
124cdf0e10cSrcweir
125cdf0e10cSrcweir // Create Transliteration for 2<->1, 2<->2 transliteration
126cdf0e10cSrcweir if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
127cdf0e10cSrcweir {
128cdf0e10cSrcweir if( !xTranslit2.is() )
129cdf0e10cSrcweir {
130cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance(
131cdf0e10cSrcweir OUString::createFromAscii(
132cdf0e10cSrcweir "com.sun.star.i18n.Transliteration"));
133cdf0e10cSrcweir if ( xI.is() )
134cdf0e10cSrcweir xI->queryInterface( ::getCppuType(
135cdf0e10cSrcweir (const Reference< XExtendedTransliteration >*)0))
136cdf0e10cSrcweir >>= xTranslit2;
137cdf0e10cSrcweir }
138cdf0e10cSrcweir // Load transliteration module
139cdf0e10cSrcweir if( xTranslit2.is() )
140cdf0e10cSrcweir xTranslit2->loadModule(
141cdf0e10cSrcweir (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
142cdf0e10cSrcweir aSrchPara.Locale);
143cdf0e10cSrcweir }
144cdf0e10cSrcweir
145cdf0e10cSrcweir if ( !xBreak.is() )
146cdf0e10cSrcweir {
147cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance(
148cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.BreakIterator"));
149cdf0e10cSrcweir if( xI.is() )
150cdf0e10cSrcweir xI->queryInterface( ::getCppuType(
151cdf0e10cSrcweir (const Reference< XBreakIterator >*)0))
152cdf0e10cSrcweir >>= xBreak;
153cdf0e10cSrcweir }
154cdf0e10cSrcweir
155cdf0e10cSrcweir sSrchStr = aSrchPara.searchString;
156cdf0e10cSrcweir
157cc450e3aSHerbert Dürr // use transliteration here
158cc450e3aSHerbert Dürr if ( xTranslit.is() &&
159cdf0e10cSrcweir aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
160cdf0e10cSrcweir sSrchStr = xTranslit->transliterateString2String(
161cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
162cdf0e10cSrcweir
163cc450e3aSHerbert Dürr if ( xTranslit2.is() &&
164cdf0e10cSrcweir aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
165cdf0e10cSrcweir sSrchStr2 = xTranslit2->transliterateString2String(
166cdf0e10cSrcweir aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
167cdf0e10cSrcweir
168cdf0e10cSrcweir // When start or end of search string is a complex script type, we need to
169cdf0e10cSrcweir // make sure the result boundary is not located in the middle of cell.
170cdf0e10cSrcweir checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
171cdf0e10cSrcweir ScriptType::COMPLEX));
172cdf0e10cSrcweir checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
173cdf0e10cSrcweir sSrchStr.getLength()-1) == ScriptType::COMPLEX));
174cdf0e10cSrcweir
175cc450e3aSHerbert Dürr switch( aSrchPara.algorithmType)
176cdf0e10cSrcweir {
177cc450e3aSHerbert Dürr case SearchAlgorithms_REGEXP:
178cc450e3aSHerbert Dürr fnForward = &TextSearch::RESrchFrwrd;
179cc450e3aSHerbert Dürr fnBackward = &TextSearch::RESrchBkwrd;
1807f9f793fSHerbert Dürr RESrchPrepare( aSrchPara);
1817f9f793fSHerbert Dürr break;
182cc450e3aSHerbert Dürr
183cc450e3aSHerbert Dürr case SearchAlgorithms_APPROXIMATE:
184cdf0e10cSrcweir fnForward = &TextSearch::ApproxSrchFrwrd;
185cdf0e10cSrcweir fnBackward = &TextSearch::ApproxSrchBkwrd;
186cdf0e10cSrcweir
187cdf0e10cSrcweir pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
188cdf0e10cSrcweir aSrchPara.insertedChars, aSrchPara.deletedChars,
189cdf0e10cSrcweir 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
190cdf0e10cSrcweir
191cdf0e10cSrcweir nLimit = pWLD->GetLimit();
192cc450e3aSHerbert Dürr break;
193cc450e3aSHerbert Dürr
194cc450e3aSHerbert Dürr default:
195cdf0e10cSrcweir fnForward = &TextSearch::NSrchFrwrd;
196cdf0e10cSrcweir fnBackward = &TextSearch::NSrchBkwrd;
197cc450e3aSHerbert Dürr break;
198cdf0e10cSrcweir }
199cdf0e10cSrcweir }
200cdf0e10cSrcweir
FindPosInSeq_Impl(const Sequence<sal_Int32> & rOff,sal_Int32 nPos)201cdf0e10cSrcweir sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
202cdf0e10cSrcweir {
203cdf0e10cSrcweir sal_Int32 nRet = 0, nEnd = rOff.getLength();
204cdf0e10cSrcweir while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
205cdf0e10cSrcweir return nRet;
206cdf0e10cSrcweir }
207cdf0e10cSrcweir
isCellStart(const OUString & searchStr,sal_Int32 nPos)208cdf0e10cSrcweir sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
209cdf0e10cSrcweir throw( RuntimeException )
210cdf0e10cSrcweir {
211cdf0e10cSrcweir sal_Int32 nDone;
212cdf0e10cSrcweir return nPos == xBreak->previousCharacters(searchStr, nPos+1,
213cdf0e10cSrcweir aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
214cdf0e10cSrcweir }
215cdf0e10cSrcweir
searchForward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)216cdf0e10cSrcweir SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
217cdf0e10cSrcweir throw( RuntimeException )
218cdf0e10cSrcweir {
219cdf0e10cSrcweir SearchResult sres;
220cdf0e10cSrcweir
221cdf0e10cSrcweir OUString in_str(searchStr);
222cdf0e10cSrcweir sal_Int32 newStartPos = startPos;
223cdf0e10cSrcweir sal_Int32 newEndPos = endPos;
224cdf0e10cSrcweir
225cdf0e10cSrcweir bUsePrimarySrchStr = true;
226cdf0e10cSrcweir
227cdf0e10cSrcweir if ( xTranslit.is() )
228cdf0e10cSrcweir {
229cdf0e10cSrcweir // apply normal transliteration (1<->1, 1<->0)
230cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
231cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
232cdf0e10cSrcweir
233cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected!
234cdf0e10cSrcweir if( startPos )
235cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos );
236cdf0e10cSrcweir
237cdf0e10cSrcweir if( endPos < searchStr.getLength() )
238cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos );
239cdf0e10cSrcweir else
240cdf0e10cSrcweir newEndPos = in_str.getLength();
241cdf0e10cSrcweir
242cdf0e10cSrcweir sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
243cdf0e10cSrcweir
244d5645047STsutomu Uchino sal_Int32 nOffsetLength = offset.getLength();
245d5645047STsutomu Uchino sal_Int32 nStartOffset = 0;
246cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ )
247cdf0e10cSrcweir {
248d5645047STsutomu Uchino nStartOffset = sres.startOffset[k];
249d5645047STsutomu Uchino if ( nStartOffset )
250d5645047STsutomu Uchino {
251d5645047STsutomu Uchino if ( nStartOffset < nOffsetLength )
252d5645047STsutomu Uchino sres.startOffset[k] = offset[nStartOffset];
253d5645047STsutomu Uchino else
254d5645047STsutomu Uchino sres.startOffset[k] = offset[offset.getLength()-1] +1;
255d5645047STsutomu Uchino }
256cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return
257cdf0e10cSrcweir // the position of the next character - return the
258cdf0e10cSrcweir // next position behind the last found character!
259cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!!
260cdf0e10cSrcweir if (sres.endOffset[k])
261cdf0e10cSrcweir sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
262cdf0e10cSrcweir }
263cdf0e10cSrcweir }
264cdf0e10cSrcweir else
265cdf0e10cSrcweir {
266cdf0e10cSrcweir sres = (this->*fnForward)( in_str, startPos, endPos );
267cdf0e10cSrcweir }
268cdf0e10cSrcweir
269cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
270cdf0e10cSrcweir {
271cdf0e10cSrcweir SearchResult sres2;
272cdf0e10cSrcweir
273cdf0e10cSrcweir in_str = OUString(searchStr);
274cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
275cdf0e10cSrcweir
276cdf0e10cSrcweir in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
277cdf0e10cSrcweir
278cdf0e10cSrcweir if( startPos )
279cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos );
280cdf0e10cSrcweir
281cdf0e10cSrcweir if( endPos < searchStr.getLength() )
282cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos );
283cdf0e10cSrcweir else
284cdf0e10cSrcweir endPos = in_str.getLength();
285cdf0e10cSrcweir
286cdf0e10cSrcweir bUsePrimarySrchStr = false;
287cdf0e10cSrcweir sres2 = (this->*fnForward)( in_str, startPos, endPos );
288cdf0e10cSrcweir
289cdf0e10cSrcweir for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
290cdf0e10cSrcweir {
291cdf0e10cSrcweir if (sres2.startOffset[k])
292cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
293cdf0e10cSrcweir if (sres2.endOffset[k])
294cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
295cdf0e10cSrcweir }
296cdf0e10cSrcweir
297cdf0e10cSrcweir // pick first and long one
298cdf0e10cSrcweir if ( sres.subRegExpressions == 0)
299cdf0e10cSrcweir return sres2;
300cdf0e10cSrcweir if ( sres2.subRegExpressions == 1)
301cdf0e10cSrcweir {
302cdf0e10cSrcweir if ( sres.startOffset[0] > sres2.startOffset[0])
303cdf0e10cSrcweir return sres2;
304cdf0e10cSrcweir else if ( sres.startOffset[0] == sres2.startOffset[0] &&
305cdf0e10cSrcweir sres.endOffset[0] < sres2.endOffset[0])
306cdf0e10cSrcweir return sres2;
307cdf0e10cSrcweir }
308cdf0e10cSrcweir }
309cdf0e10cSrcweir
310cdf0e10cSrcweir return sres;
311cdf0e10cSrcweir }
312cdf0e10cSrcweir
searchBackward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)313cdf0e10cSrcweir SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
314cdf0e10cSrcweir throw(RuntimeException)
315cdf0e10cSrcweir {
316cdf0e10cSrcweir SearchResult sres;
317cdf0e10cSrcweir
318cdf0e10cSrcweir OUString in_str(searchStr);
319cdf0e10cSrcweir sal_Int32 newStartPos = startPos;
320cdf0e10cSrcweir sal_Int32 newEndPos = endPos;
321cdf0e10cSrcweir
322cdf0e10cSrcweir bUsePrimarySrchStr = true;
323cdf0e10cSrcweir
324cdf0e10cSrcweir if ( xTranslit.is() )
325cdf0e10cSrcweir {
326cdf0e10cSrcweir // apply only simple 1<->1 transliteration here
327cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
328cdf0e10cSrcweir in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
329cdf0e10cSrcweir
330cdf0e10cSrcweir // JP 20.6.2001: also the start and end positions must be corrected!
331cdf0e10cSrcweir if( startPos < searchStr.getLength() )
332cdf0e10cSrcweir newStartPos = FindPosInSeq_Impl( offset, startPos );
333cdf0e10cSrcweir else
334cdf0e10cSrcweir newStartPos = in_str.getLength();
335cdf0e10cSrcweir
336cdf0e10cSrcweir if( endPos )
337cdf0e10cSrcweir newEndPos = FindPosInSeq_Impl( offset, endPos );
338cdf0e10cSrcweir
339cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
340cdf0e10cSrcweir
341d5645047STsutomu Uchino sal_Int32 nOffsetLength = offset.getLength();
342d5645047STsutomu Uchino sal_Int32 nEndOffset = 0;
343cdf0e10cSrcweir for ( int k = 0; k < sres.startOffset.getLength(); k++ )
344cdf0e10cSrcweir {
345cdf0e10cSrcweir if (sres.startOffset[k])
346cdf0e10cSrcweir sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
347cdf0e10cSrcweir // JP 20.6.2001: end is ever exclusive and then don't return
348cdf0e10cSrcweir // the position of the next character - return the
349cdf0e10cSrcweir // next position behind the last found character!
350cdf0e10cSrcweir // "a b c" find "b" must return 2,3 and not 2,4!!!
351d5645047STsutomu Uchino nEndOffset = sres.endOffset[k];
352d5645047STsutomu Uchino if ( nEndOffset )
353d5645047STsutomu Uchino {
354d5645047STsutomu Uchino if ( nEndOffset < nOffsetLength )
355d5645047STsutomu Uchino sres.endOffset[k] = offset[nEndOffset];
356d5645047STsutomu Uchino else
357d5645047STsutomu Uchino sres.endOffset[k] = offset[offset.getLength()-1] +1;
358d5645047STsutomu Uchino }
359cdf0e10cSrcweir }
360cdf0e10cSrcweir }
361cdf0e10cSrcweir else
362cdf0e10cSrcweir {
363cdf0e10cSrcweir sres = (this->*fnBackward)( in_str, startPos, endPos );
364cdf0e10cSrcweir }
365cdf0e10cSrcweir
366cdf0e10cSrcweir if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
367cdf0e10cSrcweir {
368cdf0e10cSrcweir SearchResult sres2;
369cdf0e10cSrcweir
370cdf0e10cSrcweir in_str = OUString(searchStr);
371cdf0e10cSrcweir com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
372cdf0e10cSrcweir
373cdf0e10cSrcweir in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
374cdf0e10cSrcweir
375cdf0e10cSrcweir if( startPos < searchStr.getLength() )
376cdf0e10cSrcweir startPos = FindPosInSeq_Impl( offset, startPos );
377cdf0e10cSrcweir else
378cdf0e10cSrcweir startPos = in_str.getLength();
379cdf0e10cSrcweir
380cdf0e10cSrcweir if( endPos )
381cdf0e10cSrcweir endPos = FindPosInSeq_Impl( offset, endPos );
382cdf0e10cSrcweir
383cdf0e10cSrcweir bUsePrimarySrchStr = false;
384cdf0e10cSrcweir sres2 = (this->*fnBackward)( in_str, startPos, endPos );
385cdf0e10cSrcweir
386cdf0e10cSrcweir for( int k = 0; k < sres2.startOffset.getLength(); k++ )
387cdf0e10cSrcweir {
388cdf0e10cSrcweir if (sres2.startOffset[k])
389cdf0e10cSrcweir sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
390cdf0e10cSrcweir if (sres2.endOffset[k])
391cdf0e10cSrcweir sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
392cdf0e10cSrcweir }
393cdf0e10cSrcweir
394cdf0e10cSrcweir // pick last and long one
395cdf0e10cSrcweir if ( sres.subRegExpressions == 0 )
396cdf0e10cSrcweir return sres2;
397cdf0e10cSrcweir if ( sres2.subRegExpressions == 1 )
398cdf0e10cSrcweir {
399cdf0e10cSrcweir if ( sres.startOffset[0] < sres2.startOffset[0] )
400cdf0e10cSrcweir return sres2;
401cdf0e10cSrcweir if ( sres.startOffset[0] == sres2.startOffset[0] &&
402cdf0e10cSrcweir sres.endOffset[0] > sres2.endOffset[0] )
403cdf0e10cSrcweir return sres2;
404cdf0e10cSrcweir }
405cdf0e10cSrcweir }
406cdf0e10cSrcweir
407cdf0e10cSrcweir return sres;
408cdf0e10cSrcweir }
409cdf0e10cSrcweir
410cc450e3aSHerbert Dürr //---------------------------------------------------------------------
411cdf0e10cSrcweir
IsDelimiter(const OUString & rStr,sal_Int32 nPos) const412cdf0e10cSrcweir bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
413cdf0e10cSrcweir {
414cdf0e10cSrcweir bool bRet = 1;
415cdf0e10cSrcweir if( '\x7f' != rStr[nPos])
416cdf0e10cSrcweir {
417cdf0e10cSrcweir if ( !xCharClass.is() )
418cdf0e10cSrcweir {
419cdf0e10cSrcweir Reference < XInterface > xI = xMSF->createInstance(
420cdf0e10cSrcweir OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification"));
421cdf0e10cSrcweir if( xI.is() )
422cdf0e10cSrcweir xI->queryInterface( ::getCppuType(
423cdf0e10cSrcweir (const Reference< XCharacterClassification >*)0))
424cdf0e10cSrcweir >>= xCharClass;
425cdf0e10cSrcweir }
426cdf0e10cSrcweir if ( xCharClass.is() )
427cdf0e10cSrcweir {
428cdf0e10cSrcweir sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
429cdf0e10cSrcweir aSrchPara.Locale );
430cdf0e10cSrcweir if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
431cdf0e10cSrcweir KCharacterType::LETTER ) & nCType ) )
432cdf0e10cSrcweir bRet = 0;
433cdf0e10cSrcweir }
434cdf0e10cSrcweir }
435cdf0e10cSrcweir return bRet;
436cdf0e10cSrcweir }
437cdf0e10cSrcweir
// --------- helper methods for Boyer-Moore like text searching ----------
// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
440cdf0e10cSrcweir
MakeForwardTab()441cdf0e10cSrcweir void TextSearch::MakeForwardTab()
442cdf0e10cSrcweir {
443cdf0e10cSrcweir // create the jumptable for the search text
444cdf0e10cSrcweir if( pJumpTable )
445cdf0e10cSrcweir {
446cdf0e10cSrcweir if( bIsForwardTab )
447cdf0e10cSrcweir return ; // the jumpTable is ok
448cdf0e10cSrcweir delete pJumpTable;
449cdf0e10cSrcweir }
450cdf0e10cSrcweir bIsForwardTab = true;
451cdf0e10cSrcweir
452cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength();
453cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable;
454cdf0e10cSrcweir
455cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n )
456cdf0e10cSrcweir {
457cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n];
458cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1;
459cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff );
460cdf0e10cSrcweir
461cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
462cdf0e10cSrcweir pJumpTable->insert( aEntry );
463cdf0e10cSrcweir if ( !aPair.second )
464cdf0e10cSrcweir (*(aPair.first)).second = nDiff;
465cdf0e10cSrcweir }
466cdf0e10cSrcweir }
467cdf0e10cSrcweir
MakeForwardTab2()468cdf0e10cSrcweir void TextSearch::MakeForwardTab2()
469cdf0e10cSrcweir {
470cdf0e10cSrcweir // create the jumptable for the search text
471cdf0e10cSrcweir if( pJumpTable2 )
472cdf0e10cSrcweir {
473cdf0e10cSrcweir if( bIsForwardTab )
474cdf0e10cSrcweir return ; // the jumpTable is ok
475cdf0e10cSrcweir delete pJumpTable2;
476cdf0e10cSrcweir }
477cdf0e10cSrcweir bIsForwardTab = true;
478cdf0e10cSrcweir
479cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength();
480cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable;
481cdf0e10cSrcweir
482cdf0e10cSrcweir for( n = 0; n < nLen - 1; ++n )
483cdf0e10cSrcweir {
484cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n];
485cdf0e10cSrcweir sal_Int32 nDiff = nLen - n - 1;
486cdf0e10cSrcweir
487cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, nDiff );
488cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
489cdf0e10cSrcweir pJumpTable2->insert( aEntry );
490cdf0e10cSrcweir if ( !aPair.second )
491cdf0e10cSrcweir (*(aPair.first)).second = nDiff;
492cdf0e10cSrcweir }
493cdf0e10cSrcweir }
494cdf0e10cSrcweir
MakeBackwardTab()495cdf0e10cSrcweir void TextSearch::MakeBackwardTab()
496cdf0e10cSrcweir {
497cdf0e10cSrcweir // create the jumptable for the search text
498cdf0e10cSrcweir if( pJumpTable )
499cdf0e10cSrcweir {
500cdf0e10cSrcweir if( !bIsForwardTab )
501cdf0e10cSrcweir return ; // the jumpTable is ok
502cdf0e10cSrcweir delete pJumpTable;
503cdf0e10cSrcweir }
504cdf0e10cSrcweir bIsForwardTab = false;
505cdf0e10cSrcweir
506cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr.getLength();
507cdf0e10cSrcweir pJumpTable = new TextSearchJumpTable;
508cdf0e10cSrcweir
509cdf0e10cSrcweir for( n = nLen-1; n > 0; --n )
510cdf0e10cSrcweir {
511cdf0e10cSrcweir sal_Unicode cCh = sSrchStr[n];
512cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n );
513cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
514cdf0e10cSrcweir pJumpTable->insert( aEntry );
515cdf0e10cSrcweir if ( !aPair.second )
516cdf0e10cSrcweir (*(aPair.first)).second = n;
517cdf0e10cSrcweir }
518cdf0e10cSrcweir }
519cdf0e10cSrcweir
MakeBackwardTab2()520cdf0e10cSrcweir void TextSearch::MakeBackwardTab2()
521cdf0e10cSrcweir {
522cdf0e10cSrcweir // create the jumptable for the search text
523cdf0e10cSrcweir if( pJumpTable2 )
524cdf0e10cSrcweir {
525cdf0e10cSrcweir if( !bIsForwardTab )
526cdf0e10cSrcweir return ; // the jumpTable is ok
527cdf0e10cSrcweir delete pJumpTable2;
528cdf0e10cSrcweir }
529cdf0e10cSrcweir bIsForwardTab = false;
530cdf0e10cSrcweir
531cdf0e10cSrcweir sal_Int32 n, nLen = sSrchStr2.getLength();
532cdf0e10cSrcweir pJumpTable2 = new TextSearchJumpTable;
533cdf0e10cSrcweir
534cdf0e10cSrcweir for( n = nLen-1; n > 0; --n )
535cdf0e10cSrcweir {
536cdf0e10cSrcweir sal_Unicode cCh = sSrchStr2[n];
537cdf0e10cSrcweir TextSearchJumpTable::value_type aEntry( cCh, n );
538cdf0e10cSrcweir ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
539cdf0e10cSrcweir pJumpTable2->insert( aEntry );
540cdf0e10cSrcweir if ( !aPair.second )
541cdf0e10cSrcweir (*(aPair.first)).second = n;
542cdf0e10cSrcweir }
543cdf0e10cSrcweir }
544cdf0e10cSrcweir
GetDiff(const sal_Unicode cChr) const545cdf0e10cSrcweir sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
546cdf0e10cSrcweir {
547cdf0e10cSrcweir TextSearchJumpTable *pJump;
548cdf0e10cSrcweir OUString sSearchKey;
549cdf0e10cSrcweir
550cdf0e10cSrcweir if ( bUsePrimarySrchStr ) {
551cdf0e10cSrcweir pJump = pJumpTable;
552cdf0e10cSrcweir sSearchKey = sSrchStr;
553cdf0e10cSrcweir } else {
554cdf0e10cSrcweir pJump = pJumpTable2;
555cdf0e10cSrcweir sSearchKey = sSrchStr2;
556cdf0e10cSrcweir }
557cdf0e10cSrcweir
558cdf0e10cSrcweir TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
559cdf0e10cSrcweir if ( iLook == pJump->end() )
560cdf0e10cSrcweir return sSearchKey.getLength();
561cdf0e10cSrcweir return (*iLook).second;
562cdf0e10cSrcweir }
563cdf0e10cSrcweir
564cdf0e10cSrcweir
// TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
NSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)566cdf0e10cSrcweir SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
567cdf0e10cSrcweir throw(RuntimeException)
568cdf0e10cSrcweir {
569cdf0e10cSrcweir SearchResult aRet;
570cdf0e10cSrcweir aRet.subRegExpressions = 0;
571cdf0e10cSrcweir
572cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
573cdf0e10cSrcweir
574cdf0e10cSrcweir OUString aStr( searchStr );
575cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength();
576cdf0e10cSrcweir sal_Int32 nEnde = endPos;
577cdf0e10cSrcweir if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
578cdf0e10cSrcweir return aRet;
579cdf0e10cSrcweir
580cdf0e10cSrcweir
581cdf0e10cSrcweir if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
582cdf0e10cSrcweir return aRet;
583cdf0e10cSrcweir
584cdf0e10cSrcweir nEnde -= sSearchKey.getLength();
585cdf0e10cSrcweir
586cdf0e10cSrcweir if (bUsePrimarySrchStr)
587cdf0e10cSrcweir MakeForwardTab(); // create the jumptable
588cdf0e10cSrcweir else
589cdf0e10cSrcweir MakeForwardTab2();
590cdf0e10cSrcweir
591cdf0e10cSrcweir for (sal_Int32 nCmpIdx = startPos; // start position for the search
592cdf0e10cSrcweir nCmpIdx <= nEnde;
593cdf0e10cSrcweir nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
594cdf0e10cSrcweir {
595cdf0e10cSrcweir // if the match would be the completed cells, skip it.
596cdf0e10cSrcweir if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
597cdf0e10cSrcweir && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
598cdf0e10cSrcweir continue;
599cdf0e10cSrcweir
600cdf0e10cSrcweir nSuchIdx = sSearchKey.getLength() - 1;
601cdf0e10cSrcweir while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
602cdf0e10cSrcweir {
603cdf0e10cSrcweir if( nSuchIdx == 0 )
604cdf0e10cSrcweir {
605cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
606cdf0e10cSrcweir {
607cdf0e10cSrcweir sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
608cdf0e10cSrcweir bool bAtStart = !nCmpIdx;
609cdf0e10cSrcweir bool bAtEnd = nFndEnd == endPos;
610cdf0e10cSrcweir bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
611cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nFndEnd );
612cdf0e10cSrcweir // * 1 -> only one word in the paragraph
613cdf0e10cSrcweir // * 2 -> at begin of paragraph
614cdf0e10cSrcweir // * 3 -> at end of paragraph
615cdf0e10cSrcweir // * 4 -> inside the paragraph
616cdf0e10cSrcweir if( !( ( bAtStart && bAtEnd ) || // 1
617cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2
618cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3
619cdf0e10cSrcweir ( bDelimBefore && bDelimBehind ))) // 4
620cdf0e10cSrcweir break;
621cdf0e10cSrcweir }
622cdf0e10cSrcweir
623cdf0e10cSrcweir aRet.subRegExpressions = 1;
624cdf0e10cSrcweir aRet.startOffset.realloc( 1 );
625cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx;
626cdf0e10cSrcweir aRet.endOffset.realloc( 1 );
627cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
628cdf0e10cSrcweir
629cdf0e10cSrcweir return aRet;
630cdf0e10cSrcweir }
631cdf0e10cSrcweir else
632cdf0e10cSrcweir nSuchIdx--;
633cdf0e10cSrcweir }
634cdf0e10cSrcweir }
635cdf0e10cSrcweir return aRet;
636cdf0e10cSrcweir }
637cdf0e10cSrcweir
NSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)638cdf0e10cSrcweir SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
639cdf0e10cSrcweir throw(RuntimeException)
640cdf0e10cSrcweir {
641cdf0e10cSrcweir SearchResult aRet;
642cdf0e10cSrcweir aRet.subRegExpressions = 0;
643cdf0e10cSrcweir
644cdf0e10cSrcweir OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
645cdf0e10cSrcweir
646cdf0e10cSrcweir OUString aStr( searchStr );
647cdf0e10cSrcweir sal_Int32 nSuchIdx = aStr.getLength();
648cdf0e10cSrcweir sal_Int32 nEnde = endPos;
649cdf0e10cSrcweir if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx)
650cdf0e10cSrcweir return aRet;
651cdf0e10cSrcweir
652cdf0e10cSrcweir if (bUsePrimarySrchStr)
653cdf0e10cSrcweir MakeBackwardTab(); // create the jumptable
654cdf0e10cSrcweir else
655cdf0e10cSrcweir MakeBackwardTab2();
656cdf0e10cSrcweir
657cdf0e10cSrcweir if( nEnde == nSuchIdx ) // end position for the search
658cdf0e10cSrcweir nEnde = sSearchKey.getLength();
659cdf0e10cSrcweir else
660cdf0e10cSrcweir nEnde += sSearchKey.getLength();
661cdf0e10cSrcweir
662cdf0e10cSrcweir sal_Int32 nCmpIdx = startPos; // start position for the search
663cdf0e10cSrcweir
664cdf0e10cSrcweir while (nCmpIdx >= nEnde)
665cdf0e10cSrcweir {
666cdf0e10cSrcweir // if the match would be the completed cells, skip it.
667cdf0e10cSrcweir if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
668cdf0e10cSrcweir sSearchKey.getLength() )) && (!checkCTLEnd ||
669cdf0e10cSrcweir isCellStart( aStr, nCmpIdx)))
670cdf0e10cSrcweir {
671cdf0e10cSrcweir nSuchIdx = 0;
672cdf0e10cSrcweir while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
673cdf0e10cSrcweir aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
674cdf0e10cSrcweir nSuchIdx++;
675cdf0e10cSrcweir if( nSuchIdx >= sSearchKey.getLength() )
676cdf0e10cSrcweir {
677cdf0e10cSrcweir if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
678cdf0e10cSrcweir {
679cdf0e10cSrcweir sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
680cdf0e10cSrcweir bool bAtStart = !nFndStt;
681cdf0e10cSrcweir bool bAtEnd = nCmpIdx == startPos;
682cdf0e10cSrcweir bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
683cdf0e10cSrcweir bool bDelimBefore = bAtStart || // begin of paragraph
684cdf0e10cSrcweir IsDelimiter( aStr, nFndStt-1 );
685cdf0e10cSrcweir // * 1 -> only one word in the paragraph
686cdf0e10cSrcweir // * 2 -> at begin of paragraph
687cdf0e10cSrcweir // * 3 -> at end of paragraph
688cdf0e10cSrcweir // * 4 -> inside the paragraph
689cdf0e10cSrcweir if( ( bAtStart && bAtEnd ) || // 1
690cdf0e10cSrcweir ( bAtStart && bDelimBehind ) || // 2
691cdf0e10cSrcweir ( bAtEnd && bDelimBefore ) || // 3
692cdf0e10cSrcweir ( bDelimBefore && bDelimBehind )) // 4
693cdf0e10cSrcweir {
694cdf0e10cSrcweir aRet.subRegExpressions = 1;
695cdf0e10cSrcweir aRet.startOffset.realloc( 1 );
696cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx;
697cdf0e10cSrcweir aRet.endOffset.realloc( 1 );
698cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
699cdf0e10cSrcweir return aRet;
700cdf0e10cSrcweir }
701cdf0e10cSrcweir }
702cdf0e10cSrcweir else
703cdf0e10cSrcweir {
704cdf0e10cSrcweir aRet.subRegExpressions = 1;
705cdf0e10cSrcweir aRet.startOffset.realloc( 1 );
706cdf0e10cSrcweir aRet.startOffset[ 0 ] = nCmpIdx;
707cdf0e10cSrcweir aRet.endOffset.realloc( 1 );
708cdf0e10cSrcweir aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
709cdf0e10cSrcweir return aRet;
710cdf0e10cSrcweir }
711cdf0e10cSrcweir }
712cdf0e10cSrcweir }
713cdf0e10cSrcweir nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
714cdf0e10cSrcweir if( nCmpIdx < nSuchIdx )
715cdf0e10cSrcweir return aRet;
716cdf0e10cSrcweir nCmpIdx -= nSuchIdx;
717cdf0e10cSrcweir }
718cdf0e10cSrcweir return aRet;
719cdf0e10cSrcweir }
720cdf0e10cSrcweir
RESrchPrepare(const::com::sun::star::util::SearchOptions & rOptions)7217f9f793fSHerbert Dürr void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
7227f9f793fSHerbert Dürr {
7237f9f793fSHerbert Dürr // select the transliterated pattern string
7247f9f793fSHerbert Dürr const OUString& rPatternStr =
725e2630f2cSHerbert Dürr (rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr
7267f9f793fSHerbert Dürr : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
7277f9f793fSHerbert Dürr
7287c5e76a7SHerbert Dürr sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
7297f9f793fSHerbert Dürr // map com::sun::star::util::SearchFlags to ICU uregex.h flags
7307f9f793fSHerbert Dürr // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
7317f9f793fSHerbert Dürr // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
7327f9f793fSHerbert Dürr // REG_NOSUB is not used anywhere => not implemented
7337f9f793fSHerbert Dürr // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
7347f9f793fSHerbert Dürr // LEV_RELAXED is only used for SearchAlgorithm==Approximate
73522c9c6f7SHerbert Dürr // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
73622c9c6f7SHerbert Dürr // probably because the transliteration flag IGNORE_CASE handles it as well.
73722c9c6f7SHerbert Dürr if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
73822c9c6f7SHerbert Dürr || (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
7397f9f793fSHerbert Dürr nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
7407f9f793fSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR;
7417f9f793fSHerbert Dürr // assumption: transliteration didn't mangle regexp control chars
74203c97e34SYuri Dario IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
743ee131020SHerbert Dürr #ifndef DISABLE_WORDBOUND_EMULATION
7447f9f793fSHerbert Dürr // for conveniance specific syntax elements of the old regex engine are emulated
7456a7366bcSHerbert Dürr // - by replacing \< with "word-break followed by a look-ahead word-char"
7466a7366bcSHerbert Dürr static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
7476a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
7486a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
7496a7366bcSHerbert Dürr aChevronMatcherB.reset( aIcuSearchPatStr);
7506a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
7516a7366bcSHerbert Dürr aChevronMatcherB.reset();
7526a7366bcSHerbert Dürr // - by replacing \> with "look-behind word-char followed by a word-break"
7536a7366bcSHerbert Dürr static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
7546a7366bcSHerbert Dürr static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
7556a7366bcSHerbert Dürr static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
7566a7366bcSHerbert Dürr aChevronMatcherE.reset( aIcuSearchPatStr);
7576a7366bcSHerbert Dürr aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
7586a7366bcSHerbert Dürr aChevronMatcherE.reset();
7597f9f793fSHerbert Dürr #endif
7607f9f793fSHerbert Dürr pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
7617f9f793fSHerbert Dürr if( nIcuErr)
7627f9f793fSHerbert Dürr { delete pRegexMatcher; pRegexMatcher = NULL;}
7637f9f793fSHerbert Dürr }
7647f9f793fSHerbert Dürr
765cdf0e10cSrcweir //---------------------------------------------------------------------------
766cdf0e10cSrcweir
RESrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)767cdf0e10cSrcweir SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
768cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos )
769cdf0e10cSrcweir throw(RuntimeException)
770cdf0e10cSrcweir {
771cc450e3aSHerbert Dürr SearchResult aRet;
772cc450e3aSHerbert Dürr aRet.subRegExpressions = 0;
773cc450e3aSHerbert Dürr if( !pRegexMatcher)
774cc450e3aSHerbert Dürr return aRet;
77519ee98b9SHerbert Dürr
776cc450e3aSHerbert Dürr if( endPos > searchStr.getLength())
777cc450e3aSHerbert Dürr endPos = searchStr.getLength();
778cc450e3aSHerbert Dürr
779cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches
780cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR;
78119716b0aSHerbert Dürr const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
782cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr);
78316b8677bSHerbert Dürr // search until there is a valid match
78416b8677bSHerbert Dürr for(;;)
78516b8677bSHerbert Dürr {
78616b8677bSHerbert Dürr if( !pRegexMatcher->find( startPos, nIcuErr))
78716b8677bSHerbert Dürr return aRet;
78816b8677bSHerbert Dürr
78916b8677bSHerbert Dürr // #i118887# ignore zero-length matches e.g. "a*" in "bc"
79016b8677bSHerbert Dürr int nStartOfs = pRegexMatcher->start( nIcuErr);
79116b8677bSHerbert Dürr int nEndOfs = pRegexMatcher->end( nIcuErr);
79216b8677bSHerbert Dürr if( nStartOfs < nEndOfs)
79316b8677bSHerbert Dürr break;
79416b8677bSHerbert Dürr // try at next position if there was a zero-length match
79516b8677bSHerbert Dürr if( ++startPos >= endPos)
79616b8677bSHerbert Dürr return aRet;
79716b8677bSHerbert Dürr }
798cc450e3aSHerbert Dürr
79916b8677bSHerbert Dürr // extract the result of the search
8000c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount();
8010c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1;
802cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions);
803cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions);
804cc450e3aSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
805cc450e3aSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
8060c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) {
8070c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
8080c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
8090c7ce76dSHerbert Dürr }
810cc450e3aSHerbert Dürr
811cc450e3aSHerbert Dürr return aRet;
812cdf0e10cSrcweir }
813cdf0e10cSrcweir
RESrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)814cdf0e10cSrcweir SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
815cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos )
816cdf0e10cSrcweir throw(RuntimeException)
817cdf0e10cSrcweir {
818cc450e3aSHerbert Dürr // NOTE: for backwards search callers provide startPos/endPos inverted!
819cc450e3aSHerbert Dürr SearchResult aRet;
820cc450e3aSHerbert Dürr aRet.subRegExpressions = 0;
821cc450e3aSHerbert Dürr if( !pRegexMatcher)
822cc450e3aSHerbert Dürr return aRet;
82319ee98b9SHerbert Dürr
824cc450e3aSHerbert Dürr if( startPos > searchStr.getLength())
825cc450e3aSHerbert Dürr startPos = searchStr.getLength();
826cc450e3aSHerbert Dürr
827cc450e3aSHerbert Dürr // use the ICU RegexMatcher to find the matches
828cc450e3aSHerbert Dürr // TODO: use ICU's backward searching once it becomes available
8290c7ce76dSHerbert Dürr // as its replacement using forward search is not as good as the real thing
830cc450e3aSHerbert Dürr UErrorCode nIcuErr = U_ZERO_ERROR;
83103c97e34SYuri Dario const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
832cc450e3aSHerbert Dürr pRegexMatcher->reset( aSearchTargetStr);
833cc450e3aSHerbert Dürr if( !pRegexMatcher->find( endPos, nIcuErr))
834cc450e3aSHerbert Dürr return aRet;
835cc450e3aSHerbert Dürr
8360c7ce76dSHerbert Dürr // find the last match
8370c7ce76dSHerbert Dürr int nLastPos = 0;
8382c1e93e7SHerbert Dürr int nFoundEnd = 0;
8390c7ce76dSHerbert Dürr do {
8400c7ce76dSHerbert Dürr nLastPos = pRegexMatcher->start( nIcuErr);
8412c1e93e7SHerbert Dürr nFoundEnd = pRegexMatcher->end( nIcuErr);
8422c1e93e7SHerbert Dürr if( nFoundEnd >= startPos)
8432c1e93e7SHerbert Dürr break;
8442c1e93e7SHerbert Dürr if( nFoundEnd == nLastPos)
8452c1e93e7SHerbert Dürr ++nFoundEnd;
8462c1e93e7SHerbert Dürr } while( pRegexMatcher->find( nFoundEnd, nIcuErr));
8470c7ce76dSHerbert Dürr
8480c7ce76dSHerbert Dürr // find last match again to get its details
8490c7ce76dSHerbert Dürr pRegexMatcher->find( nLastPos, nIcuErr);
8500c7ce76dSHerbert Dürr
8510c7ce76dSHerbert Dürr // fill in the details of the last match
8520c7ce76dSHerbert Dürr const int nGroupCount = pRegexMatcher->groupCount();
8530c7ce76dSHerbert Dürr aRet.subRegExpressions = nGroupCount + 1;
854cc450e3aSHerbert Dürr aRet.startOffset.realloc( aRet.subRegExpressions);
855cc450e3aSHerbert Dürr aRet.endOffset.realloc( aRet.subRegExpressions);
8560c7ce76dSHerbert Dürr // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
8570c7ce76dSHerbert Dürr aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
8580c7ce76dSHerbert Dürr aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
8590c7ce76dSHerbert Dürr for( int i = 1; i <= nGroupCount; ++i) {
8600c7ce76dSHerbert Dürr aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
8610c7ce76dSHerbert Dürr aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
8620c7ce76dSHerbert Dürr }
863cc450e3aSHerbert Dürr
864cc450e3aSHerbert Dürr return aRet;
865cdf0e10cSrcweir }
866cdf0e10cSrcweir
867cc450e3aSHerbert Dürr //---------------------------------------------------------------------------
868cc450e3aSHerbert Dürr
869cc450e3aSHerbert Dürr // search for words phonetically
ApproxSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)870cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
871cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos )
872cdf0e10cSrcweir throw(RuntimeException)
873cdf0e10cSrcweir {
874cdf0e10cSrcweir SearchResult aRet;
875cdf0e10cSrcweir aRet.subRegExpressions = 0;
876cdf0e10cSrcweir
877cdf0e10cSrcweir if( !xBreak.is() )
878cdf0e10cSrcweir return aRet;
879cdf0e10cSrcweir
880cdf0e10cSrcweir OUString aWTemp( searchStr );
881cdf0e10cSrcweir
882c1e8cc3aSDon Lewis sal_Int32 nStt, nEnd;
883cdf0e10cSrcweir
884cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
885cdf0e10cSrcweir aSrchPara.Locale,
886cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
887cdf0e10cSrcweir
888cdf0e10cSrcweir do
889cdf0e10cSrcweir {
890cdf0e10cSrcweir if( aWBnd.startPos >= endPos )
891cdf0e10cSrcweir break;
892cdf0e10cSrcweir nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
893cdf0e10cSrcweir nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
894cdf0e10cSrcweir
895cdf0e10cSrcweir if( nStt < nEnd &&
896cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
897cdf0e10cSrcweir {
898cdf0e10cSrcweir aRet.subRegExpressions = 1;
899cdf0e10cSrcweir aRet.startOffset.realloc( 1 );
900cdf0e10cSrcweir aRet.startOffset[ 0 ] = nStt;
901cdf0e10cSrcweir aRet.endOffset.realloc( 1 );
902cdf0e10cSrcweir aRet.endOffset[ 0 ] = nEnd;
903cdf0e10cSrcweir break;
904cdf0e10cSrcweir }
905cdf0e10cSrcweir
906cdf0e10cSrcweir nStt = nEnd - 1;
907cdf0e10cSrcweir aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
908cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES);
909cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos ||
910cdf0e10cSrcweir (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
911cdf0e10cSrcweir // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
912cdf0e10cSrcweir // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
913cdf0e10cSrcweir // and nextWord() does also => don't loop forever.
914cdf0e10cSrcweir return aRet;
915cdf0e10cSrcweir }
916cdf0e10cSrcweir
ApproxSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)917cdf0e10cSrcweir SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
918cdf0e10cSrcweir sal_Int32 startPos, sal_Int32 endPos )
919cdf0e10cSrcweir throw(RuntimeException)
920cdf0e10cSrcweir {
921cdf0e10cSrcweir SearchResult aRet;
922cdf0e10cSrcweir aRet.subRegExpressions = 0;
923cdf0e10cSrcweir
924cdf0e10cSrcweir if( !xBreak.is() )
925cdf0e10cSrcweir return aRet;
926cdf0e10cSrcweir
927cdf0e10cSrcweir OUString aWTemp( searchStr );
928cdf0e10cSrcweir
929c1e8cc3aSDon Lewis sal_Int32 nStt, nEnd;
930cdf0e10cSrcweir
931cdf0e10cSrcweir Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
932cdf0e10cSrcweir aSrchPara.Locale,
933cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
934cdf0e10cSrcweir
935cdf0e10cSrcweir do
936cdf0e10cSrcweir {
937cdf0e10cSrcweir if( aWBnd.endPos <= endPos )
938cdf0e10cSrcweir break;
939cdf0e10cSrcweir nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
940cdf0e10cSrcweir nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
941cdf0e10cSrcweir
942cdf0e10cSrcweir if( nStt < nEnd &&
943cdf0e10cSrcweir pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
944cdf0e10cSrcweir {
945cdf0e10cSrcweir aRet.subRegExpressions = 1;
946cdf0e10cSrcweir aRet.startOffset.realloc( 1 );
947cdf0e10cSrcweir aRet.startOffset[ 0 ] = nEnd;
948cdf0e10cSrcweir aRet.endOffset.realloc( 1 );
949cdf0e10cSrcweir aRet.endOffset[ 0 ] = nStt;
950cdf0e10cSrcweir break;
951cdf0e10cSrcweir }
952cdf0e10cSrcweir if( !nStt )
953cdf0e10cSrcweir break;
954cdf0e10cSrcweir
955cdf0e10cSrcweir aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
956cdf0e10cSrcweir WordType::ANYWORD_IGNOREWHITESPACES);
957cdf0e10cSrcweir } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
958cdf0e10cSrcweir return aRet;
959cdf0e10cSrcweir }
960cdf0e10cSrcweir
961cdf0e10cSrcweir
962cdf0e10cSrcweir static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
963cdf0e10cSrcweir static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
964cdf0e10cSrcweir
getServiceName_Static()965cdf0e10cSrcweir static OUString getServiceName_Static()
966cdf0e10cSrcweir {
967cdf0e10cSrcweir return OUString::createFromAscii( cSearchName );
968cdf0e10cSrcweir }
969cdf0e10cSrcweir
getImplementationName_Static()970cdf0e10cSrcweir static OUString getImplementationName_Static()
971cdf0e10cSrcweir {
972cdf0e10cSrcweir return OUString::createFromAscii( cSearchImpl );
973cdf0e10cSrcweir }
974cdf0e10cSrcweir
975cdf0e10cSrcweir OUString SAL_CALL
getImplementationName()976cdf0e10cSrcweir TextSearch::getImplementationName()
977cdf0e10cSrcweir throw( RuntimeException )
978cdf0e10cSrcweir {
979cdf0e10cSrcweir return getImplementationName_Static();
980cdf0e10cSrcweir }
981cdf0e10cSrcweir
982cdf0e10cSrcweir sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)983cdf0e10cSrcweir TextSearch::supportsService(const OUString& rServiceName)
984cdf0e10cSrcweir throw( RuntimeException )
985cdf0e10cSrcweir {
986cdf0e10cSrcweir return !rServiceName.compareToAscii( cSearchName );
987cdf0e10cSrcweir }
988cdf0e10cSrcweir
989cdf0e10cSrcweir Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)990cdf0e10cSrcweir TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
991cdf0e10cSrcweir {
992cdf0e10cSrcweir Sequence< OUString > aRet(1);
993cdf0e10cSrcweir aRet[0] = getServiceName_Static();
994cdf0e10cSrcweir return aRet;
995cdf0e10cSrcweir }
996cdf0e10cSrcweir
997cdf0e10cSrcweir ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
TextSearch_CreateInstance(const::com::sun::star::uno::Reference<::com::sun::star::lang::XMultiServiceFactory> & rxMSF)998cdf0e10cSrcweir SAL_CALL TextSearch_CreateInstance(
999cdf0e10cSrcweir const ::com::sun::star::uno::Reference<
1000cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
1001cdf0e10cSrcweir {
1002cdf0e10cSrcweir return ::com::sun::star::uno::Reference<
1003cdf0e10cSrcweir ::com::sun::star::uno::XInterface >(
1004cdf0e10cSrcweir (::cppu::OWeakObject*) new TextSearch( rxMSF ) );
1005cdf0e10cSrcweir }
1006cdf0e10cSrcweir
1007cdf0e10cSrcweir extern "C"
1008cdf0e10cSrcweir {
1009cdf0e10cSrcweir
component_getImplementationEnvironment(const sal_Char ** ppEnvTypeName,uno_Environment **)1010*52d905a6SJim Jagielski SAL_DLLPUBLIC_EXPORT void SAL_CALL component_getImplementationEnvironment(
1011cdf0e10cSrcweir const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ )
1012cdf0e10cSrcweir {
1013cdf0e10cSrcweir *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME;
1014cdf0e10cSrcweir }
1015cdf0e10cSrcweir
component_getFactory(const sal_Char * sImplementationName,void * _pServiceManager,void *)1016*52d905a6SJim Jagielski SAL_DLLPUBLIC_EXPORT void* SAL_CALL component_getFactory( const sal_Char* sImplementationName,
1017cdf0e10cSrcweir void* _pServiceManager, void* /*_pRegistryKey*/ )
1018cdf0e10cSrcweir {
1019cdf0e10cSrcweir void* pRet = NULL;
1020cdf0e10cSrcweir
1021cdf0e10cSrcweir ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1022cdf0e10cSrcweir reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1023cdf0e10cSrcweir ( _pServiceManager );
1024cdf0e10cSrcweir ::com::sun::star::uno::Reference<
1025cdf0e10cSrcweir ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1026cdf0e10cSrcweir
1027cdf0e10cSrcweir if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1028cdf0e10cSrcweir {
1029cdf0e10cSrcweir ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1);
1030cdf0e10cSrcweir aServiceNames[0] = getServiceName_Static();
1031cdf0e10cSrcweir xFactory = ::cppu::createSingleFactory(
1032cdf0e10cSrcweir pServiceManager, getImplementationName_Static(),
1033cdf0e10cSrcweir &TextSearch_CreateInstance, aServiceNames );
1034cdf0e10cSrcweir }
1035cdf0e10cSrcweir
1036cdf0e10cSrcweir if ( xFactory.is() )
1037cdf0e10cSrcweir {
1038cdf0e10cSrcweir xFactory->acquire();
1039cdf0e10cSrcweir pRet = xFactory.get();
1040cdf0e10cSrcweir }
1041cdf0e10cSrcweir
1042cdf0e10cSrcweir return pRet;
1043cdf0e10cSrcweir }
1044cdf0e10cSrcweir
1045cdf0e10cSrcweir } // extern "C"
1046