1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_search.hxx"
26 
27 #include "textsearch.hxx"
28 #include "levdis.hxx"
29 #include <com/sun/star/lang/Locale.hpp>
30 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
31 #include <comphelper/processfactory.hxx>
32 #include <com/sun/star/i18n/UnicodeType.hpp>
33 #include <com/sun/star/util/SearchFlags.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <com/sun/star/i18n/ScriptType.hpp>
36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
37 #include <com/sun/star/i18n/KCharacterType.hpp>
38 #include <com/sun/star/registry/XRegistryKey.hpp>
39 #include <cppuhelper/factory.hxx>
40 #include <cppuhelper/weak.hxx>
41 
42 #ifdef _MSC_VER
43 // get rid of that dumb compiler warning
44 // identifier was truncated to '255' characters in the debug information
45 // for STL template usage, if .pdb files are to be created
46 #pragma warning( disable: 4786 )
47 #endif
48 
49 #include <string.h>
50 
51 using namespace ::com::sun::star::util;
52 using namespace ::com::sun::star::uno;
53 using namespace ::com::sun::star::lang;
54 using namespace ::com::sun::star::i18n;
55 using namespace ::rtl;
56 
57 static sal_Int32 COMPLEX_TRANS_MASK_TMP =
58     TransliterationModules_ignoreBaFa_ja_JP |
59     TransliterationModules_ignoreIterationMark_ja_JP |
60     TransliterationModules_ignoreTiJi_ja_JP |
61     TransliterationModules_ignoreHyuByu_ja_JP |
62     TransliterationModules_ignoreSeZe_ja_JP |
63     TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
64     TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
65     TransliterationModules_ignoreProlongedSoundMark_ja_JP;
66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
67 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
68 static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
69     // Above 2 transliteration is simple but need to take effect in
70     // complex transliteration
71 
TextSearch(const Reference<XMultiServiceFactory> & rxMSF)72 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
73         : xMSF( rxMSF )
74         , pJumpTable( 0 )
75         , pJumpTable2( 0 )
76         , pRegexMatcher( NULL )
77         , pWLD( 0 )
78 {
79     SearchOptions aOpt;
80     aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
81     aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
82     //aOpt.Locale = ???;
83     setOptions( aOpt );
84 }
85 
~TextSearch()86 TextSearch::~TextSearch()
87 {
88     delete pRegexMatcher;
89     delete pWLD;
90     delete pJumpTable;
91     delete pJumpTable2;
92 }
93 
setOptions(const SearchOptions & rOptions)94 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
95 {
96     aSrchPara = rOptions;
97 
98     delete pRegexMatcher, pRegexMatcher = NULL;
99     delete pWLD, pWLD = 0;
100     delete pJumpTable, pJumpTable = 0;
101     delete pJumpTable2, pJumpTable2 = 0;
102 
103     // Create Transliteration class
104     if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
105     {
106         if( !xTranslit.is() )
107         {
108             Reference < XInterface > xI = xMSF->createInstance(
109                     OUString::createFromAscii(
110                         "com.sun.star.i18n.Transliteration"));
111             if ( xI.is() )
112                 xI->queryInterface( ::getCppuType(
113                             (const Reference< XExtendedTransliteration >*)0))
114                     >>= xTranslit;
115         }
116         // Load transliteration module
117         if( xTranslit.is() )
118             xTranslit->loadModule(
119                     (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
120                     aSrchPara.Locale);
121     }
122     else if( xTranslit.is() )
123         xTranslit = 0;
124 
125     // Create Transliteration for 2<->1, 2<->2 transliteration
126     if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
127     {
128         if( !xTranslit2.is() )
129         {
130             Reference < XInterface > xI = xMSF->createInstance(
131                     OUString::createFromAscii(
132                         "com.sun.star.i18n.Transliteration"));
133             if ( xI.is() )
134                 xI->queryInterface( ::getCppuType(
135                             (const Reference< XExtendedTransliteration >*)0))
136                     >>= xTranslit2;
137         }
138         // Load transliteration module
139         if( xTranslit2.is() )
140             xTranslit2->loadModule(
141                     (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
142                     aSrchPara.Locale);
143     }
144 
145     if ( !xBreak.is() )
146     {
147         Reference < XInterface > xI = xMSF->createInstance(
148                 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator"));
149         if( xI.is() )
150             xI->queryInterface( ::getCppuType(
151                         (const Reference< XBreakIterator >*)0))
152                 >>= xBreak;
153     }
154 
155     sSrchStr = aSrchPara.searchString;
156 
157     // use transliteration here
158     if ( xTranslit.is() &&
159 	 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
160         sSrchStr = xTranslit->transliterateString2String(
161                 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
162 
163     if ( xTranslit2.is() &&
164 	 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
165 	sSrchStr2 = xTranslit2->transliterateString2String(
166 	        aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
167 
168     // When start or end of search string is a complex script type, we need to
169     // make sure the result boundary is not located in the middle of cell.
170     checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
171                 ScriptType::COMPLEX));
172     checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
173                     sSrchStr.getLength()-1) == ScriptType::COMPLEX));
174 
175     switch( aSrchPara.algorithmType)
176     {
177 		case SearchAlgorithms_REGEXP:
178 			fnForward = &TextSearch::RESrchFrwrd;
179 			fnBackward = &TextSearch::RESrchBkwrd;
180 			RESrchPrepare( aSrchPara);
181 			break;
182 
183 		case SearchAlgorithms_APPROXIMATE:
184             fnForward = &TextSearch::ApproxSrchFrwrd;
185             fnBackward = &TextSearch::ApproxSrchBkwrd;
186 
187             pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
188                     aSrchPara.insertedChars, aSrchPara.deletedChars,
189                     0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
190 
191             nLimit = pWLD->GetLimit();
192 			break;
193 
194 		default:
195             fnForward = &TextSearch::NSrchFrwrd;
196             fnBackward = &TextSearch::NSrchBkwrd;
197 			break;
198     }
199 }
200 
FindPosInSeq_Impl(const Sequence<sal_Int32> & rOff,sal_Int32 nPos)201 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
202 {
203     sal_Int32 nRet = 0, nEnd = rOff.getLength();
204     while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
205     return nRet;
206 }
207 
isCellStart(const OUString & searchStr,sal_Int32 nPos)208 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
209         throw( RuntimeException )
210 {
211     sal_Int32 nDone;
212     return nPos == xBreak->previousCharacters(searchStr, nPos+1,
213             aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
214 }
215 
searchForward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)216 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
217         throw( RuntimeException )
218 {
219     SearchResult sres;
220 
221     OUString in_str(searchStr);
222     sal_Int32 newStartPos = startPos;
223     sal_Int32 newEndPos = endPos;
224 
225     bUsePrimarySrchStr = true;
226 
227     if ( xTranslit.is() )
228     {
229         // apply normal transliteration (1<->1, 1<->0)
230         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
231         in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
232 
233         // JP 20.6.2001: also the start and end positions must be corrected!
234         if( startPos )
235             newStartPos = FindPosInSeq_Impl( offset, startPos );
236 
237         if( endPos < searchStr.getLength() )
238 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
239         else
240             newEndPos = in_str.getLength();
241 
242         sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
243 
244         sal_Int32 nOffsetLength = offset.getLength();
245         sal_Int32 nStartOffset = 0;
246         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
247         {
248             nStartOffset = sres.startOffset[k];
249             if ( nStartOffset )
250             {
251                 if ( nStartOffset < nOffsetLength )
252                     sres.startOffset[k] = offset[nStartOffset];
253                 else
254                     sres.startOffset[k] = offset[offset.getLength()-1] +1;
255             }
256             // JP 20.6.2001: end is ever exclusive and then don't return
257             //               the position of the next character - return the
258             //               next position behind the last found character!
259             //               "a b c" find "b" must return 2,3 and not 2,4!!!
260             if (sres.endOffset[k])
261 	      sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
262         }
263     }
264     else
265     {
266         sres = (this->*fnForward)( in_str, startPos, endPos );
267     }
268 
269     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
270     {
271         SearchResult sres2;
272 
273 	in_str = OUString(searchStr);
274         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
275 
276         in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
277 
278         if( startPos )
279             startPos = FindPosInSeq_Impl( offset, startPos );
280 
281         if( endPos < searchStr.getLength() )
282             endPos = FindPosInSeq_Impl( offset, endPos );
283         else
284             endPos = in_str.getLength();
285 
286 	bUsePrimarySrchStr = false;
287         sres2 = (this->*fnForward)( in_str, startPos, endPos );
288 
289         for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
290         {
291             if (sres2.startOffset[k])
292 	      sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
293             if (sres2.endOffset[k])
294 	      sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
295         }
296 
297 	// pick first and long one
298 	if ( sres.subRegExpressions == 0)
299 	    return sres2;
300 	if ( sres2.subRegExpressions == 1)
301 	{
302 	    if ( sres.startOffset[0] > sres2.startOffset[0])
303 	        return sres2;
304 	    else if ( sres.startOffset[0] == sres2.startOffset[0] &&
305 	        sres.endOffset[0] < sres2.endOffset[0])
306 	        return sres2;
307 	}
308     }
309 
310     return sres;
311 }
312 
searchBackward(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)313 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
314         throw(RuntimeException)
315 {
316     SearchResult sres;
317 
318     OUString in_str(searchStr);
319     sal_Int32 newStartPos = startPos;
320     sal_Int32 newEndPos = endPos;
321 
322     bUsePrimarySrchStr = true;
323 
324     if ( xTranslit.is() )
325     {
326         // apply only simple 1<->1 transliteration here
327         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
328 	in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
329 
330         // JP 20.6.2001: also the start and end positions must be corrected!
331         if( startPos < searchStr.getLength() )
332             newStartPos = FindPosInSeq_Impl( offset, startPos );
333 	else
334 	    newStartPos = in_str.getLength();
335 
336         if( endPos )
337 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
338 
339         sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
340 
341         sal_Int32 nOffsetLength = offset.getLength();
342         sal_Int32 nEndOffset = 0;
343         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
344         {
345             if (sres.startOffset[k])
346 	      sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
347             // JP 20.6.2001: end is ever exclusive and then don't return
348             //               the position of the next character - return the
349             //               next position behind the last found character!
350             //               "a b c" find "b" must return 2,3 and not 2,4!!!
351             nEndOffset = sres.endOffset[k];
352             if ( nEndOffset )
353             {
354                 if ( nEndOffset < nOffsetLength )
355                     sres.endOffset[k] = offset[nEndOffset];
356                 else
357                     sres.endOffset[k] = offset[offset.getLength()-1] +1;
358             }
359         }
360     }
361     else
362     {
363         sres = (this->*fnBackward)( in_str, startPos, endPos );
364     }
365 
366     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
367     {
368 	SearchResult sres2;
369 
370 	in_str = OUString(searchStr);
371         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
372 
373         in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
374 
375         if( startPos < searchStr.getLength() )
376             startPos = FindPosInSeq_Impl( offset, startPos );
377         else
378             startPos = in_str.getLength();
379 
380         if( endPos )
381             endPos = FindPosInSeq_Impl( offset, endPos );
382 
383 	bUsePrimarySrchStr = false;
384 	sres2 = (this->*fnBackward)( in_str, startPos, endPos );
385 
386         for( int k = 0; k < sres2.startOffset.getLength(); k++ )
387         {
388             if (sres2.startOffset[k])
389                 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
390             if (sres2.endOffset[k])
391                 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
392         }
393 
394 	// pick last and long one
395 	if ( sres.subRegExpressions == 0 )
396 	    return sres2;
397 	if ( sres2.subRegExpressions == 1 )
398 	{
399 	    if ( sres.startOffset[0] < sres2.startOffset[0] )
400 	        return sres2;
401 	    if ( sres.startOffset[0] == sres2.startOffset[0] &&
402 		sres.endOffset[0] > sres2.endOffset[0] )
403 	        return sres2;
404 	}
405     }
406 
407     return sres;
408 }
409 
410 //---------------------------------------------------------------------
411 
IsDelimiter(const OUString & rStr,sal_Int32 nPos) const412 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
413 {
414     bool bRet = 1;
415     if( '\x7f' != rStr[nPos])
416     {
417         if ( !xCharClass.is() )
418         {
419             Reference < XInterface > xI = xMSF->createInstance(
420                     OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification"));
421             if( xI.is() )
422                 xI->queryInterface( ::getCppuType(
423                             (const Reference< XCharacterClassification >*)0))
424                     >>= xCharClass;
425         }
426         if ( xCharClass.is() )
427         {
428             sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
429                     aSrchPara.Locale );
430             if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
431                             KCharacterType::LETTER ) & nCType ) )
432                 bRet = 0;
433         }
434     }
435     return bRet;
436 }
437 
438 // --------- helper methods for Boyer-Moore like text searching ----------
439 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
440 
MakeForwardTab()441 void TextSearch::MakeForwardTab()
442 {
443     // create the jumptable for the search text
444     if( pJumpTable )
445     {
446         if( bIsForwardTab )
447             return ;                                        // the jumpTable is ok
448         delete pJumpTable;
449     }
450     bIsForwardTab = true;
451 
452     sal_Int32 n, nLen = sSrchStr.getLength();
453     pJumpTable = new TextSearchJumpTable;
454 
455     for( n = 0; n < nLen - 1; ++n )
456     {
457         sal_Unicode cCh = sSrchStr[n];
458         sal_Int32 nDiff = nLen - n - 1;
459 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
460 
461         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
462             pJumpTable->insert( aEntry );
463         if ( !aPair.second )
464             (*(aPair.first)).second = nDiff;
465     }
466 }
467 
MakeForwardTab2()468 void TextSearch::MakeForwardTab2()
469 {
470     // create the jumptable for the search text
471     if( pJumpTable2 )
472     {
473         if( bIsForwardTab )
474             return ;                                        // the jumpTable is ok
475         delete pJumpTable2;
476     }
477     bIsForwardTab = true;
478 
479     sal_Int32 n, nLen = sSrchStr2.getLength();
480     pJumpTable2 = new TextSearchJumpTable;
481 
482     for( n = 0; n < nLen - 1; ++n )
483     {
484         sal_Unicode cCh = sSrchStr2[n];
485         sal_Int32 nDiff = nLen - n - 1;
486 
487 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
488         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
489             pJumpTable2->insert( aEntry );
490         if ( !aPair.second )
491             (*(aPair.first)).second = nDiff;
492     }
493 }
494 
MakeBackwardTab()495 void TextSearch::MakeBackwardTab()
496 {
497     // create the jumptable for the search text
498     if( pJumpTable )
499     {
500         if( !bIsForwardTab )
501             return ;                                        // the jumpTable is ok
502         delete pJumpTable;
503     }
504     bIsForwardTab = false;
505 
506     sal_Int32 n, nLen = sSrchStr.getLength();
507     pJumpTable = new TextSearchJumpTable;
508 
509     for( n = nLen-1; n > 0; --n )
510     {
511         sal_Unicode cCh = sSrchStr[n];
512         TextSearchJumpTable::value_type aEntry( cCh, n );
513         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
514             pJumpTable->insert( aEntry );
515         if ( !aPair.second )
516             (*(aPair.first)).second = n;
517     }
518 }
519 
MakeBackwardTab2()520 void TextSearch::MakeBackwardTab2()
521 {
522     // create the jumptable for the search text
523     if( pJumpTable2 )
524     {
525         if( !bIsForwardTab )
526             return ;                                        // the jumpTable is ok
527         delete pJumpTable2;
528     }
529     bIsForwardTab = false;
530 
531     sal_Int32 n, nLen = sSrchStr2.getLength();
532     pJumpTable2 = new TextSearchJumpTable;
533 
534     for( n = nLen-1; n > 0; --n )
535     {
536         sal_Unicode cCh = sSrchStr2[n];
537         TextSearchJumpTable::value_type aEntry( cCh, n );
538         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
539             pJumpTable2->insert( aEntry );
540         if ( !aPair.second )
541             (*(aPair.first)).second = n;
542     }
543 }
544 
GetDiff(const sal_Unicode cChr) const545 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
546 {
547     TextSearchJumpTable *pJump;
548     OUString sSearchKey;
549 
550     if ( bUsePrimarySrchStr ) {
551       pJump = pJumpTable;
552       sSearchKey = sSrchStr;
553     } else {
554       pJump = pJumpTable2;
555       sSearchKey = sSrchStr2;
556     }
557 
558     TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
559     if ( iLook == pJump->end() )
560         return sSearchKey.getLength();
561     return (*iLook).second;
562 }
563 
564 
565 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
NSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)566 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
567         throw(RuntimeException)
568 {
569     SearchResult aRet;
570     aRet.subRegExpressions = 0;
571 
572     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
573 
574     OUString aStr( searchStr );
575     sal_Int32 nSuchIdx = aStr.getLength();
576     sal_Int32 nEnde = endPos;
577     if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
578         return aRet;
579 
580 
581     if( nEnde < sSearchKey.getLength() )  // position inside the search region ?
582         return aRet;
583 
584     nEnde -= sSearchKey.getLength();
585 
586     if (bUsePrimarySrchStr)
587       MakeForwardTab();                   // create the jumptable
588     else
589       MakeForwardTab2();
590 
591     for (sal_Int32 nCmpIdx = startPos; // start position for the search
592             nCmpIdx <= nEnde;
593             nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
594     {
595         // if the match would be the completed cells, skip it.
596         if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
597                     && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
598             continue;
599 
600         nSuchIdx = sSearchKey.getLength() - 1;
601         while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
602         {
603             if( nSuchIdx == 0 )
604             {
605                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
606                 {
607                     sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
608                     bool bAtStart = !nCmpIdx;
609                     bool bAtEnd = nFndEnd == endPos;
610                     bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
611                     bool bDelimBehind = IsDelimiter(  aStr, nFndEnd );
612                     //  *       1 -> only one word in the paragraph
613                     //  *       2 -> at begin of paragraph
614                     //  *       3 -> at end of paragraph
615                     //  *       4 -> inside the paragraph
616                     if( !(  ( bAtStart && bAtEnd ) ||           // 1
617                                 ( bAtStart && bDelimBehind ) ||     // 2
618                                 ( bAtEnd && bDelimBefore ) ||       // 3
619                                 ( bDelimBefore && bDelimBehind )))  // 4
620                         break;
621                 }
622 
623                 aRet.subRegExpressions = 1;
624                 aRet.startOffset.realloc( 1 );
625                 aRet.startOffset[ 0 ] = nCmpIdx;
626                 aRet.endOffset.realloc( 1 );
627                 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
628 
629                 return aRet;
630             }
631             else
632                 nSuchIdx--;
633         }
634     }
635     return aRet;
636 }
637 
NSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)638 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
639         throw(RuntimeException)
640 {
641     SearchResult aRet;
642     aRet.subRegExpressions = 0;
643 
644     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
645 
646     OUString aStr( searchStr );
647     sal_Int32 nSuchIdx = aStr.getLength();
648     sal_Int32 nEnde = endPos;
649     if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx)
650         return aRet;
651 
652     if (bUsePrimarySrchStr)
653       MakeBackwardTab();                      // create the jumptable
654     else
655       MakeBackwardTab2();
656 
657     if( nEnde == nSuchIdx )                 // end position for the search
658         nEnde = sSearchKey.getLength();
659     else
660         nEnde += sSearchKey.getLength();
661 
662     sal_Int32 nCmpIdx = startPos;          // start position for the search
663 
664     while (nCmpIdx >= nEnde)
665     {
666         // if the match would be the completed cells, skip it.
667         if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
668                         sSearchKey.getLength() )) && (!checkCTLEnd ||
669                     isCellStart( aStr, nCmpIdx)))
670         {
671             nSuchIdx = 0;
672             while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
673                     aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
674                 nSuchIdx++;
675             if( nSuchIdx >= sSearchKey.getLength() )
676             {
677                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
678                 {
679                     sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
680                     bool bAtStart = !nFndStt;
681                     bool bAtEnd = nCmpIdx == startPos;
682                     bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
683                     bool bDelimBefore = bAtStart || // begin of paragraph
684                         IsDelimiter( aStr, nFndStt-1 );
685                     //  *       1 -> only one word in the paragraph
686                     //  *       2 -> at begin of paragraph
687                     //  *       3 -> at end of paragraph
688                     //  *       4 -> inside the paragraph
689                     if( ( bAtStart && bAtEnd ) ||           // 1
690                             ( bAtStart && bDelimBehind ) ||     // 2
691                             ( bAtEnd && bDelimBefore ) ||       // 3
692                             ( bDelimBefore && bDelimBehind ))   // 4
693                     {
694                         aRet.subRegExpressions = 1;
695                         aRet.startOffset.realloc( 1 );
696                         aRet.startOffset[ 0 ] = nCmpIdx;
697                         aRet.endOffset.realloc( 1 );
698                         aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
699                         return aRet;
700                     }
701                 }
702                 else
703                 {
704                     aRet.subRegExpressions = 1;
705                     aRet.startOffset.realloc( 1 );
706                     aRet.startOffset[ 0 ] = nCmpIdx;
707                     aRet.endOffset.realloc( 1 );
708                     aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
709                     return aRet;
710                 }
711             }
712         }
713         nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
714         if( nCmpIdx < nSuchIdx )
715             return aRet;
716         nCmpIdx -= nSuchIdx;
717     }
718     return aRet;
719 }
720 
RESrchPrepare(const::com::sun::star::util::SearchOptions & rOptions)721 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
722 {
723 	// select the transliterated pattern string
724 	const OUString& rPatternStr =
725 		(rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr
726 		: ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
727 
728 	sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
729 	// map com::sun::star::util::SearchFlags to ICU uregex.h flags
730 	// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
731 	// REG_NEWLINE is neither properly defined nor used anywhere => not implemented
732 	// REG_NOSUB is not used anywhere => not implemented
733 	// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
734 	// LEV_RELAXED is only used for SearchAlgorithm==Approximate
735 	// Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
736 	// probably because the transliteration flag IGNORE_CASE handles it as well.
737 	if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
738 	||  (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
739 		nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
740 	UErrorCode nIcuErr = U_ZERO_ERROR;
741 	// assumption: transliteration didn't mangle regexp control chars
742 	IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
743 #ifndef DISABLE_WORDBOUND_EMULATION
744 	// for conveniance specific syntax elements of the old regex engine are emulated
745 	// - by replacing \< with "word-break followed by a look-ahead word-char"
746 	static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
747 	static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
748 	static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
749 	aChevronMatcherB.reset( aIcuSearchPatStr);
750 	aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
751 	aChevronMatcherB.reset();
752 	// - by replacing \> with "look-behind word-char followed by a word-break"
753 	static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
754 	static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
755 	static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
756 	aChevronMatcherE.reset( aIcuSearchPatStr);
757 	aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
758 	aChevronMatcherE.reset();
759 #endif
760 	pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
761 	if( nIcuErr)
762 		{ delete pRegexMatcher; pRegexMatcher = NULL;}
763 }
764 
765 //---------------------------------------------------------------------------
766 
RESrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)767 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
768                                       sal_Int32 startPos, sal_Int32 endPos )
769             throw(RuntimeException)
770 {
771 	SearchResult aRet;
772 	aRet.subRegExpressions = 0;
773 	if( !pRegexMatcher)
774 		return aRet;
775 
776 	if( endPos > searchStr.getLength())
777 		endPos = searchStr.getLength();
778 
779 	// use the ICU RegexMatcher to find the matches
780 	UErrorCode nIcuErr = U_ZERO_ERROR;
781 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
782 	pRegexMatcher->reset( aSearchTargetStr);
783 	// search until there is a valid match
784 	for(;;)
785 	{
786 		if( !pRegexMatcher->find( startPos, nIcuErr))
787 			return aRet;
788 
789 		// #i118887# ignore zero-length matches e.g. "a*" in "bc"
790 		int nStartOfs = pRegexMatcher->start( nIcuErr);
791 		int nEndOfs = pRegexMatcher->end( nIcuErr);
792 		if( nStartOfs < nEndOfs)
793 			break;
794 		// try at next position if there was a zero-length match
795 		if( ++startPos >= endPos)
796 			return aRet;
797 	}
798 
799 	// extract the result of the search
800 	const int nGroupCount = pRegexMatcher->groupCount();
801 	aRet.subRegExpressions = nGroupCount + 1;
802 	aRet.startOffset.realloc( aRet.subRegExpressions);
803 	aRet.endOffset.realloc( aRet.subRegExpressions);
804 	aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
805 	aRet.endOffset[0]   = pRegexMatcher->end( nIcuErr);
806 	for( int i = 1; i <= nGroupCount; ++i) {
807 		aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
808 		aRet.endOffset[i]   = pRegexMatcher->end( i, nIcuErr);
809 	}
810 
811 	return aRet;
812 }
813 
RESrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)814 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
815                                       sal_Int32 startPos, sal_Int32 endPos )
816             throw(RuntimeException)
817 {
818 	// NOTE: for backwards search callers provide startPos/endPos inverted!
819 	SearchResult aRet;
820 	aRet.subRegExpressions = 0;
821 	if( !pRegexMatcher)
822 		return aRet;
823 
824 	if( startPos > searchStr.getLength())
825 		startPos = searchStr.getLength();
826 
827 	// use the ICU RegexMatcher to find the matches
828 	// TODO: use ICU's backward searching once it becomes available
829 	//       as its replacement using forward search is not as good as the real thing
830 	UErrorCode nIcuErr = U_ZERO_ERROR;
831 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
832 	pRegexMatcher->reset( aSearchTargetStr);
833 	if( !pRegexMatcher->find( endPos, nIcuErr))
834 		return aRet;
835 
836 	// find the last match
837 	int nLastPos = 0;
838 	int nFoundEnd = 0;
839 	do {
840 		nLastPos = pRegexMatcher->start( nIcuErr);
841 		nFoundEnd = pRegexMatcher->end( nIcuErr);
842 		if( nFoundEnd >= startPos)
843 			break;
844 		if( nFoundEnd == nLastPos)
845 			++nFoundEnd;
846 	} while( pRegexMatcher->find( nFoundEnd, nIcuErr));
847 
848 	// find last match again to get its details
849 	pRegexMatcher->find( nLastPos, nIcuErr);
850 
851 	// fill in the details of the last match
852 	const int nGroupCount = pRegexMatcher->groupCount();
853 	aRet.subRegExpressions = nGroupCount + 1;
854 	aRet.startOffset.realloc( aRet.subRegExpressions);
855 	aRet.endOffset.realloc( aRet.subRegExpressions);
856 	// NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
857 	aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
858 	aRet.endOffset[0]   = pRegexMatcher->start( nIcuErr);
859 	for( int i = 1; i <= nGroupCount; ++i) {
860 		aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
861 		aRet.endOffset[i]   = pRegexMatcher->start( i, nIcuErr);
862 	}
863 
864 	return aRet;
865 }
866 
867 //---------------------------------------------------------------------------
868 
869 // search for words phonetically
ApproxSrchFrwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)870 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
871                                           sal_Int32 startPos, sal_Int32 endPos )
872             throw(RuntimeException)
873 {
874     SearchResult aRet;
875     aRet.subRegExpressions = 0;
876 
877     if( !xBreak.is() )
878         return aRet;
879 
880     OUString aWTemp( searchStr );
881 
882     sal_Int32 nStt, nEnd;
883 
884     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
885             aSrchPara.Locale,
886             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
887 
888     do
889     {
890         if( aWBnd.startPos >= endPos )
891             break;
892         nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
893         nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
894 
895         if( nStt < nEnd &&
896                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
897         {
898             aRet.subRegExpressions = 1;
899             aRet.startOffset.realloc( 1 );
900             aRet.startOffset[ 0 ] = nStt;
901             aRet.endOffset.realloc( 1 );
902             aRet.endOffset[ 0 ] = nEnd;
903             break;
904         }
905 
906         nStt = nEnd - 1;
907         aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
908                 WordType::ANYWORD_IGNOREWHITESPACES);
909     } while( aWBnd.startPos != aWBnd.endPos ||
910             (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
911     // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
912     // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
913     // and nextWord() does also => don't loop forever.
914     return aRet;
915 }
916 
ApproxSrchBkwrd(const OUString & searchStr,sal_Int32 startPos,sal_Int32 endPos)917 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
918                                           sal_Int32 startPos, sal_Int32 endPos )
919             throw(RuntimeException)
920 {
921     SearchResult aRet;
922     aRet.subRegExpressions = 0;
923 
924     if( !xBreak.is() )
925         return aRet;
926 
927     OUString aWTemp( searchStr );
928 
929     sal_Int32 nStt, nEnd;
930 
931     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
932             aSrchPara.Locale,
933             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
934 
935     do
936     {
937         if( aWBnd.endPos <= endPos )
938             break;
939         nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
940         nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
941 
942         if( nStt < nEnd &&
943                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
944         {
945             aRet.subRegExpressions = 1;
946             aRet.startOffset.realloc( 1 );
947             aRet.startOffset[ 0 ] = nEnd;
948             aRet.endOffset.realloc( 1 );
949             aRet.endOffset[ 0 ] = nStt;
950             break;
951         }
952         if( !nStt )
953             break;
954 
955         aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
956                 WordType::ANYWORD_IGNOREWHITESPACES);
957     } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
958     return aRet;
959 }
960 
961 
962 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
963 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
964 
getServiceName_Static()965 static OUString getServiceName_Static()
966 {
967     return OUString::createFromAscii( cSearchName );
968 }
969 
getImplementationName_Static()970 static OUString getImplementationName_Static()
971 {
972     return OUString::createFromAscii( cSearchImpl );
973 }
974 
975 OUString SAL_CALL
getImplementationName()976 TextSearch::getImplementationName()
977                 throw( RuntimeException )
978 {
979     return getImplementationName_Static();
980 }
981 
982 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)983 TextSearch::supportsService(const OUString& rServiceName)
984                 throw( RuntimeException )
985 {
986     return !rServiceName.compareToAscii( cSearchName );
987 }
988 
989 Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)990 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
991 {
992     Sequence< OUString > aRet(1);
993     aRet[0] = getServiceName_Static();
994     return aRet;
995 }
996 
997 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
TextSearch_CreateInstance(const::com::sun::star::uno::Reference<::com::sun::star::lang::XMultiServiceFactory> & rxMSF)998 SAL_CALL TextSearch_CreateInstance(
999         const ::com::sun::star::uno::Reference<
1000         ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
1001 {
1002     return ::com::sun::star::uno::Reference<
1003         ::com::sun::star::uno::XInterface >(
1004                 (::cppu::OWeakObject*) new TextSearch( rxMSF ) );
1005 }
1006 
1007 extern "C"
1008 {
1009 
component_getImplementationEnvironment(const sal_Char ** ppEnvTypeName,uno_Environment **)1010 SAL_DLLPUBLIC_EXPORT void SAL_CALL component_getImplementationEnvironment(
1011         const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ )
1012 {
1013     *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME;
1014 }
1015 
component_getFactory(const sal_Char * sImplementationName,void * _pServiceManager,void *)1016 SAL_DLLPUBLIC_EXPORT void* SAL_CALL component_getFactory( const sal_Char* sImplementationName,
1017         void* _pServiceManager, void* /*_pRegistryKey*/ )
1018 {
1019     void* pRet = NULL;
1020 
1021     ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1022         reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1023             ( _pServiceManager );
1024     ::com::sun::star::uno::Reference<
1025             ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1026 
1027     if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1028     {
1029         ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1);
1030         aServiceNames[0] = getServiceName_Static();
1031         xFactory = ::cppu::createSingleFactory(
1032                 pServiceManager, getImplementationName_Static(),
1033                 &TextSearch_CreateInstance, aServiceNames );
1034     }
1035 
1036     if ( xFactory.is() )
1037     {
1038         xFactory->acquire();
1039         pRet = xFactory.get();
1040     }
1041 
1042     return pRet;
1043 }
1044 
1045 } // extern "C"
1046