1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include "textsearch.hxx"
28 #include "levdis.hxx"
29 #include <com/sun/star/lang/Locale.hpp>
30 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
31 #include <comphelper/processfactory.hxx>
32 #include <com/sun/star/i18n/UnicodeType.hpp>
33 #include <com/sun/star/util/SearchFlags.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <com/sun/star/i18n/ScriptType.hpp>
36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
37 #include <com/sun/star/i18n/KCharacterType.hpp>
38 #include <com/sun/star/registry/XRegistryKey.hpp>
39 #include <cppuhelper/factory.hxx>
40 #include <cppuhelper/weak.hxx>
41 
42 #ifdef _MSC_VER
43 // get rid of that dumb compiler warning
44 // identifier was truncated to '255' characters in the debug information
45 // for STL template usage, if .pdb files are to be created
46 #pragma warning( disable: 4786 )
47 #endif
48 
49 #include <string.h>
50 
51 using namespace ::com::sun::star::util;
52 using namespace ::com::sun::star::uno;
53 using namespace ::com::sun::star::lang;
54 using namespace ::com::sun::star::i18n;
55 using namespace ::rtl;
56 
57 static sal_Int32 COMPLEX_TRANS_MASK_TMP =
58     TransliterationModules_ignoreBaFa_ja_JP |
59     TransliterationModules_ignoreIterationMark_ja_JP |
60     TransliterationModules_ignoreTiJi_ja_JP |
61     TransliterationModules_ignoreHyuByu_ja_JP |
62     TransliterationModules_ignoreSeZe_ja_JP |
63     TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
64     TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
65     TransliterationModules_ignoreProlongedSoundMark_ja_JP;
66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
67 static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
68 static const sal_Int32 REGEX_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
69     // Above 2 transliteration is simple but need to take effect in
70     // complex transliteration
71 
72 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
73         : xMSF( rxMSF )
74         , pJumpTable( 0 )
75         , pJumpTable2( 0 )
76         , pRegexMatcher( NULL )
77         , pWLD( 0 )
78 {
79     SearchOptions aOpt;
80     aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
81     aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
82     //aOpt.Locale = ???;
83     setOptions( aOpt );
84 }
85 
86 TextSearch::~TextSearch()
87 {
88     delete pRegexMatcher;
89     delete pWLD;
90     delete pJumpTable;
91     delete pJumpTable2;
92 }
93 
94 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
95 {
96     aSrchPara = rOptions;
97 
98     delete pRegexMatcher, pRegexMatcher = NULL;
99     delete pWLD, pWLD = 0;
100     delete pJumpTable, pJumpTable = 0;
101     delete pJumpTable2, pJumpTable2 = 0;
102 
103     // Create Transliteration class
104     if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
105     {
106         if( !xTranslit.is() )
107         {
108             Reference < XInterface > xI = xMSF->createInstance(
109                     OUString::createFromAscii(
110                         "com.sun.star.i18n.Transliteration"));
111             if ( xI.is() )
112                 xI->queryInterface( ::getCppuType(
113                             (const Reference< XExtendedTransliteration >*)0))
114                     >>= xTranslit;
115         }
116         // Load transliteration module
117         if( xTranslit.is() )
118             xTranslit->loadModule(
119                     (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
120                     aSrchPara.Locale);
121     }
122     else if( xTranslit.is() )
123         xTranslit = 0;
124 
125     // Create Transliteration for 2<->1, 2<->2 transliteration
126     if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
127     {
128         if( !xTranslit2.is() )
129         {
130             Reference < XInterface > xI = xMSF->createInstance(
131                     OUString::createFromAscii(
132                         "com.sun.star.i18n.Transliteration"));
133             if ( xI.is() )
134                 xI->queryInterface( ::getCppuType(
135                             (const Reference< XExtendedTransliteration >*)0))
136                     >>= xTranslit2;
137         }
138         // Load transliteration module
139         if( xTranslit2.is() )
140             xTranslit2->loadModule(
141                     (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
142                     aSrchPara.Locale);
143     }
144 
145     if ( !xBreak.is() )
146     {
147         Reference < XInterface > xI = xMSF->createInstance(
148                 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator"));
149         if( xI.is() )
150             xI->queryInterface( ::getCppuType(
151                         (const Reference< XBreakIterator >*)0))
152                 >>= xBreak;
153     }
154 
155     sSrchStr = aSrchPara.searchString;
156 
157     // use transliteration here
158     if ( xTranslit.is() &&
159 	 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
160         sSrchStr = xTranslit->transliterateString2String(
161                 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
162 
163     if ( xTranslit2.is() &&
164 	 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
165 	sSrchStr2 = xTranslit2->transliterateString2String(
166 	        aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
167 
168     // When start or end of search string is a complex script type, we need to
169     // make sure the result boundary is not located in the middle of cell.
170     checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
171                 ScriptType::COMPLEX));
172     checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
173                     sSrchStr.getLength()-1) == ScriptType::COMPLEX));
174 
175     switch( aSrchPara.algorithmType)
176     {
177 		case SearchAlgorithms_REGEXP:
178 			fnForward = &TextSearch::RESrchFrwrd;
179 			fnBackward = &TextSearch::RESrchBkwrd;
180 			RESrchPrepare( aSrchPara);
181 			break;
182 
183 		case SearchAlgorithms_APPROXIMATE:
184             fnForward = &TextSearch::ApproxSrchFrwrd;
185             fnBackward = &TextSearch::ApproxSrchBkwrd;
186 
187             pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
188                     aSrchPara.insertedChars, aSrchPara.deletedChars,
189                     0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
190 
191             nLimit = pWLD->GetLimit();
192 			break;
193 
194 		default:
195             fnForward = &TextSearch::NSrchFrwrd;
196             fnBackward = &TextSearch::NSrchBkwrd;
197 			break;
198     }
199 }
200 
201 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
202 {
203     sal_Int32 nRet = 0, nEnd = rOff.getLength();
204     while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
205     return nRet;
206 }
207 
208 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
209         throw( RuntimeException )
210 {
211     sal_Int32 nDone;
212     return nPos == xBreak->previousCharacters(searchStr, nPos+1,
213             aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
214 }
215 
216 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
217         throw( RuntimeException )
218 {
219     SearchResult sres;
220 
221     OUString in_str(searchStr);
222     sal_Int32 newStartPos = startPos;
223     sal_Int32 newEndPos = endPos;
224 
225     bUsePrimarySrchStr = true;
226 
227     if ( xTranslit.is() )
228     {
229         // apply normal transliteration (1<->1, 1<->0)
230         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
231         in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
232 
233         // JP 20.6.2001: also the start and end positions must be corrected!
234         if( startPos )
235             newStartPos = FindPosInSeq_Impl( offset, startPos );
236 
237         if( endPos < searchStr.getLength() )
238 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
239         else
240             newEndPos = in_str.getLength();
241 
242         sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
243 
244         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
245         {
246             if (sres.startOffset[k])
247 	      sres.startOffset[k] = offset[sres.startOffset[k]];
248             // JP 20.6.2001: end is ever exclusive and then don't return
249             //               the position of the next character - return the
250             //               next position behind the last found character!
251             //               "a b c" find "b" must return 2,3 and not 2,4!!!
252             if (sres.endOffset[k])
253 	      sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
254         }
255     }
256     else
257     {
258         sres = (this->*fnForward)( in_str, startPos, endPos );
259     }
260 
261     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
262     {
263         SearchResult sres2;
264 
265 	in_str = OUString(searchStr);
266         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
267 
268         in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
269 
270         if( startPos )
271             startPos = FindPosInSeq_Impl( offset, startPos );
272 
273         if( endPos < searchStr.getLength() )
274             endPos = FindPosInSeq_Impl( offset, endPos );
275         else
276             endPos = in_str.getLength();
277 
278 	bUsePrimarySrchStr = false;
279         sres2 = (this->*fnForward)( in_str, startPos, endPos );
280 
281         for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
282         {
283             if (sres2.startOffset[k])
284 	      sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
285             if (sres2.endOffset[k])
286 	      sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
287         }
288 
289 	// pick first and long one
290 	if ( sres.subRegExpressions == 0)
291 	    return sres2;
292 	if ( sres2.subRegExpressions == 1)
293 	{
294 	    if ( sres.startOffset[0] > sres2.startOffset[0])
295 	        return sres2;
296 	    else if ( sres.startOffset[0] == sres2.startOffset[0] &&
297 	        sres.endOffset[0] < sres2.endOffset[0])
298 	        return sres2;
299 	}
300     }
301 
302     return sres;
303 }
304 
305 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
306         throw(RuntimeException)
307 {
308     SearchResult sres;
309 
310     OUString in_str(searchStr);
311     sal_Int32 newStartPos = startPos;
312     sal_Int32 newEndPos = endPos;
313 
314     bUsePrimarySrchStr = true;
315 
316     if ( xTranslit.is() )
317     {
318         // apply only simple 1<->1 transliteration here
319         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
320 	in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
321 
322         // JP 20.6.2001: also the start and end positions must be corrected!
323         if( startPos < searchStr.getLength() )
324             newStartPos = FindPosInSeq_Impl( offset, startPos );
325 	else
326 	    newStartPos = in_str.getLength();
327 
328         if( endPos )
329 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
330 
331         sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
332 
333         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
334         {
335             if (sres.startOffset[k])
336 	      sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
337             // JP 20.6.2001: end is ever exclusive and then don't return
338             //               the position of the next character - return the
339             //               next position behind the last found character!
340             //               "a b c" find "b" must return 2,3 and not 2,4!!!
341             if (sres.endOffset[k])
342 	      sres.endOffset[k] = offset[sres.endOffset[k]];
343         }
344     }
345     else
346     {
347         sres = (this->*fnBackward)( in_str, startPos, endPos );
348     }
349 
350     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
351     {
352 	SearchResult sres2;
353 
354 	in_str = OUString(searchStr);
355         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
356 
357         in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
358 
359         if( startPos < searchStr.getLength() )
360             startPos = FindPosInSeq_Impl( offset, startPos );
361         else
362             startPos = in_str.getLength();
363 
364         if( endPos )
365             endPos = FindPosInSeq_Impl( offset, endPos );
366 
367 	bUsePrimarySrchStr = false;
368 	sres2 = (this->*fnBackward)( in_str, startPos, endPos );
369 
370         for( int k = 0; k < sres2.startOffset.getLength(); k++ )
371         {
372             if (sres2.startOffset[k])
373                 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
374             if (sres2.endOffset[k])
375                 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
376         }
377 
378 	// pick last and long one
379 	if ( sres.subRegExpressions == 0 )
380 	    return sres2;
381 	if ( sres2.subRegExpressions == 1 )
382 	{
383 	    if ( sres.startOffset[0] < sres2.startOffset[0] )
384 	        return sres2;
385 	    if ( sres.startOffset[0] == sres2.startOffset[0] &&
386 		sres.endOffset[0] > sres2.endOffset[0] )
387 	        return sres2;
388 	}
389     }
390 
391     return sres;
392 }
393 
394 //---------------------------------------------------------------------
395 
396 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
397 {
398     bool bRet = 1;
399     if( '\x7f' != rStr[nPos])
400     {
401         if ( !xCharClass.is() )
402         {
403             Reference < XInterface > xI = xMSF->createInstance(
404                     OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification"));
405             if( xI.is() )
406                 xI->queryInterface( ::getCppuType(
407                             (const Reference< XCharacterClassification >*)0))
408                     >>= xCharClass;
409         }
410         if ( xCharClass.is() )
411         {
412             sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
413                     aSrchPara.Locale );
414             if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
415                             KCharacterType::LETTER ) & nCType ) )
416                 bRet = 0;
417         }
418     }
419     return bRet;
420 }
421 
422 // --------- helper methods for Boyer-Moore like text searching ----------
423 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
424 
425 void TextSearch::MakeForwardTab()
426 {
427     // create the jumptable for the search text
428     if( pJumpTable )
429     {
430         if( bIsForwardTab )
431             return ;                                        // the jumpTable is ok
432         delete pJumpTable;
433     }
434     bIsForwardTab = true;
435 
436     sal_Int32 n, nLen = sSrchStr.getLength();
437     pJumpTable = new TextSearchJumpTable;
438 
439     for( n = 0; n < nLen - 1; ++n )
440     {
441         sal_Unicode cCh = sSrchStr[n];
442         sal_Int32 nDiff = nLen - n - 1;
443 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
444 
445         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
446             pJumpTable->insert( aEntry );
447         if ( !aPair.second )
448             (*(aPair.first)).second = nDiff;
449     }
450 }
451 
452 void TextSearch::MakeForwardTab2()
453 {
454     // create the jumptable for the search text
455     if( pJumpTable2 )
456     {
457         if( bIsForwardTab )
458             return ;                                        // the jumpTable is ok
459         delete pJumpTable2;
460     }
461     bIsForwardTab = true;
462 
463     sal_Int32 n, nLen = sSrchStr2.getLength();
464     pJumpTable2 = new TextSearchJumpTable;
465 
466     for( n = 0; n < nLen - 1; ++n )
467     {
468         sal_Unicode cCh = sSrchStr2[n];
469         sal_Int32 nDiff = nLen - n - 1;
470 
471 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
472         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
473             pJumpTable2->insert( aEntry );
474         if ( !aPair.second )
475             (*(aPair.first)).second = nDiff;
476     }
477 }
478 
479 void TextSearch::MakeBackwardTab()
480 {
481     // create the jumptable for the search text
482     if( pJumpTable )
483     {
484         if( !bIsForwardTab )
485             return ;                                        // the jumpTable is ok
486         delete pJumpTable;
487     }
488     bIsForwardTab = false;
489 
490     sal_Int32 n, nLen = sSrchStr.getLength();
491     pJumpTable = new TextSearchJumpTable;
492 
493     for( n = nLen-1; n > 0; --n )
494     {
495         sal_Unicode cCh = sSrchStr[n];
496         TextSearchJumpTable::value_type aEntry( cCh, n );
497         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
498             pJumpTable->insert( aEntry );
499         if ( !aPair.second )
500             (*(aPair.first)).second = n;
501     }
502 }
503 
504 void TextSearch::MakeBackwardTab2()
505 {
506     // create the jumptable for the search text
507     if( pJumpTable2 )
508     {
509         if( !bIsForwardTab )
510             return ;                                        // the jumpTable is ok
511         delete pJumpTable2;
512     }
513     bIsForwardTab = false;
514 
515     sal_Int32 n, nLen = sSrchStr2.getLength();
516     pJumpTable2 = new TextSearchJumpTable;
517 
518     for( n = nLen-1; n > 0; --n )
519     {
520         sal_Unicode cCh = sSrchStr2[n];
521         TextSearchJumpTable::value_type aEntry( cCh, n );
522         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
523             pJumpTable2->insert( aEntry );
524         if ( !aPair.second )
525             (*(aPair.first)).second = n;
526     }
527 }
528 
529 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
530 {
531     TextSearchJumpTable *pJump;
532     OUString sSearchKey;
533 
534     if ( bUsePrimarySrchStr ) {
535       pJump = pJumpTable;
536       sSearchKey = sSrchStr;
537     } else {
538       pJump = pJumpTable2;
539       sSearchKey = sSrchStr2;
540     }
541 
542     TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
543     if ( iLook == pJump->end() )
544         return sSearchKey.getLength();
545     return (*iLook).second;
546 }
547 
548 
549 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
550 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
551         throw(RuntimeException)
552 {
553     SearchResult aRet;
554     aRet.subRegExpressions = 0;
555 
556     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
557 
558     OUString aStr( searchStr );
559     sal_Int32 nSuchIdx = aStr.getLength();
560     sal_Int32 nEnde = endPos;
561     if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
562         return aRet;
563 
564 
565     if( nEnde < sSearchKey.getLength() )  // position inside the search region ?
566         return aRet;
567 
568     nEnde -= sSearchKey.getLength();
569 
570     if (bUsePrimarySrchStr)
571       MakeForwardTab();                   // create the jumptable
572     else
573       MakeForwardTab2();
574 
575     for (sal_Int32 nCmpIdx = startPos; // start position for the search
576             nCmpIdx <= nEnde;
577             nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
578     {
579         // if the match would be the completed cells, skip it.
580         if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
581                     && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
582             continue;
583 
584         nSuchIdx = sSearchKey.getLength() - 1;
585         while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
586         {
587             if( nSuchIdx == 0 )
588             {
589                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
590                 {
591                     sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
592                     bool bAtStart = !nCmpIdx;
593                     bool bAtEnd = nFndEnd == endPos;
594                     bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
595                     bool bDelimBehind = IsDelimiter(  aStr, nFndEnd );
596                     //  *       1 -> only one word in the paragraph
597                     //  *       2 -> at begin of paragraph
598                     //  *       3 -> at end of paragraph
599                     //  *       4 -> inside the paragraph
600                     if( !(  ( bAtStart && bAtEnd ) ||           // 1
601                                 ( bAtStart && bDelimBehind ) ||     // 2
602                                 ( bAtEnd && bDelimBefore ) ||       // 3
603                                 ( bDelimBefore && bDelimBehind )))  // 4
604                         break;
605                 }
606 
607                 aRet.subRegExpressions = 1;
608                 aRet.startOffset.realloc( 1 );
609                 aRet.startOffset[ 0 ] = nCmpIdx;
610                 aRet.endOffset.realloc( 1 );
611                 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
612 
613                 return aRet;
614             }
615             else
616                 nSuchIdx--;
617         }
618     }
619     return aRet;
620 }
621 
622 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
623         throw(RuntimeException)
624 {
625     SearchResult aRet;
626     aRet.subRegExpressions = 0;
627 
628     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
629 
630     OUString aStr( searchStr );
631     sal_Int32 nSuchIdx = aStr.getLength();
632     sal_Int32 nEnde = endPos;
633     if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx)
634         return aRet;
635 
636     if (bUsePrimarySrchStr)
637       MakeBackwardTab();                      // create the jumptable
638     else
639       MakeBackwardTab2();
640 
641     if( nEnde == nSuchIdx )                 // end position for the search
642         nEnde = sSearchKey.getLength();
643     else
644         nEnde += sSearchKey.getLength();
645 
646     sal_Int32 nCmpIdx = startPos;          // start position for the search
647 
648     while (nCmpIdx >= nEnde)
649     {
650         // if the match would be the completed cells, skip it.
651         if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
652                         sSearchKey.getLength() )) && (!checkCTLEnd ||
653                     isCellStart( aStr, nCmpIdx)))
654         {
655             nSuchIdx = 0;
656             while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
657                     aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
658                 nSuchIdx++;
659             if( nSuchIdx >= sSearchKey.getLength() )
660             {
661                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
662                 {
663                     sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
664                     bool bAtStart = !nFndStt;
665                     bool bAtEnd = nCmpIdx == startPos;
666                     bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
667                     bool bDelimBefore = bAtStart || // begin of paragraph
668                         IsDelimiter( aStr, nFndStt-1 );
669                     //  *       1 -> only one word in the paragraph
670                     //  *       2 -> at begin of paragraph
671                     //  *       3 -> at end of paragraph
672                     //  *       4 -> inside the paragraph
673                     if( ( bAtStart && bAtEnd ) ||           // 1
674                             ( bAtStart && bDelimBehind ) ||     // 2
675                             ( bAtEnd && bDelimBefore ) ||       // 3
676                             ( bDelimBefore && bDelimBehind ))   // 4
677                     {
678                         aRet.subRegExpressions = 1;
679                         aRet.startOffset.realloc( 1 );
680                         aRet.startOffset[ 0 ] = nCmpIdx;
681                         aRet.endOffset.realloc( 1 );
682                         aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
683                         return aRet;
684                     }
685                 }
686                 else
687                 {
688                     aRet.subRegExpressions = 1;
689                     aRet.startOffset.realloc( 1 );
690                     aRet.startOffset[ 0 ] = nCmpIdx;
691                     aRet.endOffset.realloc( 1 );
692                     aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
693                     return aRet;
694                 }
695             }
696         }
697         nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
698         if( nCmpIdx < nSuchIdx )
699             return aRet;
700         nCmpIdx -= nSuchIdx;
701     }
702     return aRet;
703 }
704 
705 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
706 {
707 	// select the transliterated pattern string
708 	const OUString& rPatternStr =
709 		(rOptions.transliterateFlags & REGEX_TRANS_MASK) ? sSrchStr
710 		: ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
711 
712 	sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
713 	// map com::sun::star::util::SearchFlags to ICU uregex.h flags
714 	// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
715 	// REG_NEWLINE is neither properly defined nor used anywhere => not implemented
716 	// REG_NOSUB is not used anywhere => not implemented
717 	// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
718 	// LEV_RELAXED is only used for SearchAlgorithm==Approximate
719 	// Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
720 	// probably because the transliteration flag IGNORE_CASE handles it as well.
721 	if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
722 	||  (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
723 		nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
724 	UErrorCode nIcuErr = U_ZERO_ERROR;
725 	// assumption: transliteration didn't mangle regexp control chars
726 	IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
727 #ifndef DISABLE_WORDBOUND_EMULATION
728 	// for conveniance specific syntax elements of the old regex engine are emulated
729 	// - by replacing \< with "word-break followed by a look-ahead word-char"
730 	static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
731 	static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
732 	static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
733 	aChevronMatcherB.reset( aIcuSearchPatStr);
734 	aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
735 	aChevronMatcherB.reset();
736 	// - by replacing \> with "look-behind word-char followed by a word-break"
737 	static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
738 	static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
739 	static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
740 	aChevronMatcherE.reset( aIcuSearchPatStr);
741 	aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
742 	aChevronMatcherE.reset();
743 #endif
744 	pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
745 	if( nIcuErr)
746 		{ delete pRegexMatcher; pRegexMatcher = NULL;}
747 }
748 
749 //---------------------------------------------------------------------------
750 
751 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
752                                       sal_Int32 startPos, sal_Int32 endPos )
753             throw(RuntimeException)
754 {
755 	SearchResult aRet;
756 	aRet.subRegExpressions = 0;
757 	if( !pRegexMatcher)
758 		return aRet;
759 
760 	if( endPos > searchStr.getLength())
761 		endPos = searchStr.getLength();
762 
763 	// use the ICU RegexMatcher to find the matches
764 	UErrorCode nIcuErr = U_ZERO_ERROR;
765 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
766 	pRegexMatcher->reset( aSearchTargetStr);
767 	// search until there is a valid match
768 	for(;;)
769 	{
770 		if( !pRegexMatcher->find( startPos, nIcuErr))
771 			return aRet;
772 
773 		// #i118887# ignore zero-length matches e.g. "a*" in "bc"
774 		int nStartOfs = pRegexMatcher->start( nIcuErr);
775 		int nEndOfs = pRegexMatcher->end( nIcuErr);
776 		if( nStartOfs < nEndOfs)
777 			break;
778 		// try at next position if there was a zero-length match
779 		if( ++startPos >= endPos)
780 			return aRet;
781 	}
782 
783 	// extract the result of the search
784 	const int nGroupCount = pRegexMatcher->groupCount();
785 	aRet.subRegExpressions = nGroupCount + 1;
786 	aRet.startOffset.realloc( aRet.subRegExpressions);
787 	aRet.endOffset.realloc( aRet.subRegExpressions);
788 	aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
789 	aRet.endOffset[0]   = pRegexMatcher->end( nIcuErr);
790 	for( int i = 1; i <= nGroupCount; ++i) {
791 		aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
792 		aRet.endOffset[i]   = pRegexMatcher->end( i, nIcuErr);
793 	}
794 
795 	return aRet;
796 }
797 
798 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
799                                       sal_Int32 startPos, sal_Int32 endPos )
800             throw(RuntimeException)
801 {
802 	// NOTE: for backwards search callers provide startPos/endPos inverted!
803 	SearchResult aRet;
804 	aRet.subRegExpressions = 0;
805 	if( !pRegexMatcher)
806 		return aRet;
807 
808 	if( startPos > searchStr.getLength())
809 		startPos = searchStr.getLength();
810 
811 	// use the ICU RegexMatcher to find the matches
812 	// TODO: use ICU's backward searching once it becomes available
813 	//       as its replacement using forward search is not as good as the real thing
814 	UErrorCode nIcuErr = U_ZERO_ERROR;
815 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
816 	pRegexMatcher->reset( aSearchTargetStr);
817 	if( !pRegexMatcher->find( endPos, nIcuErr))
818 		return aRet;
819 
820 	// find the last match
821 	int nLastPos = 0;
822 	int nFoundEnd = 0;
823 	do {
824 		nLastPos = pRegexMatcher->start( nIcuErr);
825 		nFoundEnd = pRegexMatcher->end( nIcuErr);
826 		if( nFoundEnd >= startPos)
827 			break;
828 		if( nFoundEnd == nLastPos)
829 			++nFoundEnd;
830 	} while( pRegexMatcher->find( nFoundEnd, nIcuErr));
831 
832 	// find last match again to get its details
833 	pRegexMatcher->find( nLastPos, nIcuErr);
834 
835 	// fill in the details of the last match
836 	const int nGroupCount = pRegexMatcher->groupCount();
837 	aRet.subRegExpressions = nGroupCount + 1;
838 	aRet.startOffset.realloc( aRet.subRegExpressions);
839 	aRet.endOffset.realloc( aRet.subRegExpressions);
840 	// NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
841 	aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
842 	aRet.endOffset[0]   = pRegexMatcher->start( nIcuErr);
843 	for( int i = 1; i <= nGroupCount; ++i) {
844 		aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
845 		aRet.endOffset[i]   = pRegexMatcher->start( i, nIcuErr);
846 	}
847 
848 	return aRet;
849 }
850 
851 //---------------------------------------------------------------------------
852 
853 // search for words phonetically
854 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
855                                           sal_Int32 startPos, sal_Int32 endPos )
856             throw(RuntimeException)
857 {
858     SearchResult aRet;
859     aRet.subRegExpressions = 0;
860 
861     if( !xBreak.is() )
862         return aRet;
863 
864     OUString aWTemp( searchStr );
865 
866     register sal_Int32 nStt, nEnd;
867 
868     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
869             aSrchPara.Locale,
870             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
871 
872     do
873     {
874         if( aWBnd.startPos >= endPos )
875             break;
876         nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
877         nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
878 
879         if( nStt < nEnd &&
880                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
881         {
882             aRet.subRegExpressions = 1;
883             aRet.startOffset.realloc( 1 );
884             aRet.startOffset[ 0 ] = nStt;
885             aRet.endOffset.realloc( 1 );
886             aRet.endOffset[ 0 ] = nEnd;
887             break;
888         }
889 
890         nStt = nEnd - 1;
891         aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
892                 WordType::ANYWORD_IGNOREWHITESPACES);
893     } while( aWBnd.startPos != aWBnd.endPos ||
894             (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
895     // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
896     // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
897     // and nextWord() does also => don't loop forever.
898     return aRet;
899 }
900 
901 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
902                                           sal_Int32 startPos, sal_Int32 endPos )
903             throw(RuntimeException)
904 {
905     SearchResult aRet;
906     aRet.subRegExpressions = 0;
907 
908     if( !xBreak.is() )
909         return aRet;
910 
911     OUString aWTemp( searchStr );
912 
913     register sal_Int32 nStt, nEnd;
914 
915     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
916             aSrchPara.Locale,
917             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
918 
919     do
920     {
921         if( aWBnd.endPos <= endPos )
922             break;
923         nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
924         nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
925 
926         if( nStt < nEnd &&
927                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
928         {
929             aRet.subRegExpressions = 1;
930             aRet.startOffset.realloc( 1 );
931             aRet.startOffset[ 0 ] = nEnd;
932             aRet.endOffset.realloc( 1 );
933             aRet.endOffset[ 0 ] = nStt;
934             break;
935         }
936         if( !nStt )
937             break;
938 
939         aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
940                 WordType::ANYWORD_IGNOREWHITESPACES);
941     } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
942     return aRet;
943 }
944 
945 
946 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
947 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
948 
949 static OUString getServiceName_Static()
950 {
951     return OUString::createFromAscii( cSearchName );
952 }
953 
954 static OUString getImplementationName_Static()
955 {
956     return OUString::createFromAscii( cSearchImpl );
957 }
958 
959 OUString SAL_CALL
960 TextSearch::getImplementationName()
961                 throw( RuntimeException )
962 {
963     return getImplementationName_Static();
964 }
965 
966 sal_Bool SAL_CALL
967 TextSearch::supportsService(const OUString& rServiceName)
968                 throw( RuntimeException )
969 {
970     return !rServiceName.compareToAscii( cSearchName );
971 }
972 
973 Sequence< OUString > SAL_CALL
974 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
975 {
976     Sequence< OUString > aRet(1);
977     aRet[0] = getServiceName_Static();
978     return aRet;
979 }
980 
981 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
982 SAL_CALL TextSearch_CreateInstance(
983         const ::com::sun::star::uno::Reference<
984         ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
985 {
986     return ::com::sun::star::uno::Reference<
987         ::com::sun::star::uno::XInterface >(
988                 (::cppu::OWeakObject*) new TextSearch( rxMSF ) );
989 }
990 
991 extern "C"
992 {
993 
994 void SAL_CALL component_getImplementationEnvironment(
995         const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ )
996 {
997     *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME;
998 }
999 
1000 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName,
1001         void* _pServiceManager, void* /*_pRegistryKey*/ )
1002 {
1003     void* pRet = NULL;
1004 
1005     ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1006         reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1007             ( _pServiceManager );
1008     ::com::sun::star::uno::Reference<
1009             ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1010 
1011     if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1012     {
1013         ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1);
1014         aServiceNames[0] = getServiceName_Static();
1015         xFactory = ::cppu::createSingleFactory(
1016                 pServiceManager, getImplementationName_Static(),
1017                 &TextSearch_CreateInstance, aServiceNames );
1018     }
1019 
1020     if ( xFactory.is() )
1021     {
1022         xFactory->acquire();
1023         pRet = xFactory.get();
1024     }
1025 
1026     return pRet;
1027 }
1028 
1029 } // extern "C"
1030