1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include "textsearch.hxx"
28 #include "levdis.hxx"
29 #include <com/sun/star/lang/Locale.hpp>
30 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
31 #include <comphelper/processfactory.hxx>
32 #include <com/sun/star/i18n/UnicodeType.hpp>
33 #include <com/sun/star/util/SearchFlags.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 #include <com/sun/star/i18n/ScriptType.hpp>
36 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
37 #include <com/sun/star/i18n/KCharacterType.hpp>
38 #include <com/sun/star/registry/XRegistryKey.hpp>
39 #include <cppuhelper/factory.hxx>
40 #include <cppuhelper/weak.hxx>
41 
42 #ifdef _MSC_VER
43 // get rid of that dumb compiler warning
44 // identifier was truncated to '255' characters in the debug information
45 // for STL template usage, if .pdb files are to be created
46 #pragma warning( disable: 4786 )
47 #endif
48 
49 #include <string.h>
50 
51 using namespace ::com::sun::star::util;
52 using namespace ::com::sun::star::uno;
53 using namespace ::com::sun::star::lang;
54 using namespace ::com::sun::star::i18n;
55 using namespace ::rtl;
56 
57 static sal_Int32 COMPLEX_TRANS_MASK_TMP =
58     TransliterationModules_ignoreBaFa_ja_JP |
59     TransliterationModules_ignoreIterationMark_ja_JP |
60     TransliterationModules_ignoreTiJi_ja_JP |
61     TransliterationModules_ignoreHyuByu_ja_JP |
62     TransliterationModules_ignoreSeZe_ja_JP |
63     TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
64     TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
65     TransliterationModules_ignoreProlongedSoundMark_ja_JP;
66 static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
67 static const sal_Int32 SIMPLE_TRANS_MASK = ~(COMPLEX_TRANS_MASK | TransliterationModules_IGNORE_CASE | TransliterationModules_UPPERCASE_LOWERCASE | TransliterationModules_LOWERCASE_UPPERCASE);
68     // Above 2 transliteration is simple but need to take effect in
69     // complex transliteration
70 
71 TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF)
72         : xMSF( rxMSF )
73         , pJumpTable( 0 )
74         , pJumpTable2( 0 )
75         , pRegexMatcher( NULL )
76         , pWLD( 0 )
77 {
78     SearchOptions aOpt;
79     aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
80     aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
81     //aOpt.Locale = ???;
82     setOptions( aOpt );
83 }
84 
85 TextSearch::~TextSearch()
86 {
87     delete pRegexMatcher;
88     delete pWLD;
89     delete pJumpTable;
90     delete pJumpTable2;
91 }
92 
93 void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
94 {
95     aSrchPara = rOptions;
96 
97     delete pRegexMatcher, pRegexMatcher = NULL;
98     delete pWLD, pWLD = 0;
99     delete pJumpTable, pJumpTable = 0;
100     delete pJumpTable2, pJumpTable2 = 0;
101 
102     // Create Transliteration class
103     if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
104     {
105         if( !xTranslit.is() )
106         {
107             Reference < XInterface > xI = xMSF->createInstance(
108                     OUString::createFromAscii(
109                         "com.sun.star.i18n.Transliteration"));
110             if ( xI.is() )
111                 xI->queryInterface( ::getCppuType(
112                             (const Reference< XExtendedTransliteration >*)0))
113                     >>= xTranslit;
114         }
115         // Load transliteration module
116         if( xTranslit.is() )
117             xTranslit->loadModule(
118                     (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
119                     aSrchPara.Locale);
120     }
121     else if( xTranslit.is() )
122         xTranslit = 0;
123 
124     // Create Transliteration for 2<->1, 2<->2 transliteration
125     if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
126     {
127         if( !xTranslit2.is() )
128         {
129             Reference < XInterface > xI = xMSF->createInstance(
130                     OUString::createFromAscii(
131                         "com.sun.star.i18n.Transliteration"));
132             if ( xI.is() )
133                 xI->queryInterface( ::getCppuType(
134                             (const Reference< XExtendedTransliteration >*)0))
135                     >>= xTranslit2;
136         }
137         // Load transliteration module
138         if( xTranslit2.is() )
139             xTranslit2->loadModule(
140                     (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
141                     aSrchPara.Locale);
142     }
143 
144     if ( !xBreak.is() )
145     {
146         Reference < XInterface > xI = xMSF->createInstance(
147                 OUString::createFromAscii( "com.sun.star.i18n.BreakIterator"));
148         if( xI.is() )
149             xI->queryInterface( ::getCppuType(
150                         (const Reference< XBreakIterator >*)0))
151                 >>= xBreak;
152     }
153 
154     sSrchStr = aSrchPara.searchString;
155 
156     // use transliteration here
157     if ( xTranslit.is() &&
158 	 aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
159         sSrchStr = xTranslit->transliterateString2String(
160                 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
161 
162     if ( xTranslit2.is() &&
163 	 aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
164 	sSrchStr2 = xTranslit2->transliterateString2String(
165 	        aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
166 
167     // When start or end of search string is a complex script type, we need to
168     // make sure the result boundary is not located in the middle of cell.
169     checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
170                 ScriptType::COMPLEX));
171     checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
172                     sSrchStr.getLength()-1) == ScriptType::COMPLEX));
173 
174     switch( aSrchPara.algorithmType)
175     {
176 		case SearchAlgorithms_REGEXP:
177 			fnForward = &TextSearch::RESrchFrwrd;
178 			fnBackward = &TextSearch::RESrchBkwrd;
179 			RESrchPrepare( aSrchPara);
180 			break;
181 
182 		case SearchAlgorithms_APPROXIMATE:
183             fnForward = &TextSearch::ApproxSrchFrwrd;
184             fnBackward = &TextSearch::ApproxSrchBkwrd;
185 
186             pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
187                     aSrchPara.insertedChars, aSrchPara.deletedChars,
188                     0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
189 
190             nLimit = pWLD->GetLimit();
191 			break;
192 
193 		default:
194             fnForward = &TextSearch::NSrchFrwrd;
195             fnBackward = &TextSearch::NSrchBkwrd;
196 			break;
197     }
198 }
199 
200 sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
201 {
202     sal_Int32 nRet = 0, nEnd = rOff.getLength();
203     while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
204     return nRet;
205 }
206 
207 sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
208         throw( RuntimeException )
209 {
210     sal_Int32 nDone;
211     return nPos == xBreak->previousCharacters(searchStr, nPos+1,
212             aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
213 }
214 
215 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
216         throw( RuntimeException )
217 {
218     SearchResult sres;
219 
220     OUString in_str(searchStr);
221     sal_Int32 newStartPos = startPos;
222     sal_Int32 newEndPos = endPos;
223 
224     bUsePrimarySrchStr = true;
225 
226     if ( xTranslit.is() )
227     {
228         // apply normal transliteration (1<->1, 1<->0)
229         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
230         in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
231 
232         // JP 20.6.2001: also the start and end positions must be corrected!
233         if( startPos )
234             newStartPos = FindPosInSeq_Impl( offset, startPos );
235 
236         if( endPos < searchStr.getLength() )
237 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
238         else
239             newEndPos = in_str.getLength();
240 
241         sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
242 
243         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
244         {
245             if (sres.startOffset[k])
246 	      sres.startOffset[k] = offset[sres.startOffset[k]];
247             // JP 20.6.2001: end is ever exclusive and then don't return
248             //               the position of the next character - return the
249             //               next position behind the last found character!
250             //               "a b c" find "b" must return 2,3 and not 2,4!!!
251             if (sres.endOffset[k])
252 	      sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
253         }
254     }
255     else
256     {
257         sres = (this->*fnForward)( in_str, startPos, endPos );
258     }
259 
260     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
261     {
262         SearchResult sres2;
263 
264 	in_str = OUString(searchStr);
265         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
266 
267         in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
268 
269         if( startPos )
270             startPos = FindPosInSeq_Impl( offset, startPos );
271 
272         if( endPos < searchStr.getLength() )
273             endPos = FindPosInSeq_Impl( offset, endPos );
274         else
275             endPos = in_str.getLength();
276 
277 	bUsePrimarySrchStr = false;
278         sres2 = (this->*fnForward)( in_str, startPos, endPos );
279 
280         for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
281         {
282             if (sres2.startOffset[k])
283 	      sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
284             if (sres2.endOffset[k])
285 	      sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
286         }
287 
288 	// pick first and long one
289 	if ( sres.subRegExpressions == 0)
290 	    return sres2;
291 	if ( sres2.subRegExpressions == 1)
292 	{
293 	    if ( sres.startOffset[0] > sres2.startOffset[0])
294 	        return sres2;
295 	    else if ( sres.startOffset[0] == sres2.startOffset[0] &&
296 	        sres.endOffset[0] < sres2.endOffset[0])
297 	        return sres2;
298 	}
299     }
300 
301     return sres;
302 }
303 
304 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
305         throw(RuntimeException)
306 {
307     SearchResult sres;
308 
309     OUString in_str(searchStr);
310     sal_Int32 newStartPos = startPos;
311     sal_Int32 newEndPos = endPos;
312 
313     bUsePrimarySrchStr = true;
314 
315     if ( xTranslit.is() )
316     {
317         // apply only simple 1<->1 transliteration here
318         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
319 	in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
320 
321         // JP 20.6.2001: also the start and end positions must be corrected!
322         if( startPos < searchStr.getLength() )
323             newStartPos = FindPosInSeq_Impl( offset, startPos );
324 	else
325 	    newStartPos = in_str.getLength();
326 
327         if( endPos )
328 	    newEndPos = FindPosInSeq_Impl( offset, endPos );
329 
330         sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
331 
332         for ( int k = 0; k < sres.startOffset.getLength(); k++ )
333         {
334             if (sres.startOffset[k])
335 	      sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
336             // JP 20.6.2001: end is ever exclusive and then don't return
337             //               the position of the next character - return the
338             //               next position behind the last found character!
339             //               "a b c" find "b" must return 2,3 and not 2,4!!!
340             if (sres.endOffset[k])
341 	      sres.endOffset[k] = offset[sres.endOffset[k]];
342         }
343     }
344     else
345     {
346         sres = (this->*fnBackward)( in_str, startPos, endPos );
347     }
348 
349     if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
350     {
351 	SearchResult sres2;
352 
353 	in_str = OUString(searchStr);
354         com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
355 
356         in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
357 
358         if( startPos < searchStr.getLength() )
359             startPos = FindPosInSeq_Impl( offset, startPos );
360         else
361             startPos = in_str.getLength();
362 
363         if( endPos )
364             endPos = FindPosInSeq_Impl( offset, endPos );
365 
366 	bUsePrimarySrchStr = false;
367 	sres2 = (this->*fnBackward)( in_str, startPos, endPos );
368 
369         for( int k = 0; k < sres2.startOffset.getLength(); k++ )
370         {
371             if (sres2.startOffset[k])
372                 sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
373             if (sres2.endOffset[k])
374                 sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
375         }
376 
377 	// pick last and long one
378 	if ( sres.subRegExpressions == 0 )
379 	    return sres2;
380 	if ( sres2.subRegExpressions == 1 )
381 	{
382 	    if ( sres.startOffset[0] < sres2.startOffset[0] )
383 	        return sres2;
384 	    if ( sres.startOffset[0] == sres2.startOffset[0] &&
385 		sres.endOffset[0] > sres2.endOffset[0] )
386 	        return sres2;
387 	}
388     }
389 
390     return sres;
391 }
392 
393 //---------------------------------------------------------------------
394 
395 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
396 {
397     bool bRet = 1;
398     if( '\x7f' != rStr[nPos])
399     {
400         if ( !xCharClass.is() )
401         {
402             Reference < XInterface > xI = xMSF->createInstance(
403                     OUString::createFromAscii( "com.sun.star.i18n.CharacterClassification"));
404             if( xI.is() )
405                 xI->queryInterface( ::getCppuType(
406                             (const Reference< XCharacterClassification >*)0))
407                     >>= xCharClass;
408         }
409         if ( xCharClass.is() )
410         {
411             sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
412                     aSrchPara.Locale );
413             if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
414                             KCharacterType::LETTER ) & nCType ) )
415                 bRet = 0;
416         }
417     }
418     return bRet;
419 }
420 
421 // --------- helper methods for Boyer-Moore like text searching ----------
422 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
423 
424 void TextSearch::MakeForwardTab()
425 {
426     // create the jumptable for the search text
427     if( pJumpTable )
428     {
429         if( bIsForwardTab )
430             return ;                                        // the jumpTable is ok
431         delete pJumpTable;
432     }
433     bIsForwardTab = true;
434 
435     sal_Int32 n, nLen = sSrchStr.getLength();
436     pJumpTable = new TextSearchJumpTable;
437 
438     for( n = 0; n < nLen - 1; ++n )
439     {
440         sal_Unicode cCh = sSrchStr[n];
441         sal_Int32 nDiff = nLen - n - 1;
442 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
443 
444         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
445             pJumpTable->insert( aEntry );
446         if ( !aPair.second )
447             (*(aPair.first)).second = nDiff;
448     }
449 }
450 
451 void TextSearch::MakeForwardTab2()
452 {
453     // create the jumptable for the search text
454     if( pJumpTable2 )
455     {
456         if( bIsForwardTab )
457             return ;                                        // the jumpTable is ok
458         delete pJumpTable2;
459     }
460     bIsForwardTab = true;
461 
462     sal_Int32 n, nLen = sSrchStr2.getLength();
463     pJumpTable2 = new TextSearchJumpTable;
464 
465     for( n = 0; n < nLen - 1; ++n )
466     {
467         sal_Unicode cCh = sSrchStr2[n];
468         sal_Int32 nDiff = nLen - n - 1;
469 
470 	TextSearchJumpTable::value_type aEntry( cCh, nDiff );
471         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
472             pJumpTable2->insert( aEntry );
473         if ( !aPair.second )
474             (*(aPair.first)).second = nDiff;
475     }
476 }
477 
478 void TextSearch::MakeBackwardTab()
479 {
480     // create the jumptable for the search text
481     if( pJumpTable )
482     {
483         if( !bIsForwardTab )
484             return ;                                        // the jumpTable is ok
485         delete pJumpTable;
486     }
487     bIsForwardTab = false;
488 
489     sal_Int32 n, nLen = sSrchStr.getLength();
490     pJumpTable = new TextSearchJumpTable;
491 
492     for( n = nLen-1; n > 0; --n )
493     {
494         sal_Unicode cCh = sSrchStr[n];
495         TextSearchJumpTable::value_type aEntry( cCh, n );
496         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
497             pJumpTable->insert( aEntry );
498         if ( !aPair.second )
499             (*(aPair.first)).second = n;
500     }
501 }
502 
503 void TextSearch::MakeBackwardTab2()
504 {
505     // create the jumptable for the search text
506     if( pJumpTable2 )
507     {
508         if( !bIsForwardTab )
509             return ;                                        // the jumpTable is ok
510         delete pJumpTable2;
511     }
512     bIsForwardTab = false;
513 
514     sal_Int32 n, nLen = sSrchStr2.getLength();
515     pJumpTable2 = new TextSearchJumpTable;
516 
517     for( n = nLen-1; n > 0; --n )
518     {
519         sal_Unicode cCh = sSrchStr2[n];
520         TextSearchJumpTable::value_type aEntry( cCh, n );
521         ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
522             pJumpTable2->insert( aEntry );
523         if ( !aPair.second )
524             (*(aPair.first)).second = n;
525     }
526 }
527 
528 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
529 {
530     TextSearchJumpTable *pJump;
531     OUString sSearchKey;
532 
533     if ( bUsePrimarySrchStr ) {
534       pJump = pJumpTable;
535       sSearchKey = sSrchStr;
536     } else {
537       pJump = pJumpTable2;
538       sSearchKey = sSrchStr2;
539     }
540 
541     TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
542     if ( iLook == pJump->end() )
543         return sSearchKey.getLength();
544     return (*iLook).second;
545 }
546 
547 
548 // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
549 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
550         throw(RuntimeException)
551 {
552     SearchResult aRet;
553     aRet.subRegExpressions = 0;
554 
555     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
556 
557     OUString aStr( searchStr );
558     sal_Int32 nSuchIdx = aStr.getLength();
559     sal_Int32 nEnde = endPos;
560     if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
561         return aRet;
562 
563 
564     if( nEnde < sSearchKey.getLength() )  // position inside the search region ?
565         return aRet;
566 
567     nEnde -= sSearchKey.getLength();
568 
569     if (bUsePrimarySrchStr)
570       MakeForwardTab();                   // create the jumptable
571     else
572       MakeForwardTab2();
573 
574     for (sal_Int32 nCmpIdx = startPos; // start position for the search
575             nCmpIdx <= nEnde;
576             nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
577     {
578         // if the match would be the completed cells, skip it.
579         if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
580                     && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
581             continue;
582 
583         nSuchIdx = sSearchKey.getLength() - 1;
584         while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
585         {
586             if( nSuchIdx == 0 )
587             {
588                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
589                 {
590                     sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
591                     bool bAtStart = !nCmpIdx;
592                     bool bAtEnd = nFndEnd == endPos;
593                     bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
594                     bool bDelimBehind = IsDelimiter(  aStr, nFndEnd );
595                     //  *       1 -> only one word in the paragraph
596                     //  *       2 -> at begin of paragraph
597                     //  *       3 -> at end of paragraph
598                     //  *       4 -> inside the paragraph
599                     if( !(  ( bAtStart && bAtEnd ) ||           // 1
600                                 ( bAtStart && bDelimBehind ) ||     // 2
601                                 ( bAtEnd && bDelimBefore ) ||       // 3
602                                 ( bDelimBefore && bDelimBehind )))  // 4
603                         break;
604                 }
605 
606                 aRet.subRegExpressions = 1;
607                 aRet.startOffset.realloc( 1 );
608                 aRet.startOffset[ 0 ] = nCmpIdx;
609                 aRet.endOffset.realloc( 1 );
610                 aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
611 
612                 return aRet;
613             }
614             else
615                 nSuchIdx--;
616         }
617     }
618     return aRet;
619 }
620 
621 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
622         throw(RuntimeException)
623 {
624     SearchResult aRet;
625     aRet.subRegExpressions = 0;
626 
627     OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
628 
629     OUString aStr( searchStr );
630     sal_Int32 nSuchIdx = aStr.getLength();
631     sal_Int32 nEnde = endPos;
632     if( nSuchIdx == 0 || sSearchKey.getLength() == 0 || sSearchKey.getLength() > nSuchIdx)
633         return aRet;
634 
635     if (bUsePrimarySrchStr)
636       MakeBackwardTab();                      // create the jumptable
637     else
638       MakeBackwardTab2();
639 
640     if( nEnde == nSuchIdx )                 // end position for the search
641         nEnde = sSearchKey.getLength();
642     else
643         nEnde += sSearchKey.getLength();
644 
645     sal_Int32 nCmpIdx = startPos;          // start position for the search
646 
647     while (nCmpIdx >= nEnde)
648     {
649         // if the match would be the completed cells, skip it.
650         if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
651                         sSearchKey.getLength() )) && (!checkCTLEnd ||
652                     isCellStart( aStr, nCmpIdx)))
653         {
654             nSuchIdx = 0;
655             while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
656                     aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
657                 nSuchIdx++;
658             if( nSuchIdx >= sSearchKey.getLength() )
659             {
660                 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
661                 {
662                     sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
663                     bool bAtStart = !nFndStt;
664                     bool bAtEnd = nCmpIdx == startPos;
665                     bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
666                     bool bDelimBefore = bAtStart || // begin of paragraph
667                         IsDelimiter( aStr, nFndStt-1 );
668                     //  *       1 -> only one word in the paragraph
669                     //  *       2 -> at begin of paragraph
670                     //  *       3 -> at end of paragraph
671                     //  *       4 -> inside the paragraph
672                     if( ( bAtStart && bAtEnd ) ||           // 1
673                             ( bAtStart && bDelimBehind ) ||     // 2
674                             ( bAtEnd && bDelimBefore ) ||       // 3
675                             ( bDelimBefore && bDelimBehind ))   // 4
676                     {
677                         aRet.subRegExpressions = 1;
678                         aRet.startOffset.realloc( 1 );
679                         aRet.startOffset[ 0 ] = nCmpIdx;
680                         aRet.endOffset.realloc( 1 );
681                         aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
682                         return aRet;
683                     }
684                 }
685                 else
686                 {
687                     aRet.subRegExpressions = 1;
688                     aRet.startOffset.realloc( 1 );
689                     aRet.startOffset[ 0 ] = nCmpIdx;
690                     aRet.endOffset.realloc( 1 );
691                     aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
692                     return aRet;
693                 }
694             }
695         }
696         nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
697         if( nCmpIdx < nSuchIdx )
698             return aRet;
699         nCmpIdx -= nSuchIdx;
700     }
701     return aRet;
702 }
703 
704 void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
705 {
706 	// select the transliterated pattern string
707 	const OUString& rPatternStr =
708 		(rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
709 		: ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
710 
711 	sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
712 	// map com::sun::star::util::SearchFlags to ICU uregex.h flags
713 	// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
714 	// REG_NEWLINE is neither properly defined nor used anywhere => not implemented
715 	// REG_NOSUB is not used anywhere => not implemented
716 	// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
717 	// LEV_RELAXED is only used for SearchAlgorithm==Approximate
718 	// Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
719 	// probably because the transliteration flag IGNORE_CASE handles it as well.
720 	if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0
721 	||  (rOptions.transliterateFlags & TransliterationModules_IGNORE_CASE) != 0)
722 		nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
723 	UErrorCode nIcuErr = U_ZERO_ERROR;
724 	// assumption: transliteration didn't mangle regexp control chars
725 	IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
726 #ifndef DISABLE_WORDBOUND_EMULATION
727 	// for conveniance specific syntax elements of the old regex engine are emulated
728 	// - by replacing \< with "word-break followed by a look-ahead word-char"
729 	static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
730 	static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
731 	static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
732 	aChevronMatcherB.reset( aIcuSearchPatStr);
733 	aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
734 	aChevronMatcherB.reset();
735 	// - by replacing \> with "look-behind word-char followed by a word-break"
736 	static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
737 	static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
738 	static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
739 	aChevronMatcherE.reset( aIcuSearchPatStr);
740 	aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
741 	aChevronMatcherE.reset();
742 #endif
743 	pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
744 	if( nIcuErr)
745 		{ delete pRegexMatcher; pRegexMatcher = NULL;}
746 }
747 
748 //---------------------------------------------------------------------------
749 
750 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
751                                       sal_Int32 startPos, sal_Int32 endPos )
752             throw(RuntimeException)
753 {
754 	SearchResult aRet;
755 	aRet.subRegExpressions = 0;
756 	if( !pRegexMatcher)
757 		return aRet;
758 
759 	if( endPos > searchStr.getLength())
760 		endPos = searchStr.getLength();
761 
762 	// use the ICU RegexMatcher to find the matches
763 	UErrorCode nIcuErr = U_ZERO_ERROR;
764 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
765 	pRegexMatcher->reset( aSearchTargetStr);
766 	// search until there is a valid match
767 	for(;;)
768 	{
769 		if( !pRegexMatcher->find( startPos, nIcuErr))
770 			return aRet;
771 
772 		// #i118887# ignore zero-length matches e.g. "a*" in "bc"
773 		int nStartOfs = pRegexMatcher->start( nIcuErr);
774 		int nEndOfs = pRegexMatcher->end( nIcuErr);
775 		if( nStartOfs < nEndOfs)
776 			break;
777 		// try at next position if there was a zero-length match
778 		if( ++startPos >= endPos)
779 			return aRet;
780 	}
781 
782 	// extract the result of the search
783 	const int nGroupCount = pRegexMatcher->groupCount();
784 	aRet.subRegExpressions = nGroupCount + 1;
785 	aRet.startOffset.realloc( aRet.subRegExpressions);
786 	aRet.endOffset.realloc( aRet.subRegExpressions);
787 	aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
788 	aRet.endOffset[0]   = pRegexMatcher->end( nIcuErr);
789 	for( int i = 1; i <= nGroupCount; ++i) {
790 		aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
791 		aRet.endOffset[i]   = pRegexMatcher->end( i, nIcuErr);
792 	}
793 
794 	return aRet;
795 }
796 
797 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
798                                       sal_Int32 startPos, sal_Int32 endPos )
799             throw(RuntimeException)
800 {
801 	// NOTE: for backwards search callers provide startPos/endPos inverted!
802 	SearchResult aRet;
803 	aRet.subRegExpressions = 0;
804 	if( !pRegexMatcher)
805 		return aRet;
806 
807 	if( startPos > searchStr.getLength())
808 		startPos = searchStr.getLength();
809 
810 	// use the ICU RegexMatcher to find the matches
811 	// TODO: use ICU's backward searching once it becomes available
812 	//       as its replacement using forward search is not as good as the real thing
813 	UErrorCode nIcuErr = U_ZERO_ERROR;
814 	const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
815 	pRegexMatcher->reset( aSearchTargetStr);
816 	if( !pRegexMatcher->find( endPos, nIcuErr))
817 		return aRet;
818 
819 	// find the last match
820 	int nLastPos = 0;
821 	int nFoundEnd = 0;
822 	do {
823 		nLastPos = pRegexMatcher->start( nIcuErr);
824 		nFoundEnd = pRegexMatcher->end( nIcuErr);
825 		if( nFoundEnd >= startPos)
826 			break;
827 		if( nFoundEnd == nLastPos)
828 			++nFoundEnd;
829 	} while( pRegexMatcher->find( nFoundEnd, nIcuErr));
830 
831 	// find last match again to get its details
832 	pRegexMatcher->find( nLastPos, nIcuErr);
833 
834 	// fill in the details of the last match
835 	const int nGroupCount = pRegexMatcher->groupCount();
836 	aRet.subRegExpressions = nGroupCount + 1;
837 	aRet.startOffset.realloc( aRet.subRegExpressions);
838 	aRet.endOffset.realloc( aRet.subRegExpressions);
839 	// NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
840 	aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
841 	aRet.endOffset[0]   = pRegexMatcher->start( nIcuErr);
842 	for( int i = 1; i <= nGroupCount; ++i) {
843 		aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
844 		aRet.endOffset[i]   = pRegexMatcher->start( i, nIcuErr);
845 	}
846 
847 	return aRet;
848 }
849 
850 //---------------------------------------------------------------------------
851 
852 // search for words phonetically
853 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
854                                           sal_Int32 startPos, sal_Int32 endPos )
855             throw(RuntimeException)
856 {
857     SearchResult aRet;
858     aRet.subRegExpressions = 0;
859 
860     if( !xBreak.is() )
861         return aRet;
862 
863     OUString aWTemp( searchStr );
864 
865     register sal_Int32 nStt, nEnd;
866 
867     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
868             aSrchPara.Locale,
869             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
870 
871     do
872     {
873         if( aWBnd.startPos >= endPos )
874             break;
875         nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
876         nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
877 
878         if( nStt < nEnd &&
879                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
880         {
881             aRet.subRegExpressions = 1;
882             aRet.startOffset.realloc( 1 );
883             aRet.startOffset[ 0 ] = nStt;
884             aRet.endOffset.realloc( 1 );
885             aRet.endOffset[ 0 ] = nEnd;
886             break;
887         }
888 
889         nStt = nEnd - 1;
890         aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
891                 WordType::ANYWORD_IGNOREWHITESPACES);
892     } while( aWBnd.startPos != aWBnd.endPos ||
893             (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
894     // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
895     // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
896     // and nextWord() does also => don't loop forever.
897     return aRet;
898 }
899 
900 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
901                                           sal_Int32 startPos, sal_Int32 endPos )
902             throw(RuntimeException)
903 {
904     SearchResult aRet;
905     aRet.subRegExpressions = 0;
906 
907     if( !xBreak.is() )
908         return aRet;
909 
910     OUString aWTemp( searchStr );
911 
912     register sal_Int32 nStt, nEnd;
913 
914     Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
915             aSrchPara.Locale,
916             WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
917 
918     do
919     {
920         if( aWBnd.endPos <= endPos )
921             break;
922         nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
923         nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
924 
925         if( nStt < nEnd &&
926                 pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
927         {
928             aRet.subRegExpressions = 1;
929             aRet.startOffset.realloc( 1 );
930             aRet.startOffset[ 0 ] = nEnd;
931             aRet.endOffset.realloc( 1 );
932             aRet.endOffset[ 0 ] = nStt;
933             break;
934         }
935         if( !nStt )
936             break;
937 
938         aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
939                 WordType::ANYWORD_IGNOREWHITESPACES);
940     } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
941     return aRet;
942 }
943 
944 
945 static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
946 static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
947 
948 static OUString getServiceName_Static()
949 {
950     return OUString::createFromAscii( cSearchName );
951 }
952 
953 static OUString getImplementationName_Static()
954 {
955     return OUString::createFromAscii( cSearchImpl );
956 }
957 
958 OUString SAL_CALL
959 TextSearch::getImplementationName()
960                 throw( RuntimeException )
961 {
962     return getImplementationName_Static();
963 }
964 
965 sal_Bool SAL_CALL
966 TextSearch::supportsService(const OUString& rServiceName)
967                 throw( RuntimeException )
968 {
969     return !rServiceName.compareToAscii( cSearchName );
970 }
971 
972 Sequence< OUString > SAL_CALL
973 TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
974 {
975     Sequence< OUString > aRet(1);
976     aRet[0] = getServiceName_Static();
977     return aRet;
978 }
979 
980 ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
981 SAL_CALL TextSearch_CreateInstance(
982         const ::com::sun::star::uno::Reference<
983         ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
984 {
985     return ::com::sun::star::uno::Reference<
986         ::com::sun::star::uno::XInterface >(
987                 (::cppu::OWeakObject*) new TextSearch( rxMSF ) );
988 }
989 
990 extern "C"
991 {
992 
993 void SAL_CALL component_getImplementationEnvironment(
994         const sal_Char** ppEnvTypeName, uno_Environment** /*ppEnv*/ )
995 {
996     *ppEnvTypeName = CPPU_CURRENT_LANGUAGE_BINDING_NAME;
997 }
998 
999 void* SAL_CALL component_getFactory( const sal_Char* sImplementationName,
1000         void* _pServiceManager, void* /*_pRegistryKey*/ )
1001 {
1002     void* pRet = NULL;
1003 
1004     ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
1005         reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
1006             ( _pServiceManager );
1007     ::com::sun::star::uno::Reference<
1008             ::com::sun::star::lang::XSingleServiceFactory > xFactory;
1009 
1010     if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
1011     {
1012         ::com::sun::star::uno::Sequence< ::rtl::OUString > aServiceNames(1);
1013         aServiceNames[0] = getServiceName_Static();
1014         xFactory = ::cppu::createSingleFactory(
1015                 pServiceManager, getImplementationName_Static(),
1016                 &TextSearch_CreateInstance, aServiceNames );
1017     }
1018 
1019     if ( xFactory.is() )
1020     {
1021         xFactory->acquire();
1022         pRet = xFactory.get();
1023     }
1024 
1025     return pRet;
1026 }
1027 
1028 } // extern "C"
1029