1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include <breakiteratorImpl.hxx>
28 #include <unicode/uchar.h>
29 #include <rtl/ustrbuf.hxx>
30 
31 using namespace ::com::sun::star::uno;
32 using namespace ::com::sun::star::lang;
33 using namespace ::rtl;
34 
35 namespace com { namespace sun { namespace star { namespace i18n {
36 
BreakIteratorImpl(const Reference<XMultiServiceFactory> & rxMSF)37 BreakIteratorImpl::BreakIteratorImpl( const Reference < XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF )
38 {
39 }
40 
BreakIteratorImpl()41 BreakIteratorImpl::BreakIteratorImpl()
42 {
43 }
44 
~BreakIteratorImpl()45 BreakIteratorImpl::~BreakIteratorImpl()
46 {
47         // Clear lookuptable
48         for (size_t l = 0; l < lookupTable.size(); l++)
49             delete lookupTable[l];
50         lookupTable.clear();
51 }
52 
53 #define LBI getLocaleSpecificBreakIterator(rLocale)
54 
nextCharacters(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)55 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
56         const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
57         throw(RuntimeException)
58 {
59         if (nCount < 0) throw RuntimeException();
60 
61         return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
62 }
63 
previousCharacters(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)64 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
65         const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
66         throw(RuntimeException)
67 {
68         if (nCount < 0) throw RuntimeException();
69 
70         return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
71 }
72 
73 #define isZWSP(c) (ch == 0x200B)
74 
skipSpace(const OUString & Text,sal_Int32 nPos,sal_Int32 len,sal_Int16 rWordType,sal_Bool bDirection)75 static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection)
76 {
77 		sal_uInt32 ch=0;
78 		sal_Int32 pos=nPos;
79         switch (rWordType) {
80             case WordType::ANYWORD_IGNOREWHITESPACES:
81                 if (bDirection)
82                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
83                 else
84                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
85             break;
86             case WordType::DICTIONARY_WORD:
87                 if (bDirection)
88                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
89                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
90                 else
91                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
92                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
93             break;
94             case WordType::WORD_COUNT:
95                 if (bDirection)
96                     while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
97                 else
98                     while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
99             break;
100         }
101         return nPos;
102 }
103 
nextWord(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 rWordType)104 Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
105         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
106 {
107         sal_Int32 len = Text.getLength();
108         if( nStartPos < 0 || len == 0 )
109             result.endPos = result.startPos = 0;
110         else if (nStartPos >= len)
111             result.endPos = result.startPos = len;
112         else {
113             result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
114 
115             nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True);
116 
117             if ( nStartPos != result.startPos) {
118                 if( nStartPos >= len )
119                     result.startPos = result.endPos = len;
120                 else {
121                     result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
122                     // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
123                     if (result.startPos < nStartPos) result.startPos = nStartPos;
124                 }
125             }
126         }
127         return result;
128 }
129 
isCJK(const Locale & rLocale)130 static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) {
131         return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko");
132 }
133 
previousWord(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int16 rWordType)134 Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
135         const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException)
136 {
137         sal_Int32 len = Text.getLength();
138         if( nStartPos <= 0 || len == 0 ) {
139             result.endPos = result.startPos = 0;
140             return result;
141         } else if (nStartPos > len) {
142             result.endPos = result.startPos = len;
143             return result;
144         }
145 
146         sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False);
147 
148         // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return
149         // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
150         result.startPos = nPos;
151         if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
152             result.endPos = -1;
153             return result;
154         }
155 
156         return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
157 }
158 
159 
getWordBoundary(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType,sal_Bool bDirection)160 Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
161         sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException)
162 {
163         sal_Int32 len = Text.getLength();
164         if( nPos < 0 || len == 0 )
165             result.endPos = result.startPos = 0;
166         else if (nPos > len)
167             result.endPos = result.startPos = len;
168         else {
169             sal_Int32 next, prev;
170             next = skipSpace(Text, nPos, len, rWordType, sal_True);
171             prev = skipSpace(Text, nPos, len, rWordType, sal_False);
172             if (prev == 0 && next == len) {
173                 result.endPos = result.startPos = nPos;
174             } else if (prev == 0 && ! bDirection) {
175                 result.endPos = result.startPos = 0;
176             } else if (next == len && bDirection) {
177                 result.endPos = result.startPos = len;
178             } else {
179                 if (next != prev) {
180                     if (next == nPos && next != len)
181                         bDirection = sal_True;
182                     else if (prev == nPos && prev != 0)
183                         bDirection = sal_False;
184                     else
185                         nPos = bDirection ? next : prev;
186                 }
187                 result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
188             }
189         }
190         return result;
191 }
192 
isBeginWord(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType)193 sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
194         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
195 {
196         sal_Int32 len = Text.getLength();
197 
198         if (nPos < 0 || nPos >= len) return sal_False;
199 
200         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True);
201 
202         if (tmp != nPos) return sal_False;
203 
204         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
205 
206         return result.startPos == nPos;
207 }
208 
isEndWord(const OUString & Text,sal_Int32 nPos,const Locale & rLocale,sal_Int16 rWordType)209 sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
210         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
211 {
212         sal_Int32 len = Text.getLength();
213 
214         if (nPos <= 0 || nPos > len) return sal_False;
215 
216         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False);
217 
218         if (tmp != nPos) return sal_False;
219 
220         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
221 
222         return result.endPos == nPos;
223 }
224 
beginOfSentence(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale)225 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
226         const Locale &rLocale ) throw(RuntimeException)
227 {
228         if (nStartPos < 0 || nStartPos > Text.getLength())
229             return -1;
230         if (Text.getLength() == 0) return 0;
231         return LBI->beginOfSentence(Text, nStartPos, rLocale);
232 }
233 
endOfSentence(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale)234 sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
235         const Locale &rLocale ) throw(RuntimeException)
236 {
237         if (nStartPos < 0 || nStartPos > Text.getLength())
238             return -1;
239         if (Text.getLength() == 0) return 0;
240         return LBI->endOfSentence(Text, nStartPos, rLocale);
241 }
242 
getLineBreak(const OUString & Text,sal_Int32 nStartPos,const Locale & rLocale,sal_Int32 nMinBreakPos,const LineBreakHyphenationOptions & hOptions,const LineBreakUserOptions & bOptions)243 LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
244         const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
245         const LineBreakUserOptions& bOptions ) throw(RuntimeException)
246 {
247         return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
248 }
249 
getScriptType(const OUString & Text,sal_Int32 nPos)250 sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
251         throw(RuntimeException)
252 {
253         return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
254                             getScriptClass(Text.iterateCodePoints(&nPos, 0));
255 }
256 
257 
258 /** Increments/decrements position first, then obtains character.
259     @return current position, may be -1 or text length if string was consumed.
260  */
iterateCodePoints(const OUString & Text,sal_Int32 & nStartPos,sal_Int32 inc,sal_uInt32 & ch)261 static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
262         sal_Int32 nLen = Text.getLength();
263 		if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
264 			ch = 0;
265 			nStartPos = nStartPos + inc < 0 ? -1 : nLen;
266 		} else {
267 			ch = Text.iterateCodePoints(&nStartPos, inc);
268             // Fix for #i80436#.
269             // erAck: 2009-06-30T21:52+0200  This logic looks somewhat
270             // suspicious as if it cures a symptom.. anyway, had to add
271             // nStartPos < Text.getLength() to silence the (correct) assertion
272             // in rtl_uString_iterateCodePoints() if Text was one character
273             // (codepoint) only, made up of a surrogate pair.
274             //if (inc > 0 && nStartPos < Text.getLength())
275             //    ch = Text.iterateCodePoints(&nStartPos, 0);
276             // With surrogates, nStartPos may actually point behind string
277             // now, even if inc is only +1
278 			if (inc > 0)
279                 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
280 		}
281 		return nStartPos;
282 }
283 
284 
beginOfScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)285 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
286         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
287 {
288         if (nStartPos < 0 || nStartPos >= Text.getLength())
289             return -1;
290 
291         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
292             return -1;
293 
294 		if (nStartPos == 0) return 0;
295 		sal_uInt32 ch=0;
296         while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
297 			if (nStartPos == 0) return 0;
298 		}
299 
300         return  iterateCodePoints(Text, nStartPos, 1, ch);
301 }
302 
endOfScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)303 sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
304         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
305 {
306         if (nStartPos < 0 || nStartPos >= Text.getLength())
307             return -1;
308 
309         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
310             return -1;
311 
312         sal_Int32 strLen = Text.getLength();
313 		sal_uInt32 ch=0;
314         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
315             sal_Int16 currentCharScriptType = getScriptClass(ch);
316             if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
317                 break;
318         }
319         return  nStartPos;
320 }
321 
previousScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)322 sal_Int32  SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
323         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
324 {
325         if (nStartPos < 0)
326             return -1;
327         if (nStartPos > Text.getLength())
328             nStartPos = Text.getLength();
329 
330         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
331 
332 		sal_uInt32 ch=0;
333         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
334 			if ((((numberOfChange % 2) == 0) ^ (ScriptType != getScriptClass(ch))))
335 				numberOfChange--;
336 			else if (nStartPos == 0) {
337 				if (numberOfChange > 0)
338 					numberOfChange--;
339 				if (nStartPos > 0)
340 					Text.iterateCodePoints(&nStartPos, -1);
341 				else
342 					return -1;
343 			}
344         }
345         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
346 }
347 
nextScript(const OUString & Text,sal_Int32 nStartPos,sal_Int16 ScriptType)348 sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
349         sal_Int16 ScriptType ) throw(RuntimeException)
350 
351 {
352         if (nStartPos < 0)
353             nStartPos = 0;
354         sal_Int32 strLen = Text.getLength();
355         if (nStartPos > strLen)
356             return -1;
357 
358         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
359 
360 		sal_uInt32 ch=0;
361         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
362 			sal_Int16 currentCharScriptType = getScriptClass(ch);
363 			if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
364 					(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
365 				numberOfChange--;
366         }
367         return numberOfChange == 0 ? nStartPos : -1;
368 }
369 
beginOfCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)370 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
371         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
372 {
373         if (CharType == CharType::ANY_CHAR) return 0;
374         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
375         if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
376 
377         sal_Int32 nPos=nStartPos;
378         while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
379         return nStartPos; // begin of char block is inclusive
380 }
381 
endOfCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)382 sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
383         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
384 {
385         sal_Int32 strLen = Text.getLength();
386 
387         if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
388         if (nStartPos < 0 || nStartPos >= strLen) return -1;
389         if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
390 
391 		sal_uInt32 ch=0;
392         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
393         return nStartPos; // end of char block is exclusive
394 }
395 
nextCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)396 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
397         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
398 {
399         if (CharType == CharType::ANY_CHAR) return -1;
400         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
401 
402         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
403         sal_Int32 strLen = Text.getLength();
404 
405 	sal_uInt32 ch=0;
406 	while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
407 		if ((CharType != (sal_Int16)u_charType(ch)) ^ (numberOfChange == 1))
408 			numberOfChange--;
409     }
410     return numberOfChange == 0 ? nStartPos : -1;
411 }
412 
previousCharBlock(const OUString & Text,sal_Int32 nStartPos,const Locale &,sal_Int16 CharType)413 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
414         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
415 {
416         if(CharType == CharType::ANY_CHAR) return -1;
417         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
418 
419         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
420 
421 		sal_uInt32 ch=0;
422         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
423 			if (((numberOfChange % 2) == 0) ^ (CharType != (sal_Int16)u_charType(ch)))
424 				numberOfChange--;
425 			if (nStartPos == 0 && numberOfChange > 0) {
426 				numberOfChange--;
427 				if (numberOfChange == 0) return nStartPos;
428 			}
429         }
430         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
431 }
432 
433 
434 
getWordType(const OUString &,sal_Int32,const Locale &)435 sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
436         sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException)
437 {
438         return 0;
439 }
440 
441 typedef struct {
442     UBlockCode from;
443     UBlockCode to;
444     sal_Int16 script;
445 } UBlock2Script;
446 
447 // for a list of the UBLOCK_... values see:
448 // http://icu-project.org/apiref/icu4c/uchar_8h.html
449 // where enum UBlockCode is defined.
450 // See also http://www.unicode.org/charts/ for general reference
451 static UBlock2Script scriptList[] = {
452     {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
453     {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
454     {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
455     {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
456     {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
457     {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
458     {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
459     {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
460     {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
461     {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
462     {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
463     {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
464     {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
465     {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
466     {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
467     {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
468     {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
469     {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
470 };
471 
472 #define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script)
473 
getScriptClass(sal_uInt32 currentChar)474 sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
475 {
476         static sal_uInt32 lastChar = 0;
477         static sal_Int16 nRet = 0;
478 
479         if (currentChar != lastChar) {
480             lastChar = currentChar;
481 
482             //JP 21.9.2001: handle specific characters - always as weak
483             //                  definition of 1 - this breaks a word
484             //                  2 - this can be inside a word
485             //                  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
486             if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
487                 nRet = ScriptType::WEAK;
488             // workaround for Coptic
489             else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
490                 nRet = ScriptType::LATIN;
491             // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf)
492             else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) ||
493                      (0xFB13 <= currentChar && currentChar <= 0xFB17))
494                 nRet = ScriptType::LATIN;
495             else {
496                 UBlockCode block=ublock_getCode(currentChar);
497                 sal_uInt16 i;
498                 for ( i = 0; i < scriptListCount; i++) {
499                     if (block <= scriptList[i].to) break;
500                 }
501                 nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK;
502             }
503         }
504         return nRet;
505 }
506 
operator ==(const Locale & l1,const Locale & l2)507 static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {
508         return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
509 }
510 
createLocaleSpecificBreakIterator(const OUString & aLocaleName)511 sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
512 {
513         // to share service between same Language but different Country code, like zh_CN and zh_TW
514         for (size_t l = 0; l < lookupTable.size(); l++) {
515             lookupTableItem *listItem = lookupTable[l];
516             if (aLocaleName == listItem->aLocale.Language) {
517                 xBI = listItem->xBI;
518                 return sal_True;
519             }
520         }
521 
522         Reference < uno::XInterface > xI = xMSF->createInstance(
523             OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName);
524 
525         if ( xI.is() ) {
526             xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI;
527             if (xBI.is()) {
528                 lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
529                 return sal_True;
530             }
531         }
532         return sal_False;
533 }
534 
535 Reference < XBreakIterator > SAL_CALL
getLocaleSpecificBreakIterator(const Locale & rLocale)536 BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
537 {
538         if (xBI.is() && rLocale == aLocale)
539             return xBI;
540         else if (xMSF.is()) {
541             aLocale = rLocale;
542 
543             for (size_t i = 0; i < lookupTable.size(); i++) {
544                 lookupTableItem *listItem = lookupTable[i];
545                 if (rLocale == listItem->aLocale)
546                     return xBI = listItem->xBI;
547             }
548 
549             sal_Unicode under = (sal_Unicode)'_';
550 
551             sal_Int32 l = rLocale.Language.getLength();
552             sal_Int32 c = rLocale.Country.getLength();
553             sal_Int32 v = rLocale.Variant.getLength();
554             OUStringBuffer aBuf(l+c+v+3);
555 
556             if ((l > 0 && c > 0 && v > 0 &&
557                     // load service with name <base>_<lang>_<country>_<varian>
558                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
559                                     rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
560                 (l > 0 && c > 0 &&
561                     // load service with name <base>_<lang>_<country>
562                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
563                                     rLocale.Country).makeStringAndClear())) ||
564                 (l > 0 && c > 0 && rLocale.Language.compareToAscii("zh") == 0 &&
565                                     (rLocale.Country.compareToAscii("HK") == 0 ||
566                                     rLocale.Country.compareToAscii("MO") == 0) &&
567                     // if the country code is HK or MO, one more step to try TW.
568                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
569                                     "TW").makeStringAndClear())) ||
570                 (l > 0 &&
571                     // load service with name <base>_<lang>
572                     createLocaleSpecificBreakIterator(rLocale.Language)) ||
573                     // load default service with name <base>_Unicode
574                     createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) {
575                 lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
576                 return xBI;
577             }
578         }
579         throw RuntimeException();
580 }
581 
582 const sal_Char cBreakIterator[] = "com.sun.star.i18n.BreakIterator";
583 
584 OUString SAL_CALL
getImplementationName(void)585 BreakIteratorImpl::getImplementationName(void) throw( RuntimeException )
586 {
587         return OUString::createFromAscii(cBreakIterator);
588 }
589 
590 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)591 BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException )
592 {
593         return !rServiceName.compareToAscii(cBreakIterator);
594 }
595 
596 Sequence< OUString > SAL_CALL
getSupportedServiceNames(void)597 BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException )
598 {
599         Sequence< OUString > aRet(1);
600         aRet[0] = OUString::createFromAscii(cBreakIterator);
601         return aRet;
602 }
603 
604 } } } }
605 
606