1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 
31 #include <breakiteratorImpl.hxx>
32 #include <unicode/uchar.h>
33 #include <rtl/ustrbuf.hxx>
34 
35 using namespace ::com::sun::star::uno;
36 using namespace ::com::sun::star::lang;
37 using namespace ::rtl;
38 
39 namespace com { namespace sun { namespace star { namespace i18n {
40 
41 BreakIteratorImpl::BreakIteratorImpl( const Reference < XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF )
42 {
43 }
44 
/** Creates a break iterator without passing a service factory; xMSF is
    default-constructed. getLocaleSpecificBreakIterator() only creates
    services when xMSF.is() holds.
 */
BreakIteratorImpl::BreakIteratorImpl()
{
}
48 
49 BreakIteratorImpl::~BreakIteratorImpl()
50 {
51         // Clear lookuptable
52         for (size_t l = 0; l < lookupTable.size(); l++)
53             delete lookupTable[l];
54         lookupTable.clear();
55 }
56 
// Shorthand for the locale specific break iterator. The expansion refers to a
// variable named rLocale, so it can only be used where such a name is in scope.
#define LBI getLocaleSpecificBreakIterator(rLocale)
58 
59 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
60         const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
61         throw(RuntimeException)
62 {
63         if (nCount < 0) throw RuntimeException();
64 
65         return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
66 }
67 
68 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
69         const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
70         throw(RuntimeException)
71 {
72         if (nCount < 0) throw RuntimeException();
73 
74         return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
75 }
76 
// True if the code point c is U+200B (ZERO WIDTH SPACE). The argument itself
// is tested; the macro does not depend on any variable of the calling scope.
#define isZWSP(c) ((c) == 0x200B)
78 
79 static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection)
80 {
81 		sal_uInt32 ch=0;
82 		sal_Int32 pos=nPos;
83         switch (rWordType) {
84             case WordType::ANYWORD_IGNOREWHITESPACES:
85                 if (bDirection)
86                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
87                 else
88                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
89             break;
90             case WordType::DICTIONARY_WORD:
91                 if (bDirection)
92                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
93                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
94                 else
95                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
96                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
97             break;
98             case WordType::WORD_COUNT:
99                 if (bDirection)
100                     while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
101                 else
102                     while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
103             break;
104         }
105         return nPos;
106 }
107 
108 Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
109         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
110 {
111         sal_Int32 len = Text.getLength();
112         if( nStartPos < 0 || len == 0 )
113             result.endPos = result.startPos = 0;
114         else if (nStartPos >= len)
115             result.endPos = result.startPos = len;
116         else {
117             result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
118 
119             nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True);
120 
121             if ( nStartPos != result.startPos) {
122                 if( nStartPos >= len )
123                     result.startPos = result.endPos = len;
124                 else {
125                     result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
126                     // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
127                     if (result.startPos < nStartPos) result.startPos = nStartPos;
128                 }
129             }
130         }
131         return result;
132 }
133 
134 static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) {
135         return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko");
136 }
137 
138 Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
139         const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException)
140 {
141         sal_Int32 len = Text.getLength();
142         if( nStartPos <= 0 || len == 0 ) {
143             result.endPos = result.startPos = 0;
144             return result;
145         } else if (nStartPos > len) {
146             result.endPos = result.startPos = len;
147             return result;
148         }
149 
150         sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False);
151 
152         // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return
153         // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
154         result.startPos = nPos;
155         if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
156             result.endPos = -1;
157             return result;
158         }
159 
160         return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
161 }
162 
163 
164 Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
165         sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException)
166 {
167         sal_Int32 len = Text.getLength();
168         if( nPos < 0 || len == 0 )
169             result.endPos = result.startPos = 0;
170         else if (nPos > len)
171             result.endPos = result.startPos = len;
172         else {
173             sal_Int32 next, prev;
174             next = skipSpace(Text, nPos, len, rWordType, sal_True);
175             prev = skipSpace(Text, nPos, len, rWordType, sal_False);
176             if (prev == 0 && next == len) {
177                 result.endPos = result.startPos = nPos;
178             } else if (prev == 0 && ! bDirection) {
179                 result.endPos = result.startPos = 0;
180             } else if (next == len && bDirection) {
181                 result.endPos = result.startPos = len;
182             } else {
183                 if (next != prev) {
184                     if (next == nPos && next != len)
185                         bDirection = sal_True;
186                     else if (prev == nPos && prev != 0)
187                         bDirection = sal_False;
188                     else
189                         nPos = bDirection ? next : prev;
190                 }
191                 result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
192             }
193         }
194         return result;
195 }
196 
197 sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
198         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
199 {
200         sal_Int32 len = Text.getLength();
201 
202         if (nPos < 0 || nPos >= len) return sal_False;
203 
204         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True);
205 
206         if (tmp != nPos) return sal_False;
207 
208         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
209 
210         return result.startPos == nPos;
211 }
212 
213 sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
214         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
215 {
216         sal_Int32 len = Text.getLength();
217 
218         if (nPos <= 0 || nPos > len) return sal_False;
219 
220         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False);
221 
222         if (tmp != nPos) return sal_False;
223 
224         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
225 
226         return result.endPos == nPos;
227 }
228 
229 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
230         const Locale &rLocale ) throw(RuntimeException)
231 {
232         if (nStartPos < 0 || nStartPos > Text.getLength())
233             return -1;
234         if (Text.getLength() == 0) return 0;
235         return LBI->beginOfSentence(Text, nStartPos, rLocale);
236 }
237 
238 sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
239         const Locale &rLocale ) throw(RuntimeException)
240 {
241         if (nStartPos < 0 || nStartPos > Text.getLength())
242             return -1;
243         if (Text.getLength() == 0) return 0;
244         return LBI->endOfSentence(Text, nStartPos, rLocale);
245 }
246 
247 LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
248         const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
249         const LineBreakUserOptions& bOptions ) throw(RuntimeException)
250 {
251         return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
252 }
253 
254 sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
255         throw(RuntimeException)
256 {
257         return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
258                             getScriptClass(Text.iterateCodePoints(&nPos, 0));
259 }
260 
261 
262 /** Increments/decrements position first, then obtains character.
263     @return current position, may be -1 or text length if string was consumed.
264  */
265 static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
266         sal_Int32 nLen = Text.getLength();
267 		if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
268 			ch = 0;
269 			nStartPos = nStartPos + inc < 0 ? -1 : nLen;
270 		} else {
271 			ch = Text.iterateCodePoints(&nStartPos, inc);
272             // Fix for #i80436#.
273             // erAck: 2009-06-30T21:52+0200  This logic looks somewhat
274             // suspicious as if it cures a symptom.. anyway, had to add
275             // nStartPos < Text.getLength() to silence the (correct) assertion
276             // in rtl_uString_iterateCodePoints() if Text was one character
277             // (codepoint) only, made up of a surrogate pair.
278             //if (inc > 0 && nStartPos < Text.getLength())
279             //    ch = Text.iterateCodePoints(&nStartPos, 0);
280             // With surrogates, nStartPos may actually point behind string
281             // now, even if inc is only +1
282 			if (inc > 0)
283                 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
284 		}
285 		return nStartPos;
286 }
287 
288 
289 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
290         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
291 {
292         if (nStartPos < 0 || nStartPos >= Text.getLength())
293             return -1;
294 
295         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
296             return -1;
297 
298 		if (nStartPos == 0) return 0;
299 		sal_uInt32 ch=0;
300         while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
301 			if (nStartPos == 0) return 0;
302 		}
303 
304         return  iterateCodePoints(Text, nStartPos, 1, ch);
305 }
306 
307 sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
308         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
309 {
310         if (nStartPos < 0 || nStartPos >= Text.getLength())
311             return -1;
312 
313         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
314             return -1;
315 
316         sal_Int32 strLen = Text.getLength();
317 		sal_uInt32 ch=0;
318         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
319             sal_Int16 currentCharScriptType = getScriptClass(ch);
320             if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
321                 break;
322         }
323         return  nStartPos;
324 }
325 
326 sal_Int32  SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
327         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
328 {
329         if (nStartPos < 0)
330             return -1;
331         if (nStartPos > Text.getLength())
332             nStartPos = Text.getLength();
333 
334         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
335 
336 		sal_uInt32 ch=0;
337         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
338 			if ((((numberOfChange % 2) == 0) ^ (ScriptType != getScriptClass(ch))))
339 				numberOfChange--;
340 			else if (nStartPos == 0) {
341 				if (numberOfChange > 0)
342 					numberOfChange--;
343 				if (nStartPos > 0)
344 					Text.iterateCodePoints(&nStartPos, -1);
345 				else
346 					return -1;
347 			}
348         }
349         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
350 }
351 
352 sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
353         sal_Int16 ScriptType ) throw(RuntimeException)
354 
355 {
356         if (nStartPos < 0)
357             nStartPos = 0;
358         sal_Int32 strLen = Text.getLength();
359         if (nStartPos > strLen)
360             return -1;
361 
362         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
363 
364 		sal_uInt32 ch=0;
365         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
366 			sal_Int16 currentCharScriptType = getScriptClass(ch);
367 			if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
368 					(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
369 				numberOfChange--;
370         }
371         return numberOfChange == 0 ? nStartPos : -1;
372 }
373 
374 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
375         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
376 {
377         if (CharType == CharType::ANY_CHAR) return 0;
378         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
379         if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
380 
381         sal_Int32 nPos=nStartPos;
382         while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
383         return nStartPos; // begin of char block is inclusive
384 }
385 
386 sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
387         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
388 {
389         sal_Int32 strLen = Text.getLength();
390 
391         if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
392         if (nStartPos < 0 || nStartPos >= strLen) return -1;
393         if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
394 
395 		sal_uInt32 ch=0;
396         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
397         return nStartPos; // end of char block is exclusive
398 }
399 
400 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
401         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
402 {
403         if (CharType == CharType::ANY_CHAR) return -1;
404         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
405 
406         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
407         sal_Int32 strLen = Text.getLength();
408 
409 	sal_uInt32 ch=0;
410 	while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
411 		if ((CharType != (sal_Int16)u_charType(ch)) ^ (numberOfChange == 1))
412 			numberOfChange--;
413     }
414     return numberOfChange == 0 ? nStartPos : -1;
415 }
416 
417 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
418         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
419 {
420         if(CharType == CharType::ANY_CHAR) return -1;
421         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
422 
423         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
424 
425 		sal_uInt32 ch=0;
426         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
427 			if (((numberOfChange % 2) == 0) ^ (CharType != (sal_Int16)u_charType(ch)))
428 				numberOfChange--;
429 			if (nStartPos == 0 && numberOfChange > 0) {
430 				numberOfChange--;
431 				if (numberOfChange == 0) return nStartPos;
432 			}
433         }
434         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
435 }
436 
437 
438 
/** Placeholder: none of the arguments is inspected and 0 is returned for
    every input.
 */
sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
        sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException)
{
        return 0;
}
444 
445 typedef struct {
446     UBlockCode from;
447     UBlockCode to;
448     sal_Int16 script;
449 } UBlock2Script;
450 
// for a list of the UBLOCK_... values see:
// http://icu-project.org/apiref/icu4c/uchar_8h.html
// where enum UBlockCode is defined.
// See also http://www.unicode.org/charts/ for general reference
//
// getScriptClass() scans this table from the front and stops at the first
// entry whose 'to' is not smaller than the block code looked up, so the
// entries have to stay sorted by ascending block code. A block code that
// falls into a gap between two entries, or behind the last one, is
// classified as WEAK by getScriptClass().
static UBlock2Script scriptList[] = {
    {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
    {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
    {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
    {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
    {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
    {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
    {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
    {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
    {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
    {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
    {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
    {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
    {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
    {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
    {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
    {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
    {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
    {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
};
475 
// Number of entries in scriptList. The expansion is parenthesized so that it
// stays a single operand when used inside a larger expression.
#define scriptListCount (sizeof (scriptList) / sizeof (UBlock2Script))
477 
478 sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
479 {
480         static sal_uInt32 lastChar = 0;
481         static sal_Int16 nRet = 0;
482 
483         if (currentChar != lastChar) {
484             lastChar = currentChar;
485 
486             //JP 21.9.2001: handle specific characters - always as weak
487             //                  definition of 1 - this breaks a word
488             //                  2 - this can be inside a word
489             //                  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
490             if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
491                 nRet = ScriptType::WEAK;
492             // workaround for Coptic
493             else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
494                 nRet = ScriptType::LATIN;
495             // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf)
496             else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) ||
497                      (0xFB13 <= currentChar && currentChar <= 0xFB17))
498                 nRet = ScriptType::LATIN;
499             else {
500                 UBlockCode block=ublock_getCode(currentChar);
501                 sal_uInt16 i;
502                 for ( i = 0; i < scriptListCount; i++) {
503                     if (block <= scriptList[i].to) break;
504                 }
505                 nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK;
506             }
507         }
508         return nRet;
509 }
510 
511 static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {
512         return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
513 }
514 
515 sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
516 {
517         // to share service between same Language but different Country code, like zh_CN and zh_TW
518         for (size_t l = 0; l < lookupTable.size(); l++) {
519             lookupTableItem *listItem = lookupTable[l];
520             if (aLocaleName == listItem->aLocale.Language) {
521                 xBI = listItem->xBI;
522                 return sal_True;
523             }
524         }
525 
526         Reference < uno::XInterface > xI = xMSF->createInstance(
527             OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName);
528 
529         if ( xI.is() ) {
530             xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI;
531             if (xBI.is()) {
532                 lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
533                 return sal_True;
534             }
535         }
536         return sal_False;
537 }
538 
539 Reference < XBreakIterator > SAL_CALL
540 BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
541 {
542         if (xBI.is() && rLocale == aLocale)
543             return xBI;
544         else if (xMSF.is()) {
545             aLocale = rLocale;
546 
547             for (size_t i = 0; i < lookupTable.size(); i++) {
548                 lookupTableItem *listItem = lookupTable[i];
549                 if (rLocale == listItem->aLocale)
550                     return xBI = listItem->xBI;
551             }
552 
553             sal_Unicode under = (sal_Unicode)'_';
554 
555             sal_Int32 l = rLocale.Language.getLength();
556             sal_Int32 c = rLocale.Country.getLength();
557             sal_Int32 v = rLocale.Variant.getLength();
558             OUStringBuffer aBuf(l+c+v+3);
559 
560             if ((l > 0 && c > 0 && v > 0 &&
561                     // load service with name <base>_<lang>_<country>_<varian>
562                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
563                                     rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
564                 (l > 0 && c > 0 &&
565                     // load service with name <base>_<lang>_<country>
566                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
567                                     rLocale.Country).makeStringAndClear())) ||
568                 (l > 0 && c > 0 && rLocale.Language.compareToAscii("zh") == 0 &&
569                                     (rLocale.Country.compareToAscii("HK") == 0 ||
570                                     rLocale.Country.compareToAscii("MO") == 0) &&
571                     // if the country code is HK or MO, one more step to try TW.
572                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
573                                     "TW").makeStringAndClear())) ||
574                 (l > 0 &&
575                     // load service with name <base>_<lang>
576                     createLocaleSpecificBreakIterator(rLocale.Language)) ||
577                     // load default service with name <base>_Unicode
578                     createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) {
579                 lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
580                 return xBI;
581             }
582         }
583         throw RuntimeException();
584 }
585 
// Name reported by getImplementationName() and getSupportedServiceNames(),
// and the only name accepted by supportsService().
const sal_Char cBreakIterator[] = "com.sun.star.i18n.BreakIterator";
587 
588 OUString SAL_CALL
589 BreakIteratorImpl::getImplementationName(void) throw( RuntimeException )
590 {
591         return OUString::createFromAscii(cBreakIterator);
592 }
593 
594 sal_Bool SAL_CALL
595 BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException )
596 {
597         return !rServiceName.compareToAscii(cBreakIterator);
598 }
599 
600 Sequence< OUString > SAL_CALL
601 BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException )
602 {
603         Sequence< OUString > aRet(1);
604         aRet[0] = OUString::createFromAscii(cBreakIterator);
605         return aRet;
606 }
607 
608 } } } }
609 
610