1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 #include <breakiterator_unicode.hxx>
31 #include <localedata.hxx>
32 #include <unicode/uchar.h>
33 #include <unicode/locid.h>
34 #include <unicode/rbbi.h>
35 #include <unicode/udata.h>
36 #include <rtl/strbuf.hxx>
37 #include <rtl/ustring.hxx>
38 
39 U_CDECL_BEGIN
40 extern const char OpenOffice_dat[];
41 U_CDECL_END
42 
43 using namespace ::com::sun::star;
44 using namespace ::com::sun::star::lang;
45 using namespace ::rtl;
46 
47 namespace com { namespace sun { namespace star { namespace i18n {
48 
49 #define ERROR ::com::sun::star::uno::RuntimeException()
50 
51 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
52 
53 
/*
    Construct the break iterator without loading any ICU iterator.

    icuBI starts out as NULL and the four per-type slots (character, word,
    sentence, line) are value-initialized; loadICUBreakIterator() fills them
    on demand.
*/
BreakIterator_Unicode::BreakIterator_Unicode() :
    cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),    // implementation name
    // default word rule name; loadICUBreakIterator() replaces it per word type
    wordRule( "word" ),
    // rule name that getLineBreak() hands to loadICUBreakIterator()
    lineRule( "line" ),
    result(),
    // The explicit "()" value-initializes each slot.  The destructor tests
    // aBreakIterator of every slot, so these initializers must stay.
    // NOTE(review): presumably the slot type has no constructor of its own
    // that would null the pointer -- confirm in the header.
    character(),
    word(),
    sentence(),
    line(),
    // no slot is selected before the first loadICUBreakIterator() call
    icuBI( NULL ),
    aLocale(),
    aBreakType(),
    aWordType()
{
}
69 
70 
71 BreakIterator_Unicode::~BreakIterator_Unicode()
72 {
73         if (icuBI && icuBI->aBreakIterator) {
74             delete icuBI->aBreakIterator;
75             icuBI->aBreakIterator=NULL;
76         }
77         if (character.aBreakIterator) delete character.aBreakIterator;
78         if (word.aBreakIterator) delete word.aBreakIterator;
79         if (sentence.aBreakIterator) delete sentence.aBreakIterator;
80         if (line.aBreakIterator) delete line.aBreakIterator;
81 }
82 
83 /*
84 	Wrapper class to provide public access to the RuleBasedBreakIterator's
85 	setbreakType method.
86 */
87 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
88 	public:
89 		inline void publicSetBreakType(int32_t type) {
90 			setBreakType(type);
91 		};
92 		OOoRuleBasedBreakIterator(UDataMemory* image,
93 				UErrorCode &status) :
94 			RuleBasedBreakIterator(image, status) { };
95 
96 };
97 
98 // loading ICU breakiterator on demand.
99 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
100         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
101 {
102     sal_Bool newBreak = sal_False;
103     UErrorCode status = U_ZERO_ERROR;
104     sal_Int16 breakType = 0;
105     switch (rBreakType) {
106         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
107         case LOAD_WORD_BREAKITERATOR: icuBI=&word;
108             switch (rWordType) {
109                 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
110                 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
111                 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
112             }
113             break;
114         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
115         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
116     }
117     if (!icuBI->aBreakIterator || rWordType != aWordType ||
118             rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
119             rLocale.Variant != aLocale.Variant) {
120         if (icuBI->aBreakIterator) {
121             delete icuBI->aBreakIterator;
122             icuBI->aBreakIterator=NULL;
123         }
124         if (rule) {
125             uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
126 
127             status = U_ZERO_ERROR;
128             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
129             if ( !U_SUCCESS(status) ) throw ERROR;
130 
131             OOoRuleBasedBreakIterator *rbi = NULL;
132 
133             if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
134                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
135                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
136             } else {
137                 status = U_ZERO_ERROR;
138                 OStringBuffer aUDName(64);
139                 aUDName.append(rule);
140                 aUDName.append('_');
141                 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
142                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
143                 if( U_SUCCESS(status) )
144                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
145                 if (!U_SUCCESS(status) ) {
146                     status = U_ZERO_ERROR;
147                     pUData = udata_open("OpenOffice", "brk", rule, &status);
148                     if( U_SUCCESS(status) )
149                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
150                     if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
151                 }
152             }
153             if (rbi) {
154                 switch (rBreakType) {
155                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
156                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
157                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
158                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
159                 }
160                 icuBI->aBreakIterator = rbi;
161             }
162         }
163 
164         if (!icuBI->aBreakIterator) {
165             icu::Locale icuLocale(
166                     OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
167                     OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
168                     OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
169 
170             status = U_ZERO_ERROR;
171             switch (rBreakType) {
172                 case LOAD_CHARACTER_BREAKITERATOR:
173                     icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
174                     break;
175                 case LOAD_WORD_BREAKITERATOR:
176                     icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
177                     break;
178                 case LOAD_SENTENCE_BREAKITERATOR:
179                     icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
180                     break;
181                 case LOAD_LINE_BREAKITERATOR:
182                     icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
183                     break;
184             }
185             if ( !U_SUCCESS(status) ) {
186                 icuBI->aBreakIterator=NULL;
187                 throw ERROR;
188             }
189         }
190         if (icuBI->aBreakIterator) {
191             aLocale=rLocale;
192             aWordType=rWordType;
193             aBreakType=rBreakType;
194             newBreak=sal_True;
195         } else {
196             throw ERROR;
197         }
198     }
199 
200     if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) {	// UChar != sal_Unicode in MinGW
201         icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
202         icuBI->aBreakIterator->setText(icuBI->aICUText);
203     }
204 }
205 
206 
207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
208         sal_Int32 nStartPos, const lang::Locale &rLocale,
209         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
210         throw(uno::RuntimeException)
211 {
212         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
213             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
214             for (nDone = 0; nDone < nCount; nDone++) {
215                 nStartPos = character.aBreakIterator->following(nStartPos);
216                 if (nStartPos == BreakIterator::DONE)
217                     return Text.getLength();
218             }
219         } else { // for CHARACTER mode
220             for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
221                 Text.iterateCodePoints(&nStartPos, 1);
222         }
223         return nStartPos;
224 }
225 
226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
227         sal_Int32 nStartPos, const lang::Locale& rLocale,
228         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
229         throw(uno::RuntimeException)
230 {
231         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
232             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
233             for (nDone = 0; nDone < nCount; nDone++) {
234                 nStartPos = character.aBreakIterator->preceding(nStartPos);
235                 if (nStartPos == BreakIterator::DONE)
236                     return 0;
237             }
238         } else { // for BS to delete one char and CHARACTER mode.
239             for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
240                 Text.iterateCodePoints(&nStartPos, -1);
241         }
242         return nStartPos;
243 }
244 
245 
246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
247     const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
248 {
249         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
250 
251         result.startPos = word.aBreakIterator->following(nStartPos);
252         if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
253             result.endPos = result.startPos;
254         else {
255             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
256                     rWordType == WordType::DICTIONARY_WORD ) &&
257                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
258                 result.startPos = word.aBreakIterator->following(result.startPos);
259 
260             result.endPos = word.aBreakIterator->following(result.startPos);
261             if(result.endPos == BreakIterator::DONE)
262                 result.endPos = result.startPos;
263         }
264         return result;
265 }
266 
267 
268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
269         const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
270 {
271         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
272 
273         result.startPos = word.aBreakIterator->preceding(nStartPos);
274         if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
275             result.endPos = result.startPos;
276         else {
277             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
278                     rWordType == WordType::DICTIONARY_WORD) &&
279                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
280                 result.startPos = word.aBreakIterator->preceding(result.startPos);
281 
282             result.endPos = word.aBreakIterator->following(result.startPos);
283             if(result.endPos == BreakIterator::DONE)
284                 result.endPos = result.startPos;
285         }
286         return result;
287 }
288 
289 
290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
291         sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
292 {
293         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
294         sal_Int32 len = Text.getLength();
295 
296         if(word.aBreakIterator->isBoundary(nPos)) {
297             result.startPos = result.endPos = nPos;
298             if((bDirection || nPos == 0) && nPos < len) //forward
299                 result.endPos = word.aBreakIterator->following(nPos);
300             else
301                 result.startPos = word.aBreakIterator->preceding(nPos);
302         } else {
303             if(nPos <= 0) {
304                 result.startPos = 0;
305                 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
306             } else if(nPos >= len) {
307                 result.startPos = word.aBreakIterator->preceding(len);
308                 result.endPos = len;
309             } else {
310                 result.startPos = word.aBreakIterator->preceding(nPos);
311                 result.endPos = word.aBreakIterator->following(nPos);
312             }
313         }
314         if (result.startPos == BreakIterator::DONE)
315             result.startPos = result.endPos;
316         else if (result.endPos == BreakIterator::DONE)
317             result.endPos = result.startPos;
318 
319         return result;
320 }
321 
322 
323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
324         const lang::Locale &rLocale ) throw(uno::RuntimeException)
325 {
326         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
327 
328         sal_Int32 len = Text.getLength();
329         if (len > 0 && nStartPos == len)
330             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
331         if (!sentence.aBreakIterator->isBoundary(nStartPos))
332             nStartPos = sentence.aBreakIterator->preceding(nStartPos);
333 
334         // skip preceding space.
335         sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
336         while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
337 		Text.iterateCodePoints(&nStartPos, -1);
338 
339         return nStartPos;
340 }
341 
342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
343         const lang::Locale &rLocale ) throw(uno::RuntimeException)
344 {
345         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
346 
347         sal_Int32 len = Text.getLength();
348         if (len > 0 && nStartPos == len)
349             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
350         nStartPos = sentence.aBreakIterator->following(nStartPos);
351 
352         sal_Int32 nPos=nStartPos;
353         while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
354 
355         return nStartPos;
356 }
357 
358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
359         const OUString& Text, sal_Int32 nStartPos,
360         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
361         const LineBreakHyphenationOptions& hOptions,
362         const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
363 {
364         LineBreakResults lbr;
365 
366         if (nStartPos >= Text.getLength()) {
367             lbr.breakIndex = Text.getLength();
368             lbr.breakType = BreakType::WORDBOUNDARY;
369             return lbr;
370         }
371 
372         loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
373 
374         sal_Bool GlueSpace=sal_True;
375         while (GlueSpace) {
376             if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
377                 lbr.breakIndex = nStartPos;
378                 lbr.breakType = BreakType::WORDBOUNDARY;
379             } else if (hOptions.rHyphenator.is()) { //Hyphenation break
380                 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
381                                                 WordType::DICTIONARY_WORD, false);
382                 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
383                 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
384                     wBoundary.endPos - wBoundary.startPos), rLocale,
385                     (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
386                 if (aHyphenatedWord.is()) {
387                     lbr.rHyphenatedWord = aHyphenatedWord;
388                     if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
389                         lbr.breakIndex = -1;
390                     else
391                         lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
392                     lbr.breakType = BreakType::HYPHENATION;
393                 } else {
394                     lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
395                     lbr.breakType = BreakType::WORDBOUNDARY;;
396                 }
397             } else { //word boundary break
398                 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
399                 lbr.breakType = BreakType::WORDBOUNDARY;
400             }
401 
402 #define WJ 0x2060   // Word Joiner
403             GlueSpace=sal_False;
404             if (lbr.breakType == BreakType::WORDBOUNDARY) {
405                 nStartPos = lbr.breakIndex;
406                 if (Text[nStartPos--] == WJ)
407                     GlueSpace=sal_True;
408                 while (nStartPos >= 0 &&
409                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
410                     if (Text[nStartPos--] == WJ)
411                         GlueSpace=sal_True;
412                 }
413                 if (GlueSpace && nStartPos < 0)  {
414                     lbr.breakIndex = 0;
415                     break;
416                 }
417             }
418         }
419 
420         return lbr;
421 }
422 
423 
424 
425 OUString SAL_CALL
426 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
427 {
428         return OUString::createFromAscii(cBreakIterator);
429 }
430 
431 sal_Bool SAL_CALL
432 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
433 {
434         return !rServiceName.compareToAscii(cBreakIterator);
435 }
436 
437 uno::Sequence< OUString > SAL_CALL
438 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
439 {
440         uno::Sequence< OUString > aRet(1);
441         aRet[0] = OUString::createFromAscii(cBreakIterator);
442         return aRet;
443 }
444 
445 } } } }
446