1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 
31 // xdictionary.cpp: implementation of the xdictionary class.
32 //
33 //////////////////////////////////////////////////////////////////////
34 
35 
36 #include <rtl/ustrbuf.hxx>
37 
38 #include <com/sun/star/i18n/WordType.hpp>
39 #include <xdictionary.hxx>
40 #include <unicode/uchar.h>
41 #include <string.h>
42 #include <breakiteratorImpl.hxx>
43 
44 //////////////////////////////////////////////////////////////////////
45 // Construction/Destruction
46 //////////////////////////////////////////////////////////////////////
47 
48 using namespace rtl;
49 
50 namespace com { namespace sun { namespace star { namespace i18n {
51 
52 extern "C" { static void SAL_CALL thisModule() {} }
53 
54 xdictionary::xdictionary(const sal_Char *lang) :
55     existMark( NULL ),
56     index1( NULL ),
57     index2( NULL ),
58     lenArray( NULL ),
59     dataArea( NULL ),
60     hModule( NULL ),
61     boundary(),
62     japaneseWordBreak( sal_False )
63 #if USE_CELL_BOUNDARY_CODE
64     // For CTL breakiterator, where the word boundary should not be inside cell.
65     ,
66     useCellBoundary( sal_False ),
67     cellBoundary( NULL )
68 #endif
69 {
70 	index1 = 0;
71 #ifdef SAL_DLLPREFIX
72     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
73     aBuf.appendAscii( SAL_DLLPREFIX );
74 #else
75     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
76 #endif
77     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
78         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
79         if( hModule ) {
80             sal_IntPtr (*func)();
81             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
82             existMark = (sal_uInt8*) (*func)();
83             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
84             index1 = (sal_Int16*) (*func)();
85             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
86             index2 = (sal_Int32*) (*func)();
87             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
88             lenArray = (sal_Int32*) (*func)();
89             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
90             dataArea = (sal_Unicode*) (*func)();
91         }
92         else
93 		{
94             existMark = NULL;
95 			index1 = NULL;
96 			index2 = NULL;
97 			lenArray = NULL;
98 			dataArea = NULL;
99 		}
100 
101 		for (sal_Int32 i = 0; i < CACHE_MAX; i++)
102             cache[i].size = 0;
103 
104 #if USE_CELL_BOUNDARY_CODE
105         useCellBoundary = sal_False;
106         cellBoundary = NULL;
107 #endif
108         japaneseWordBreak = sal_False;
109 }
110 
111 xdictionary::~xdictionary() {
112         osl_unloadModule(hModule);
113         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
114             if (cache[i].size > 0) {
115                 delete cache[i].contents;
116                 delete cache[i].wordboundary;
117             }
118         }
119 }
120 
121 void xdictionary::setJapaneseWordBreak()
122 {
123         japaneseWordBreak = sal_True;
124 }
125 
126 sal_Bool xdictionary::exists(const sal_uInt32 c) {
127         // 0x1FFF is the hardcoded limit in gendict for existMarks
128         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
129         if (!exist && japaneseWordBreak)
130             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
131         else
132             return exist;
133 }
134 
135 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
136 
137 		if ( !index1 ) return 0;
138 
139         sal_Int16 idx = index1[str[0] >> 8];
140 
141         if (idx == 0xFF) return 0;
142 
143         idx = (idx<<8) | (str[0]&0xff);
144 
145         sal_uInt32 begin = index2[idx], end = index2[idx+1];
146 
147         if (begin == 0) return 0;
148 
149         str++; sLen--; // first character is not stored in the dictionary
150         for (sal_uInt32 i = end; i > begin; i--) {
151             sal_Int32 len = lenArray[i] - lenArray[i - 1];
152             if (sLen >= len) {
153                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
154                 sal_Int32 pos = 0;
155 
156                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
157 
158                 if (pos == len)
159                     return len + 1;
160             }
161         }
162         return 0;
163 }
164 
165 
166 /*
167  * c-tor
168  */
169 
170 WordBreakCache::WordBreakCache() :
171     length( 0 ),
172     contents( NULL ),
173     wordboundary( NULL ),
174     size( 0 )
175 {
176 }
177 
178 /*
179  * Compare two unicode string,
180  */
181 
182 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
183         // Different length, different string.
184         if (length != boundary.endPos - boundary.startPos) return sal_False;
185 
186         for (sal_Int32 i = 0; i < length; i++)
187             if (contents[i] != str[i + boundary.startPos]) return sal_False;
188 
189         return sal_True;
190 }
191 
192 
193 /*
194  * Retrieve the segment containing the character at pos.
195  * @param pos : Position of the given character.
196  * @return true if CJK.
197  */
198 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
199 	Boundary& segBoundary)
200 {
201     sal_Int32 indexUtf16;
202     segBoundary.endPos = segBoundary.startPos = pos;
203 
204     indexUtf16 = pos;
205     while (indexUtf16 > 0)
206     {
207         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
208         if (u_isWhitespace(ch) || exists(ch))
209             segBoundary.startPos = indexUtf16;
210         else
211             break;
212     }
213 
214     indexUtf16 = pos;
215     while (indexUtf16 < rText.getLength())
216     {
217         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
218         if (u_isWhitespace(ch) || exists(ch))
219             segBoundary.endPos = indexUtf16;
220         else
221             break;
222     }
223 
224     indexUtf16 = segBoundary.startPos;
225     rText.iterateCodePoints(&indexUtf16, 1);
226     return segBoundary.endPos > indexUtf16;
227 }
228 
229 #define KANJA       1
230 #define KATAKANA    2
231 #define HIRAKANA    3
232 
233 static sal_Int16 JapaneseCharType(sal_Unicode c)
234 {
235     if (0x3041 <= c && c <= 0x309e)
236         return HIRAKANA;
237     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
238         return KATAKANA;
239     return KANJA;
240 }
241 
242 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
243 {
244 
245         WordBreakCache& aCache = cache[text[0] & 0x1f];
246 
247         if (aCache.size != 0 && aCache.equals(text, wordBoundary))
248             return aCache;
249 
250         sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
251 
252         if (aCache.size == 0 || len > aCache.size) {
253             if (aCache.size != 0) {
254                 delete aCache.contents;
255                 delete aCache.wordboundary;
256                 aCache.size = len;
257             }
258             else
259                 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
260             aCache.contents = new sal_Unicode[aCache.size + 1];
261             aCache.wordboundary = new sal_Int32[aCache.size + 2];
262         }
263         aCache.length  = len;
264         memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
265         *(aCache.contents + len) = 0x0000;
266         // reset the wordboundary in cache
267         memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
268 
269         sal_Int32 i = 0;        // loop variable
270         while (aCache.wordboundary[i] < aCache.length) {
271             len = 0;
272             // look the continuous white space as one word and cashe it
273             while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
274                 len ++;
275 
276             if (len == 0) {
277                 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
278                 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
279                 sal_Int16 type = 0, count = 0;
280                 for (;len == 0 && slen > 0; str++, slen--) {
281                     len = getLongestMatch(str, slen);
282                     if (len == 0) {
283                         if (!japaneseWordBreak) {
284                             len = 1;
285                         } else {
286                             if (count == 0)
287                                 type = JapaneseCharType(*str);
288                             else if (type != JapaneseCharType(*str))
289                                 break;
290                             count++;
291                         }
292                     }
293                 }
294                 if (count) {
295                     aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
296                     i++;
297 
298 #if USE_CELL_BOUNDARY_CODE
299                     if (useCellBoundary) {
300                         sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
301                         if (cBoundary > 0)
302                             aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
303                     }
304 #endif
305                 }
306             }
307 
308             if (len) {
309                 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
310                 i++;
311 
312 #if USE_CELL_BOUNDARY_CODE
313                 if (useCellBoundary) {
314                     sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
315                     if (cBoundary > 0)
316                         aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
317                 }
318 #endif
319             }
320         }
321         aCache.wordboundary[i + 1] = aCache.length + 1;
322 
323         return aCache;
324 }
325 
326 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
327 {
328         // looking for the first non-whitespace character from anyPos
329         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
330 
331         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
332 
333         return getWordBoundary(rText, anyPos, wordType, true);
334 }
335 
336 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
337 {
338         boundary = getWordBoundary(rText, anyPos, wordType, true);
339         anyPos = boundary.endPos;
340         if (anyPos < rText.getLength()) {
341             // looknig for the first non-whitespace character from anyPos
342             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
343             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
344             rText.iterateCodePoints(&anyPos, -1);
345         }
346 
347         return getWordBoundary(rText, anyPos, wordType, true);
348 }
349 
350 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
351 {
352         const sal_Unicode *text=rText.getStr();
353         sal_Int32 len=rText.getLength();
354         if (anyPos >= len || anyPos < 0) {
355             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
356         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
357             WordBreakCache& aCache = getCache(text, boundary);
358             sal_Int32 i = 0;
359 
360             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
361 
362             sal_Int32 startPos = aCache.wordboundary[i - 1];
363             // if bDirection is false
364             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
365             {
366                 sal_Int32 indexUtf16 = anyPos-1;
367                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
368                 if (u_isWhitespace(ch))
369                     i--;
370             }
371             boundary.endPos = boundary.startPos;
372             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
373             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
374         } else {
375             boundary.startPos = anyPos;
376             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
377             boundary.endPos = anyPos < len ? anyPos : len;
378         }
379         if (wordType == WordType::WORD_COUNT) {
380             // skip punctuation for word count.
381             while (boundary.endPos < len)
382             {
383                 sal_Int32 indexUtf16 = boundary.endPos;
384                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
385                     boundary.endPos = indexUtf16;
386                 else
387                     break;
388             }
389         }
390 
391         return boundary;
392 }
393 
394 #if USE_CELL_BOUNDARY_CODE
395 void xdictionary::setCellBoundary(sal_Int32* cellArray)
396 {
397         useCellBoundary = sal_True;
398         cellBoundary = cellArray;
399 }
400 #endif
401 
402 } } } }
403