source/breakiterator/xdictionary.cxx

/*************************************************************************
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * Copyright 2000, 2010 Oracle and/or its affiliates.
 *
 * OpenOffice.org - a multi-platform office productivity suite
 *
 * This file is part of OpenOffice.org.
 *
 * OpenOffice.org is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3
 * only, as published by the Free Software Foundation.
 *
 * OpenOffice.org is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License version 3 for more details
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 along with OpenOffice.org.  If not, see
 * <http://www.openoffice.org/license.html>
 * for a copy of the LGPLv3 License.
 *
 ************************************************************************/
*cdf0e10cSrcweir
*cdf0e10cSrcweir// MARKER(update_precomp.py): autogen include statement, do not remove
*cdf0e10cSrcweir#include "precompiled_i18npool.hxx"
*cdf0e10cSrcweir
// xdictionary.cxx: implementation of the xdictionary class.
//
//////////////////////////////////////////////////////////////////////
*cdf0e10cSrcweir
*cdf0e10cSrcweir
*cdf0e10cSrcweir#include <rtl/ustrbuf.hxx>
*cdf0e10cSrcweir
*cdf0e10cSrcweir#include <com/sun/star/i18n/WordType.hpp>
*cdf0e10cSrcweir#include <xdictionary.hxx>
*cdf0e10cSrcweir#include <unicode/uchar.h>
*cdf0e10cSrcweir#include <string.h>
*cdf0e10cSrcweir#include <breakiteratorImpl.hxx>
*cdf0e10cSrcweir
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
*cdf0e10cSrcweir
*cdf0e10cSrcweirusing namespace rtl;
*cdf0e10cSrcweir
*cdf0e10cSrcweirnamespace com { namespace sun { namespace star { namespace i18n {
*cdf0e10cSrcweir
*cdf0e10cSrcweirextern "C" { static void SAL_CALL thisModule() {} }
*cdf0e10cSrcweir
*cdf0e10cSrcweirxdictionary::xdictionary(const sal_Char *lang) :
*cdf0e10cSrcweir    existMark( NULL ),
*cdf0e10cSrcweir    index1( NULL ),
*cdf0e10cSrcweir    index2( NULL ),
*cdf0e10cSrcweir    lenArray( NULL ),
*cdf0e10cSrcweir    dataArea( NULL ),
*cdf0e10cSrcweir    hModule( NULL ),
*cdf0e10cSrcweir    boundary(),
*cdf0e10cSrcweir    japaneseWordBreak( sal_False )
*cdf0e10cSrcweir#if USE_CELL_BOUNDARY_CODE
*cdf0e10cSrcweir    // For CTL breakiterator, where the word boundary should not be inside cell.
*cdf0e10cSrcweir    ,
*cdf0e10cSrcweir    useCellBoundary( sal_False ),
*cdf0e10cSrcweir    cellBoundary( NULL )
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir{
*cdf0e10cSrcweir	index1 = 0;
*cdf0e10cSrcweir#ifdef SAL_DLLPREFIX
*cdf0e10cSrcweir    OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
*cdf0e10cSrcweir    aBuf.appendAscii( SAL_DLLPREFIX );
*cdf0e10cSrcweir#else
*cdf0e10cSrcweir    OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir    aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
*cdf0e10cSrcweir        hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
*cdf0e10cSrcweir        if( hModule ) {
*cdf0e10cSrcweir            sal_IntPtr (*func)();
*cdf0e10cSrcweir            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
*cdf0e10cSrcweir            existMark = (sal_uInt8*) (*func)();
*cdf0e10cSrcweir            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
*cdf0e10cSrcweir            index1 = (sal_Int16*) (*func)();
*cdf0e10cSrcweir            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
*cdf0e10cSrcweir            index2 = (sal_Int32*) (*func)();
*cdf0e10cSrcweir            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
*cdf0e10cSrcweir            lenArray = (sal_Int32*) (*func)();
*cdf0e10cSrcweir            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
*cdf0e10cSrcweir            dataArea = (sal_Unicode*) (*func)();
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir        else
*cdf0e10cSrcweir		{
*cdf0e10cSrcweir            existMark = NULL;
*cdf0e10cSrcweir			index1 = NULL;
*cdf0e10cSrcweir			index2 = NULL;
*cdf0e10cSrcweir			lenArray = NULL;
*cdf0e10cSrcweir			dataArea = NULL;
*cdf0e10cSrcweir		}
*cdf0e10cSrcweir
*cdf0e10cSrcweir		for (sal_Int32 i = 0; i < CACHE_MAX; i++)
*cdf0e10cSrcweir            cache[i].size = 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir#if USE_CELL_BOUNDARY_CODE
*cdf0e10cSrcweir        useCellBoundary = sal_False;
*cdf0e10cSrcweir        cellBoundary = NULL;
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir        japaneseWordBreak = sal_False;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirxdictionary::~xdictionary() {
*cdf0e10cSrcweir        osl_unloadModule(hModule);
*cdf0e10cSrcweir        for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
*cdf0e10cSrcweir            if (cache[i].size > 0) {
*cdf0e10cSrcweir                delete cache[i].contents;
*cdf0e10cSrcweir                delete cache[i].wordboundary;
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirvoid xdictionary::setJapaneseWordBreak()
*cdf0e10cSrcweir{
*cdf0e10cSrcweir        japaneseWordBreak = sal_True;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirsal_Bool xdictionary::exists(const sal_uInt32 c) {
*cdf0e10cSrcweir        // 0x1FFF is the hardcoded limit in gendict for existMarks
*cdf0e10cSrcweir        sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
*cdf0e10cSrcweir        if (!exist && japaneseWordBreak)
*cdf0e10cSrcweir            return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
*cdf0e10cSrcweir        else
*cdf0e10cSrcweir            return exist;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirsal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
*cdf0e10cSrcweir
*cdf0e10cSrcweir		if ( !index1 ) return 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        sal_Int16 idx = index1[str[0] >> 8];
*cdf0e10cSrcweir
*cdf0e10cSrcweir        if (idx == 0xFF) return 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        idx = (idx<<8) | (str[0]&0xff);
*cdf0e10cSrcweir
*cdf0e10cSrcweir        sal_uInt32 begin = index2[idx], end = index2[idx+1];
*cdf0e10cSrcweir
*cdf0e10cSrcweir        if (begin == 0) return 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        str++; sLen--; // first character is not stored in the dictionary
*cdf0e10cSrcweir        for (sal_uInt32 i = end; i > begin; i--) {
*cdf0e10cSrcweir            sal_Int32 len = lenArray[i] - lenArray[i - 1];
*cdf0e10cSrcweir            if (sLen >= len) {
*cdf0e10cSrcweir                const sal_Unicode *dstr = dataArea + lenArray[i-1];
*cdf0e10cSrcweir                sal_Int32 pos = 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir                while (pos < len && dstr[pos] == str[pos]) { pos++; }
*cdf0e10cSrcweir
*cdf0e10cSrcweir                if (pos == len)
*cdf0e10cSrcweir                    return len + 1;
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir        return 0;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweir
*cdf0e10cSrcweir/*
*cdf0e10cSrcweir * c-tor
*cdf0e10cSrcweir */
*cdf0e10cSrcweir
*cdf0e10cSrcweirWordBreakCache::WordBreakCache() :
*cdf0e10cSrcweir    length( 0 ),
*cdf0e10cSrcweir    contents( NULL ),
*cdf0e10cSrcweir    wordboundary( NULL ),
*cdf0e10cSrcweir    size( 0 )
*cdf0e10cSrcweir{
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweir/*
*cdf0e10cSrcweir * Compare two unicode string,
*cdf0e10cSrcweir */
*cdf0e10cSrcweir
*cdf0e10cSrcweirsal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
*cdf0e10cSrcweir        // Different length, different string.
*cdf0e10cSrcweir        if (length != boundary.endPos - boundary.startPos) return sal_False;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        for (sal_Int32 i = 0; i < length; i++)
*cdf0e10cSrcweir            if (contents[i] != str[i + boundary.startPos]) return sal_False;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        return sal_True;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweir
*cdf0e10cSrcweir/*
*cdf0e10cSrcweir * Retrieve the segment containing the character at pos.
*cdf0e10cSrcweir * @param pos : Position of the given character.
*cdf0e10cSrcweir * @return true if CJK.
*cdf0e10cSrcweir */
*cdf0e10cSrcweirsal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
*cdf0e10cSrcweir	Boundary& segBoundary)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir    sal_Int32 indexUtf16;
*cdf0e10cSrcweir    segBoundary.endPos = segBoundary.startPos = pos;
*cdf0e10cSrcweir
*cdf0e10cSrcweir    indexUtf16 = pos;
*cdf0e10cSrcweir    while (indexUtf16 > 0)
*cdf0e10cSrcweir    {
*cdf0e10cSrcweir        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
*cdf0e10cSrcweir        if (u_isWhitespace(ch) || exists(ch))
*cdf0e10cSrcweir            segBoundary.startPos = indexUtf16;
*cdf0e10cSrcweir        else
*cdf0e10cSrcweir            break;
*cdf0e10cSrcweir    }
*cdf0e10cSrcweir
*cdf0e10cSrcweir    indexUtf16 = pos;
*cdf0e10cSrcweir    while (indexUtf16 < rText.getLength())
*cdf0e10cSrcweir    {
*cdf0e10cSrcweir        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
*cdf0e10cSrcweir        if (u_isWhitespace(ch) || exists(ch))
*cdf0e10cSrcweir            segBoundary.endPos = indexUtf16;
*cdf0e10cSrcweir        else
*cdf0e10cSrcweir            break;
*cdf0e10cSrcweir    }
*cdf0e10cSrcweir
*cdf0e10cSrcweir    indexUtf16 = segBoundary.startPos;
*cdf0e10cSrcweir    rText.iterateCodePoints(&indexUtf16, 1);
*cdf0e10cSrcweir    return segBoundary.endPos > indexUtf16;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweir#define KANJA       1
*cdf0e10cSrcweir#define KATAKANA    2
*cdf0e10cSrcweir#define HIRAKANA    3
*cdf0e10cSrcweir
*cdf0e10cSrcweirstatic sal_Int16 JapaneseCharType(sal_Unicode c)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir    if (0x3041 <= c && c <= 0x309e)
*cdf0e10cSrcweir        return HIRAKANA;
*cdf0e10cSrcweir    if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
*cdf0e10cSrcweir        return KATAKANA;
*cdf0e10cSrcweir    return KANJA;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirWordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir
*cdf0e10cSrcweir        WordBreakCache& aCache = cache[text[0] & 0x1f];
*cdf0e10cSrcweir
*cdf0e10cSrcweir        if (aCache.size != 0 && aCache.equals(text, wordBoundary))
*cdf0e10cSrcweir            return aCache;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        if (aCache.size == 0 || len > aCache.size) {
*cdf0e10cSrcweir            if (aCache.size != 0) {
*cdf0e10cSrcweir                delete aCache.contents;
*cdf0e10cSrcweir                delete aCache.wordboundary;
*cdf0e10cSrcweir                aCache.size = len;
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir            else
*cdf0e10cSrcweir                aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
*cdf0e10cSrcweir            aCache.contents = new sal_Unicode[aCache.size + 1];
*cdf0e10cSrcweir            aCache.wordboundary = new sal_Int32[aCache.size + 2];
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir        aCache.length  = len;
*cdf0e10cSrcweir        memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
*cdf0e10cSrcweir        *(aCache.contents + len) = 0x0000;
*cdf0e10cSrcweir        // reset the wordboundary in cache
*cdf0e10cSrcweir        memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
*cdf0e10cSrcweir
*cdf0e10cSrcweir        sal_Int32 i = 0;        // loop variable
*cdf0e10cSrcweir        while (aCache.wordboundary[i] < aCache.length) {
*cdf0e10cSrcweir            len = 0;
*cdf0e10cSrcweir            // look the continuous white space as one word and cashe it
*cdf0e10cSrcweir            while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
*cdf0e10cSrcweir                len ++;
*cdf0e10cSrcweir
*cdf0e10cSrcweir            if (len == 0) {
*cdf0e10cSrcweir                const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
*cdf0e10cSrcweir                sal_Int32 slen = aCache.length - aCache.wordboundary[i];
*cdf0e10cSrcweir                sal_Int16 type = 0, count = 0;
*cdf0e10cSrcweir                for (;len == 0 && slen > 0; str++, slen--) {
*cdf0e10cSrcweir                    len = getLongestMatch(str, slen);
*cdf0e10cSrcweir                    if (len == 0) {
*cdf0e10cSrcweir                        if (!japaneseWordBreak) {
*cdf0e10cSrcweir                            len = 1;
*cdf0e10cSrcweir                        } else {
*cdf0e10cSrcweir                            if (count == 0)
*cdf0e10cSrcweir                                type = JapaneseCharType(*str);
*cdf0e10cSrcweir                            else if (type != JapaneseCharType(*str))
*cdf0e10cSrcweir                                break;
*cdf0e10cSrcweir                            count++;
*cdf0e10cSrcweir                        }
*cdf0e10cSrcweir                    }
*cdf0e10cSrcweir                }
*cdf0e10cSrcweir                if (count) {
*cdf0e10cSrcweir                    aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
*cdf0e10cSrcweir                    i++;
*cdf0e10cSrcweir
*cdf0e10cSrcweir#if USE_CELL_BOUNDARY_CODE
*cdf0e10cSrcweir                    if (useCellBoundary) {
*cdf0e10cSrcweir                        sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
*cdf0e10cSrcweir                        if (cBoundary > 0)
*cdf0e10cSrcweir                            aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
*cdf0e10cSrcweir                    }
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir                }
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir
*cdf0e10cSrcweir            if (len) {
*cdf0e10cSrcweir                aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
*cdf0e10cSrcweir                i++;
*cdf0e10cSrcweir
*cdf0e10cSrcweir#if USE_CELL_BOUNDARY_CODE
*cdf0e10cSrcweir                if (useCellBoundary) {
*cdf0e10cSrcweir                    sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
*cdf0e10cSrcweir                    if (cBoundary > 0)
*cdf0e10cSrcweir                        aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
*cdf0e10cSrcweir                }
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir        aCache.wordboundary[i + 1] = aCache.length + 1;
*cdf0e10cSrcweir
*cdf0e10cSrcweir        return aCache;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirBoundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir        // looking for the first non-whitespace character from anyPos
*cdf0e10cSrcweir        sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
*cdf0e10cSrcweir
*cdf0e10cSrcweir        while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
*cdf0e10cSrcweir
*cdf0e10cSrcweir        return getWordBoundary(rText, anyPos, wordType, true);
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirBoundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir        boundary = getWordBoundary(rText, anyPos, wordType, true);
*cdf0e10cSrcweir        anyPos = boundary.endPos;
*cdf0e10cSrcweir        if (anyPos < rText.getLength()) {
*cdf0e10cSrcweir            // looknig for the first non-whitespace character from anyPos
*cdf0e10cSrcweir            sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
*cdf0e10cSrcweir            while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
*cdf0e10cSrcweir            rText.iterateCodePoints(&anyPos, -1);
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir
*cdf0e10cSrcweir        return getWordBoundary(rText, anyPos, wordType, true);
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweirBoundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir        const sal_Unicode *text=rText.getStr();
*cdf0e10cSrcweir        sal_Int32 len=rText.getLength();
*cdf0e10cSrcweir        if (anyPos >= len || anyPos < 0) {
*cdf0e10cSrcweir            boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
*cdf0e10cSrcweir        } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
*cdf0e10cSrcweir            WordBreakCache& aCache = getCache(text, boundary);
*cdf0e10cSrcweir            sal_Int32 i = 0;
*cdf0e10cSrcweir
*cdf0e10cSrcweir            while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
*cdf0e10cSrcweir
*cdf0e10cSrcweir            sal_Int32 startPos = aCache.wordboundary[i - 1];
*cdf0e10cSrcweir            // if bDirection is false
*cdf0e10cSrcweir            if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
*cdf0e10cSrcweir            {
*cdf0e10cSrcweir                sal_Int32 indexUtf16 = anyPos-1;
*cdf0e10cSrcweir                sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
*cdf0e10cSrcweir                if (u_isWhitespace(ch))
*cdf0e10cSrcweir                    i--;
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir            boundary.endPos = boundary.startPos;
*cdf0e10cSrcweir            rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
*cdf0e10cSrcweir            rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
*cdf0e10cSrcweir        } else {
*cdf0e10cSrcweir            boundary.startPos = anyPos;
*cdf0e10cSrcweir            if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
*cdf0e10cSrcweir            boundary.endPos = anyPos < len ? anyPos : len;
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir        if (wordType == WordType::WORD_COUNT) {
*cdf0e10cSrcweir            // skip punctuation for word count.
*cdf0e10cSrcweir            while (boundary.endPos < len)
*cdf0e10cSrcweir            {
*cdf0e10cSrcweir                sal_Int32 indexUtf16 = boundary.endPos;
*cdf0e10cSrcweir                if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
*cdf0e10cSrcweir                    boundary.endPos = indexUtf16;
*cdf0e10cSrcweir                else
*cdf0e10cSrcweir                    break;
*cdf0e10cSrcweir            }
*cdf0e10cSrcweir        }
*cdf0e10cSrcweir
*cdf0e10cSrcweir        return boundary;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir
*cdf0e10cSrcweir#if USE_CELL_BOUNDARY_CODE
*cdf0e10cSrcweirvoid xdictionary::setCellBoundary(sal_Int32* cellArray)
*cdf0e10cSrcweir{
*cdf0e10cSrcweir        useCellBoundary = sal_True;
*cdf0e10cSrcweir        cellBoundary = cellArray;
*cdf0e10cSrcweir}
*cdf0e10cSrcweir#endif
*cdf0e10cSrcweir
*cdf0e10cSrcweir} } } }