1*cdf0e10cSrcweir /***************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir  /**
29*cdf0e10cSrcweir   *
30*cdf0e10cSrcweir   *
31*cdf0e10cSrcweir   *
32*cdf0e10cSrcweir   *
33*cdf0e10cSrcweir   * TODO
34*cdf0e10cSrcweir   * - Add exception throwing when h == NULL
35*cdf0e10cSrcweir   * - Not init h when implicit constructor is launched
36*cdf0e10cSrcweir   */
37*cdf0e10cSrcweir 
38*cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
39*cdf0e10cSrcweir #include "precompiled_lingucomponent.hxx"
40*cdf0e10cSrcweir 
41*cdf0e10cSrcweir #include <string.h>
42*cdf0e10cSrcweir #include <sstream>
43*cdf0e10cSrcweir #include <iostream>
44*cdf0e10cSrcweir 
45*cdf0e10cSrcweir #include <libtextcat/textcat.h>
46*cdf0e10cSrcweir #include <libtextcat/common.h>
47*cdf0e10cSrcweir #include <libtextcat/constants.h>
48*cdf0e10cSrcweir #include <libtextcat/fingerprint.h>
49*cdf0e10cSrcweir #include <libtextcat/utf8misc.h>
50*cdf0e10cSrcweir 
51*cdf0e10cSrcweir #include <sal/types.h>
52*cdf0e10cSrcweir 
53*cdf0e10cSrcweir #include "altstrfunc.hxx"
54*cdf0e10cSrcweir #include "simpleguesser.hxx"
55*cdf0e10cSrcweir 
56*cdf0e10cSrcweir #ifndef _UTF8_
57*cdf0e10cSrcweir #define _UTF8_
58*cdf0e10cSrcweir #endif
59*cdf0e10cSrcweir 
60*cdf0e10cSrcweir 
61*cdf0e10cSrcweir using namespace std;
62*cdf0e10cSrcweir 
63*cdf0e10cSrcweir 
/**
 * The following 3 structures are copied from fingerprint.c and textcat.c
 */
67*cdf0e10cSrcweir 
68*cdf0e10cSrcweir typedef struct ngram_t {
69*cdf0e10cSrcweir 
70*cdf0e10cSrcweir     sint2 rank;
71*cdf0e10cSrcweir     char str[MAXNGRAMSIZE+1];
72*cdf0e10cSrcweir 
73*cdf0e10cSrcweir } ngram_t;
74*cdf0e10cSrcweir 
75*cdf0e10cSrcweir typedef struct fp_t {
76*cdf0e10cSrcweir 
77*cdf0e10cSrcweir     const char *name;
78*cdf0e10cSrcweir     ngram_t *fprint;
79*cdf0e10cSrcweir     uint4 size;
80*cdf0e10cSrcweir 
81*cdf0e10cSrcweir } fp_t;
82*cdf0e10cSrcweir 
83*cdf0e10cSrcweir typedef struct textcat_t{
84*cdf0e10cSrcweir 
85*cdf0e10cSrcweir     void **fprint;
86*cdf0e10cSrcweir     char *fprint_disable;
87*cdf0e10cSrcweir     uint4 size;
88*cdf0e10cSrcweir     uint4 maxsize;
89*cdf0e10cSrcweir 
90*cdf0e10cSrcweir     char output[MAXOUTPUTSIZE];
91*cdf0e10cSrcweir 
92*cdf0e10cSrcweir } textcat_t;
93*cdf0e10cSrcweir /** end of the 3 structs */
94*cdf0e10cSrcweir 
95*cdf0e10cSrcweir SimpleGuesser::SimpleGuesser()
96*cdf0e10cSrcweir {
97*cdf0e10cSrcweir     h = NULL;
98*cdf0e10cSrcweir }
99*cdf0e10cSrcweir 
100*cdf0e10cSrcweir void SimpleGuesser::operator=(SimpleGuesser& sg){
101*cdf0e10cSrcweir     if(h){textcat_Done(h);}
102*cdf0e10cSrcweir     h = sg.h;
103*cdf0e10cSrcweir }
104*cdf0e10cSrcweir 
105*cdf0e10cSrcweir SimpleGuesser::~SimpleGuesser()
106*cdf0e10cSrcweir {
107*cdf0e10cSrcweir     if(h){textcat_Done(h);}
108*cdf0e10cSrcweir }
109*cdf0e10cSrcweir 
110*cdf0e10cSrcweir 
111*cdf0e10cSrcweir /*!
112*cdf0e10cSrcweir     \fn SimpleGuesser::GuessLanguage(char* text)
113*cdf0e10cSrcweir  */
114*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GuessLanguage(char* text)
115*cdf0e10cSrcweir {
116*cdf0e10cSrcweir         vector<Guess> guesses;
117*cdf0e10cSrcweir 
118*cdf0e10cSrcweir         if(!h){return guesses;}
119*cdf0e10cSrcweir 
120*cdf0e10cSrcweir         //calculate le number of unicode charcters (symbols)
121*cdf0e10cSrcweir         int len = utfstrlen(text);
122*cdf0e10cSrcweir 
123*cdf0e10cSrcweir 	if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
124*cdf0e10cSrcweir 
125*cdf0e10cSrcweir         char *guess_list = textcat_Classify(h, text, len);
126*cdf0e10cSrcweir 
127*cdf0e10cSrcweir         if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
128*cdf0e10cSrcweir             return guesses;
129*cdf0e10cSrcweir         }
130*cdf0e10cSrcweir 
131*cdf0e10cSrcweir         int current_pointer = 0;
132*cdf0e10cSrcweir 
133*cdf0e10cSrcweir         for(int i = 0; guess_list[current_pointer] != '\0'; i++)
134*cdf0e10cSrcweir         {
135*cdf0e10cSrcweir             while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
136*cdf0e10cSrcweir                 current_pointer++;
137*cdf0e10cSrcweir             }
138*cdf0e10cSrcweir             if(guess_list[current_pointer] != '\0')
139*cdf0e10cSrcweir             {
140*cdf0e10cSrcweir                 Guess g((char*)(guess_list + current_pointer));
141*cdf0e10cSrcweir 
142*cdf0e10cSrcweir                 guesses.push_back(g);
143*cdf0e10cSrcweir 
144*cdf0e10cSrcweir                 current_pointer++;
145*cdf0e10cSrcweir             }
146*cdf0e10cSrcweir         }
147*cdf0e10cSrcweir 
148*cdf0e10cSrcweir 	return guesses;
149*cdf0e10cSrcweir }
150*cdf0e10cSrcweir 
151*cdf0e10cSrcweir /*!
152*cdf0e10cSrcweir     \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
153*cdf0e10cSrcweir  */
154*cdf0e10cSrcweir Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
155*cdf0e10cSrcweir {
156*cdf0e10cSrcweir     vector<Guess> ret = GuessLanguage(text);
157*cdf0e10cSrcweir     if(ret.size() > 0){
158*cdf0e10cSrcweir         return GuessLanguage(text)[0];
159*cdf0e10cSrcweir     }
160*cdf0e10cSrcweir     else{
161*cdf0e10cSrcweir         return Guess();
162*cdf0e10cSrcweir     }
163*cdf0e10cSrcweir }
164*cdf0e10cSrcweir /**
165*cdf0e10cSrcweir  * Is used to know wich language is available, unavailable or both
166*cdf0e10cSrcweir  * when mask = 0xF0, return only Available
167*cdf0e10cSrcweir  * when mask = 0x0F, return only Unavailable
168*cdf0e10cSrcweir  * when mask = 0xFF, return both Available and Unavailable
169*cdf0e10cSrcweir  */
170*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
171*cdf0e10cSrcweir {
172*cdf0e10cSrcweir     size_t i;
173*cdf0e10cSrcweir     textcat_t *tables = (textcat_t*)h;
174*cdf0e10cSrcweir 
175*cdf0e10cSrcweir     vector<Guess> lang;
176*cdf0e10cSrcweir     if(!h){return lang;}
177*cdf0e10cSrcweir 
178*cdf0e10cSrcweir     for (i=0; i<tables->size; i++) {
179*cdf0e10cSrcweir         if(tables->fprint_disable[i] & mask){
180*cdf0e10cSrcweir             string langStr = "[";
181*cdf0e10cSrcweir             langStr += (char*)fp_Name(tables->fprint[i]);
182*cdf0e10cSrcweir             Guess g( (char *)langStr.c_str());
183*cdf0e10cSrcweir             lang.push_back(g);
184*cdf0e10cSrcweir         }
185*cdf0e10cSrcweir     }
186*cdf0e10cSrcweir 
187*cdf0e10cSrcweir     return lang;
188*cdf0e10cSrcweir }
189*cdf0e10cSrcweir 
190*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAvailableLanguages(){
191*cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
192*cdf0e10cSrcweir }
193*cdf0e10cSrcweir 
194*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
195*cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
196*cdf0e10cSrcweir }
197*cdf0e10cSrcweir 
198*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
199*cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
200*cdf0e10cSrcweir }
201*cdf0e10cSrcweir 
202*cdf0e10cSrcweir void SimpleGuesser::XableLanguage(string lang, char mask){
203*cdf0e10cSrcweir     size_t i;
204*cdf0e10cSrcweir     textcat_t *tables = (textcat_t*)h;
205*cdf0e10cSrcweir 
206*cdf0e10cSrcweir     if(!h){return;}
207*cdf0e10cSrcweir 
208*cdf0e10cSrcweir     for (i=0; i<tables->size; i++) {
209*cdf0e10cSrcweir         string language(fp_Name(tables->fprint[i]));
210*cdf0e10cSrcweir         if(start(language,lang) == 0){
211*cdf0e10cSrcweir             //cout << language << endl;
212*cdf0e10cSrcweir             tables->fprint_disable[i] = mask;
213*cdf0e10cSrcweir             //continue;
214*cdf0e10cSrcweir         }
215*cdf0e10cSrcweir     }
216*cdf0e10cSrcweir }
217*cdf0e10cSrcweir 
218*cdf0e10cSrcweir void SimpleGuesser::EnableLanguage(string lang){
219*cdf0e10cSrcweir     XableLanguage(lang,  sal::static_int_cast< char >( 0xF0 ));
220*cdf0e10cSrcweir }
221*cdf0e10cSrcweir 
222*cdf0e10cSrcweir void SimpleGuesser::DisableLanguage(string lang){
223*cdf0e10cSrcweir     XableLanguage(lang,  sal::static_int_cast< char >( 0x0F ));
224*cdf0e10cSrcweir }
225*cdf0e10cSrcweir 
226*cdf0e10cSrcweir /**
227*cdf0e10cSrcweir *
228*cdf0e10cSrcweir */
229*cdf0e10cSrcweir void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
230*cdf0e10cSrcweir     if(h){
231*cdf0e10cSrcweir         textcat_Done(h);
232*cdf0e10cSrcweir     }
233*cdf0e10cSrcweir     h = special_textcat_Init(path, prefix);
234*cdf0e10cSrcweir }
235