1 /***************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28  /**
29   *
30   *
31   *
32   *
33   * TODO
34   * - Add exception throwing when h == NULL
35   * - Not init h when implicit constructor is launched
36   */
37 
38 // MARKER(update_precomp.py): autogen include statement, do not remove
39 #include "precompiled_lingucomponent.hxx"
40 
41 #include <string.h>
42 #include <sstream>
43 #include <iostream>
44 
45 #include <libtextcat/textcat.h>
46 #include <libtextcat/common.h>
47 #include <libtextcat/constants.h>
48 #include <libtextcat/fingerprint.h>
49 #include <libtextcat/utf8misc.h>
50 
51 #include <sal/types.h>
52 
53 #include "altstrfunc.hxx"
54 #include "simpleguesser.hxx"
55 
56 #ifndef _UTF8_
57 #define _UTF8_
58 #endif
59 
60 
61 using namespace std;
62 
63 
/**
 * The following 3 structures are copied from libtextcat's fingerprint.c and textcat.c.
 */
67 
68 typedef struct ngram_t {
69 
70     sint2 rank;
71     char str[MAXNGRAMSIZE+1];
72 
73 } ngram_t;
74 
75 typedef struct fp_t {
76 
77     const char *name;
78     ngram_t *fprint;
79     uint4 size;
80 
81 } fp_t;
82 
83 typedef struct textcat_t{
84 
85     void **fprint;
86     char *fprint_disable;
87     uint4 size;
88     uint4 maxsize;
89 
90     char output[MAXOUTPUTSIZE];
91 
92 } textcat_t;
93 /** end of the 3 structs */
94 
95 SimpleGuesser::SimpleGuesser()
96 {
97     h = NULL;
98 }
99 
100 void SimpleGuesser::operator=(SimpleGuesser& sg){
101     if(h){textcat_Done(h);}
102     h = sg.h;
103 }
104 
105 SimpleGuesser::~SimpleGuesser()
106 {
107     if(h){textcat_Done(h);}
108 }
109 
110 
111 /*!
112     \fn SimpleGuesser::GuessLanguage(char* text)
113  */
114 vector<Guess> SimpleGuesser::GuessLanguage(char* text)
115 {
116         vector<Guess> guesses;
117 
118         if(!h){return guesses;}
119 
120         //calculate le number of unicode charcters (symbols)
121         int len = utfstrlen(text);
122 
123 	if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
124 
125         char *guess_list = textcat_Classify(h, text, len);
126 
127         if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
128             return guesses;
129         }
130 
131         int current_pointer = 0;
132 
133         for(int i = 0; guess_list[current_pointer] != '\0'; i++)
134         {
135             while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
136                 current_pointer++;
137             }
138             if(guess_list[current_pointer] != '\0')
139             {
140                 Guess g((char*)(guess_list + current_pointer));
141 
142                 guesses.push_back(g);
143 
144                 current_pointer++;
145             }
146         }
147 
148 	return guesses;
149 }
150 
151 /*!
152     \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
153  */
154 Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
155 {
156     vector<Guess> ret = GuessLanguage(text);
157     if(ret.size() > 0){
158         return GuessLanguage(text)[0];
159     }
160     else{
161         return Guess();
162     }
163 }
164 /**
165  * Is used to know wich language is available, unavailable or both
166  * when mask = 0xF0, return only Available
167  * when mask = 0x0F, return only Unavailable
168  * when mask = 0xFF, return both Available and Unavailable
169  */
170 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
171 {
172     size_t i;
173     textcat_t *tables = (textcat_t*)h;
174 
175     vector<Guess> lang;
176     if(!h){return lang;}
177 
178     for (i=0; i<tables->size; i++) {
179         if(tables->fprint_disable[i] & mask){
180             string langStr = "[";
181             langStr += (char*)fp_Name(tables->fprint[i]);
182             Guess g( (char *)langStr.c_str());
183             lang.push_back(g);
184         }
185     }
186 
187     return lang;
188 }
189 
190 vector<Guess> SimpleGuesser::GetAvailableLanguages(){
191     return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
192 }
193 
194 vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
195     return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
196 }
197 
198 vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
199     return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
200 }
201 
202 void SimpleGuesser::XableLanguage(string lang, char mask){
203     size_t i;
204     textcat_t *tables = (textcat_t*)h;
205 
206     if(!h){return;}
207 
208     for (i=0; i<tables->size; i++) {
209         string language(fp_Name(tables->fprint[i]));
210         if(start(language,lang) == 0){
211             //cout << language << endl;
212             tables->fprint_disable[i] = mask;
213             //continue;
214         }
215     }
216 }
217 
218 void SimpleGuesser::EnableLanguage(string lang){
219     XableLanguage(lang,  sal::static_int_cast< char >( 0xF0 ));
220 }
221 
222 void SimpleGuesser::DisableLanguage(string lang){
223     XableLanguage(lang,  sal::static_int_cast< char >( 0x0F ));
224 }
225 
226 /**
227 *
228 */
229 void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
230     if(h){
231         textcat_Done(h);
232     }
233     h = special_textcat_Init(path, prefix);
234 }
235