1*b0844812SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3*b0844812SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*b0844812SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*b0844812SAndrew Rist  * distributed with this work for additional information
6*b0844812SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*b0844812SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*b0844812SAndrew Rist  * "License"); you may not use this file except in compliance
9*b0844812SAndrew Rist  * with the License.  You may obtain a copy of the License at
10*b0844812SAndrew Rist  *
11*b0844812SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*b0844812SAndrew Rist  *
13*b0844812SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*b0844812SAndrew Rist  * software distributed under the License is distributed on an
15*b0844812SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*b0844812SAndrew Rist  * KIND, either express or implied.  See the License for the
17*b0844812SAndrew Rist  * specific language governing permissions and limitations
18*b0844812SAndrew Rist  * under the License.
19*b0844812SAndrew Rist  *
20*b0844812SAndrew Rist  *************************************************************/
21*b0844812SAndrew Rist 
22*b0844812SAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir  /**
25cdf0e10cSrcweir   *
26cdf0e10cSrcweir   *
27cdf0e10cSrcweir   *
28cdf0e10cSrcweir   *
29cdf0e10cSrcweir   * TODO
30cdf0e10cSrcweir   * - Add exception throwing when h == NULL
31cdf0e10cSrcweir   * - Not init h when implicit constructor is launched
32cdf0e10cSrcweir   */
33cdf0e10cSrcweir 
34cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
35cdf0e10cSrcweir #include "precompiled_lingucomponent.hxx"
36cdf0e10cSrcweir 
37cdf0e10cSrcweir #include <string.h>
38cdf0e10cSrcweir #include <sstream>
39cdf0e10cSrcweir #include <iostream>
40cdf0e10cSrcweir 
41cdf0e10cSrcweir #include <libtextcat/textcat.h>
42cdf0e10cSrcweir #include <libtextcat/common.h>
43cdf0e10cSrcweir #include <libtextcat/constants.h>
44cdf0e10cSrcweir #include <libtextcat/fingerprint.h>
45cdf0e10cSrcweir #include <libtextcat/utf8misc.h>
46cdf0e10cSrcweir 
47cdf0e10cSrcweir #include <sal/types.h>
48cdf0e10cSrcweir 
49cdf0e10cSrcweir #include "altstrfunc.hxx"
50cdf0e10cSrcweir #include "simpleguesser.hxx"
51cdf0e10cSrcweir 
52cdf0e10cSrcweir #ifndef _UTF8_
53cdf0e10cSrcweir #define _UTF8_
54cdf0e10cSrcweir #endif
55cdf0e10cSrcweir 
56cdf0e10cSrcweir 
57cdf0e10cSrcweir using namespace std;
58cdf0e10cSrcweir 
59cdf0e10cSrcweir 
/**
 * The following 3 structures are from fingerprint.c and textcat.c
 */
63cdf0e10cSrcweir 
64cdf0e10cSrcweir typedef struct ngram_t {
65cdf0e10cSrcweir 
66cdf0e10cSrcweir     sint2 rank;
67cdf0e10cSrcweir     char str[MAXNGRAMSIZE+1];
68cdf0e10cSrcweir 
69cdf0e10cSrcweir } ngram_t;
70cdf0e10cSrcweir 
71cdf0e10cSrcweir typedef struct fp_t {
72cdf0e10cSrcweir 
73cdf0e10cSrcweir     const char *name;
74cdf0e10cSrcweir     ngram_t *fprint;
75cdf0e10cSrcweir     uint4 size;
76cdf0e10cSrcweir 
77cdf0e10cSrcweir } fp_t;
78cdf0e10cSrcweir 
79cdf0e10cSrcweir typedef struct textcat_t{
80cdf0e10cSrcweir 
81cdf0e10cSrcweir     void **fprint;
82cdf0e10cSrcweir     char *fprint_disable;
83cdf0e10cSrcweir     uint4 size;
84cdf0e10cSrcweir     uint4 maxsize;
85cdf0e10cSrcweir 
86cdf0e10cSrcweir     char output[MAXOUTPUTSIZE];
87cdf0e10cSrcweir 
88cdf0e10cSrcweir } textcat_t;
89cdf0e10cSrcweir /** end of the 3 structs */
90cdf0e10cSrcweir 
SimpleGuesser()91cdf0e10cSrcweir SimpleGuesser::SimpleGuesser()
92cdf0e10cSrcweir {
93cdf0e10cSrcweir     h = NULL;
94cdf0e10cSrcweir }
95cdf0e10cSrcweir 
operator =(SimpleGuesser & sg)96cdf0e10cSrcweir void SimpleGuesser::operator=(SimpleGuesser& sg){
97cdf0e10cSrcweir     if(h){textcat_Done(h);}
98cdf0e10cSrcweir     h = sg.h;
99cdf0e10cSrcweir }
100cdf0e10cSrcweir 
~SimpleGuesser()101cdf0e10cSrcweir SimpleGuesser::~SimpleGuesser()
102cdf0e10cSrcweir {
103cdf0e10cSrcweir     if(h){textcat_Done(h);}
104cdf0e10cSrcweir }
105cdf0e10cSrcweir 
106cdf0e10cSrcweir 
107cdf0e10cSrcweir /*!
108cdf0e10cSrcweir     \fn SimpleGuesser::GuessLanguage(char* text)
109cdf0e10cSrcweir  */
GuessLanguage(char * text)110cdf0e10cSrcweir vector<Guess> SimpleGuesser::GuessLanguage(char* text)
111cdf0e10cSrcweir {
112cdf0e10cSrcweir         vector<Guess> guesses;
113cdf0e10cSrcweir 
114cdf0e10cSrcweir         if(!h){return guesses;}
115cdf0e10cSrcweir 
116cdf0e10cSrcweir         //calculate le number of unicode charcters (symbols)
117cdf0e10cSrcweir         int len = utfstrlen(text);
118cdf0e10cSrcweir 
119cdf0e10cSrcweir 	if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
120cdf0e10cSrcweir 
121cdf0e10cSrcweir         char *guess_list = textcat_Classify(h, text, len);
122cdf0e10cSrcweir 
123cdf0e10cSrcweir         if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
124cdf0e10cSrcweir             return guesses;
125cdf0e10cSrcweir         }
126cdf0e10cSrcweir 
127cdf0e10cSrcweir         int current_pointer = 0;
128cdf0e10cSrcweir 
129cdf0e10cSrcweir         for(int i = 0; guess_list[current_pointer] != '\0'; i++)
130cdf0e10cSrcweir         {
131cdf0e10cSrcweir             while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
132cdf0e10cSrcweir                 current_pointer++;
133cdf0e10cSrcweir             }
134cdf0e10cSrcweir             if(guess_list[current_pointer] != '\0')
135cdf0e10cSrcweir             {
136cdf0e10cSrcweir                 Guess g((char*)(guess_list + current_pointer));
137cdf0e10cSrcweir 
138cdf0e10cSrcweir                 guesses.push_back(g);
139cdf0e10cSrcweir 
140cdf0e10cSrcweir                 current_pointer++;
141cdf0e10cSrcweir             }
142cdf0e10cSrcweir         }
143cdf0e10cSrcweir 
144cdf0e10cSrcweir 	return guesses;
145cdf0e10cSrcweir }
146cdf0e10cSrcweir 
147cdf0e10cSrcweir /*!
148cdf0e10cSrcweir     \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
149cdf0e10cSrcweir  */
GuessPrimaryLanguage(char * text)150cdf0e10cSrcweir Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
151cdf0e10cSrcweir {
152cdf0e10cSrcweir     vector<Guess> ret = GuessLanguage(text);
153cdf0e10cSrcweir     if(ret.size() > 0){
154cdf0e10cSrcweir         return GuessLanguage(text)[0];
155cdf0e10cSrcweir     }
156cdf0e10cSrcweir     else{
157cdf0e10cSrcweir         return Guess();
158cdf0e10cSrcweir     }
159cdf0e10cSrcweir }
160cdf0e10cSrcweir /**
161cdf0e10cSrcweir  * Is used to know wich language is available, unavailable or both
162cdf0e10cSrcweir  * when mask = 0xF0, return only Available
163cdf0e10cSrcweir  * when mask = 0x0F, return only Unavailable
164cdf0e10cSrcweir  * when mask = 0xFF, return both Available and Unavailable
165cdf0e10cSrcweir  */
GetManagedLanguages(const char mask)166cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
167cdf0e10cSrcweir {
168cdf0e10cSrcweir     size_t i;
169cdf0e10cSrcweir     textcat_t *tables = (textcat_t*)h;
170cdf0e10cSrcweir 
171cdf0e10cSrcweir     vector<Guess> lang;
172cdf0e10cSrcweir     if(!h){return lang;}
173cdf0e10cSrcweir 
174cdf0e10cSrcweir     for (i=0; i<tables->size; i++) {
175cdf0e10cSrcweir         if(tables->fprint_disable[i] & mask){
176cdf0e10cSrcweir             string langStr = "[";
177cdf0e10cSrcweir             langStr += (char*)fp_Name(tables->fprint[i]);
178cdf0e10cSrcweir             Guess g( (char *)langStr.c_str());
179cdf0e10cSrcweir             lang.push_back(g);
180cdf0e10cSrcweir         }
181cdf0e10cSrcweir     }
182cdf0e10cSrcweir 
183cdf0e10cSrcweir     return lang;
184cdf0e10cSrcweir }
185cdf0e10cSrcweir 
GetAvailableLanguages()186cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAvailableLanguages(){
187cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
188cdf0e10cSrcweir }
189cdf0e10cSrcweir 
GetUnavailableLanguages()190cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
191cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
192cdf0e10cSrcweir }
193cdf0e10cSrcweir 
GetAllManagedLanguages()194cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
195cdf0e10cSrcweir     return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
196cdf0e10cSrcweir }
197cdf0e10cSrcweir 
XableLanguage(string lang,char mask)198cdf0e10cSrcweir void SimpleGuesser::XableLanguage(string lang, char mask){
199cdf0e10cSrcweir     size_t i;
200cdf0e10cSrcweir     textcat_t *tables = (textcat_t*)h;
201cdf0e10cSrcweir 
202cdf0e10cSrcweir     if(!h){return;}
203cdf0e10cSrcweir 
204cdf0e10cSrcweir     for (i=0; i<tables->size; i++) {
205cdf0e10cSrcweir         string language(fp_Name(tables->fprint[i]));
206cdf0e10cSrcweir         if(start(language,lang) == 0){
207cdf0e10cSrcweir             //cout << language << endl;
208cdf0e10cSrcweir             tables->fprint_disable[i] = mask;
209cdf0e10cSrcweir             //continue;
210cdf0e10cSrcweir         }
211cdf0e10cSrcweir     }
212cdf0e10cSrcweir }
213cdf0e10cSrcweir 
EnableLanguage(string lang)214cdf0e10cSrcweir void SimpleGuesser::EnableLanguage(string lang){
215cdf0e10cSrcweir     XableLanguage(lang,  sal::static_int_cast< char >( 0xF0 ));
216cdf0e10cSrcweir }
217cdf0e10cSrcweir 
DisableLanguage(string lang)218cdf0e10cSrcweir void SimpleGuesser::DisableLanguage(string lang){
219cdf0e10cSrcweir     XableLanguage(lang,  sal::static_int_cast< char >( 0x0F ));
220cdf0e10cSrcweir }
221cdf0e10cSrcweir 
222cdf0e10cSrcweir /**
223cdf0e10cSrcweir *
224cdf0e10cSrcweir */
SetDBPath(const char * path,const char * prefix)225cdf0e10cSrcweir void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
226cdf0e10cSrcweir     if(h){
227cdf0e10cSrcweir         textcat_Done(h);
228cdf0e10cSrcweir     }
229cdf0e10cSrcweir     h = special_textcat_Init(path, prefix);
230cdf0e10cSrcweir }
231