1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 /** 25 * 26 * 27 * 28 * 29 * TODO 30 * - Add exception throwing when h == NULL 31 * - Not init h when implicit constructor is launched 32 */ 33 34 // MARKER(update_precomp.py): autogen include statement, do not remove 35 #include "precompiled_lingucomponent.hxx" 36 37 #include <string.h> 38 #include <sstream> 39 #include <iostream> 40 41 #include <libtextcat/textcat.h> 42 #include <libtextcat/common.h> 43 #include <libtextcat/constants.h> 44 #include <libtextcat/fingerprint.h> 45 #include <libtextcat/utf8misc.h> 46 47 #include <sal/types.h> 48 49 #include "altstrfunc.hxx" 50 #include "simpleguesser.hxx" 51 52 #ifndef _UTF8_ 53 #define _UTF8_ 54 #endif 55 56 57 using namespace std; 58 59 60 /** 61 * This 3 following structures are from fingerprint.c and textcat.c 62 */ 63 64 typedef struct ngram_t { 65 66 sint2 rank; 67 char str[MAXNGRAMSIZE+1]; 68 69 } ngram_t; 70 71 typedef struct fp_t { 72 73 const char *name; 74 ngram_t *fprint; 75 uint4 size; 76 77 } fp_t; 78 79 typedef struct textcat_t{ 80 81 void **fprint; 82 char *fprint_disable; 83 uint4 size; 84 uint4 maxsize; 85 86 char output[MAXOUTPUTSIZE]; 87 88 } textcat_t; 89 /** end of the 3 structs */ 90 91 SimpleGuesser::SimpleGuesser() 92 { 93 h = NULL; 94 } 95 96 void SimpleGuesser::operator=(SimpleGuesser& sg){ 97 if(h){textcat_Done(h);} 98 h = sg.h; 99 } 100 101 SimpleGuesser::~SimpleGuesser() 102 { 103 if(h){textcat_Done(h);} 104 } 105 106 107 /*! 108 \fn SimpleGuesser::GuessLanguage(char* text) 109 */ 110 vector<Guess> SimpleGuesser::GuessLanguage(char* text) 111 { 112 vector<Guess> guesses; 113 114 if(!h){return guesses;} 115 116 //calculate le number of unicode charcters (symbols) 117 int len = utfstrlen(text); 118 119 if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} 120 121 char *guess_list = textcat_Classify(h, text, len); 122 123 if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ 124 return guesses; 125 } 126 127 int current_pointer = 0; 128 129 for(int i = 0; guess_list[current_pointer] != '\0'; i++) 130 { 131 while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){ 132 current_pointer++; 133 } 134 if(guess_list[current_pointer] != '\0') 135 { 136 Guess g((char*)(guess_list + current_pointer)); 137 138 guesses.push_back(g); 139 140 current_pointer++; 141 } 142 } 143 144 return guesses; 145 } 146 147 /*! 148 \fn SimpleGuesser::GuessPrimaryLanguage(char* text) 149 */ 150 Guess SimpleGuesser::GuessPrimaryLanguage(char* text) 151 { 152 vector<Guess> ret = GuessLanguage(text); 153 if(ret.size() > 0){ 154 return GuessLanguage(text)[0]; 155 } 156 else{ 157 return Guess(); 158 } 159 } 160 /** 161 * Is used to know which language is available, unavailable or both 162 * when mask = 0xF0, return only Available 163 * when mask = 0x0F, return only Unavailable 164 * when mask = 0xFF, return both Available and Unavailable 165 */ 166 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) 167 { 168 size_t i; 169 textcat_t *tables = (textcat_t*)h; 170 171 vector<Guess> lang; 172 if(!h){return lang;} 173 174 for (i=0; i<tables->size; i++) { 175 if(tables->fprint_disable[i] & mask){ 176 string langStr = "["; 177 langStr += (char*)fp_Name(tables->fprint[i]); 178 Guess g( (char *)langStr.c_str()); 179 lang.push_back(g); 180 } 181 } 182 183 return lang; 184 } 185 186 vector<Guess> SimpleGuesser::GetAvailableLanguages(){ 187 return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); 188 } 189 190 vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ 191 return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); 192 } 193 194 vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ 195 return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); 196 } 197 198 void SimpleGuesser::XableLanguage(string lang, char mask){ 199 size_t i; 200 textcat_t *tables = (textcat_t*)h; 201 202 if(!h){return;} 203 204 for (i=0; i<tables->size; i++) { 205 string language(fp_Name(tables->fprint[i])); 206 if(start(language,lang) == 0){ 207 //cout << language << endl; 208 tables->fprint_disable[i] = mask; 209 //continue; 210 } 211 } 212 } 213 214 void SimpleGuesser::EnableLanguage(string lang){ 215 XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); 216 } 217 218 void SimpleGuesser::DisableLanguage(string lang){ 219 XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); 220 } 221 222 /** 223 * 224 */ 225 void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ 226 if(h){ 227 textcat_Done(h); 228 } 229 h = special_textcat_Init(path, prefix); 230 } 231