1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_l10ntools.hxx" 30 31 32 #include "wtratree.hxx" 33 34 35 36 /** @ATTENTION 37 For reasons of speed, class WordTransTree works with two simple 38 char arrays, sOutput and sInput, instead of secure containers or 39 streams. So be extremely careful, when changing this code!!! 40 **/ 41 42 43 44 // NOT FULLY DECLARED SERVICES 45 #include <string.h> 46 #include <stdio.h> 47 #include <ctype.h> 48 #include "wtranode.hxx" 49 50 51 const BRANCH_T BR_END = 0; 52 const BRANCH_T BR_NONALPHA = 1; 53 const BRANCH_T BR_HOTKEY = 2; 54 const BRANCH_T BR_BACKSLASH = 3; 55 const BRANCH_T BR_ALPHABASE = 4; /// @ATTENTION All branches not valid for words must be smaller than this value! 56 const BRANCH_T BR_AE = 30; 57 const BRANCH_T BR_OE = 31; 58 const BRANCH_T BR_UE = 32; 59 const BRANCH_T BR_SZ = 33; 60 const BRANCH_T BR_MAX = 34; /// @ATTENTION Must be updated always! 61 62 const BRANCH_T BR_START = 0; 63 64 65 66 67 68 WordTransTree::WordTransTree(CharSet i_nWorkingCharSet) 69 : sInput(0), 70 nInputLength(0), 71 pInputEnd(0), 72 sOutput(0), 73 nOutputMaxLength(0), 74 dpParsingTreeTop(0), 75 pUnknownAlpha(0), 76 // cChar2Branch 77 c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')), 78 c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')), 79 pInputCurTokenStart(0), 80 pInputPosition(0), 81 pOutputPosition(0), 82 pCurParseNode(0), 83 eCurResult(OK), 84 cCurHotkey(0), 85 cCurHotkeySign(u_char('~')) 86 { 87 // Initialize parsing tree: 88 pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree. 89 for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++) 90 { 91 pUnknownAlpha->SetBranch(i,pUnknownAlpha); 92 } // end for 93 94 dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha); 95 96 WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0); 97 98 dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha); 99 dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha); 100 101 WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha); 102 dpBackslash->SetBranch(BR_END,0); 103 104 dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash); 105 dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash); 106 107 108 // Initialize character set: 109 SetCharSet(i_nWorkingCharSet); 110 111 if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX) 112 { 113 fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__, __LINE__); 114 exit(1); 115 } 116 } 117 118 void 119 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet) 120 { 121 ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF"); 122 const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() ); 123 124 INT16 i = 0; 125 for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i ) 126 { 127 cChar2Branch[i] = BR_NONALPHA; 128 } // end for 129 for ( i = 'a'; i <= 'z'; ++i ) 130 { 131 cChar2Branch[i] = BR_ALPHABASE + i - 'a'; 132 } // end for 133 for ( i = 'A'; i <= 'Z'; ++i ) 134 { 135 cChar2Branch[i] = BR_ALPHABASE + i - 'A'; 136 } // end for 137 cChar2Branch[pConvert[0]] = BR_AE; 138 cChar2Branch[pConvert[1]] = BR_OE; 139 cChar2Branch[pConvert[2]] = BR_UE; 140 cChar2Branch[pConvert[3]] = BR_AE; 141 cChar2Branch[pConvert[4]] = BR_OE; 142 cChar2Branch[pConvert[5]] = BR_UE; 143 cChar2Branch[pConvert[6]] = BR_SZ; 144 145 cChar2Branch[u_char('~')] = BR_HOTKEY; 146 cChar2Branch[u_char('&')] = BR_HOTKEY; 147 148 149 c_AE = pConvert[0]; 150 c_OE = pConvert[1]; 151 c_UE = pConvert[2]; 152 c_ae = pConvert[3]; 153 c_oe = pConvert[4]; 154 c_ue = pConvert[5]; 155 } 156 157 WordTransTree::~WordTransTree() 158 { 159 delete dpParsingTreeTop; 160 if (sOutput != 0) 161 delete [] sOutput; 162 } 163 164 void 165 WordTransTree::AddWordPair( const ByteString & i_sOldString, 166 const ByteString & i_sReplaceString ) 167 { 168 if (i_sOldString.Len() == 0) 169 return; 170 171 pCurParseNode = dpParsingTreeTop; 172 WTT_Node * pBranch = 0; 173 char cBranch = 0; 174 175 for ( constr pOld = i_sOldString.GetBuffer(); 176 *pOld != 0; 177 pOld++ ) 178 { 179 cBranch = CalculateBranch(*pOld); 180 pBranch = pCurParseNode->GetNextNode(cBranch); 181 if (pBranch == 0 || pBranch == pUnknownAlpha) 182 { 183 pBranch = new WTT_Node(cBranch,0,pUnknownAlpha); 184 pCurParseNode->SetBranch(cBranch,pBranch); 185 } 186 pCurParseNode = pBranch; 187 } // end for 188 pCurParseNode->SetAsTokenToReplace(i_sReplaceString); 189 } 190 191 void 192 WordTransTree::InitTransformation( const char * i_sInput, 193 UINT32 i_nInputLength, 194 UINT32 i_nOutputMaxLength ) 195 { 196 sInput = (const u_char *)i_sInput; 197 nInputLength = i_nInputLength; 198 pInputEnd = &sInput[i_nInputLength]; 199 200 pInputCurTokenStart = sInput; 201 pInputPosition = sInput; 202 203 if (nOutputMaxLength < i_nOutputMaxLength) 204 { 205 if (sOutput != 0) 206 delete [] sOutput; 207 sOutput = new unsigned char[i_nOutputMaxLength]; 208 nOutputMaxLength = i_nOutputMaxLength; 209 } 210 pOutputPosition = sOutput; 211 } 212 213 /** pInputCurTokenStart and CurParseNode are updated just when 214 starting this function. After its end they must not be changed 215 till this functon is called again. 216 Outside this function pInputPositon and pOutputPosition are both 217 on the first not transformed char in their respective array. 218 **/ 219 WordTransTree::E_Result 220 WordTransTree::TransformNextToken() 221 { 222 pInputCurTokenStart = pInputPosition; 223 pCurParseNode = dpParsingTreeTop; 224 cCurHotkey = 0; 225 eCurResult = OK; 226 227 WTT_Node * pBranch = 0; 228 UINT8 cBranch = 0; 229 230 for ( pCurParseNode = dpParsingTreeTop; 231 pInputPosition != pInputEnd; 232 ++pInputPosition ) 233 { 234 cBranch = CalculateBranch(*pInputPosition); 235 pBranch = pCurParseNode->GetNextNode( cBranch ); 236 if (pBranch != 0) 237 { 238 pCurParseNode = pBranch; 239 } 240 else 241 { 242 if (cBranch == BR_HOTKEY) // current letter is '~' or '&'. 243 { 244 // Logic of the following. There are 9 possible cases - 245 // A = alphabetic letter, NA = non alphabetic, TB = token begin, 246 // Eot = end of text: 247 // 1. A~A set hotkey to following letter, continue 248 // 2. A~NA token end 249 // 3. A~Eot token end 250 // 4. NA~A token end 251 // 5. NA~NA continue 252 // 6. A~Eof continue 253 // 7. TB~A set hotkey to following letter, continue 254 // 8. TB~NA continue 255 // 9. TB~Eot continue 256 257 // bNext and Prev are true, if there are alphabetic letters: 258 sal_Bool bNext = pInputPosition + 1 != pInputEnd 259 ? CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE 260 : sal_False; 261 sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE; 262 263 if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) ) 264 { // case 1. and 7. 265 Handle_Hotkey(); 266 continue; 267 } 268 else if (!bPrev && !bNext) 269 { // case 5.,6.,8.,9. 270 continue; 271 } 272 273 // Case 2.,3.,4. : 274 // so this should be handled as an end of a token. 275 } 276 if (pCurParseNode->TokenType() == WTT_Node::token_to_keep) 277 { 278 Handle_TokenToKeep(); 279 return eCurResult; 280 } 281 else 282 { 283 Handle_TokenToTransform(); 284 return eCurResult; 285 } // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep) 286 } // endif (pBranch == 0) else 287 } // end for 288 289 // If here, the text end is reached 290 if (pCurParseNode->TokenType() == WTT_Node::token_to_keep) 291 { 292 Handle_TokenToKeep(); 293 return eCurResult; 294 } 295 else 296 { 297 Handle_TokenToTransform(); 298 return eCurResult; 299 } 300 } 301 302 ByteString 303 WordTransTree::CurReplacingString() const 304 { 305 return pCurParseNode->ReplaceString(); 306 } 307 308 void 309 WordTransTree::Handle_Hotkey() 310 { 311 if (cCurHotkey == 0) // Avoid to replace the first found hotkey by 312 // a later one - though this shouldn't happen anyway. 313 { 314 cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0; 315 cCurHotkeySign = *pInputPosition; 316 } 317 } 318 319 void 320 WordTransTree::Handle_TokenToKeep() 321 { 322 UINT32 nTokenLength = pInputPosition-pInputCurTokenStart; 323 324 memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength); 325 326 pOutputPosition += nTokenLength; 327 *pOutputPosition = '\0'; 328 } 329 330 void 331 WordTransTree::Handle_TokenToTransform() 332 { 333 sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE; 334 const ByteString & rReplace = pCurParseNode->ReplaceString(); 335 336 // Find position of hotkey in replace-string: 337 sal_uInt16 nHotkeyPos = bHaveHotkey 338 ? rReplace.Search(char(cCurHotkey)) 339 : STRING_NOTFOUND; 340 if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey) 341 { 342 if (cCurHotkey < 128) 343 { 344 if (islower(cCurHotkey)) 345 nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey))); 346 else 347 nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey))); 348 } 349 else // cCurHotkey >= 128 350 { 351 if (cCurHotkey == c_ae) 352 nHotkeyPos = rReplace.Search(char(c_AE)); 353 else if (cCurHotkey == c_oe) 354 nHotkeyPos = rReplace.Search(char(c_OE)); 355 else if (cCurHotkey == c_ue) 356 nHotkeyPos = rReplace.Search(char(c_UE)); 357 else if (cCurHotkey == c_AE) 358 nHotkeyPos = rReplace.Search(char(c_ae)); 359 else if (cCurHotkey == c_OE) 360 nHotkeyPos = rReplace.Search(char(c_oe)); 361 else if (cCurHotkey == c_UE) 362 nHotkeyPos = rReplace.Search(char(c_ue)); 363 } // endif (cCurHotkey < 128) else 364 365 if (nHotkeyPos == STRING_NOTFOUND) 366 { 367 eCurResult = HOTKEY_LOST; 368 bHaveHotkey = sal_False; 369 } 370 } // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey) 371 372 373 UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0); 374 375 if (bHaveHotkey) 376 { 377 memcpy( pOutputPosition, 378 pCurParseNode->ReplaceString().GetBuffer(), 379 nHotkeyPos ); 380 *(pOutputPosition + nHotkeyPos) = cCurHotkeySign; 381 memcpy( pOutputPosition + nHotkeyPos + 1, 382 pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos, 383 nOutputTokenLength - nHotkeyPos - 1); 384 } 385 else 386 { 387 memcpy( pOutputPosition, 388 pCurParseNode->ReplaceString().GetBuffer(), 389 nOutputTokenLength ); 390 } 391 392 // Convert first letter into upper if necessary: 393 u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY 394 ? pInputCurTokenStart[1] 395 : pInputCurTokenStart[0] ; 396 u_char * pOutStart = nHotkeyPos == 0 397 ? pOutputPosition + 1 398 : pOutputPosition ; 399 if (isupper(cInStart) || cInStart > 127) 400 { // Possibly cInStart is upper character: 401 if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE) 402 { // Surely cInStart is upper character: 403 u_char cOutStart = *pOutStart; 404 if (cOutStart < 128) 405 *pOutStart = toupper(cOutStart); 406 else if (cOutStart == c_ae) 407 *pOutStart = c_AE; 408 else if (cOutStart == c_oe) 409 *pOutStart = c_OE; 410 else if (cOutStart == c_ue) 411 *pOutStart = c_UE; 412 } 413 } // endif (isupper(cInStart) || cInStart > 127) 414 415 pOutputPosition += nOutputTokenLength; 416 *pOutputPosition = '\0'; 417 } 418 419