xref: /trunk/main/l10ntools/source/wtratree.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_l10ntools.hxx"
30 
31 
32 #include "wtratree.hxx"
33 
34 
35 
36 /** @ATTENTION
37     For reasons of speed, class WordTransTree works with two simple
38     char arrays, sOutput and sInput, instead of secure containers or
39     streams. So be extremely careful, when changing this code!!!
40 **/
41 
42 
43 
44 // NOT FULLY DECLARED SERVICES
45 #include <string.h>
46 #include <stdio.h>
47 #include <ctype.h>
48 #include "wtranode.hxx"
49 
50 
// Branch indices used by the parsing tree. Every input character is mapped
// to one of these values through cChar2Branch (filled in SetCharSet()), and
// a node is asked for its successor with that value.
51 const BRANCH_T  BR_END          = 0;    // The constructor clears this branch on the backslash node.
52 const BRANCH_T  BR_NONALPHA     = 1;    // Default branch of every character not mapped otherwise.
53 const BRANCH_T  BR_HOTKEY       = 2;    // Branch of the hotkey signs '~' and '&'.
54 const BRANCH_T  BR_BACKSLASH    = 3;    // Branch of the backslash node. NOTE(review): SetCharSet() maps no character to it - confirm.
55 const BRANCH_T  BR_ALPHABASE    = 4;    /// @ATTENTION  All branches not valid for words must be smaller than this value! 'a'/'A' map to this value, 'z'/'Z' to BR_ALPHABASE + 25.
56 const BRANCH_T  BR_AE           = 30;   // Branch of the converted characters 0xC4 and 0xE4 (MS-1252).
57 const BRANCH_T  BR_OE           = 31;   // Branch of the converted characters 0xD6 and 0xF6 (MS-1252).
58 const BRANCH_T  BR_UE           = 32;   // Branch of the converted characters 0xDC and 0xFC (MS-1252).
59 const BRANCH_T  BR_SZ           = 33;   // Branch of the converted character 0xDF (MS-1252).
60 const BRANCH_T  BR_MAX          = 34;   /// @ATTENTION  Must be updated always! The constructor checks it against C_NR_OF_BRANCHES.
61 
62 const BRANCH_T  BR_START        = 0;    // Value given to the top node of the parsing tree.
63 
64 
65 
66 
67 
68 WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
69     :   sInput(0),
70         nInputLength(0),
71         pInputEnd(0),
72         sOutput(0),
73         nOutputMaxLength(0),
74         dpParsingTreeTop(0),
75         pUnknownAlpha(0),
76         // cChar2Branch
77         c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
78         c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
79         pInputCurTokenStart(0),
80         pInputPosition(0),
81         pOutputPosition(0),
82         pCurParseNode(0),
83         eCurResult(OK),
84         cCurHotkey(0),
85         cCurHotkeySign(u_char('~'))
86 {
87     // Initialize parsing tree:
88     pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree.
89     for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
90     {
91         pUnknownAlpha->SetBranch(i,pUnknownAlpha);
92     }  // end for
93 
94     dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
95 
96     WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
97 
98     dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
99     dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
100 
101     WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
102     dpBackslash->SetBranch(BR_END,0);
103 
104     dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
105     dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
106 
107 
108     // Initialize character set:
109     SetCharSet(i_nWorkingCharSet);
110 
111     if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
112     {
113         fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
114         exit(1);
115     }
116 }
117 
118 void
119 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
120 {
121     ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
122     const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
123 
124     INT16 i = 0;
125     for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
126     {
127         cChar2Branch[i] = BR_NONALPHA;
128     }  // end for
129     for ( i = 'a'; i <= 'z'; ++i )
130     {
131         cChar2Branch[i] = BR_ALPHABASE + i - 'a';
132     }  // end for
133     for ( i = 'A'; i <= 'Z'; ++i )
134     {
135         cChar2Branch[i] = BR_ALPHABASE + i - 'A';
136     }  // end for
137     cChar2Branch[pConvert[0]] = BR_AE;
138     cChar2Branch[pConvert[1]] = BR_OE;
139     cChar2Branch[pConvert[2]] = BR_UE;
140     cChar2Branch[pConvert[3]] = BR_AE;
141     cChar2Branch[pConvert[4]] = BR_OE;
142     cChar2Branch[pConvert[5]] = BR_UE;
143     cChar2Branch[pConvert[6]] = BR_SZ;
144 
145     cChar2Branch[u_char('~')] = BR_HOTKEY;
146     cChar2Branch[u_char('&')] = BR_HOTKEY;
147 
148 
149     c_AE = pConvert[0];
150     c_OE = pConvert[1];
151     c_UE = pConvert[2];
152     c_ae = pConvert[3];
153     c_oe = pConvert[4];
154     c_ue = pConvert[5];
155 }
156 
157 WordTransTree::~WordTransTree()
158 {
159     delete dpParsingTreeTop;
160     if (sOutput != 0)
161         delete [] sOutput;
162 }
163 
164 void
165 WordTransTree::AddWordPair( const ByteString &      i_sOldString,
166                             const ByteString &      i_sReplaceString )
167 {
168     if (i_sOldString.Len() == 0)
169         return;
170 
171     pCurParseNode = dpParsingTreeTop;
172     WTT_Node * pBranch = 0;
173     char cBranch = 0;
174 
175     for ( constr pOld = i_sOldString.GetBuffer();
176           *pOld != 0;
177           pOld++ )
178     {
179         cBranch = CalculateBranch(*pOld);
180         pBranch = pCurParseNode->GetNextNode(cBranch);
181         if (pBranch == 0 || pBranch == pUnknownAlpha)
182         {
183             pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
184             pCurParseNode->SetBranch(cBranch,pBranch);
185         }
186         pCurParseNode = pBranch;
187     }   // end for
188     pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
189 }
190 
191 void
192 WordTransTree::InitTransformation( const char * i_sInput,
193                                    UINT32       i_nInputLength,
194                                    UINT32       i_nOutputMaxLength )
195 {
196     sInput = (const u_char *)i_sInput;
197     nInputLength = i_nInputLength;
198     pInputEnd = &sInput[i_nInputLength];
199 
200     pInputCurTokenStart = sInput;
201     pInputPosition = sInput;
202 
203     if (nOutputMaxLength < i_nOutputMaxLength)
204     {
205         if (sOutput != 0)
206             delete [] sOutput;
207         sOutput = new unsigned char[i_nOutputMaxLength];
208         nOutputMaxLength = i_nOutputMaxLength;
209     }
210     pOutputPosition = sOutput;
211 }
212 
213 /** pInputCurTokenStart and CurParseNode are updated just when
214     starting this function. After its end they must not be changed
215     till this functon is called again.
216     Outside this function pInputPositon and pOutputPosition are both
217     on the first not transformed char in their respective array.
218 **/
219 WordTransTree::E_Result
220 WordTransTree::TransformNextToken()
221 {
222     pInputCurTokenStart = pInputPosition;
223     pCurParseNode = dpParsingTreeTop;
224     cCurHotkey = 0;
225     eCurResult = OK;
226 
227     WTT_Node * pBranch = 0;
228     UINT8 cBranch = 0;
229 
230     for ( pCurParseNode = dpParsingTreeTop;
231           pInputPosition != pInputEnd;
232           ++pInputPosition )
233     {
234         cBranch = CalculateBranch(*pInputPosition);
235         pBranch = pCurParseNode->GetNextNode( cBranch );
236         if (pBranch != 0)
237         {
238             pCurParseNode = pBranch;
239         }
240         else
241         {
242             if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
243             {
244                 // Logic of the following. There are 9 possible cases -
245                 // A = alphabetic letter, NA = non alphabetic, TB = token begin,
246                 // Eot = end of text:
247                 //   1. A~A          set hotkey to following letter, continue
248                 //   2. A~NA         token end
249                 //   3. A~Eot        token end
250                 //   4. NA~A         token end
251                 //   5. NA~NA        continue
252                 //   6. A~Eof        continue
253                 //   7. TB~A         set hotkey to following letter, continue
254                 //   8. TB~NA        continue
255                 //   9. TB~Eot       continue
256 
257                 // bNext and Prev are true, if there are alphabetic letters:
258                 sal_Bool bNext =  pInputPosition + 1 != pInputEnd
259                                     ?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
260                                     :   sal_False;
261                 sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
262 
263                 if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
264                 {   // case 1. and 7.
265                     Handle_Hotkey();
266                     continue;
267                 }
268                 else if  (!bPrev && !bNext)
269                 {   // case 5.,6.,8.,9.
270                     continue;
271                 }
272 
273                 // Case 2.,3.,4. :
274                 //  so this should be handled as an end of a token.
275             }
276             if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
277             {
278                 Handle_TokenToKeep();
279                 return eCurResult;
280             }
281             else
282             {
283                 Handle_TokenToTransform();
284                 return eCurResult;
285             }   // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
286         }   // endif (pBranch == 0) else
287     }   // end for
288 
289     // If here, the text end is reached
290     if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
291     {
292         Handle_TokenToKeep();
293         return eCurResult;
294     }
295     else
296     {
297         Handle_TokenToTransform();
298         return eCurResult;
299     }
300 }
301 
302 ByteString
303 WordTransTree::CurReplacingString() const
304 {
305     return pCurParseNode->ReplaceString();
306 }
307 
308 void
309 WordTransTree::Handle_Hotkey()
310 {
311     if (cCurHotkey == 0)    // Avoid to replace the first found hotkey by
312                             //   a later one - though this shouldn't happen anyway.
313     {
314         cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
315         cCurHotkeySign = *pInputPosition;
316     }
317 }
318 
319 void
320 WordTransTree::Handle_TokenToKeep()
321 {
322     UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
323 
324     memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
325 
326     pOutputPosition += nTokenLength;
327     *pOutputPosition = '\0';
328 }
329 
330 void
331 WordTransTree::Handle_TokenToTransform()
332 {
333     sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
334     const ByteString & rReplace = pCurParseNode->ReplaceString();
335 
336     // Find position of hotkey in replace-string:
337     sal_uInt16 nHotkeyPos = bHaveHotkey
338                             ?   rReplace.Search(char(cCurHotkey))
339                             :   STRING_NOTFOUND;
340     if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
341     {
342         if (cCurHotkey < 128)
343         {
344             if (islower(cCurHotkey))
345                 nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
346             else
347                 nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
348         }
349         else    // cCurHotkey >= 128
350         {
351             if (cCurHotkey == c_ae)
352                 nHotkeyPos = rReplace.Search(char(c_AE));
353             else if (cCurHotkey == c_oe)
354                 nHotkeyPos = rReplace.Search(char(c_OE));
355             else if (cCurHotkey == c_ue)
356                 nHotkeyPos = rReplace.Search(char(c_UE));
357             else if (cCurHotkey == c_AE)
358                 nHotkeyPos = rReplace.Search(char(c_ae));
359             else if (cCurHotkey == c_OE)
360                 nHotkeyPos = rReplace.Search(char(c_oe));
361             else if (cCurHotkey == c_UE)
362                 nHotkeyPos = rReplace.Search(char(c_ue));
363         }   // endif (cCurHotkey < 128) else
364 
365         if (nHotkeyPos == STRING_NOTFOUND)
366         {
367             eCurResult = HOTKEY_LOST;
368             bHaveHotkey = sal_False;
369         }
370     }   // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
371 
372 
373     UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
374 
375     if (bHaveHotkey)
376     {
377         memcpy( pOutputPosition,
378                 pCurParseNode->ReplaceString().GetBuffer(),
379                 nHotkeyPos );
380         *(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
381         memcpy( pOutputPosition + nHotkeyPos + 1,
382                 pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
383                 nOutputTokenLength - nHotkeyPos - 1);
384     }
385     else
386     {
387         memcpy( pOutputPosition,
388                 pCurParseNode->ReplaceString().GetBuffer(),
389                 nOutputTokenLength );
390     }
391 
392     // Convert first letter into upper if necessary:
393     u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
394                             ?   pInputCurTokenStart[1]
395                             :   pInputCurTokenStart[0] ;
396     u_char * pOutStart = nHotkeyPos == 0
397                             ?   pOutputPosition + 1
398                             :   pOutputPosition ;
399     if (isupper(cInStart) || cInStart > 127)
400     {   // Possibly cInStart is upper character:
401         if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
402         {   // Surely cInStart is upper character:
403             u_char cOutStart = *pOutStart;
404             if (cOutStart < 128)
405                 *pOutStart = toupper(cOutStart);
406             else if (cOutStart == c_ae)
407                 *pOutStart = c_AE;
408             else if (cOutStart == c_oe)
409                 *pOutStart = c_OE;
410             else if (cOutStart == c_ue)
411                 *pOutStart = c_UE;
412         }
413     }   // endif (isupper(cInStart) || cInStart > 127)
414 
415     pOutputPosition += nOutputTokenLength;
416     *pOutputPosition = '\0';
417 }
418 
419