xref: /aoo4110/main/l10ntools/source/wtratree.cxx (revision b1cdbd2c)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_l10ntools.hxx"
26 
27 
28 #include "wtratree.hxx"
29 
30 
31 
32 /** @ATTENTION
33 	For reasons of speed, class WordTransTree works with two simple
34 	char arrays, sOutput and sInput, instead of secure containers or
35 	streams. So be extremely careful, when changing this code!!!
36 **/
37 
38 
39 
40 // NOT FULLY DECLARED SERVICES
41 #include <string.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include "wtranode.hxx"
45 
46 
// Branch indices used to address the child slots of a WTT_Node.
// WordTransTree::SetCharSet() fills cChar2Branch with these values, and the
// constructor verifies at run time that BR_ALPHABASE and BR_MAX agree with
// C_BR_ALPHABASE and C_NR_OF_BRANCHES.
47 const BRANCH_T	BR_END			= 0;	// The backslash node's slot for this branch is explicitly cleared in the constructor.
48 const BRANCH_T	BR_NONALPHA     = 1;	// SetCharSet() gives every character this branch before assigning letters, umlauts and hotkey signs.
49 const BRANCH_T	BR_HOTKEY       = 2;	// Assigned to '~' and '&' in SetCharSet().
50 const BRANCH_T	BR_BACKSLASH    = 3;	// Value of the backslash node. NOTE(review): the character mapping is not set in SetCharSet() - presumably done in CalculateBranch(); confirm.
51 const BRANCH_T	BR_ALPHABASE    = 4;   	/// @ATTENTION  All branches not valid for words must be smaller than this value!
										// Letters 'a'..'z' and 'A'..'Z' map to BR_ALPHABASE + 0 .. BR_ALPHABASE + 25.
52 const BRANCH_T	BR_AE           = 30;	// Both cases of the MS-1252 bytes 0xC4 / 0xE4, converted to the working charset.
53 const BRANCH_T	BR_OE           = 31;	// Both cases of the MS-1252 bytes 0xD6 / 0xF6, converted to the working charset.
54 const BRANCH_T	BR_UE           = 32;	// Both cases of the MS-1252 bytes 0xDC / 0xFC, converted to the working charset.
55 const BRANCH_T	BR_SZ           = 33;	// The MS-1252 byte 0xDF, converted to the working charset.
56 const BRANCH_T	BR_MAX          = 34;	/// @ATTENTION  Must be updated always!
										// Has to equal C_NR_OF_BRANCHES, otherwise the constructor exits the program.
57 
// Value given to the top node of the parsing tree.
58 const BRANCH_T	BR_START 		= 0;
59 
60 
61 
62 
63 
WordTransTree(CharSet i_nWorkingCharSet)64 WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
65 	:	sInput(0),
66 		nInputLength(0),
67 		pInputEnd(0),
68 		sOutput(0),
69 		nOutputMaxLength(0),
70 		dpParsingTreeTop(0),
71 		pUnknownAlpha(0),
72 		// cChar2Branch
73         c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
74         c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
75 		pInputCurTokenStart(0),
76 		pInputPosition(0),
77 		pOutputPosition(0),
78 		pCurParseNode(0),
79 		eCurResult(OK),
80 		cCurHotkey(0),
81 		cCurHotkeySign(u_char('~'))
82 {
83 	// Initialize parsing tree:
84 	pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0);	// This will be deleted as part of the parsing tree.
85 	for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
86 	{
87 		pUnknownAlpha->SetBranch(i,pUnknownAlpha);
88 	}  // end for
89 
90 	dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
91 
92 	WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
93 
94 	dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
95 	dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
96 
97 	WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
98 	dpBackslash->SetBranch(BR_END,0);
99 
100 	dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
101 	dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
102 
103 
104 	// Initialize character set:
105 	SetCharSet(i_nWorkingCharSet);
106 
107 	if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
108 	{
109 		fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
110 		exit(1);
111 	}
112 }
113 
114 void
SetCharSet(CharSet i_nWorkingCharSet)115 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
116 {
117     ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
118 	const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
119 
120 	INT16 i = 0;
121 	for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
122 	{
123 		cChar2Branch[i] = BR_NONALPHA;
124 	}  // end for
125 	for ( i = 'a'; i <= 'z'; ++i )
126 	{
127 		cChar2Branch[i] = BR_ALPHABASE + i - 'a';
128 	}  // end for
129 	for ( i = 'A'; i <= 'Z'; ++i )
130 	{
131 		cChar2Branch[i] = BR_ALPHABASE + i - 'A';
132 	}  // end for
133 	cChar2Branch[pConvert[0]] = BR_AE;
134 	cChar2Branch[pConvert[1]] = BR_OE;
135 	cChar2Branch[pConvert[2]] = BR_UE;
136 	cChar2Branch[pConvert[3]] = BR_AE;
137 	cChar2Branch[pConvert[4]] = BR_OE;
138 	cChar2Branch[pConvert[5]] = BR_UE;
139 	cChar2Branch[pConvert[6]] = BR_SZ;
140 
141 	cChar2Branch[u_char('~')] = BR_HOTKEY;
142 	cChar2Branch[u_char('&')] = BR_HOTKEY;
143 
144 
145 	c_AE = pConvert[0];
146 	c_OE = pConvert[1];
147 	c_UE = pConvert[2];
148 	c_ae = pConvert[3];
149 	c_oe = pConvert[4];
150 	c_ue = pConvert[5];
151 }
152 
~WordTransTree()153 WordTransTree::~WordTransTree()
154 {
155 	delete dpParsingTreeTop;
156 	if (sOutput != 0)
157 		delete [] sOutput;
158 }
159 
160 void
AddWordPair(const ByteString & i_sOldString,const ByteString & i_sReplaceString)161 WordTransTree::AddWordPair(	const ByteString &		i_sOldString,
162 							const ByteString &		i_sReplaceString )
163 {
164 	if (i_sOldString.Len() == 0)
165 		return;
166 
167 	pCurParseNode = dpParsingTreeTop;
168 	WTT_Node * pBranch = 0;
169 	char cBranch = 0;
170 
171 	for ( constr pOld = i_sOldString.GetBuffer();
172 		  *pOld != 0;
173 		  pOld++ )
174 	{
175 		cBranch = CalculateBranch(*pOld);
176 		pBranch = pCurParseNode->GetNextNode(cBranch);
177 		if (pBranch == 0 || pBranch == pUnknownAlpha)
178 		{
179 			pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
180 			pCurParseNode->SetBranch(cBranch,pBranch);
181 		}
182 		pCurParseNode = pBranch;
183 	}	// end for
184 	pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
185 }
186 
187 void
InitTransformation(const char * i_sInput,UINT32 i_nInputLength,UINT32 i_nOutputMaxLength)188 WordTransTree::InitTransformation( const char *	i_sInput,
189 								   UINT32		i_nInputLength,
190 								   UINT32		i_nOutputMaxLength )
191 {
192 	sInput = (const u_char *)i_sInput;
193 	nInputLength = i_nInputLength;
194 	pInputEnd = &sInput[i_nInputLength];
195 
196 	pInputCurTokenStart = sInput;
197 	pInputPosition = sInput;
198 
199 	if (nOutputMaxLength < i_nOutputMaxLength)
200 	{
201 		if (sOutput != 0)
202 			delete [] sOutput;
203 		sOutput = new unsigned char[i_nOutputMaxLength];
204 		nOutputMaxLength = i_nOutputMaxLength;
205 	}
206 	pOutputPosition = sOutput;
207 }
208 
209 /**	pInputCurTokenStart and CurParseNode are updated just when
210 	starting this function. After its end they must not be changed
211 	till this functon is called again.
212 	Outside this function pInputPositon and pOutputPosition are both
213 	on the first not transformed char in their respective array.
214 **/
215 WordTransTree::E_Result
TransformNextToken()216 WordTransTree::TransformNextToken()
217 {
218 	pInputCurTokenStart = pInputPosition;
219 	pCurParseNode = dpParsingTreeTop;
220 	cCurHotkey = 0;
221     eCurResult = OK;
222 
223 	WTT_Node * pBranch = 0;
224 	UINT8 cBranch = 0;
225 
226 	for ( pCurParseNode = dpParsingTreeTop;
227 		  pInputPosition != pInputEnd;
228 		  ++pInputPosition )
229 	{
230 		cBranch = CalculateBranch(*pInputPosition);
231 		pBranch = pCurParseNode->GetNextNode( cBranch );
232 		if (pBranch != 0)
233 		{
234 			pCurParseNode = pBranch;
235 		}
236 		else
237 		{
238 			if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
239 			{
240 				// Logic of the following. There are 9 possible cases -
241 				// A = alphabetic letter, NA = non alphabetic, TB = token begin,
242 				// Eot = end of text:
243 				//	 1.	A~A          set hotkey to following letter, continue
244 				//	 2.	A~NA         token end
245 				//	 3.	A~Eot        token end
246 				//	 4.	NA~A         token end
247 				//	 5.	NA~NA        continue
248 				//	 6.	A~Eof        continue
249 				//	 7.	TB~A         set hotkey to following letter, continue
250 				//	 8.	TB~NA        continue
251 				//	 9.	TB~Eot       continue
252 
253 				// bNext and Prev are true, if there are alphabetic letters:
254 				sal_Bool bNext =  pInputPosition + 1 != pInputEnd
255 									?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
256 									: 	sal_False;
257 				sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
258 
259 				if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
260 				{   // case 1. and 7.
261 					Handle_Hotkey();
262 					continue;
263 				}
264 				else if  (!bPrev && !bNext)
265 				{   // case 5.,6.,8.,9.
266 					continue;
267 				}
268 
269 				// Case 2.,3.,4. :
270 				// 	so this should be handled as an end of a token.
271 			}
272 			if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
273 			{
274 				Handle_TokenToKeep();
275 				return eCurResult;
276 			}
277 			else
278 			{
279 				Handle_TokenToTransform();
280 				return eCurResult;
281 			}	// endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
282 		} 	// endif (pBranch == 0) else
283 	}	// end for
284 
285 	// If here, the text end is reached
286 	if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
287 	{
288 		Handle_TokenToKeep();
289 		return eCurResult;
290 	}
291 	else
292 	{
293 		Handle_TokenToTransform();
294 		return eCurResult;
295 	}
296 }
297 
298 ByteString
CurReplacingString() const299 WordTransTree::CurReplacingString() const
300 {
301 	return pCurParseNode->ReplaceString();
302 }
303 
304 void
Handle_Hotkey()305 WordTransTree::Handle_Hotkey()
306 {
307 	if (cCurHotkey == 0) 	// Avoid to replace the first found hotkey by
308 	                        //   a later one - though this shouldn't happen anyway.
309 	{
310 		cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
311 		cCurHotkeySign = *pInputPosition;
312 	}
313 }
314 
315 void
Handle_TokenToKeep()316 WordTransTree::Handle_TokenToKeep()
317 {
318 	UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
319 
320 	memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
321 
322 	pOutputPosition += nTokenLength;
323 	*pOutputPosition = '\0';
324 }
325 
326 void
Handle_TokenToTransform()327 WordTransTree::Handle_TokenToTransform()
328 {
329 	sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
330 	const ByteString & rReplace = pCurParseNode->ReplaceString();
331 
332 	// Find position of hotkey in replace-string:
333 	sal_uInt16 nHotkeyPos = bHaveHotkey
334 							?	rReplace.Search(char(cCurHotkey))
335 							:	STRING_NOTFOUND;
336 	if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
337 	{
338 		if (cCurHotkey < 128)
339 		{
340 			if (islower(cCurHotkey))
341 				nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
342 			else
343 				nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
344 		}
345 		else	// cCurHotkey >= 128
346 		{
347 			if (cCurHotkey == c_ae)
348 				nHotkeyPos = rReplace.Search(char(c_AE));
349 			else if (cCurHotkey == c_oe)
350 				nHotkeyPos = rReplace.Search(char(c_OE));
351 			else if (cCurHotkey == c_ue)
352 				nHotkeyPos = rReplace.Search(char(c_UE));
353 			else if (cCurHotkey == c_AE)
354 				nHotkeyPos = rReplace.Search(char(c_ae));
355 			else if (cCurHotkey == c_OE)
356 				nHotkeyPos = rReplace.Search(char(c_oe));
357 			else if (cCurHotkey == c_UE)
358 				nHotkeyPos = rReplace.Search(char(c_ue));
359 		}	// endif (cCurHotkey < 128) else
360 
361 		if (nHotkeyPos == STRING_NOTFOUND)
362 		{
363 			eCurResult = HOTKEY_LOST;
364 			bHaveHotkey = sal_False;
365 		}
366 	} 	// endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
367 
368 
369 	UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
370 
371 	if (bHaveHotkey)
372 	{
373 		memcpy( pOutputPosition,
374 				pCurParseNode->ReplaceString().GetBuffer(),
375 				nHotkeyPos );
376 		*(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
377 		memcpy( pOutputPosition + nHotkeyPos + 1,
378 				pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
379 				nOutputTokenLength - nHotkeyPos - 1);
380 	}
381 	else
382 	{
383 		memcpy( pOutputPosition,
384 				pCurParseNode->ReplaceString().GetBuffer(),
385 				nOutputTokenLength );
386 	}
387 
388 	// Convert first letter into upper if necessary:
389 	u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
390 							? 	pInputCurTokenStart[1]
391 							:	pInputCurTokenStart[0] ;
392 	u_char * pOutStart = nHotkeyPos == 0
393 							? 	pOutputPosition + 1
394 							:	pOutputPosition ;
395 	if (isupper(cInStart) || cInStart > 127)
396 	{   // Possibly cInStart is upper character:
397 		if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
398 		{	// Surely cInStart is upper character:
399 			u_char cOutStart = *pOutStart;
400 			if (cOutStart < 128)
401 				*pOutStart = toupper(cOutStart);
402 			else if (cOutStart == c_ae)
403 				*pOutStart = c_AE;
404 			else if (cOutStart == c_oe)
405 				*pOutStart = c_OE;
406 			else if (cOutStart == c_ue)
407 				*pOutStart = c_UE;
408 		}
409 	}  	// endif (isupper(cInStart) || cInStart > 127)
410 
411 	pOutputPosition += nOutputTokenLength;
412 	*pOutputPosition = '\0';
413 }
414 
415