1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include "convertgb18030.h"
25 #include "context.h"
26 #include "converter.h"
27 #include "tenchelp.h"
28 #include "unichars.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
31 #include "sal/types.h"
32
/* Position of the GB 18030 decoder within the byte sequence of the character
   currently being read. */
typedef enum
{
    /* No bytes of a multi-byte character are pending. */
    IMPL_GB_18030_TO_UNICODE_STATE_0 = 0,
    /* A lead byte (0x81..0xFE) has been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_1 = 1,
    /* A lead byte and a digit byte (0x30..0x39) have been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_2 = 2,
    /* Three bytes of a four-byte sequence have been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_3 = 3
} ImplGb18030ToUnicodeState;
40
41 typedef struct
42 {
43 ImplGb18030ToUnicodeState m_eState;
44 sal_uInt32 m_nCode;
45 } ImplGb18030ToUnicodeContext;
46
ImplCreateGb18030ToUnicodeContext(void)47 void * ImplCreateGb18030ToUnicodeContext(void)
48 {
49 void * pContext
50 = rtl_allocateMemory(sizeof (ImplGb18030ToUnicodeContext));
51 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
52 = IMPL_GB_18030_TO_UNICODE_STATE_0;
53 return pContext;
54 }
55
ImplResetGb18030ToUnicodeContext(void * pContext)56 void ImplResetGb18030ToUnicodeContext(void * pContext)
57 {
58 if (pContext)
59 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
60 = IMPL_GB_18030_TO_UNICODE_STATE_0;
61 }
62
/*
 * Convert GB 18030 encoded bytes to UTF-16 code units.
 *
 * pData         converter data, read as ImplGb18030ConverterData: supplies
 *               the lookup table and the list of linear four-byte ranges
 * pContext      optional ImplGb18030ToUnicodeContext; when non-null the
 *               decoder state is loaded from it on entry and stored back
 *               into it on exit, so a character may span several calls
 * pSrcBuf       source bytes
 * nSrcBytes     number of bytes available at pSrcBuf
 * pDestBuf      destination buffer
 * nDestChars    capacity of pDestBuf, in sal_Unicode units
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; passed on to the bad-input
 *               handler, and RTL_TEXTTOUNICODE_FLAGS_FLUSH decides how a
 *               character that is cut off at the end of the input is treated
 * pInfo         optional; receives the accumulated RTL_TEXTTOUNICODE_INFO_*
 *               bits
 * pSrcCvtBytes  optional; receives the number of source bytes consumed
 *
 * Returns the number of sal_Unicode units written to pDestBuf.
 *
 * When the destination is full, the byte that could not be converted is not
 * counted as consumed and the saved state is exactly what it was before that
 * byte was read, so the caller can present the same byte again.
 */
sal_Size ImplConvertGb18030ToUnicode(ImplTextConverterData const * pData,
                                     void * pContext,
                                     sal_Char const * pSrcBuf,
                                     sal_Size nSrcBytes,
                                     sal_Unicode * pDestBuf,
                                     sal_Size nDestChars,
                                     sal_uInt32 nFlags,
                                     sal_uInt32 * pInfo,
                                     sal_Size * pSrcCvtBytes)
{
    sal_Unicode const * pGb18030Data
        = ((ImplGb18030ConverterData const *) pData)->m_pGb18030ToUnicodeData;
    ImplGb180302000ToUnicodeRange const * pGb18030Ranges
        = ((ImplGb18030ConverterData const *) pData)->
              m_pGb18030ToUnicodeRanges;
    ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
    sal_uInt32 nCode = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode * pDestBufPtr = pDestBuf;
    sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;

    /* Resume a character left unfinished by a previous call. */
    if (pContext)
    {
        eState = ((ImplGb18030ToUnicodeContext *) pContext)->m_eState;
        nCode = ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode;
    }

    for (; nConverted < nSrcBytes; ++nConverted)
    {
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
        switch (eState)
        {
        case IMPL_GB_18030_TO_UNICODE_STATE_0:
            if (nChar < 0x80)
            {
                /* Single byte, copied through unchanged. */
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = (sal_Unicode) nChar;
                else
                    goto no_output;
            }
            else if (nChar == 0x80)
                goto bad_input;
            else if (nChar <= 0xFE)
            {
                /* Lead byte of a two- or four-byte sequence. */
                nCode = nChar - 0x81;
                eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_1:
            if (nChar >= 0x30 && nChar <= 0x39)
            {
                /* A digit here means a four-byte sequence. */
                nCode = nCode * 10 + (nChar - 0x30);
                eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
            }
            else if ((nChar >= 0x40 && nChar <= 0x7E)
                     || (nChar >= 0x80 && nChar <= 0xFE))
            {
                /* Two-byte sequence: 190 trail values per lead byte.  The
                   index is kept in a local so that nCode is still intact if
                   the destination is full and this byte is seen again. */
                sal_uInt32 nIndex
                    = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
                                                     nChar - 0x80 + 63);
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = pGb18030Data[nIndex];
                else
                    goto no_output;
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_2:
            if (nChar >= 0x81 && nChar <= 0xFE)
            {
                nCode = nCode * 126 + (nChar - 0x81);
                eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_3:
            if (nChar >= 0x30 && nChar <= 0x39)
            {
                /* Linear value of the complete four-byte sequence.  Kept in
                   a local so that nCode is still intact if no output can be
                   produced and this byte is seen again. */
                sal_uInt32 nLinear = nCode * 10 + (nChar - 0x30);

                /* 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF: */
                if (nLinear >= 189000 && nLinear <= 1237575)
                {
                    if (pDestBufEnd - pDestBufPtr >= 2)
                    {
                        sal_uInt32 nUtf32 = nLinear - (189000 - 0x10000);
                        *pDestBufPtr++
                            = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
                        *pDestBufPtr++
                            = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
                    }
                    else
                        goto no_output;
                }
                else
                {
                    /* Walk the range list.  Values below a range's first
                       linear code are looked up in the table; values inside
                       a range map linearly onto Unicode. */
                    ImplGb180302000ToUnicodeRange const * pRange
                        = pGb18030Ranges;
                    sal_uInt32 nFirstNonRange = 0;
                    for (;;)
                    {
                        /* An entry with index -1 ends the list. */
                        if (pRange->m_nNonRangeDataIndex == -1)
                            goto bad_input;
                        else if (nLinear < pRange->m_nFirstLinear)
                        {
                            if (pDestBufPtr != pDestBufEnd)
                                *pDestBufPtr++
                                    = pGb18030Data[
                                        pRange->m_nNonRangeDataIndex
                                            + (nLinear - nFirstNonRange)];
                            else
                                goto no_output;
                            break;
                        }
                        else if (nLinear < pRange->m_nPastLinear)
                        {
                            if (pDestBufPtr != pDestBufEnd)
                                *pDestBufPtr++
                                    = (sal_Unicode)
                                          (pRange->m_nFirstUnicode
                                               + (nLinear
                                                      - pRange->
                                                            m_nFirstLinear));
                            else
                                goto no_output;
                            break;
                        }
                        nFirstNonRange = (pRange++)->m_nPastLinear;
                    }
                }
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;
        }
        continue;

    bad_input:
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                    &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Leaving the loop here skips ++nConverted, so the current byte is
           reported as not consumed. */
        --pSrcBuf;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The input ended inside a character and nothing else was reported:
       without FLUSH ask for more input, with FLUSH hand the truncated
       sequence to the bad-input handler. */
    if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        else
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pDestBufPtr,
                        pDestBufEnd, &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
    }

    if (pContext)
    {
        ((ImplGb18030ToUnicodeContext *) pContext)->m_eState = eState;
        ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode = nCode;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtBytes)
        *pSrcCvtBytes = nConverted;

    return pDestBufPtr - pDestBuf;
}
277
/* Store the four-byte GB 18030 form of linear code nLinear at pOut[0..3].
   nLeadBase is the value the first byte takes for linear code 0. */
static void ImplPutGb18030FourBytes(sal_Char * pOut,
                                    sal_uInt32 nLinear,
                                    sal_uInt32 nLeadBase)
{
    pOut[0] = (sal_Char) (nLinear / 12600 + nLeadBase);
    pOut[1] = (sal_Char) (nLinear / 1260 % 10 + 0x30);
    pOut[2] = (sal_Char) (nLinear / 10 % 126 + 0x81);
    pOut[3] = (sal_Char) (nLinear % 10 + 0x30);
}

/*
 * Convert UTF-16 code units to GB 18030 encoded bytes.
 *
 * pData         converter data, read as ImplGb18030ConverterData: supplies
 *               the lookup table and the list of linear ranges
 * pContext      optional ImplUnicodeToTextContext; when non-null a pending
 *               high surrogate is loaded from it on entry and stored back
 *               into it on exit
 * pSrcBuf       source UTF-16 code units
 * nSrcChars     number of code units available at pSrcBuf
 * pDestBuf      destination buffer
 * nDestBytes    capacity of pDestBuf, in bytes
 * nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; passed on to the bad-input
 *               handler, and RTL_UNICODETOTEXT_FLAGS_FLUSH is consulted when
 *               the input ends on a high surrogate
 * pInfo         optional; receives the accumulated RTL_UNICODETOTEXT_INFO_*
 *               bits
 * pSrcCvtChars  optional; receives the number of source units consumed
 *
 * Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToGb18030(ImplTextConverterData const * pData,
                                     void * pContext,
                                     sal_Unicode const * pSrcBuf,
                                     sal_Size nSrcChars,
                                     sal_Char * pDestBuf,
                                     sal_Size nDestBytes,
                                     sal_uInt32 nFlags,
                                     sal_uInt32 * pInfo,
                                     sal_Size * pSrcCvtChars)
{
    ImplGb18030ConverterData const * pConvData
        = (ImplGb18030ConverterData const *) pData;
    sal_uInt32 const * pGb18030Data = pConvData->m_pUnicodeToGb18030Data;
    ImplUnicodeToGb180302000Range const * pGb18030Ranges
        = pConvData->m_pUnicodeToGb18030Ranges;
    sal_Unicode nHighSurrogate = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Char * pDestBufPtr = pDestBuf;
    sal_Char * pDestBufEnd = pDestBuf + nDestBytes;

    /* Pick up a high surrogate left pending by a previous call. */
    if (pContext)
        nHighSurrogate
            = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;

    for (; nConverted < nSrcChars; ++nConverted)
    {
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *pSrcBuf++;

        if (nHighSurrogate != 0)
        {
            /* A pending high surrogate must be followed by a low one. */
            if (!ImplIsLowSurrogate(nChar))
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
        }
        else if (ImplIsHighSurrogate(nChar))
        {
            /* Hold it back until the next unit arrives. */
            nHighSurrogate = (sal_Unicode) nChar;
            continue;
        }

        if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
        {
            bUndefined = sal_False;
            goto bad_input;
        }

        if (nChar >= 0x10000)
        {
            /* Supplementary planes map linearly, with lead bytes starting
               at 0x90. */
            if (pDestBufEnd - pDestBufPtr < 4)
                goto no_output;
            ImplPutGb18030FourBytes(pDestBufPtr, nChar - 0x10000, 0x90);
            pDestBufPtr += 4;
        }
        else if (nChar >= 0x80)
        {
            /* Walk the range list.  Characters below a range's first
               Unicode value are looked up in the table; characters inside a
               range map linearly onto four-byte codes. */
            ImplUnicodeToGb180302000Range const * pEntry = pGb18030Ranges;
            sal_Unicode nGapStart = 0x80;
            for (;;
                 nGapStart = (sal_Unicode) ((pEntry++)->m_nLastUnicode + 1))
            {
                if (nChar < pEntry->m_nFirstUnicode)
                {
                    /* The table entry holds the two- or four-byte code,
                       most significant byte first. */
                    sal_uInt32 nBytes
                        = pGb18030Data[pEntry->m_nNonRangeDataIndex
                                           + (nChar - nGapStart)];
                    if (pDestBufEnd - pDestBufPtr
                            < (nBytes > 0xFFFF ? 4 : 2))
                        goto no_output;
                    if (nBytes > 0xFFFF)
                    {
                        *pDestBufPtr++ = (sal_Char) (nBytes >> 24);
                        *pDestBufPtr++ = (sal_Char) (nBytes >> 16 & 0xFF);
                    }
                    *pDestBufPtr++ = (sal_Char) (nBytes >> 8 & 0xFF);
                    *pDestBufPtr++ = (sal_Char) (nBytes & 0xFF);
                    break;
                }
                if (nChar <= pEntry->m_nLastUnicode)
                {
                    if (pDestBufEnd - pDestBufPtr < 4)
                        goto no_output;
                    ImplPutGb18030FourBytes(
                        pDestBufPtr,
                        pEntry->m_nFirstLinear
                            + (nChar - pEntry->m_nFirstUnicode),
                        0x81);
                    pDestBufPtr += 4;
                    break;
                }
            }
        }
        else
        {
            /* Below 0x80: a single byte, copied through unchanged. */
            if (pDestBufPtr == pDestBufEnd)
                goto no_output;
            *pDestBufPtr++ = (sal_Char) nChar;
        }

        nHighSurrogate = 0;
        continue;

    bad_input:
        switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined,
                                                          nChar,
                                                          nFlags,
                                                          &pDestBufPtr,
                                                          pDestBufEnd,
                                                          &nInfo,
                                                          NULL,
                                                          0,
                                                          NULL))
        {
        case IMPL_BAD_INPUT_STOP:
            nHighSurrogate = 0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nHighSurrogate = 0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Leaving the loop here skips ++nConverted, so the current unit is
           reported as not consumed. */
        --pSrcBuf;
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The input ended on a high surrogate and nothing else was reported.
       NOTE(review): SRCBUFFERTOSMALL is set here when FLUSH is set, whereas
       ImplConvertGb18030ToUnicode sets its counterpart when FLUSH is clear.
       Confirm that this asymmetry is intended. */
    if (nHighSurrogate != 0
        && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                         | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
        else
            switch (ImplHandleBadInputUnicodeToTextConversion(sal_False,
                                                              0,
                                                              nFlags,
                                                              &pDestBufPtr,
                                                              pDestBufEnd,
                                                              &nInfo,
                                                              NULL,
                                                              0,
                                                              NULL))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nHighSurrogate = 0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
    }

    if (pContext)
        ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
            = nHighSurrogate;
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtChars)
        *pSrcCvtChars = nConverted;

    return pDestBufPtr - pDestBuf;
}
465