xref: /trunk/main/sal/textenc/convertgb18030.c (revision 647f063d)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "convertgb18030.h"
25 #include "context.h"
26 #include "converter.h"
27 #include "tenchelp.h"
28 #include "unichars.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
31 #include "sal/types.h"
32 
/*
 * Position of the GB 18030 decoder inside a multi-byte sequence.  The digit
 * in each name is the number of bytes of the current sequence that have
 * already been consumed.
 */
typedef enum
{
    IMPL_GB_18030_TO_UNICODE_STATE_0 = 0, /* between characters */
    IMPL_GB_18030_TO_UNICODE_STATE_1 = 1, /* lead byte consumed */
    IMPL_GB_18030_TO_UNICODE_STATE_2 = 2, /* lead byte and first digit consumed */
    IMPL_GB_18030_TO_UNICODE_STATE_3 = 3  /* three bytes of a four-byte form consumed */
} ImplGb18030ToUnicodeState;
40 
41 typedef struct
42 {
43     ImplGb18030ToUnicodeState m_eState;
44     sal_uInt32 m_nCode;
45 } ImplGb18030ToUnicodeContext;
46 
ImplCreateGb18030ToUnicodeContext(void)47 void * ImplCreateGb18030ToUnicodeContext(void)
48 {
49     void * pContext
50         = rtl_allocateMemory(sizeof (ImplGb18030ToUnicodeContext));
51     ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
52         = IMPL_GB_18030_TO_UNICODE_STATE_0;
53     return pContext;
54 }
55 
ImplResetGb18030ToUnicodeContext(void * pContext)56 void ImplResetGb18030ToUnicodeContext(void * pContext)
57 {
58     if (pContext)
59         ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
60             = IMPL_GB_18030_TO_UNICODE_STATE_0;
61 }
62 
/*
 * Convert GB 18030 bytes to UTF-16 code units.
 *
 * pData         converter tables (an ImplGb18030ConverterData).
 * pContext      optional ImplGb18030ToUnicodeContext; when non-null, the
 *               decoder state is loaded from it on entry and stored back on
 *               exit, so a sequence may span several calls.
 * pSrcBuf       source bytes.
 * nSrcBytes     number of source bytes.
 * pDestBuf      destination buffer.
 * nDestChars    capacity of pDestBuf in sal_Unicode units.
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_*; forwarded to the bad-input
 *               handler.  RTL_TEXTTOUNICODE_FLAGS_FLUSH selects whether an
 *               incomplete trailing sequence is reported as
 *               RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL (not set) or handed
 *               to the bad-input handler (set).
 * pInfo         optional; receives the accumulated RTL_TEXTTOUNICODE_INFO_*
 *               bits.
 * pSrcCvtBytes  optional; receives the number of source bytes consumed.
 *
 * Returns the number of sal_Unicode units written to pDestBuf.
 *
 * A byte that cannot be converted because the destination is full is not
 * counted as consumed, so the caller presents it again on the next call.
 * For that retry to work, the accumulated code (nCode) must not yet include
 * that byte; the final byte of a sequence is therefore folded into a
 * temporary (nNewCode) instead of into nCode.
 */
sal_Size ImplConvertGb18030ToUnicode(ImplTextConverterData const * pData,
                                     void * pContext,
                                     sal_Char const * pSrcBuf,
                                     sal_Size nSrcBytes,
                                     sal_Unicode * pDestBuf,
                                     sal_Size nDestChars,
                                     sal_uInt32 nFlags,
                                     sal_uInt32 * pInfo,
                                     sal_Size * pSrcCvtBytes)
{
    sal_Unicode const * pGb18030Data
        = ((ImplGb18030ConverterData const *) pData)->m_pGb18030ToUnicodeData;
    ImplGb180302000ToUnicodeRange const * pGb18030Ranges
        = ((ImplGb18030ConverterData const *) pData)->
              m_pGb18030ToUnicodeRanges;
    ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
    sal_uInt32 nCode = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode * pDestBufPtr = pDestBuf;
    sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;

    if (pContext)
    {
        eState = ((ImplGb18030ToUnicodeContext *) pContext)->m_eState;
        nCode = ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode;
    }

    for (; nConverted < nSrcBytes; ++nConverted)
    {
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
        /* Code including the current byte; only used for the final byte of
           a sequence, so that nCode stays valid if the byte is pushed back. */
        sal_uInt32 nNewCode;
        switch (eState)
        {
        case IMPL_GB_18030_TO_UNICODE_STATE_0:
            if (nChar < 0x80)
            {
                /* Single byte, copied through unchanged. */
                if (pDestBufPtr == pDestBufEnd)
                    goto no_output;
                *pDestBufPtr++ = (sal_Unicode) nChar;
            }
            else if (nChar == 0x80)
                goto bad_input;
            else if (nChar <= 0xFE)
            {
                /* Lead byte 0x81..0xFE of a two- or four-byte sequence. */
                nCode = nChar - 0x81;
                eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_1:
            if (nChar >= 0x30 && nChar <= 0x39)
            {
                /* A digit as second byte: four-byte sequence. */
                nCode = nCode * 10 + (nChar - 0x30);
                eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
            }
            else if ((nChar >= 0x40 && nChar <= 0x7E)
                     || (nChar >= 0x80 && nChar <= 0xFE))
            {
                /* Trail byte of a two-byte sequence: 190 trail values per
                   lead byte (0x40..0x7E, then 0x80..0xFE). */
                nNewCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
                                                          nChar - 0x80 + 63);
                if (pDestBufPtr == pDestBufEnd)
                    goto no_output;
                *pDestBufPtr++ = pGb18030Data[nNewCode];
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_2:
            if (nChar >= 0x81 && nChar <= 0xFE)
            {
                nCode = nCode * 126 + (nChar - 0x81);
                eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_GB_18030_TO_UNICODE_STATE_3:
            if (nChar >= 0x30 && nChar <= 0x39)
            {
                nNewCode = nCode * 10 + (nChar - 0x30);

                /* 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF: */
                if (nNewCode >= 189000 && nNewCode <= 1237575)
                {
                    if (pDestBufEnd - pDestBufPtr < 2)
                        goto no_output;
                    nNewCode -= 189000 - 0x10000;
                    *pDestBufPtr++
                        = (sal_Unicode) ImplGetHighSurrogate(nNewCode);
                    *pDestBufPtr++
                        = (sal_Unicode) ImplGetLowSurrogate(nNewCode);
                }
                else
                {
                    /* Walk the range table.  Each entry describes a gap of
                       individually mapped codes (looked up in pGb18030Data)
                       followed by a run of codes that map linearly to
                       Unicode; an entry with m_nNonRangeDataIndex == -1
                       terminates the table. */
                    ImplGb180302000ToUnicodeRange const * pRange
                        = pGb18030Ranges;
                    sal_uInt32 nFirstNonRange = 0;
                    for (;;)
                    {
                        if (pRange->m_nNonRangeDataIndex == -1)
                            goto bad_input;
                        else if (nNewCode < pRange->m_nFirstLinear)
                        {
                            if (pDestBufPtr == pDestBufEnd)
                                goto no_output;
                            *pDestBufPtr++
                                = pGb18030Data[
                                      pRange->m_nNonRangeDataIndex
                                          + (nNewCode - nFirstNonRange)];
                            break;
                        }
                        else if (nNewCode < pRange->m_nPastLinear)
                        {
                            if (pDestBufPtr == pDestBufEnd)
                                goto no_output;
                            *pDestBufPtr++
                                = (sal_Unicode)
                                      (pRange->m_nFirstUnicode
                                           + (nNewCode
                                                  - pRange->m_nFirstLinear));
                            break;
                        }
                        nFirstNonRange = (pRange++)->m_nPastLinear;
                    }
                }
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;
        }
        continue;

    bad_input:
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                    &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Push the current byte back; eState and nCode still describe the
           sequence as it was before this byte. */
        --pSrcBuf;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The source ended in the middle of a sequence and no error or
       destination overflow has been reported yet. */
    if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        else
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pDestBufPtr,
                        pDestBufEnd, &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
    }

    if (pContext)
    {
        ((ImplGb18030ToUnicodeContext *) pContext)->m_eState = eState;
        ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode = nCode;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtBytes)
        *pSrcCvtBytes = nConverted;

    return pDestBufPtr - pDestBuf;
}
277 
/*
 * Convert UTF-16 code units to GB 18030 bytes.
 *
 * pData         converter tables (an ImplGb18030ConverterData).
 * pContext      optional ImplUnicodeToTextContext; when non-null, a pending
 *               high surrogate is loaded from it on entry and stored back
 *               on exit.
 * pSrcBuf       source UTF-16 code units.
 * nSrcChars     number of source code units.
 * pDestBuf      destination buffer.
 * nDestBytes    capacity of pDestBuf in bytes.
 * nFlags        RTL_UNICODETOTEXT_FLAGS_*; forwarded to the bad-input
 *               handler.
 * pInfo         optional; receives the accumulated RTL_UNICODETOTEXT_INFO_*
 *               bits.
 * pSrcCvtChars  optional; receives the number of source code units consumed.
 *
 * Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToGb18030(ImplTextConverterData const * pData,
                                     void * pContext,
                                     sal_Unicode const * pSrcBuf,
                                     sal_Size nSrcChars,
                                     sal_Char * pDestBuf,
                                     sal_Size nDestBytes,
                                     sal_uInt32 nFlags,
                                     sal_uInt32 * pInfo,
                                     sal_Size * pSrcCvtChars)
{
    ImplGb18030ConverterData const * pConverter
        = (ImplGb18030ConverterData const *) pData;
    sal_uInt32 const * pCodeTable = pConverter->m_pUnicodeToGb18030Data;
    ImplUnicodeToGb180302000Range const * pRangeTable
        = pConverter->m_pUnicodeToGb18030Ranges;
    sal_Unicode nPendingHigh = 0;
    sal_uInt32 nStatus = 0;
    sal_Size nConsumed = 0;
    sal_Char * pOut = pDestBuf;
    sal_Char * pOutEnd = pDestBuf + nDestBytes;

    if (pContext)
    {
        nPendingHigh
            = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;
    }

    for (; nConsumed < nSrcChars; ++nConsumed)
    {
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *pSrcBuf++;

        if (nPendingHigh != 0)
        {
            /* A high surrogate is waiting; only a low surrogate may follow. */
            if (ImplIsLowSurrogate(nChar))
            {
                nChar = ImplCombineSurrogates(nPendingHigh, nChar);
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
        }
        else if (ImplIsHighSurrogate(nChar))
        {
            /* Remember it and wait for the matching low surrogate. */
            nPendingHigh = (sal_Unicode) nChar;
            continue;
        }

        if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
        {
            bUndefined = sal_False;
            goto bad_input;
        }

        if (nChar < 0x80)
        {
            /* Single byte, copied through unchanged. */
            if (pOut == pOutEnd)
                goto no_output;
            *pOut++ = (sal_Char) nChar;
        }
        else if (nChar < 0x10000)
        {
            /* Walk the range table.  Each entry describes a gap of
               individually mapped characters (looked up in the code table)
               followed by a run of characters that map linearly to
               four-byte codes. */
            ImplUnicodeToGb180302000Range const * pEntry = pRangeTable;
            sal_Unicode nGapStart = 0x80;
            for (;;)
            {
                if (nChar < pEntry->m_nFirstUnicode)
                {
                    /* Table entries up to 0xFFFF are two-byte codes, larger
                       ones are four-byte codes, stored big-endian. */
                    sal_uInt32 nPacked
                        = pCodeTable[pEntry->m_nNonRangeDataIndex
                                         + (nChar - nGapStart)];
                    if (pOutEnd - pOut < (nPacked <= 0xFFFF ? 2 : 4))
                        goto no_output;
                    if (nPacked > 0xFFFF)
                    {
                        *pOut++ = (sal_Char) (nPacked >> 24);
                        *pOut++ = (sal_Char) ((nPacked >> 16) & 0xFF);
                    }
                    *pOut++ = (sal_Char) ((nPacked >> 8) & 0xFF);
                    *pOut++ = (sal_Char) (nPacked & 0xFF);
                    break;
                }
                if (nChar <= pEntry->m_nLastUnicode)
                {
                    sal_uInt32 nLinear;
                    if (pOutEnd - pOut < 4)
                        goto no_output;
                    nLinear = pEntry->m_nFirstLinear
                                  + (nChar - pEntry->m_nFirstUnicode);
                    pOut[0] = (sal_Char) (nLinear / 12600 + 0x81);
                    pOut[1] = (sal_Char) (nLinear / 1260 % 10 + 0x30);
                    pOut[2] = (sal_Char) (nLinear / 10 % 126 + 0x81);
                    pOut[3] = (sal_Char) (nLinear % 10 + 0x30);
                    pOut += 4;
                    break;
                }
                nGapStart = (sal_Unicode) (pEntry->m_nLastUnicode + 1);
                ++pEntry;
            }
        }
        else
        {
            /* Supplementary planes: U+10000 onwards maps linearly to the
               four-byte codes whose lead byte starts at 0x90. */
            sal_uInt32 nLinear;
            if (pOutEnd - pOut < 4)
                goto no_output;
            nLinear = nChar - 0x10000;
            pOut[0] = (sal_Char) (nLinear / 12600 + 0x90);
            pOut[1] = (sal_Char) (nLinear / 1260 % 10 + 0x30);
            pOut[2] = (sal_Char) (nLinear / 10 % 126 + 0x81);
            pOut[3] = (sal_Char) (nLinear % 10 + 0x30);
            pOut += 4;
        }
        nPendingHigh = 0;
        continue;

    bad_input:
        switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined,
                                                          nChar,
                                                          nFlags,
                                                          &pOut,
                                                          pOutEnd,
                                                          &nStatus,
                                                          NULL,
                                                          0,
                                                          NULL))
        {
        case IMPL_BAD_INPUT_STOP:
            nPendingHigh = 0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nPendingHigh = 0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Push the current code unit back; a pending high surrogate is kept
           so the retry can still combine it. */
        --pSrcBuf;
        nStatus |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The source ended on a high surrogate and no error or destination
       overflow has been reported yet. */
    if (nPendingHigh != 0
        && (nStatus & (RTL_UNICODETOTEXT_INFO_ERROR
                           | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        /* NOTE(review): SRCBUFFERTOSMALL is reported when FLUSH *is* set,
           the opposite sense to the check in ImplConvertGb18030ToUnicode --
           confirm this is intended before changing it. */
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            nStatus |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
        }
        else
        {
            switch (ImplHandleBadInputUnicodeToTextConversion(sal_False,
                                                              0,
                                                              nFlags,
                                                              &pOut,
                                                              pOutEnd,
                                                              &nStatus,
                                                              NULL,
                                                              0,
                                                              NULL))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nPendingHigh = 0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nStatus |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
    }

    if (pContext)
    {
        ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
            = nPendingHigh;
    }
    if (pInfo)
        *pInfo = nStatus;
    if (pSrcCvtChars)
        *pSrcCvtChars = nConsumed;

    return pOut - pDestBuf;
}
465