xref: /aoo42x/main/sal/textenc/convertgb18030.c (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "convertgb18030.h"
29 #include "context.h"
30 #include "converter.h"
31 #include "tenchelp.h"
32 #include "unichars.h"
33 #include "rtl/alloc.h"
34 #include "rtl/textcvt.h"
35 #include "sal/types.h"
36 
/*
 * Decoder position inside a GB 18030 multi-byte sequence, as driven by
 * ImplConvertGb18030ToUnicode.  The numeric values are the implicit
 * enumeration defaults, spelled out explicitly.
 */
typedef enum
{
    /* Not inside a sequence; the next byte starts a new character. */
    IMPL_GB_18030_TO_UNICODE_STATE_0 = 0,
    /* A lead byte (0x81..0xFE) has been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_1 = 1,
    /* Lead byte and a second byte 0x30..0x39 have been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_2 = 2,
    /* Three bytes of a four-byte sequence have been read. */
    IMPL_GB_18030_TO_UNICODE_STATE_3 = 3
} ImplGb18030ToUnicodeState;
44 
/*
 * Decoder state carried between successive calls of
 * ImplConvertGb18030ToUnicode, so that a multi-byte sequence may be split
 * across source buffers.
 */
typedef struct
{
    /* Position inside the current multi-byte sequence. */
    ImplGb18030ToUnicodeState m_eState;
    /* Code value accumulated from the bytes of the sequence read so far. */
    sal_uInt32 m_nCode;
} ImplGb18030ToUnicodeContext;
50 
51 void * ImplCreateGb18030ToUnicodeContext(void)
52 {
53     void * pContext
54         = rtl_allocateMemory(sizeof (ImplGb18030ToUnicodeContext));
55     ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
56         = IMPL_GB_18030_TO_UNICODE_STATE_0;
57     return pContext;
58 }
59 
60 void ImplResetGb18030ToUnicodeContext(void * pContext)
61 {
62     if (pContext)
63         ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
64             = IMPL_GB_18030_TO_UNICODE_STATE_0;
65 }
66 
67 sal_Size ImplConvertGb18030ToUnicode(ImplTextConverterData const * pData,
68                                      void * pContext,
69                                      sal_Char const * pSrcBuf,
70                                      sal_Size nSrcBytes,
71                                      sal_Unicode * pDestBuf,
72                                      sal_Size nDestChars,
73                                      sal_uInt32 nFlags,
74                                      sal_uInt32 * pInfo,
75                                      sal_Size * pSrcCvtBytes)
76 {
77     sal_Unicode const * pGb18030Data
78         = ((ImplGb18030ConverterData const *) pData)->m_pGb18030ToUnicodeData;
79     ImplGb180302000ToUnicodeRange const * pGb18030Ranges
80         = ((ImplGb18030ConverterData const *) pData)->
81               m_pGb18030ToUnicodeRanges;
82     ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
83     sal_uInt32 nCode = 0;
84     sal_uInt32 nInfo = 0;
85     sal_Size nConverted = 0;
86     sal_Unicode * pDestBufPtr = pDestBuf;
87     sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
88 
89     if (pContext)
90     {
91         eState = ((ImplGb18030ToUnicodeContext *) pContext)->m_eState;
92         nCode = ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode;
93     }
94 
95     for (; nConverted < nSrcBytes; ++nConverted)
96     {
97         sal_Bool bUndefined = sal_True;
98         sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
99         switch (eState)
100         {
101         case IMPL_GB_18030_TO_UNICODE_STATE_0:
102             if (nChar < 0x80)
103                 if (pDestBufPtr != pDestBufEnd)
104                     *pDestBufPtr++ = (sal_Unicode) nChar;
105                 else
106                     goto no_output;
107             else if (nChar == 0x80)
108                 goto bad_input;
109             else if (nChar <= 0xFE)
110             {
111                 nCode = nChar - 0x81;
112                 eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
113             }
114             else
115             {
116                 bUndefined = sal_False;
117                 goto bad_input;
118             }
119             break;
120 
121         case IMPL_GB_18030_TO_UNICODE_STATE_1:
122             if (nChar >= 0x30 && nChar <= 0x39)
123             {
124                 nCode = nCode * 10 + (nChar - 0x30);
125                 eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
126             }
127             else if ((nChar >= 0x40 && nChar <= 0x7E)
128                      || (nChar >= 0x80 && nChar <= 0xFE))
129             {
130                 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
131                                                        nChar - 0x80 + 63);
132                 if (pDestBufPtr != pDestBufEnd)
133                     *pDestBufPtr++ = pGb18030Data[nCode];
134                 else
135                     goto no_output;
136                 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
137             }
138             else
139             {
140                 bUndefined = sal_False;
141                 goto bad_input;
142             }
143             break;
144 
145         case IMPL_GB_18030_TO_UNICODE_STATE_2:
146             if (nChar >= 0x81 && nChar <= 0xFE)
147             {
148                 nCode = nCode * 126 + (nChar - 0x81);
149                 eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
150             }
151             else
152             {
153                 bUndefined = sal_False;
154                 goto bad_input;
155             }
156             break;
157 
158         case IMPL_GB_18030_TO_UNICODE_STATE_3:
159             if (nChar >= 0x30 && nChar <= 0x39)
160             {
161                 nCode = nCode * 10 + (nChar - 0x30);
162 
163                 /* 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF: */
164                 if (nCode >= 189000 && nCode <= 1237575)
165                     if (pDestBufEnd - pDestBufPtr >= 2)
166                     {
167                         nCode -= 189000 - 0x10000;
168                         *pDestBufPtr++
169                             = (sal_Unicode) ImplGetHighSurrogate(nCode);
170                         *pDestBufPtr++
171                             = (sal_Unicode) ImplGetLowSurrogate(nCode);
172                     }
173                     else
174                         goto no_output;
175                 else
176                 {
177                     ImplGb180302000ToUnicodeRange const * pRange
178                         = pGb18030Ranges;
179                     sal_uInt32 nFirstNonRange = 0;
180                     for (;;)
181                     {
182                         if (pRange->m_nNonRangeDataIndex == -1)
183                             goto bad_input;
184                         else if (nCode < pRange->m_nFirstLinear)
185                         {
186                             if (pDestBufPtr != pDestBufEnd)
187                                 *pDestBufPtr++
188                                     = pGb18030Data[
189                                           pRange->m_nNonRangeDataIndex
190                                               + (nCode - nFirstNonRange)];
191                             else
192                                 goto no_output;
193                             break;
194                         }
195                         else if (nCode < pRange->m_nPastLinear)
196                         {
197                             if (pDestBufPtr != pDestBufEnd)
198                                 *pDestBufPtr++
199                                     = (sal_Unicode)
200                                           (pRange->m_nFirstUnicode
201                                                + (nCode
202                                                       - pRange->
203                                                             m_nFirstLinear));
204                             else
205                                 goto no_output;
206                             break;
207                         }
208                         nFirstNonRange = (pRange++)->m_nPastLinear;
209                     }
210                 }
211                 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
212             }
213             else
214             {
215                 bUndefined = sal_False;
216                 goto bad_input;
217             }
218             break;
219         }
220         continue;
221 
222     bad_input:
223         switch (ImplHandleBadInputTextToUnicodeConversion(
224                     bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
225                     &nInfo))
226         {
227         case IMPL_BAD_INPUT_STOP:
228             eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
229             break;
230 
231         case IMPL_BAD_INPUT_CONTINUE:
232             eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
233             continue;
234 
235         case IMPL_BAD_INPUT_NO_OUTPUT:
236             goto no_output;
237         }
238         break;
239 
240     no_output:
241         --pSrcBuf;
242         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
243         break;
244     }
245 
246     if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
247         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
248                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
249                == 0)
250     {
251         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
252             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
253         else
254             switch (ImplHandleBadInputTextToUnicodeConversion(
255                         sal_False, sal_True, 0, nFlags, &pDestBufPtr,
256                         pDestBufEnd, &nInfo))
257             {
258             case IMPL_BAD_INPUT_STOP:
259             case IMPL_BAD_INPUT_CONTINUE:
260                 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
261                 break;
262 
263             case IMPL_BAD_INPUT_NO_OUTPUT:
264                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
265                 break;
266             }
267     }
268 
269     if (pContext)
270     {
271         ((ImplGb18030ToUnicodeContext *) pContext)->m_eState = eState;
272         ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode = nCode;
273     }
274     if (pInfo)
275         *pInfo = nInfo;
276     if (pSrcCvtBytes)
277         *pSrcCvtBytes = nConverted;
278 
279     return pDestBufPtr - pDestBuf;
280 }
281 
282 sal_Size ImplConvertUnicodeToGb18030(ImplTextConverterData const * pData,
283                                      void * pContext,
284                                      sal_Unicode const * pSrcBuf,
285                                      sal_Size nSrcChars,
286                                      sal_Char * pDestBuf,
287                                      sal_Size nDestBytes,
288                                      sal_uInt32 nFlags,
289                                      sal_uInt32 * pInfo,
290                                      sal_Size * pSrcCvtChars)
291 {
292     sal_uInt32 const * pGb18030Data
293         = ((ImplGb18030ConverterData const *) pData)->
294               m_pUnicodeToGb18030Data;
295     ImplUnicodeToGb180302000Range const * pGb18030Ranges
296         = ((ImplGb18030ConverterData const *) pData)->
297               m_pUnicodeToGb18030Ranges;
298     sal_Unicode nHighSurrogate = 0;
299     sal_uInt32 nInfo = 0;
300     sal_Size nConverted = 0;
301     sal_Char * pDestBufPtr = pDestBuf;
302     sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
303 
304     if (pContext)
305         nHighSurrogate
306             = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;
307 
308     for (; nConverted < nSrcChars; ++nConverted)
309     {
310         sal_Bool bUndefined = sal_True;
311         sal_uInt32 nChar = *pSrcBuf++;
312         if (nHighSurrogate == 0)
313         {
314             if (ImplIsHighSurrogate(nChar))
315             {
316                 nHighSurrogate = (sal_Unicode) nChar;
317                 continue;
318             }
319         }
320         else if (ImplIsLowSurrogate(nChar))
321             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
322         else
323         {
324             bUndefined = sal_False;
325             goto bad_input;
326         }
327 
328         if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
329         {
330             bUndefined = sal_False;
331             goto bad_input;
332         }
333 
334         if (nChar < 0x80)
335             if (pDestBufPtr != pDestBufEnd)
336                 *pDestBufPtr++ = (sal_Char) nChar;
337             else
338                 goto no_output;
339         else if (nChar < 0x10000)
340         {
341             ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges;
342             sal_Unicode nFirstNonRange = 0x80;
343             for (;;)
344             {
345                 if (nChar < pRange->m_nFirstUnicode)
346                 {
347                     sal_uInt32 nCode
348                         = pGb18030Data[pRange->m_nNonRangeDataIndex
349                                            + (nChar - nFirstNonRange)];
350                     if (pDestBufEnd - pDestBufPtr
351                             >= (nCode <= 0xFFFF ? 2 : 4))
352                     {
353                         if (nCode > 0xFFFF)
354                         {
355                             *pDestBufPtr++ = (sal_Char) (nCode >> 24);
356                             *pDestBufPtr++ = (sal_Char) (nCode >> 16 & 0xFF);
357                         }
358                         *pDestBufPtr++ = (sal_Char) (nCode >> 8 & 0xFF);
359                         *pDestBufPtr++ = (sal_Char) (nCode & 0xFF);
360                     }
361                     else
362                         goto no_output;
363                     break;
364                 }
365                 else if (nChar <= pRange->m_nLastUnicode)
366                 {
367                     if (pDestBufEnd - pDestBufPtr >= 4)
368                     {
369                         sal_uInt32 nCode
370                             = pRange->m_nFirstLinear
371                                   + (nChar - pRange->m_nFirstUnicode);
372                         *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x81);
373                         *pDestBufPtr++
374                             = (sal_Char) (nCode / 1260 % 10 + 0x30);
375                         *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81);
376                         *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30);
377                     }
378                     else
379                         goto no_output;
380                     break;
381                 }
382                 nFirstNonRange
383                     = (sal_Unicode) ((pRange++)->m_nLastUnicode + 1);
384             }
385         }
386         else
387             if (pDestBufEnd - pDestBufPtr >= 4)
388             {
389                 sal_uInt32 nCode = nChar - 0x10000;
390                 *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x90);
391                 *pDestBufPtr++ = (sal_Char) (nCode / 1260 % 10 + 0x30);
392                 *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81);
393                 *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30);
394             }
395             else
396                 goto no_output;
397         nHighSurrogate = 0;
398         continue;
399 
400     bad_input:
401         switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined,
402                                                           nChar,
403                                                           nFlags,
404                                                           &pDestBufPtr,
405                                                           pDestBufEnd,
406                                                           &nInfo,
407                                                           NULL,
408                                                           0,
409                                                           NULL))
410         {
411         case IMPL_BAD_INPUT_STOP:
412             nHighSurrogate = 0;
413             break;
414 
415         case IMPL_BAD_INPUT_CONTINUE:
416             nHighSurrogate = 0;
417             continue;
418 
419         case IMPL_BAD_INPUT_NO_OUTPUT:
420             goto no_output;
421         }
422         break;
423 
424     no_output:
425         --pSrcBuf;
426         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
427         break;
428     }
429 
430     if (nHighSurrogate != 0
431         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
432                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
433                == 0)
434     {
435         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
436             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
437         else
438             switch (ImplHandleBadInputUnicodeToTextConversion(sal_False,
439                                                               0,
440                                                               nFlags,
441                                                               &pDestBufPtr,
442                                                               pDestBufEnd,
443                                                               &nInfo,
444                                                               NULL,
445                                                               0,
446                                                               NULL))
447             {
448             case IMPL_BAD_INPUT_STOP:
449             case IMPL_BAD_INPUT_CONTINUE:
450                 nHighSurrogate = 0;
451                 break;
452 
453             case IMPL_BAD_INPUT_NO_OUTPUT:
454                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
455                 break;
456             }
457     }
458 
459     if (pContext)
460         ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
461             = nHighSurrogate;
462     if (pInfo)
463         *pInfo = nInfo;
464     if (pSrcCvtChars)
465         *pSrcCvtChars = nConverted;
466 
467     return pDestBufPtr - pDestBuf;
468 }
469