/* xref: /trunk/main/sal/textenc/convertiso2022kr.c (revision 647f063d) */
/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/



#include "convertiso2022kr.h"
#include "context.h"
#include "converter.h"
#include "tenchelp.h"
#include "unichars.h"
#include "rtl/alloc.h"
#include "rtl/textcvt.h"
#include "sal/types.h"

/*
 * Decoder states of the ISO-2022-KR to Unicode converter.
 *
 * The numeric order matters: at the end of the input the converter treats
 * every state greater than IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001 as being
 * in the middle of an incomplete multi-byte unit or escape sequence.
 */
typedef enum /* order is important: */
{
    /* Single-byte mode; bytes below 0x80 are copied through. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII,
    /* Double-byte mode (entered via SO); waiting for the first byte. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001,
    /* Double-byte mode; first byte stored, waiting for the second byte. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2,
    /* ESC has been read. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC,
    /* ESC $ has been read. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR,
    /* ESC $ ) has been read; only 'C' completes the sequence. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN
} ImplIso2022KrToUnicodeState;

43 typedef struct
44 {
45     ImplIso2022KrToUnicodeState m_eState;
46     sal_uInt32 m_nRow;
47 } ImplIso2022KrToUnicodeContext;
48 
/*
 * Output mode of the Unicode to ISO-2022-KR converter.
 */
typedef enum
{
    /* Nothing written yet; the ESC $ ) C header is still outstanding. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE,
    /* Header written, output is in single-byte (SI) mode. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII,
    /* Output is in double-byte (SO) mode. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
} ImplUnicodeToIso2022KrSet;

56 typedef struct
57 {
58     sal_Unicode m_nHighSurrogate;
59     ImplUnicodeToIso2022KrSet m_eSet;
60 } ImplUnicodeToIso2022KrContext;
61 
ImplCreateIso2022KrToUnicodeContext(void)62 void * ImplCreateIso2022KrToUnicodeContext(void)
63 {
64     void * pContext
65         = rtl_allocateMemory(sizeof (ImplIso2022KrToUnicodeContext));
66     ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
67         = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
68     return pContext;
69 }
70 
ImplResetIso2022KrToUnicodeContext(void * pContext)71 void ImplResetIso2022KrToUnicodeContext(void * pContext)
72 {
73     if (pContext)
74         ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
75             = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
76 }
77 
/**
 * Converts a chunk of ISO-2022-KR bytes to UTF-16 code units.
 *
 * The decoder is a byte-driven state machine: SO (0x0E) switches to
 * double-byte mode, SI (0x0F) switches back to ASCII, and the sequence
 * ESC $ ) C is recognised and produces no output.  In double-byte mode each
 * pair of bytes in 0x21..0x7E is looked up (with 0x80 added to both bytes)
 * in the KS X 1001 table taken from pData.
 *
 * @param pData         converter data; accessed as an
 *                      ImplIso2022KrConverterData.
 * @param pContext      optional ImplIso2022KrToUnicodeContext; if non-null,
 *                      the state is loaded on entry and stored on exit.
 * @param pSrcBuf       source bytes.
 * @param nSrcBytes     number of source bytes.
 * @param pDestBuf      destination buffer.
 * @param nDestChars    capacity of pDestBuf in sal_Unicode units.
 * @param nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; also forwarded to the
 *                      bad-input handler.
 * @param pInfo         if non-null, receives the RTL_TEXTTOUNICODE_INFO_*
 *                      bits accumulated during the call.
 * @param pSrcCvtBytes  if non-null, receives the number of source bytes
 *                      consumed; a byte that stopped the conversion is not
 *                      counted.
 * @return the number of sal_Unicode units written to pDestBuf.
 */
sal_Size ImplConvertIso2022KrToUnicode(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Char const * pSrcBuf,
                                       sal_Size nSrcBytes,
                                       sal_Unicode * pDestBuf,
                                       sal_Size nDestChars,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtBytes)
{
    ImplDBCSToUniLeadTab const * pKsX1001Data
        = ((ImplIso2022KrConverterData const *) pData)->
              m_pKsX1001ToUnicodeData;
    ImplIso2022KrToUnicodeState eState
        = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
    sal_uInt32 nRow = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode * pDestBufPtr = pDestBuf;
    sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;

    /* Resume where the previous chunk left off. */
    if (pContext)
    {
        eState = ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState;
        nRow = ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow;
    }

    for (; nConverted < nSrcBytes; ++nConverted)
    {
        /* sal_True: well-formed but unmapped; sal_False: malformed input. */
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
        switch (eState)
        {
        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII:
            if (nChar == 0x0E) /* SO */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
            else if (nChar == 0x1B) /* ESC */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC;
            else if (nChar < 0x80)
            {
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = (sal_Unicode) nChar;
                else
                    goto no_output;
            }
            else
            {
                /* Bytes with the high bit set never occur in this 7-bit
                   encoding. */
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001:
            if (nChar == 0x0F) /* SI */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            else if (nChar >= 0x21 && nChar <= 0x7E)
            {
                /* The table is indexed by the 8-bit form of the lead byte. */
                nRow = nChar + 0x80;
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2:
            if (nChar >= 0x21 && nChar <= 0x7E)
            {
                sal_uInt16 nUnicode = 0;
                sal_uInt32 nFirst = pKsX1001Data[nRow].mnTrailStart;
                nChar += 0x80;
                if (nChar >= nFirst && nChar <= pKsX1001Data[nRow].mnTrailEnd)
                    nUnicode = pKsX1001Data[nRow].
                                   mpToUniTrailTab[nChar - nFirst];
                if (nUnicode != 0)
                {
                    if (pDestBufPtr != pDestBufEnd)
                    {
                        *pDestBufPtr++ = (sal_Unicode) nUnicode;
                        eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
                    }
                    else
                        goto no_output;
                }
                else
                    goto bad_input; /* valid byte pair without a mapping */
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC:
            if (nChar == 0x24) /* $ */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR;
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR:
            if (nChar == 0x29) /* ) */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN;
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN:
            if (nChar == 0x43) /* C */
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;
        }
        continue;

    bad_input:
        /* The handler decides, based on nFlags, whether to stop, to carry
           on, or to report that the destination is full.  In the first two
           cases the decoder falls back to the ASCII state. */
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                    &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break; /* leave the loop without counting the current byte */

    no_output:
        /* Un-read the current byte; it is not counted in nConverted. */
        --pSrcBuf;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The input ended inside a double-byte unit or an escape sequence. */
    if (eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
            /* More input may follow; keep the partial state. */
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        else
            /* Flush requested: the truncated unit is malformed input. */
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                        &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
    }

    if (pContext)
    {
        ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState = eState;
        ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow = nRow;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtBytes)
        *pSrcCvtBytes = nConverted;

    return pDestBufPtr - pDestBuf;
}

ImplCreateUnicodeToIso2022KrContext(void)262 void * ImplCreateUnicodeToIso2022KrContext(void)
263 {
264     void * pContext
265         = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022KrContext));
266     ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
267     ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
268         = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
269     return pContext;
270 }
271 
ImplResetUnicodeToIso2022KrContext(void * pContext)272 void ImplResetUnicodeToIso2022KrContext(void * pContext)
273 {
274     if (pContext)
275     {
276         ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
277         ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
278             = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
279     }
280 }
281 
/**
 * Converts a chunk of UTF-16 code units to ISO-2022-KR bytes.
 *
 * On the first call (output set still NONE) the four-byte header ESC $ ) C
 * is written.  Characters below 0x80 are written in single-byte mode,
 * characters found in the KS X 1001 table from pData are written as two
 * 7-bit bytes in double-byte mode, and SO/SI are emitted whenever the mode
 * changes.  CR and LF are always written in single-byte mode.  The control
 * characters SO, SI and ESC themselves are rejected as bad input.
 *
 * @param pData         converter data; accessed as an
 *                      ImplIso2022KrConverterData.
 * @param pContext      optional ImplUnicodeToIso2022KrContext; if non-null,
 *                      the state is loaded on entry and stored on exit.
 * @param pSrcBuf       source UTF-16 code units.
 * @param nSrcChars     number of source code units.
 * @param pDestBuf      destination buffer.
 * @param nDestBytes    capacity of pDestBuf in bytes.
 * @param nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; also forwarded to the
 *                      bad-input handler.
 * @param pInfo         if non-null, receives the RTL_UNICODETOTEXT_INFO_*
 *                      bits accumulated during the call.
 * @param pSrcCvtChars  if non-null, receives the number of source code units
 *                      consumed; a unit that stopped the conversion is not
 *                      counted.
 * @return the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToIso2022Kr(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Unicode const * pSrcBuf,
                                       sal_Size nSrcChars,
                                       sal_Char * pDestBuf,
                                       sal_Size nDestBytes,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtChars)
{
    ImplUniToDBCSHighTab const * pKsX1001Data
        = ((ImplIso2022KrConverterData const *) pData)->
              m_pUnicodeToKsX1001Data;
    sal_Unicode nHighSurrogate = 0;
    ImplUnicodeToIso2022KrSet eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Char * pDestBufPtr = pDestBuf;
    sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
    /* Filled in by the bad-input handler; presumably tells whether the SI
       prefix passed to it was actually written -- TODO confirm. */
    sal_Bool bWritten;

    /* Resume where the previous chunk left off. */
    if (pContext)
    {
        nHighSurrogate
            = ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate;
        eSet = ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet;
    }

    /* Write the header once; it is written completely or not at all. */
    if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE)
    {
        if (pDestBufEnd - pDestBufPtr >= 4)
        {
            *pDestBufPtr++ = 0x1B; /* ESC */
            *pDestBufPtr++ = 0x24; /* $ */
            *pDestBufPtr++ = 0x29; /* ) */
            *pDestBufPtr++ = 0x43; /* C */
            eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
        }
        else
            nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
    }

    if ((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0)
        for (; nConverted < nSrcChars; ++nConverted)
        {
            /* sal_True: valid but unmapped; sal_False: malformed input. */
            sal_Bool bUndefined = sal_True;
            sal_uInt32 nChar = *pSrcBuf++;
            if (nHighSurrogate == 0)
            {
                if (ImplIsHighSurrogate(nChar))
                {
                    /* Remember it and wait for the low surrogate. */
                    nHighSurrogate = (sal_Unicode) nChar;
                    continue;
                }
            }
            else if (ImplIsLowSurrogate(nChar))
                nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
            else
            {
                /* High surrogate not followed by a low surrogate. */
                bUndefined = sal_False;
                goto bad_input;
            }

            if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
            {
                bUndefined = sal_False;
                goto bad_input;
            }

            if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */
            {
                /* Line ends are written in single-byte mode. */
                if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
                {
                    if (pDestBufPtr != pDestBufEnd)
                    {
                        *pDestBufPtr++ = 0x0F; /* SI */
                        eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                    }
                    else
                        goto no_output;
                }
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = (sal_Char) nChar;
                else
                    goto no_output;
            }
            else if (nChar == 0x0E || nChar == 0x0F || nChar == 0x1B)
                /* SO, SI and ESC are control bytes of the encoding and are
                   not written as data. */
                goto bad_input;
            else if (nChar < 0x80)
            {
                if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
                {
                    if (pDestBufPtr != pDestBufEnd)
                    {
                        *pDestBufPtr++ = 0x0F; /* SI */
                        eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                    }
                    else
                        goto no_output;
                }
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = (sal_Char) nChar;
                else
                    goto no_output;
            }
            else
            {
                sal_uInt16 nBytes = 0;
                sal_uInt32 nIndex1 = nChar >> 8;
                /* Only characters up to 0xFFFF are looked up; combined
                   surrogate pairs fall through with nBytes == 0. */
                if (nIndex1 < 0x100)
                {
                    sal_uInt32 nIndex2 = nChar & 0xFF;
                    sal_uInt32 nFirst = pKsX1001Data[nIndex1].mnLowStart;
                    if (nIndex2 >= nFirst
                        && nIndex2 <= pKsX1001Data[nIndex1].mnLowEnd)
                        nBytes = pKsX1001Data[nIndex1].
                                     mpToUniTrailTab[nIndex2 - nFirst];
                }
                if (nBytes != 0)
                {
                    if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII)
                    {
                        if (pDestBufPtr != pDestBufEnd)
                        {
                            *pDestBufPtr++ = 0x0E; /* SO */
                            eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_1001;
                        }
                        else
                            goto no_output;
                    }
                    if (pDestBufEnd - pDestBufPtr >= 2)
                    {
                        /* Mask off the high bit to get the 7-bit form. */
                        *pDestBufPtr++ = (sal_Char) ((nBytes >> 8) & 0x7F);
                        *pDestBufPtr++ = (sal_Char) (nBytes & 0x7F);
                    }
                    else
                        goto no_output;
                }
                else
                    goto bad_input; /* no KS X 1001 mapping */
            }
            nHighSurrogate = 0;
            continue;

        bad_input:
            /* The SI string is handed to the handler with length 1 unless
               the output is already in ASCII mode; presumably the handler
               writes it in front of any replacement -- TODO confirm. */
            switch (ImplHandleBadInputUnicodeToTextConversion(
                        bUndefined,
                        nChar,
                        nFlags,
                        &pDestBufPtr,
                        pDestBufEnd,
                        &nInfo,
                        "\x0F", /* SI */
                        eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 0 : 1,
                        &bWritten))
            {
            case IMPL_BAD_INPUT_STOP:
                nHighSurrogate = 0;
                break;

            case IMPL_BAD_INPUT_CONTINUE:
                if (bWritten)
                    eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                nHighSurrogate = 0;
                continue;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                goto no_output;
            }
            break; /* leave the loop without counting the current unit */

        no_output:
            /* Un-read the current unit; it is not counted in nConverted. */
            --pSrcBuf;
            nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
            break;
        }

    if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                      | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
            == 0)
    {
        sal_Bool bFlush = sal_True;
        if (nHighSurrogate != 0)
        {
            /* NOTE(review): this reports SRCBUFFERTOSMALL when FLUSH is set
               and treats the pending surrogate as bad input when it is not,
               which is the opposite of ImplConvertIso2022KrToUnicode in this
               file.  Left unchanged; confirm the intended semantics. */
            if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
                nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
            else
                switch (ImplHandleBadInputUnicodeToTextConversion(
                            sal_False,
                            0,
                            nFlags,
                            &pDestBufPtr,
                            pDestBufEnd,
                            &nInfo,
                            "\x0F", /* SI */
                            eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ?
                                0 : 1,
                            &bWritten))
                {
                case IMPL_BAD_INPUT_STOP:
                    nHighSurrogate = 0;
                    bFlush = sal_False;
                    break;

                case IMPL_BAD_INPUT_CONTINUE:
                    if (bWritten)
                        eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                    nHighSurrogate = 0;
                    break;

                case IMPL_BAD_INPUT_NO_OUTPUT:
                    nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                    break;
                }
        }
        /* On flush, leave the output in single-byte mode. */
        if (bFlush
            && eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
            && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            if (pDestBufPtr != pDestBufEnd)
            {
                *pDestBufPtr++ = 0x0F; /* SI */
                eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
            }
            else
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        }
    }

    if (pContext)
    {
        ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate
            = nHighSurrogate;
        ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet = eSet;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtChars)
        *pSrcCvtChars = nConverted;

    return pDestBufPtr - pDestBuf;
}