xref: /aoo41x/main/sal/textenc/convertiso2022kr.c (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "convertiso2022kr.h"
29 #include "context.h"
30 #include "converter.h"
31 #include "tenchelp.h"
32 #include "unichars.h"
33 #include "rtl/alloc.h"
34 #include "rtl/textcvt.h"
35 #include "sal/types.h"
36 
/*
 * Decoder states of the ISO-2022-KR to Unicode converter.
 *
 * The numeric order is significant: the converter tests
 * "state > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001" to detect that the input
 * ended in the middle of a two-byte character or an escape sequence.
 */
typedef enum
{
    /* SI mode: bytes are plain ASCII */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII = 0,
    /* SO mode: waiting for the first byte of a two-byte character */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001 = 1,
    /* SO mode: first byte seen, waiting for the second byte */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2 = 2,
    /* seen ESC */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC = 3,
    /* seen ESC $ */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR = 4,
    /* seen ESC $ ) */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN = 5
} ImplIso2022KrToUnicodeState;
46 
47 typedef struct
48 {
49     ImplIso2022KrToUnicodeState m_eState;
50     sal_uInt32 m_nRow;
51 } ImplIso2022KrToUnicodeContext;
52 
/*
 * Character set the Unicode to ISO-2022-KR encoder is currently shifted to.
 */
typedef enum
{
    /* nothing emitted yet; the ESC $ ) C header is still to be written */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE = 0,
    /* header written, SI (ASCII) active */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII = 1,
    /* SO (two-byte set) active */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_1001 = 2
} ImplUnicodeToIso2022KrSet;
59 
60 typedef struct
61 {
62     sal_Unicode m_nHighSurrogate;
63     ImplUnicodeToIso2022KrSet m_eSet;
64 } ImplUnicodeToIso2022KrContext;
65 
66 void * ImplCreateIso2022KrToUnicodeContext(void)
67 {
68     void * pContext
69         = rtl_allocateMemory(sizeof (ImplIso2022KrToUnicodeContext));
70     ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
71         = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
72     return pContext;
73 }
74 
75 void ImplResetIso2022KrToUnicodeContext(void * pContext)
76 {
77     if (pContext)
78         ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
79             = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
80 }
81 
82 sal_Size ImplConvertIso2022KrToUnicode(ImplTextConverterData const * pData,
83                                        void * pContext,
84                                        sal_Char const * pSrcBuf,
85                                        sal_Size nSrcBytes,
86                                        sal_Unicode * pDestBuf,
87                                        sal_Size nDestChars,
88                                        sal_uInt32 nFlags,
89                                        sal_uInt32 * pInfo,
90                                        sal_Size * pSrcCvtBytes)
91 {
92     ImplDBCSToUniLeadTab const * pKsX1001Data
93         = ((ImplIso2022KrConverterData const *) pData)->
94               m_pKsX1001ToUnicodeData;
95     ImplIso2022KrToUnicodeState eState
96         = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
97     sal_uInt32 nRow = 0;
98     sal_uInt32 nInfo = 0;
99     sal_Size nConverted = 0;
100     sal_Unicode * pDestBufPtr = pDestBuf;
101     sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
102 
103     if (pContext)
104     {
105         eState = ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState;
106         nRow = ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow;
107     }
108 
109     for (; nConverted < nSrcBytes; ++nConverted)
110     {
111         sal_Bool bUndefined = sal_True;
112         sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
113         switch (eState)
114         {
115         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII:
116             if (nChar == 0x0E) /* SO */
117                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
118             else if (nChar == 0x1B) /* ESC */
119                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC;
120             else if (nChar < 0x80)
121                 if (pDestBufPtr != pDestBufEnd)
122                     *pDestBufPtr++ = (sal_Unicode) nChar;
123                 else
124                     goto no_output;
125             else
126             {
127                 bUndefined = sal_False;
128                 goto bad_input;
129             }
130             break;
131 
132         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001:
133             if (nChar == 0x0F) /* SI */
134                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
135             else if (nChar >= 0x21 && nChar <= 0x7E)
136             {
137                 nRow = nChar + 0x80;
138                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2;
139             }
140             else
141             {
142                 bUndefined = sal_False;
143                 goto bad_input;
144             }
145             break;
146 
147         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2:
148             if (nChar >= 0x21 && nChar <= 0x7E)
149             {
150                 sal_uInt16 nUnicode = 0;
151                 sal_uInt32 nFirst = pKsX1001Data[nRow].mnTrailStart;
152                 nChar += 0x80;
153                 if (nChar >= nFirst && nChar <= pKsX1001Data[nRow].mnTrailEnd)
154                     nUnicode = pKsX1001Data[nRow].
155                                    mpToUniTrailTab[nChar - nFirst];
156                 if (nUnicode != 0)
157                     if (pDestBufPtr != pDestBufEnd)
158                     {
159                         *pDestBufPtr++ = (sal_Unicode) nUnicode;
160                         eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
161                     }
162                     else
163                         goto no_output;
164                 else
165                     goto bad_input;
166             }
167             else
168             {
169                 bUndefined = sal_False;
170                 goto bad_input;
171             }
172             break;
173 
174         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC:
175             if (nChar == 0x24) /* $ */
176                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR;
177             else
178             {
179                 bUndefined = sal_False;
180                 goto bad_input;
181             }
182             break;
183 
184         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR:
185             if (nChar == 0x29) /* ) */
186                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN;
187             else
188             {
189                 bUndefined = sal_False;
190                 goto bad_input;
191             }
192             break;
193 
194         case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN:
195             if (nChar == 0x43) /* C */
196                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
197             else
198             {
199                 bUndefined = sal_False;
200                 goto bad_input;
201             }
202             break;
203         }
204         continue;
205 
206     bad_input:
207         switch (ImplHandleBadInputTextToUnicodeConversion(
208                     bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
209                     &nInfo))
210         {
211         case IMPL_BAD_INPUT_STOP:
212             eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
213             break;
214 
215         case IMPL_BAD_INPUT_CONTINUE:
216             eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
217             continue;
218 
219         case IMPL_BAD_INPUT_NO_OUTPUT:
220             goto no_output;
221         }
222         break;
223 
224     no_output:
225         --pSrcBuf;
226         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
227         break;
228     }
229 
230     if (eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001
231         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
232                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
233                == 0)
234     {
235         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
236             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
237         else
238             switch (ImplHandleBadInputTextToUnicodeConversion(
239                         sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
240                         &nInfo))
241             {
242             case IMPL_BAD_INPUT_STOP:
243             case IMPL_BAD_INPUT_CONTINUE:
244                 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
245                 break;
246 
247             case IMPL_BAD_INPUT_NO_OUTPUT:
248                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
249                 break;
250             }
251     }
252 
253     if (pContext)
254     {
255         ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState = eState;
256         ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow = nRow;
257     }
258     if (pInfo)
259         *pInfo = nInfo;
260     if (pSrcCvtBytes)
261         *pSrcCvtBytes = nConverted;
262 
263     return pDestBufPtr - pDestBuf;
264 }
265 
266 void * ImplCreateUnicodeToIso2022KrContext(void)
267 {
268     void * pContext
269         = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022KrContext));
270     ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
271     ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
272         = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
273     return pContext;
274 }
275 
276 void ImplResetUnicodeToIso2022KrContext(void * pContext)
277 {
278     if (pContext)
279     {
280         ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
281         ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
282             = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
283     }
284 }
285 
286 sal_Size ImplConvertUnicodeToIso2022Kr(ImplTextConverterData const * pData,
287                                        void * pContext,
288                                        sal_Unicode const * pSrcBuf,
289                                        sal_Size nSrcChars,
290                                        sal_Char * pDestBuf,
291                                        sal_Size nDestBytes,
292                                        sal_uInt32 nFlags,
293                                        sal_uInt32 * pInfo,
294                                        sal_Size * pSrcCvtChars)
295 {
296     ImplUniToDBCSHighTab const * pKsX1001Data
297         = ((ImplIso2022KrConverterData const *) pData)->
298               m_pUnicodeToKsX1001Data;
299     sal_Unicode nHighSurrogate = 0;
300     ImplUnicodeToIso2022KrSet eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
301     sal_uInt32 nInfo = 0;
302     sal_Size nConverted = 0;
303     sal_Char * pDestBufPtr = pDestBuf;
304     sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
305     sal_Bool bWritten;
306 
307     if (pContext)
308     {
309         nHighSurrogate
310             = ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate;
311         eSet = ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet;
312     }
313 
314     if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE)
315     {
316         if (pDestBufEnd - pDestBufPtr >= 4)
317         {
318             *pDestBufPtr++ = 0x1B; /* ESC */
319             *pDestBufPtr++ = 0x24; /* $ */
320             *pDestBufPtr++ = 0x29; /* ) */
321             *pDestBufPtr++ = 0x43; /* C */
322             eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
323         }
324         else
325             nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
326     }
327 
328     if ((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0)
329         for (; nConverted < nSrcChars; ++nConverted)
330         {
331             sal_Bool bUndefined = sal_True;
332             sal_uInt32 nChar = *pSrcBuf++;
333             if (nHighSurrogate == 0)
334             {
335                 if (ImplIsHighSurrogate(nChar))
336                 {
337                     nHighSurrogate = (sal_Unicode) nChar;
338                     continue;
339                 }
340             }
341             else if (ImplIsLowSurrogate(nChar))
342                 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
343             else
344             {
345                 bUndefined = sal_False;
346                 goto bad_input;
347             }
348 
349             if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
350             {
351                 bUndefined = sal_False;
352                 goto bad_input;
353             }
354 
355             if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */
356             {
357                 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
358                 {
359                     if (pDestBufPtr != pDestBufEnd)
360                     {
361                         *pDestBufPtr++ = 0x0F; /* SI */
362                         eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
363                     }
364                     else
365                         goto no_output;
366                 }
367                 if (pDestBufPtr != pDestBufEnd)
368                     *pDestBufPtr++ = (sal_Char) nChar;
369                 else
370                     goto no_output;
371             }
372             else if (nChar == 0x0E || nChar == 0x0F || nChar == 0x1B)
373                 goto bad_input;
374             else if (nChar < 0x80)
375             {
376                 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
377                 {
378                     if (pDestBufPtr != pDestBufEnd)
379                     {
380                         *pDestBufPtr++ = 0x0F; /* SI */
381                         eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
382                     }
383                     else
384                         goto no_output;
385                 }
386                 if (pDestBufPtr != pDestBufEnd)
387                     *pDestBufPtr++ = (sal_Char) nChar;
388                 else
389                     goto no_output;
390             }
391             else
392             {
393                 sal_uInt16 nBytes = 0;
394                 sal_uInt32 nIndex1 = nChar >> 8;
395                 if (nIndex1 < 0x100)
396                 {
397                     sal_uInt32 nIndex2 = nChar & 0xFF;
398                     sal_uInt32 nFirst = pKsX1001Data[nIndex1].mnLowStart;
399                     if (nIndex2 >= nFirst
400                         && nIndex2 <= pKsX1001Data[nIndex1].mnLowEnd)
401                         nBytes = pKsX1001Data[nIndex1].
402                                      mpToUniTrailTab[nIndex2 - nFirst];
403                 }
404                 if (nBytes != 0)
405                 {
406                     if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII)
407                     {
408                         if (pDestBufPtr != pDestBufEnd)
409                         {
410                             *pDestBufPtr++ = 0x0E; /* SO */
411                             eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_1001;
412                         }
413                         else
414                             goto no_output;
415                     }
416                     if (pDestBufEnd - pDestBufPtr >= 2)
417                     {
418                         *pDestBufPtr++ = (sal_Char) ((nBytes >> 8) & 0x7F);
419                         *pDestBufPtr++ = (sal_Char) (nBytes & 0x7F);
420                     }
421                     else
422                         goto no_output;
423                 }
424                 else
425                     goto bad_input;
426             }
427             nHighSurrogate = 0;
428             continue;
429 
430         bad_input:
431             switch (ImplHandleBadInputUnicodeToTextConversion(
432                         bUndefined,
433                         nChar,
434                         nFlags,
435                         &pDestBufPtr,
436                         pDestBufEnd,
437                         &nInfo,
438                         "\x0F", /* SI */
439                         eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 0 : 1,
440                         &bWritten))
441             {
442             case IMPL_BAD_INPUT_STOP:
443                 nHighSurrogate = 0;
444                 break;
445 
446             case IMPL_BAD_INPUT_CONTINUE:
447                 if (bWritten)
448                     eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
449                 nHighSurrogate = 0;
450                 continue;
451 
452             case IMPL_BAD_INPUT_NO_OUTPUT:
453                 goto no_output;
454             }
455             break;
456 
457         no_output:
458             --pSrcBuf;
459             nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
460             break;
461         }
462 
463     if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
464                       | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
465             == 0)
466     {
467         sal_Bool bFlush = sal_True;
468         if (nHighSurrogate != 0)
469         {
470             if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
471                 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
472             else
473                 switch (ImplHandleBadInputUnicodeToTextConversion(
474                             sal_False,
475                             0,
476                             nFlags,
477                             &pDestBufPtr,
478                             pDestBufEnd,
479                             &nInfo,
480                             "\x0F", /* SI */
481                             eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ?
482                                 0 : 1,
483                             &bWritten))
484                 {
485                 case IMPL_BAD_INPUT_STOP:
486                     nHighSurrogate = 0;
487                     bFlush = sal_False;
488                     break;
489 
490                 case IMPL_BAD_INPUT_CONTINUE:
491                     if (bWritten)
492                         eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
493                     nHighSurrogate = 0;
494                     break;
495 
496                 case IMPL_BAD_INPUT_NO_OUTPUT:
497                     nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
498                     break;
499                 }
500         }
501         if (bFlush
502             && eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
503             && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
504         {
505             if (pDestBufPtr != pDestBufEnd)
506             {
507                 *pDestBufPtr++ = 0x0F; /* SI */
508                 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
509             }
510             else
511                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
512         }
513     }
514 
515     if (pContext)
516     {
517         ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate
518             = nHighSurrogate;
519         ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet = eSet;
520     }
521     if (pInfo)
522         *pInfo = nInfo;
523     if (pSrcCvtChars)
524         *pSrcCvtChars = nConverted;
525 
526     return pDestBufPtr - pDestBuf;
527 }
528