xref: /trunk/main/sal/textenc/convertiso2022jp.c (revision 647f063d)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "convertiso2022jp.h"
25 #include "context.h"
26 #include "converter.h"
27 #include "tenchelp.h"
28 #include "unichars.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
31 #include "sal/types.h"
32 
/* Decoder states used by ImplConvertIso2022JpToUnicode.

   The order matters: after its main loop the converter treats every state
   that compares greater than IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208 as
   "input ended in the middle of a two-byte character or an escape
   sequence". */
typedef enum /* order is important: */
{
    /* Single-byte mode selected by ESC ( B (also the initial state). */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII,
    /* Single-byte mode selected by ESC ( J; 0x5C and 0x7E are remapped. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN,
    /* Two-byte mode selected by ESC $ @ or ESC $ B; expecting a first byte. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208,
    /* Two-byte mode; first byte stored, expecting the second byte. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2,
    /* ESC has been read. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC,
    /* ESC ( has been read. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN,
    /* ESC $ has been read. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR
} ImplIso2022JpToUnicodeState;
43 
/* State carried between successive calls of ImplConvertIso2022JpToUnicode. */
typedef struct
{
    /* Decoder state at the end of the previous call. */
    ImplIso2022JpToUnicodeState m_eState;
    /* First byte of a two-byte character whose second byte is still
       outstanding; only consulted in the ..._STATE_0208_2 state. */
    sal_uInt32 m_nRow;
} ImplIso2022JpToUnicodeContext;
49 
/* State carried between successive calls of ImplConvertUnicodeToIso2022Jp. */
typedef struct
{
    /* High surrogate read at the end of the previous call whose low
       surrogate has not been seen yet, or 0 if there is none. */
    sal_Unicode m_nHighSurrogate;
    /* sal_True while the output is in two-byte mode (ESC $ B has been
       written and not yet undone by ESC ( B). */
    sal_Bool m_b0208;
} ImplUnicodeToIso2022JpContext;
55 
ImplCreateIso2022JpToUnicodeContext(void)56 void * ImplCreateIso2022JpToUnicodeContext(void)
57 {
58     void * pContext
59         = rtl_allocateMemory(sizeof (ImplIso2022JpToUnicodeContext));
60     ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState
61         = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
62     return pContext;
63 }
64 
ImplResetIso2022JpToUnicodeContext(void * pContext)65 void ImplResetIso2022JpToUnicodeContext(void * pContext)
66 {
67     if (pContext)
68         ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState
69             = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
70 }
71 
/* Convert ISO-2022-JP bytes to sal_Unicode code units.

   pData        is cast to ImplIso2022JpConverterData; its
                m_pJisX0208ToUnicodeData table is used for two-byte
                characters.
   pContext     if non-null, an ImplIso2022JpToUnicodeContext from which the
                decoder state and pending first byte are loaded on entry and
                to which they are stored on exit.  If null, conversion
                starts in the ASCII state.
   pSrcBuf      source bytes.
   nSrcBytes    number of source bytes available.
   pDestBuf     destination buffer.
   nDestChars   capacity of pDestBuf in sal_Unicode units.
   nFlags       RTL_TEXTTOUNICODE_FLAGS_* bits; handed to
                ImplHandleBadInputTextToUnicodeConversion and tested here
                for RTL_TEXTTOUNICODE_FLAGS_FLUSH.
   pInfo        if non-null, receives the accumulated
                RTL_TEXTTOUNICODE_INFO_* bits.
   pSrcCvtBytes if non-null, receives the number of source bytes consumed.

   Returns the number of sal_Unicode units written to pDestBuf. */
sal_Size ImplConvertIso2022JpToUnicode(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Char const * pSrcBuf,
                                       sal_Size nSrcBytes,
                                       sal_Unicode * pDestBuf,
                                       sal_Size nDestChars,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtBytes)
{
    ImplDBCSToUniLeadTab const * pJisX0208Data
        = ((ImplIso2022JpConverterData const *) pData)->
              m_pJisX0208ToUnicodeData;
    ImplIso2022JpToUnicodeState eState
        = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
    sal_uInt32 nRow = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode * pDestBufPtr = pDestBuf;
    sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;

    /* Resume where the previous call left off: */
    if (pContext)
    {
        eState = ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState;
        nRow = ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow;
    }

    for (; nConverted < nSrcBytes; ++nConverted)
    {
        /* First argument for the bad-input helper; cleared on every path
           that rejects a byte as out of place, left set only when a
           well-formed two-byte code has no table entry.  NOTE(review):
           presumably selects "undefined" versus "invalid" handling in the
           helper -- confirm in tenchelp. */
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
        switch (eState)
        {
        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII:
            if (nChar == 0x1B) /* ESC */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
            else if (nChar < 0x80)
                /* Seven-bit bytes map one-to-one: */
                if (pDestBufPtr != pDestBufEnd)
                    *pDestBufPtr++ = (sal_Unicode) nChar;
                else
                    goto no_output;
            else
            {
                /* Bytes with the high bit set are never valid here: */
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN:
            if (nChar == 0x1B) /* ESC */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
            else if (nChar < 0x80)
                if (pDestBufPtr != pDestBufEnd)
                {
                    /* Same as ASCII except for two code points: */
                    switch (nChar)
                    {
                    case 0x5C: /* \ */
                        nChar = 0xA5; /* YEN SIGN */
                        break;

                    case 0x7E: /* ~ */
                        nChar = 0xAF; /* MACRON */
                        break;
                    }
                    *pDestBufPtr++ = (sal_Unicode) nChar;
                }
                else
                    goto no_output;
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208:
            if (nChar == 0x1B) /* ESC */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
            else if (nChar >= 0x21 && nChar <= 0x7E)
            {
                /* Remember the first byte; nothing is written until the
                   second byte arrives: */
                nRow = nChar;
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2:
            if (nChar >= 0x21 && nChar <= 0x7E)
            {
                /* Look the pair up: the table is indexed by first byte, and
                   each entry covers second bytes mnTrailStart..mnTrailEnd.
                   A result of 0 means "no mapping". */
                sal_uInt16 nUnicode = 0;
                sal_uInt32 nFirst = pJisX0208Data[nRow].mnTrailStart;
                if (nChar >= nFirst
                    && nChar <= pJisX0208Data[nRow].mnTrailEnd)
                    nUnicode = pJisX0208Data[nRow].
                                   mpToUniTrailTab[nChar - nFirst];
                if (nUnicode != 0)
                    if (pDestBufPtr != pDestBufEnd)
                    {
                        *pDestBufPtr++ = (sal_Unicode) nUnicode;
                        eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
                    }
                    else
                        /* State stays ..._0208_2 so that the second byte is
                           retried on the next call: */
                        goto no_output;
                else
                    /* Well-formed pair without a mapping; bUndefined is
                       still sal_True here: */
                    goto bad_input;
            }
            else
            {
                /* Anything else, including ESC, is out of place after a
                   first byte: */
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC:
            switch (nChar)
            {
            case 0x24: /* $ */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR;
                break;

            case 0x28: /* ( */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN;
                break;

            default:
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN:
            switch (nChar)
            {
            case 0x42: /* B */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
                break;

            case 0x4A: /* J */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN;
                break;

            default:
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR:
            switch (nChar)
            {
            /* Both final bytes select the same two-byte state: */
            case 0x40: /* @ */
            case 0x42: /* B */
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
                break;

            default:
                bUndefined = sal_False;
                goto bad_input;
            }
            break;
        }
        continue;

    bad_input:
        /* The helper decides, based on nFlags, whether to stop, to carry on
           (possibly after writing something to the destination), or to
           report that the destination is full: */
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                    &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            /* Leaves the loop through the break below, without counting the
               offending byte in nConverted: */
            eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            /* The offending byte is counted as consumed, and decoding
               restarts in the ASCII state: */
            eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Un-read the current byte; leaving via break skips the loop's
           ++nConverted, so the byte is not reported as consumed: */
        --pSrcBuf;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* Input ran out in the middle of a two-byte character or an escape
       sequence (this relies on the order of the state enumeration): */
    if (eState > IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
            /* More input may follow; the partial sequence stays in the
               context: */
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        else
            /* Flushing: the truncated sequence is bad input: */
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                        &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
    }

    /* Save the state for the next call and report the results: */
    if (pContext)
    {
        ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState = eState;
        ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow = nRow;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtBytes)
        *pSrcCvtBytes = nConverted;

    return pDestBufPtr - pDestBuf;
}
298 
ImplCreateUnicodeToIso2022JpContext(void)299 void * ImplCreateUnicodeToIso2022JpContext(void)
300 {
301     void * pContext
302         = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022JpContext));
303     ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0;
304     ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False;
305     return pContext;
306 }
307 
ImplResetUnicodeToIso2022JpContext(void * pContext)308 void ImplResetUnicodeToIso2022JpContext(void * pContext)
309 {
310     if (pContext)
311     {
312         ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0;
313         ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False;
314     }
315 }
316 
/* Convert sal_Unicode code units to ISO-2022-JP bytes.

   Characters below 0x80 are written as single bytes after switching to
   ASCII with ESC ( B if necessary; other characters found in the table are
   written as two bytes after switching with ESC $ B if necessary.

   pData        is cast to ImplIso2022JpConverterData; its
                m_pUnicodeToJisX0208Data table is used for characters of
                0x80 and above.
   pContext     if non-null, an ImplUnicodeToIso2022JpContext from which the
                pending high surrogate and the two-byte-mode flag are loaded
                on entry and to which they are stored on exit.
   pSrcBuf      source code units.
   nSrcChars    number of source code units available.
   pDestBuf     destination buffer.
   nDestBytes   capacity of pDestBuf in bytes.
   nFlags       RTL_UNICODETOTEXT_FLAGS_* bits; handed to
                ImplHandleBadInputUnicodeToTextConversion and tested here
                for RTL_UNICODETOTEXT_FLAGS_FLUSH.
   pInfo        if non-null, receives the accumulated
                RTL_UNICODETOTEXT_INFO_* bits.
   pSrcCvtChars if non-null, receives the number of source code units
                consumed.

   Returns the number of bytes written to pDestBuf. */
sal_Size ImplConvertUnicodeToIso2022Jp(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Unicode const * pSrcBuf,
                                       sal_Size nSrcChars,
                                       sal_Char * pDestBuf,
                                       sal_Size nDestBytes,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtChars)
{
    ImplUniToDBCSHighTab const * pJisX0208Data
        = ((ImplIso2022JpConverterData const *) pData)->
              m_pUnicodeToJisX0208Data;
    sal_Unicode nHighSurrogate = 0;
    sal_Bool b0208 = sal_False;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Char * pDestBufPtr = pDestBuf;
    sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
    sal_Bool bWritten;

    /* Resume where the previous call left off: */
    if (pContext)
    {
        nHighSurrogate
            = ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate;
        b0208 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208;
    }

    for (; nConverted < nSrcChars; ++nConverted)
    {
        /* First argument for the bad-input helper; cleared on the paths
           that reject malformed surrogate use or noncharacters.
           NOTE(review): presumably selects "undefined" versus "invalid"
           handling in the helper -- confirm in tenchelp. */
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = *pSrcBuf++;
        if (nHighSurrogate == 0)
        {
            if (ImplIsHighSurrogate(nChar))
            {
                /* Hold the high surrogate back until its partner is seen: */
                nHighSurrogate = (sal_Unicode) nChar;
                continue;
            }
        }
        else if (ImplIsLowSurrogate(nChar))
            nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
        else
        {
            /* A high surrogate was not followed by a low surrogate: */
            bUndefined = sal_False;
            goto bad_input;
        }

        /* A low surrogate without a preceding high one, or a
           noncharacter: */
        if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
        {
            bUndefined = sal_False;
            goto bad_input;
        }

        if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */
        {
            /* Leave two-byte mode before writing the line end: */
            if (b0208)
            {
                if (pDestBufEnd - pDestBufPtr >= 3)
                {
                    *pDestBufPtr++ = 0x1B; /* ESC */
                    *pDestBufPtr++ = 0x28; /* ( */
                    *pDestBufPtr++ = 0x42; /* B */
                    b0208 = sal_False;
                }
                else
                    goto no_output;
            }
            if (pDestBufPtr != pDestBufEnd)
                *pDestBufPtr++ = (sal_Char) nChar;
            else
                goto no_output;
        }
        else if (nChar == 0x1B)
            /* ESC itself is never written as data.  NOTE(review):
               presumably because it would read back as the start of an
               escape sequence. */
            goto bad_input;
        else if (nChar < 0x80)
        {
            /* Seven-bit character: make sure ASCII is selected, then write
               one byte: */
            if (b0208)
            {
                if (pDestBufEnd - pDestBufPtr >= 3)
                {
                    *pDestBufPtr++ = 0x1B; /* ESC */
                    *pDestBufPtr++ = 0x28; /* ( */
                    *pDestBufPtr++ = 0x42; /* B */
                    b0208 = sal_False;
                }
                else
                    goto no_output;
            }
            if (pDestBufPtr != pDestBufEnd)
                *pDestBufPtr++ = (sal_Char) nChar;
            else
                goto no_output;
        }
        else
        {
            /* Look the character up: the table is indexed by the high byte
               of the code point, and each entry covers low bytes
               mnLowStart..mnLowEnd.  nBytes == 0 means "no mapping".
               Code points above 0xFFFF fail the nIndex1 test and are
               therefore unmapped. */
            sal_uInt16 nBytes = 0;
            sal_uInt32 nIndex1 = nChar >> 8;
            if (nIndex1 < 0x100)
            {
                sal_uInt32 nIndex2 = nChar & 0xFF;
                sal_uInt32 nFirst = pJisX0208Data[nIndex1].mnLowStart;
                if (nIndex2 >= nFirst
                    && nIndex2 <= pJisX0208Data[nIndex1].mnLowEnd)
                {
                    nBytes = pJisX0208Data[nIndex1].
                                 mpToUniTrailTab[nIndex2 - nFirst];
                    if (nBytes == 0)
                        /* For some reason, the tables in tcvtjp4.tab do not
                           include these two conversions: */
                        switch (nChar)
                        {
                        case 0xA5: /* YEN SIGN */
                            nBytes = 0x216F;
                            break;

                        case 0xAF: /* MACRON */
                            nBytes = 0x2131;
                            break;
                        }
                }
            }
            if (nBytes != 0)
            {
                /* Make sure two-byte mode is selected, then write the two
                   bytes.  If only the escape sequence fits, it is written,
                   b0208 is updated, and the character is retried on the
                   next call: */
                if (!b0208)
                {
                    if (pDestBufEnd - pDestBufPtr >= 3)
                    {
                        *pDestBufPtr++ = 0x1B; /* ESC */
                        *pDestBufPtr++ = 0x24; /* $ */
                        *pDestBufPtr++ = 0x42; /* B */
                        b0208 = sal_True;
                    }
                    else
                        goto no_output;
                }
                if (pDestBufEnd - pDestBufPtr >= 2)
                {
                    *pDestBufPtr++ = (sal_Char) (nBytes >> 8);
                    *pDestBufPtr++ = (sal_Char) (nBytes & 0xFF);
                }
                else
                    goto no_output;
            }
            else
                goto bad_input;
        }
        /* The character (and any surrogate pair) is done: */
        nHighSurrogate = 0;
        continue;

    bad_input:
        /* The helper is given the ESC ( B sequence, with length 3 only
           while in two-byte mode, and reports through bWritten whether it
           wrote anything: */
        switch (ImplHandleBadInputUnicodeToTextConversion(
                    bUndefined,
                    nChar,
                    nFlags,
                    &pDestBufPtr,
                    pDestBufEnd,
                    &nInfo,
                    "\x1B(B",
                    b0208 ? 3 : 0,
                    &bWritten))
        {
        case IMPL_BAD_INPUT_STOP:
            /* Leaves the loop through the break below, without counting the
               offending code unit in nConverted: */
            nHighSurrogate = 0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            /* If the helper produced output, the output is no longer in
               two-byte mode: */
            if (bWritten)
                b0208 = sal_False;
            nHighSurrogate = 0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        /* Un-read the current code unit; leaving via break skips the loop's
           ++nConverted.  nHighSurrogate is left as it is, so a held-back
           high surrogate is combined again on the next call: */
        --pSrcBuf;
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* End-of-input handling, only if the loop finished without an error
       or a full destination buffer: */
    if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                      | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
            == 0)
    {
        sal_Bool bFlush = sal_True;
        if (nHighSurrogate != 0)
        {
            /* NOTE(review): with FLUSH set, a dangling high surrogate is
               reported as SRCBUFFERTOSMALL and kept; without FLUSH it is
               treated as bad input.  ImplConvertIso2022JpToUnicode in this
               file tests its FLUSH flag the other way round (== 0) for the
               corresponding truncated-input case.  Confirm against the
               rtl_convertUnicodeToText contract whether this is intended
               before changing it. */
            if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
                nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
            else
                switch (ImplHandleBadInputUnicodeToTextConversion(
                            sal_False,
                            0,
                            nFlags,
                            &pDestBufPtr,
                            pDestBufEnd,
                            &nInfo,
                            "\x1B(B",
                            b0208 ? 3 : 0,
                            &bWritten))
                {
                case IMPL_BAD_INPUT_STOP:
                    nHighSurrogate = 0;
                    bFlush = sal_False;
                    break;

                case IMPL_BAD_INPUT_CONTINUE:
                    if (bWritten)
                        b0208 = sal_False;
                    nHighSurrogate = 0;
                    break;

                case IMPL_BAD_INPUT_NO_OUTPUT:
                    nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                    break;
                }
        }
        /* When flushing, finish in ASCII so that the output does not end in
           two-byte mode: */
        if (bFlush
            && b0208
            && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            if (pDestBufEnd - pDestBufPtr >= 3)
            {
                *pDestBufPtr++ = 0x1B; /* ESC */
                *pDestBufPtr++ = 0x28; /* ( */
                *pDestBufPtr++ = 0x42; /* B */
                b0208 = sal_False;
            }
            else
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        }
    }

    /* Save the state for the next call and report the results: */
    if (pContext)
    {
        ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate
            = nHighSurrogate;
        ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = b0208;
    }
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtChars)
        *pSrcCvtChars = nConverted;

    return pDestBufPtr - pDestBuf;
}
566