1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include "convertiso2022kr.h"
25 #include "context.h"
26 #include "converter.h"
27 #include "tenchelp.h"
28 #include "unichars.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
31 #include "sal/types.h"
32
/*
 * Decoder states of the ISO-2022-KR -> Unicode converter.
 *
 * The numeric order matters: after the input is exhausted the converter
 * tests `eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001` to find out whether
 * it stopped in the middle of a double-byte character or of an escape
 * sequence, so every "incomplete input" state must sort after the two
 * resting states (ASCII and 1001).
 */
typedef enum
{
    /* Resting state: single bytes below 0x80 are copied through. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII = 0,
    /* Resting state after SO (0x0E): waiting for a lead byte or SI (0x0F). */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001 = 1,
    /* A lead byte has been read; waiting for the trail byte. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2 = 2,
    /* ESC (0x1B) has been read; waiting for '$'. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC = 3,
    /* ESC '$' has been read; waiting for ')'. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR = 4,
    /* ESC '$' ')' has been read; waiting for 'C'. */
    IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN = 5
} ImplIso2022KrToUnicodeState;
42
43 typedef struct
44 {
45 ImplIso2022KrToUnicodeState m_eState;
46 sal_uInt32 m_nRow;
47 } ImplIso2022KrToUnicodeContext;
48
/*
 * Character set currently selected in the output stream of the
 * Unicode -> ISO-2022-KR converter.
 */
typedef enum
{
    /* Nothing written yet: the ESC $ ) C header is still outstanding. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE = 0,
    /* Header written, shifted in (SI): single-byte output. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII = 1,
    /* Shifted out (SO): double-byte output. */
    IMPL_UNICODE_TO_ISO_2022_KR_SET_1001 = 2
} ImplUnicodeToIso2022KrSet;
55
56 typedef struct
57 {
58 sal_Unicode m_nHighSurrogate;
59 ImplUnicodeToIso2022KrSet m_eSet;
60 } ImplUnicodeToIso2022KrContext;
61
ImplCreateIso2022KrToUnicodeContext(void)62 void * ImplCreateIso2022KrToUnicodeContext(void)
63 {
64 void * pContext
65 = rtl_allocateMemory(sizeof (ImplIso2022KrToUnicodeContext));
66 ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
67 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
68 return pContext;
69 }
70
ImplResetIso2022KrToUnicodeContext(void * pContext)71 void ImplResetIso2022KrToUnicodeContext(void * pContext)
72 {
73 if (pContext)
74 ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState
75 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
76 }
77
/*
 * Convert a run of ISO-2022-KR bytes to UTF-16 code units.
 *
 * pData         converter data; treated as ImplIso2022KrConverterData and
 *               used for its m_pKsX1001ToUnicodeData lead-byte table.
 * pContext      optional ImplIso2022KrToUnicodeContext.  When non-null the
 *               decoder state and pending lead byte are loaded from it on
 *               entry and stored back on exit; when null the conversion
 *               starts in the ASCII state.
 * pSrcBuf       source bytes.
 * nSrcBytes     number of source bytes.
 * pDestBuf      destination buffer.
 * nDestChars    capacity of the destination buffer in sal_Unicode units.
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; forwarded to the bad-input
 *               handler and tested for RTL_TEXTTOUNICODE_FLAGS_FLUSH.
 * pInfo         optional; receives the RTL_TEXTTOUNICODE_INFO_* bits.
 * pSrcCvtBytes  optional; receives the number of source bytes consumed.
 *
 * Returns the number of sal_Unicode units written to pDestBuf.
 */
sal_Size ImplConvertIso2022KrToUnicode(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Char const * pSrcBuf,
                                       sal_Size nSrcBytes,
                                       sal_Unicode * pDestBuf,
                                       sal_Size nDestChars,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtBytes)
{
    ImplDBCSToUniLeadTab const * pLeadTab
        = ((ImplIso2022KrConverterData const *) pData)->
              m_pKsX1001ToUnicodeData;
    ImplIso2022KrToUnicodeContext * pCtx
        = (ImplIso2022KrToUnicodeContext *) pContext;
    sal_uChar const * pIn = (sal_uChar const *) pSrcBuf;
    ImplIso2022KrToUnicodeState eState
        = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
    sal_uInt32 nRow = 0;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode * pOut = pDestBuf;
    sal_Unicode * pOutEnd = pDestBuf + nDestChars;

    /* Resume where the previous call stopped. */
    if (pCtx)
    {
        eState = pCtx->m_eState;
        nRow = pCtx->m_nRow;
    }

    /* nConverted is only advanced for bytes that were fully dealt with;
       leaving the loop with `break` keeps the current byte unconsumed. */
    for (; nConverted < nSrcBytes; ++nConverted)
    {
        sal_Bool bUndefined = sal_True;
        sal_uInt32 nChar = pIn[nConverted];

        switch (eState)
        {
        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII:
            if (nChar == 0x0E) /* SO */
            {
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
            }
            else if (nChar == 0x1B) /* ESC */
            {
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC;
            }
            else if (nChar >= 0x80)
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            else if (pOut == pOutEnd)
            {
                goto no_output;
            }
            else
            {
                *pOut++ = (sal_Unicode) nChar;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001:
            if (nChar == 0x0F) /* SI */
            {
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            }
            else if (nChar >= 0x21 && nChar <= 0x7E)
            {
                /* Lead byte: remember it (with the high bit set, as the
                   table is indexed that way) and wait for the trail byte. */
                nRow = nChar + 0x80;
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2;
            }
            else
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2:
            if (nChar < 0x21 || nChar > 0x7E)
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            else
            {
                sal_uInt16 nUnicode = 0;
                sal_uInt32 nFirst = pLeadTab[nRow].mnTrailStart;

                nChar += 0x80;
                if (nChar >= nFirst && nChar <= pLeadTab[nRow].mnTrailEnd)
                {
                    nUnicode
                        = pLeadTab[nRow].mpToUniTrailTab[nChar - nFirst];
                }
                if (nUnicode == 0)
                {
                    /* Well-formed byte pair without a table entry;
                       bUndefined is still sal_True here. */
                    goto bad_input;
                }
                if (pOut == pOutEnd)
                {
                    goto no_output;
                }
                *pOut++ = (sal_Unicode) nUnicode;
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
            }
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC:
            if (nChar != 0x24) /* $ */
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR;
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR:
            if (nChar != 0x29) /* ) */
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN;
            break;

        case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN:
            if (nChar != 0x43) /* C */
            {
                bUndefined = sal_False;
                goto bad_input;
            }
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            break;
        }
        continue;

    bad_input:
        /* The handler's return value decides whether the conversion stops,
           carries on with the next byte, or ran out of destination space. */
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pOut, pOutEnd, &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        break;

    no_output:
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The input ended inside a double-byte character or an escape sequence
       (all such states sort after ..._STATE_1001) and nothing else went
       wrong: either ask for more input or, when flushing, treat the
       truncated sequence as bad input. */
    if (eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0)
        {
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pOut, pOutEnd,
                        &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
        else
        {
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        }
    }

    /* Save the state for the next call and report the results. */
    if (pCtx)
    {
        pCtx->m_eState = eState;
        pCtx->m_nRow = nRow;
    }
    if (pInfo)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtBytes)
    {
        *pSrcCvtBytes = nConverted;
    }

    return pOut - pDestBuf;
}
261
ImplCreateUnicodeToIso2022KrContext(void)262 void * ImplCreateUnicodeToIso2022KrContext(void)
263 {
264 void * pContext
265 = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022KrContext));
266 ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
267 ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
268 = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
269 return pContext;
270 }
271
ImplResetUnicodeToIso2022KrContext(void * pContext)272 void ImplResetUnicodeToIso2022KrContext(void * pContext)
273 {
274 if (pContext)
275 {
276 ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0;
277 ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet
278 = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
279 }
280 }
281
/*
 * Convert a run of UTF-16 code units to ISO-2022-KR bytes.
 *
 * pData         converter data; treated as ImplIso2022KrConverterData and
 *               used for its m_pUnicodeToKsX1001Data table.
 * pContext      optional ImplUnicodeToIso2022KrContext.  When non-null the
 *               pending high surrogate and the selected character set are
 *               loaded from it on entry and stored back on exit.
 * pSrcBuf       source UTF-16 code units.
 * nSrcChars     number of source code units.
 * pDestBuf      destination buffer.
 * nDestBytes    capacity of the destination buffer in bytes.
 * nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; forwarded to the bad-input
 *               handler and tested for RTL_UNICODETOTEXT_FLAGS_FLUSH.
 * pInfo         optional; receives the RTL_UNICODETOTEXT_INFO_* bits.
 * pSrcCvtChars  optional; receives the number of source units consumed.
 *
 * Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToIso2022Kr(ImplTextConverterData const * pData,
                                       void * pContext,
                                       sal_Unicode const * pSrcBuf,
                                       sal_Size nSrcChars,
                                       sal_Char * pDestBuf,
                                       sal_Size nDestBytes,
                                       sal_uInt32 nFlags,
                                       sal_uInt32 * pInfo,
                                       sal_Size * pSrcCvtChars)
{
    ImplUniToDBCSHighTab const * pHighTab
        = ((ImplIso2022KrConverterData const *) pData)->
              m_pUnicodeToKsX1001Data;
    ImplUnicodeToIso2022KrContext * pCtx
        = (ImplUnicodeToIso2022KrContext *) pContext;
    sal_Unicode nHighSurrogate = 0;
    ImplUnicodeToIso2022KrSet eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Char * pOut = pDestBuf;
    sal_Char * pOutEnd = pDestBuf + nDestBytes;
    sal_Bool bWritten;

    /* Resume where the previous call stopped. */
    if (pCtx)
    {
        nHighSurrogate = pCtx->m_nHighSurrogate;
        eSet = pCtx->m_eSet;
    }

    /* Nothing has been written to this stream yet: emit the four-byte
       designator first, or give up if it does not fit as a whole. */
    if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE)
    {
        if (pOutEnd - pOut < 4)
        {
            nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        }
        else
        {
            *pOut++ = 0x1B; /* ESC */
            *pOut++ = 0x24; /* $ */
            *pOut++ = 0x29; /* ) */
            *pOut++ = 0x43; /* C */
            eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
        }
    }

    if ((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0)
    {
        /* nConverted is only advanced for code units that were fully dealt
           with; leaving the loop with `break` keeps the current one
           unconsumed. */
        for (; nConverted < nSrcChars; ++nConverted)
        {
            sal_Bool bUndefined = sal_True;
            sal_uInt32 nChar = pSrcBuf[nConverted];

            if (nHighSurrogate == 0)
            {
                if (ImplIsHighSurrogate(nChar))
                {
                    /* Hold it back until the low surrogate arrives. */
                    nHighSurrogate = (sal_Unicode) nChar;
                    continue;
                }
            }
            else if (ImplIsLowSurrogate(nChar))
            {
                nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
            }
            else
            {
                /* Pending high surrogate not followed by a low one. */
                bUndefined = sal_False;
                goto bad_input;
            }

            if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
            {
                bUndefined = sal_False;
                goto bad_input;
            }

            if (nChar == 0x0E || nChar == 0x0F || nChar == 0x1B)
            {
                /* SO, SI and ESC would be read back as control functions,
                   so they are rejected (bUndefined is still sal_True). */
                goto bad_input;
            }
            else if (nChar < 0x80)
            {
                /* Single-byte character (this includes LF and CR): make
                   sure the stream is shifted in, then copy the byte. */
                if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
                {
                    if (pOut == pOutEnd)
                    {
                        goto no_output;
                    }
                    *pOut++ = 0x0F; /* SI */
                    eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                }
                if (pOut == pOutEnd)
                {
                    goto no_output;
                }
                *pOut++ = (sal_Char) nChar;
            }
            else
            {
                /* Look the character up by its high and low byte. */
                sal_uInt16 nBytes = 0;
                sal_uInt32 nIndex1 = nChar >> 8;

                if (nIndex1 < 0x100)
                {
                    sal_uInt32 nIndex2 = nChar & 0xFF;
                    sal_uInt32 nFirst = pHighTab[nIndex1].mnLowStart;

                    if (nIndex2 >= nFirst
                        && nIndex2 <= pHighTab[nIndex1].mnLowEnd)
                    {
                        nBytes = pHighTab[nIndex1].
                                     mpToUniTrailTab[nIndex2 - nFirst];
                    }
                }
                if (nBytes == 0)
                {
                    goto bad_input;
                }

                /* Make sure the stream is shifted out, then write the two
                   bytes with their high bits stripped. */
                if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII)
                {
                    if (pOut == pOutEnd)
                    {
                        goto no_output;
                    }
                    *pOut++ = 0x0E; /* SO */
                    eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_1001;
                }
                if (pOutEnd - pOut < 2)
                {
                    goto no_output;
                }
                *pOut++ = (sal_Char) ((nBytes >> 8) & 0x7F);
                *pOut++ = (sal_Char) (nBytes & 0x7F);
            }
            nHighSurrogate = 0;
            continue;

        bad_input:
            /* The handler is given an SI prefix to use when the stream is
               not currently in the ASCII set; bWritten tells whether it
               produced output, in which case the stream is back in ASCII. */
            switch (ImplHandleBadInputUnicodeToTextConversion(
                        bUndefined,
                        nChar,
                        nFlags,
                        &pOut,
                        pOutEnd,
                        &nInfo,
                        "\x0F", /* SI */
                        eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 0 : 1,
                        &bWritten))
            {
            case IMPL_BAD_INPUT_STOP:
                nHighSurrogate = 0;
                break;

            case IMPL_BAD_INPUT_CONTINUE:
                if (bWritten)
                {
                    eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                }
                nHighSurrogate = 0;
                continue;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                goto no_output;
            }
            break;

        no_output:
            nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
            break;
        }
    }

    if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                      | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
            == 0)
    {
        sal_Bool bFlush = sal_True;

        if (nHighSurrogate != 0)
        {
            /* The input ended with an unpaired high surrogate.
               NOTE(review): SRCBUFFERTOSMALL is reported when FLUSH *is*
               set, which is the opposite polarity of the text-to-Unicode
               direction in this file - kept as is, confirm intent. */
            if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
            {
                nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
            }
            else
            {
                switch (ImplHandleBadInputUnicodeToTextConversion(
                            sal_False,
                            0,
                            nFlags,
                            &pOut,
                            pOutEnd,
                            &nInfo,
                            "\x0F", /* SI */
                            eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ?
                                0 : 1,
                            &bWritten))
                {
                case IMPL_BAD_INPUT_STOP:
                    nHighSurrogate = 0;
                    bFlush = sal_False;
                    break;

                case IMPL_BAD_INPUT_CONTINUE:
                    if (bWritten)
                    {
                        eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
                    }
                    nHighSurrogate = 0;
                    break;

                case IMPL_BAD_INPUT_NO_OUTPUT:
                    nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                    break;
                }
            }
        }

        /* When flushing, leave the stream shifted in. */
        if (bFlush
            && eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
            && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            if (pOut == pOutEnd)
            {
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
            }
            else
            {
                *pOut++ = 0x0F; /* SI */
                eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
            }
        }
    }

    /* Save the state for the next call and report the results. */
    if (pCtx)
    {
        pCtx->m_nHighSurrogate = nHighSurrogate;
        pCtx->m_eSet = eSet;
    }
    if (pInfo)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtChars)
    {
        *pSrcCvtChars = nConverted;
    }

    return pOut - pDestBuf;
}
524