1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include "convertiso2022kr.h" 29 #include "context.h" 30 #include "converter.h" 31 #include "tenchelp.h" 32 #include "unichars.h" 33 #include "rtl/alloc.h" 34 #include "rtl/textcvt.h" 35 #include "sal/types.h" 36 37 typedef enum /* order is important: */ 38 { 39 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII, 40 IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001, 41 IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2, 42 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC, 43 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR, 44 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN 45 } ImplIso2022KrToUnicodeState; 46 47 typedef struct 48 { 49 ImplIso2022KrToUnicodeState m_eState; 50 sal_uInt32 m_nRow; 51 } ImplIso2022KrToUnicodeContext; 52 53 typedef enum 54 { 55 IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE, 56 IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII, 57 IMPL_UNICODE_TO_ISO_2022_KR_SET_1001 58 } ImplUnicodeToIso2022KrSet; 59 60 typedef struct 61 { 62 sal_Unicode m_nHighSurrogate; 63 ImplUnicodeToIso2022KrSet m_eSet; 64 } ImplUnicodeToIso2022KrContext; 65 66 void * ImplCreateIso2022KrToUnicodeContext(void) 67 { 68 void * pContext 69 = rtl_allocateMemory(sizeof (ImplIso2022KrToUnicodeContext)); 70 ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState 71 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 72 return pContext; 73 } 74 75 void ImplResetIso2022KrToUnicodeContext(void * pContext) 76 { 77 if (pContext) 78 ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState 79 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 80 } 81 82 sal_Size ImplConvertIso2022KrToUnicode(ImplTextConverterData const * pData, 83 void * pContext, 84 sal_Char const * pSrcBuf, 85 sal_Size nSrcBytes, 86 sal_Unicode * pDestBuf, 87 sal_Size nDestChars, 88 sal_uInt32 nFlags, 89 sal_uInt32 * pInfo, 90 sal_Size * pSrcCvtBytes) 91 { 92 ImplDBCSToUniLeadTab const * pKsX1001Data 93 = ((ImplIso2022KrConverterData const *) pData)-> 94 m_pKsX1001ToUnicodeData; 95 ImplIso2022KrToUnicodeState eState 96 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 97 sal_uInt32 nRow = 0; 98 sal_uInt32 nInfo = 0; 99 sal_Size nConverted = 0; 100 sal_Unicode * pDestBufPtr = pDestBuf; 101 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; 102 103 if (pContext) 104 { 105 eState = ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState; 106 nRow = ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow; 107 } 108 109 for (; nConverted < nSrcBytes; ++nConverted) 110 { 111 sal_Bool bUndefined = sal_True; 112 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++; 113 switch (eState) 114 { 115 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII: 116 if (nChar == 0x0E) /* SO */ 117 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001; 118 else if (nChar == 0x1B) /* ESC */ 119 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC; 120 else if (nChar < 0x80) 121 if (pDestBufPtr != pDestBufEnd) 122 *pDestBufPtr++ = (sal_Unicode) nChar; 123 else 124 goto no_output; 125 else 126 { 127 bUndefined = sal_False; 128 goto bad_input; 129 } 130 break; 131 132 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001: 133 if (nChar == 0x0F) /* SI */ 134 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 135 else if (nChar >= 0x21 && nChar <= 0x7E) 136 { 137 nRow = nChar + 0x80; 138 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2; 139 } 140 else 141 { 142 bUndefined = sal_False; 143 goto bad_input; 144 } 145 break; 146 147 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2: 148 if (nChar >= 0x21 && nChar <= 0x7E) 149 { 150 sal_uInt16 nUnicode = 0; 151 sal_uInt32 nFirst = pKsX1001Data[nRow].mnTrailStart; 152 nChar += 0x80; 153 if (nChar >= nFirst && nChar <= pKsX1001Data[nRow].mnTrailEnd) 154 nUnicode = pKsX1001Data[nRow]. 155 mpToUniTrailTab[nChar - nFirst]; 156 if (nUnicode != 0) 157 if (pDestBufPtr != pDestBufEnd) 158 { 159 *pDestBufPtr++ = (sal_Unicode) nUnicode; 160 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001; 161 } 162 else 163 goto no_output; 164 else 165 goto bad_input; 166 } 167 else 168 { 169 bUndefined = sal_False; 170 goto bad_input; 171 } 172 break; 173 174 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC: 175 if (nChar == 0x24) /* $ */ 176 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR; 177 else 178 { 179 bUndefined = sal_False; 180 goto bad_input; 181 } 182 break; 183 184 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR: 185 if (nChar == 0x29) /* ) */ 186 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN; 187 else 188 { 189 bUndefined = sal_False; 190 goto bad_input; 191 } 192 break; 193 194 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN: 195 if (nChar == 0x43) /* C */ 196 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 197 else 198 { 199 bUndefined = sal_False; 200 goto bad_input; 201 } 202 break; 203 } 204 continue; 205 206 bad_input: 207 switch (ImplHandleBadInputTextToUnicodeConversion( 208 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 209 &nInfo)) 210 { 211 case IMPL_BAD_INPUT_STOP: 212 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 213 break; 214 215 case IMPL_BAD_INPUT_CONTINUE: 216 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 217 continue; 218 219 case IMPL_BAD_INPUT_NO_OUTPUT: 220 goto no_output; 221 } 222 break; 223 224 no_output: 225 --pSrcBuf; 226 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 227 break; 228 } 229 230 if (eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001 231 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 232 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 233 == 0) 234 { 235 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 236 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 237 else 238 switch (ImplHandleBadInputTextToUnicodeConversion( 239 sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 240 &nInfo)) 241 { 242 case IMPL_BAD_INPUT_STOP: 243 case IMPL_BAD_INPUT_CONTINUE: 244 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII; 245 break; 246 247 case IMPL_BAD_INPUT_NO_OUTPUT: 248 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 249 break; 250 } 251 } 252 253 if (pContext) 254 { 255 ((ImplIso2022KrToUnicodeContext *) pContext)->m_eState = eState; 256 ((ImplIso2022KrToUnicodeContext *) pContext)->m_nRow = nRow; 257 } 258 if (pInfo) 259 *pInfo = nInfo; 260 if (pSrcCvtBytes) 261 *pSrcCvtBytes = nConverted; 262 263 return pDestBufPtr - pDestBuf; 264 } 265 266 void * ImplCreateUnicodeToIso2022KrContext(void) 267 { 268 void * pContext 269 = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022KrContext)); 270 ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0; 271 ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet 272 = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE; 273 return pContext; 274 } 275 276 void ImplResetUnicodeToIso2022KrContext(void * pContext) 277 { 278 if (pContext) 279 { 280 ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate = 0; 281 ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet 282 = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE; 283 } 284 } 285 286 sal_Size ImplConvertUnicodeToIso2022Kr(ImplTextConverterData const * pData, 287 void * pContext, 288 sal_Unicode const * pSrcBuf, 289 sal_Size nSrcChars, 290 sal_Char * pDestBuf, 291 sal_Size nDestBytes, 292 sal_uInt32 nFlags, 293 sal_uInt32 * pInfo, 294 sal_Size * pSrcCvtChars) 295 { 296 ImplUniToDBCSHighTab const * pKsX1001Data 297 = ((ImplIso2022KrConverterData const *) pData)-> 298 m_pUnicodeToKsX1001Data; 299 sal_Unicode nHighSurrogate = 0; 300 ImplUnicodeToIso2022KrSet eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE; 301 sal_uInt32 nInfo = 0; 302 sal_Size nConverted = 0; 303 sal_Char * pDestBufPtr = pDestBuf; 304 sal_Char * pDestBufEnd = pDestBuf + nDestBytes; 305 sal_Bool bWritten; 306 307 if (pContext) 308 { 309 nHighSurrogate 310 = ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate; 311 eSet = ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet; 312 } 313 314 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE) 315 { 316 if (pDestBufEnd - pDestBufPtr >= 4) 317 { 318 *pDestBufPtr++ = 0x1B; /* ESC */ 319 *pDestBufPtr++ = 0x24; /* $ */ 320 *pDestBufPtr++ = 0x29; /* ) */ 321 *pDestBufPtr++ = 0x43; /* C */ 322 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 323 } 324 else 325 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 326 } 327 328 if ((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0) 329 for (; nConverted < nSrcChars; ++nConverted) 330 { 331 sal_Bool bUndefined = sal_True; 332 sal_uInt32 nChar = *pSrcBuf++; 333 if (nHighSurrogate == 0) 334 { 335 if (ImplIsHighSurrogate(nChar)) 336 { 337 nHighSurrogate = (sal_Unicode) nChar; 338 continue; 339 } 340 } 341 else if (ImplIsLowSurrogate(nChar)) 342 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 343 else 344 { 345 bUndefined = sal_False; 346 goto bad_input; 347 } 348 349 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar)) 350 { 351 bUndefined = sal_False; 352 goto bad_input; 353 } 354 355 if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */ 356 { 357 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001) 358 { 359 if (pDestBufPtr != pDestBufEnd) 360 { 361 *pDestBufPtr++ = 0x0F; /* SI */ 362 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 363 } 364 else 365 goto no_output; 366 } 367 if (pDestBufPtr != pDestBufEnd) 368 *pDestBufPtr++ = (sal_Char) nChar; 369 else 370 goto no_output; 371 } 372 else if (nChar == 0x0E || nChar == 0x0F || nChar == 0x1B) 373 goto bad_input; 374 else if (nChar < 0x80) 375 { 376 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001) 377 { 378 if (pDestBufPtr != pDestBufEnd) 379 { 380 *pDestBufPtr++ = 0x0F; /* SI */ 381 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 382 } 383 else 384 goto no_output; 385 } 386 if (pDestBufPtr != pDestBufEnd) 387 *pDestBufPtr++ = (sal_Char) nChar; 388 else 389 goto no_output; 390 } 391 else 392 { 393 sal_uInt16 nBytes = 0; 394 sal_uInt32 nIndex1 = nChar >> 8; 395 if (nIndex1 < 0x100) 396 { 397 sal_uInt32 nIndex2 = nChar & 0xFF; 398 sal_uInt32 nFirst = pKsX1001Data[nIndex1].mnLowStart; 399 if (nIndex2 >= nFirst 400 && nIndex2 <= pKsX1001Data[nIndex1].mnLowEnd) 401 nBytes = pKsX1001Data[nIndex1]. 402 mpToUniTrailTab[nIndex2 - nFirst]; 403 } 404 if (nBytes != 0) 405 { 406 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII) 407 { 408 if (pDestBufPtr != pDestBufEnd) 409 { 410 *pDestBufPtr++ = 0x0E; /* SO */ 411 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_1001; 412 } 413 else 414 goto no_output; 415 } 416 if (pDestBufEnd - pDestBufPtr >= 2) 417 { 418 *pDestBufPtr++ = (sal_Char) ((nBytes >> 8) & 0x7F); 419 *pDestBufPtr++ = (sal_Char) (nBytes & 0x7F); 420 } 421 else 422 goto no_output; 423 } 424 else 425 goto bad_input; 426 } 427 nHighSurrogate = 0; 428 continue; 429 430 bad_input: 431 switch (ImplHandleBadInputUnicodeToTextConversion( 432 bUndefined, 433 nChar, 434 nFlags, 435 &pDestBufPtr, 436 pDestBufEnd, 437 &nInfo, 438 "\x0F", /* SI */ 439 eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 0 : 1, 440 &bWritten)) 441 { 442 case IMPL_BAD_INPUT_STOP: 443 nHighSurrogate = 0; 444 break; 445 446 case IMPL_BAD_INPUT_CONTINUE: 447 if (bWritten) 448 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 449 nHighSurrogate = 0; 450 continue; 451 452 case IMPL_BAD_INPUT_NO_OUTPUT: 453 goto no_output; 454 } 455 break; 456 457 no_output: 458 --pSrcBuf; 459 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 460 break; 461 } 462 463 if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 464 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 465 == 0) 466 { 467 sal_Bool bFlush = sal_True; 468 if (nHighSurrogate != 0) 469 { 470 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 471 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 472 else 473 switch (ImplHandleBadInputUnicodeToTextConversion( 474 sal_False, 475 0, 476 nFlags, 477 &pDestBufPtr, 478 pDestBufEnd, 479 &nInfo, 480 "\x0F", /* SI */ 481 eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 482 0 : 1, 483 &bWritten)) 484 { 485 case IMPL_BAD_INPUT_STOP: 486 nHighSurrogate = 0; 487 bFlush = sal_False; 488 break; 489 490 case IMPL_BAD_INPUT_CONTINUE: 491 if (bWritten) 492 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 493 nHighSurrogate = 0; 494 break; 495 496 case IMPL_BAD_INPUT_NO_OUTPUT: 497 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 498 break; 499 } 500 } 501 if (bFlush 502 && eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001 503 && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 504 { 505 if (pDestBufPtr != pDestBufEnd) 506 { 507 *pDestBufPtr++ = 0x0F; /* SI */ 508 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII; 509 } 510 else 511 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 512 } 513 } 514 515 if (pContext) 516 { 517 ((ImplUnicodeToIso2022KrContext *) pContext)->m_nHighSurrogate 518 = nHighSurrogate; 519 ((ImplUnicodeToIso2022KrContext *) pContext)->m_eSet = eSet; 520 } 521 if (pInfo) 522 *pInfo = nInfo; 523 if (pSrcCvtChars) 524 *pSrcCvtChars = nConverted; 525 526 return pDestBufPtr - pDestBuf; 527 } 528