1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "sal/types.h" 25 #include "rtl/alloc.h" 26 #include "rtl/textcvt.h" 27 28 #include "converter.h" 29 #include "tenchelp.h" 30 #include "unichars.h" 31 32 struct ImplUtf8ToUnicodeContext 33 { 34 sal_uInt32 nUtf32; 35 int nShift; 36 sal_Bool bCheckBom; 37 }; 38 39 struct ImplUnicodeToUtf8Context 40 { 41 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */ 42 }; 43 44 void * ImplCreateUtf8ToUnicodeContext(void) 45 { 46 void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext)); 47 ImplResetUtf8ToUnicodeContext(p); 48 return p; 49 } 50 51 void ImplResetUtf8ToUnicodeContext(void * pContext) 52 { 53 if (pContext != NULL) 54 { 55 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1; 56 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True; 57 } 58 } 59 60 sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData, 61 void * pContext, sal_Char const * pSrcBuf, 62 sal_Size nSrcBytes, sal_Unicode * pDestBuf, 63 sal_Size nDestChars, sal_uInt32 nFlags, 64 sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) 65 { 66 /* 67 This function is very liberal with the UTF-8 input. Accepted are: 68 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041) 69 - surrogates (e.g., ED A0 80 to represent U+D800) 70 - encodings with up to six bytes (everything outside the range 71 U+0000..10FFFF is considered "undefined") 72 The first two of these points allow this routine to translate from both 73 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8. 74 */ 75 76 int bJavaUtf8 = pData != NULL; 77 sal_uInt32 nUtf32 = 0; 78 int nShift = -1; 79 sal_Bool bCheckBom = sal_True; 80 sal_uInt32 nInfo = 0; 81 sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf; 82 sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes; 83 sal_Unicode * pDestBufPtr = pDestBuf; 84 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars; 85 86 if (pContext != NULL) 87 { 88 nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32; 89 nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift; 90 bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom; 91 } 92 93 while (pSrcBufPtr < pSrcBufEnd) 94 { 95 sal_Bool bUndefined = sal_False; 96 int bConsume = sal_True; 97 sal_uInt32 nChar = *pSrcBufPtr++; 98 if (nShift < 0) 99 if (nChar <= 0x7F) 100 { 101 nUtf32 = nChar; 102 goto transform; 103 } 104 else if (nChar <= 0xBF) 105 goto bad_input; 106 else if (nChar <= 0xDF) 107 { 108 nUtf32 = (nChar & 0x1F) << 6; 109 nShift = 0; 110 } 111 else if (nChar <= 0xEF) 112 { 113 nUtf32 = (nChar & 0x0F) << 12; 114 nShift = 6; 115 } 116 else if (nChar <= 0xF7) 117 { 118 nUtf32 = (nChar & 0x07) << 18; 119 nShift = 12; 120 } 121 else if (nChar <= 0xFB) 122 { 123 nUtf32 = (nChar & 0x03) << 24; 124 nShift = 18; 125 } 126 else if (nChar <= 0xFD) 127 { 128 nUtf32 = (nChar & 0x01) << 30; 129 nShift = 24; 130 } 131 else 132 goto bad_input; 133 else if ((nChar & 0xC0) == 0x80) 134 { 135 nUtf32 |= (nChar & 0x3F) << nShift; 136 if (nShift == 0) 137 goto transform; 138 else 139 nShift -= 6; 140 } 141 else 142 { 143 /* 144 This byte is preceeded by a broken UTF-8 sequence; if this byte 145 is neither in the range [0x80..0xBF] nor in the range 146 [0xFE..0xFF], assume that this byte does not belong to that 147 broken sequence, but instead starts a new, legal UTF-8 sequence: 148 */ 149 bConsume = nChar >= 0xFE; 150 goto bad_input; 151 } 152 continue; 153 154 transform: 155 if (!bCheckBom || nUtf32 != 0xFEFF 156 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0 157 || bJavaUtf8) 158 { 159 if (nUtf32 <= 0xFFFF) 160 if (pDestBufPtr != pDestBufEnd) 161 *pDestBufPtr++ = (sal_Unicode) nUtf32; 162 else 163 goto no_output; 164 else if (nUtf32 <= 0x10FFFF) 165 if (pDestBufEnd - pDestBufPtr >= 2) 166 { 167 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); 168 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); 169 } 170 else 171 goto no_output; 172 else 173 { 174 bUndefined = sal_True; 175 goto bad_input; 176 } 177 } 178 nShift = -1; 179 bCheckBom = sal_False; 180 continue; 181 182 bad_input: 183 switch (ImplHandleBadInputTextToUnicodeConversion( 184 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 185 &nInfo)) 186 { 187 case IMPL_BAD_INPUT_STOP: 188 nShift = -1; 189 bCheckBom = sal_False; 190 if (!bConsume) 191 --pSrcBufPtr; 192 break; 193 194 case IMPL_BAD_INPUT_CONTINUE: 195 nShift = -1; 196 bCheckBom = sal_False; 197 if (!bConsume) 198 --pSrcBufPtr; 199 continue; 200 201 case IMPL_BAD_INPUT_NO_OUTPUT: 202 goto no_output; 203 } 204 break; 205 206 no_output: 207 --pSrcBufPtr; 208 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 209 break; 210 } 211 212 if (nShift >= 0 213 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 214 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 215 == 0) 216 { 217 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 218 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 219 else 220 switch (ImplHandleBadInputTextToUnicodeConversion( 221 sal_False, sal_True, 0, nFlags, &pDestBufPtr, 222 pDestBufEnd, &nInfo)) 223 { 224 case IMPL_BAD_INPUT_STOP: 225 case IMPL_BAD_INPUT_CONTINUE: 226 nShift = -1; 227 bCheckBom = sal_False; 228 break; 229 230 case IMPL_BAD_INPUT_NO_OUTPUT: 231 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 232 break; 233 } 234 } 235 236 if (pContext != NULL) 237 { 238 ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32; 239 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift; 240 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom; 241 } 242 if (pInfo != NULL) 243 *pInfo = nInfo; 244 if (pSrcCvtBytes != NULL) 245 *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf; 246 return pDestBufPtr - pDestBuf; 247 } 248 249 void * ImplCreateUnicodeToUtf8Context(void) 250 { 251 void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context)); 252 ImplResetUnicodeToUtf8Context(p); 253 return p; 254 } 255 256 void ImplResetUnicodeToUtf8Context(void * pContext) 257 { 258 if (pContext != NULL) 259 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF; 260 } 261 262 sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData, 263 void * pContext, sal_Unicode const * pSrcBuf, 264 sal_Size nSrcChars, sal_Char * pDestBuf, 265 sal_Size nDestBytes, sal_uInt32 nFlags, 266 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars) 267 { 268 int bJavaUtf8 = pData != NULL; 269 sal_Unicode nHighSurrogate = 0xFFFF; 270 sal_uInt32 nInfo = 0; 271 sal_Unicode const * pSrcBufPtr = pSrcBuf; 272 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars; 273 sal_Char * pDestBufPtr = pDestBuf; 274 sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes; 275 276 if (pContext != NULL) 277 nHighSurrogate 278 = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate; 279 280 if (nHighSurrogate == 0xFFFF) 281 { 282 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0 283 && !bJavaUtf8) 284 { 285 if (pDestBufEnd - pDestBufPtr >= 3) 286 { 287 /* Write BOM (U+FEFF) as UTF-8: */ 288 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF; 289 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB; 290 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF; 291 } 292 else 293 { 294 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 295 goto done; 296 } 297 } 298 nHighSurrogate = 0; 299 } 300 301 while (pSrcBufPtr < pSrcBufEnd) 302 { 303 sal_uInt32 nChar = *pSrcBufPtr++; 304 if (nHighSurrogate == 0) 305 { 306 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8) 307 { 308 nHighSurrogate = (sal_Unicode) nChar; 309 continue; 310 } 311 } 312 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8) 313 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 314 else 315 goto bad_input; 316 317 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8) 318 || ImplIsNoncharacter(nChar)) 319 goto bad_input; 320 321 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0)) 322 if (pDestBufPtr != pDestBufEnd) 323 *pDestBufPtr++ = (sal_Char) nChar; 324 else 325 goto no_output; 326 else if (nChar <= 0x7FF) 327 if (pDestBufEnd - pDestBufPtr >= 2) 328 { 329 *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6)); 330 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 331 } 332 else 333 goto no_output; 334 else if (nChar <= 0xFFFF) 335 if (pDestBufEnd - pDestBufPtr >= 3) 336 { 337 *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12)); 338 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); 339 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 340 } 341 else 342 goto no_output; 343 else if (pDestBufEnd - pDestBufPtr >= 4) 344 { 345 *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18)); 346 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F)); 347 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); 348 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 349 } 350 else 351 goto no_output; 352 nHighSurrogate = 0; 353 continue; 354 355 bad_input: 356 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags, 357 &pDestBufPtr, 358 pDestBufEnd, &nInfo, 359 NULL, 0, NULL)) 360 { 361 case IMPL_BAD_INPUT_STOP: 362 nHighSurrogate = 0; 363 break; 364 365 case IMPL_BAD_INPUT_CONTINUE: 366 nHighSurrogate = 0; 367 continue; 368 369 case IMPL_BAD_INPUT_NO_OUTPUT: 370 goto no_output; 371 } 372 break; 373 374 no_output: 375 --pSrcBufPtr; 376 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 377 break; 378 } 379 380 if (nHighSurrogate != 0 381 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 382 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 383 == 0) 384 { 385 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 386 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 387 else 388 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, 389 nFlags, 390 &pDestBufPtr, 391 pDestBufEnd, 392 &nInfo, NULL, 0, 393 NULL)) 394 { 395 case IMPL_BAD_INPUT_STOP: 396 case IMPL_BAD_INPUT_CONTINUE: 397 nHighSurrogate = 0; 398 break; 399 400 case IMPL_BAD_INPUT_NO_OUTPUT: 401 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 402 break; 403 } 404 } 405 406 done: 407 if (pContext != NULL) 408 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate 409 = nHighSurrogate; 410 if (pInfo != NULL) 411 *pInfo = nInfo; 412 if (pSrcCvtChars != NULL) 413 *pSrcCvtChars = pSrcBufPtr - pSrcBuf; 414 return pDestBufPtr - pDestBuf; 415 } 416