/************************************************************** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *************************************************************/ #include "sal/types.h" #include "rtl/alloc.h" #include "rtl/textcvt.h" #include "converter.h" #include "tenchelp.h" #include "unichars.h" struct ImplUtf8ToUnicodeContext { sal_uInt32 nUtf32; int nShift; sal_Bool bCheckBom; }; struct ImplUnicodeToUtf8Context { sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */ }; void * ImplCreateUtf8ToUnicodeContext(void) { void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext)); ImplResetUtf8ToUnicodeContext(p); return p; } void ImplResetUtf8ToUnicodeContext(void * pContext) { if (pContext != NULL) { ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1; ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True; } } sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData, void * pContext, sal_Char const * pSrcBuf, sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) { /* This function is very liberal with the UTF-8 input. Accepted are: - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041) - surrogates (e.g., ED A0 80 to represent U+D800) - encodings with up to six bytes (everything outside the range U+0000..10FFFF is considered "undefined") The first two of these points allow this routine to translate from both RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8. */ int bJavaUtf8 = pData != NULL; sal_uInt32 nUtf32 = 0; int nShift = -1; sal_Bool bCheckBom = sal_True; sal_uInt32 nInfo = 0; sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf; sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes; sal_Unicode * pDestBufPtr = pDestBuf; sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars; if (pContext != NULL) { nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32; nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift; bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom; } while (pSrcBufPtr < pSrcBufEnd) { sal_Bool bUndefined = sal_False; int bConsume = sal_True; sal_uInt32 nChar = *pSrcBufPtr++; if (nShift < 0) if (nChar <= 0x7F) { nUtf32 = nChar; goto transform; } else if (nChar <= 0xBF) goto bad_input; else if (nChar <= 0xDF) { nUtf32 = (nChar & 0x1F) << 6; nShift = 0; } else if (nChar <= 0xEF) { nUtf32 = (nChar & 0x0F) << 12; nShift = 6; } else if (nChar <= 0xF7) { nUtf32 = (nChar & 0x07) << 18; nShift = 12; } else if (nChar <= 0xFB) { nUtf32 = (nChar & 0x03) << 24; nShift = 18; } else if (nChar <= 0xFD) { nUtf32 = (nChar & 0x01) << 30; nShift = 24; } else goto bad_input; else if ((nChar & 0xC0) == 0x80) { nUtf32 |= (nChar & 0x3F) << nShift; if (nShift == 0) goto transform; else nShift -= 6; } else { /* This byte is preceeded by a broken UTF-8 sequence; if this byte is neither in the range [0x80..0xBF] nor in the range [0xFE..0xFF], assume that this byte does not belong to that broken sequence, but instead starts a new, legal UTF-8 sequence: */ bConsume = nChar >= 0xFE; goto bad_input; } continue; transform: if (!bCheckBom || nUtf32 != 0xFEFF || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0 || bJavaUtf8) { if (nUtf32 <= 0xFFFF) if (pDestBufPtr != pDestBufEnd) *pDestBufPtr++ = (sal_Unicode) nUtf32; else goto no_output; else if (nUtf32 <= 0x10FFFF) if (pDestBufEnd - pDestBufPtr >= 2) { *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); } else goto no_output; else { bUndefined = sal_True; goto bad_input; } } nShift = -1; bCheckBom = sal_False; continue; bad_input: switch (ImplHandleBadInputTextToUnicodeConversion( bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo)) { case IMPL_BAD_INPUT_STOP: nShift = -1; bCheckBom = sal_False; if (!bConsume) --pSrcBufPtr; break; case IMPL_BAD_INPUT_CONTINUE: nShift = -1; bCheckBom = sal_False; if (!bConsume) --pSrcBufPtr; continue; case IMPL_BAD_INPUT_NO_OUTPUT: goto no_output; } break; no_output: --pSrcBufPtr; nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; break; } if (nShift >= 0 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) == 0) { if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; else switch (ImplHandleBadInputTextToUnicodeConversion( sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo)) { case IMPL_BAD_INPUT_STOP: case IMPL_BAD_INPUT_CONTINUE: nShift = -1; bCheckBom = sal_False; break; case IMPL_BAD_INPUT_NO_OUTPUT: nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; break; } } if (pContext != NULL) { ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32; ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift; ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom; } if (pInfo != NULL) *pInfo = nInfo; if (pSrcCvtBytes != NULL) *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf; return pDestBufPtr - pDestBuf; } void * ImplCreateUnicodeToUtf8Context(void) { void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context)); ImplResetUnicodeToUtf8Context(p); return p; } void ImplResetUnicodeToUtf8Context(void * pContext) { if (pContext != NULL) ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF; } sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData, void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars, sal_Char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size* pSrcCvtChars) { int bJavaUtf8 = pData != NULL; sal_Unicode nHighSurrogate = 0xFFFF; sal_uInt32 nInfo = 0; sal_Unicode const * pSrcBufPtr = pSrcBuf; sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars; sal_Char * pDestBufPtr = pDestBuf; sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes; if (pContext != NULL) nHighSurrogate = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate; if (nHighSurrogate == 0xFFFF) { if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0 && !bJavaUtf8) { if (pDestBufEnd - pDestBufPtr >= 3) { /* Write BOM (U+FEFF) as UTF-8: */ *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF; *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB; *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF; } else { nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; goto done; } } nHighSurrogate = 0; } while (pSrcBufPtr < pSrcBufEnd) { sal_uInt32 nChar = *pSrcBufPtr++; if (nHighSurrogate == 0) { if (ImplIsHighSurrogate(nChar) && !bJavaUtf8) { nHighSurrogate = (sal_Unicode) nChar; continue; } } else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8) nChar = ImplCombineSurrogates(nHighSurrogate, nChar); else goto bad_input; if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8) || ImplIsNoncharacter(nChar)) goto bad_input; if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0)) if (pDestBufPtr != pDestBufEnd) *pDestBufPtr++ = (sal_Char) nChar; else goto no_output; else if (nChar <= 0x7FF) if (pDestBufEnd - pDestBufPtr >= 2) { *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6)); *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); } else goto no_output; else if (nChar <= 0xFFFF) if (pDestBufEnd - pDestBufPtr >= 3) { *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12)); *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); } else goto no_output; else if (pDestBufEnd - pDestBufPtr >= 4) { *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18)); *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F)); *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); } else goto no_output; nHighSurrogate = 0; continue; bad_input: switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL, 0, NULL)) { case IMPL_BAD_INPUT_STOP: nHighSurrogate = 0; break; case IMPL_BAD_INPUT_CONTINUE: nHighSurrogate = 0; continue; case IMPL_BAD_INPUT_NO_OUTPUT: goto no_output; } break; no_output: --pSrcBufPtr; nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; break; } if (nHighSurrogate != 0 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) == 0) { if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; else switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL, 0, NULL)) { case IMPL_BAD_INPUT_STOP: case IMPL_BAD_INPUT_CONTINUE: nHighSurrogate = 0; break; case IMPL_BAD_INPUT_NO_OUTPUT: nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; break; } } done: if (pContext != NULL) ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = nHighSurrogate; if (pInfo != NULL) *pInfo = nInfo; if (pSrcCvtChars != NULL) *pSrcCvtChars = pSrcBufPtr - pSrcBuf; return pDestBufPtr - pDestBuf; }