xref: /trunk/main/sal/textenc/tcvtutf8.c (revision 86e1cf34)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "sal/types.h"
25 #include "rtl/alloc.h"
26 #include "rtl/textcvt.h"
27 
28 #include "converter.h"
29 #include "tenchelp.h"
30 #include "unichars.h"
31 
32 struct ImplUtf8ToUnicodeContext
33 {
34     sal_uInt32 nUtf32;
35     int nShift;
36     sal_Bool bCheckBom;
37 };
38 
39 struct ImplUnicodeToUtf8Context
40 {
41     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
42 };
43 
ImplCreateUtf8ToUnicodeContext(void)44 void * ImplCreateUtf8ToUnicodeContext(void)
45 {
46     void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext));
47     ImplResetUtf8ToUnicodeContext(p);
48     return p;
49 }
50 
ImplResetUtf8ToUnicodeContext(void * pContext)51 void ImplResetUtf8ToUnicodeContext(void * pContext)
52 {
53     if (pContext != NULL)
54     {
55         ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1;
56         ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True;
57     }
58 }
59 
/*
 * Convert a run of UTF-8 bytes into UTF-16 code units.
 *
 * The decoder is deliberately liberal with its input.  Accepted are:
 * - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041);
 * - surrogates (e.g., ED A0 80 to represent U+D800);
 * - encodings with up to six bytes (everything outside the range
 *   U+0000..10FFFF is considered "undefined").
 * The first two of these points allow this routine to translate from both
 * RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
 *
 * pData         non-NULL selects Java UTF-8 mode; the only effect here is
 *               that a leading U+FEFF is never dropped as a signature.
 * pContext      optional struct ImplUtf8ToUnicodeContext; when given, the
 *               decoder state is loaded from it and stored back on return.
 * pSrcBuf       input bytes; nSrcBytes is their count.
 * pDestBuf      output buffer; nDestChars is its capacity in sal_Unicode.
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; also handed to
 *               ImplHandleBadInputTextToUnicodeConversion.
 * pInfo         if non-NULL, receives the accumulated
 *               RTL_TEXTTOUNICODE_INFO_* bits.
 * pSrcCvtBytes  if non-NULL, receives the number of input bytes consumed.
 *
 * Returns the number of sal_Unicode values written to pDestBuf.
 */
sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData,
                                  void * pContext, sal_Char const * pSrcBuf,
                                  sal_Size nSrcBytes, sal_Unicode * pDestBuf,
                                  sal_Size nDestChars, sal_uInt32 nFlags,
                                  sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
{
    struct ImplUtf8ToUnicodeContext * pState
        = (struct ImplUtf8ToUnicodeContext *) pContext;
    int const bJavaUtf8 = pData != NULL;
    sal_uInt32 nUtf32 = 0;
    int nShift = -1;
    sal_Bool bCheckBom = sal_True;
    sal_uInt32 nInfo = 0;
    sal_uChar const * pIn = (sal_uChar const *) pSrcBuf;
    sal_uChar const * const pInEnd = pIn + nSrcBytes;
    sal_Unicode * pOut = pDestBuf;
    sal_Unicode * const pOutEnd = pOut + nDestChars;

    /* Resume from whatever state an earlier call left behind. */
    if (pState != NULL)
    {
        nUtf32 = pState->nUtf32;
        nShift = pState->nShift;
        bCheckBom = pState->bCheckBom;
    }

    while (pIn < pInEnd)
    {
        sal_Bool bUndefined = sal_False;
        int bConsume = sal_True;
        sal_uInt32 nByte = *pIn++;

        if (nShift >= 0)
        {
            /* A sequence is in progress, so a continuation byte is due. */
            if ((nByte & 0xC0) != 0x80)
            {
                /*
                 This byte is preceded by a broken UTF-8 sequence; if this
                 byte is neither in the range [0x80..0xBF] nor in the range
                 [0xFE..0xFF], assume that this byte does not belong to that
                 broken sequence, but instead starts a new, legal UTF-8
                 sequence:
                 */
                bConsume = nByte >= 0xFE;
                goto bad_input;
            }
            nUtf32 |= (nByte & 0x3F) << nShift;
            if (nShift == 0)
            {
                goto transform;
            }
            nShift -= 6;
        }
        else if (nByte <= 0x7F)
        {
            /* Single-byte character. */
            nUtf32 = nByte;
            goto transform;
        }
        else if (nByte <= 0xBF)
        {
            /* Continuation byte without a preceding lead byte. */
            goto bad_input;
        }
        else if (nByte <= 0xDF)
        {
            /* Lead byte of a two-byte sequence. */
            nUtf32 = (nByte & 0x1F) << 6;
            nShift = 0;
        }
        else if (nByte <= 0xEF)
        {
            /* Lead byte of a three-byte sequence. */
            nUtf32 = (nByte & 0x0F) << 12;
            nShift = 6;
        }
        else if (nByte <= 0xF7)
        {
            /* Lead byte of a four-byte sequence. */
            nUtf32 = (nByte & 0x07) << 18;
            nShift = 12;
        }
        else if (nByte <= 0xFB)
        {
            /* Lead byte of a five-byte sequence. */
            nUtf32 = (nByte & 0x03) << 24;
            nShift = 18;
        }
        else if (nByte <= 0xFD)
        {
            /* Lead byte of a six-byte sequence. */
            nUtf32 = (nByte & 0x01) << 30;
            nShift = 24;
        }
        else
        {
            /* 0xFE and 0xFF never start a sequence. */
            goto bad_input;
        }
        continue;

    transform:
        /* nUtf32 now holds one complete code point. */
        if (bCheckBom && nUtf32 == 0xFEFF
            && (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) != 0
            && !bJavaUtf8)
        {
            /* Leading signature: produce no output for it. */
        }
        else if (nUtf32 <= 0xFFFF)
        {
            if (pOut == pOutEnd)
            {
                goto no_output;
            }
            *pOut++ = (sal_Unicode) nUtf32;
        }
        else if (nUtf32 <= 0x10FFFF)
        {
            /* Needs a surrogate pair, i.e. room for two code units. */
            if (pOutEnd - pOut < 2)
            {
                goto no_output;
            }
            *pOut++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
            *pOut++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
        }
        else
        {
            /* Beyond U+10FFFF: report as undefined rather than invalid. */
            bUndefined = sal_True;
            goto bad_input;
        }
        nShift = -1;
        bCheckBom = sal_False;
        continue;

    bad_input:
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pOut, pOutEnd, &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            nShift = -1;
            bCheckBom = sal_False;
            if (!bConsume)
            {
                /* Leave the current byte for the next call. */
                --pIn;
            }
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nShift = -1;
            bCheckBom = sal_False;
            if (!bConsume)
            {
                /* Re-read the current byte as the start of a sequence. */
                --pIn;
            }
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        /* Reached after IMPL_BAD_INPUT_STOP: leave the conversion loop. */
        break;

    no_output:
        /* Un-read the current byte; nShift and nUtf32 are left untouched,
           so the byte is handled the same way when it is read again. */
        --pIn;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* Input ended in the middle of a sequence and nothing else went wrong. */
    if (nShift >= 0
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0)
        {
            /* Flushing: the truncated sequence is bad input. */
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pOut, pOutEnd,
                        &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nShift = -1;
                bCheckBom = sal_False;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
        else
        {
            /* Not flushing: keep the partial state and report it. */
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        }
    }

    if (pState != NULL)
    {
        pState->nUtf32 = nUtf32;
        pState->nShift = nShift;
        pState->bCheckBom = bCheckBom;
    }
    if (pInfo != NULL)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtBytes != NULL)
    {
        *pSrcCvtBytes = (sal_Char const *) pIn - pSrcBuf;
    }
    return pOut - pDestBuf;
}
248 
ImplCreateUnicodeToUtf8Context(void)249 void * ImplCreateUnicodeToUtf8Context(void)
250 {
251     void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context));
252     ImplResetUnicodeToUtf8Context(p);
253     return p;
254 }
255 
ImplResetUnicodeToUtf8Context(void * pContext)256 void ImplResetUnicodeToUtf8Context(void * pContext)
257 {
258     if (pContext != NULL)
259         ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF;
260 }
261 
/*
 * Convert a run of UTF-16 code units into UTF-8 bytes.
 *
 * pData         non-NULL selects Java UTF-8 mode: no BOM is written,
 *               surrogates are not paired (each one is encoded on its own
 *               as a three-byte sequence), and U+0000 is written as the two
 *               bytes C0 80.
 * pContext      optional struct ImplUnicodeToUtf8Context; when given, the
 *               pending-surrogate state is loaded from it and stored back
 *               on return.
 * pSrcBuf       input code units; nSrcChars is their count.
 * pDestBuf      output buffer; nDestBytes is its capacity in bytes.
 * nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; also handed to
 *               ImplHandleBadInputUnicodeToTextConversion.
 * pInfo         if non-NULL, receives the accumulated
 *               RTL_UNICODETOTEXT_INFO_* bits.
 * pSrcCvtChars  if non-NULL, receives the number of input code units
 *               consumed.
 *
 * Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData,
                                  void * pContext, sal_Unicode const * pSrcBuf,
                                  sal_Size nSrcChars, sal_Char * pDestBuf,
                                  sal_Size nDestBytes, sal_uInt32 nFlags,
                                  sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
{
    struct ImplUnicodeToUtf8Context * pState
        = (struct ImplUnicodeToUtf8Context *) pContext;
    int const bJavaUtf8 = pData != NULL;
    sal_Unicode nHighSurrogate = 0xFFFF;
    sal_uInt32 nInfo = 0;
    sal_Unicode const * pIn = pSrcBuf;
    sal_Unicode const * const pInEnd = pIn + nSrcChars;
    sal_Char * pOut = pDestBuf;
    sal_Char * const pOutEnd = pOut + nDestBytes;

    if (pState != NULL)
    {
        nHighSurrogate = pState->nHighSurrogate;
    }

    /* 0xFFFF means nothing has been converted yet with this state. */
    if (nHighSurrogate == 0xFFFF)
    {
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
            && !bJavaUtf8)
        {
            if (pOutEnd - pOut < 3)
            {
                /* No room for the BOM: leave the state at 0xFFFF so that
                   the BOM is attempted again on the next call. */
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                goto done;
            }
            /* Write BOM (U+FEFF) as UTF-8: */
            *pOut++ = (sal_Char) (unsigned char) 0xEF;
            *pOut++ = (sal_Char) (unsigned char) 0xBB;
            *pOut++ = (sal_Char) (unsigned char) 0xBF;
        }
        nHighSurrogate = 0;
    }

    while (pIn < pInEnd)
    {
        sal_uInt32 nChar = *pIn++;

        if (nHighSurrogate != 0)
        {
            /* A high surrogate is pending; only a low surrogate may
               follow it. */
            if (!ImplIsLowSurrogate(nChar) || bJavaUtf8)
            {
                goto bad_input;
            }
            nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
        }
        else if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
        {
            /* Hold the high surrogate back until its partner arrives. */
            nHighSurrogate = (sal_Unicode) nChar;
            continue;
        }

        /* An unpaired low surrogate (outside Java mode) and anything
           ImplIsNoncharacter reports are treated as bad input. */
        if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
            || ImplIsNoncharacter(nChar))
        {
            goto bad_input;
        }

        if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
        {
            /* One byte.  In Java mode U+0000 is excluded here and takes
               the two-byte branch below. */
            if (pOut == pOutEnd)
            {
                goto no_output;
            }
            *pOut++ = (sal_Char) nChar;
        }
        else if (nChar <= 0x7FF)
        {
            if (pOutEnd - pOut < 2)
            {
                goto no_output;
            }
            *pOut++ = (sal_Char) (0xC0 | (nChar >> 6));
            *pOut++ = (sal_Char) (0x80 | (nChar & 0x3F));
        }
        else if (nChar <= 0xFFFF)
        {
            if (pOutEnd - pOut < 3)
            {
                goto no_output;
            }
            *pOut++ = (sal_Char) (0xE0 | (nChar >> 12));
            *pOut++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
            *pOut++ = (sal_Char) (0x80 | (nChar & 0x3F));
        }
        else
        {
            if (pOutEnd - pOut < 4)
            {
                goto no_output;
            }
            *pOut++ = (sal_Char) (0xF0 | (nChar >> 18));
            *pOut++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F));
            *pOut++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
            *pOut++ = (sal_Char) (0x80 | (nChar & 0x3F));
        }
        nHighSurrogate = 0;
        continue;

    bad_input:
        switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags,
                                                          &pOut, pOutEnd,
                                                          &nInfo, NULL, 0,
                                                          NULL))
        {
        case IMPL_BAD_INPUT_STOP:
            nHighSurrogate = 0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nHighSurrogate = 0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        /* Reached after IMPL_BAD_INPUT_STOP: leave the conversion loop. */
        break;

    no_output:
        /* Un-read the current code unit; a pending high surrogate stays
           pending, so the unit is handled the same way when read again. */
        --pIn;
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* Input ended with a high surrogate pending and nothing else went
       wrong. */
    if (nHighSurrogate != 0
        && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                         | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        /* NOTE(review): the FLUSH test here has the opposite polarity from
           the corresponding test in ImplConvertUtf8ToUnicode (which reports
           SRCBUFFERTOSMALL when FLUSH is *not* set).  Behaviour is kept as
           found; confirm whether this is intended. */
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
        }
        else
        {
            switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0,
                                                              nFlags, &pOut,
                                                              pOutEnd, &nInfo,
                                                              NULL, 0, NULL))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nHighSurrogate = 0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
    }

 done:
    if (pState != NULL)
    {
        pState->nHighSurrogate = nHighSurrogate;
    }
    if (pInfo != NULL)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtChars != NULL)
    {
        *pSrcCvtChars = pIn - pSrcBuf;
    }
    return pOut - pDestBuf;
}
416