/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/
21
22
23
24 #include "sal/types.h"
25 #include "rtl/alloc.h"
26 #include "rtl/textcvt.h"
27
28 #include "converter.h"
29 #include "tenchelp.h"
30 #include "unichars.h"
31
32 struct ImplUtf8ToUnicodeContext
33 {
34 sal_uInt32 nUtf32;
35 int nShift;
36 sal_Bool bCheckBom;
37 };
38
39 struct ImplUnicodeToUtf8Context
40 {
41 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
42 };
43
ImplCreateUtf8ToUnicodeContext(void)44 void * ImplCreateUtf8ToUnicodeContext(void)
45 {
46 void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext));
47 ImplResetUtf8ToUnicodeContext(p);
48 return p;
49 }
50
ImplResetUtf8ToUnicodeContext(void * pContext)51 void ImplResetUtf8ToUnicodeContext(void * pContext)
52 {
53 if (pContext != NULL)
54 {
55 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1;
56 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True;
57 }
58 }
59
/*
   Converts UTF-8 bytes to UTF-16 code units.

   pData         non-null selects the Java UTF-8 flavour; in that mode a
                 leading U+FEFF is never dropped.
   pContext      optional struct ImplUtf8ToUnicodeContext; when non-null the
                 decoder state is loaded from it on entry and stored back on
                 exit, when null every call starts in the initial state.
   pSrcBuf       source bytes.
   nSrcBytes     number of bytes available at pSrcBuf.
   pDestBuf      destination for the UTF-16 code units.
   nDestChars    capacity of pDestBuf in code units.
   nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH are
                 evaluated here, the whole value is also forwarded to
                 ImplHandleBadInputTextToUnicodeConversion.
   pInfo         optional; receives the accumulated RTL_TEXTTOUNICODE_INFO_*
                 bits.
   pSrcCvtBytes  optional; receives the number of source bytes consumed.

   Returns the number of code units written to pDestBuf.
 */
sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData,
                                  void * pContext, sal_Char const * pSrcBuf,
                                  sal_Size nSrcBytes, sal_Unicode * pDestBuf,
                                  sal_Size nDestChars, sal_uInt32 nFlags,
                                  sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
{
    /*
       This function is very liberal with the UTF-8 input.  Accepted are:
       - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
       - surrogates (e.g., ED A0 80 to represent U+D800)
       - encodings with up to six bytes (everything outside the range
         U+0000..10FFFF is considered "undefined")
       The first two of these points allow this routine to translate from both
       RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
     */

    int bJavaUtf8 = pData != NULL;
    sal_uInt32 nUtf32 = 0;
    int nShift = -1;
    sal_Bool bCheckBom = sal_True;
    sal_uInt32 nInfo = 0;
    sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
    sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
    sal_Unicode * pDestBufPtr = pDestBuf;
    sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;

    /* Resume from the state left behind by a previous call, if any. */
    if (pContext != NULL)
    {
        nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32;
        nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift;
        bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom;
    }

    while (pSrcBufPtr < pSrcBufEnd)
    {
        sal_Bool bUndefined = sal_False;
        int bConsume = sal_True;
        sal_uInt32 nChar = *pSrcBufPtr++;
        if (nShift < 0)
        {
            /* No sequence pending: nChar must be a single byte or a lead
               byte.  The lead byte fixes the payload bits and the shift of
               the first continuation byte. */
            if (nChar <= 0x7F)
            {
                nUtf32 = nChar;
                goto transform;
            }
            else if (nChar <= 0xBF)
            {
                /* Continuation byte without a lead byte. */
                goto bad_input;
            }
            else if (nChar <= 0xDF)
            {
                nUtf32 = (nChar & 0x1F) << 6;
                nShift = 0;
            }
            else if (nChar <= 0xEF)
            {
                nUtf32 = (nChar & 0x0F) << 12;
                nShift = 6;
            }
            else if (nChar <= 0xF7)
            {
                nUtf32 = (nChar & 0x07) << 18;
                nShift = 12;
            }
            else if (nChar <= 0xFB)
            {
                nUtf32 = (nChar & 0x03) << 24;
                nShift = 18;
            }
            else if (nChar <= 0xFD)
            {
                nUtf32 = (nChar & 0x01) << 30;
                nShift = 24;
            }
            else
            {
                /* 0xFE and 0xFF never start a sequence. */
                goto bad_input;
            }
        }
        else if ((nChar & 0xC0) == 0x80)
        {
            /* Continuation byte of the pending sequence. */
            nUtf32 |= (nChar & 0x3F) << nShift;
            if (nShift == 0)
            {
                goto transform;
            }
            else
            {
                nShift -= 6;
            }
        }
        else
        {
            /*
               This byte is preceded by a broken UTF-8 sequence; if this byte
               is neither in the range [0x80..0xBF] nor in the range
               [0xFE..0xFF], assume that this byte does not belong to that
               broken sequence, but instead starts a new, legal UTF-8
               sequence:
             */
            bConsume = nChar >= 0xFE;
            goto bad_input;
        }
        continue;

    transform:
        /* nUtf32 holds a complete code point.  It is written out unless it is
           a leading U+FEFF that has to be dropped as a signature. */
        if (!bCheckBom || nUtf32 != 0xFEFF
            || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
            || bJavaUtf8)
        {
            if (nUtf32 <= 0xFFFF)
            {
                if (pDestBufPtr != pDestBufEnd)
                {
                    *pDestBufPtr++ = (sal_Unicode) nUtf32;
                }
                else
                {
                    goto no_output;
                }
            }
            else if (nUtf32 <= 0x10FFFF)
            {
                /* Needs a surrogate pair, i.e. two free code units. */
                if (pDestBufEnd - pDestBufPtr >= 2)
                {
                    *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
                    *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
                }
                else
                {
                    goto no_output;
                }
            }
            else
            {
                bUndefined = sal_True;
                goto bad_input;
            }
        }
        nShift = -1;
        bCheckBom = sal_False;
        continue;

    bad_input:
        switch (ImplHandleBadInputTextToUnicodeConversion(
                    bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                    &nInfo))
        {
        case IMPL_BAD_INPUT_STOP:
            nShift = -1;
            bCheckBom = sal_False;
            if (!bConsume)
            {
                /* Leave the offending byte for the next call. */
                --pSrcBufPtr;
            }
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nShift = -1;
            bCheckBom = sal_False;
            if (!bConsume)
            {
                /* Re-read the byte as the start of a new sequence. */
                --pSrcBufPtr;
            }
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        /* Reached after IMPL_BAD_INPUT_STOP: leave the conversion loop. */
        break;

    no_output:
        /* Un-read the last byte so that it is processed again once the caller
           supplies more destination space. */
        --pSrcBufPtr;
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The source ended in the middle of a sequence and nothing else went
       wrong: either ask for more input, or (when flushing) report the
       truncated sequence as bad input. */
    if (nShift >= 0
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
        {
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
        }
        else
        {
            switch (ImplHandleBadInputTextToUnicodeConversion(
                        sal_False, sal_True, 0, nFlags, &pDestBufPtr,
                        pDestBufEnd, &nInfo))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nShift = -1;
                bCheckBom = sal_False;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
    }

    /* Save the decoder state and report the results. */
    if (pContext != NULL)
    {
        ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32;
        ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift;
        ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom;
    }
    if (pInfo != NULL)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtBytes != NULL)
    {
        *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf;
    }
    return pDestBufPtr - pDestBuf;
}
248
ImplCreateUnicodeToUtf8Context(void)249 void * ImplCreateUnicodeToUtf8Context(void)
250 {
251 void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context));
252 ImplResetUnicodeToUtf8Context(p);
253 return p;
254 }
255
ImplResetUnicodeToUtf8Context(void * pContext)256 void ImplResetUnicodeToUtf8Context(void * pContext)
257 {
258 if (pContext != NULL)
259 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF;
260 }
261
/*
   Converts UTF-16 code units to UTF-8 bytes.

   pData         non-null selects the Java UTF-8 flavour: no BOM is written,
                 surrogates are encoded individually instead of being
                 combined, and U+0000 is written in the two-byte form.
   pContext      optional struct ImplUnicodeToUtf8Context; when non-null the
                 pending high surrogate / BOM marker is loaded from it on
                 entry and stored back on exit.
   pSrcBuf       source code units.
   nSrcChars     number of code units available at pSrcBuf.
   pDestBuf      destination for the UTF-8 bytes.
   nDestBytes    capacity of pDestBuf in bytes.
   nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH are
                 evaluated here, the whole value is also forwarded to
                 ImplHandleBadInputUnicodeToTextConversion.
   pInfo         optional; receives the accumulated RTL_UNICODETOTEXT_INFO_*
                 bits.
   pSrcCvtChars  optional; receives the number of source code units consumed.

   Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData,
                                  void * pContext, sal_Unicode const * pSrcBuf,
                                  sal_Size nSrcChars, sal_Char * pDestBuf,
                                  sal_Size nDestBytes, sal_uInt32 nFlags,
                                  sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
{
    int bJavaUtf8 = pData != NULL;
    sal_Unicode nHighSurrogate = 0xFFFF;
    sal_uInt32 nInfo = 0;
    sal_Unicode const * pSrcBufPtr = pSrcBuf;
    sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
    sal_Char * pDestBufPtr = pDestBuf;
    sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes;

    if (pContext != NULL)
    {
        nHighSurrogate
            = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate;
    }

    /* 0xFFFF marks the initial state: emit the BOM first if it was asked
       for. */
    if (nHighSurrogate == 0xFFFF)
    {
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
            && !bJavaUtf8)
        {
            if (pDestBufEnd - pDestBufPtr >= 3)
            {
                /* Write BOM (U+FEFF) as UTF-8: */
                *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF;
                *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB;
                *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF;
            }
            else
            {
                /* The marker stays 0xFFFF, so the BOM is attempted again on
                   the next call. */
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                goto done;
            }
        }
        nHighSurrogate = 0;
    }

    while (pSrcBufPtr < pSrcBufEnd)
    {
        sal_uInt32 nChar = *pSrcBufPtr++;
        if (nHighSurrogate == 0)
        {
            if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
            {
                /* Remember it and wait for the matching low surrogate. */
                nHighSurrogate = (sal_Unicode) nChar;
                continue;
            }
        }
        else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
        {
            nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
        }
        else
        {
            /* A pending high surrogate is not followed by a low one. */
            goto bad_input;
        }

        if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
            || ImplIsNoncharacter(nChar))
        {
            goto bad_input;
        }

        /* Encode nChar in one to four bytes; in Java mode U+0000 is excluded
           from the one-byte form and therefore takes the two-byte form. */
        if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
        {
            if (pDestBufPtr != pDestBufEnd)
            {
                *pDestBufPtr++ = (sal_Char) nChar;
            }
            else
            {
                goto no_output;
            }
        }
        else if (nChar <= 0x7FF)
        {
            if (pDestBufEnd - pDestBufPtr >= 2)
            {
                *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6));
                *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
            }
            else
            {
                goto no_output;
            }
        }
        else if (nChar <= 0xFFFF)
        {
            if (pDestBufEnd - pDestBufPtr >= 3)
            {
                *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12));
                *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
                *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
            }
            else
            {
                goto no_output;
            }
        }
        else if (pDestBufEnd - pDestBufPtr >= 4)
        {
            *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18));
            *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F));
            *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
            *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
        }
        else
        {
            goto no_output;
        }
        nHighSurrogate = 0;
        continue;

    bad_input:
        switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags,
                                                          &pDestBufPtr,
                                                          pDestBufEnd, &nInfo,
                                                          NULL, 0, NULL))
        {
        case IMPL_BAD_INPUT_STOP:
            nHighSurrogate = 0;
            break;

        case IMPL_BAD_INPUT_CONTINUE:
            nHighSurrogate = 0;
            continue;

        case IMPL_BAD_INPUT_NO_OUTPUT:
            goto no_output;
        }
        /* Reached after IMPL_BAD_INPUT_STOP: leave the conversion loop. */
        break;

    no_output:
        /* Un-read the last code unit so that it is processed again once the
           caller supplies more destination space. */
        --pSrcBufPtr;
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    /* The source ended with a high surrogate still pending and nothing else
       went wrong.
       NOTE(review): the FLUSH test below has the opposite polarity of the
       corresponding test in ImplConvertUtf8ToUnicode (there SRCBUFFERTOSMALL
       is reported when FLUSH is *not* set).  Behaviour is deliberately left
       unchanged here; confirm the intended semantics against
       rtl/textcvt.h. */
    if (nHighSurrogate != 0
        && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
                         | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
               == 0)
    {
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
        }
        else
        {
            switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0,
                                                              nFlags,
                                                              &pDestBufPtr,
                                                              pDestBufEnd,
                                                              &nInfo, NULL, 0,
                                                              NULL))
            {
            case IMPL_BAD_INPUT_STOP:
            case IMPL_BAD_INPUT_CONTINUE:
                nHighSurrogate = 0;
                break;

            case IMPL_BAD_INPUT_NO_OUTPUT:
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
        }
    }

 done:
    /* Save the encoder state and report the results. */
    if (pContext != NULL)
    {
        ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate
            = nHighSurrogate;
    }
    if (pInfo != NULL)
    {
        *pInfo = nInfo;
    }
    if (pSrcCvtChars != NULL)
    {
        *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
    }
    return pDestBufPtr - pDestBuf;
}
416