xref: /trunk/main/sal/textenc/tencinfo.c (revision 26d9e9dd)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #include "rtl/tencinfo.h"
25 #include "gettextencodingdata.h"
26 #include "tenchelp.h"
27 
28 #ifndef _RTL_ALLOC_H
29 #include "rtl/alloc.h"
30 #endif
31 
32 #ifndef INCLUDED_STDDEF_H
33 #include <stddef.h>
34 #define INCLUDED_STDDEF_H
35 #endif
36 #ifndef INCLUDED_STRING_H
37 #include <string.h>
38 #define INCLUDED_STRING_H
39 #endif
40 
rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding)41 sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding)
42 {
43     return (sal_Bool)
44         (nEncoding > RTL_TEXTENCODING_DONTKNOW
45          && (nEncoding <= RTL_TEXTENCODING_ADOBE_DINGBATS)
46              /* always update this! */
47          && nEncoding != 9); /* RTL_TEXTENCODING_SYSTEM */
48 }
49 
50 /* ======================================================================= */
51 
/*
 * Copy the NUL-terminated string pName into pBuf, adding 0x20 to every
 * character in the range 0x41..0x5A (ASCII 'A'..'Z'); every other character
 * is copied unchanged.  pBuf is NUL-terminated afterwards.
 *
 * No bounds checking is done: pBuf must be able to hold as many characters
 * as pName plus the terminator.
 */
static void Impl_toAsciiLower( const sal_Char* pName, sal_Char* pBuf )
{
    const sal_Char* pSrc = pName;
    sal_Char*       pDst = pBuf;

    for ( ; *pSrc != '\0'; ++pSrc, ++pDst )
    {
        const sal_Char c = *pSrc;

        /* A-Z -> a-z, anything else verbatim */
        *pDst = ( c >= 0x41 && c <= 0x5A ) ? (sal_Char)( c + 0x20 ) : c;
    }

    *pDst = '\0';
}
68 
69 /* ----------------------------------------------------------------------- */
70 
/*
 * Write a normalized copy of the NUL-terminated string pName into pBuf:
 *   - characters 0x41..0x5A (ASCII 'A'..'Z') are stored with 0x20 added,
 *   - characters 0x61..0x7A ('a'..'z') and 0x30..0x39 ('0'..'9') are stored
 *     unchanged,
 *   - every other character is skipped.
 * pBuf is NUL-terminated afterwards.
 *
 * No bounds checking is done: the output is never longer than the input, so
 * pBuf must be able to hold as many characters as pName plus the terminator.
 */
static void Impl_toAsciiLowerAndRemoveNonAlphanumeric( const sal_Char* pName, sal_Char* pBuf )
{
    const sal_Char* pIn;
    sal_Char*       pOut = pBuf;

    for ( pIn = pName; *pIn != '\0'; ++pIn )
    {
        const sal_Char c = *pIn;

        if ( c >= 0x41 && c <= 0x5A )
        {
            /* A-Z: fold to lower case */
            *pOut++ = (sal_Char)( c + 0x20 );
        }
        else if ( ( c >= 0x61 && c <= 0x7A ) || ( c >= 0x30 && c <= 0x39 ) )
        {
            /* a-z, 0-9: keep as is */
            *pOut++ = c;
        }
        /* everything else is dropped */
    }

    *pOut = '\0';
}
94 
95 /* ----------------------------------------------------------------------- */
96 
97 /* pMatchStr must match with all characters in pCompStr */
Impl_matchString(const sal_Char * pCompStr,const sal_Char * pMatchStr)98 static sal_Bool Impl_matchString( const sal_Char* pCompStr, const sal_Char* pMatchStr )
99 {
100     /* We test only for end in MatchStr, because the last 0 character from */
101     /* pCompStr is unequal a character in MatchStr, so the loop terminates */
102     while ( *pMatchStr )
103     {
104         if ( *pCompStr != *pMatchStr )
105             return sal_False;
106 
107         pCompStr++;
108         pMatchStr++;
109     }
110 
111     return sal_True;
112 }
113 
114 /* ======================================================================= */
115 
116 typedef struct
117 {
118     const sal_Char*             mpCharsetStr;
119     rtl_TextEncoding            meTextEncoding;
120 } ImplStrCharsetDef;
121 
122 typedef struct
123 {
124     const sal_Char*             mpCharsetStr;
125     const ImplStrCharsetDef*    mpSecondPartTab;
126 } ImplStrFirstPartCharsetDef;
127 
128 /* ======================================================================= */
129 
rtl_getTextEncodingInfo(rtl_TextEncoding eTextEncoding,rtl_TextEncodingInfo * pEncInfo)130 sal_Bool SAL_CALL rtl_getTextEncodingInfo( rtl_TextEncoding eTextEncoding, rtl_TextEncodingInfo* pEncInfo )
131 {
132     const ImplTextEncodingData* pData;
133 
134     pData = Impl_getTextEncodingData( eTextEncoding );
135     if ( !pData )
136     {
137         /* HACK: For not implemented encoding, because not all
138            calls handle the errors */
139         if ( pEncInfo->StructSize < 5 )
140             return sal_False;
141         pEncInfo->MinimumCharSize = 1;
142 
143         if ( pEncInfo->StructSize < 6 )
144             return sal_True;
145         pEncInfo->MaximumCharSize = 1;
146 
147         if ( pEncInfo->StructSize < 7 )
148             return sal_True;
149         pEncInfo->AverageCharSize = 1;
150 
151         if ( pEncInfo->StructSize < 12 )
152             return sal_True;
153         pEncInfo->Flags = 0;
154 
155         return sal_False;
156     }
157 
158     if ( pEncInfo->StructSize < 5 )
159         return sal_False;
160     pEncInfo->MinimumCharSize = pData->mnMinCharSize;
161 
162     if ( pEncInfo->StructSize < 6 )
163         return sal_True;
164     pEncInfo->MaximumCharSize = pData->mnMaxCharSize;
165 
166     if ( pEncInfo->StructSize < 7 )
167         return sal_True;
168     pEncInfo->AverageCharSize = pData->mnAveCharSize;
169 
170     if ( pEncInfo->StructSize < 12 )
171         return sal_True;
172     pEncInfo->Flags = pData->mnInfoFlags;
173 
174     return sal_True;
175 }
176 
177 /* ======================================================================= */
178 
rtl_getTextEncodingFromWindowsCharset(sal_uInt8 nWinCharset)179 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromWindowsCharset( sal_uInt8 nWinCharset )
180 {
181     rtl_TextEncoding eTextEncoding;
182 
183     switch ( nWinCharset )
184     {
185         case 0:     eTextEncoding = RTL_TEXTENCODING_MS_1252; break;    /* ANSI_CHARSET */
186         case 2:     eTextEncoding = RTL_TEXTENCODING_SYMBOL; break;     /* SYMBOL_CHARSET */
187         case 77:    eTextEncoding = RTL_TEXTENCODING_APPLE_ROMAN; break;/* MAC_CHARSET */
188         case 128:   eTextEncoding = RTL_TEXTENCODING_MS_932; break;     /* SHIFTJIS_CHARSET */
189         case 129:   eTextEncoding = RTL_TEXTENCODING_MS_949; break;     /* HANGEUL_CHARSET */
190         case 130:   eTextEncoding = RTL_TEXTENCODING_MS_1361; break;    /* JOHAB_CHARSET */
191         case 134:   eTextEncoding = RTL_TEXTENCODING_MS_936; break;     /* GB2312_CHARSET */
192         case 136:   eTextEncoding = RTL_TEXTENCODING_MS_950; break;     /* CHINESEBIG5_CHARSET */
193         case 161:   eTextEncoding = RTL_TEXTENCODING_MS_1253; break;    /* GREEK_CHARSET */
194         case 162:   eTextEncoding = RTL_TEXTENCODING_MS_1254; break;    /* TURKISH_CHARSET */
195         case 163:   eTextEncoding = RTL_TEXTENCODING_MS_1258; break;    /* VIETNAMESE_CHARSET !!! */
196         case 177:   eTextEncoding = RTL_TEXTENCODING_MS_1255; break;    /* HEBREW_CHARSET */
197         case 178:   eTextEncoding = RTL_TEXTENCODING_MS_1256; break;    /* ARABIC_CHARSET */
198         case 186:   eTextEncoding = RTL_TEXTENCODING_MS_1257; break;    /* BALTIC_CHARSET */
199         case 204:   eTextEncoding = RTL_TEXTENCODING_MS_1251; break;    /* RUSSIAN_CHARSET */
200         case 222:   eTextEncoding = RTL_TEXTENCODING_MS_874; break;     /* THAI_CHARSET */
201         case 238:   eTextEncoding = RTL_TEXTENCODING_MS_1250; break;    /* EASTEUROPE_CHARSET */
202         case 255:   eTextEncoding = RTL_TEXTENCODING_IBM_850; break;    /* OEM_CHARSET */
203         default:    eTextEncoding = RTL_TEXTENCODING_DONTKNOW; break;
204     }
205 
206     return eTextEncoding;
207 }
208 
209 /* ----------------------------------------------------------------------- */
210 
211 #if 0
212 
213 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMacTextEncoding( sal_uInt32 nMacTextEncoding )
214 {
215     rtl_TextEncoding eTextEncoding;
216 
217     switch ( nMacTextEncoding )
218     {
219         case 0:     eTextEncoding = RTL_TEXTENCODING_APPLE_ROMAN; break;
220         case 1:     eTextEncoding = RTL_TEXTENCODING_APPLE_JAPANESE; break;
221         case 2:     eTextEncoding = RTL_TEXTENCODING_APPLE_CHINTRAD; break;
222         case 3:     eTextEncoding = RTL_TEXTENCODING_APPLE_KOREAN; break;
223         case 4:     eTextEncoding = RTL_TEXTENCODING_APPLE_ARABIC; break;
224         case 5:     eTextEncoding = RTL_TEXTENCODING_APPLE_HEBREW; break;
225         case 6:     eTextEncoding = RTL_TEXTENCODING_APPLE_GREEK; break;
226         case 7:     eTextEncoding = RTL_TEXTENCODING_APPLE_CYRILLIC; break;
227         /* RIGHT-TO-LEFT SYMBOLS = 8  ??? */
228         case 9:     eTextEncoding = RTL_TEXTENCODING_APPLE_DEVANAGARI; break;
229         case 10:    eTextEncoding = RTL_TEXTENCODING_APPLE_GURMUKHI; break;
230         case 11:    eTextEncoding = RTL_TEXTENCODING_APPLE_GUJARATI; break;
231         /* MacOriya = 12 */
232         /* MacBengali = 13 */
233         /* MacTamil = 14 */
234         /* MacTelugu = 15 */
235         /* MacKannada = 16 */
236         /* MacMalayalam = 17 */
237         /* MacSinhalese = 18 */
238         /* MacBurmese = 19 */
239         /* MacKhmer = 20 */
240         case 21:    eTextEncoding = RTL_TEXTENCODING_APPLE_THAI; break;
241         /* MacLaotian = 22 */
242         /* MacGeorgian = 23 */
243         /* MacArmenian = 24 */
244         case 25:    eTextEncoding = RTL_TEXTENCODING_APPLE_CHINSIMP; break;
245         /* MacTibetan = 26 */
246         /* MacMongolian = 27 */
247         /* MacEthiopic = 28 */
248         case 29:    eTextEncoding = RTL_TEXTENCODING_APPLE_CENTEURO; break;     /* MacCentralEurRoman */
249         /* MacVietnamese = 30 */
250         /* MacExtArabic = 31 */
251         case 32:    eTextEncoding = RTL_TEXTENCODING_SYMBOL; break;
252         case 33:    eTextEncoding = RTL_TEXTENCODING_SYMBOL; break;     /* MacSymbol */
253         case 34:    eTextEncoding = RTL_TEXTENCODING_SYMBOL; break;     /* MacDingbats */
254         case 35:    eTextEncoding = RTL_TEXTENCODING_APPLE_TURKISH; break;      /* MacTurkish */
255         case 36:    eTextEncoding = RTL_TEXTENCODING_APPLE_CROATIAN; break;     /* MacCroatian */
256         case 37:    eTextEncoding = RTL_TEXTENCODING_APPLE_ICELAND; break;      /* MacIcelandic */
257         case 38:    eTextEncoding = RTL_TEXTENCODING_APPLE_ROMANIAN; break;     /* MacRomanian */
258         /* MacCeltic = 39 */
259         /* MacGaelic = 40 */
260         /* Beginning in Mac OS 8.5, the set of Mac OS script codes has been */
261         /* extended for some Mac OS components to include Unicode. Some of */
262         /* these components have only 7 bits available for script code, so */
263         /* kTextEncodingUnicodeDefault cannot be used to indicate Unicode. */
264         /* Instead, the following meta-value is used to indicate Unicode */
265         /* handled as a special Mac OS script code; TEC handles this value */
266         /* like kTextEncodingUnicodeDefault. */
267         /* The following use script code 4, smArabic */
268         case 0x08C: eTextEncoding = RTL_TEXTENCODING_APPLE_FARSI; break;        /* MacFarsi */
269         /* The following use script code 7, smCyrillic */
270         case 0x098: eTextEncoding = RTL_TEXTENCODING_APPLE_UKRAINIAN; break;    /* MacUkrainian */
271         /* The following use script code 28, smEthiopic */
272         /* MacInuit = 0xEC */
273         /* The following use script code 32, smUninterp */
274         case 0x0FC: eTextEncoding = RTL_TEXTENCODING_SYMBOL; break;       /* MacVT100 - VT100/102 font */
275         /* Special Mac OS encodings */
276         /* MacHFS = 0xFF */ /* metavalue. */
277         /* Unicode & ISO UCS encodings begin at 0x100 */
278         case 0x100: eTextEncoding = RTL_TEXTENCODING_UNICODE; break;    /* UnicodeDefault */
279         case 0x101: eTextEncoding = RTL_TEXTENCODING_UNICODE; break;    /* UnicodeV1_1 / ISO10646_1993 */
280         case 0x103: eTextEncoding = RTL_TEXTENCODING_UNICODE; break;    /* UnicodeV2_1 - new location for Korean Hangul / UnicodeV2_1 */
281         /* ISO 8-bit and 7-bit encodings begin at 0x200 */
282         case 0x201: eTextEncoding = RTL_TEXTENCODING_ISO_8859_1; break; /* ISOLatin1 - ISO 8859-1 */
283         case 0x202: eTextEncoding = RTL_TEXTENCODING_ISO_8859_2; break; /* ISOLatin2 - ISO 8859-2 */
284         case 0x203: eTextEncoding = RTL_TEXTENCODING_ISO_8859_3; break; /* ISOLatin3 - ISO 8859-3 */
285         case 0x204: eTextEncoding = RTL_TEXTENCODING_ISO_8859_4; break; /* ISOLatin4 - ISO 8859-4 */
286         case 0x205: eTextEncoding = RTL_TEXTENCODING_ISO_8859_5; break; /* ISOLatinCyrillic - ISO 8859-5 */
287         case 0x206: eTextEncoding = RTL_TEXTENCODING_ISO_8859_6; break; /* ISOLatinArabic - ISO 8859-6, = ASMO 708, =DOS CP 708 */
288         case 0x207: eTextEncoding = RTL_TEXTENCODING_ISO_8859_7; break; /* ISOLatinGreek - ISO 8859-7 */
289         case 0x208: eTextEncoding = RTL_TEXTENCODING_ISO_8859_8; break; /* ISOLatinHebrew - ISO 8859-8 */
290         case 0x209: eTextEncoding = RTL_TEXTENCODING_ISO_8859_9; break; /* ISOLatin5 - ISO 8859-9 */
291         /* MS-DOS & Windows encodings begin at 0x400 */
292         case 0x400: eTextEncoding = RTL_TEXTENCODING_IBM_437; break;    /* DOSLatinUS - code page 437 */
293         case 0x405: eTextEncoding = RTL_TEXTENCODING_IBM_737; break;    /* DOSGreek - code page 737 (formerly 437G) */
294         case 0x406: eTextEncoding = RTL_TEXTENCODING_IBM_775; break;    /* DOSBalticRim - code page 775 */
295         case 0x410: eTextEncoding = RTL_TEXTENCODING_IBM_850; break;    /* DOSLatin1 - code page 850, "Multilingual" */
296 /* !!!       case 0x411: eTextEncoding = RTL_TEXTENCODING_IBM_851; break; */    /* DOSGreek1 - code page 851 */
297         case 0x412: eTextEncoding = RTL_TEXTENCODING_IBM_852; break;    /* DOSLatin2 - code page 852, Slavic */
298         case 0x413: eTextEncoding = RTL_TEXTENCODING_IBM_855; break;    /* DOSCyrillic - code page 855, IBM Cyrillic */
299         case 0x414: eTextEncoding = RTL_TEXTENCODING_IBM_857; break;    /* DOSTurkish - code page 857, IBM Turkish */
300         case 0x415: eTextEncoding = RTL_TEXTENCODING_IBM_860; break;    /* DOSPortuguese - code page 860 */
301         case 0x416: eTextEncoding = RTL_TEXTENCODING_IBM_861; break;    /* DOSIcelandic - code page 861 */
302         case 0x417: eTextEncoding = RTL_TEXTENCODING_IBM_862; break;    /* DOSHebrew - code page 862 */
303         case 0x418: eTextEncoding = RTL_TEXTENCODING_IBM_863; break;    /* DOSCanadianFrench - code page 863 */
304         case 0x419: eTextEncoding = RTL_TEXTENCODING_IBM_864; break;    /* DOSArabic - code page 864 */
305         case 0x41A: eTextEncoding = RTL_TEXTENCODING_IBM_865; break;    /* DOSNordic - code page 865 */
306         case 0x41B: eTextEncoding = RTL_TEXTENCODING_IBM_866; break;    /* DOSRussian - code page 866 */
307         case 0x41C: eTextEncoding = RTL_TEXTENCODING_IBM_869; break;    /* DOSGreek2 - code page 869, IBM Modern Greek */
308         case 0x41D: eTextEncoding = RTL_TEXTENCODING_MS_874; break;     /* DOSThai - code page 874, also for Windows */
309         case 0x420: eTextEncoding = RTL_TEXTENCODING_MS_932; break;     /* DOSJapanese - code page 932, also for Windows */
310         case 0x421: eTextEncoding = RTL_TEXTENCODING_MS_936; break;     /* DOSChineseSimplif - code page 936, also for Windows */
311         case 0x422: eTextEncoding = RTL_TEXTENCODING_MS_949; break;     /* DOSKorean - code page 949, also for Windows;Unified Hangul */
312         case 0x423: eTextEncoding = RTL_TEXTENCODING_MS_950; break;     /* DOSChineseTrad - code page 950, also for Windows */
313         case 0x500: eTextEncoding = RTL_TEXTENCODING_MS_1252; break;    /* WindowsLatin1 / WindowsANSI - code page 1252 */
314         case 0x501: eTextEncoding = RTL_TEXTENCODING_MS_1250; break;    /* WindowsLatin2 - code page 1250, Central Europe */
315         case 0x502: eTextEncoding = RTL_TEXTENCODING_MS_1251; break;    /* WindowsCyrillic - code page 1251, Slavic Cyrillic */
316         case 0x503: eTextEncoding = RTL_TEXTENCODING_MS_1253; break;    /* WindowsGreek - code page 1253 */
317         case 0x504: eTextEncoding = RTL_TEXTENCODING_MS_1254; break;    /* WindowsLatin5 - code page 1254, Turkish */
318         case 0x505: eTextEncoding = RTL_TEXTENCODING_MS_1255; break;    /* WindowsHebrew - code page 1255 */
319         case 0x506: eTextEncoding = RTL_TEXTENCODING_MS_1256; break;    /* WindowsArabic - code page 1256 */
320         case 0x507: eTextEncoding = RTL_TEXTENCODING_MS_1257; break;    /* WindowsBalticRim - code page 1257 */
321         case 0x508: eTextEncoding = RTL_TEXTENCODING_MS_1258; break;    /* WindowsVietnamese - code page 1258 */
322         case 0x510: eTextEncoding = RTL_TEXTENCODING_MS_1361; break;    /* WindowsKoreanJohab - code page 1361, for Windows NT */
323         /* Various national standards begin at 0x600 */
324         case 0x600: eTextEncoding = RTL_TEXTENCODING_ASCII_US; break;    /* US_ASCII */
325         case 0x620: eTextEncoding = RTL_TEXTENCODING_JIS_X_0201; break;  /* JIS_X0201_76 */
326         case 0x621: eTextEncoding = RTL_TEXTENCODING_JIS_X_0208; break;  /* JIS_X0208_83 */
327         case 0x622: eTextEncoding = RTL_TEXTENCODING_JIS_X_0208; break;  /* JIS_X0208_90 */
328         case 0x623: eTextEncoding = RTL_TEXTENCODING_JIS_X_0212; break;  /* JIS_X0212_90 */
329         /* !!! JIS_C6226_78 = 0x624 */
330         /* !!! GB_2312_80 = 0x630 */
331         /* !!! GBK_95 = 0x631 */ /* annex to GB 13000-93; for Windows 95 */
332 //        case 0x640: eTextEncoding = RTL_TEXTENCODING_KSC_5601_1987; break; /* KSC_5601_87 */ /* same as KSC 5601-92 without Johab annex */
333 //        case 0x641: eTextEncoding = RTL_TEXTENCODING_KSC_5601_1992; break; /* KSC 5601-92 Johab annex */
334         /* !!! CNS_11643_92_P1 = 0x651 */ /* CNS 11643-1992 plane 1 */
335         /* !!! CNS_11643_92_P2 = 0x652 */ /* CNS 11643-1992 plane 2 */
336         /* !!! CNS_11643_92_P3 = 0x653 */ /* CNS 11643-1992 plane 3 (11643-1986 plane 14) */
337         /* ISO 2022 collections begin at 0x800 */
338         case 0x820: eTextEncoding = RTL_TEXTENCODING_ISO_2022_JP; break;    /* ISO_2022_JP */
339         case 0x821: eTextEncoding = RTL_TEXTENCODING_ISO_2022_JP; break;    /* ISO_2022_JP_2 */
340         case 0x830: eTextEncoding = RTL_TEXTENCODING_ISO_2022_CN; break;    /* ISO_2022_CN */
341         case 0x831: eTextEncoding = RTL_TEXTENCODING_ISO_2022_CN; break;    /* ISO_2022_CN_EXT */
342         /* !!! ISO_2022_KR = 0x840 */
343         /* EUC collections begin at 0x900 */
344         case 0x920: eTextEncoding = RTL_TEXTENCODING_EUC_JP; break;    /* EUC_JP - ISO 646,1-byte Katakana,JIS 208,JIS 212 */
345         case 0x930: eTextEncoding = RTL_TEXTENCODING_EUC_CN; break;    /* EUC_CN - ISO 646, GB 2312-80 */
346         case 0x931: eTextEncoding = RTL_TEXTENCODING_EUC_TW; break;    /* EUC_TW - ISO 646, CNS 11643-1992 Planes 1-16 */
347         case 0x940: eTextEncoding = RTL_TEXTENCODING_EUC_KR; break;    /* EUC_KR - ISO 646, KS C 5601-1987 */
348         /* Miscellaneous standards begin at 0xA00 */
349         case 0xA01: eTextEncoding = RTL_TEXTENCODING_SHIFT_JIS; break; /* ShiftJIS - plain Shift-JIS */
350         case 0xA02: eTextEncoding = RTL_TEXTENCODING_KOI8_R; break;    /* KOI8_R - Russian Internet standard */
351         case 0xA03: eTextEncoding = RTL_TEXTENCODING_BIG5; break;      /* Big5 - Big-5 */
352         /* !!! MacRomanLatin1 = 0xA04 */ /* Mac OS Roman permuted to align with 8859-1 */
353         /* !!! HZ_GB_2312 = 0xA05 */ /* HZ (RFC 1842, for Chinese mail & news) */
354         /* Other platform encodings */
355         /* !!! NextStepLatin = 0xB01 */ /* NextStep encoding */
356         /* EBCDIC & IBM host encodings begin at 0xC00 */
357         /* !!! EBCDIC_US = 0xC01 */ /* basic EBCDIC-US */
358         /* !!! EBCDIC_CP037 = 0xC02 */ /* code page 037, extended EBCDIC-US Latin1 */
359         /* Special value */
360         /* MultiRun = 0xFFF */ /* Multiple encoded text, external run info */
361         default:    eTextEncoding = RTL_TEXTENCODING_DONTKNOW; break;
362     };
363 
364     return eTextEncoding;
365 }
366 
367 #endif
368 
369 /* ----------------------------------------------------------------------- */
370 
rtl_getTextEncodingFromUnixCharset(const sal_Char * pUnixCharset)371 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pUnixCharset )
372 {
373     /* See <ftp://ftp.x.org/pub/DOCS/registry>, section 14 ("Font Charset
374      * (Registry and Encoding) Names").
375      */
376 
377     /* All Identifiers in the tables are lower case The function search */
378     /* for the first matching string in the tables. */
379     /* Sort order: unique (first 14, than 1), important */
380 
381     static ImplStrCharsetDef const aUnixCharsetISOTab[] =
382     {
383         { "15", RTL_TEXTENCODING_ISO_8859_15 },
384         { "14", RTL_TEXTENCODING_ISO_8859_14 },
385         { "13", RTL_TEXTENCODING_ISO_8859_13 },
386         { "11", RTL_TEXTENCODING_TIS_620 },
387         { "10", RTL_TEXTENCODING_ISO_8859_10 },
388         { "1", RTL_TEXTENCODING_ISO_8859_1 },
389         { "2", RTL_TEXTENCODING_ISO_8859_2 },
390         { "3", RTL_TEXTENCODING_ISO_8859_3 },
391         { "4", RTL_TEXTENCODING_ISO_8859_4 },
392         { "5", RTL_TEXTENCODING_ISO_8859_5 },
393         { "6", RTL_TEXTENCODING_ISO_8859_6 },
394         { "7", RTL_TEXTENCODING_ISO_8859_7 },
395         { "8", RTL_TEXTENCODING_ISO_8859_8 },
396         { "9", RTL_TEXTENCODING_ISO_8859_9 },
397         { NULL, RTL_TEXTENCODING_DONTKNOW }
398     };
399 
400     static ImplStrCharsetDef const aUnixCharsetADOBETab[] =
401     {
402         { "fontspecific", RTL_TEXTENCODING_SYMBOL },
403         { NULL, RTL_TEXTENCODING_DONTKNOW }
404     };
405 
406     static ImplStrCharsetDef const aUnixCharsetMSTab[] =
407     {
408         { "1252", RTL_TEXTENCODING_MS_1252 },
409         { "1250", RTL_TEXTENCODING_MS_1250 },
410         { "1251", RTL_TEXTENCODING_MS_1251 },
411         { "1253", RTL_TEXTENCODING_MS_1253 },
412         { "1254", RTL_TEXTENCODING_MS_1254 },
413         { "1255", RTL_TEXTENCODING_MS_1255 },
414         { "1256", RTL_TEXTENCODING_MS_1256 },
415         { "1257", RTL_TEXTENCODING_MS_1257 },
416         { "1258", RTL_TEXTENCODING_MS_1258 },
417         { "932", RTL_TEXTENCODING_MS_932 },
418         { "936", RTL_TEXTENCODING_MS_936 },
419         { "949", RTL_TEXTENCODING_MS_949 },
420         { "950", RTL_TEXTENCODING_MS_950 },
421         { "1361", RTL_TEXTENCODING_MS_1361 },
422         { "cp1252", RTL_TEXTENCODING_MS_1252 },
423         { "cp1250", RTL_TEXTENCODING_MS_1250 },
424         { "cp1251", RTL_TEXTENCODING_MS_1251 },
425         { "cp1253", RTL_TEXTENCODING_MS_1253 },
426         { "cp1254", RTL_TEXTENCODING_MS_1254 },
427         { "cp1255", RTL_TEXTENCODING_MS_1255 },
428         { "cp1256", RTL_TEXTENCODING_MS_1256 },
429         { "cp1257", RTL_TEXTENCODING_MS_1257 },
430         { "cp1258", RTL_TEXTENCODING_MS_1258 },
431         { "cp932", RTL_TEXTENCODING_MS_932 },
432         { "cp936", RTL_TEXTENCODING_MS_936 },
433         { "cp949", RTL_TEXTENCODING_MS_949 },
434         { "cp950", RTL_TEXTENCODING_MS_950 },
435         { "cp1361", RTL_TEXTENCODING_MS_1361 },
436         { NULL, RTL_TEXTENCODING_DONTKNOW }
437     };
438 
439     static ImplStrCharsetDef const aUnixCharsetIBMTab[] =
440     {
441         { "437", RTL_TEXTENCODING_IBM_437 },
442         { "850", RTL_TEXTENCODING_IBM_850 },
443         { "860", RTL_TEXTENCODING_IBM_860 },
444         { "861", RTL_TEXTENCODING_IBM_861 },
445         { "863", RTL_TEXTENCODING_IBM_863 },
446         { "865", RTL_TEXTENCODING_IBM_865 },
447         { "737", RTL_TEXTENCODING_IBM_737 },
448         { "775", RTL_TEXTENCODING_IBM_775 },
449         { "852", RTL_TEXTENCODING_IBM_852 },
450         { "855", RTL_TEXTENCODING_IBM_855 },
451         { "857", RTL_TEXTENCODING_IBM_857 },
452         { "862", RTL_TEXTENCODING_IBM_862 },
453         { "864", RTL_TEXTENCODING_IBM_864 },
454         { "866", RTL_TEXTENCODING_IBM_866 },
455         { "869", RTL_TEXTENCODING_IBM_869 },
456         { "874", RTL_TEXTENCODING_MS_874 },
457         { "1004", RTL_TEXTENCODING_MS_1252 },
458         { "65400", RTL_TEXTENCODING_SYMBOL },
459         { NULL, RTL_TEXTENCODING_DONTKNOW }
460     };
461 
462     static ImplStrCharsetDef const aUnixCharsetKOI8Tab[] =
463     {
464         { "r", RTL_TEXTENCODING_KOI8_R },
465         { "u", RTL_TEXTENCODING_KOI8_U },
466         { NULL, RTL_TEXTENCODING_DONTKNOW }
467     };
468 
469     static ImplStrCharsetDef aUnixCharsetJISX0208Tab[] =
470     {
471         { NULL, RTL_TEXTENCODING_JIS_X_0208 }
472     };
473 
474     static ImplStrCharsetDef aUnixCharsetJISX0201Tab[] =
475     {
476         { NULL, RTL_TEXTENCODING_JIS_X_0201 }
477     };
478 
479     static ImplStrCharsetDef aUnixCharsetJISX0212Tab[] =
480     {
481         { NULL, RTL_TEXTENCODING_JIS_X_0212 }
482     };
483 
484     static ImplStrCharsetDef aUnixCharsetGBTab[] =
485     {
486         { NULL, RTL_TEXTENCODING_GB_2312 }
487     };
488 
489     static ImplStrCharsetDef aUnixCharsetGBKTab[] =
490     {
491         { NULL, RTL_TEXTENCODING_GBK }
492     };
493 
494     static ImplStrCharsetDef aUnixCharsetBIG5Tab[] =
495     {
496         { NULL, RTL_TEXTENCODING_BIG5 }
497     };
498 
499     static ImplStrCharsetDef const aUnixCharsetKSC56011987Tab[] =
500     {
501         { NULL, RTL_TEXTENCODING_EUC_KR }
502     };
503 
504     static ImplStrCharsetDef const aUnixCharsetKSC56011992Tab[] =
505     {
506         { NULL, RTL_TEXTENCODING_MS_1361 }
507     };
508 
509     static ImplStrCharsetDef const aUnixCharsetISO10646Tab[] =
510     {
511         { NULL, RTL_TEXTENCODING_UNICODE }
512     };
513 
514     static ImplStrCharsetDef const aUnixCharsetUNICODETab[] =
515     {
516 /* Currently every Unicode Encoding is for us Unicode */
517 /*        { "fontspecific", RTL_TEXTENCODING_UNICODE }, */
518         { NULL, RTL_TEXTENCODING_UNICODE }
519     };
520 
521     static ImplStrCharsetDef const aUnixCharsetSymbolTab[] =
522     {
523         { NULL, RTL_TEXTENCODING_SYMBOL }
524     };
525 
526     /* See <http://cvs.freedesktop.org/xorg/xc/fonts/encodings/iso8859-11.enc?
527        rev=1.1.1.1>: */
528     static ImplStrCharsetDef const aUnixCharsetTIS620Tab[] =
529     {
530         { "0", RTL_TEXTENCODING_TIS_620 },
531         { "2529", RTL_TEXTENCODING_TIS_620 },
532         { "2533", RTL_TEXTENCODING_TIS_620 },
533         { NULL, RTL_TEXTENCODING_DONTKNOW }
534     };
535     static ImplStrCharsetDef const aUnixCharsetTIS6202529Tab[] =
536     {
537         { "1", RTL_TEXTENCODING_TIS_620 },
538         { NULL, RTL_TEXTENCODING_DONTKNOW }
539     };
540     static ImplStrCharsetDef const aUnixCharsetTIS6202533Tab[] =
541     {
542         { "0", RTL_TEXTENCODING_TIS_620 },
543         { "1", RTL_TEXTENCODING_TIS_620 },
544         { NULL, RTL_TEXTENCODING_DONTKNOW }
545     };
546 
547     static ImplStrFirstPartCharsetDef const aUnixCharsetFirstPartTab[] =
548     {
549         { "iso8859", aUnixCharsetISOTab },
550         { "adobe", aUnixCharsetADOBETab },
551         { "ansi", aUnixCharsetMSTab },
552         { "microsoft", aUnixCharsetMSTab },
553         { "ibm", aUnixCharsetIBMTab },
554         { "koi8", aUnixCharsetKOI8Tab },
555         { "jisx0208", aUnixCharsetJISX0208Tab },
556         { "jisx0208.1983", aUnixCharsetJISX0208Tab },
557         { "jisx0201", aUnixCharsetJISX0201Tab },
558         { "jisx0201.1976", aUnixCharsetJISX0201Tab },
559         { "jisx0212", aUnixCharsetJISX0212Tab },
560         { "jisx0212.1990", aUnixCharsetJISX0212Tab },
561         { "gb2312", aUnixCharsetGBTab },
562         { "gbk", aUnixCharsetGBKTab },
563         { "big5", aUnixCharsetBIG5Tab },
564         { "iso10646", aUnixCharsetISO10646Tab },
565 /*      { "unicode", aUnixCharsetUNICODETab }, */ /* fonts contain only default chars */
566         { "sunolcursor", aUnixCharsetSymbolTab },
567         { "sunolglyph", aUnixCharsetSymbolTab },
568         { "iso10646", aUnixCharsetUNICODETab },
569         { "ksc5601.1987", aUnixCharsetKSC56011987Tab },
570         { "ksc5601.1992", aUnixCharsetKSC56011992Tab },
571         { "tis620.2529", aUnixCharsetTIS6202529Tab },
572         { "tis620.2533", aUnixCharsetTIS6202533Tab },
573         { "tis620", aUnixCharsetTIS620Tab },
574 /*        { "sunudcja.1997",  },        */
575 /*        { "sunudcko.1997",  },        */
576 /*        { "sunudczh.1997",  },        */
577 /*        { "sunudczhtw.1997",  },      */
578         { NULL, NULL }
579     };
580 
581     rtl_TextEncoding    eEncoding = RTL_TEXTENCODING_DONTKNOW;
582     sal_Char*           pBuf;
583     sal_Char*           pTempBuf;
584     sal_uInt32          nBufLen = strlen( pUnixCharset )+1;
585     const sal_Char*     pFirstPart;
586     const sal_Char*     pSecondPart;
587 
588     /* Alloc Buffer and map to lower case */
589     pBuf = (char*)rtl_allocateMemory( nBufLen );
590     Impl_toAsciiLower( pUnixCharset, pBuf );
591 
592     /* Search FirstPart */
593     pFirstPart = pBuf;
594     pSecondPart = NULL;
595     pTempBuf = pBuf;
596     while ( *pTempBuf )
597     {
598         if ( *pTempBuf == '-' )
599         {
600             *pTempBuf = '\0';
601             pSecondPart = pTempBuf+1;
602             break;
603         }
604 
605         pTempBuf++;
606     }
607 
608     /* Parttrenner gefunden */
609     if ( pSecondPart )
610     {
611         /* Search for the part tab */
612         const ImplStrFirstPartCharsetDef* pFirstPartData = aUnixCharsetFirstPartTab;
613         while ( pFirstPartData->mpCharsetStr )
614         {
615             if ( Impl_matchString( pFirstPart, pFirstPartData->mpCharsetStr ) )
616             {
617                 /* Search for the charset in the second part tab */
618                 const ImplStrCharsetDef* pData = pFirstPartData->mpSecondPartTab;
619                 while ( pData->mpCharsetStr )
620                 {
621                     if ( Impl_matchString( pSecondPart, pData->mpCharsetStr ) )
622                     {
623                         eEncoding = pData->meTextEncoding;
624                         break;
625                     }
626 
627                     pData++;
628                 }
629 
630                 /* use default encoding for first part */
631                 eEncoding = pData->meTextEncoding;
632                 break;
633             }
634 
635             pFirstPartData++;
636         }
637     }
638 
639     rtl_freeMemory( pBuf );
640 
641     return eEncoding;
642 }
643 
644 /* ----------------------------------------------------------------------- */
645 
rtl_getTextEncodingFromMimeCharset(const sal_Char * pMimeCharset)646 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pMimeCharset )
647 {
648     /* All Identifiers are in lower case and contain only alphanumeric */
649     /* characters. The function search for the first equal string in */
650     /* the table. In this table are only the most used mime types. */
651     /* Sort order: important */
652     static ImplStrCharsetDef const aVIPMimeCharsetTab[] =
653     {
654         { "usascii", RTL_TEXTENCODING_ASCII_US },
655         { "utf8", RTL_TEXTENCODING_UTF8 },
656         { "utf7", RTL_TEXTENCODING_UTF7 },
657         { "iso88591", RTL_TEXTENCODING_ISO_8859_1 },
658         { "iso88592", RTL_TEXTENCODING_ISO_8859_2 },
659         { "iso88593", RTL_TEXTENCODING_ISO_8859_3 },
660         { "iso88594", RTL_TEXTENCODING_ISO_8859_4 },
661         { "iso88595", RTL_TEXTENCODING_ISO_8859_5 },
662         { "iso88596", RTL_TEXTENCODING_ISO_8859_6 },
663         { "iso88597", RTL_TEXTENCODING_ISO_8859_7 },
664         { "iso88598", RTL_TEXTENCODING_ISO_8859_8 },
665         { "iso88599", RTL_TEXTENCODING_ISO_8859_9 },
666         { "iso885910", RTL_TEXTENCODING_ISO_8859_10 },
667         { "iso885913", RTL_TEXTENCODING_ISO_8859_13 },
668         { "iso885914", RTL_TEXTENCODING_ISO_8859_14 },
669         { "iso885915", RTL_TEXTENCODING_ISO_8859_15 },
670         { "iso2022jp", RTL_TEXTENCODING_ISO_2022_JP },
671         { "iso2022jp2", RTL_TEXTENCODING_ISO_2022_JP },
672         { "iso2022cn", RTL_TEXTENCODING_ISO_2022_CN },
673         { "iso2022cnext", RTL_TEXTENCODING_ISO_2022_CN },
674         { "iso2022kr", RTL_TEXTENCODING_ISO_2022_KR },
675         { "eucjp", RTL_TEXTENCODING_EUC_JP },
676         { "shiftjis", RTL_TEXTENCODING_SHIFT_JIS },
677         { "mskanji", RTL_TEXTENCODING_MS_932 },
678         { "gb2312", RTL_TEXTENCODING_GB_2312 },
679         { "cngb", RTL_TEXTENCODING_GB_2312 },
680         { "big5", RTL_TEXTENCODING_BIG5 },
681         { "cnbig5", RTL_TEXTENCODING_BIG5 },
682         { "cngb12345", RTL_TEXTENCODING_GBT_12345 },
683         { "euckr", RTL_TEXTENCODING_EUC_KR },
684         { "koi8r", RTL_TEXTENCODING_KOI8_R },
685         { "windows1252", RTL_TEXTENCODING_MS_1252 },
686         { "windows1250", RTL_TEXTENCODING_MS_1250 },
687         { "windows1251", RTL_TEXTENCODING_MS_1251 },
688         { "windows1253", RTL_TEXTENCODING_MS_1253 },
689         { "windows1254", RTL_TEXTENCODING_MS_1254 },
690         { "windows1255", RTL_TEXTENCODING_MS_1255 },
691         { "windows1256", RTL_TEXTENCODING_MS_1256 },
692         { "windows1257", RTL_TEXTENCODING_MS_1257 },
693         { "windows1258", RTL_TEXTENCODING_MS_1258 },
694         { NULL, RTL_TEXTENCODING_DONTKNOW }
695     };
696 
697     /* All Identifiers are in lower case and contain only alphanumeric */
698     /* characters. The function search for the first matching string in */
699     /* the table. */
700     /* Sort order: unique (first iso885914, than iso88591), important */
701     static ImplStrCharsetDef const aMimeCharsetTab[] =
702     {
703         { "unicode11utf7", RTL_TEXTENCODING_UTF7 },
704         { "caunicode11utf7", RTL_TEXTENCODING_UTF7 },
705         { "iso88591windows30", RTL_TEXTENCODING_ISO_8859_1 },
706         { "iso88591win", RTL_TEXTENCODING_MS_1252 },
707         { "iso88592win", RTL_TEXTENCODING_MS_1250 },
708         { "iso88599win", RTL_TEXTENCODING_MS_1254 },
709         { "iso885915", RTL_TEXTENCODING_ISO_8859_15 },
710         { "iso885914", RTL_TEXTENCODING_ISO_8859_14 },
711         { "iso885913", RTL_TEXTENCODING_ISO_8859_13 },
712         { "iso885911", RTL_TEXTENCODING_TIS_620 },
713             /* This is no official MIME character set name, but it might be in
714                use in Thailand. */
715         { "iso885910", RTL_TEXTENCODING_ISO_8859_10 },
716         { "iso88591", RTL_TEXTENCODING_ISO_8859_1 },
717         { "iso88592", RTL_TEXTENCODING_ISO_8859_2 },
718         { "iso88593", RTL_TEXTENCODING_ISO_8859_3 },
719         { "iso88594", RTL_TEXTENCODING_ISO_8859_4 },
720         { "iso88595", RTL_TEXTENCODING_ISO_8859_5 },
721         { "iso88596", RTL_TEXTENCODING_ISO_8859_6 },
722         { "iso88597", RTL_TEXTENCODING_ISO_8859_7 },
723         { "iso88598", RTL_TEXTENCODING_ISO_8859_8 },
724         { "iso88599", RTL_TEXTENCODING_ISO_8859_9 },
725         { "isoir100", RTL_TEXTENCODING_ISO_8859_1 },
726         { "latin1", RTL_TEXTENCODING_ISO_8859_1 },
727         { "l1", RTL_TEXTENCODING_ISO_8859_1 },
728         { "cp819", RTL_TEXTENCODING_ISO_8859_1 },
729         { "ibm819", RTL_TEXTENCODING_ISO_8859_1 },
730         { "csisolatin1", RTL_TEXTENCODING_ISO_8859_1 },
731         { "isoir101", RTL_TEXTENCODING_ISO_8859_2 },
732         { "latin2", RTL_TEXTENCODING_ISO_8859_2 },
733         { "l2", RTL_TEXTENCODING_ISO_8859_2 },
734         { "csisolatin2", RTL_TEXTENCODING_ISO_8859_2 },
735         { "isoir109", RTL_TEXTENCODING_ISO_8859_3 },
736         { "latin3", RTL_TEXTENCODING_ISO_8859_3 },
737         { "l3", RTL_TEXTENCODING_ISO_8859_3 },
738         { "csisolatin3", RTL_TEXTENCODING_ISO_8859_3 },
739         { "isoir110", RTL_TEXTENCODING_ISO_8859_4 },
740         { "latin4", RTL_TEXTENCODING_ISO_8859_4 },
741         { "l4", RTL_TEXTENCODING_ISO_8859_4 },
742         { "csisolatin4", RTL_TEXTENCODING_ISO_8859_4 },
743         { "isoir144", RTL_TEXTENCODING_ISO_8859_5 },
744         { "cyrillicasian", RTL_TEXTENCODING_PT154 },
745         { "cyrillic", RTL_TEXTENCODING_ISO_8859_5 },
746         { "csisolatincyrillic", RTL_TEXTENCODING_ISO_8859_5 },
747         { "isoir127", RTL_TEXTENCODING_ISO_8859_6 },
748         { "arabic", RTL_TEXTENCODING_ISO_8859_6 },
749         { "csisolatinarabic", RTL_TEXTENCODING_ISO_8859_6 },
750         { "ecma114", RTL_TEXTENCODING_ISO_8859_6 },
751         { "asmo708", RTL_TEXTENCODING_ISO_8859_6 },
752         { "isoir126", RTL_TEXTENCODING_ISO_8859_7 },
753         { "greek", RTL_TEXTENCODING_ISO_8859_7 },
754         { "csisolatingreek", RTL_TEXTENCODING_ISO_8859_7 },
755         { "elot928", RTL_TEXTENCODING_ISO_8859_7 },
756         { "ecma118", RTL_TEXTENCODING_ISO_8859_7 },
757         { "isoir138", RTL_TEXTENCODING_ISO_8859_8 },
758         { "hebrew", RTL_TEXTENCODING_ISO_8859_8 },
759         { "csisolatinhebrew", RTL_TEXTENCODING_ISO_8859_8 },
760         { "isoir148", RTL_TEXTENCODING_ISO_8859_9 },
761         { "latin5", RTL_TEXTENCODING_ISO_8859_9 },
762         { "l5", RTL_TEXTENCODING_ISO_8859_9 },
763         { "csisolatin5", RTL_TEXTENCODING_ISO_8859_9 },
764         { "cswindows30latin1", RTL_TEXTENCODING_ISO_8859_1 },
765         { "cswindows30latin1", RTL_TEXTENCODING_ISO_8859_1 },
766         { "cswindows31latin1", RTL_TEXTENCODING_MS_1252 },
767         { "cswindows31latin2", RTL_TEXTENCODING_MS_1250 },
768         { "cswindows31latin5", RTL_TEXTENCODING_MS_1254 },
769         { "iso10646us", RTL_TEXTENCODING_ASCII_US },
770         { "iso646irv", RTL_TEXTENCODING_ASCII_US },
771         { "cskoi8r", RTL_TEXTENCODING_KOI8_R },
772         { "ibm437", RTL_TEXTENCODING_IBM_437 },
773         { "cp437", RTL_TEXTENCODING_IBM_437 },
774         { "437", RTL_TEXTENCODING_IBM_437 },
775         { "cspc8codepage437", RTL_TEXTENCODING_IBM_437 },
776         { "ansix34", RTL_TEXTENCODING_ASCII_US },
777         { "ibm367", RTL_TEXTENCODING_ASCII_US },
778         { "cp367", RTL_TEXTENCODING_ASCII_US },
779         { "csascii", RTL_TEXTENCODING_ASCII_US },
780         { "ibm775", RTL_TEXTENCODING_IBM_775 },
781         { "cp775", RTL_TEXTENCODING_IBM_775 },
782         { "cspc775baltic", RTL_TEXTENCODING_IBM_775 },
783         { "ibm850", RTL_TEXTENCODING_IBM_850 },
784         { "cp850", RTL_TEXTENCODING_IBM_850 },
785         { "850", RTL_TEXTENCODING_IBM_850 },
786         { "cspc850multilingual", RTL_TEXTENCODING_IBM_850 },
787 /*        { "ibm851", RTL_TEXTENCODING_IBM_851 }, */
788 /*        { "cp851", RTL_TEXTENCODING_IBM_851 }, */
789 /*        { "851", RTL_TEXTENCODING_IBM_851 }, */
790 /*        { "csibm851", RTL_TEXTENCODING_IBM_851 }, */
791         { "ibm852", RTL_TEXTENCODING_IBM_852 },
792         { "cp852", RTL_TEXTENCODING_IBM_852 },
793         { "852", RTL_TEXTENCODING_IBM_852 },
794         { "cspcp852", RTL_TEXTENCODING_IBM_852 },
795         { "ibm855", RTL_TEXTENCODING_IBM_855 },
796         { "cp855", RTL_TEXTENCODING_IBM_855 },
797         { "855", RTL_TEXTENCODING_IBM_855 },
798         { "csibm855", RTL_TEXTENCODING_IBM_855 },
799         { "ibm857", RTL_TEXTENCODING_IBM_857 },
800         { "cp857", RTL_TEXTENCODING_IBM_857 },
801         { "857", RTL_TEXTENCODING_IBM_857 },
802         { "csibm857", RTL_TEXTENCODING_IBM_857 },
803         { "ibm860", RTL_TEXTENCODING_IBM_860 },
804         { "cp860", RTL_TEXTENCODING_IBM_860 },
805         { "860", RTL_TEXTENCODING_IBM_860 },
806         { "csibm860", RTL_TEXTENCODING_IBM_860 },
807         { "ibm861", RTL_TEXTENCODING_IBM_861 },
808         { "cp861", RTL_TEXTENCODING_IBM_861 },
809         { "861", RTL_TEXTENCODING_IBM_861 },
810         { "csis", RTL_TEXTENCODING_IBM_861 },
811         { "csibm861", RTL_TEXTENCODING_IBM_861 },
812         { "ibm862", RTL_TEXTENCODING_IBM_862 },
813         { "cp862", RTL_TEXTENCODING_IBM_862 },
814         { "862", RTL_TEXTENCODING_IBM_862 },
815         { "cspc862latinhebrew", RTL_TEXTENCODING_IBM_862 },
816         { "ibm863", RTL_TEXTENCODING_IBM_863 },
817         { "cp863", RTL_TEXTENCODING_IBM_863 },
818         { "863", RTL_TEXTENCODING_IBM_863 },
819         { "csibm863", RTL_TEXTENCODING_IBM_863 },
820         { "ibm864", RTL_TEXTENCODING_IBM_864 },
821         { "cp864", RTL_TEXTENCODING_IBM_864 },
822         { "864", RTL_TEXTENCODING_IBM_864 },
823         { "csibm864", RTL_TEXTENCODING_IBM_864 },
824         { "ibm865", RTL_TEXTENCODING_IBM_865 },
825         { "cp865", RTL_TEXTENCODING_IBM_865 },
826         { "865", RTL_TEXTENCODING_IBM_865 },
827         { "csibm865", RTL_TEXTENCODING_IBM_865 },
828         { "ibm866", RTL_TEXTENCODING_IBM_866 },
829         { "cp866", RTL_TEXTENCODING_IBM_866 },
830         { "866", RTL_TEXTENCODING_IBM_866 },
831         { "csibm866", RTL_TEXTENCODING_IBM_866 },
832 /*        { "ibm868", RTL_TEXTENCODING_IBM_868 }, */
833 /*        { "cp868", RTL_TEXTENCODING_IBM_868 }, */
834 /*        { "cpar", RTL_TEXTENCODING_IBM_868 }, */
835 /*        { "csibm868", RTL_TEXTENCODING_IBM_868 }, */
836         { "ibm869", RTL_TEXTENCODING_IBM_869 },
837         { "cp869", RTL_TEXTENCODING_IBM_869 },
838         { "869", RTL_TEXTENCODING_IBM_869 },
839         { "cpgr", RTL_TEXTENCODING_IBM_869 },
840         { "csibm869", RTL_TEXTENCODING_IBM_869 },
841         { "ibm869", RTL_TEXTENCODING_IBM_869 },
842         { "cp869", RTL_TEXTENCODING_IBM_869 },
843         { "869", RTL_TEXTENCODING_IBM_869 },
844         { "cpgr", RTL_TEXTENCODING_IBM_869 },
845         { "csibm869", RTL_TEXTENCODING_IBM_869 },
846         { "mac", RTL_TEXTENCODING_APPLE_ROMAN },
847         { "csmacintosh", RTL_TEXTENCODING_APPLE_ROMAN },
848         { "shiftjis", RTL_TEXTENCODING_SHIFT_JIS },
849         { "mskanji", RTL_TEXTENCODING_MS_932 },
850         { "csshiftjis", RTL_TEXTENCODING_SHIFT_JIS },
851         { "jisx0208", RTL_TEXTENCODING_JIS_X_0208 },
852         { "jisc62261983", RTL_TEXTENCODING_JIS_X_0208 },
853         { "csiso87jisx0208", RTL_TEXTENCODING_JIS_X_0208 },
854         { "isoir86", RTL_TEXTENCODING_JIS_X_0208 },
855         { "x0208", RTL_TEXTENCODING_JIS_X_0208 },
856         { "jisx0201", RTL_TEXTENCODING_JIS_X_0201 },
857         { "cshalfwidthkatakana", RTL_TEXTENCODING_JIS_X_0201 },
858         { "x0201", RTL_TEXTENCODING_JIS_X_0201 },
859         { "jisx0212", RTL_TEXTENCODING_JIS_X_0212 },
860         { "csiso159jisx0212", RTL_TEXTENCODING_JIS_X_0212 },
861         { "isoir159", RTL_TEXTENCODING_JIS_X_0208 },
862         { "x0212", RTL_TEXTENCODING_JIS_X_0212 },
863         { "isoir6", RTL_TEXTENCODING_ASCII_US },
864         { "xsjis", RTL_TEXTENCODING_SHIFT_JIS },
865         { "sjis", RTL_TEXTENCODING_SHIFT_JIS },
866         { "ascii", RTL_TEXTENCODING_ASCII_US },
867         { "us", RTL_TEXTENCODING_ASCII_US },
868         { "gb180302000", RTL_TEXTENCODING_GB_18030 },
869             /* This is no actual MIME character set name, it is only in here
870                for backwards compatibility (before "GB18030" was officially
871                registered with IANA, this code contained some guesses of what
872                would become official names for GB18030). */
873         { "gb18030", RTL_TEXTENCODING_GB_18030 },
874         { "big5hkscs", RTL_TEXTENCODING_BIG5_HKSCS },
875         { "tis620", RTL_TEXTENCODING_TIS_620 },
876         { "gbk", RTL_TEXTENCODING_GBK },
877         { "cp936", RTL_TEXTENCODING_GBK },
878         { "ms936", RTL_TEXTENCODING_GBK },
879         { "windows936", RTL_TEXTENCODING_GBK },
880         { "cp874", RTL_TEXTENCODING_MS_874 },
881             /* This is no official MIME character set name, but it might be in
882                use in Thailand. */
883         { "ms874", RTL_TEXTENCODING_MS_874 },
884             /* This is no official MIME character set name, but it might be in
885                use in Thailand. */
886         { "windows874", RTL_TEXTENCODING_MS_874 },
887             /* This is no official MIME character set name, but it might be in
888                use in Thailand. */
889         { "koi8u", RTL_TEXTENCODING_KOI8_U },
890         { "cpis", RTL_TEXTENCODING_IBM_861 },
891         { "ksc56011987", RTL_TEXTENCODING_MS_949 },
892         { "isoir149", RTL_TEXTENCODING_MS_949 },
893         { "ksc56011989", RTL_TEXTENCODING_MS_949 },
894         { "ksc5601", RTL_TEXTENCODING_MS_949 },
895         { "korean", RTL_TEXTENCODING_MS_949 },
896         { "csksc56011987", RTL_TEXTENCODING_MS_949 },
897             /* Map KS_C_5601-1987 and aliases to MS-949 instead of EUC-KR, as
898                this character set identifier seems to be prominently used by MS
899                to stand for KS C 5601 plus MS-949 extensions */
900         { "latin9", RTL_TEXTENCODING_ISO_8859_15 },
901         { "adobestandardencoding", RTL_TEXTENCODING_ADOBE_STANDARD },
902         { "csadobestandardencoding", RTL_TEXTENCODING_ADOBE_STANDARD },
903         { "adobesymbolencoding", RTL_TEXTENCODING_ADOBE_SYMBOL },
904         { "cshppsmath", RTL_TEXTENCODING_ADOBE_SYMBOL },
905         { "ptcp154", RTL_TEXTENCODING_PT154 },
906         { "csptcp154", RTL_TEXTENCODING_PT154 },
907         { "pt154", RTL_TEXTENCODING_PT154 },
908         { "cp154", RTL_TEXTENCODING_PT154 },
909         { "xisciide", RTL_TEXTENCODING_ISCII_DEVANAGARI },
910             /* This is no official MIME character set name, but is in use by
911                various windows APIs. */
912         { NULL, RTL_TEXTENCODING_DONTKNOW }
913     };
914 
915     rtl_TextEncoding            eEncoding = RTL_TEXTENCODING_DONTKNOW;
916     sal_Char*                   pBuf;
917     const ImplStrCharsetDef*    pData = aVIPMimeCharsetTab;
918     sal_uInt32                  nBufLen = strlen( pMimeCharset )+1;
919 
920     /* Alloc Buffer and map to lower case and remove non alphanumeric chars */
921     pBuf = (char*)rtl_allocateMemory( nBufLen );
922     Impl_toAsciiLowerAndRemoveNonAlphanumeric( pMimeCharset, pBuf );
923 
924     /* Search for equal in the VIP table */
925     while ( pData->mpCharsetStr )
926     {
927         if ( strcmp( pBuf, pData->mpCharsetStr ) == 0 )
928         {
929             eEncoding = pData->meTextEncoding;
930             break;
931         }
932 
933         pData++;
934     }
935 
936     /* Search for matching in the mime table */
937     if ( eEncoding == RTL_TEXTENCODING_DONTKNOW )
938     {
939         pData = aMimeCharsetTab;
940         while ( pData->mpCharsetStr )
941         {
942             if ( Impl_matchString( pBuf, pData->mpCharsetStr ) )
943             {
944                 eEncoding = pData->meTextEncoding;
945                 break;
946             }
947 
948             pData++;
949         }
950     }
951 
952     rtl_freeMemory( pBuf );
953 
954     return eEncoding;
955 }
956 
957 /* ======================================================================= */
958 
rtl_getBestWindowsCharsetFromTextEncoding(rtl_TextEncoding eTextEncoding)959 sal_uInt8 SAL_CALL rtl_getBestWindowsCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding )
960 {
961     const ImplTextEncodingData* pData = Impl_getTextEncodingData( eTextEncoding );
962     if ( pData )
963         return pData->mnBestWindowsCharset;
964     else
965         return 1;
966 }
967 
968 /* ----------------------------------------------------------------------- */
969 
rtl_getBestUnixCharsetFromTextEncoding(rtl_TextEncoding eTextEncoding)970 const sal_Char* SAL_CALL rtl_getBestUnixCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding  )
971 {
972     const ImplTextEncodingData* pData = Impl_getTextEncodingData( eTextEncoding );
973     if ( pData )
974         return (sal_Char const *) pData->mpBestUnixCharset;
975     else if( eTextEncoding == RTL_TEXTENCODING_UNICODE )
976         return (sal_Char const *) "iso10646-1";
977     else
978         return 0;
979 }
980 
981 /* ----------------------------------------------------------------------- */
982 
rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding nEncoding)983 char const * SAL_CALL rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding
984                                                              nEncoding)
985 {
986     ImplTextEncodingData const * p = Impl_getTextEncodingData(nEncoding);
987     return p && (p->mnInfoFlags & RTL_TEXTENCODING_INFO_MIME) != 0 ?
988                p->mpBestMimeCharset : NULL;
989 }
990 
rtl_getBestMimeCharsetFromTextEncoding(rtl_TextEncoding eTextEncoding)991 const sal_Char* SAL_CALL rtl_getBestMimeCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding )
992 {
993     const ImplTextEncodingData* pData = Impl_getTextEncodingData( eTextEncoding );
994     if ( pData )
995         return (sal_Char const *) pData->mpBestMimeCharset;
996     else
997         return 0;
998 }
999 
1000 /* The following two functions are based on <http://www.sharmahd.com/tm/
1001    codepages.html>, <http://msdn.microsoft.com/workshop/author/dhtml/reference/
1002    charsets/charset4.asp>, and <http://www.iana.org/assignments/character-sets>.
1003  */
1004 
1005 rtl_TextEncoding SAL_CALL
rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage)1006 rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage)
1007 {
1008     switch (nCodePage)
1009     {
1010     case 437: return RTL_TEXTENCODING_IBM_437;
1011     case 708: return RTL_TEXTENCODING_ISO_8859_6;
1012     case 737: return RTL_TEXTENCODING_IBM_737;
1013     case 775: return RTL_TEXTENCODING_IBM_775;
1014     case 850: return RTL_TEXTENCODING_IBM_850;
1015     case 852: return RTL_TEXTENCODING_IBM_852;
1016     case 855: return RTL_TEXTENCODING_IBM_855;
1017     case 857: return RTL_TEXTENCODING_IBM_857;
1018     case 860: return RTL_TEXTENCODING_IBM_860;
1019     case 861: return RTL_TEXTENCODING_IBM_861;
1020     case 862: return RTL_TEXTENCODING_IBM_862;
1021     case 863: return RTL_TEXTENCODING_IBM_863;
1022     case 864: return RTL_TEXTENCODING_IBM_864;
1023     case 865: return RTL_TEXTENCODING_IBM_865;
1024     case 866: return RTL_TEXTENCODING_IBM_866;
1025     case 869: return RTL_TEXTENCODING_IBM_869;
1026     case 874: return RTL_TEXTENCODING_MS_874;
1027     case 932: return RTL_TEXTENCODING_MS_932;
1028     case 936: return RTL_TEXTENCODING_MS_936;
1029     case 949: return RTL_TEXTENCODING_MS_949;
1030     case 950: return RTL_TEXTENCODING_MS_950;
1031     case 1250: return RTL_TEXTENCODING_MS_1250;
1032     case 1251: return RTL_TEXTENCODING_MS_1251;
1033     case 1252: return RTL_TEXTENCODING_MS_1252;
1034     case 1253: return RTL_TEXTENCODING_MS_1253;
1035     case 1254: return RTL_TEXTENCODING_MS_1254;
1036     case 1255: return RTL_TEXTENCODING_MS_1255;
1037     case 1256: return RTL_TEXTENCODING_MS_1256;
1038     case 1257: return RTL_TEXTENCODING_MS_1257;
1039     case 1258: return RTL_TEXTENCODING_MS_1258;
1040     case 1361: return RTL_TEXTENCODING_MS_1361;
1041     case 10000: return RTL_TEXTENCODING_APPLE_ROMAN;
1042     case 10001: return RTL_TEXTENCODING_APPLE_JAPANESE;
1043     case 10002: return RTL_TEXTENCODING_APPLE_CHINTRAD;
1044     case 10003: return RTL_TEXTENCODING_APPLE_KOREAN;
1045     case 10004: return RTL_TEXTENCODING_APPLE_ARABIC;
1046     case 10005: return RTL_TEXTENCODING_APPLE_HEBREW;
1047     case 10006: return RTL_TEXTENCODING_APPLE_GREEK;
1048     case 10007: return RTL_TEXTENCODING_APPLE_CYRILLIC;
1049     case 10008: return RTL_TEXTENCODING_APPLE_CHINSIMP;
1050     case 10010: return RTL_TEXTENCODING_APPLE_ROMANIAN;
1051     case 10017: return RTL_TEXTENCODING_APPLE_UKRAINIAN;
1052     case 10029: return RTL_TEXTENCODING_APPLE_CENTEURO;
1053     case 10079: return RTL_TEXTENCODING_APPLE_ICELAND;
1054     case 10081: return RTL_TEXTENCODING_APPLE_TURKISH;
1055     case 10082: return RTL_TEXTENCODING_APPLE_CROATIAN;
1056     case 20127: return RTL_TEXTENCODING_ASCII_US;
1057     case 20866: return RTL_TEXTENCODING_KOI8_R;
1058     case 21866: return RTL_TEXTENCODING_KOI8_U;
1059     case 28591: return RTL_TEXTENCODING_ISO_8859_1;
1060     case 28592: return RTL_TEXTENCODING_ISO_8859_2;
1061     case 28593: return RTL_TEXTENCODING_ISO_8859_3;
1062     case 28594: return RTL_TEXTENCODING_ISO_8859_4;
1063     case 28595: return RTL_TEXTENCODING_ISO_8859_5;
1064     case 28596: return RTL_TEXTENCODING_ISO_8859_6;
1065     case 28597: return RTL_TEXTENCODING_ISO_8859_7;
1066     case 28598: return RTL_TEXTENCODING_ISO_8859_8;
1067     case 28599: return RTL_TEXTENCODING_ISO_8859_9;
1068     case 28605: return RTL_TEXTENCODING_ISO_8859_15;
1069     case 50220: return RTL_TEXTENCODING_ISO_2022_JP;
1070     case 50225: return RTL_TEXTENCODING_ISO_2022_KR;
1071     case 51932: return RTL_TEXTENCODING_EUC_JP;
1072     case 51936: return RTL_TEXTENCODING_EUC_CN;
1073     case 51949: return RTL_TEXTENCODING_EUC_KR;
1074     case 57002: return RTL_TEXTENCODING_ISCII_DEVANAGARI;
1075     case 65000: return RTL_TEXTENCODING_UTF7;
1076     case 65001: return RTL_TEXTENCODING_UTF8;
1077     default: return RTL_TEXTENCODING_DONTKNOW;
1078     }
1079 }
1080 
1081 sal_uInt32 SAL_CALL
rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding)1082 rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding)
1083 {
1084     switch (nEncoding)
1085     {
1086     case RTL_TEXTENCODING_IBM_437: return 437;
1087  /* case RTL_TEXTENCODING_ISO_8859_6: return 708; */
1088     case RTL_TEXTENCODING_IBM_737: return 737;
1089     case RTL_TEXTENCODING_IBM_775: return 775;
1090     case RTL_TEXTENCODING_IBM_850: return 850;
1091     case RTL_TEXTENCODING_IBM_852: return 852;
1092     case RTL_TEXTENCODING_IBM_855: return 855;
1093     case RTL_TEXTENCODING_IBM_857: return 857;
1094     case RTL_TEXTENCODING_IBM_860: return 860;
1095     case RTL_TEXTENCODING_IBM_861: return 861;
1096     case RTL_TEXTENCODING_IBM_862: return 862;
1097     case RTL_TEXTENCODING_IBM_863: return 863;
1098     case RTL_TEXTENCODING_IBM_864: return 864;
1099     case RTL_TEXTENCODING_IBM_865: return 865;
1100     case RTL_TEXTENCODING_IBM_866: return 866;
1101     case RTL_TEXTENCODING_IBM_869: return 869;
1102     case RTL_TEXTENCODING_MS_874: return 874;
1103     case RTL_TEXTENCODING_MS_932: return 932;
1104     case RTL_TEXTENCODING_MS_936: return 936;
1105     case RTL_TEXTENCODING_MS_949: return 949;
1106     case RTL_TEXTENCODING_MS_950: return 950;
1107     case RTL_TEXTENCODING_MS_1250: return 1250;
1108     case RTL_TEXTENCODING_MS_1251: return 1251;
1109     case RTL_TEXTENCODING_MS_1252: return 1252;
1110     case RTL_TEXTENCODING_MS_1253: return 1253;
1111     case RTL_TEXTENCODING_MS_1254: return 1254;
1112     case RTL_TEXTENCODING_MS_1255: return 1255;
1113     case RTL_TEXTENCODING_MS_1256: return 1256;
1114     case RTL_TEXTENCODING_MS_1257: return 1257;
1115     case RTL_TEXTENCODING_MS_1258: return 1258;
1116     case RTL_TEXTENCODING_MS_1361: return 1361;
1117     case RTL_TEXTENCODING_APPLE_ROMAN: return 10000;
1118     case RTL_TEXTENCODING_APPLE_JAPANESE: return 10001;
1119     case RTL_TEXTENCODING_APPLE_CHINTRAD: return 10002;
1120     case RTL_TEXTENCODING_APPLE_KOREAN: return 10003;
1121     case RTL_TEXTENCODING_APPLE_ARABIC: return 10004;
1122     case RTL_TEXTENCODING_APPLE_HEBREW: return 10005;
1123     case RTL_TEXTENCODING_APPLE_GREEK: return 10006;
1124     case RTL_TEXTENCODING_APPLE_CYRILLIC: return 10007;
1125     case RTL_TEXTENCODING_APPLE_CHINSIMP: return 10008;
1126     case RTL_TEXTENCODING_APPLE_ROMANIAN: return 10010;
1127     case RTL_TEXTENCODING_APPLE_UKRAINIAN: return 10017;
1128     case RTL_TEXTENCODING_APPLE_CENTEURO: return 10029;
1129     case RTL_TEXTENCODING_APPLE_ICELAND: return 10079;
1130     case RTL_TEXTENCODING_APPLE_TURKISH: return 10081;
1131     case RTL_TEXTENCODING_APPLE_CROATIAN: return 10082;
1132     case RTL_TEXTENCODING_ASCII_US: return 20127;
1133     case RTL_TEXTENCODING_KOI8_R: return 20866;
1134     case RTL_TEXTENCODING_KOI8_U: return 21866;
1135     case RTL_TEXTENCODING_ISO_8859_1: return 28591;
1136     case RTL_TEXTENCODING_ISO_8859_2: return 28592;
1137     case RTL_TEXTENCODING_ISO_8859_3: return 28593;
1138     case RTL_TEXTENCODING_ISO_8859_4: return 28594;
1139     case RTL_TEXTENCODING_ISO_8859_5: return 28595;
1140     case RTL_TEXTENCODING_ISO_8859_6: return 28596;
1141     case RTL_TEXTENCODING_ISO_8859_7: return 28597;
1142     case RTL_TEXTENCODING_ISO_8859_8: return 28598;
1143     case RTL_TEXTENCODING_ISO_8859_9: return 28599;
1144     case RTL_TEXTENCODING_ISO_8859_15: return 28605;
1145     case RTL_TEXTENCODING_ISO_2022_JP: return 50220;
1146     case RTL_TEXTENCODING_ISO_2022_KR: return 50225;
1147     case RTL_TEXTENCODING_EUC_JP: return 51932;
1148     case RTL_TEXTENCODING_EUC_CN: return 51936;
1149     case RTL_TEXTENCODING_EUC_KR: return 51949;
1150     case RTL_TEXTENCODING_ISCII_DEVANAGARI: return 57002;
1151     case RTL_TEXTENCODING_UTF7: return 65000;
1152     case RTL_TEXTENCODING_UTF8: return 65001;
1153     default: return 0;
1154     }
1155 }
1156