1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #ifndef _RTL_TENCINFO_H 25 #define _RTL_TENCINFO_H 26 27 #ifndef _SAL_TYPES_H 28 #include <sal/types.h> 29 #endif 30 #include <rtl/textenc.h> 31 32 #ifdef __cplusplus 33 extern "C" { 34 #endif 35 36 // See rtl_TextEncodingInfo.Flags below for documentation on these values: 37 #define RTL_TEXTENCODING_INFO_CONTEXT ((sal_uInt32)0x00000001) 38 #define RTL_TEXTENCODING_INFO_ASCII ((sal_uInt32)0x00000002) 39 #define RTL_TEXTENCODING_INFO_UNICODE ((sal_uInt32)0x00000004) 40 #define RTL_TEXTENCODING_INFO_MULTIBYTE ((sal_uInt32)0x00000008) 41 #define RTL_TEXTENCODING_INFO_R2L ((sal_uInt32)0x00000010) 42 #define RTL_TEXTENCODING_INFO_7BIT ((sal_uInt32)0x00000020) 43 #define RTL_TEXTENCODING_INFO_SYMBOL ((sal_uInt32)0x00000040) 44 #define RTL_TEXTENCODING_INFO_MIME ((sal_uInt32)0x00000080) 45 46 /** Information about a text encoding. 47 */ 48 typedef struct _rtl_TextEncodingInfo 49 { 50 /** The size (in bytes) of this structure. Should be 12. 51 */ 52 sal_uInt32 StructSize; 53 54 /** The minimum number of bytes needed to encode any character in the 55 given encoding. 56 57 Can be rather meaningless for encodings that encode global state along 58 with the characters (e.g., ISO-2022 encodings). 59 */ 60 sal_uInt8 MinimumCharSize; 61 62 /** The maximum number of bytes needed to encode any character in the 63 given encoding. 64 65 Can be rather meaningless for encodings that encode global state along 66 with the characters (e.g., ISO-2022 encodings). 67 */ 68 sal_uInt8 MaximumCharSize; 69 70 /** The average number of bytes needed to encode a character in the given 71 encoding. 72 */ 73 sal_uInt8 AverageCharSize; 74 75 /** An unused byte, for padding. 76 */ 77 sal_uInt8 Reserved; 78 79 /** Any combination of the RTL_TEXTENCODING_INFO flags. 80 81 RTL_TEXTENCODING_INFO_CONTEXT: The encoding uses some mechanism (like 82 state-changing byte sequences) to switch between different modes (e.g., 83 to encode multiple character repertoires within the same byte ranges). 84 85 Even if an encoding does not have the CONTEXT property, interpretation 86 of certain byte values within that encoding can depend on context (e.g., 87 a certain byte value could be either a single-byte character or a 88 subsequent byte of a multi-byte character). Likewise, the single shift 89 characters (SS2 and SS3) used by some of the EUC encodings (to denote 90 that the following bytes constitute a character from another character 91 repertoire) do not imply that encodings making use of these characters 92 have the CONTEXT property. Examples of encodings that do have the 93 CONTEXT property are the ISO-2022 encodings and UTF-7. 94 95 RTL_TEXTENCODING_INFO_ASCII: The encoding is a superset of ASCII. More 96 specifically, any appearance of a byte in the range 0x20--7F denotes the 97 corresponding ASCII character (from SPACE to DELETE); in particular, 98 such a byte cannot be part of a multi-byte character. Note that the 99 ASCII control codes 0x00--1F are not included here, as they are used for 100 special purposes in some encodings. 101 102 If an encoding has this property, it is easy to search for occurrences of 103 ASCII characters within strings of this encoding---you do not need to 104 keep track whether a byte in the range 0x20--7F really represents an 105 ASCII character or rather is part of some multi-byte character. 106 107 The guarantees when mapping between Unicode and a given encoding with 108 the ASCII property are as follows: When mapping from Unicode to the 109 given encoding, U+0020--007F map to 0x20--7F (but there can also be 110 other Unicode characters mapping into the range 0x20--7F), and when 111 mapping from the given encoding to Unicode, 0x20--7F map to U+0020--007F 112 (again, there can also be other characters mapping into the range 113 U+0020--007F). In particular, this ensures round-trip conversion for 114 the ASCII range. 115 116 In principle, the ASCII property is orthogonal to the CONTEXT property. 117 In practice, however, an encoding that has the ASCII property will most 118 likely not also have the CONTEXT property. 119 120 RTL_TEXTENCODING_INFO_UNICODE: The encoding is based on the Unicode 121 character repertoire. 122 123 RTL_TEXTENCODING_INFO_MULTIBYTE: A multi-byte encoding. 124 125 RTL_TEXTENCODING_INFO_R2L: An encoding used mainly or exclusively for 126 languages written from right to left. 127 128 RTL_TEXTENCODING_INFO_7BIT: A 7-bit instead of an 8-bit encoding. 129 130 RTL_TEXTENCODING_INFO_SYMBOL: A (generic) encoding for symbol character 131 sets. 132 133 RTL_TEXTENCODING_INFO_MIME: The encoding is registered as a MIME 134 charset. 135 */ 136 sal_uInt32 Flags; 137 } rtl_TextEncodingInfo; 138 139 /** Determine whether a text encoding uses single octets as basic units of 140 information (and can thus be used with the conversion routines in 141 rtl/textcvt.h). 142 143 @param nEncoding 144 Any rtl_TextEncoding value. 145 146 @return 147 True if the given encoding uses single octets as basic units of 148 information, false otherwise. 149 */ 150 sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding); 151 152 /** Return information about a text encoding. 153 154 @param eTextEncoding 155 Any rtl_TextEncoding value. 156 157 @param pEncInfo 158 Returns information about the given encoding. Must not be null, and the 159 StructSize member must be set correctly. 160 161 @return 162 True if information about the given encoding is available, false 163 otherwise. 164 */ 165 sal_Bool SAL_CALL rtl_getTextEncodingInfo( rtl_TextEncoding eTextEncoding, rtl_TextEncodingInfo* pEncInfo ); 166 167 /** Map from a numeric Windows charset to a text encoding. 168 169 @param nWinCharset 170 Any numeric Windows charset. 171 172 @return 173 The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 174 no mapping is applicable. 175 */ 176 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromWindowsCharset( sal_uInt8 nWinCharset ); 177 178 /** Map from a MIME charset to a text encoding. 179 180 @param pMimeCharset 181 Any MIME charset string. Must not be null. 182 183 @return 184 The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 185 no mapping is applicable. 186 */ 187 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pMimeCharset ); 188 189 /** Map from a Unix charset to a text encoding. 190 191 @param pMimeCharset 192 Any Unix charset string. Must not be null. 193 194 @return 195 The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 196 no mapping is applicable. 197 */ 198 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pUnixCharset ); 199 200 /** Map from a text encoding to the best matching numeric Windows charset. 201 202 @param eTextEncoding 203 Any rtl_TextEncoding value. 204 205 @return 206 The best matching numeric Windows charset, or 1 if none matches. 207 */ 208 sal_uInt8 SAL_CALL rtl_getBestWindowsCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 209 210 /** Map from a text encoding to a corresponding MIME charset name, if 211 available (see <http://www.iana.org/assignments/character-sets>). 212 213 @param nEncoding 214 Any rtl_TextEncoding value. 215 216 @return 217 The (preferred) MIME charset name corresponding to the given encoding, or 218 NULL if none is available. 219 */ 220 char const * SAL_CALL rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding 221 nEncoding); 222 223 /** Map from a text encoding to the best matching MIME charset. 224 225 @param eTextEncoding 226 Any rtl_TextEncoding value. 227 228 @return 229 The best matching MIME charset string, or null if none matches. 230 */ 231 const sal_Char* SAL_CALL rtl_getBestMimeCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 232 233 /** Map from a text encoding to the best matching Unix charset. 234 235 @param eTextEncoding 236 Any rtl_TextEncoding value. 237 238 @return 239 The best matching Unix charset string, or null if none matches. 240 */ 241 const sal_Char* SAL_CALL rtl_getBestUnixCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 242 243 /** Map from a Windows code page to a text encoding. 244 245 @param nCodePage 246 Any Windows code page number. 247 248 @return 249 The corresponding rtl_TextEncoding value (which will be an octet text 250 encoding, see rtl_isOctetTextEncoding), or RTL_TEXTENCODING_DONTKNOW if no 251 mapping is applicable. 252 */ 253 rtl_TextEncoding SAL_CALL 254 rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage); 255 256 /** Map from a text encoding to a Windows code page. 257 258 @param nEncoding 259 Any rtl_TextEncoding value. 260 261 @return 262 The corresponding Windows code page number, or 0 if no mapping is 263 applicable. 264 */ 265 sal_uInt32 SAL_CALL 266 rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding); 267 268 #ifdef __cplusplus 269 } 270 #endif 271 272 #endif /* _RTL_TENCINFO_H */ 273