xref: /trunk/main/sal/inc/rtl/tencinfo.h (revision 86e1cf34)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #ifndef _RTL_TENCINFO_H
25 #define _RTL_TENCINFO_H
26 
27 #ifndef _SAL_TYPES_H
28 #include <sal/types.h>
29 #endif
30 #include <rtl/textenc.h>
31 
32 #ifdef __cplusplus
33 extern "C" {
34 #endif
35 
36 /* See rtl_TextEncodingInfo.Flags below for full documentation on these values: */
37 #define RTL_TEXTENCODING_INFO_CONTEXT   ((sal_uInt32)0x00000001) /* switches between modes */
38 #define RTL_TEXTENCODING_INFO_ASCII     ((sal_uInt32)0x00000002) /* superset of ASCII (0x20--7F) */
39 #define RTL_TEXTENCODING_INFO_UNICODE   ((sal_uInt32)0x00000004) /* based on the Unicode repertoire */
40 #define RTL_TEXTENCODING_INFO_MULTIBYTE ((sal_uInt32)0x00000008) /* multi-byte encoding */
41 #define RTL_TEXTENCODING_INFO_R2L       ((sal_uInt32)0x00000010) /* mainly right-to-left languages */
42 #define RTL_TEXTENCODING_INFO_7BIT      ((sal_uInt32)0x00000020) /* 7-bit rather than 8-bit */
43 #define RTL_TEXTENCODING_INFO_SYMBOL    ((sal_uInt32)0x00000040) /* generic symbol character set */
44 #define RTL_TEXTENCODING_INFO_MIME      ((sal_uInt32)0x00000080) /* registered MIME charset */
45 
46 /** Information about a text encoding, as filled in by rtl_getTextEncodingInfo.
47  */
48 typedef struct _rtl_TextEncodingInfo
49 {
50     /** The size (in bytes) of this structure (4 + 4 * 1 + 4).  Should be 12.
51      */
52     sal_uInt32          StructSize;
53 
54     /** The minimum number of bytes needed to encode any character in the
55         given encoding.
56 
57         Can be rather meaningless for encodings that encode global state along
58         with the characters (e.g., ISO-2022 encodings).
59      */
60     sal_uInt8           MinimumCharSize;
61 
62     /** The maximum number of bytes needed to encode any character in the
63         given encoding.
64 
65         Can be rather meaningless for encodings that encode global state along
66         with the characters (e.g., ISO-2022 encodings).
67      */
68     sal_uInt8           MaximumCharSize;
69 
70     /** The average number of bytes needed to encode a character in the given
71         encoding.
72      */
73     sal_uInt8           AverageCharSize;
74 
75     /** An unused byte, for padding.
76      */
77     sal_uInt8           Reserved;
78 
79     /** Any combination of the RTL_TEXTENCODING_INFO_* flags.
80 
81         RTL_TEXTENCODING_INFO_CONTEXT:  The encoding uses some mechanism (like
82         state-changing byte sequences) to switch between different modes (e.g.,
83         to encode multiple character repertoires within the same byte ranges).
84 
85         Even if an encoding does not have the CONTEXT property, interpretation
86         of certain byte values within that encoding can depend on context (e.g.,
87         a certain byte value could be either a single-byte character or a
88         subsequent byte of a multi-byte character).  Likewise, the single shift
89         characters (SS2 and SS3) used by some of the EUC encodings (to denote
90         that the following bytes constitute a character from another character
91         repertoire) do not imply that encodings making use of these characters
92         have the CONTEXT property.  Examples of encodings that do have the
93         CONTEXT property are the ISO-2022 encodings and UTF-7.
94 
95         RTL_TEXTENCODING_INFO_ASCII:  The encoding is a superset of ASCII.  More
96         specifically, any appearance of a byte in the range 0x20--7F denotes the
97         corresponding ASCII character (from SPACE to DELETE); in particular,
98         such a byte cannot be part of a multi-byte character.  Note that the
99         ASCII control codes 0x00--1F are not included here, as they are used for
100         special purposes in some encodings.
101 
102         If an encoding has this property, it is easy to search for occurrences of
103         ASCII characters within strings of this encoding---you do not need to
104         keep track of whether a byte in the range 0x20--7F really represents an
105         ASCII character or rather is part of some multi-byte character.
106 
107         The guarantees when mapping between Unicode and a given encoding with
108         the ASCII property are as follows:  When mapping from Unicode to the
109         given encoding, U+0020--007F map to 0x20--7F (but there can also be
110         other Unicode characters mapping into the range 0x20--7F), and when
111         mapping from the given encoding to Unicode, 0x20--7F map to U+0020--007F
112         (again, there can also be other characters mapping into the range
113         U+0020--007F).  In particular, this ensures round-trip conversion for
114         the ASCII range.
115 
116         In principle, the ASCII property is orthogonal to the CONTEXT property.
117         In practice, however, an encoding that has the ASCII property will most
118         likely not also have the CONTEXT property.
119 
120         RTL_TEXTENCODING_INFO_UNICODE:  The encoding is based on the Unicode
121         character repertoire.
122 
123         RTL_TEXTENCODING_INFO_MULTIBYTE:  A multi-byte encoding.
124 
125         RTL_TEXTENCODING_INFO_R2L:  An encoding used mainly or exclusively for
126         languages written from right to left.
127 
128         RTL_TEXTENCODING_INFO_7BIT:  A 7-bit instead of an 8-bit encoding.
129 
130         RTL_TEXTENCODING_INFO_SYMBOL:  A (generic) encoding for symbol character
131         sets.
132 
133         RTL_TEXTENCODING_INFO_MIME:  The encoding is registered as a MIME
134         charset.
135      */
136     sal_uInt32          Flags;
137 } rtl_TextEncodingInfo;
138 
139 /** Determine whether a text encoding uses single octets as basic units of
140     information (and can thus be used with the conversion routines declared
141     in <rtl/textcvt.h>).
142 
143     @param nEncoding
144     Any rtl_TextEncoding value.
145 
146     @return
147     True if the given encoding uses single octets as basic units of
148     information, false otherwise.
149  */
150 sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding);
151 
152 /** Return information about a text encoding.
153 
154     @param eTextEncoding
155     Any rtl_TextEncoding value.
156 
157     @param pEncInfo
158     Receives the information about the given encoding.  Must not be null, and
159     its StructSize member must be set to the size of the structure beforehand.
160 
161     @return
162     True if information about the given encoding is available, false
163     otherwise.
164  */
165 sal_Bool SAL_CALL rtl_getTextEncodingInfo( rtl_TextEncoding eTextEncoding, rtl_TextEncodingInfo* pEncInfo );
166 
167 /** Map from a numeric Windows charset to a text encoding.
168 
169     @param nWinCharset
170     Any numeric Windows charset (cf. rtl_getBestWindowsCharsetFromTextEncoding).
171 
172     @return
173     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
174     no mapping is applicable.
175  */
176 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromWindowsCharset( sal_uInt8 nWinCharset );
177 
178 /** Map from a MIME charset name to a text encoding.
179 
180     @param pMimeCharset
181     Any MIME charset name string.  Must not be null.
182 
183     @return
184     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
185     no mapping is applicable.
186  */
187 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pMimeCharset );
188 
189 /** Map from a Unix charset to a text encoding.
190 
191     @param pUnixCharset
192     Any Unix charset string.  Must not be null.
193 
194     @return
195     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
196     no mapping is applicable.
197  */
198 rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pUnixCharset );
199 
200 /** Map from a text encoding to the best matching numeric Windows charset.
201 
202     @param eTextEncoding
203     Any rtl_TextEncoding value.
204 
205     @return
206     The best matching numeric Windows charset, or 1 (the value of Windows' DEFAULT_CHARSET) if none matches.
207  */
208 sal_uInt8       SAL_CALL rtl_getBestWindowsCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding );
209 
210 /** Map from a text encoding to a corresponding MIME charset name, if
211     available (see <http://www.iana.org/assignments/character-sets>).
212 
213     @param nEncoding
214     Any rtl_TextEncoding value.
215 
216     @return
217     The (preferred) MIME charset name corresponding to the given encoding, or
218     NULL if none is available (cf. rtl_getBestMimeCharsetFromTextEncoding).
219  */
220 char const * SAL_CALL rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding
221                                                              nEncoding);
222 
223 /** Map from a text encoding to the best matching MIME charset name.
224 
225     @param eTextEncoding
226     Any rtl_TextEncoding value.
227 
228     @return
229     The best matching MIME charset name string, or NULL if none matches.
230  */
231 const sal_Char* SAL_CALL rtl_getBestMimeCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding );
232 
233 /** Map from a text encoding to the best matching Unix charset name.
234 
235     @param eTextEncoding
236     Any rtl_TextEncoding value.
237 
238     @return
239     The best matching Unix charset name string, or NULL if none matches.
240  */
241 const sal_Char* SAL_CALL rtl_getBestUnixCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding  );
242 
243 /** Map from a Windows code page to a text encoding.
244 
245     @param nCodePage
246     Any Windows code page number (cf. rtl_getWindowsCodePageFromTextEncoding).
247 
248     @return
249     The corresponding rtl_TextEncoding value (which will be an octet text
250     encoding, see rtl_isOctetTextEncoding), or RTL_TEXTENCODING_DONTKNOW if no
251     mapping is applicable.
252  */
253 rtl_TextEncoding SAL_CALL
254 rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage);
255 
256 /** Map from a text encoding to a Windows code page.
257 
258     @param nEncoding
259     Any rtl_TextEncoding value.
260 
261     @return
262     The corresponding Windows code page number, or 0 if no mapping is
263     applicable (cf. rtl_getTextEncodingFromWindowsCodePage).
264  */
265 sal_uInt32 SAL_CALL
266 rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding);
267 
268 #ifdef __cplusplus
269 }
270 #endif
271 
272 #endif /* _RTL_TENCINFO_H */
273