xref: /trunk/main/sal/textenc/unichars.c (revision 0cf2fd93dccc69f9c6d59dfb2dbcea73e82909ce)
/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/
21647f063dSAndrew Rist 
22cdf0e10cSrcweir #include "unichars.h"
23cdf0e10cSrcweir #include "osl/diagnose.h"
24cdf0e10cSrcweir #include "sal/types.h"
25cdf0e10cSrcweir 
/* Tell whether nUtf32 counts as a noncharacter, as of Unicode 3.1.1.
 *
 * Returns 1 for values above 0x10FFFF, for the range 0xFDD0..0xFDEF, and
 * for every value whose low 16 bits are 0xFFFE or 0xFFFF; returns 0
 * otherwise.
 */
int ImplIsNoncharacter(sal_uInt32 nUtf32)
{
    /* Anything past the last valid code point is treated as a
       noncharacter as well. */
    if (nUtf32 > 0x10FFFF)
    {
        return 1;
    }

    /* The contiguous noncharacter block. */
    if (nUtf32 >= 0xFDD0 && nUtf32 <= 0xFDEF)
    {
        return 1;
    }

    /* The last two positions of each 64K block (xxFFFE and xxFFFF). */
    return (nUtf32 & 0xFFFF) >= 0xFFFE;
}
33cdf0e10cSrcweir 
/* Tell whether nUtf32 is one of the code points of
 * <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
 * that have a General Category of Cc (Other, Control) or Cf (Other, Format).
 *
 * Returns 1 if so, 0 otherwise.
 */
int ImplIsControlOrFormat(sal_uInt32 nUtf32)
{
    /* Contiguous ranges are tested first... */
    if (nUtf32 <= 0x001F)
    {
        return 1;
    }
    if (nUtf32 >= 0x007F && nUtf32 <= 0x009F)
    {
        return 1;
    }
    if (nUtf32 >= 0xE0020 && nUtf32 <= 0xE007F)
    {
        return 1;
    }

    /* ...then the individually listed code points. */
    switch (nUtf32)
    {
    case 0x070F:  /* SYRIAC ABBREVIATION MARK */
    case 0x180B:  /* MONGOLIAN FREE VARIATION SELECTOR ONE */
    case 0x180C:  /* MONGOLIAN FREE VARIATION SELECTOR TWO */
    case 0x180D:  /* MONGOLIAN FREE VARIATION SELECTOR THREE */
    case 0x180E:  /* MONGOLIAN VOWEL SEPARATOR */
    case 0x200C:  /* ZERO WIDTH NON-JOINER */
    case 0x200D:  /* ZERO WIDTH JOINER */
    case 0x200E:  /* LEFT-TO-RIGHT MARK */
    case 0x200F:  /* RIGHT-TO-LEFT MARK */
    case 0x202A:  /* LEFT-TO-RIGHT EMBEDDING */
    case 0x202B:  /* RIGHT-TO-LEFT EMBEDDING */
    case 0x202C:  /* POP DIRECTIONAL FORMATTING */
    case 0x202D:  /* LEFT-TO-RIGHT OVERRIDE */
    case 0x202E:  /* RIGHT-TO-LEFT OVERRIDE */
    case 0x206A:  /* INHIBIT SYMMETRIC SWAPPING */
    case 0x206B:  /* ACTIVATE SYMMETRIC SWAPPING */
    case 0x206C:  /* INHIBIT ARABIC FORM SHAPING */
    case 0x206D:  /* ACTIVATE ARABIC FORM SHAPING */
    case 0x206E:  /* NATIONAL DIGIT SHAPES */
    case 0x206F:  /* NOMINAL DIGIT SHAPES */
    case 0xFEFF:  /* ZERO WIDTH NO-BREAK SPACE */
    case 0xFFF9:  /* INTERLINEAR ANNOTATION ANCHOR */
    case 0xFFFA:  /* INTERLINEAR ANNOTATION SEPARATOR */
    case 0xFFFB:  /* INTERLINEAR ANNOTATION TERMINATOR */
    case 0x1D173: /* MUSICAL SYMBOL BEGIN BEAM */
    case 0x1D174: /* MUSICAL SYMBOL END BEAM */
    case 0x1D175: /* MUSICAL SYMBOL BEGIN TIE */
    case 0x1D176: /* MUSICAL SYMBOL END TIE */
    case 0x1D177: /* MUSICAL SYMBOL BEGIN SLUR */
    case 0x1D178: /* MUSICAL SYMBOL END SLUR */
    case 0x1D179: /* MUSICAL SYMBOL BEGIN PHRASE */
    case 0x1D17A: /* MUSICAL SYMBOL END PHRASE */
    case 0xE0001: /* LANGUAGE TAG */
        return 1;
    default:
        return 0;
    }
}
77cdf0e10cSrcweir 
/* Tell whether nUtf32 is a high-surrogate, as of Unicode 3.1.1.
 *
 * Returns 1 for values in 0xD800..0xDBFF, 0 otherwise.
 */
int ImplIsHighSurrogate(sal_uInt32 nUtf32)
{
    /* 0xD800..0xDBFF is exactly the set of values whose bits above the
       low ten equal 0xD800 >> 10 (0x36). */
    return (nUtf32 >> 10) == 0x36;
}
83cdf0e10cSrcweir 
/* Tell whether nUtf32 is a low-surrogate, as of Unicode 3.1.1.
 *
 * Returns 1 for values in 0xDC00..0xDFFF, 0 otherwise.
 */
int ImplIsLowSurrogate(sal_uInt32 nUtf32)
{
    /* 0xDC00..0xDFFF is exactly the set of values whose bits above the
       low ten equal 0xDC00 >> 10 (0x37). */
    return (nUtf32 >> 10) == 0x37;
}
89cdf0e10cSrcweir 
/* Tell whether nUtf32 is one of the code points of
 * <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
 * that have a General Category of Co (Other, Private Use).
 *
 * Returns 1 for 0xE000..0xF8FF, 0xF0000..0xFFFFD and 0x100000..0x10FFFD;
 * returns 0 otherwise.
 */
int ImplIsPrivateUse(sal_uInt32 nUtf32)
{
    /* Walk the three ranges in ascending order, leaving as soon as the
       answer is known. */
    if (nUtf32 < 0xE000)
    {
        return 0;
    }
    if (nUtf32 <= 0xF8FF)
    {
        return 1;
    }
    if (nUtf32 < 0xF0000)
    {
        return 0;
    }
    if (nUtf32 <= 0xFFFFD)
    {
        return 1;
    }
    return nUtf32 >= 0x100000 && nUtf32 <= 0x10FFFD;
}
100cdf0e10cSrcweir 
/* Tell whether nUtf32 is one of the code points of
 * <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
 * that have "ZERO WIDTH" in their Character name.
 *
 * Returns 1 if so, 0 otherwise.
 */
int ImplIsZeroWidth(sal_uInt32 nUtf32)
{
    switch (nUtf32)
    {
    case 0x200B: /* ZERO WIDTH SPACE */
    case 0x200C: /* ZERO WIDTH NON-JOINER */
    case 0x200D: /* ZERO WIDTH JOINER */
    case 0xFEFF: /* ZERO WIDTH NO-BREAK SPACE */
        return 1;
    default:
        return 0;
    }
}
112cdf0e10cSrcweir 
/* Compute the high-surrogate of the surrogate pair that represents nUtf32.
 *
 * nUtf32 is expected to lie in 0x10000..0x10FFFF; this precondition is
 * checked through OSL_ENSURE.  Both bounds matter: above 0x10FFFF the
 * shifted offset no longer fits in ten bits, so the value returned would
 * fall outside 0xD800..0xDBFF (0x110000, for instance, would produce
 * 0xDC00).
 *
 * Returns ((nUtf32 - 0x10000) >> 10) | 0xD800.
 */
sal_uInt32 ImplGetHighSurrogate(sal_uInt32 nUtf32)
{
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) >> 10) | 0xD800;
}
118cdf0e10cSrcweir 
/* Compute the low-surrogate of the surrogate pair that represents nUtf32.
 *
 * nUtf32 is expected to lie in 0x10000..0x10FFFF; this precondition is
 * checked through OSL_ENSURE.  The upper bound is checked as well because
 * only the low ten bits of the offset are kept here, so any excess above
 * 0x10FFFF would otherwise be dropped without notice.
 *
 * Returns ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00.
 */
sal_uInt32 ImplGetLowSurrogate(sal_uInt32 nUtf32)
{
    OSL_ENSURE(nUtf32 >= 0x10000 && nUtf32 <= 0x10FFFF,
               "specification violation");
    return ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00;
}
124cdf0e10cSrcweir 
/* Combine a surrogate pair into the single value it represents.
 *
 * nHigh is expected to satisfy ImplIsHighSurrogate and nLow to satisfy
 * ImplIsLowSurrogate; this is checked through OSL_ENSURE.
 *
 * Returns the low ten bits of nHigh placed above the low ten bits of nLow,
 * plus 0x10000.
 */
sal_uInt32 ImplCombineSurrogates(sal_uInt32 nHigh, sal_uInt32 nLow)
{
    sal_uInt32 nUpperBits;
    sal_uInt32 nLowerBits;

    OSL_ENSURE(ImplIsHighSurrogate(nHigh) && ImplIsLowSurrogate(nLow),
               "specification violation");

    nUpperBits = (nHigh & 0x3FF) << 10;
    nLowerBits = nLow & 0x3FF;
    return (nUpperBits | nLowerBits) + 0x10000;
}
131