summaryrefslogtreecommitdiffstats
path: root/include/utils/AndroidUnicode.h
blob: 563fcd0197fecb102d5683c38c98cea15ba7173e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/*
 * Copyright (C) 2006 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//

#ifndef ANDROID_UNICODE_H
#define ANDROID_UNICODE_H

#include <stdint.h>
#include <sys/types.h>

#define REPLACEMENT_CHAR (0xFFFD)

// this part of code is copied from umachine.h under ICU
/**
 * Define UChar32 as a type for single Unicode code points.
 * UChar32 is a signed 32-bit integer (same as int32_t).
 *
 * The Unicode code point range is 0..0x10ffff.
 * All other values (negative or >=0x110000) are illegal as Unicode code points.
 * They may be used as sentinel values to indicate "done", "error"
 * or similar non-code point conditions.
 *
 * @stable ICU 2.4
 */
typedef int32_t UChar32;

namespace android {

    class Encoding;
    /**
     * \class Unicode
     *
     * Helper class for getting properties of Unicode characters. Characters
     * can have one of the types listed in CharType and each character can have the
     * directionality of Direction.
     */
    class Unicode
    {
    public:
        /**
         * Directions specified in the Unicode standard. These directions map directly
         * to java.lang.Character.
         */
        enum Direction {
            DIRECTIONALITY_UNDEFINED = -1,
            DIRECTIONALITY_LEFT_TO_RIGHT,
            DIRECTIONALITY_RIGHT_TO_LEFT,
            DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
            DIRECTIONALITY_EUROPEAN_NUMBER,
            DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
            DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
            DIRECTIONALITY_ARABIC_NUMBER,
            DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
            DIRECTIONALITY_NONSPACING_MARK,
            DIRECTIONALITY_BOUNDARY_NEUTRAL,
            DIRECTIONALITY_PARAGRAPH_SEPARATOR,
            DIRECTIONALITY_SEGMENT_SEPARATOR,
            DIRECTIONALITY_WHITESPACE,
            DIRECTIONALITY_OTHER_NEUTRALS,
            DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
            DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
            DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
            DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
            DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
        };

        /**
         * Character types as specified in the Unicode standard. These map directly to
         * java.lang.Character.
         */
        enum CharType {
            CHARTYPE_UNASSIGNED = 0,
            CHARTYPE_UPPERCASE_LETTER,
            CHARTYPE_LOWERCASE_LETTER,
            CHARTYPE_TITLECASE_LETTER,
            CHARTYPE_MODIFIER_LETTER,
            CHARTYPE_OTHER_LETTER,
            CHARTYPE_NON_SPACING_MARK,
            CHARTYPE_ENCLOSING_MARK,
            CHARTYPE_COMBINING_SPACING_MARK,
            CHARTYPE_DECIMAL_DIGIT_NUMBER,
            CHARTYPE_LETTER_NUMBER,
            CHARTYPE_OTHER_NUMBER,
            CHARTYPE_SPACE_SEPARATOR,
            CHARTYPE_LINE_SEPARATOR,
            CHARTYPE_PARAGRAPH_SEPARATOR,
            CHARTYPE_CONTROL,
            CHARTYPE_FORMAT,
            CHARTYPE_MISSING_VALUE_FOR_JAVA,    /* This is the mysterious missing 17 value from the java constants */
            CHARTYPE_PRIVATE_USE,
            CHARTYPE_SURROGATE,
            CHARTYPE_DASH_PUNCTUATION,
            CHARTYPE_START_PUNCTUATION,
            CHARTYPE_END_PUNCTUATION,
            CHARTYPE_CONNECTOR_PUNCTUATION,
            CHARTYPE_OTHER_PUNCTUATION,
            CHARTYPE_MATH_SYMBOL,
            CHARTYPE_CURRENCY_SYMBOL,
            CHARTYPE_MODIFIER_SYMBOL,
            CHARTYPE_OTHER_SYMBOL,
            CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
            CHARTYPE_FINAL_QUOTE_PUNCTUATION
        };

        /**
         * Decomposition types as described by the unicode standard. These values map to
         * the same values in uchar.h in ICU.
         */
        enum DecompositionType {
            DECOMPOSITION_NONE = 0,
            DECOMPOSITION_CANONICAL,
            DECOMPOSITION_COMPAT,
            DECOMPOSITION_CIRCLE,
            DECOMPOSITION_FINAL,
            DECOMPOSITION_FONT,
            DECOMPOSITION_FRACTION,
            DECOMPOSITION_INITIAL,
            DECOMPOSITION_ISOLATED,
            DECOMPOSITION_MEDIAL,
            DECOMPOSITION_NARROW,
            DECOMPOSITION_NOBREAK,
            DECOMPOSITION_SMALL,
            DECOMPOSITION_SQUARE,
            DECOMPOSITION_SUB,
            DECOMPOSITION_SUPER,
            DECOMPOSITION_VERTICAL,
            DECOMPOSITION_WIDE
        };

        /**
         * Returns the packed data for java calls
         * @param c The unicode character.
         * @return The packed data for the character.
         *
         * Copied from java.lang.Character implementation:
         * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
         * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
         * 
         *                              31 types                 ---------
         *                   18 directionalities       ---------
         *                   2 mirroreds             -
         *                               -----------      56  toupper diffs
         *                   -----------                  48  tolower diffs
         *               ---                              4 totitlecase diffs
         * -------------                                 84 numeric values
         *     ---------                                 24 mirror char diffs
         */
        static uint32_t getPackedData(UChar32 c);
        
        /**
         * Get the Character type.
         * @param c The unicode character.
         * @return The character's type or CHARTYPE_UNASSIGNED if the character is invalid
         *         or has an unassigned class.
         */
        static CharType getType(UChar32 c);    

        /**
         * Get the Character's decomposition type.
         * @param c The unicode character.
         * @return The character's decomposition type or DECOMPOSITION_NONE is there 
         *         is no decomposition.
         */
        static DecompositionType getDecompositionType(UChar32 c);
        
        /**
         * Returns the digit value of a character or -1 if the character
         * is not within the specified radix.
         *
         * The digit value is computed for integer characters and letters
         * within the given radix. This function does not handle Roman Numerals,
         * fractions, or any other characters that may represent numbers.
         * 
         * @param c The unicode character
         * @param radix The intended radix.
         * @return The digit value or -1 if there is no digit value or if the value is outside the radix.
         */
        static int getDigitValue(UChar32 c, int radix = 10);

        /**
         * Return the numeric value of a character
         *
         * @param c The unicode character.
         * @return The numeric value of the character. -1 if the character has no numeric value, 
         *         -2 if the character has a numeric value that is not representable by an integer.
         */
        static int getNumericValue(UChar32 c);

        /**
         * Convert the character to lowercase
         * @param c The unicode character.
         * @return The lowercase character equivalent of c. If c does not have a lowercase equivalent,
         *         the original character is returned.
         */
        static UChar32 toLower(UChar32 c);
            
        /**
         * Convert the character to uppercase
         * @param c The unicode character.
         * @return The uppercase character equivalent of c. If c does not have an uppercase equivalent,
         *         the original character is returned.
         */
        static UChar32 toUpper(UChar32 c);
    
        /**
         * Get the directionality of the character.
         * @param c The unicode character.
         * @return The direction of the character or DIRECTIONALITY_UNDEFINED.
         */
        static Direction getDirectionality(UChar32 c);
            
        /**
         * Check if the character is a mirrored character. This means that the character
         * has an equivalent character that is the mirror image of itself.
         * @param c The unicode character.
         * @return True iff c has a mirror equivalent.
         */
        static bool isMirrored(UChar32 c);
         
        /**
         * Return the mirror of the given character.
         * @param c The unicode character.
         * @return The mirror equivalent of c. If c does not have a mirror equivalent,
         *         the original character is returned.
         * @see isMirrored
         */
        static UChar32 toMirror(UChar32 c);
        
        /**
         * Convert the character to title case.
         * @param c The unicode character.
         * @return The titlecase equivalent of c. If c does not have a titlecase equivalent,
         *         the original character is returned.
         */
        static UChar32 toTitle(UChar32 c);

   };

}

#endif