summary refs log tree commit diff stats
path: root/include/utils/AndroidUnicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/utils/AndroidUnicode.h')
-rw-r--r-- include/utils/AndroidUnicode.h 255
1 files changed, 255 insertions, 0 deletions
diff --git a/include/utils/AndroidUnicode.h b/include/utils/AndroidUnicode.h
new file mode 100644
index 0000000..563fcd0
--- /dev/null
+++ b/include/utils/AndroidUnicode.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) 2006 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+
+#ifndef ANDROID_UNICODE_H
+#define ANDROID_UNICODE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#define REPLACEMENT_CHAR (0xFFFD) // Code point U+FFFD, the Unicode REPLACEMENT CHARACTER.
+
+// The UChar32 definition and its comment below are copied from umachine.h in ICU.
+/**
+ * Define UChar32 as a type for single Unicode code points.
+ * UChar32 is a signed 32-bit integer (same as int32_t).
+ *
+ * The Unicode code point range is 0..0x10ffff.
+ * All other values (negative or >=0x110000) are illegal as Unicode code points.
+ * They may be used as sentinel values to indicate "done", "error"
+ * or similar non-code point conditions.
+ *
+ * @stable ICU 2.4
+ */
+typedef int32_t UChar32;
+
+namespace android {
+
+ class Encoding; // Forward declaration; not referenced by any other declaration in this header.
+ /**
+ * \class Unicode
+ *
+ * Helper class exposing static queries for properties of Unicode characters.
+ * Each character has one of the types listed in CharType and one of the
+ * directionalities listed in Direction.
+ */
+ class Unicode
+ {
+ public:
+ /**
+ * Directions specified in the Unicode standard. These values map directly
+ * to the directionality constants of java.lang.Character.
+ */
+ enum Direction {
+ DIRECTIONALITY_UNDEFINED = -1,
+ DIRECTIONALITY_LEFT_TO_RIGHT,
+ DIRECTIONALITY_RIGHT_TO_LEFT,
+ DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
+ DIRECTIONALITY_EUROPEAN_NUMBER,
+ DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
+ DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
+ DIRECTIONALITY_ARABIC_NUMBER,
+ DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
+ DIRECTIONALITY_NONSPACING_MARK,
+ DIRECTIONALITY_BOUNDARY_NEUTRAL,
+ DIRECTIONALITY_PARAGRAPH_SEPARATOR,
+ DIRECTIONALITY_SEGMENT_SEPARATOR,
+ DIRECTIONALITY_WHITESPACE,
+ DIRECTIONALITY_OTHER_NEUTRALS,
+ DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
+ DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
+ DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
+ DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
+ DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
+ };
+
+ /**
+ * Character types as specified in the Unicode standard. These values map
+ * directly to the character type constants of java.lang.Character.
+ */
+ enum CharType {
+ CHARTYPE_UNASSIGNED = 0,
+ CHARTYPE_UPPERCASE_LETTER,
+ CHARTYPE_LOWERCASE_LETTER,
+ CHARTYPE_TITLECASE_LETTER,
+ CHARTYPE_MODIFIER_LETTER,
+ CHARTYPE_OTHER_LETTER,
+ CHARTYPE_NON_SPACING_MARK,
+ CHARTYPE_ENCLOSING_MARK,
+ CHARTYPE_COMBINING_SPACING_MARK,
+ CHARTYPE_DECIMAL_DIGIT_NUMBER,
+ CHARTYPE_LETTER_NUMBER,
+ CHARTYPE_OTHER_NUMBER,
+ CHARTYPE_SPACE_SEPARATOR,
+ CHARTYPE_LINE_SEPARATOR,
+ CHARTYPE_PARAGRAPH_SEPARATOR,
+ CHARTYPE_CONTROL,
+ CHARTYPE_FORMAT,
+ CHARTYPE_MISSING_VALUE_FOR_JAVA, /* Placeholder for 17, the value missing from the Java constants; keeps later values aligned */
+ CHARTYPE_PRIVATE_USE,
+ CHARTYPE_SURROGATE,
+ CHARTYPE_DASH_PUNCTUATION,
+ CHARTYPE_START_PUNCTUATION,
+ CHARTYPE_END_PUNCTUATION,
+ CHARTYPE_CONNECTOR_PUNCTUATION,
+ CHARTYPE_OTHER_PUNCTUATION,
+ CHARTYPE_MATH_SYMBOL,
+ CHARTYPE_CURRENCY_SYMBOL,
+ CHARTYPE_MODIFIER_SYMBOL,
+ CHARTYPE_OTHER_SYMBOL,
+ CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
+ CHARTYPE_FINAL_QUOTE_PUNCTUATION
+ };
+
+ /**
+ * Decomposition types as described by the Unicode standard. These values map to
+ * the same values in uchar.h in ICU.
+ */
+ enum DecompositionType {
+ DECOMPOSITION_NONE = 0,
+ DECOMPOSITION_CANONICAL,
+ DECOMPOSITION_COMPAT,
+ DECOMPOSITION_CIRCLE,
+ DECOMPOSITION_FINAL,
+ DECOMPOSITION_FONT,
+ DECOMPOSITION_FRACTION,
+ DECOMPOSITION_INITIAL,
+ DECOMPOSITION_ISOLATED,
+ DECOMPOSITION_MEDIAL,
+ DECOMPOSITION_NARROW,
+ DECOMPOSITION_NOBREAK,
+ DECOMPOSITION_SMALL,
+ DECOMPOSITION_SQUARE,
+ DECOMPOSITION_SUB,
+ DECOMPOSITION_SUPER,
+ DECOMPOSITION_VERTICAL,
+ DECOMPOSITION_WIDE
+ };
+
+ /**
+ * Returns the packed property data used by the Java calls.
+ * @param c The Unicode character.
+ * @return The packed data for the character.
+ *
+ * Bit layout copied from the java.lang.Character implementation (NOTE(review): column alignment looks lost here; confirm against the original):
+ * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
+ *
+ * 31 types ---------
+ * 18 directionalities ---------
+ * 2 mirroreds -
+ * ----------- 56 toupper diffs
+ * ----------- 48 tolower diffs
+ * --- 4 totitlecase diffs
+ * ------------- 84 numeric values
+ * --------- 24 mirror char diffs
+ */
+ static uint32_t getPackedData(UChar32 c);
+
+ /**
+ * Get the character's type.
+ * @param c The Unicode character.
+ * @return The character's type, or CHARTYPE_UNASSIGNED if the character is invalid
+ * or has an unassigned class.
+ */
+ static CharType getType(UChar32 c);
+
+ /**
+ * Get the character's decomposition type.
+ * @param c The Unicode character.
+ * @return The character's decomposition type, or DECOMPOSITION_NONE if there
+ * is no decomposition.
+ */
+ static DecompositionType getDecompositionType(UChar32 c);
+
+ /**
+ * Returns the digit value of a character, or -1 if the character
+ * is not within the specified radix.
+ *
+ * The digit value is computed for integer characters and letters
+ * within the given radix. This function does not handle Roman numerals,
+ * fractions, or any other characters that may represent numbers.
+ *
+ * @param c The Unicode character.
+ * @param radix The intended radix; defaults to 10 when omitted.
+ * @return The digit value, or -1 if there is no digit value or if the value is outside the radix.
+ */
+ static int getDigitValue(UChar32 c, int radix = 10);
+
+ /**
+ * Return the numeric value of a character.
+ *
+ * @param c The Unicode character.
+ * @return The numeric value of the character; -1 if the character has no numeric value,
+ * -2 if the character has a numeric value that is not representable by an integer.
+ */
+ static int getNumericValue(UChar32 c);
+
+ /**
+ * Convert the character to lowercase.
+ * @param c The Unicode character.
+ * @return The lowercase character equivalent of c. If c does not have a lowercase equivalent,
+ * the original character is returned.
+ */
+ static UChar32 toLower(UChar32 c);
+
+ /**
+ * Convert the character to uppercase.
+ * @param c The Unicode character.
+ * @return The uppercase character equivalent of c. If c does not have an uppercase equivalent,
+ * the original character is returned.
+ */
+ static UChar32 toUpper(UChar32 c);
+
+ /**
+ * Get the directionality of the character.
+ * @param c The Unicode character.
+ * @return The direction of the character, or DIRECTIONALITY_UNDEFINED.
+ */
+ static Direction getDirectionality(UChar32 c);
+
+ /**
+ * Check if the character is a mirrored character. This means that the character
+ * has an equivalent character that is the mirror image of itself.
+ * @param c The Unicode character.
+ * @return True iff c has a mirror equivalent.
+ */
+ static bool isMirrored(UChar32 c);
+
+ /**
+ * Return the mirror of the given character.
+ * @param c The Unicode character.
+ * @return The mirror equivalent of c. If c does not have a mirror equivalent,
+ * the original character is returned.
+ * @see isMirrored
+ */
+ static UChar32 toMirror(UChar32 c);
+
+ /**
+ * Convert the character to title case.
+ * @param c The Unicode character.
+ * @return The titlecase equivalent of c. If c does not have a titlecase equivalent,
+ * the original character is returned.
+ */
+ static UChar32 toTitle(UChar32 c);
+
+ };
+
+} // namespace android
+
+#endif