diff options
-rw-r--r-- | luni/src/main/java/java/lang/Character.java | 431 | ||||
-rw-r--r-- | luni/src/main/native/java_lang_Character.cpp | 30 | ||||
-rw-r--r-- | luni/src/test/java/libcore/java/lang/CharacterTest.java | 62 |
3 files changed, 515 insertions, 8 deletions
diff --git a/luni/src/main/java/java/lang/Character.java b/luni/src/main/java/java/lang/Character.java
index 5762bd4..32c38d3 100644
--- a/luni/src/main/java/java/lang/Character.java
+++ b/luni/src/main/java/java/lang/Character.java
@@ -1489,7 +1489,7 @@ public final class Character implements Serializable, Comparable<Character> {
             if (blockName == null) {
                 throw new NullPointerException("blockName == null");
             }
-            int block = forNameImpl(blockName);
+            int block = unicodeBlockForName(blockName);
             if (block == -1) {
                 throw new IllegalArgumentException("Unknown block: " + blockName);
             }
@@ -1510,7 +1510,7 @@ public final class Character implements Serializable, Comparable<Character> {
          */
         public static UnicodeBlock of(int codePoint) {
             checkValidCodePoint(codePoint);
-            int block = ofImpl(codePoint);
+            int block = unicodeBlockForCodePoint(codePoint);
             if (block == -1 || block >= BLOCKS.length) {
                 return null;
             }
@@ -1522,9 +1522,432 @@ public final class Character implements Serializable, Comparable<Character> {
         }
     }
 
-    private static native int forNameImpl(String blockName);
+    private static native int unicodeBlockForName(String blockName);
+
+    private static native int unicodeBlockForCodePoint(int codePoint);
+
+    /**
+     * Represents a <a href="http://www.unicode.org/reports/tr24/">Unicode script</a>.
+     * Every Unicode code point is contained by a single {@code UnicodeScript}. Code points
+     * shared between scripts will be in {@code COMMON}. Code points for combining
+     * characters that can be applied to multiple scripts will be in {@code INHERITED}
+     * because they inherit the script of their base character. Code points whose scripts
+     * don't have a corresponding {@code UnicodeScript} will be in {@code UNKNOWN}.
+     *
+     * @since 1.7
+     * @hide
+     */
+    public static enum UnicodeScript {
+        /** ISO 15924 English name "Arabic" */
+        ARABIC,
+        /** ISO 15924 English name "Armenian" */
+        ARMENIAN,
+        /** ISO 15924 English name "Avestan" */
+        AVESTAN,
+        /** ISO 15924 English name "Balinese" */
+        BALINESE,
+        /** ISO 15924 English name "Bamum" */
+        BAMUM,
+        /** ISO 15924 English name "Batak" */
+        BATAK,
+        /** ISO 15924 English name "Bengali" */
+        BENGALI,
+        /** ISO 15924 English name "Bopomofo" */
+        BOPOMOFO,
+        /** ISO 15924 English name "Brahmi" */
+        BRAHMI,
+        /** ISO 15924 English name "Braille" */
+        BRAILLE,
+        /** ISO 15924 English name "Buginese" */
+        BUGINESE,
+        /** ISO 15924 English name "Buhid" */
+        BUHID,
+        /** ISO 15924 English name "Unified Canadian Aboriginal Syllabics" */
+        CANADIAN_ABORIGINAL,
+        /** ISO 15924 English name "Carian" */
+        CARIAN,
+        /** ISO 15924 English name "Cham" */
+        CHAM,
+        /** ISO 15924 English name "Cherokee" */
+        CHEROKEE,
+        /** ISO 15924 English name "Common" */
+        COMMON,
+        /** ISO 15924 English name "Coptic" */
+        COPTIC,
+        /** ISO 15924 English name "Cuneiform" */
+        CUNEIFORM,
+        /** ISO 15924 English name "Cypriot" */
+        CYPRIOT,
+        /** ISO 15924 English name "Cyrillic" */
+        CYRILLIC,
+        /** ISO 15924 English name "Deseret" */
+        DESERET,
+        /** ISO 15924 English name "Devanagari" */
+        DEVANAGARI,
+        /** ISO 15924 English name "Egyptian hieroglyphs" */
+        EGYPTIAN_HIEROGLYPHS,
+        /** ISO 15924 English name "Ethiopic" */
+        ETHIOPIC,
+        /** ISO 15924 English name "Georgian" */
+        GEORGIAN,
+        /** ISO 15924 English name "Glagolitic" */
+        GLAGOLITIC,
+        /** ISO 15924 English name "Gothic" */
+        GOTHIC,
+        /** ISO 15924 English name "Greek" */
+        GREEK,
+        /** ISO 15924 English name "Gujarati" */
+        GUJARATI,
+        /** ISO 15924 English name "Gurmukhi" */
+        GURMUKHI,
+        /** ISO 15924 English name "Han" */
+        HAN,
+        /** ISO 15924 English name "Hangul" */
+        HANGUL,
+        /** ISO 15924 English name "Hanunoo" */
+        HANUNOO,
+        /** ISO 15924 English name "Hebrew" */
+        HEBREW,
+        /** ISO 15924 English name "Hiragana" */
+        HIRAGANA,
+        /** ISO 15924 English name "Imperial aramaic" */
+        IMPERIAL_ARAMAIC,
+        /** ISO 15924 English name "Inherited" */
+        INHERITED,
+        /** ISO 15924 English name "Inscriptional pahlavi" */
+        INSCRIPTIONAL_PAHLAVI,
+        /** ISO 15924 English name "Inscriptional parthian" */
+        INSCRIPTIONAL_PARTHIAN,
+        /** ISO 15924 English name "Javanese" */
+        JAVANESE,
+        /** ISO 15924 English name "Kaithi" */
+        KAITHI,
+        /** ISO 15924 English name "Kannada" */
+        KANNADA,
+        /** ISO 15924 English name "Katakana" */
+        KATAKANA,
+        /** ISO 15924 English name "Kayah li" */
+        KAYAH_LI,
+        /** ISO 15924 English name "Kharoshthi" */
+        KHAROSHTHI,
+        /** ISO 15924 English name "Khmer" */
+        KHMER,
+        /** ISO 15924 English name "Lao" */
+        LAO,
+        /** ISO 15924 English name "Latin" */
+        LATIN,
+        /** ISO 15924 English name "Lepcha" */
+        LEPCHA,
+        /** ISO 15924 English name "Limbu" */
+        LIMBU,
+        /** ISO 15924 English name "Linear B" */
+        LINEAR_B,
+        /** ISO 15924 English name "Lisu" */
+        LISU,
+        /** ISO 15924 English name "Lycian" */
+        LYCIAN,
+        /** ISO 15924 English name "Lydian" */
+        LYDIAN,
+        /** ISO 15924 English name "Malayalam" */
+        MALAYALAM,
+        /** ISO 15924 English name "Mandaic" */
+        MANDAIC,
+        /** ISO 15924 English name "Meetei Mayek (Meithei, Meetei)" */
+        MEETEI_MAYEK,
+        /** ISO 15924 English name "Mongolian" */
+        MONGOLIAN,
+        /** ISO 15924 English name "Myanmar" */
+        MYANMAR,
+        /** ISO 15924 English name "New Tai Lue" */
+        NEW_TAI_LUE,
+        /** ISO 15924 English name "Nko" */
+        NKO,
+        /** ISO 15924 English name "Ogham" */
+        OGHAM,
+        /** ISO 15924 English name "Ol Chiki" */
+        OL_CHIKI,
+        /** ISO 15924 English name "Old Italic" */
+        OLD_ITALIC,
+        /** ISO 15924 English name "Old Persian" */
+        OLD_PERSIAN,
+        /** ISO 15924 English name "Old South Arabian" */
+        OLD_SOUTH_ARABIAN,
+        /** ISO 15924 English name "Old Turkic, Orkhon Runic" */
+        OLD_TURKIC,
+        /** ISO 15924 English name "Oriya" */
+        ORIYA,
+        /** ISO 15924 English name "Osmanya" */
+        OSMANYA,
+        /** ISO 15924 English name "Phags-pa" */
+        PHAGS_PA,
+        /** ISO 15924 English name "Phoenician" */
+        PHOENICIAN,
+        /** ISO 15924 English name "Rejang" */
+        REJANG,
+        /** ISO 15924 English name "Runic" */
+        RUNIC,
+        /** ISO 15924 English name "Samaritan" */
+        SAMARITAN,
+        /** ISO 15924 English name "Saurashtra" */
+        SAURASHTRA,
+        /** ISO 15924 English name "Shavian" */
+        SHAVIAN,
+        /** ISO 15924 English name "Sinhala" */
+        SINHALA,
+        /** ISO 15924 English name "Sundanese" */
+        SUNDANESE,
+        /** ISO 15924 English name "Syloti Nagri" */
+        SYLOTI_NAGRI,
+        /** ISO 15924 English name "Syriac" */
+        SYRIAC,
+        /** ISO 15924 English name "Tagalog" */
+        TAGALOG,
+        /** ISO 15924 English name "Tagbanwa" */
+        TAGBANWA,
+        /** ISO 15924 English name "Tai Le" */
+        TAI_LE,
+        /** ISO 15924 English name "Tai Tham (Lanna)" */
+        TAI_THAM,
+        /** ISO 15924 English name "Tai Viet" */
+        TAI_VIET,
+        /** ISO 15924 English name "Tamil" */
+        TAMIL,
+        /** ISO 15924 English name "Telugu" */
+        TELUGU,
+        /** ISO 15924 English name "Thaana" */
+        THAANA,
+        /** ISO 15924 English name "Thai" */
+        THAI,
+        /** ISO 15924 English name "Tibetan" */
+        TIBETAN,
+        /** ISO 15924 English name "Tifinagh" */
+        TIFINAGH,
+        /** ISO 15924 English name "Ugaritic" */
+        UGARITIC,
+        /** ISO 15924 English name "Unknown" */
+        UNKNOWN,
+        /** ISO 15924 English name "Vai" */
+        VAI,
+        /** ISO 15924 English name "Yi" */
+        YI;
+
+        private static final UnicodeScript[] SCRIPTS = {
+            COMMON,
+            INHERITED,
+            ARABIC,
+            ARMENIAN,
+            BENGALI,
+            BOPOMOFO,
+            CHEROKEE,
+            COPTIC,
+            CYRILLIC,
+            DESERET,
+            DEVANAGARI,
+            ETHIOPIC,
+            GEORGIAN,
+            GOTHIC,
+            GREEK,
+            GUJARATI,
+            GURMUKHI,
+            HAN,
+            HANGUL,
+            HEBREW,
+            HIRAGANA,
+            KANNADA,
+            KATAKANA,
+            KHMER,
+            LAO,
+            LATIN,
+            MALAYALAM,
+            MONGOLIAN,
+            MYANMAR,
+            OGHAM,
+            OLD_ITALIC,
+            ORIYA,
+            RUNIC,
+            SINHALA,
+            SYRIAC,
+            TAMIL,
+            TELUGU,
+            THAANA,
+            THAI,
+            TIBETAN,
+            CANADIAN_ABORIGINAL,
+            YI,
+            TAGALOG,
+            HANUNOO,
+            BUHID,
+            TAGBANWA,
+            BRAILLE,
+            CYPRIOT,
+            LIMBU,
+            LINEAR_B,
+            OSMANYA,
+            SHAVIAN,
+            TAI_LE,
+            UGARITIC,
+            null, // USCRIPT_KATAKANA_OR_HIRAGANA
+            BUGINESE,
+            GLAGOLITIC,
+            KHAROSHTHI,
+            SYLOTI_NAGRI,
+            NEW_TAI_LUE,
+            TIFINAGH,
+            OLD_PERSIAN,
+            BALINESE,
+            BATAK,
+            null, // USCRIPT_BLISSYMBOLS,
+            BRAHMI,
+            CHAM,
+            null, // USCRIPT_CIRTH,
+            null, // USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC,
+            null, // USCRIPT_DEMOTIC_EGYPTIAN,
+            null, // USCRIPT_HIERATIC_EGYPTIAN,
+            EGYPTIAN_HIEROGLYPHS,
+            null, // USCRIPT_KHUTSURI,
+            null, // USCRIPT_SIMPLIFIED_HAN,
+            null, // USCRIPT_TRADITIONAL_HAN,
+            null, // USCRIPT_PAHAWH_HMONG,
+            null, // USCRIPT_OLD_HUNGARIAN,
+            null, // USCRIPT_HARAPPAN_INDUS,
+            JAVANESE,
+            KAYAH_LI,
+            null, // USCRIPT_LATIN_FRAKTUR,
+            null, // USCRIPT_LATIN_GAELIC,
+            LEPCHA,
+            null, // USCRIPT_LINEAR_A,
+            MANDAIC, // == MANDAEAN
+            null, // USCRIPT_MAYAN_HIEROGLYPHS,
+            null, // USCRIPT_MEROITIC_HIEROGLYPHS == USCRIPT_MEROITIC
+            NKO, // USCRIPT_NKO; the slot must hold NKO or the enum constant is unreachable
+            OLD_TURKIC, // USCRIPT_ORKHON == OLD_TURKIC,
+            null, // USCRIPT_OLD_PERMIC,
+            PHAGS_PA,
+            PHOENICIAN,
+            null, // USCRIPT_PHONETIC_POLLARD === MIAO,
+            null, // USCRIPT_RONGORONGO,
+            null, // USCRIPT_SARATI,
+            null, // USCRIPT_ESTRANGELO_SYRIAC,
+            null, // USCRIPT_WESTERN_SYRIAC,
+            null, // USCRIPT_EASTERN_SYRIAC,
+            null, // USCRIPT_TENGWAR,
+            VAI,
+            null, // USCRIPT_VISIBLE_SPEECH,
+            CUNEIFORM,
+            null, // USCRIPT_UNWRITTEN_LANGUAGES,
+            UNKNOWN,
+            CARIAN,
+            null, // USCRIPT_JAPANESE,
+            TAI_THAM, // USCRIPT_LANNA (aka TAI_THAM),
+            LYCIAN,
+            LYDIAN,
+            OL_CHIKI,
+            REJANG,
+            SAURASHTRA,
+            null, // USCRIPT_SIGN_WRITING,
+            SUNDANESE,
+            null, // USCRIPT_MOON,
+            MEETEI_MAYEK, // USCRIPT_MEITEI_MAYEK (aka MEETEI, MEITHEI),
+            IMPERIAL_ARAMAIC,
+            AVESTAN,
+            null, // USCRIPT_CHAKMA,
+            null, // USCRIPT_KOREAN,
+            KAITHI,
+            null, // USCRIPT_MANICHAEAN,
+            INSCRIPTIONAL_PAHLAVI,
+            null, // USCRIPT_PSALTER_PAHLAVI,
+            null, // USCRIPT_BOOK_PAHLAVI,
+            INSCRIPTIONAL_PARTHIAN,
+            SAMARITAN,
+            TAI_VIET,
+            null, // USCRIPT_MATHEMATICAL_NOTATION,
+            null, // USCRIPT_SYMBOLS,
+            BAMUM,
+            LISU,
+            null, // USCRIPT_NAKHI_GEBA,
+            OLD_SOUTH_ARABIAN,
+            null, // USCRIPT_BASSA_VAH,
+            null, // USCRIPT_DUPLOYAN_SHORTAND,
+            null, // USCRIPT_ELBASAN,
+            null, // USCRIPT_GRANTHA,
+            null, // USCRIPT_KPELLE,
+            null, // USCRIPT_LOMA,
+            null, // USCRIPT_MENDE,
+            null, // USCRIPT_MEROITIC_CURSIVE,
+            null, // USCRIPT_OLD_NORTH_ARABIAN,
+            null, // USCRIPT_NABATAEAN,
+            null, // USCRIPT_PALMYRENE,
+            null, // USCRIPT_SINDHI,
+            null, // USCRIPT_WARANG_CITI,
+            null, // USCRIPT_AFAKA,
+            null, // USCRIPT_JURCHEN,
+            null, // USCRIPT_MRO,
+            null, // USCRIPT_NUSHU,
+            null, // USCRIPT_SHARADA,
+            null, // USCRIPT_SORA_SOMPENG,
+            null, // USCRIPT_TAKRI,
+            null, // USCRIPT_TANGUT,
+            null, // USCRIPT_WOLEAI,
+            null, // USCRIPT_ANATOLIAN_HIEROGLYPHS,
+            null, // USCRIPT_KHOJKI,
+            null, // USCRIPT_TIRHUTA,
+        };
+
+        /**
+         * Returns the {@link UnicodeScript} value identified by {@code scriptName}.
+         * {@code scriptName} can be an ISO-15924 English script name
+         * or an alias (ISO-15924 script code) for that name.
+         * See <a href="http://www.unicode.org/iso15924/iso15924-codes.html">ISO 15924 codes</a>.
+         * Lookups are case insensitive.
+         *
+         * @throws NullPointerException if {@code scriptName} is null.
+         * @throws IllegalArgumentException if {@code scriptName} is invalid.
+         *
+         * @since 1.7
+         */
+        public static UnicodeScript forName(String scriptName) {
+            if (scriptName == null) {
+                throw new NullPointerException("scriptName == null");
+            }
+
+            final int script = unicodeScriptForName(scriptName);
+            if (script < 0 || script >= SCRIPTS.length ||
+                    SCRIPTS[script] == null) {
+                throw new IllegalArgumentException("Unknown script: " + scriptName);
+            }
+
+            return SCRIPTS[script];
+        }
+
+        /**
+         * Returns the {@link UnicodeScript} value that the given Unicode code
+         * point is assigned to.
+         *
+         * @throws IllegalArgumentException if {@code codePoint} is not a valid Unicode code point.
+         */
+        public static UnicodeScript of(int codePoint) {
+            checkValidCodePoint(codePoint);
+            int script = unicodeScriptForCodePoint(codePoint);
+            if (script < 0 || script >= SCRIPTS.length) {
+                // This signifies an ICU error. Complain loudly instead of swallowing
+                // the error up.
+                throw new IllegalArgumentException("Invalid codePoint: " + codePoint);
+            }
+
+            // This happens when ICU maps the code point to a script known to ICU but
+            // not the Java API.
+            if (SCRIPTS[script] == null) {
+                return UNKNOWN;
+            }
+
+            return SCRIPTS[script];
+        }
+    }
+
+    private static native int unicodeScriptForName(String scriptName);
+
+    private static native int unicodeScriptForCodePoint(int codePoint);
 
-    private static native int ofImpl(int codePoint);
 
     /**
      * Constructs a new {@code Character} with the specified primitive char
diff --git a/luni/src/main/native/java_lang_Character.cpp b/luni/src/main/native/java_lang_Character.cpp
index 14eef64..2d1fcfc 100644
--- a/luni/src/main/native/java_lang_Character.cpp
+++ b/luni/src/main/native/java_lang_Character.cpp
@@ -20,6 +20,7 @@
 #include "JniConstants.h"
 #include "ScopedUtfChars.h"
 #include "unicode/uchar.h"
+#include "unicode/uscript.h"
 #include <math.h>
 #include <stdio.h> // For BUFSIZ
 #include <stdlib.h>
@@ -124,7 +125,7 @@ static jboolean Character_isLowerCaseImpl(JNIEnv*, jclass, jint codePoint) {
     return u_islower(codePoint);
 }
 
-static int Character_forNameImpl(JNIEnv* env, jclass, jstring javaBlockName) {
+static int Character_unicodeBlockForName(JNIEnv* env, jclass, jstring javaBlockName) {
     ScopedUtfChars blockName(env, javaBlockName);
     if (blockName.c_str() == NULL) {
         return 0;
@@ -132,10 +133,29 @@ static int Character_forNameImpl(JNIEnv* env, jclass, jstring javaBlockName) {
     return u_getPropertyValueEnum(UCHAR_BLOCK, blockName.c_str());
 }
 
-static int Character_ofImpl(JNIEnv*, jclass, jint codePoint) {
+static int Character_unicodeBlockForCodePoint(JNIEnv*, jclass, jint codePoint) {
     return ublock_getCode(codePoint);
 }
 
+static int Character_unicodeScriptForName(JNIEnv* env, jclass, jstring javaScriptName) {
+    ScopedUtfChars scriptName(env, javaScriptName);
+    if (scriptName.c_str() == NULL) {
+        return -1;
+    }
+
+    return u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName.c_str());
+}
+
+static int Character_unicodeScriptForCodePoint(JNIEnv*, jclass, jint codePoint) {
+    UErrorCode status = U_ZERO_ERROR;
+    const UScriptCode script = uscript_getScript(codePoint, &status);
+    if (U_FAILURE(status)) {  // ICU warning codes are not failures; only real errors map to -1.
+        return -1;
+    }
+
+    return script;
+}
+
 static jboolean Character_isAlphabetic(JNIEnv*, jclass, jint codePoint) {
     return u_hasBinaryProperty(codePoint, UCHAR_ALPHABETIC);
 }
@@ -146,7 +166,6 @@ static jboolean Character_isIdeographic(JNIEnv*, jclass, jint codePoint) {
 
 static JNINativeMethod gMethods[] = {
     NATIVE_METHOD(Character, digitImpl, "!(II)I"),
-    NATIVE_METHOD(Character, forNameImpl, "(Ljava/lang/String;)I"),
     NATIVE_METHOD(Character, getDirectionalityImpl, "!(I)B"),
     NATIVE_METHOD(Character, getNameImpl, "(I)Ljava/lang/String;"),
     NATIVE_METHOD(Character, getNumericValueImpl, "!(I)I"),
@@ -166,10 +185,13 @@ static JNINativeMethod gMethods[] = {
     NATIVE_METHOD(Character, isUnicodeIdentifierStartImpl, "!(I)Z"),
     NATIVE_METHOD(Character, isUpperCaseImpl, "!(I)Z"),
     NATIVE_METHOD(Character, isWhitespaceImpl, "!(I)Z"),
-    NATIVE_METHOD(Character, ofImpl, "!(I)I"),
     NATIVE_METHOD(Character, toLowerCaseImpl, "!(I)I"),
     NATIVE_METHOD(Character, toTitleCaseImpl, "!(I)I"),
     NATIVE_METHOD(Character, toUpperCaseImpl, "!(I)I"),
+    NATIVE_METHOD(Character, unicodeBlockForName, "(Ljava/lang/String;)I"),
+    NATIVE_METHOD(Character, unicodeBlockForCodePoint, "!(I)I"),
+    NATIVE_METHOD(Character, unicodeScriptForName, "(Ljava/lang/String;)I"),
+    NATIVE_METHOD(Character, unicodeScriptForCodePoint, "!(I)I"),
 };
 void register_java_lang_Character(JNIEnv* env) {
     jniRegisterNativeMethods(env, "java/lang/Character", gMethods, NELEM(gMethods));
diff --git a/luni/src/test/java/libcore/java/lang/CharacterTest.java b/luni/src/test/java/libcore/java/lang/CharacterTest.java
index 48284d6..f0c5a23 100644
--- a/luni/src/test/java/libcore/java/lang/CharacterTest.java
+++ b/luni/src/test/java/libcore/java/lang/CharacterTest.java
@@ -263,4 +263,66 @@ public class CharacterTest extends junit.framework.TestCase {
             assertEquals(m.invoke(null, i), Character.isWhitespace(i));
         }
     }
+
+    public void test_UnicodeScript_forName() throws Exception {
+        try {
+            Character.UnicodeScript.forName(null);
+            fail();
+        } catch (NullPointerException expected) {
+        }
+
+        try {
+            Character.UnicodeScript.forName("existential_dilemmas");
+            fail();
+        } catch (IllegalArgumentException expected) {
+        }
+
+        // Note that ICU is pretty lenient about script names and their abbreviations.
+        assertSame(Character.UnicodeScript.MALAYALAM, Character.UnicodeScript.forName("Malayalam"));
+        assertSame(Character.UnicodeScript.MALAYALAM, Character.UnicodeScript.forName("MalayaLAM"));
+        assertSame(Character.UnicodeScript.MALAYALAM, Character.UnicodeScript.forName("Mlym"));
+        assertSame(Character.UnicodeScript.MALAYALAM, Character.UnicodeScript.forName("MlYM"));
+
+        assertSame(Character.UnicodeScript.OLD_SOUTH_ARABIAN, Character.UnicodeScript.forName("Old_south_arabian"));
+
+        // NOTE: This test fails on the RI because they're much stricter in
+        // their matching. Strict enough that they fail on "Old south arabian", despite
+        // it being the official name AND the alias for this script.
+        assertSame(Character.UnicodeScript.OLD_SOUTH_ARABIAN, Character.UnicodeScript.forName("Old south arabian"));
+        assertSame(Character.UnicodeScript.OLD_SOUTH_ARABIAN, Character.UnicodeScript.forName("SARB"));
+
+        // A script that's recognized by ICU but not a part of the standard
+        // java script values.
+        try {
+            Character.UnicodeScript.forName("Old north arabian");
+            fail();
+        } catch (IllegalArgumentException expected) {
+        }
+    }
+
+    public void test_UnicodeScript_of() throws Exception {
+        try {
+            Character.UnicodeScript.of(-1);
+            fail();
+        } catch (IllegalArgumentException expected) {
+        }
+
+        try {
+            Character.UnicodeScript.of(0xffffff);
+            fail();
+        } catch (IllegalArgumentException expected) {
+        }
+
+        // The example from the ICU4C unit tests.
+        assertSame(Character.UnicodeScript.MALAYALAM, Character.UnicodeScript.of(0x0D02));
+
+        // Special cases:
+        //
+        // 0640 is the ARABIC_TATWEEL, used by both Mandaic & Syriac
+        assertSame(Character.UnicodeScript.COMMON, Character.UnicodeScript.of(0x0640));
+        // 0300 is the COMBINING GRAVE ACCENT, which should be INHERITED because it's
+        // a nonspacing mark.
+        assertSame(Character.UnicodeScript.INHERITED, Character.UnicodeScript.of(0x0300));
+        assertSame(Character.UnicodeScript.NKO, Character.UnicodeScript.of(0x07C0)); // NKO DIGIT ZERO
+    }
 }