summaryrefslogtreecommitdiffstats
path: root/core/java/com
diff options
context:
space:
mode:
author Bai Tao <michaelbai@google.com> 2010-01-21 08:48:30 +0800
committer Bai Tao <michaelbai@google.com> 2010-01-23 12:57:21 +0800
commit 4256586663f0d045c69ea818db4893b3365b9915 (patch)
tree e14afc7537bac6b291e025cc6c2b48755adf3a04 /core/java/com
parent 52a014492c10d825ec26b2179bd8369bf78363ef (diff)
downloadframeworks_base-4256586663f0d045c69ea818db4893b3365b9915.zip
frameworks_base-4256586663f0d045c69ea818db4893b3365b9915.tar.gz
frameworks_base-4256586663f0d045c69ea818db4893b3365b9915.tar.bz2
Modify the interface of HanziToPinyin class to make it generic and add test class
Diffstat (limited to 'core/java/com')
-rw-r--r-- core/java/com/android/internal/util/HanziToPinyin.java 114
1 files changed, 77 insertions, 37 deletions
diff --git a/core/java/com/android/internal/util/HanziToPinyin.java b/core/java/com/android/internal/util/HanziToPinyin.java
index 4368e98..6a4adaa 100644
--- a/core/java/com/android/internal/util/HanziToPinyin.java
+++ b/core/java/com/android/internal/util/HanziToPinyin.java
@@ -16,8 +16,6 @@
package com.android.internal.util;
-import com.google.android.util.AbstractMessageParser.Token;
-
import android.text.TextUtils;
import android.util.Log;
@@ -298,8 +296,10 @@ public class HanziToPinyin {
};
/** First and last Chinese character with known Pinyin according to zh collation */
- private static final String FIRST_UNIHAN = "\u5416";
- private static final String LAST_UNIHAN = "\u5497";
+ private static final String FIRST_PINYIN_UNIHAN = "\u5416";
+ private static final String LAST_PINYIN_UNIHAN = "\u5497";
+ /** The first Chinese character in the Unicode CJK Unified Ideographs Extension A block (U+3400) */
+ private static final char FIRST_UNIHAN = '\u3400';
private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);
private static HanziToPinyin sInstance;
@@ -311,10 +311,18 @@ public class HanziToPinyin {
*/
public static final String SEPARATOR = " ";
- public static final int ASCII = 1;
+ public static final int LATIN = 1;
public static final int PINYIN = 2;
public static final int UNKNOWN = 3;
+ public Token() {
+ }
+
+ public Token(int type, String source, String target) {
+ this.type = type;
+ this.source = source;
+ this.target = target;
+ }
/**
* Type of this token, ASCII, PINYIN or UNKNOWN.
*/
@@ -347,6 +355,7 @@ public class HanziToPinyin {
return sInstance;
}
}
+ Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
sInstance = new HanziToPinyin(false);
return sInstance;
}
@@ -359,11 +368,15 @@ public class HanziToPinyin {
int offset = -1;
int cmp;
if (character < 256) {
- token.type = Token.ASCII;
+ token.type = Token.LATIN;
+ token.target = letter;
+ return token;
+ } else if (character < FIRST_UNIHAN) {
+ token.type = Token.UNKNOWN;
token.target = letter;
return token;
} else {
- cmp = COLLATOR.compare(letter, FIRST_UNIHAN);
+ cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
if (cmp < 0) {
token.type = Token.UNKNOWN;
token.target = letter;
@@ -372,7 +385,7 @@ public class HanziToPinyin {
token.type = Token.PINYIN;
offset = 0;
} else {
- cmp = COLLATOR.compare(letter, LAST_UNIHAN);
+ cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
if (cmp > 0) {
token.type = Token.UNKNOWN;
token.target = letter;
@@ -412,44 +425,71 @@ public class HanziToPinyin {
return token;
}
+ /**
+ * Convert the input to an array of tokens. A sequence of Latin or unknown
+ * characters without spaces will be put into one Token; each Hanzi character
+ * which has Pinyin will be treated as its own Token.
+ * If there is no China collator, an empty token array is returned.
+ */
public ArrayList<Token> get(final String input) {
+ ArrayList<Token> tokens = new ArrayList<Token>();
if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
- return null;
+ // return empty tokens.
+ return tokens;
}
-
- ArrayList<Token> tokens = new ArrayList<Token>();
- Token currentToken;
-
final int inputLength = input.length();
-
- currentToken = getToken(input.charAt(0));
-
- for (int i = 1; i < inputLength; i++) {
+ final StringBuilder sb = new StringBuilder();
+ int tokenType = Token.LATIN;
+ // Go through the input, create a new token when
+ // a. Token type changed
+ // b. Get the Pinyin of the current character.
+ // c. The current character is a space.
+ for (int i = 0; i < inputLength; i++) {
final char character = input.charAt(i);
- Token token = getToken(character);
-
- if (token.type != currentToken.type) {
- currentToken.target = currentToken.target.trim();
- tokens.add(currentToken);
- currentToken = token;
+ if (character == ' ') {
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ } else if (character < 256) {
+ if (tokenType != Token.LATIN && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = Token.LATIN;
+ sb.append(character);
+ } else if (character < FIRST_UNIHAN) {
+ if (tokenType != Token.UNKNOWN && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = Token.UNKNOWN;
+ sb.append(character);
} else {
- switch (token.type) {
- case Token.ASCII:
- case Token.UNKNOWN:
- currentToken.source += token.source;
- currentToken.target += token.target;
- break;
- case Token.PINYIN:
- currentToken.source += token.source;
- currentToken.target += " " + token.target;
- break;
+ Token t = getToken(character);
+ if (t.type == Token.PINYIN) {
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokens.add(t);
+ tokenType = Token.PINYIN;
+ } else {
+ if (tokenType != t.type && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = t.type;
+ sb.append(character);
}
}
}
-
- currentToken.target = currentToken.target.trim();
- tokens.add(currentToken);
-
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
return tokens;
}
+
+ private void addToken(final StringBuilder sb, final ArrayList<Token> tokens,
+ final int tokenType) {
+ String str = sb.toString();
+ tokens.add(new Token(tokenType, str, str));
+ sb.setLength(0);
+ }
+
}