summaryrefslogtreecommitdiffstats
path: root/src/com/android/providers/contacts/SearchIndexManager.java
diff options
context:
space:
mode:
authorMakoto Onuki <omakoto@google.com>2012-04-27 13:59:45 -0700
committerMakoto Onuki <omakoto@google.com>2012-04-30 15:03:08 -0700
commit116d86ddd67330428f9128613b4886fc0ea66221 (patch)
tree35d01b44ca3c25ab31b61094c43c2fa9af285fdf /src/com/android/providers/contacts/SearchIndexManager.java
parentf76a0fe0629fb626d96081dc0f272433e4920ba3 (diff)
downloadpackages_providers_ContactsProvider-116d86ddd67330428f9128613b4886fc0ea66221.zip
packages_providers_ContactsProvider-116d86ddd67330428f9128613b4886fc0ea66221.tar.gz
packages_providers_ContactsProvider-116d86ddd67330428f9128613b4886fc0ea66221.tar.bz2
Make names with special chars searchable.
It's a better fix than I34bfa864, which was only a quick workaround for double-barrelled names. Now names with other special characters are searchable too. Also, previously, a query "doublebarrelled" wouldn't match "double-barrelled", but now it will. Bug 5592553 Change-Id: Id1d44261f577df7abf701311ed1c86fb093547da
Diffstat (limited to 'src/com/android/providers/contacts/SearchIndexManager.java')
-rw-r--r--src/com/android/providers/contacts/SearchIndexManager.java93
1 files changed, 56 insertions, 37 deletions
diff --git a/src/com/android/providers/contacts/SearchIndexManager.java b/src/com/android/providers/contacts/SearchIndexManager.java
index 5ca9859..bd4e1cc 100644
--- a/src/com/android/providers/contacts/SearchIndexManager.java
+++ b/src/com/android/providers/contacts/SearchIndexManager.java
@@ -20,6 +20,8 @@ import com.android.providers.contacts.ContactsDatabaseHelper.MimetypesColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.RawContactsColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.SearchIndexColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
+import com.google.android.collect.Lists;
+import com.google.common.annotations.VisibleForTesting;
import android.content.ContentValues;
import android.database.Cursor;
@@ -35,7 +37,9 @@ import android.provider.ContactsContract.RawContacts;
import android.text.TextUtils;
import android.util.Log;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
@@ -138,7 +142,7 @@ public class SearchIndexManager {
appendContent(value, SEPARATOR_SPACE);
}
- public void appendContent(String value, int format) {
+ private void appendContent(String value, int format) {
if (TextUtils.isEmpty(value)) {
return;
}
@@ -182,18 +186,33 @@ public class SearchIndexManager {
mSbTokens.append(token);
}
- private static final Pattern PATTERN_HYPHEN = Pattern.compile("\\-");
-
public void appendName(String name) {
if (TextUtils.isEmpty(name)) {
return;
}
- if (name.indexOf('-') < 0) {
- // Common case -- no hyphens in it.
- appendNameInternal(name);
- } else {
- // In order to make hyphenated names searchable, let's split names with '-'.
- for (String namePart : PATTERN_HYPHEN.split(name)) {
+ // First, put the original name.
+ appendNameInternal(name);
+
+ // Then, if the name contains more than one FTS token, put each token into the index
+ // too.
+ //
+ // This is to make names with special characters searchable, such as "double-barrelled"
+ // and "L'Image".
+ //
+ // Here's how it works:
+ // Because we "normalize" names when putting into the index, if we only put
+ // "double-barrelled", the index will only contain "doublebarrelled".
+ // Now, if the user searches for "double-barrelled", the searcher tokenizes it into
+ // two tokens, "double" and "barrelled". The first one matches "doublebarrelled"
+ // but the second one doesn't (because we only do prefix matching), so
+ // the query as a whole doesn't match the indexed "doublebarrelled".
+ // So, here, we put each token in a name into the index too. In the case above,
+ // we also put "double" and "barrelled".
+ // With this, queries such as "double-barrelled", "double barrelled", "doublebarrelled"
+ // will all match "double-barrelled".
+ final List<String> nameParts = splitIntoFtsTokens(name);
+ if (nameParts.size() > 1) {
+ for (String namePart : nameParts) {
if (!TextUtils.isEmpty(namePart)) {
appendNameInternal(namePart);
}
@@ -201,6 +220,9 @@ public class SearchIndexManager {
}
}
+ /**
+ * Normalizes a name and appends it to {@link #mSbName}.
+ */
private void appendNameInternal(String name) {
if (mSbName.length() != 0) {
mSbName.append(' ');
@@ -373,6 +395,29 @@ public class SearchIndexManager {
}
/**
+ * Token separator that matches SQLite's "simple" tokenizer.
+ * - Unicode codepoints >= 128: Everything
+ * - Unicode codepoints < 128: Alphanumeric and "_"
+ * - Everything else is a separator of tokens
+ */
+ private static final Pattern FTS_TOKEN_SEPARATOR_RE =
+ Pattern.compile("[^\u0080-\uffff\\p{Alnum}_]");
+
+ /**
+ * Tokenizes a string the same way SQLite's "simple" tokenizer does.
+ */
+ @VisibleForTesting
+ static List<String> splitIntoFtsTokens(String s) {
+ final ArrayList<String> ret = Lists.newArrayList();
+ for (String token : FTS_TOKEN_SEPARATOR_RE.split(s)) {
+ if (!TextUtils.isEmpty(token)) {
+ ret.add(token);
+ }
+ }
+ return ret;
+ }
+
+ /**
* Tokenizes the query and normalizes/hex encodes each token. The tokenizer uses the same
* rules as SQLite's "simple" tokenizer. Each token is added to the retokenizer and then
* returned as a String.
@@ -380,35 +425,9 @@ public class SearchIndexManager {
* @see FtsQueryBuilder#SCOPED_NAME_NORMALIZING
*/
public static String getFtsMatchQuery(String query, FtsQueryBuilder ftsQueryBuilder) {
- // SQLite's "simple" tokenizer uses the following rules to detect characters:
- // - Unicode codepoints >= 128: Everything
- // - Unicode codepoints < 128: Alphanumeric and "_"
- // Everything else is a separator of tokens
- int tokenStart = -1;
final StringBuilder result = new StringBuilder();
- for (int i = 0; i <= query.length(); i++) {
- final boolean isChar;
- if (i == query.length()) {
- isChar = false;
- } else {
- final char ch = query.charAt(i);
- if (ch >= 128) {
- isChar = true;
- } else {
- isChar = Character.isLetterOrDigit(ch) || ch == '_';
- }
- }
- if (isChar) {
- if (tokenStart == -1) {
- tokenStart = i;
- }
- } else {
- if (tokenStart != -1) {
- final String token = query.substring(tokenStart, i);
- ftsQueryBuilder.addToken(result, token);
- tokenStart = -1;
- }
- }
+ for (String token : splitIntoFtsTokens(query)) {
+ ftsQueryBuilder.addToken(result, token);
}
return result.toString();
}