summaryrefslogtreecommitdiffstats
path: root/src/com/android/providers/contacts/aggregation
diff options
context:
space:
mode:
authorMakoto Onuki <omakoto@google.com>2012-03-23 13:55:37 -0700
committerMakoto Onuki <omakoto@google.com>2012-03-23 14:03:21 -0700
commit81567f4a0f7c9c338506bd82f4d33e83c2ccf159 (patch)
tree79caaabb662456f9ab46fd886dab0d17cba17962 /src/com/android/providers/contacts/aggregation
parentd3256d7ba4ee8376567f91c1c2dec444ad13d542 (diff)
downloadpackages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.zip
packages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.tar.gz
packages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.tar.bz2
Add new package aggregation.util
Move aggregator related classes into it. Change-Id: I712fe07ad2bab1e532e3822e3e2797a199329865
Diffstat (limited to 'src/com/android/providers/contacts/aggregation')
-rw-r--r--src/com/android/providers/contacts/aggregation/ContactAggregator.java6
-rw-r--r--src/com/android/providers/contacts/aggregation/ProfileAggregator.java2
-rw-r--r--src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java151
-rw-r--r--src/com/android/providers/contacts/aggregation/util/ContactMatcher.java441
-rw-r--r--src/com/android/providers/contacts/aggregation/util/NameDistance.java169
5 files changed, 765 insertions, 4 deletions
diff --git a/src/com/android/providers/contacts/aggregation/ContactAggregator.java b/src/com/android/providers/contacts/aggregation/ContactAggregator.java
index a235ce2..60eba95 100644
--- a/src/com/android/providers/contacts/aggregation/ContactAggregator.java
+++ b/src/com/android/providers/contacts/aggregation/ContactAggregator.java
@@ -16,10 +16,7 @@
package com.android.providers.contacts.aggregation;
-import com.android.providers.contacts.CommonNicknameCache;
import com.android.providers.contacts.ContactLookupKey;
-import com.android.providers.contacts.ContactMatcher;
-import com.android.providers.contacts.ContactMatcher.MatchScore;
import com.android.providers.contacts.ContactsDatabaseHelper;
import com.android.providers.contacts.ContactsDatabaseHelper.AccountsColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.AggregatedPresenceColumns;
@@ -33,6 +30,9 @@ import com.android.providers.contacts.ContactsDatabaseHelper.RawContactsColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.StatusUpdatesColumns;
import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
import com.android.providers.contacts.ContactsDatabaseHelper.Views;
+import com.android.providers.contacts.aggregation.util.CommonNicknameCache;
+import com.android.providers.contacts.aggregation.util.ContactMatcher;
+import com.android.providers.contacts.aggregation.util.ContactMatcher.MatchScore;
import com.android.providers.contacts.ContactsProvider2;
import com.android.providers.contacts.NameLookupBuilder;
import com.android.providers.contacts.NameNormalizer;
diff --git a/src/com/android/providers/contacts/aggregation/ProfileAggregator.java b/src/com/android/providers/contacts/aggregation/ProfileAggregator.java
index 6126184..fedf5fe 100644
--- a/src/com/android/providers/contacts/aggregation/ProfileAggregator.java
+++ b/src/com/android/providers/contacts/aggregation/ProfileAggregator.java
@@ -20,10 +20,10 @@ import android.database.sqlite.SQLiteDoneException;
import android.database.sqlite.SQLiteStatement;
import android.provider.ContactsContract.Contacts;
-import com.android.providers.contacts.CommonNicknameCache;
import com.android.providers.contacts.ContactLookupKey;
import com.android.providers.contacts.ContactsDatabaseHelper;
import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
+import com.android.providers.contacts.aggregation.util.CommonNicknameCache;
import com.android.providers.contacts.ContactsProvider2;
import com.android.providers.contacts.NameSplitter;
import com.android.providers.contacts.PhotoPriorityResolver;
diff --git a/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java b/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java
new file mode 100644
index 0000000..d6b799f
--- /dev/null
+++ b/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+package com.android.providers.contacts.aggregation.util;
+
+import com.android.providers.contacts.ContactsDatabaseHelper.NicknameLookupColumns;
+import com.android.providers.contacts.ContactsDatabaseHelper.Tables;
+import com.google.android.collect.Maps;
+
+import android.database.Cursor;
+import android.database.sqlite.SQLiteDatabase;
+
+import java.lang.ref.SoftReference;
+import java.util.BitSet;
+import java.util.HashMap;
+
+/**
+ * Cache for common nicknames.
+ */
+public class CommonNicknameCache {
+
+ // We will use this much memory (in bits) to optimize the nickname cluster lookup
+ private static final int NICKNAME_BLOOM_FILTER_SIZE = 0x1FFF; // =long[128]
+ private BitSet mNicknameBloomFilter;
+
+ private HashMap<String, SoftReference<String[]>> mNicknameClusterCache = Maps.newHashMap();
+
+ private final SQLiteDatabase mDb;
+
+ public CommonNicknameCache(SQLiteDatabase db) {
+ mDb = db;
+ }
+
+ private final static class NicknameLookupPreloadQuery {
+ public final static String TABLE = Tables.NICKNAME_LOOKUP;
+
+ public final static String[] COLUMNS = new String[] {
+ NicknameLookupColumns.NAME
+ };
+
+ public final static int NAME = 0;
+ }
+
+ /**
+ * Read all known common nicknames from the database and populate a Bloom
+ * filter using the corresponding hash codes. The idea is to eliminate most
+ * of unnecessary database lookups for nicknames. Given a name, we will take
+ * its hash code and see if it is set in the Bloom filter. If not, we will know
+ * that the name is not in the database. If it is, we still need to run a
+ * query.
+ * <p>
+ * Given the size of the filter and the expected size of the nickname table,
+ * we should expect the combination of the Bloom filter and cache will
+ * prevent around 98-99% of unnecessary queries from running.
+ */
+ private void preloadNicknameBloomFilter() {
+ mNicknameBloomFilter = new BitSet(NICKNAME_BLOOM_FILTER_SIZE + 1);
+ Cursor cursor = mDb.query(NicknameLookupPreloadQuery.TABLE,
+ NicknameLookupPreloadQuery.COLUMNS,
+ null, null, null, null, null);
+ try {
+ int count = cursor.getCount();
+ for (int i = 0; i < count; i++) {
+ cursor.moveToNext();
+ String normalizedName = cursor.getString(NicknameLookupPreloadQuery.NAME);
+ int hashCode = normalizedName.hashCode();
+ mNicknameBloomFilter.set(hashCode & NICKNAME_BLOOM_FILTER_SIZE);
+ }
+ } finally {
+ cursor.close();
+ }
+ }
+
+ /**
+ * Returns nickname cluster IDs or null. Maintains cache.
+ */
+ public String[] getCommonNicknameClusters(String normalizedName) {
+ if (mNicknameBloomFilter == null) {
+ preloadNicknameBloomFilter();
+ }
+
+ int hashCode = normalizedName.hashCode();
+ if (!mNicknameBloomFilter.get(hashCode & NICKNAME_BLOOM_FILTER_SIZE)) {
+ return null;
+ }
+
+ SoftReference<String[]> ref;
+ String[] clusters = null;
+ synchronized (mNicknameClusterCache) {
+ if (mNicknameClusterCache.containsKey(normalizedName)) {
+ ref = mNicknameClusterCache.get(normalizedName);
+ if (ref == null) {
+ return null;
+ }
+ clusters = ref.get();
+ }
+ }
+
+ if (clusters == null) {
+ clusters = loadNicknameClusters(normalizedName);
+ ref = clusters == null ? null : new SoftReference<String[]>(clusters);
+ synchronized (mNicknameClusterCache) {
+ mNicknameClusterCache.put(normalizedName, ref);
+ }
+ }
+ return clusters;
+ }
+
+ private interface NicknameLookupQuery {
+ String TABLE = Tables.NICKNAME_LOOKUP;
+
+ String[] COLUMNS = new String[] {
+ NicknameLookupColumns.CLUSTER
+ };
+
+ int CLUSTER = 0;
+ }
+
+ protected String[] loadNicknameClusters(String normalizedName) {
+ String[] clusters = null;
+ Cursor cursor = mDb.query(NicknameLookupQuery.TABLE, NicknameLookupQuery.COLUMNS,
+ NicknameLookupColumns.NAME + "=?", new String[] { normalizedName },
+ null, null, null);
+ try {
+ int count = cursor.getCount();
+ if (count > 0) {
+ clusters = new String[count];
+ for (int i = 0; i < count; i++) {
+ cursor.moveToNext();
+ clusters[i] = cursor.getString(NicknameLookupQuery.CLUSTER);
+ }
+ }
+ } finally {
+ cursor.close();
+ }
+ return clusters;
+ }
+}
diff --git a/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java b/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java
new file mode 100644
index 0000000..60e41ab
--- /dev/null
+++ b/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java
@@ -0,0 +1,441 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+package com.android.providers.contacts.aggregation.util;
+
+import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
+import com.android.providers.contacts.util.Hex;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * Logic for matching contacts' data and accumulating match scores.
+ */
+public class ContactMatcher {
+
+ // Best possible match score
+ public static final int MAX_SCORE = 100;
+
+ // Suggest to aggregate contacts if their match score is equal or greater than this threshold
+ public static final int SCORE_THRESHOLD_SUGGEST = 50;
+
+ // Automatically aggregate contacts if their match score is equal or greater than this threshold
+ public static final int SCORE_THRESHOLD_PRIMARY = 70;
+
+ // Automatically aggregate contacts if the match score is equal or greater than this threshold
+ // and there is a secondary match (phone number, email etc).
+ public static final int SCORE_THRESHOLD_SECONDARY = 50;
+
+ // Score for missing data (as opposed to present data but a bad match)
+ private static final int NO_DATA_SCORE = -1;
+
+ // Score for matching phone numbers
+ private static final int PHONE_MATCH_SCORE = 71;
+
+ // Score for matching email addresses
+ private static final int EMAIL_MATCH_SCORE = 71;
+
+ // Score for matching nickname
+ private static final int NICKNAME_MATCH_SCORE = 71;
+
+ // Maximum number of characters in a name to be considered by the matching algorithm.
+ private static final int MAX_MATCHED_NAME_LENGTH = 30;
+
+ // Scores a multiplied by this number to allow room for "fractional" scores
+ private static final int SCORE_SCALE = 1000;
+
+ public static final int MATCHING_ALGORITHM_EXACT = 0;
+ public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
+ public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
+
+ // Minimum edit distance between two names to be considered an approximate match
+ public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
+
+ // Minimum edit distance between two email ids to be considered an approximate match
+ public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
+
+ // Returned value when we found multiple matches and that was not allowed
+ public static final long MULTIPLE_MATCHES = -2;
+
+ /**
+ * Name matching scores: a matrix by name type vs. candidate lookup type.
+ * For example, if the name type is "full name" while we are looking for a
+ * "full name", the score may be 99. If we are looking for a "nickname" but
+ * find "first name", the score may be 50 (see specific scores defined
+ * below.)
+ * <p>
+ * For approximate matching, we have a range of scores, let's say 40-70. Depending one how
+ * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
+ * match producing the score of 70. The score may also be 0 if the similarity (distance)
+ * between the strings is below the threshold.
+ * <p>
+ * We use a string matching algorithm, which is particularly suited for
+ * name matching. See {@link NameDistance}.
+ */
+ private static int[] sMinScore =
+ new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
+ private static int[] sMaxScore =
+ new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
+
+ /*
+ * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
+ * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
+ * not! They are useful in three-way aggregation cases when we have, for example, both
+ * John Smith and Smith John. A third contact with the name John Smith should be aggregated
+ * with the former rather than the latter. This is why "reverse" matches have slightly lower
+ * scores than direct matches.
+ */
+ static {
+ setScoreRange(NameLookupType.NAME_EXACT,
+ NameLookupType.NAME_EXACT, 99, 99);
+ setScoreRange(NameLookupType.NAME_VARIANT,
+ NameLookupType.NAME_VARIANT, 90, 90);
+ setScoreRange(NameLookupType.NAME_COLLATION_KEY,
+ NameLookupType.NAME_COLLATION_KEY, 50, 80);
+
+ setScoreRange(NameLookupType.NAME_COLLATION_KEY,
+ NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
+ setScoreRange(NameLookupType.NAME_COLLATION_KEY,
+ NameLookupType.NICKNAME, 50, 60);
+
+ setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
+ NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
+ setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
+ NameLookupType.NAME_COLLATION_KEY, 50, 60);
+ setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
+ NameLookupType.NICKNAME, 50, 60);
+
+ setScoreRange(NameLookupType.NICKNAME,
+ NameLookupType.NICKNAME, 50, 60);
+ setScoreRange(NameLookupType.NICKNAME,
+ NameLookupType.NAME_COLLATION_KEY, 50, 60);
+ setScoreRange(NameLookupType.NICKNAME,
+ NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
+ }
+
+ /**
+ * Populates the cells of the score matrix and score span matrix
+ * corresponding to the {@code candidateNameType} and {@code nameType}.
+ */
+ private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
+ int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
+ sMinScore[index] = scoreFrom;
+ sMaxScore[index] = scoreTo;
+ }
+
+ /**
+ * Returns the lower range for the match score for the given {@code candidateNameType} and
+ * {@code nameType}.
+ */
+ private static int getMinScore(int candidateNameType, int nameType) {
+ int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
+ return sMinScore[index];
+ }
+
+ /**
+ * Returns the upper range for the match score for the given {@code candidateNameType} and
+ * {@code nameType}.
+ */
+ private static int getMaxScore(int candidateNameType, int nameType) {
+ int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
+ return sMaxScore[index];
+ }
+
+ /**
+ * Captures the max score and match count for a specific contact. Used in an
+ * contactId - MatchScore map.
+ */
+ public static class MatchScore implements Comparable<MatchScore> {
+ private long mContactId;
+ private boolean mKeepIn;
+ private boolean mKeepOut;
+ private int mPrimaryScore;
+ private int mSecondaryScore;
+ private int mMatchCount;
+
+ public MatchScore(long contactId) {
+ this.mContactId = contactId;
+ }
+
+ public void reset(long contactId) {
+ this.mContactId = contactId;
+ mKeepIn = false;
+ mKeepOut = false;
+ mPrimaryScore = 0;
+ mSecondaryScore = 0;
+ mMatchCount = 0;
+ }
+
+ public long getContactId() {
+ return mContactId;
+ }
+
+ public void updatePrimaryScore(int score) {
+ if (score > mPrimaryScore) {
+ mPrimaryScore = score;
+ }
+ mMatchCount++;
+ }
+
+ public void updateSecondaryScore(int score) {
+ if (score > mSecondaryScore) {
+ mSecondaryScore = score;
+ }
+ mMatchCount++;
+ }
+
+ public void keepIn() {
+ mKeepIn = true;
+ }
+
+ public void keepOut() {
+ mKeepOut = true;
+ }
+
+ public int getScore() {
+ if (mKeepOut) {
+ return 0;
+ }
+
+ if (mKeepIn) {
+ return MAX_SCORE;
+ }
+
+ int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
+
+ // Ensure that of two contacts with the same match score the one with more matching
+ // data elements wins.
+ return score * SCORE_SCALE + mMatchCount;
+ }
+
+ /**
+ * Descending order of match score.
+ */
+ @Override
+ public int compareTo(MatchScore another) {
+ return another.getScore() - getScore();
+ }
+
+ @Override
+ public String toString() {
+ return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
+ + ")";
+ }
+ }
+
+ private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
+ private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
+ private int mScoreCount = 0;
+
+ private final NameDistance mNameDistanceConservative = new NameDistance();
+ private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
+
+ private MatchScore getMatchingScore(long contactId) {
+ MatchScore matchingScore = mScores.get(contactId);
+ if (matchingScore == null) {
+ if (mScoreList.size() > mScoreCount) {
+ matchingScore = mScoreList.get(mScoreCount);
+ matchingScore.reset(contactId);
+ } else {
+ matchingScore = new MatchScore(contactId);
+ mScoreList.add(matchingScore);
+ }
+ mScoreCount++;
+ mScores.put(contactId, matchingScore);
+ }
+ return matchingScore;
+ }
+
+ /**
+ * Marks the contact as a full match, because we found an Identity match
+ */
+ public void matchIdentity(long contactId) {
+ updatePrimaryScore(contactId, MAX_SCORE);
+ }
+
+ /**
+ * Checks if there is a match and updates the overall score for the
+ * specified contact for a discovered match. The new score is determined
+ * by the prior score, by the type of name we were looking for, the type
+ * of name we found and, if the match is approximate, the distance between the candidate and
+ * actual name.
+ */
+ public void matchName(long contactId, int candidateNameType, String candidateName,
+ int nameType, String name, int algorithm) {
+ int maxScore = getMaxScore(candidateNameType, nameType);
+ if (maxScore == 0) {
+ return;
+ }
+
+ if (candidateName.equals(name)) {
+ updatePrimaryScore(contactId, maxScore);
+ return;
+ }
+
+ if (algorithm == MATCHING_ALGORITHM_EXACT) {
+ return;
+ }
+
+ int minScore = getMinScore(candidateNameType, nameType);
+ if (minScore == maxScore) {
+ return;
+ }
+
+ byte[] decodedCandidateName = Hex.decodeHex(candidateName);
+ byte[] decodedName = Hex.decodeHex(name);
+
+ NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
+ mNameDistanceConservative : mNameDistanceApproximate;
+
+ int score;
+ float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
+ boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
+ || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
+ float threshold = emailBased
+ ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
+ : APPROXIMATE_MATCH_THRESHOLD;
+ if (distance > threshold) {
+ score = (int)(minScore + (maxScore - minScore) * (1.0f - distance));
+ } else {
+ score = 0;
+ }
+
+ updatePrimaryScore(contactId, score);
+ }
+
+ public void updateScoreWithPhoneNumberMatch(long contactId) {
+ updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
+ }
+
+ public void updateScoreWithEmailMatch(long contactId) {
+ updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
+ }
+
+ public void updateScoreWithNicknameMatch(long contactId) {
+ updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
+ }
+
+ private void updatePrimaryScore(long contactId, int score) {
+ getMatchingScore(contactId).updatePrimaryScore(score);
+ }
+
+ private void updateSecondaryScore(long contactId, int score) {
+ getMatchingScore(contactId).updateSecondaryScore(score);
+ }
+
+ public void keepIn(long contactId) {
+ getMatchingScore(contactId).keepIn();
+ }
+
+ public void keepOut(long contactId) {
+ getMatchingScore(contactId).keepOut();
+ }
+
+ public void clear() {
+ mScores.clear();
+ mScoreCount = 0;
+ }
+
+ /**
+ * Returns a list of IDs for contacts that are matched on secondary data elements
+ * (phone number, email address, nickname). We still need to obtain the approximate
+ * primary score for those contacts to determine if any of them should be aggregated.
+ * <p>
+ * May return null.
+ */
+ public List<Long> prepareSecondaryMatchCandidates(int threshold) {
+ ArrayList<Long> contactIds = null;
+
+ for (int i = 0; i < mScoreCount; i++) {
+ MatchScore score = mScoreList.get(i);
+ if (score.mKeepOut) {
+ continue;
+ }
+
+ int s = score.mSecondaryScore;
+ if (s >= threshold) {
+ if (contactIds == null) {
+ contactIds = new ArrayList<Long>();
+ }
+ contactIds.add(score.mContactId);
+ }
+ score.mPrimaryScore = NO_DATA_SCORE;
+ }
+ return contactIds;
+ }
+
+ /**
+ * Returns the contactId with the best match score over the specified threshold or -1
+ * if no such contact is found.
+ */
+ public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
+ long contactId = -1;
+ int maxScore = 0;
+ for (int i = 0; i < mScoreCount; i++) {
+ MatchScore score = mScoreList.get(i);
+ if (score.mKeepOut) {
+ continue;
+ }
+
+ if (score.mKeepIn) {
+ return score.mContactId;
+ }
+
+ int s = score.mPrimaryScore;
+ if (s == NO_DATA_SCORE) {
+ s = score.mSecondaryScore;
+ }
+
+ if (s >= threshold) {
+ if (contactId != -1 && !allowMultipleMatches) {
+ return MULTIPLE_MATCHES;
+ }
+ if (s > maxScore) {
+ contactId = score.mContactId;
+ maxScore = s;
+ }
+ }
+ }
+ return contactId;
+ }
+
+ /**
+ * Returns matches in the order of descending score.
+ */
+ public List<MatchScore> pickBestMatches(int threshold) {
+ int scaledThreshold = threshold * SCORE_SCALE;
+ List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
+ Collections.sort(matches);
+ int count = 0;
+ for (int i = 0; i < mScoreCount; i++) {
+ MatchScore matchScore = matches.get(i);
+ if (matchScore.getScore() >= scaledThreshold) {
+ count++;
+ } else {
+ break;
+ }
+ }
+
+ return matches.subList(0, count);
+ }
+
+ @Override
+ public String toString() {
+ return mScoreList.subList(0, mScoreCount).toString();
+ }
+}
diff --git a/src/com/android/providers/contacts/aggregation/util/NameDistance.java b/src/com/android/providers/contacts/aggregation/util/NameDistance.java
new file mode 100644
index 0000000..e357ec1
--- /dev/null
+++ b/src/com/android/providers/contacts/aggregation/util/NameDistance.java
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+package com.android.providers.contacts.aggregation.util;
+
+import java.util.Arrays;
+
+/**
+ * A string distance calculator, particularly suited for name matching.
+* <p>
+ * A detailed discussion of the topic of record linkage in general and name matching
+ * in particular can be found in this article:
+ * <blockquote>
+ * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
+ * Research Report Series, RRS.
+ * </blockquote>
+ */
+public class NameDistance {
+
+ private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
+ private static final int MIN_EXACT_PREFIX_LENGTH = 3;
+
+ private final int mMaxLength;
+ private final boolean mPrefixOnly;
+ private final boolean[] mMatchFlags1;
+ private final boolean[] mMatchFlags2;
+
+ /**
+ * Constructor.
+ *
+ * @param maxLength byte arrays are truncated if longer than this number
+ */
+ public NameDistance(int maxLength) {
+ mMaxLength = maxLength;
+ mPrefixOnly = false;
+ mMatchFlags1 = new boolean[maxLength];
+ mMatchFlags2 = new boolean[maxLength];
+ }
+
+ /**
+ * Constructor for a matcher that only checks if one string is the exact prefix of the other
+ */
+ public NameDistance() {
+ mPrefixOnly = true;
+ mMaxLength = 0;
+ mMatchFlags1 = mMatchFlags2 = null;
+ }
+
+ /**
+ * Computes a string distance between two normalized strings passed as byte arrays.
+ */
+ public float getDistance(byte bytes1[], byte bytes2[]) {
+ byte[] array1, array2;
+
+ if (bytes1.length > bytes2.length) {
+ array2 = bytes1;
+ array1 = bytes2;
+ } else {
+ array2 = bytes2;
+ array1 = bytes1;
+ }
+
+ int length1 = array1.length;
+ if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
+ boolean prefix = true;
+ for (int i = 0; i < array1.length; i++) {
+ if (array1[i] != array2[i]) {
+ prefix = false;
+ break;
+ }
+ }
+ if (prefix) {
+ return 1.0f;
+ }
+ }
+
+ if (mPrefixOnly) {
+ return 0;
+ }
+
+ if (length1 > mMaxLength) {
+ length1 = mMaxLength;
+ }
+
+ int length2 = array2.length;
+ if (length2 > mMaxLength) {
+ length2 = mMaxLength;
+ }
+
+ Arrays.fill(mMatchFlags1, 0, length1, false);
+ Arrays.fill(mMatchFlags2, 0, length2, false);
+
+ int range = length2 / 2 - 1;
+ if (range < 0) {
+ range = 0;
+ }
+
+ int matches = 0;
+ for (int i = 0; i < length1; i++) {
+ byte c1 = array1[i];
+
+ int from = i - range;
+ if (from < 0) {
+ from = 0;
+ }
+
+ int to = i + range + 1;
+ if (to > length2) {
+ to = length2;
+ }
+
+ for (int j = from; j < to; j++) {
+ if (!mMatchFlags2[j] && c1 == array2[j]) {
+ mMatchFlags1[i] = mMatchFlags2[j] = true;
+ matches++;
+ break;
+ }
+ }
+ }
+
+ if (matches == 0) {
+ return 0f;
+ }
+
+ int transpositions = 0;
+ int j = 0;
+ for (int i = 0; i < length1; i++) {
+ if (mMatchFlags1[i]) {
+ while (!mMatchFlags2[j]) {
+ j++;
+ }
+ if (array1[i] != array2[j]) {
+ transpositions++;
+ }
+ j++;
+ }
+ }
+
+ float m = matches;
+ float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3;
+
+ if (jaro < WINKLER_BONUS_THRESHOLD) {
+ return jaro;
+ }
+
+ // Add Winkler bonus
+ int prefix = 0;
+ for (int i = 0; i < length1; i++) {
+ if (bytes1[i] != bytes2[i]) {
+ break;
+ }
+ prefix++;
+ }
+
+ return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
+ }
+}