diff options
author | Makoto Onuki <omakoto@google.com> | 2012-03-23 13:55:37 -0700 |
---|---|---|
committer | Makoto Onuki <omakoto@google.com> | 2012-03-23 14:03:21 -0700 |
commit | 81567f4a0f7c9c338506bd82f4d33e83c2ccf159 (patch) | |
tree | 79caaabb662456f9ab46fd886dab0d17cba17962 /src/com/android/providers/contacts/aggregation | |
parent | d3256d7ba4ee8376567f91c1c2dec444ad13d542 (diff) | |
download | packages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.zip packages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.tar.gz packages_providers_ContactsProvider-81567f4a0f7c9c338506bd82f4d33e83c2ccf159.tar.bz2 |
Add new package aggregation.util
Move aggregator related classes into it.
Change-Id: I712fe07ad2bab1e532e3822e3e2797a199329865
Diffstat (limited to 'src/com/android/providers/contacts/aggregation')
5 files changed, 765 insertions, 4 deletions
diff --git a/src/com/android/providers/contacts/aggregation/ContactAggregator.java b/src/com/android/providers/contacts/aggregation/ContactAggregator.java index a235ce2..60eba95 100644 --- a/src/com/android/providers/contacts/aggregation/ContactAggregator.java +++ b/src/com/android/providers/contacts/aggregation/ContactAggregator.java @@ -16,10 +16,7 @@ package com.android.providers.contacts.aggregation; -import com.android.providers.contacts.CommonNicknameCache; import com.android.providers.contacts.ContactLookupKey; -import com.android.providers.contacts.ContactMatcher; -import com.android.providers.contacts.ContactMatcher.MatchScore; import com.android.providers.contacts.ContactsDatabaseHelper; import com.android.providers.contacts.ContactsDatabaseHelper.AccountsColumns; import com.android.providers.contacts.ContactsDatabaseHelper.AggregatedPresenceColumns; @@ -33,6 +30,9 @@ import com.android.providers.contacts.ContactsDatabaseHelper.RawContactsColumns; import com.android.providers.contacts.ContactsDatabaseHelper.StatusUpdatesColumns; import com.android.providers.contacts.ContactsDatabaseHelper.Tables; import com.android.providers.contacts.ContactsDatabaseHelper.Views; +import com.android.providers.contacts.aggregation.util.CommonNicknameCache; +import com.android.providers.contacts.aggregation.util.ContactMatcher; +import com.android.providers.contacts.aggregation.util.ContactMatcher.MatchScore; import com.android.providers.contacts.ContactsProvider2; import com.android.providers.contacts.NameLookupBuilder; import com.android.providers.contacts.NameNormalizer; diff --git a/src/com/android/providers/contacts/aggregation/ProfileAggregator.java b/src/com/android/providers/contacts/aggregation/ProfileAggregator.java index 6126184..fedf5fe 100644 --- a/src/com/android/providers/contacts/aggregation/ProfileAggregator.java +++ b/src/com/android/providers/contacts/aggregation/ProfileAggregator.java @@ -20,10 +20,10 @@ import android.database.sqlite.SQLiteDoneException; import android.database.sqlite.SQLiteStatement; import android.provider.ContactsContract.Contacts; -import com.android.providers.contacts.CommonNicknameCache; import com.android.providers.contacts.ContactLookupKey; import com.android.providers.contacts.ContactsDatabaseHelper; import com.android.providers.contacts.ContactsDatabaseHelper.Tables; +import com.android.providers.contacts.aggregation.util.CommonNicknameCache; import com.android.providers.contacts.ContactsProvider2; import com.android.providers.contacts.NameSplitter; import com.android.providers.contacts.PhotoPriorityResolver; diff --git a/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java b/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java new file mode 100644 index 0000000..d6b799f --- /dev/null +++ b/src/com/android/providers/contacts/aggregation/util/CommonNicknameCache.java @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +package com.android.providers.contacts.aggregation.util; + +import com.android.providers.contacts.ContactsDatabaseHelper.NicknameLookupColumns; +import com.android.providers.contacts.ContactsDatabaseHelper.Tables; +import com.google.android.collect.Maps; + +import android.database.Cursor; +import android.database.sqlite.SQLiteDatabase; + +import java.lang.ref.SoftReference; +import java.util.BitSet; +import java.util.HashMap; + +/** + * Cache for common nicknames. + */ +public class CommonNicknameCache { + + // We will use this much memory (in bits) to optimize the nickname cluster lookup + private static final int NICKNAME_BLOOM_FILTER_SIZE = 0x1FFF; // =long[128] + private BitSet mNicknameBloomFilter; + + private HashMap<String, SoftReference<String[]>> mNicknameClusterCache = Maps.newHashMap(); + + private final SQLiteDatabase mDb; + + public CommonNicknameCache(SQLiteDatabase db) { + mDb = db; + } + + private final static class NicknameLookupPreloadQuery { + public final static String TABLE = Tables.NICKNAME_LOOKUP; + + public final static String[] COLUMNS = new String[] { + NicknameLookupColumns.NAME + }; + + public final static int NAME = 0; + } + + /** + * Read all known common nicknames from the database and populate a Bloom + * filter using the corresponding hash codes. The idea is to eliminate most + * of unnecessary database lookups for nicknames. Given a name, we will take + * its hash code and see if it is set in the Bloom filter. If not, we will know + * that the name is not in the database. If it is, we still need to run a + * query. + * <p> + * Given the size of the filter and the expected size of the nickname table, + * we should expect the combination of the Bloom filter and cache will + * prevent around 98-99% of unnecessary queries from running. + */ + private void preloadNicknameBloomFilter() { + mNicknameBloomFilter = new BitSet(NICKNAME_BLOOM_FILTER_SIZE + 1); + Cursor cursor = mDb.query(NicknameLookupPreloadQuery.TABLE, + NicknameLookupPreloadQuery.COLUMNS, + null, null, null, null, null); + try { + int count = cursor.getCount(); + for (int i = 0; i < count; i++) { + cursor.moveToNext(); + String normalizedName = cursor.getString(NicknameLookupPreloadQuery.NAME); + int hashCode = normalizedName.hashCode(); + mNicknameBloomFilter.set(hashCode & NICKNAME_BLOOM_FILTER_SIZE); + } + } finally { + cursor.close(); + } + } + + /** + * Returns nickname cluster IDs or null. Maintains cache. + */ + public String[] getCommonNicknameClusters(String normalizedName) { + if (mNicknameBloomFilter == null) { + preloadNicknameBloomFilter(); + } + + int hashCode = normalizedName.hashCode(); + if (!mNicknameBloomFilter.get(hashCode & NICKNAME_BLOOM_FILTER_SIZE)) { + return null; + } + + SoftReference<String[]> ref; + String[] clusters = null; + synchronized (mNicknameClusterCache) { + if (mNicknameClusterCache.containsKey(normalizedName)) { + ref = mNicknameClusterCache.get(normalizedName); + if (ref == null) { + return null; + } + clusters = ref.get(); + } + } + + if (clusters == null) { + clusters = loadNicknameClusters(normalizedName); + ref = clusters == null ? null : new SoftReference<String[]>(clusters); + synchronized (mNicknameClusterCache) { + mNicknameClusterCache.put(normalizedName, ref); + } + } + return clusters; + } + + private interface NicknameLookupQuery { + String TABLE = Tables.NICKNAME_LOOKUP; + + String[] COLUMNS = new String[] { + NicknameLookupColumns.CLUSTER + }; + + int CLUSTER = 0; + } + + protected String[] loadNicknameClusters(String normalizedName) { + String[] clusters = null; + Cursor cursor = mDb.query(NicknameLookupQuery.TABLE, NicknameLookupQuery.COLUMNS, + NicknameLookupColumns.NAME + "=?", new String[] { normalizedName }, + null, null, null); + try { + int count = cursor.getCount(); + if (count > 0) { + clusters = new String[count]; + for (int i = 0; i < count; i++) { + cursor.moveToNext(); + clusters[i] = cursor.getString(NicknameLookupQuery.CLUSTER); + } + } + } finally { + cursor.close(); + } + return clusters; + } +} diff --git a/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java b/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java new file mode 100644 index 0000000..60e41ab --- /dev/null +++ b/src/com/android/providers/contacts/aggregation/util/ContactMatcher.java @@ -0,0 +1,441 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ +package com.android.providers.contacts.aggregation.util; + +import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; +import com.android.providers.contacts.util.Hex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; + +/** + * Logic for matching contacts' data and accumulating match scores. + */ +public class ContactMatcher { + + // Best possible match score + public static final int MAX_SCORE = 100; + + // Suggest to aggregate contacts if their match score is equal or greater than this threshold + public static final int SCORE_THRESHOLD_SUGGEST = 50; + + // Automatically aggregate contacts if their match score is equal or greater than this threshold + public static final int SCORE_THRESHOLD_PRIMARY = 70; + + // Automatically aggregate contacts if the match score is equal or greater than this threshold + // and there is a secondary match (phone number, email etc). + public static final int SCORE_THRESHOLD_SECONDARY = 50; + + // Score for missing data (as opposed to present data but a bad match) + private static final int NO_DATA_SCORE = -1; + + // Score for matching phone numbers + private static final int PHONE_MATCH_SCORE = 71; + + // Score for matching email addresses + private static final int EMAIL_MATCH_SCORE = 71; + + // Score for matching nickname + private static final int NICKNAME_MATCH_SCORE = 71; + + // Maximum number of characters in a name to be considered by the matching algorithm. + private static final int MAX_MATCHED_NAME_LENGTH = 30; + + // Scores a multiplied by this number to allow room for "fractional" scores + private static final int SCORE_SCALE = 1000; + + public static final int MATCHING_ALGORITHM_EXACT = 0; + public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; + public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; + + // Minimum edit distance between two names to be considered an approximate match + public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; + + // Minimum edit distance between two email ids to be considered an approximate match + public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; + + // Returned value when we found multiple matches and that was not allowed + public static final long MULTIPLE_MATCHES = -2; + + /** + * Name matching scores: a matrix by name type vs. candidate lookup type. + * For example, if the name type is "full name" while we are looking for a + * "full name", the score may be 99. If we are looking for a "nickname" but + * find "first name", the score may be 50 (see specific scores defined + * below.) + * <p> + * For approximate matching, we have a range of scores, let's say 40-70. Depending one how + * similar the two strings are, the score will be somewhere between 40 and 70, with the exact + * match producing the score of 70. The score may also be 0 if the similarity (distance) + * between the strings is below the threshold. + * <p> + * We use a string matching algorithm, which is particularly suited for + * name matching. See {@link NameDistance}. + */ + private static int[] sMinScore = + new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; + private static int[] sMaxScore = + new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; + + /* + * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, + * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are + * not! They are useful in three-way aggregation cases when we have, for example, both + * John Smith and Smith John. A third contact with the name John Smith should be aggregated + * with the former rather than the latter. This is why "reverse" matches have slightly lower + * scores than direct matches. + */ + static { + setScoreRange(NameLookupType.NAME_EXACT, + NameLookupType.NAME_EXACT, 99, 99); + setScoreRange(NameLookupType.NAME_VARIANT, + NameLookupType.NAME_VARIANT, 90, 90); + setScoreRange(NameLookupType.NAME_COLLATION_KEY, + NameLookupType.NAME_COLLATION_KEY, 50, 80); + + setScoreRange(NameLookupType.NAME_COLLATION_KEY, + NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); + setScoreRange(NameLookupType.NAME_COLLATION_KEY, + NameLookupType.NICKNAME, 50, 60); + + setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, + NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); + setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, + NameLookupType.NAME_COLLATION_KEY, 50, 60); + setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, + NameLookupType.NICKNAME, 50, 60); + + setScoreRange(NameLookupType.NICKNAME, + NameLookupType.NICKNAME, 50, 60); + setScoreRange(NameLookupType.NICKNAME, + NameLookupType.NAME_COLLATION_KEY, 50, 60); + setScoreRange(NameLookupType.NICKNAME, + NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); + } + + /** + * Populates the cells of the score matrix and score span matrix + * corresponding to the {@code candidateNameType} and {@code nameType}. + */ + private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { + int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; + sMinScore[index] = scoreFrom; + sMaxScore[index] = scoreTo; + } + + /** + * Returns the lower range for the match score for the given {@code candidateNameType} and + * {@code nameType}. + */ + private static int getMinScore(int candidateNameType, int nameType) { + int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; + return sMinScore[index]; + } + + /** + * Returns the upper range for the match score for the given {@code candidateNameType} and + * {@code nameType}. + */ + private static int getMaxScore(int candidateNameType, int nameType) { + int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; + return sMaxScore[index]; + } + + /** + * Captures the max score and match count for a specific contact. Used in an + * contactId - MatchScore map. + */ + public static class MatchScore implements Comparable<MatchScore> { + private long mContactId; + private boolean mKeepIn; + private boolean mKeepOut; + private int mPrimaryScore; + private int mSecondaryScore; + private int mMatchCount; + + public MatchScore(long contactId) { + this.mContactId = contactId; + } + + public void reset(long contactId) { + this.mContactId = contactId; + mKeepIn = false; + mKeepOut = false; + mPrimaryScore = 0; + mSecondaryScore = 0; + mMatchCount = 0; + } + + public long getContactId() { + return mContactId; + } + + public void updatePrimaryScore(int score) { + if (score > mPrimaryScore) { + mPrimaryScore = score; + } + mMatchCount++; + } + + public void updateSecondaryScore(int score) { + if (score > mSecondaryScore) { + mSecondaryScore = score; + } + mMatchCount++; + } + + public void keepIn() { + mKeepIn = true; + } + + public void keepOut() { + mKeepOut = true; + } + + public int getScore() { + if (mKeepOut) { + return 0; + } + + if (mKeepIn) { + return MAX_SCORE; + } + + int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); + + // Ensure that of two contacts with the same match score the one with more matching + // data elements wins. + return score * SCORE_SCALE + mMatchCount; + } + + /** + * Descending order of match score. + */ + @Override + public int compareTo(MatchScore another) { + return another.getScore() - getScore(); + } + + @Override + public String toString() { + return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount + + ")"; + } + } + + private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); + private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); + private int mScoreCount = 0; + + private final NameDistance mNameDistanceConservative = new NameDistance(); + private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); + + private MatchScore getMatchingScore(long contactId) { + MatchScore matchingScore = mScores.get(contactId); + if (matchingScore == null) { + if (mScoreList.size() > mScoreCount) { + matchingScore = mScoreList.get(mScoreCount); + matchingScore.reset(contactId); + } else { + matchingScore = new MatchScore(contactId); + mScoreList.add(matchingScore); + } + mScoreCount++; + mScores.put(contactId, matchingScore); + } + return matchingScore; + } + + /** + * Marks the contact as a full match, because we found an Identity match + */ + public void matchIdentity(long contactId) { + updatePrimaryScore(contactId, MAX_SCORE); + } + + /** + * Checks if there is a match and updates the overall score for the + * specified contact for a discovered match. The new score is determined + * by the prior score, by the type of name we were looking for, the type + * of name we found and, if the match is approximate, the distance between the candidate and + * actual name. + */ + public void matchName(long contactId, int candidateNameType, String candidateName, + int nameType, String name, int algorithm) { + int maxScore = getMaxScore(candidateNameType, nameType); + if (maxScore == 0) { + return; + } + + if (candidateName.equals(name)) { + updatePrimaryScore(contactId, maxScore); + return; + } + + if (algorithm == MATCHING_ALGORITHM_EXACT) { + return; + } + + int minScore = getMinScore(candidateNameType, nameType); + if (minScore == maxScore) { + return; + } + + byte[] decodedCandidateName = Hex.decodeHex(candidateName); + byte[] decodedName = Hex.decodeHex(name); + + NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? + mNameDistanceConservative : mNameDistanceApproximate; + + int score; + float distance = nameDistance.getDistance(decodedCandidateName, decodedName); + boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME + || nameType == NameLookupType.EMAIL_BASED_NICKNAME; + float threshold = emailBased + ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL + : APPROXIMATE_MATCH_THRESHOLD; + if (distance > threshold) { + score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); + } else { + score = 0; + } + + updatePrimaryScore(contactId, score); + } + + public void updateScoreWithPhoneNumberMatch(long contactId) { + updateSecondaryScore(contactId, PHONE_MATCH_SCORE); + } + + public void updateScoreWithEmailMatch(long contactId) { + updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); + } + + public void updateScoreWithNicknameMatch(long contactId) { + updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); + } + + private void updatePrimaryScore(long contactId, int score) { + getMatchingScore(contactId).updatePrimaryScore(score); + } + + private void updateSecondaryScore(long contactId, int score) { + getMatchingScore(contactId).updateSecondaryScore(score); + } + + public void keepIn(long contactId) { + getMatchingScore(contactId).keepIn(); + } + + public void keepOut(long contactId) { + getMatchingScore(contactId).keepOut(); + } + + public void clear() { + mScores.clear(); + mScoreCount = 0; + } + + /** + * Returns a list of IDs for contacts that are matched on secondary data elements + * (phone number, email address, nickname). We still need to obtain the approximate + * primary score for those contacts to determine if any of them should be aggregated. + * <p> + * May return null. + */ + public List<Long> prepareSecondaryMatchCandidates(int threshold) { + ArrayList<Long> contactIds = null; + + for (int i = 0; i < mScoreCount; i++) { + MatchScore score = mScoreList.get(i); + if (score.mKeepOut) { + continue; + } + + int s = score.mSecondaryScore; + if (s >= threshold) { + if (contactIds == null) { + contactIds = new ArrayList<Long>(); + } + contactIds.add(score.mContactId); + } + score.mPrimaryScore = NO_DATA_SCORE; + } + return contactIds; + } + + /** + * Returns the contactId with the best match score over the specified threshold or -1 + * if no such contact is found. + */ + public long pickBestMatch(int threshold, boolean allowMultipleMatches) { + long contactId = -1; + int maxScore = 0; + for (int i = 0; i < mScoreCount; i++) { + MatchScore score = mScoreList.get(i); + if (score.mKeepOut) { + continue; + } + + if (score.mKeepIn) { + return score.mContactId; + } + + int s = score.mPrimaryScore; + if (s == NO_DATA_SCORE) { + s = score.mSecondaryScore; + } + + if (s >= threshold) { + if (contactId != -1 && !allowMultipleMatches) { + return MULTIPLE_MATCHES; + } + if (s > maxScore) { + contactId = score.mContactId; + maxScore = s; + } + } + } + return contactId; + } + + /** + * Returns matches in the order of descending score. + */ + public List<MatchScore> pickBestMatches(int threshold) { + int scaledThreshold = threshold * SCORE_SCALE; + List<MatchScore> matches = mScoreList.subList(0, mScoreCount); + Collections.sort(matches); + int count = 0; + for (int i = 0; i < mScoreCount; i++) { + MatchScore matchScore = matches.get(i); + if (matchScore.getScore() >= scaledThreshold) { + count++; + } else { + break; + } + } + + return matches.subList(0, count); + } + + @Override + public String toString() { + return mScoreList.subList(0, mScoreCount).toString(); + } +} diff --git a/src/com/android/providers/contacts/aggregation/util/NameDistance.java b/src/com/android/providers/contacts/aggregation/util/NameDistance.java new file mode 100644 index 0000000..e357ec1 --- /dev/null +++ b/src/com/android/providers/contacts/aggregation/util/NameDistance.java @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ +package com.android.providers.contacts.aggregation.util; + +import java.util.Arrays; + +/** + * A string distance calculator, particularly suited for name matching. +* <p> + * A detailed discussion of the topic of record linkage in general and name matching + * in particular can be found in this article: + * <blockquote> + * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions". + * Research Report Series, RRS. + * </blockquote> + */ +public class NameDistance { + + private static final float WINKLER_BONUS_THRESHOLD = 0.7f; + private static final int MIN_EXACT_PREFIX_LENGTH = 3; + + private final int mMaxLength; + private final boolean mPrefixOnly; + private final boolean[] mMatchFlags1; + private final boolean[] mMatchFlags2; + + /** + * Constructor. + * + * @param maxLength byte arrays are truncated if longer than this number + */ + public NameDistance(int maxLength) { + mMaxLength = maxLength; + mPrefixOnly = false; + mMatchFlags1 = new boolean[maxLength]; + mMatchFlags2 = new boolean[maxLength]; + } + + /** + * Constructor for a matcher that only checks if one string is the exact prefix of the other + */ + public NameDistance() { + mPrefixOnly = true; + mMaxLength = 0; + mMatchFlags1 = mMatchFlags2 = null; + } + + /** + * Computes a string distance between two normalized strings passed as byte arrays. + */ + public float getDistance(byte bytes1[], byte bytes2[]) { + byte[] array1, array2; + + if (bytes1.length > bytes2.length) { + array2 = bytes1; + array1 = bytes2; + } else { + array2 = bytes2; + array1 = bytes1; + } + + int length1 = array1.length; + if (length1 >= MIN_EXACT_PREFIX_LENGTH) { + boolean prefix = true; + for (int i = 0; i < array1.length; i++) { + if (array1[i] != array2[i]) { + prefix = false; + break; + } + } + if (prefix) { + return 1.0f; + } + } + + if (mPrefixOnly) { + return 0; + } + + if (length1 > mMaxLength) { + length1 = mMaxLength; + } + + int length2 = array2.length; + if (length2 > mMaxLength) { + length2 = mMaxLength; + } + + Arrays.fill(mMatchFlags1, 0, length1, false); + Arrays.fill(mMatchFlags2, 0, length2, false); + + int range = length2 / 2 - 1; + if (range < 0) { + range = 0; + } + + int matches = 0; + for (int i = 0; i < length1; i++) { + byte c1 = array1[i]; + + int from = i - range; + if (from < 0) { + from = 0; + } + + int to = i + range + 1; + if (to > length2) { + to = length2; + } + + for (int j = from; j < to; j++) { + if (!mMatchFlags2[j] && c1 == array2[j]) { + mMatchFlags1[i] = mMatchFlags2[j] = true; + matches++; + break; + } + } + } + + if (matches == 0) { + return 0f; + } + + int transpositions = 0; + int j = 0; + for (int i = 0; i < length1; i++) { + if (mMatchFlags1[i]) { + while (!mMatchFlags2[j]) { + j++; + } + if (array1[i] != array2[j]) { + transpositions++; + } + j++; + } + } + + float m = matches; + float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3; + + if (jaro < WINKLER_BONUS_THRESHOLD) { + return jaro; + } + + // Add Winkler bonus + int prefix = 0; + for (int i = 0; i < length1; i++) { + if (bytes1[i] != bytes2[i]) { + break; + } + prefix++; + } + + return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro); + } +} |