diff options
Diffstat (limited to 'src/org/apache/commons/codec/language/Metaphone.java')
-rw-r--r-- | src/org/apache/commons/codec/language/Metaphone.java | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/src/org/apache/commons/codec/language/Metaphone.java b/src/org/apache/commons/codec/language/Metaphone.java new file mode 100644 index 0000000..dce2c72 --- /dev/null +++ b/src/org/apache/commons/codec/language/Metaphone.java @@ -0,0 +1,399 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a metaphone value. + * <p> + * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. + * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. + * </p> + * <p> + * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p + * 39.</CITE> + * </p> + * + * @author Apache Software Foundation + * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $ + */ +public class Metaphone implements StringEncoder { + + /** + * Five values in the English language + */ + private String vowels = "AEIOU" ; + + /** + * Variable used in Metaphone algorithm + */ + private String frontv = "EIY" ; + + /** + * Variable used in Metaphone algorithm + */ + private String varson = "CSPTG" ; + + /** + * The max code length for metaphone is 4 + */ + private int maxCodeLen = 4 ; + + /** + * Creates an instance of the Metaphone encoder + */ + public Metaphone() { + super(); + } + + /** + * Find the metaphone value of a String. This is similar to the + * soundex algorithm, but better at finding similar sounding words. + * All input is converted to upper case. + * Limitations: Input format is expected to be a single ASCII word + * with only characters in the A - Z range, no punctuation or numbers. + * + * @param txt String to find the metaphone code for + * @return A metaphone code corresponding to the String supplied + */ + public String metaphone(String txt) { + boolean hard = false ; + if ((txt == null) || (txt.length() == 0)) { + return "" ; + } + // single character is itself + if (txt.length() == 1) { + return txt.toUpperCase() ; + } + + char[] inwd = txt.toUpperCase().toCharArray() ; + + StringBuffer local = new StringBuffer(40); // manipulate + StringBuffer code = new StringBuffer(10) ; // output + // handle initial 2 characters exceptions + switch(inwd[0]) { + case 'K' : + case 'G' : + case 'P' : /* looking for KN, etc*/ + if (inwd[1] == 'N') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'A': /* looking for AE */ + if (inwd[1] == 'E') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'W' : /* looking for WR or WH */ + if (inwd[1] == 'R') { // WR -> R + local.append(inwd, 1, inwd.length - 1); + break ; + } + if (inwd[1] == 'H') { + local.append(inwd, 1, inwd.length - 1); + local.setCharAt(0, 'W'); // WH -> W + } else { + local.append(inwd); + } + break; + case 'X' : /* initial X becomes S */ + inwd[0] = 'S'; + local.append(inwd); + break ; + default : + local.append(inwd); + } // now local has working string with initials fixed + + int wdsz = local.length(); + int n = 0 ; + + while ((code.length() < this.getMaxCodeLen()) && + (n < wdsz) ) { // max code size of 4 works well + char symb = local.charAt(n) ; + // remove duplicate letters except C + if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { + n++ ; + } else { // not dup + switch(symb) { + case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : + if (n == 0) { + code.append(symb); + } + break ; // only use vowel if leading char + case 'B' : + if ( isPreviousChar(local, n, 'M') && + isLastChar(wdsz, n) ) { // B is silent if word ends in MB + break; + } + code.append(symb); + break; + case 'C' : // lots of C special cases + /* discard if SCI, SCE or SCY */ + if ( isPreviousChar(local, n, 'S') && + !isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { + break; + } + if (regionMatch(local, n, "CIA")) { // "CIA" -> X + code.append('X'); + break; + } + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { + code.append('S'); + break; // CI,CE,CY -> S + } + if (isPreviousChar(local, n, 'S') && + isNextChar(local, n, 'H') ) { // SCH->sk + code.append('K') ; + break ; + } + if (isNextChar(local, n, 'H')) { // detect CH + if ((n == 0) && + (wdsz >= 3) && + isVowel(local,2) ) { // CH consonant -> K consonant + code.append('K'); + } else { + code.append('X'); // CHvowel -> X + } + } else { + code.append('K'); + } + break ; + case 'D' : + if (!isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'G') && + (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J + code.append('J'); n += 2 ; + } else { + code.append('T'); + } + break ; + case 'G' : // GH silent at end or before consonant + if (isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'H')) { + break; + } + if (!isLastChar(wdsz, n + 1) && + isNextChar(local,n,'H') && + !isVowel(local,n+2)) { + break; + } + if ((n > 0) && + ( regionMatch(local, n, "GN") || + regionMatch(local, n, "GNED") ) ) { + break; // silent G + } + if (isPreviousChar(local, n, 'G')) { + hard = true ; + } else { + hard = false ; + } + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && + (!hard)) { + code.append('J'); + } else { + code.append('K'); + } + break ; + case 'H': + if (isLastChar(wdsz, n)) { + break ; // terminal H + } + if ((n > 0) && + (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { + break; + } + if (isVowel(local,n+1)) { + code.append('H'); // Hvowel + } + break; + case 'F': + case 'J' : + case 'L' : + case 'M': + case 'N' : + case 'R' : + code.append(symb); + break; + case 'K' : + if (n > 0) { // not initial + if (!isPreviousChar(local, n, 'C')) { + code.append(symb); + } + } else { + code.append(symb); // initial K + } + break ; + case 'P' : + if (isNextChar(local,n,'H')) { + // PH -> F + code.append('F'); + } else { + code.append(symb); + } + break ; + case 'Q' : + code.append('K'); + break; + case 'S' : + if (regionMatch(local,n,"SH") || + regionMatch(local,n,"SIO") || + regionMatch(local,n,"SIA")) { + code.append('X'); + } else { + code.append('S'); + } + break; + case 'T' : + if (regionMatch(local,n,"TIA") || + regionMatch(local,n,"TIO")) { + code.append('X'); + break; + } + if (regionMatch(local,n,"TCH")) { + // Silent if in "TCH" + break; + } + // substitute numeral 0 for TH (resembles theta after all) + if (regionMatch(local,n,"TH")) { + code.append('0'); + } else { + code.append('T'); + } + break ; + case 'V' : + code.append('F'); break ; + case 'W' : case 'Y' : // silent if not followed by vowel + if (!isLastChar(wdsz,n) && + isVowel(local,n+1)) { + code.append(symb); + } + break ; + case 'X' : + code.append('K'); code.append('S'); + break ; + case 'Z' : + code.append('S'); break ; + } // end switch + n++ ; + } // end else from symb != 'C' + if (code.length() > this.getMaxCodeLen()) { + code.setLength(this.getMaxCodeLen()); + } + } + return code.toString(); + } + + private boolean isVowel(StringBuffer string, int index) { + return (this.vowels.indexOf(string.charAt(index)) >= 0); + } + + private boolean isPreviousChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index > 0 && + index < string.length() ) { + matches = string.charAt(index - 1) == c; + } + return matches; + } + + private boolean isNextChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index >= 0 && + index < string.length() - 1 ) { + matches = string.charAt(index + 1) == c; + } + return matches; + } + + private boolean regionMatch(StringBuffer string, int index, String test) { + boolean matches = false; + if( index >= 0 && + (index + test.length() - 1) < string.length() ) { + String substring = string.substring( index, index + test.length()); + matches = substring.equals( test ); + } + return matches; + } + + private boolean isLastChar(int wdsz, int n) { + return n + 1 == wdsz; + } + + + /** + * Encodes an Object using the metaphone algorithm. This method + * is provided in order to satisfy the requirements of the + * Encoder interface, and will throw an EncoderException if the + * supplied object is not of type java.lang.String. + * + * @param pObject Object to encode + * @return An object (or type java.lang.String) containing the + * metaphone code which corresponds to the String supplied. + * @throws EncoderException if the parameter supplied is not + * of type java.lang.String + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof java.lang.String)) { + throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); + } + return metaphone((String) pObject); + } + + /** + * Encodes a String using the Metaphone algorithm. + * + * @param pString String object to encode + * @return The metaphone code corresponding to the String supplied + */ + public String encode(String pString) { + return metaphone(pString); + } + + /** + * Tests is the metaphones of two strings are identical. + * + * @param str1 First of two strings to compare + * @param str2 Second of two strings to compare + * @return true if the metaphones of these strings are identical, + * false otherwise. + */ + public boolean isMetaphoneEqual(String str1, String str2) { + return metaphone(str1).equals(metaphone(str2)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { return this.maxCodeLen; } + + /** + * Sets the maxCodeLen. + * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } + +} |