diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 19:29:16 -0800 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 19:29:16 -0800 |
commit | 069490a5ca2fd1988d29daf45d892f47ad665115 (patch) | |
tree | aea04c65769a1d9e3ca6fde36a7d23bd91dbeb98 /src/org/apache/commons/codec | |
parent | e5d9544310b857f3ee9ec172bdbff8077323f9a1 (diff) | |
download | external_apache-http-069490a5ca2fd1988d29daf45d892f47ad665115.zip external_apache-http-069490a5ca2fd1988d29daf45d892f47ad665115.tar.gz external_apache-http-069490a5ca2fd1988d29daf45d892f47ad665115.tar.bz2 |
auto import from //depot/cupcake/@135843
Diffstat (limited to 'src/org/apache/commons/codec')
28 files changed, 5172 insertions, 0 deletions
diff --git a/src/org/apache/commons/codec/BinaryDecoder.java b/src/org/apache/commons/codec/BinaryDecoder.java new file mode 100644 index 0000000..7aebabf --- /dev/null +++ b/src/org/apache/commons/codec/BinaryDecoder.java @@ -0,0 +1,41 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Defines common decoding methods for byte array decoders. + * + * @author Apache Software Foundation + * @version $Id: BinaryDecoder.java,v 1.10 2004/06/15 18:14:15 ggregory Exp $ + */ +public interface BinaryDecoder extends Decoder { + + /** + * Decodes a byte array and returns the results as a byte array. + * + * @param pArray A byte array which has been encoded with the + * appropriate encoder + * + * @return a byte array that contains decoded content + * + * @throws DecoderException A decoder exception is thrown + * if a Decoder encounters a failure condition during + * the decode process. + */ + byte[] decode(byte[] pArray) throws DecoderException; +} + diff --git a/src/org/apache/commons/codec/BinaryEncoder.java b/src/org/apache/commons/codec/BinaryEncoder.java new file mode 100644 index 0000000..52859ed --- /dev/null +++ b/src/org/apache/commons/codec/BinaryEncoder.java @@ -0,0 +1,41 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Defines common encoding methods for byte array encoders. + * + * @author Apache Software Foundation + * @version $Id: BinaryEncoder.java,v 1.10 2004/02/29 04:08:31 tobrien Exp $ + */ +public interface BinaryEncoder extends Encoder { + + /** + * Encodes a byte array and return the encoded data + * as a byte array. + * + * @param pArray Data to be encoded + * + * @return A byte array containing the encoded data + * + * @throws EncoderException thrown if the Encoder + * encounters a failure condition during the + * encoding process. + */ + byte[] encode(byte[] pArray) throws EncoderException; +} + diff --git a/src/org/apache/commons/codec/Decoder.java b/src/org/apache/commons/codec/Decoder.java new file mode 100644 index 0000000..184920c --- /dev/null +++ b/src/org/apache/commons/codec/Decoder.java @@ -0,0 +1,54 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * <p>Provides the highest level of abstraction for Decoders. + * This is the sister interface of {@link Encoder}. All + * Decoders implement this common generic interface.</p> + * + * <p>Allows a user to pass a generic Object to any Decoder + * implementation in the codec package.</p> + * + * <p>One of the two interfaces at the center of the codec package.</p> + * + * @author Apache Software Foundation + * @version $Id: Decoder.java,v 1.9 2004/02/29 04:08:31 tobrien Exp $ + */ +public interface Decoder { + + /** + * Decodes an "encoded" Object and returns a "decoded" + * Object. Note that the implementation of this + * interface will try to cast the Object parameter + * to the specific type expected by a particular Decoder + * implementation. If a {@link java.lang.ClassCastException} occurs + * this decode method will throw a DecoderException. + * + * @param pObject an object to "decode" + * + * @return a "decoded" object + * + * @throws DecoderException a decoder exception can + * be thrown for any number of reasons. Some good + * candidates are that the parameter passed to this + * method is null, or a param cannot be cast to the + * appropriate type for a specific decoder. + */ + Object decode(Object pObject) throws DecoderException; +} + diff --git a/src/org/apache/commons/codec/DecoderException.java b/src/org/apache/commons/codec/DecoderException.java new file mode 100644 index 0000000..f35c016 --- /dev/null +++ b/src/org/apache/commons/codec/DecoderException.java @@ -0,0 +1,37 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Thrown when a Decoder has encountered a failure condition during a decode. + * + * @author Apache Software Foundation + * @version $Id: DecoderException.java,v 1.9 2004/02/29 04:08:31 tobrien Exp $ + */ +public class DecoderException extends Exception { + + /** + * Creates a DecoderException + * + * @param pMessage A message with meaning to a human + */ + public DecoderException(String pMessage) { + super(pMessage); + } + +} + diff --git a/src/org/apache/commons/codec/Encoder.java b/src/org/apache/commons/codec/Encoder.java new file mode 100644 index 0000000..fa339ee --- /dev/null +++ b/src/org/apache/commons/codec/Encoder.java @@ -0,0 +1,45 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * <p>Provides the highest level of abstraction for Encoders. + * This is the sister interface of {@link Decoder}. 
Every implementation of + * Encoder provides this common generic interface which allows a user to pass a + * generic Object to any Encoder implementation in the codec package.</p> + * + * @author Apache Software Foundation + * @version $Id: Encoder.java,v 1.10 2004/02/29 04:08:31 tobrien Exp $ + */ +public interface Encoder { + + /** + * Encodes an "Object" and returns the encoded content + * as an Object. The Objects here may just be <code>byte[]</code> + * or <code>String</code>s depending on the implementation used. + * + * @param pObject An object to encode + * + * @return An "encoded" Object + * + * @throws EncoderException an encoder exception is + * thrown if the encoder experiences a failure + * condition during the encoding process. + */ + Object encode(Object pObject) throws EncoderException; +} + diff --git a/src/org/apache/commons/codec/EncoderException.java b/src/org/apache/commons/codec/EncoderException.java new file mode 100644 index 0000000..0e202c1 --- /dev/null +++ b/src/org/apache/commons/codec/EncoderException.java @@ -0,0 +1,39 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Thrown when there is a failure condition during the encoding process. 
This + * exception is thrown when an Encoder encounters an encoding-specific exception + * such as invalid data, inability to calculate a checksum, or characters outside of the + * expected range. + * + * @author Apache Software Foundation + * @version $Id: EncoderException.java,v 1.10 2004/02/29 04:08:31 tobrien Exp $ + */ +public class EncoderException extends Exception { + + /** + * Creates a new instance of this exception with a useful message. + * + * @param pMessage a useful message relating to the encoder specific error. + */ + public EncoderException(String pMessage) { + super(pMessage); + } +} + diff --git a/src/org/apache/commons/codec/StringDecoder.java b/src/org/apache/commons/codec/StringDecoder.java new file mode 100644 index 0000000..9b1a0cd --- /dev/null +++ b/src/org/apache/commons/codec/StringDecoder.java @@ -0,0 +1,39 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Decodes a String into a String. + * + * @author Apache Software Foundation + * @version $Id: StringDecoder.java,v 1.9 2004/02/29 04:08:31 tobrien Exp $ + */ +public interface StringDecoder extends Decoder { + + /** + * Decodes a String and returns a String. + * + * @param pString a String to decode + * + * @return the decoded String + * + * @throws DecoderException thrown if there is + * an error condition during the decoding process. 
+ */ + String decode(String pString) throws DecoderException; +} + diff --git a/src/org/apache/commons/codec/StringEncoder.java b/src/org/apache/commons/codec/StringEncoder.java new file mode 100644 index 0000000..46f5404 --- /dev/null +++ b/src/org/apache/commons/codec/StringEncoder.java @@ -0,0 +1,39 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +/** + * Encodes a String into a String. + * + * @author Apache Software Foundation + * @version $Id: StringEncoder.java,v 1.9 2004/02/29 04:08:31 tobrien Exp $ + */ +public interface StringEncoder extends Encoder { + + /** + * Encodes a String and returns a String. + * + * @param pString a String to encode + * + * @return the encoded String + * + * @throws EncoderException thrown if there is + * an error condition during the encoding process. + */ + String encode(String pString) throws EncoderException; +} + diff --git a/src/org/apache/commons/codec/StringEncoderComparator.java b/src/org/apache/commons/codec/StringEncoderComparator.java new file mode 100644 index 0000000..6d29af2 --- /dev/null +++ b/src/org/apache/commons/codec/StringEncoderComparator.java @@ -0,0 +1,83 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec; + +import java.util.Comparator; + +/** + * Strings are comparable, and this comparator allows + * you to configure it with an instance of a class + * which implements StringEncoder. This comparator + * is used to sort Strings by an encoding scheme such + * as Soundex, Metaphone, etc. This class can come in + * handy if one need to sort Strings by an encoded + * form of a name such as Soundex. + * + * @author Apache Software Foundation + * @version $Id: StringEncoderComparator.java,v 1.14 2004/06/21 23:24:17 ggregory Exp $ + */ +public class StringEncoderComparator implements Comparator { + + /** + * Internal encoder instance. + */ + private StringEncoder stringEncoder; + + /** + * Constructs a new instance. + */ + public StringEncoderComparator() { + // no init. + } + + /** + * Constructs a new instance with the given algorithm. + * @param stringEncoder the StringEncoder used for comparisons. + */ + public StringEncoderComparator(StringEncoder stringEncoder) { + this.stringEncoder = stringEncoder; + } + + /** + * Compares two strings based not on the strings + * themselves, but on an encoding of the two + * strings using the StringEncoder this Comparator + * was created with. + * + * If an {@link EncoderException} is encountered, return <code>0</code>. + * + * @param o1 the object to compare + * @param o2 the object to compare to + * @return the Comparable.compareTo() return code or 0 if an encoding error was caught. 
+ * @see Comparable + */ + public int compare(Object o1, Object o2) { + + int compareCode = 0; + + try { + Comparable s1 = (Comparable) ((Encoder) this.stringEncoder).encode(o1); + Comparable s2 = (Comparable) ((Encoder) this.stringEncoder).encode(o2); + compareCode = s1.compareTo(s2); + } + catch (EncoderException ee) { + compareCode = 0; + } + return compareCode; + } + +} diff --git a/src/org/apache/commons/codec/binary/Base64.java b/src/org/apache/commons/codec/binary/Base64.java new file mode 100644 index 0000000..ea479e9 --- /dev/null +++ b/src/org/apache/commons/codec/binary/Base64.java @@ -0,0 +1,524 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.binary; + +import org.apache.commons.codec.BinaryDecoder; +import org.apache.commons.codec.BinaryEncoder; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; + +/** + * Provides Base64 encoding and decoding as defined by RFC 2045. + * + * <p>This class implements section <cite>6.8. 
Base64 Content-Transfer-Encoding</cite> + * from RFC 2045 <cite>Multipurpose Internet Mail Extensions (MIME) Part One: + * Format of Internet Message Bodies</cite> by Freed and Borenstein.</p> + * + * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045</a> + * @author Apache Software Foundation + * @since 1.0-dev + * @version $Id: Base64.java,v 1.20 2004/05/24 00:21:24 ggregory Exp $ + */ +public class Base64 implements BinaryEncoder, BinaryDecoder { + + /** + * Chunk size per RFC 2045 section 6.8. + * + * <p>The {@value} character limit does not count the trailing CRLF, but counts + * all other characters, including any equal signs.</p> + * + * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 6.8</a> + */ + static final int CHUNK_SIZE = 76; + + /** + * Chunk separator per RFC 2045 section 2.1. + * + * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a> + */ + static final byte[] CHUNK_SEPARATOR = "\r\n".getBytes(); + + /** + * The base length. + */ + static final int BASELENGTH = 255; + + /** + * Lookup length. + */ + static final int LOOKUPLENGTH = 64; + + /** + * Used to calculate the number of bits in a byte. + */ + static final int EIGHTBIT = 8; + + /** + * Used when encoding something which has fewer than 24 bits. + */ + static final int SIXTEENBIT = 16; + + /** + * Used to determine how many bits data contains. + */ + static final int TWENTYFOURBITGROUP = 24; + + /** + * Used to get the number of Quadruples. + */ + static final int FOURBYTE = 4; + + /** + * Used to test the sign of a byte. + */ + static final int SIGN = -128; + + /** + * Byte used to pad output. 
+ */ + static final byte PAD = (byte) '='; + + // Create arrays to hold the base64 characters and a + // lookup for base64 chars + private static byte[] base64Alphabet = new byte[BASELENGTH]; + private static byte[] lookUpBase64Alphabet = new byte[LOOKUPLENGTH]; + + // Populating the lookup and character arrays + static { + for (int i = 0; i < BASELENGTH; i++) { + base64Alphabet[i] = (byte) -1; + } + for (int i = 'Z'; i >= 'A'; i--) { + base64Alphabet[i] = (byte) (i - 'A'); + } + for (int i = 'z'; i >= 'a'; i--) { + base64Alphabet[i] = (byte) (i - 'a' + 26); + } + for (int i = '9'; i >= '0'; i--) { + base64Alphabet[i] = (byte) (i - '0' + 52); + } + + base64Alphabet['+'] = 62; + base64Alphabet['/'] = 63; + + for (int i = 0; i <= 25; i++) { + lookUpBase64Alphabet[i] = (byte) ('A' + i); + } + + for (int i = 26, j = 0; i <= 51; i++, j++) { + lookUpBase64Alphabet[i] = (byte) ('a' + j); + } + + for (int i = 52, j = 0; i <= 61; i++, j++) { + lookUpBase64Alphabet[i] = (byte) ('0' + j); + } + + lookUpBase64Alphabet[62] = (byte) '+'; + lookUpBase64Alphabet[63] = (byte) '/'; + } + + private static boolean isBase64(byte octect) { + if (octect == PAD) { + return true; + } else if (base64Alphabet[octect] == -1) { + return false; + } else { + return true; + } + } + + /** + * Tests a given byte array to see if it contains + * only valid characters within the Base64 alphabet. + * + * @param arrayOctect byte array to test + * @return true if all bytes are valid characters in the Base64 + * alphabet or if the byte array is empty; false, otherwise + */ + public static boolean isArrayByteBase64(byte[] arrayOctect) { + + arrayOctect = discardWhitespace(arrayOctect); + + int length = arrayOctect.length; + if (length == 0) { + // shouldn't a 0 length array be valid base64 data? 
+ // return false; + return true; + } + for (int i = 0; i < length; i++) { + if (!isBase64(arrayOctect[i])) { + return false; + } + } + return true; + } + + /** + * Encodes binary data using the base64 algorithm but + * does not chunk the output. + * + * @param binaryData binary data to encode + * @return Base64 characters + */ + public static byte[] encodeBase64(byte[] binaryData) { + return encodeBase64(binaryData, false); + } + + /** + * Encodes binary data using the base64 algorithm and chunks + * the encoded output into 76 character blocks + * + * @param binaryData binary data to encode + * @return Base64 characters chunked in 76 character blocks + */ + public static byte[] encodeBase64Chunked(byte[] binaryData) { + return encodeBase64(binaryData, true); + } + + + /** + * Decodes an Object using the base64 algorithm. This method + * is provided in order to satisfy the requirements of the + * Decoder interface, and will throw a DecoderException if the + * supplied object is not of type byte[]. + * + * @param pObject Object to decode + * @return An object (of type byte[]) containing the + * binary data which corresponds to the byte[] supplied. + * @throws DecoderException if the parameter supplied is not + * of type byte[] + */ + public Object decode(Object pObject) throws DecoderException { + if (!(pObject instanceof byte[])) { + throw new DecoderException("Parameter supplied to Base64 decode is not a byte[]"); + } + return decode((byte[]) pObject); + } + + /** + * Decodes a byte[] containing containing + * characters in the Base64 alphabet. + * + * @param pArray A byte array containing Base64 character data + * @return a byte array containing binary data + */ + public byte[] decode(byte[] pArray) { + return decodeBase64(pArray); + } + + /** + * Encodes binary data using the base64 algorithm, optionally + * chunking the output into 76 character blocks. + * + * @param binaryData Array containing binary data to encode. 
+ * @param isChunked if isChunked is true this encoder will chunk + * the base64 output into 76 character blocks + * @return Base64-encoded data. + */ + public static byte[] encodeBase64(byte[] binaryData, boolean isChunked) { + int lengthDataBits = binaryData.length * EIGHTBIT; + int fewerThan24bits = lengthDataBits % TWENTYFOURBITGROUP; + int numberTriplets = lengthDataBits / TWENTYFOURBITGROUP; + byte encodedData[] = null; + int encodedDataLength = 0; + int nbrChunks = 0; + + if (fewerThan24bits != 0) { + //data not divisible by 24 bit + encodedDataLength = (numberTriplets + 1) * 4; + } else { + // 16 or 8 bit + encodedDataLength = numberTriplets * 4; + } + + // If the output is to be "chunked" into 76 character sections, + // for compliance with RFC 2045 MIME, then it is important to + // allow for extra length to account for the separator(s) + if (isChunked) { + + nbrChunks = + (CHUNK_SEPARATOR.length == 0 ? 0 : (int) Math.ceil((float) encodedDataLength / CHUNK_SIZE)); + encodedDataLength += nbrChunks * CHUNK_SEPARATOR.length; + } + + encodedData = new byte[encodedDataLength]; + + byte k = 0, l = 0, b1 = 0, b2 = 0, b3 = 0; + + int encodedIndex = 0; + int dataIndex = 0; + int i = 0; + int nextSeparatorIndex = CHUNK_SIZE; + int chunksSoFar = 0; + + //log.debug("number of triplets = " + numberTriplets); + for (i = 0; i < numberTriplets; i++) { + dataIndex = i * 3; + b1 = binaryData[dataIndex]; + b2 = binaryData[dataIndex + 1]; + b3 = binaryData[dataIndex + 2]; + + //log.debug("b1= " + b1 +", b2= " + b2 + ", b3= " + b3); + + l = (byte) (b2 & 0x0f); + k = (byte) (b1 & 0x03); + + byte val1 = + ((b1 & SIGN) == 0) ? (byte) (b1 >> 2) : (byte) ((b1) >> 2 ^ 0xc0); + byte val2 = + ((b2 & SIGN) == 0) ? (byte) (b2 >> 4) : (byte) ((b2) >> 4 ^ 0xf0); + byte val3 = + ((b3 & SIGN) == 0) ? 
(byte) (b3 >> 6) : (byte) ((b3) >> 6 ^ 0xfc); + + encodedData[encodedIndex] = lookUpBase64Alphabet[val1]; + //log.debug( "val2 = " + val2 ); + //log.debug( "k4 = " + (k<<4) ); + //log.debug( "vak = " + (val2 | (k<<4)) ); + encodedData[encodedIndex + 1] = + lookUpBase64Alphabet[val2 | (k << 4)]; + encodedData[encodedIndex + 2] = + lookUpBase64Alphabet[(l << 2) | val3]; + encodedData[encodedIndex + 3] = lookUpBase64Alphabet[b3 & 0x3f]; + + encodedIndex += 4; + + // If we are chunking, let's put a chunk separator down. + if (isChunked) { + // this assumes that CHUNK_SIZE % 4 == 0 + if (encodedIndex == nextSeparatorIndex) { + System.arraycopy( + CHUNK_SEPARATOR, + 0, + encodedData, + encodedIndex, + CHUNK_SEPARATOR.length); + chunksSoFar++; + nextSeparatorIndex = + (CHUNK_SIZE * (chunksSoFar + 1)) + + (chunksSoFar * CHUNK_SEPARATOR.length); + encodedIndex += CHUNK_SEPARATOR.length; + } + } + } + + // form integral number of 6-bit groups + dataIndex = i * 3; + + if (fewerThan24bits == EIGHTBIT) { + b1 = binaryData[dataIndex]; + k = (byte) (b1 & 0x03); + //log.debug("b1=" + b1); + //log.debug("b1<<2 = " + (b1>>2) ); + byte val1 = + ((b1 & SIGN) == 0) ? (byte) (b1 >> 2) : (byte) ((b1) >> 2 ^ 0xc0); + encodedData[encodedIndex] = lookUpBase64Alphabet[val1]; + encodedData[encodedIndex + 1] = lookUpBase64Alphabet[k << 4]; + encodedData[encodedIndex + 2] = PAD; + encodedData[encodedIndex + 3] = PAD; + } else if (fewerThan24bits == SIXTEENBIT) { + + b1 = binaryData[dataIndex]; + b2 = binaryData[dataIndex + 1]; + l = (byte) (b2 & 0x0f); + k = (byte) (b1 & 0x03); + + byte val1 = + ((b1 & SIGN) == 0) ? (byte) (b1 >> 2) : (byte) ((b1) >> 2 ^ 0xc0); + byte val2 = + ((b2 & SIGN) == 0) ? 
(byte) (b2 >> 4) : (byte) ((b2) >> 4 ^ 0xf0); + + encodedData[encodedIndex] = lookUpBase64Alphabet[val1]; + encodedData[encodedIndex + 1] = + lookUpBase64Alphabet[val2 | (k << 4)]; + encodedData[encodedIndex + 2] = lookUpBase64Alphabet[l << 2]; + encodedData[encodedIndex + 3] = PAD; + } + + if (isChunked) { + // we also add a separator to the end of the final chunk. + if (chunksSoFar < nbrChunks) { + System.arraycopy( + CHUNK_SEPARATOR, + 0, + encodedData, + encodedDataLength - CHUNK_SEPARATOR.length, + CHUNK_SEPARATOR.length); + } + } + + return encodedData; + } + + /** + * Decodes Base64 data into octects + * + * @param base64Data Byte array containing Base64 data + * @return Array containing decoded data. + */ + public static byte[] decodeBase64(byte[] base64Data) { + // RFC 2045 requires that we discard ALL non-Base64 characters + base64Data = discardNonBase64(base64Data); + + // handle the edge case, so we don't have to worry about it later + if (base64Data.length == 0) { + return new byte[0]; + } + + int numberQuadruple = base64Data.length / FOURBYTE; + byte decodedData[] = null; + byte b1 = 0, b2 = 0, b3 = 0, b4 = 0, marker0 = 0, marker1 = 0; + + // Throw away anything not in base64Data + + int encodedIndex = 0; + int dataIndex = 0; + { + // this sizes the output array properly - rlw + int lastData = base64Data.length; + // ignore the '=' padding + while (base64Data[lastData - 1] == PAD) { + if (--lastData == 0) { + return new byte[0]; + } + } + decodedData = new byte[lastData - numberQuadruple]; + } + + for (int i = 0; i < numberQuadruple; i++) { + dataIndex = i * 4; + marker0 = base64Data[dataIndex + 2]; + marker1 = base64Data[dataIndex + 3]; + + b1 = base64Alphabet[base64Data[dataIndex]]; + b2 = base64Alphabet[base64Data[dataIndex + 1]]; + + if (marker0 != PAD && marker1 != PAD) { + //No PAD e.g 3cQl + b3 = base64Alphabet[marker0]; + b4 = base64Alphabet[marker1]; + + decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4); + decodedData[encodedIndex + 1] = 
+ (byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf)); + decodedData[encodedIndex + 2] = (byte) (b3 << 6 | b4); + } else if (marker0 == PAD) { + //Two PAD e.g. 3c[Pad][Pad] + decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4); + } else if (marker1 == PAD) { + //One PAD e.g. 3cQ[Pad] + b3 = base64Alphabet[marker0]; + + decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4); + decodedData[encodedIndex + 1] = + (byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf)); + } + encodedIndex += 3; + } + return decodedData; + } + + /** + * Discards any whitespace from a base-64 encoded block. + * + * @param data The base-64 encoded data to discard the whitespace + * from. + * @return The data, less whitespace (see RFC 2045). + */ + static byte[] discardWhitespace(byte[] data) { + byte groomedData[] = new byte[data.length]; + int bytesCopied = 0; + + for (int i = 0; i < data.length; i++) { + switch (data[i]) { + case (byte) ' ' : + case (byte) '\n' : + case (byte) '\r' : + case (byte) '\t' : + break; + default: + groomedData[bytesCopied++] = data[i]; + } + } + + byte packedData[] = new byte[bytesCopied]; + + System.arraycopy(groomedData, 0, packedData, 0, bytesCopied); + + return packedData; + } + + /** + * Discards any characters outside of the base64 alphabet, per + * the requirements on page 25 of RFC 2045 - "Any characters + * outside of the base64 alphabet are to be ignored in base64 + * encoded data." + * + * @param data The base-64 encoded data to groom + * @return The data, less non-base64 characters (see RFC 2045). 
+ */ + static byte[] discardNonBase64(byte[] data) { + byte groomedData[] = new byte[data.length]; + int bytesCopied = 0; + + for (int i = 0; i < data.length; i++) { + if (isBase64(data[i])) { + groomedData[bytesCopied++] = data[i]; + } + } + + byte packedData[] = new byte[bytesCopied]; + + System.arraycopy(groomedData, 0, packedData, 0, bytesCopied); + + return packedData; + } + + + // Implementation of the Encoder Interface + + /** + * Encodes an Object using the base64 algorithm. This method + * is provided in order to satisfy the requirements of the + * Encoder interface, and will throw an EncoderException if the + * supplied object is not of type byte[]. + * + * @param pObject Object to encode + * @return An object (of type byte[]) containing the + * base64 encoded data which corresponds to the byte[] supplied. + * @throws EncoderException if the parameter supplied is not + * of type byte[] + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof byte[])) { + throw new EncoderException( + "Parameter supplied to Base64 encode is not a byte[]"); + } + return encode((byte[]) pObject); + } + + /** + * Encodes a byte[] containing binary data, into a byte[] containing + * characters in the Base64 alphabet. + * + * @param pArray a byte array containing binary data + * @return A byte array containing only Base64 character data + */ + public byte[] encode(byte[] pArray) { + return encodeBase64(pArray, false); + } + +} diff --git a/src/org/apache/commons/codec/binary/BinaryCodec.java b/src/org/apache/commons/codec/binary/BinaryCodec.java new file mode 100644 index 0000000..98c6409 --- /dev/null +++ b/src/org/apache/commons/codec/binary/BinaryCodec.java @@ -0,0 +1,285 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.binary; + +import org.apache.commons.codec.BinaryDecoder; +import org.apache.commons.codec.BinaryEncoder; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; + +/** + * Translates between byte arrays and strings of "0"s and "1"s. + * + * <b>TODO:</b> may want to add more bit vector functions like and/or/xor/nand. + * <B>TODO:</b> also might be good to generate boolean[] + * from byte[] et. cetera. + * + * @author Apache Software Foundation + * @since 1.3 + * @version $Id $ + */ +public class BinaryCodec implements BinaryDecoder, BinaryEncoder { + /* + * tried to avoid using ArrayUtils to minimize dependencies while using these empty arrays - dep is just not worth + * it. + */ + /** Empty char array. */ + private static final char[] EMPTY_CHAR_ARRAY = new char[0]; + + /** Empty byte array. */ + private static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; + + /** Mask for bit 0 of a byte. */ + private static final int BIT_0 = 1; + + /** Mask for bit 1 of a byte. */ + private static final int BIT_1 = 0x02; + + /** Mask for bit 2 of a byte. */ + private static final int BIT_2 = 0x04; + + /** Mask for bit 3 of a byte. */ + private static final int BIT_3 = 0x08; + + /** Mask for bit 4 of a byte. */ + private static final int BIT_4 = 0x10; + + /** Mask for bit 5 of a byte. */ + private static final int BIT_5 = 0x20; + + /** Mask for bit 6 of a byte. */ + private static final int BIT_6 = 0x40; + + /** Mask for bit 7 of a byte. 
*/ + private static final int BIT_7 = 0x80; + + private static final int[] BITS = {BIT_0, BIT_1, BIT_2, BIT_3, BIT_4, BIT_5, BIT_6, BIT_7}; + + /** + * Converts an array of raw binary data into an array of ascii 0 and 1 characters. + * + * @param raw + * the raw binary data to convert + * @return 0 and 1 ascii character bytes one for each bit of the argument + * @see org.apache.commons.codec.BinaryEncoder#encode(byte[]) + */ + public byte[] encode(byte[] raw) { + return toAsciiBytes(raw); + } + + /** + * Converts an array of raw binary data into an array of ascii 0 and 1 chars. + * + * @param raw + * the raw binary data to convert + * @return 0 and 1 ascii character chars one for each bit of the argument + * @throws EncoderException + * if the argument is not a byte[] + * @see org.apache.commons.codec.Encoder#encode(java.lang.Object) + */ + public Object encode(Object raw) throws EncoderException { + if (!(raw instanceof byte[])) { + throw new EncoderException("argument not a byte array"); + } + return toAsciiChars((byte[]) raw); + } + + /** + * Decodes a byte array where each byte represents an ascii '0' or '1'. + * + * @param ascii + * each byte represents an ascii '0' or '1' + * @return the raw encoded binary where each bit corresponds to a byte in the byte array argument + * @throws DecoderException + * if argument is not a byte[], char[] or String + * @see org.apache.commons.codec.Decoder#decode(java.lang.Object) + */ + public Object decode(Object ascii) throws DecoderException { + if (ascii == null) { + return EMPTY_BYTE_ARRAY; + } + if (ascii instanceof byte[]) { + return fromAscii((byte[]) ascii); + } + if (ascii instanceof char[]) { + return fromAscii((char[]) ascii); + } + if (ascii instanceof String) { + return fromAscii(((String) ascii).toCharArray()); + } + throw new DecoderException("argument not a byte array"); + } + + /** + * Decodes a byte array where each byte represents an ascii '0' or '1'. 
+ * + * @param ascii + * each byte represents an ascii '0' or '1' + * @return the raw encoded binary where each bit corresponds to a byte in the byte array argument + * @see org.apache.commons.codec.Decoder#decode(Object) + */ + public byte[] decode(byte[] ascii) { + return fromAscii(ascii); + } + + /** + * Decodes a String where each char of the String represents an ascii '0' or '1'. + * + * @param ascii + * String of '0' and '1' characters + * @return the raw encoded binary where each bit corresponds to a byte in the byte array argument + * @see org.apache.commons.codec.Decoder#decode(Object) + */ + public byte[] toByteArray(String ascii) { + if (ascii == null) { + return EMPTY_BYTE_ARRAY; + } + return fromAscii(ascii.toCharArray()); + } + + // ------------------------------------------------------------------------ + // + // static codec operations + // + // ------------------------------------------------------------------------ + /** + * Decodes a byte array where each char represents an ascii '0' or '1'. + * + * @param ascii + * each char represents an ascii '0' or '1' + * @return the raw encoded binary where each bit corresponds to a char in the char array argument + */ + public static byte[] fromAscii(char[] ascii) { + if (ascii == null || ascii.length == 0) { + return EMPTY_BYTE_ARRAY; + } + // get length/8 times bytes with 3 bit shifts to the right of the length + byte[] l_raw = new byte[ascii.length >> 3]; + /* + * We decr index jj by 8 as we go along to not recompute indices using multiplication every time inside the + * loop. + */ + for (int ii = 0, jj = ascii.length - 1; ii < l_raw.length; ii++, jj -= 8) { + for (int bits = 0; bits < BITS.length; ++bits) { + if (ascii[jj - bits] == '1') { + l_raw[ii] |= BITS[bits]; + } + } + } + return l_raw; + } + + /** + * Decodes a byte array where each byte represents an ascii '0' or '1'. 
+ * + * @param ascii + * each byte represents an ascii '0' or '1' + * @return the raw encoded binary where each bit corresponds to a byte in the byte array argument + */ + public static byte[] fromAscii(byte[] ascii) { + if (ascii == null || ascii.length == 0) { + return EMPTY_BYTE_ARRAY; + } + // get length/8 times bytes with 3 bit shifts to the right of the length + byte[] l_raw = new byte[ascii.length >> 3]; + /* + * We decr index jj by 8 as we go along to not recompute indices using multiplication every time inside the + * loop. + */ + for (int ii = 0, jj = ascii.length - 1; ii < l_raw.length; ii++, jj -= 8) { + for (int bits = 0; bits < BITS.length; ++bits) { + if (ascii[jj - bits] == '1') { + l_raw[ii] |= BITS[bits]; + } + } + } + return l_raw; + } + + /** + * Converts an array of raw binary data into an array of ascii 0 and 1 character bytes - each byte is a truncated + * char. + * + * @param raw + * the raw binary data to convert + * @return an array of 0 and 1 character bytes for each bit of the argument + * @see org.apache.commons.codec.BinaryEncoder#encode(byte[]) + */ + public static byte[] toAsciiBytes(byte[] raw) { + if (raw == null || raw.length == 0) { + return EMPTY_BYTE_ARRAY; + } + // get 8 times the bytes with 3 bit shifts to the left of the length + byte[] l_ascii = new byte[raw.length << 3]; + /* + * We decr index jj by 8 as we go along to not recompute indices using multiplication every time inside the + * loop. + */ + for (int ii = 0, jj = l_ascii.length - 1; ii < raw.length; ii++, jj -= 8) { + for (int bits = 0; bits < BITS.length; ++bits) { + if ((raw[ii] & BITS[bits]) == 0) { + l_ascii[jj - bits] = '0'; + } else { + l_ascii[jj - bits] = '1'; + } + } + } + return l_ascii; + } + + /** + * Converts an array of raw binary data into an array of ascii 0 and 1 characters. 
+ * + * @param raw + * the raw binary data to convert + * @return an array of 0 and 1 characters for each bit of the argument + * @see org.apache.commons.codec.BinaryEncoder#encode(byte[]) + */ + public static char[] toAsciiChars(byte[] raw) { + if (raw == null || raw.length == 0) { + return EMPTY_CHAR_ARRAY; + } + // get 8 times the bytes with 3 bit shifts to the left of the length + char[] l_ascii = new char[raw.length << 3]; + /* + * We decr index jj by 8 as we go along to not recompute indices using multiplication every time inside the + * loop. + */ + for (int ii = 0, jj = l_ascii.length - 1; ii < raw.length; ii++, jj -= 8) { + for (int bits = 0; bits < BITS.length; ++bits) { + if ((raw[ii] & BITS[bits]) == 0) { + l_ascii[jj - bits] = '0'; + } else { + l_ascii[jj - bits] = '1'; + } + } + } + return l_ascii; + } + + /** + * Converts an array of raw binary data into a String of ascii 0 and 1 characters. + * + * @param raw + * the raw binary data to convert + * @return a String of 0 and 1 characters representing the binary data + * @see org.apache.commons.codec.BinaryEncoder#encode(byte[]) + */ + public static String toAsciiString(byte[] raw) { + return new String(toAsciiChars(raw)); + } +} diff --git a/src/org/apache/commons/codec/binary/Hex.java b/src/org/apache/commons/codec/binary/Hex.java new file mode 100644 index 0000000..78f5510 --- /dev/null +++ b/src/org/apache/commons/codec/binary/Hex.java @@ -0,0 +1,192 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.binary; + +import org.apache.commons.codec.BinaryDecoder; +import org.apache.commons.codec.BinaryEncoder; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; + +/** + * Hex encoder and decoder. + * + * @since 1.1 + * @author Apache Software Foundation + * @version $Id: Hex.java,v 1.13 2004/04/18 18:22:33 ggregory Exp $ + */ +public class Hex implements BinaryEncoder, BinaryDecoder { + + /** + * Used building output as Hex + */ + private static final char[] DIGITS = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + + /** + * Converts an array of characters representing hexidecimal values into an + * array of bytes of those same values. The returned array will be half the + * length of the passed array, as it takes two characters to represent any + * given byte. An exception is thrown if the passed char array has an odd + * number of elements. + * + * @param data An array of characters containing hexidecimal digits + * @return A byte array containing binary data decoded from + * the supplied char array. + * @throws DecoderException Thrown if an odd number or illegal of characters + * is supplied + */ + public static byte[] decodeHex(char[] data) throws DecoderException { + + int len = data.length; + + if ((len & 0x01) != 0) { + throw new DecoderException("Odd number of characters."); + } + + byte[] out = new byte[len >> 1]; + + // two characters form the hex value. + for (int i = 0, j = 0; j < len; i++) { + int f = toDigit(data[j], j) << 4; + j++; + f = f | toDigit(data[j], j); + j++; + out[i] = (byte) (f & 0xFF); + } + + return out; + } + + /** + * Converts a hexadecimal character to an integer. 
+ * + * @param ch A character to convert to an integer digit + * @param index The index of the character in the source + * @return An integer + * @throws DecoderException Thrown if ch is an illegal hex character + */ + protected static int toDigit(char ch, int index) throws DecoderException { + int digit = Character.digit(ch, 16); + if (digit == -1) { + throw new DecoderException("Illegal hexadecimal charcter " + ch + " at index " + index); + } + return digit; + } + + /** + * Converts an array of bytes into an array of characters representing the hexidecimal values of each byte in order. + * The returned array will be double the length of the passed array, as it takes two characters to represent any + * given byte. + * + * @param data + * a byte[] to convert to Hex characters + * @return A char[] containing hexidecimal characters + */ + public static char[] encodeHex(byte[] data) { + + int l = data.length; + + char[] out = new char[l << 1]; + + // two characters form the hex value. + for (int i = 0, j = 0; i < l; i++) { + out[j++] = DIGITS[(0xF0 & data[i]) >>> 4 ]; + out[j++] = DIGITS[ 0x0F & data[i] ]; + } + + return out; + } + + /** + * Converts an array of character bytes representing hexidecimal values into an + * array of bytes of those same values. The returned array will be half the + * length of the passed array, as it takes two characters to represent any + * given byte. An exception is thrown if the passed char array has an odd + * number of elements. + * + * @param array An array of character bytes containing hexidecimal digits + * @return A byte array containing binary data decoded from + * the supplied byte array (representing characters). 
+ * @throws DecoderException Thrown if an odd number of characters is supplied + * to this function + * @see #decodeHex(char[]) + */ + public byte[] decode(byte[] array) throws DecoderException { + return decodeHex(new String(array).toCharArray()); + } + + /** + * Converts a String or an array of character bytes representing hexidecimal values into an + * array of bytes of those same values. The returned array will be half the + * length of the passed String or array, as it takes two characters to represent any + * given byte. An exception is thrown if the passed char array has an odd + * number of elements. + * + * @param object A String or, an array of character bytes containing hexidecimal digits + * @return A byte array containing binary data decoded from + * the supplied byte array (representing characters). + * @throws DecoderException Thrown if an odd number of characters is supplied + * to this function or the object is not a String or char[] + * @see #decodeHex(char[]) + */ + public Object decode(Object object) throws DecoderException { + try { + char[] charArray = object instanceof String ? ((String) object).toCharArray() : (char[]) object; + return decodeHex(charArray); + } catch (ClassCastException e) { + throw new DecoderException(e.getMessage()); + } + } + + /** + * Converts an array of bytes into an array of bytes for the characters representing the + * hexidecimal values of each byte in order. The returned array will be + * double the length of the passed array, as it takes two characters to + * represent any given byte. + * + * @param array a byte[] to convert to Hex characters + * @return A byte[] containing the bytes of the hexidecimal characters + * @see #encodeHex(byte[]) + */ + public byte[] encode(byte[] array) { + return new String(encodeHex(array)).getBytes(); + } + + /** + * Converts a String or an array of bytes into an array of characters representing the + * hexidecimal values of each byte in order. 
The returned array will be + * double the length of the passed String or array, as it takes two characters to + * represent any given byte. + * + * @param object a String, or byte[] to convert to Hex characters + * @return A char[] containing hexidecimal characters + * @throws EncoderException Thrown if the given object is not a String or byte[] + * @see #encodeHex(byte[]) + */ + public Object encode(Object object) throws EncoderException { + try { + byte[] byteArray = object instanceof String ? ((String) object).getBytes() : (byte[]) object; + return encodeHex(byteArray); + } catch (ClassCastException e) { + throw new EncoderException(e.getMessage()); + } + } + +} + diff --git a/src/org/apache/commons/codec/binary/package.html b/src/org/apache/commons/codec/binary/package.html new file mode 100644 index 0000000..844d918 --- /dev/null +++ b/src/org/apache/commons/codec/binary/package.html @@ -0,0 +1,20 @@ +<!-- +Copyright 2003-2004 The Apache Software Foundation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--> +<html> + <body> + Base64, Binary, and Hexadecimal String encoding and decoding. + </body> +</html> diff --git a/src/org/apache/commons/codec/language/DoubleMetaphone.java b/src/org/apache/commons/codec/language/DoubleMetaphone.java new file mode 100644 index 0000000..1cad991 --- /dev/null +++ b/src/org/apache/commons/codec/language/DoubleMetaphone.java @@ -0,0 +1,1103 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a double metaphone value. + * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>. + * <ul> + * <li>Original Article: <a + * href="http://www.cuj.com/documents/s=8038/cuj0006philips/"> + * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li> + * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip"> + * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li> + * </ul> + * + * @author Apache Software Foundation + * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $ + */ +public class DoubleMetaphone implements StringEncoder { + + /** + * "Vowels" to test for + */ + private static final String VOWELS = "AEIOUY"; + + /** + * Prefixes when present which are not pronounced + */ + private static final String[] SILENT_START = + { "GN", "KN", "PN", "WR", "PS" }; + private static final String[] L_R_N_M_B_H_F_V_W_SPACE = + { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; + private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = + { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; + private static final String[] L_T_K_S_N_M_B_Z = + { "L", "T", "K", "S", "N", "M", "B", "Z" }; + + /** + * Maximum length of an encoding, default 
is 4 + */ + protected int maxCodeLen = 4; + + /** + * Creates an instance of this DoubleMetaphone encoder + */ + public DoubleMetaphone() { + super(); + } + + /** + * Encode a value with Double Metaphone + * + * @param value String to encode + * @return an encoded string + */ + public String doubleMetaphone(String value) { + return doubleMetaphone(value, false); + } + + /** + * Encode a value with Double Metaphone, optionally using the alternate + * encoding. + * + * @param value String to encode + * @param alternate use alternate encode + * @return an encoded string + */ + public String doubleMetaphone(String value, boolean alternate) { + value = cleanInput(value); + if (value == null) { + return null; + } + + boolean slavoGermanic = isSlavoGermanic(value); + int index = isSilentStart(value) ? 1 : 0; + + DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); + + while (!result.isComplete() && index <= value.length() - 1) { + switch (value.charAt(index)) { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + index = handleAEIOUY(value, result, index); + break; + case 'B': + result.append('P'); + index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; + break; + case '\u00C7': + // A C with a Cedilla + result.append('S'); + index++; + break; + case 'C': + index = handleC(value, result, index); + break; + case 'D': + index = handleD(value, result, index); + break; + case 'F': + result.append('F'); + index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; + break; + case 'G': + index = handleG(value, result, index, slavoGermanic); + break; + case 'H': + index = handleH(value, result, index); + break; + case 'J': + index = handleJ(value, result, index, slavoGermanic); + break; + case 'K': + result.append('K'); + index = charAt(value, index + 1) == 'K' ? 
index + 2 : index + 1; + break; + case 'L': + index = handleL(value, result, index); + break; + case 'M': + result.append('M'); + index = conditionM0(value, index) ? index + 2 : index + 1; + break; + case 'N': + result.append('N'); + index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; + break; + case '\u00D1': + // N with a tilde (spanish ene) + result.append('N'); + index++; + break; + case 'P': + index = handleP(value, result, index); + break; + case 'Q': + result.append('K'); + index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; + break; + case 'R': + index = handleR(value, result, index, slavoGermanic); + break; + case 'S': + index = handleS(value, result, index, slavoGermanic); + break; + case 'T': + index = handleT(value, result, index); + break; + case 'V': + result.append('F'); + index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; + break; + case 'W': + index = handleW(value, result, index); + break; + case 'X': + index = handleX(value, result, index); + break; + case 'Z': + index = handleZ(value, result, index, slavoGermanic); + break; + default: + index++; + break; + } + } + + return alternate ? result.getAlternate() : result.getPrimary(); + } + + /** + * Encode the value using DoubleMetaphone. It will only work if + * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). + * + * @param obj Object to encode (should be of type String) + * @return An encoded Object (will be of type String) + * @throws EncoderException encode parameter is not of type String + */ + public Object encode(Object obj) throws EncoderException { + if (!(obj instanceof String)) { + throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); + } + return doubleMetaphone((String) obj); + } + + /** + * Encode the value using DoubleMetaphone. 
+ * + * @param value String to encode + * @return An encoded String + */ + public String encode(String value) { + return doubleMetaphone(value); + } + + /** + * Check if the Double Metaphone values of two <code>String</code> values + * are equal. + * + * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. + * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. + * @return <code>true</code> if the encoded <code>String</code>s are equal; + * <code>false</code> otherwise. + * @see #isDoubleMetaphoneEqual(String,String,boolean) + */ + public boolean isDoubleMetaphoneEqual(String value1, String value2) { + return isDoubleMetaphoneEqual(value1, value2, false); + } + + /** + * Check if the Double Metaphone values of two <code>String</code> values + * are equal, optionally using the alternate value. + * + * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. + * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. + * @param alternate use the alternate value if <code>true</code>. + * @return <code>true</code> if the encoded <code>String</code>s are equal; + * <code>false</code> otherwise. + */ + public boolean isDoubleMetaphoneEqual(String value1, + String value2, + boolean alternate) { + return doubleMetaphone(value1, alternate).equals(doubleMetaphone + (value2, alternate)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { + return this.maxCodeLen; + } + + /** + * Sets the maxCodeLen. 
+ * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { + this.maxCodeLen = maxCodeLen; + } + + //-- BEGIN HANDLERS --// + + /** + * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases + */ + private int handleAEIOUY(String value, DoubleMetaphoneResult result, int + index) { + if (index == 0) { + result.append('A'); + } + return index + 1; + } + + /** + * Handles 'C' cases + */ + private int handleC(String value, + DoubleMetaphoneResult result, + int index) { + if (conditionC0(value, index)) { // very confusing, moved out + result.append('K'); + index += 2; + } else if (index == 0 && contains(value, index, 6, "CAESAR")) { + result.append('S'); + index += 2; + } else if (contains(value, index, 2, "CH")) { + index = handleCH(value, result, index); + } else if (contains(value, index, 2, "CZ") && + !contains(value, index - 2, 4, "WICZ")) { + //-- "Czerny" --// + result.append('S', 'X'); + index += 2; + } else if (contains(value, index + 1, 3, "CIA")) { + //-- "focaccia" --// + result.append('X'); + index += 3; + } else if (contains(value, index, 2, "CC") && + !(index == 1 && charAt(value, 0) == 'M')) { + //-- double "cc" but not "McClelland" --// + return handleCC(value, result, index); + } else if (contains(value, index, 2, "CK", "CG", "CQ")) { + result.append('K'); + index += 2; + } else if (contains(value, index, 2, "CI", "CE", "CY")) { + //-- Italian vs. 
English --// + if (contains(value, index, 3, "CIO", "CIE", "CIA")) { + result.append('S', 'X'); + } else { + result.append('S'); + } + index += 2; + } else { + result.append('K'); + if (contains(value, index + 1, 2, " C", " Q", " G")) { + //-- Mac Caffrey, Mac Gregor --// + index += 3; + } else if (contains(value, index + 1, 1, "C", "K", "Q") && + !contains(value, index + 1, 2, "CE", "CI")) { + index += 2; + } else { + index++; + } + } + + return index; + } + + /** + * Handles 'CC' cases + */ + private int handleCC(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index + 2, 1, "I", "E", "H") && + !contains(value, index + 2, 2, "HU")) { + //-- "bellocchio" but not "bacchus" --// + if ((index == 1 && charAt(value, index - 1) == 'A') || + contains(value, index - 1, 5, "UCCEE", "UCCES")) { + //-- "accident", "accede", "succeed" --// + result.append("KS"); + } else { + //-- "bacci", "bertucci", other Italian --// + result.append('X'); + } + index += 3; + } else { // Pierce's rule + result.append('K'); + index += 2; + } + + return index; + } + + /** + * Handles 'CH' cases + */ + private int handleCH(String value, + DoubleMetaphoneResult result, + int index) { + if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael + result.append('K', 'X'); + return index + 2; + } else if (conditionCH0(value, index)) { + //-- Greek roots ("chemistry", "chorus", etc.) 
--// + result.append('K'); + return index + 2; + } else if (conditionCH1(value, index)) { + //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// + result.append('K'); + return index + 2; + } else { + if (index > 0) { + if (contains(value, 0, 2, "MC")) { + result.append('K'); + } else { + result.append('X', 'K'); + } + } else { + result.append('X'); + } + return index + 2; + } + } + + /** + * Handles 'D' cases + */ + private int handleD(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index, 2, "DG")) { + //-- "Edge" --// + if (contains(value, index + 2, 1, "I", "E", "Y")) { + result.append('J'); + index += 3; + //-- "Edgar" --// + } else { + result.append("TK"); + index += 2; + } + } else if (contains(value, index, 2, "DT", "DD")) { + result.append('T'); + index += 2; + } else { + result.append('T'); + index++; + } + return index; + } + + /** + * Handles 'G' cases + */ + private int handleG(String value, + DoubleMetaphoneResult result, + int index, + boolean slavoGermanic) { + if (charAt(value, index + 1) == 'H') { + index = handleGH(value, result, index); + } else if (charAt(value, index + 1) == 'N') { + if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { + result.append("KN", "N"); + } else if (!contains(value, index + 2, 2, "EY") && + charAt(value, index + 1) != 'Y' && !slavoGermanic) { + result.append("N", "KN"); + } else { + result.append("KN"); + } + index = index + 2; + } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { + result.append("KL", "L"); + index += 2; + } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { + //-- -ges-, -gep-, -gel-, -gie- at beginning --// + result.append('K', 'J'); + index += 2; + } else if ((contains(value, index + 1, 2, "ER") || + charAt(value, index + 1) == 'Y') && + !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && + !contains(value, index - 1, 1, "E", "I") && + 
!contains(value, index - 1, 3, "RGY", "OGY")) { + //-- -ger-, -gy- --// + result.append('K', 'J'); + index += 2; + } else if (contains(value, index + 1, 1, "E", "I", "Y") || + contains(value, index - 1, 4, "AGGI", "OGGI")) { + //-- Italian "biaggi" --// + if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { + //-- obvious germanic --// + result.append('K'); + } else if (contains(value, index + 1, 4, "IER")) { + result.append('J'); + } else { + result.append('J', 'K'); + } + index += 2; + } else if (charAt(value, index + 1) == 'G') { + index += 2; + result.append('K'); + } else { + index++; + result.append('K'); + } + return index; + } + + /** + * Handles 'GH' cases + */ + private int handleGH(String value, + DoubleMetaphoneResult result, + int index) { + if (index > 0 && !isVowel(charAt(value, index - 1))) { + result.append('K'); + index += 2; + } else if (index == 0) { + if (charAt(value, index + 2) == 'I') { + result.append('J'); + } else { + result.append('K'); + } + index += 2; + } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || + (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || + (index > 3 && contains(value, index - 4, 1, "B", "H"))) { + //-- Parker's rule (with some further refinements) - "hugh" + index += 2; + } else { + if (index > 2 && charAt(value, index - 1) == 'U' && + contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { + //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" + result.append('F'); + } else if (index > 0 && charAt(value, index - 1) != 'I') { + result.append('K'); + } + index += 2; + } + return index; + } + + /** + * Handles 'H' cases + */ + private int handleH(String value, + DoubleMetaphoneResult result, + int index) { + //-- only keep if first & before vowel or between 2 vowels --// + if ((index == 0 || isVowel(charAt(value, index - 1))) && + isVowel(charAt(value, index + 1))) { + result.append('H'); + index += 2; + 
//-- also takes car of "HH" --// + } else { + index++; + } + return index; + } + + /** + * Handles 'J' cases + */ + private int handleJ(String value, DoubleMetaphoneResult result, int index, + boolean slavoGermanic) { + if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { + //-- obvious Spanish, "Jose", "San Jacinto" --// + if ((index == 0 && (charAt(value, index + 4) == ' ') || + value.length() == 4) || contains(value, 0, 4, "SAN ")) { + result.append('H'); + } else { + result.append('J', 'H'); + } + index++; + } else { + if (index == 0 && !contains(value, index, 4, "JOSE")) { + result.append('J', 'A'); + } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && + (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { + result.append('J', 'H'); + } else if (index == value.length() - 1) { + result.append('J', ' '); + } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { + result.append('J'); + } + + if (charAt(value, index + 1) == 'J') { + index += 2; + } else { + index++; + } + } + return index; + } + + /** + * Handles 'L' cases + */ + private int handleL(String value, + DoubleMetaphoneResult result, + int index) { + result.append('L'); + if (charAt(value, index + 1) == 'L') { + if (conditionL0(value, index)) { + result.appendAlternate(' '); + } + index += 2; + } else { + index++; + } + return index; + } + + /** + * Handles 'P' cases + */ + private int handleP(String value, + DoubleMetaphoneResult result, + int index) { + if (charAt(value, index + 1) == 'H') { + result.append('F'); + index += 2; + } else { + result.append('P'); + index = contains(value, index + 1, 1, "P", "B") ? 
/**
 * Handles 'S' cases.
 * <p>
 * The rules are tested in order; the first one that matches decides both
 * what is appended to <code>result</code> and how many characters are
 * consumed. "SC" groups are delegated to <code>handleSC</code>.
 *
 * @param value the string being encoded
 * @param result accumulator for the primary and alternate codes
 * @param index position of the 'S' in <code>value</code>
 * @param slavoGermanic whether the caller classified <code>value</code> as
 *        slavo-germanic; suppresses the alternate 'X' for "SIO"/"SIA"
 * @return the position at which encoding should continue
 */
private int handleS(String value,
                    DoubleMetaphoneResult result,
                    int index,
                    boolean slavoGermanic) {
    if (contains(value, index - 1, 3, "ISL", "YSL")) {
        //-- special cases "island", "isle", "carlisle", "carlysle" --//
        //-- the 'S' is silent: nothing is appended --//
        index++;
    } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
        //-- special case "sugar-" --//
        result.append('X', 'S');
        index++;
    } else if (contains(value, index, 2, "SH")) {
        if (contains(value, index + 1, 4,
                     "HEIM", "HOEK", "HOLM", "HOLZ")) {
            //-- germanic --//
            result.append('S');
        } else {
            result.append('X');
        }
        index += 2;
    } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
        //-- Italian and Armenian --//
        //-- ("SIAN" always starts with "SIA", so the first test already covers it) --//
        if (slavoGermanic) {
            result.append('S');
        } else {
            result.append('S', 'X');
        }
        index += 3;
    } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
        //-- german & anglicisations, e.g. "smith" match "schmidt" //
        //   "snider" match "schneider" --//
        //-- also, -sz- in slavic language altho in hungarian it //
        //   is pronounced "s" --//
        result.append('S', 'X');
        //-- "SZ" is consumed as a pair, the other matches as one letter --//
        index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
    } else if (contains(value, index, 2, "SC")) {
        index = handleSC(value, result, index);
    } else {
        if (index == value.length() - 1 && contains(value, index - 2,
                                                    2, "AI", "OI")){
            //-- french e.g. "resnais", "artois" --//
            //-- silent in the primary code, kept in the alternate --//
            result.appendAlternate('S');
        } else {
            result.append('S');
        }
        //-- a following 'S' or 'Z' is swallowed together with this 'S' --//
        index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
    }
    return index;
}
index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'W' cases + */ + private int handleW(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index, 2, "WR")) { + //-- can also be in middle of word --// + result.append('R'); + index += 2; + } else { + if (index == 0 && (isVowel(charAt(value, index + 1)) || + contains(value, index, 2, "WH"))) { + if (isVowel(charAt(value, index + 1))) { + //-- Wasserman should match Vasserman --// + result.append('A', 'F'); + } else { + //-- need Uomo to match Womo --// + result.append('A'); + } + index++; + } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || + contains(value, index - 1, + 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || + contains(value, 0, 3, "SCH")) { + //-- Arnow should match Arnoff --// + result.appendAlternate('F'); + index++; + } else if (contains(value, index, 4, "WICZ", "WITZ")) { + //-- Polish e.g. "filipowicz" --// + result.append("TS", "FX"); + index += 4; + } else { + index++; + } + } + return index; + } + + /** + * Handles 'X' cases + */ + private int handleX(String value, + DoubleMetaphoneResult result, + int index) { + if (index == 0) { + result.append('S'); + index++; + } else { + if (!((index == value.length() - 1) && + (contains(value, index - 3, 3, "IAU", "EAU") || + contains(value, index - 2, 2, "AU", "OU")))) { + //-- French e.g. breaux --// + result.append("KS"); + } + index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'Z' cases + */ + private int handleZ(String value, DoubleMetaphoneResult result, int index, + boolean slavoGermanic) { + if (charAt(value, index + 1) == 'H') { + //-- Chinese pinyin e.g. 
"zhao" or Angelina "Zhang" --// + result.append('J'); + index += 2; + } else { + if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { + result.append("S", "TS"); + } else { + result.append('S'); + } + index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; + } + return index; + } + + //-- BEGIN CONDITIONS --// + + /** + * Complex condition 0 for 'C' + */ + private boolean conditionC0(String value, int index) { + if (contains(value, index, 4, "CHIA")) { + return true; + } else if (index <= 1) { + return false; + } else if (isVowel(charAt(value, index - 2))) { + return false; + } else if (!contains(value, index - 1, 3, "ACH")) { + return false; + } else { + char c = charAt(value, index + 2); + return (c != 'I' && c != 'E') + || contains(value, index - 2, 6, "BACHER", "MACHER"); + } + } + + /** + * Complex condition 0 for 'CH' + */ + private boolean conditionCH0(String value, int index) { + if (index != 0) { + return false; + } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && + !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { + return false; + } else if (contains(value, 0, 5, "CHORE")) { + return false; + } else { + return true; + } + } + + /** + * Complex condition 1 for 'CH' + */ + private boolean conditionCH1(String value, int index) { + return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, + 3, "SCH")) || + contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || + contains(value, index + 2, 1, "T", "S") || + ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && + (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); + } + + /** + * Complex condition 0 for 'L' + */ + private boolean conditionL0(String value, int index) { + if (index == value.length() - 3 && + contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { + return true; + } else if ((contains(value, index - 1, 2, "AS", "OS") 
|| + contains(value, value.length() - 1, 1, "A", "O")) && + contains(value, index - 1, 4, "ALLE")) { + return true; + } else { + return false; + } + } + + /** + * Complex condition 0 for 'M' + */ + private boolean conditionM0(String value, int index) { + if (charAt(value, index + 1) == 'M') { + return true; + } + return contains(value, index - 1, 3, "UMB") + && ((index + 1) == value.length() - 1 || contains(value, + index + 2, 2, "ER")); + } + + //-- BEGIN HELPER FUNCTIONS --// + + /** + * Determines whether or not a value is of slavo-germanic orgin. A value is + * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. + */ + private boolean isSlavoGermanic(String value) { + return value.indexOf('W') > -1 || value.indexOf('K') > -1 || + value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; + } + + /** + * Determines whether or not a character is a vowel or not + */ + private boolean isVowel(char ch) { + return VOWELS.indexOf(ch) != -1; + } + + /** + * Determines whether or not the value starts with a silent letter. It will + * return <code>true</code> if the value starts with any of 'GN', 'KN', + * 'PN', 'WR' or 'PS'. 
+ */ + private boolean isSilentStart(String value) { + boolean result = false; + for (int i = 0; i < SILENT_START.length; i++) { + if (value.startsWith(SILENT_START[i])) { + result = true; + break; + } + } + return result; + } + + /** + * Cleans the input + */ + private String cleanInput(String input) { + if (input == null) { + return null; + } + input = input.trim(); + if (input.length() == 0) { + return null; + } + return input.toUpperCase(); + } + + /** + * Gets the character at index <code>index</code> if available, otherwise + * it returns <code>Character.MIN_VALUE</code> so that there is some sort + * of a default + */ + protected char charAt(String value, int index) { + if (index < 0 || index >= value.length()) { + return Character.MIN_VALUE; + } + return value.charAt(index); + } + + /** + * Shortcut method with 1 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria) { + return contains(value, start, length, + new String[] { criteria }); + } + + /** + * Shortcut method with 2 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2) { + return contains(value, start, length, + new String[] { criteria1, criteria2 }); + } + + /** + * Shortcut method with 3 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3 }); + } + + /** + * Shortcut method with 4 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3, String criteria4) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4 }); + } + + /** + * Shortcut method with 5 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String 
criteria2, + String criteria3, String criteria4, + String criteria5) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4, criteria5 }); + } + + /** + * Shortcut method with 6 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3, String criteria4, + String criteria5, String criteria6) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4, criteria5, criteria6 }); + } + + /** + * Determines whether <code>value</code> contains any of the criteria + starting + * at index <code>start</code> and matching up to length <code>length</code> + */ + protected static boolean contains(String value, int start, int length, + String[] criteria) { + boolean result = false; + if (start >= 0 && start + length <= value.length()) { + String target = value.substring(start, start + length); + + for (int i = 0; i < criteria.length; i++) { + if (target.equals(criteria[i])) { + result = true; + break; + } + } + } + return result; + } + + //-- BEGIN INNER CLASSES --// + + /** + * Inner class for storing results, since there is the optional alternate + * encoding. 
/**
 * Inner class for storing results, since there is the optional alternate
 * encoding.
 * <p>
 * Two codes are accumulated side by side, a primary and an alternate one.
 * Neither is ever allowed to grow beyond the <code>maxLength</code> given
 * to the constructor: single characters are dropped once a code is full
 * and strings are truncated to the remaining room.
 */
public class DoubleMetaphoneResult {

    // getMaxCodeLen() is not declared in this class - presumably it is the
    // enclosing encoder's accessor (TODO confirm). It only sizes the initial
    // buffer capacity; the effective limit is maxLength below.
    private StringBuffer primary = new StringBuffer(getMaxCodeLen());
    private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
    // Maximum number of characters either code may hold.
    private int maxLength;

    /**
     * Creates an empty result.
     *
     * @param maxLength maximum length of both the primary and the
     *        alternate code
     */
    public DoubleMetaphoneResult(int maxLength) {
        this.maxLength = maxLength;
    }

    /**
     * Appends the same character to both codes.
     *
     * @param value character to append
     */
    public void append(char value) {
        appendPrimary(value);
        appendAlternate(value);
    }

    /**
     * Appends one character to the primary code and another to the
     * alternate code.
     *
     * @param primary character for the primary code
     * @param alternate character for the alternate code
     */
    public void append(char primary, char alternate) {
        appendPrimary(primary);
        appendAlternate(alternate);
    }

    /**
     * Appends a character to the primary code unless it is already full.
     *
     * @param value character to append
     */
    public void appendPrimary(char value) {
        if (this.primary.length() < this.maxLength) {
            this.primary.append(value);
        }
    }

    /**
     * Appends a character to the alternate code unless it is already full.
     *
     * @param value character to append
     */
    public void appendAlternate(char value) {
        if (this.alternate.length() < this.maxLength) {
            this.alternate.append(value);
        }
    }

    /**
     * Appends the same string to both codes.
     *
     * @param value string to append
     */
    public void append(String value) {
        appendPrimary(value);
        appendAlternate(value);
    }

    /**
     * Appends one string to the primary code and another to the alternate
     * code.
     *
     * @param primary string for the primary code
     * @param alternate string for the alternate code
     */
    public void append(String primary, String alternate) {
        appendPrimary(primary);
        appendAlternate(alternate);
    }

    /**
     * Appends as much of the string to the primary code as still fits.
     *
     * @param value string to append
     */
    public void appendPrimary(String value) {
        // room left before the code reaches maxLength
        int addChars = this.maxLength - this.primary.length();
        if (value.length() <= addChars) {
            this.primary.append(value);
        } else {
            this.primary.append(value.substring(0, addChars));
        }
    }

    /**
     * Appends as much of the string to the alternate code as still fits.
     *
     * @param value string to append
     */
    public void appendAlternate(String value) {
        // room left before the code reaches maxLength
        int addChars = this.maxLength - this.alternate.length();
        if (value.length() <= addChars) {
            this.alternate.append(value);
        } else {
            this.alternate.append(value.substring(0, addChars));
        }
    }

    /**
     * @return the primary code accumulated so far
     */
    public String getPrimary() {
        return this.primary.toString();
    }

    /**
     * @return the alternate code accumulated so far
     */
    public String getAlternate() {
        return this.alternate.toString();
    }

    /**
     * @return <code>true</code> once both the primary and the alternate
     *         code have reached the maximum length
     */
    public boolean isComplete() {
        return this.primary.length() >= this.maxLength &&
               this.alternate.length() >= this.maxLength;
    }
}
/**
 * The five vowels of the English alphabet; only a leading vowel is
 * encoded, and several consonant rules test for a following vowel
 */
private String vowels = "AEIOU" ;

/**
 * Variable used in Metaphone algorithm: the "front" vowels. A 'C' or 'G'
 * directly followed by one of these letters is encoded as its soft sound
 * ('S' or 'J')
 */
private String frontv = "EIY" ;

/**
 * Variable used in Metaphone algorithm: an 'H' directly preceded by one
 * of these letters is not encoded
 */
private String varson = "CSPTG" ;

/**
 * The max code length for metaphone is 4 by default; it can be changed
 * with setMaxCodeLen
 */
private int maxCodeLen = 4 ;

/**
 * Creates an instance of the Metaphone encoder
 */
public Metaphone() {
    super();
}
+ * Limitations: Input format is expected to be a single ASCII word + * with only characters in the A - Z range, no punctuation or numbers. + * + * @param txt String to find the metaphone code for + * @return A metaphone code corresponding to the String supplied + */ + public String metaphone(String txt) { + boolean hard = false ; + if ((txt == null) || (txt.length() == 0)) { + return "" ; + } + // single character is itself + if (txt.length() == 1) { + return txt.toUpperCase() ; + } + + char[] inwd = txt.toUpperCase().toCharArray() ; + + StringBuffer local = new StringBuffer(40); // manipulate + StringBuffer code = new StringBuffer(10) ; // output + // handle initial 2 characters exceptions + switch(inwd[0]) { + case 'K' : + case 'G' : + case 'P' : /* looking for KN, etc*/ + if (inwd[1] == 'N') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'A': /* looking for AE */ + if (inwd[1] == 'E') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'W' : /* looking for WR or WH */ + if (inwd[1] == 'R') { // WR -> R + local.append(inwd, 1, inwd.length - 1); + break ; + } + if (inwd[1] == 'H') { + local.append(inwd, 1, inwd.length - 1); + local.setCharAt(0, 'W'); // WH -> W + } else { + local.append(inwd); + } + break; + case 'X' : /* initial X becomes S */ + inwd[0] = 'S'; + local.append(inwd); + break ; + default : + local.append(inwd); + } // now local has working string with initials fixed + + int wdsz = local.length(); + int n = 0 ; + + while ((code.length() < this.getMaxCodeLen()) && + (n < wdsz) ) { // max code size of 4 works well + char symb = local.charAt(n) ; + // remove duplicate letters except C + if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { + n++ ; + } else { // not dup + switch(symb) { + case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : + if (n == 0) { + code.append(symb); + } + break ; // only use vowel if leading char + case 'B' : + if ( 
isPreviousChar(local, n, 'M') && + isLastChar(wdsz, n) ) { // B is silent if word ends in MB + break; + } + code.append(symb); + break; + case 'C' : // lots of C special cases + /* discard if SCI, SCE or SCY */ + if ( isPreviousChar(local, n, 'S') && + !isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { + break; + } + if (regionMatch(local, n, "CIA")) { // "CIA" -> X + code.append('X'); + break; + } + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { + code.append('S'); + break; // CI,CE,CY -> S + } + if (isPreviousChar(local, n, 'S') && + isNextChar(local, n, 'H') ) { // SCH->sk + code.append('K') ; + break ; + } + if (isNextChar(local, n, 'H')) { // detect CH + if ((n == 0) && + (wdsz >= 3) && + isVowel(local,2) ) { // CH consonant -> K consonant + code.append('K'); + } else { + code.append('X'); // CHvowel -> X + } + } else { + code.append('K'); + } + break ; + case 'D' : + if (!isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'G') && + (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J + code.append('J'); n += 2 ; + } else { + code.append('T'); + } + break ; + case 'G' : // GH silent at end or before consonant + if (isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'H')) { + break; + } + if (!isLastChar(wdsz, n + 1) && + isNextChar(local,n,'H') && + !isVowel(local,n+2)) { + break; + } + if ((n > 0) && + ( regionMatch(local, n, "GN") || + regionMatch(local, n, "GNED") ) ) { + break; // silent G + } + if (isPreviousChar(local, n, 'G')) { + hard = true ; + } else { + hard = false ; + } + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && + (!hard)) { + code.append('J'); + } else { + code.append('K'); + } + break ; + case 'H': + if (isLastChar(wdsz, n)) { + break ; // terminal H + } + if ((n > 0) && + (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { + break; + } + if (isVowel(local,n+1)) { + code.append('H'); // Hvowel + } + break; + case 'F': + case 'J' : 
+ case 'L' : + case 'M': + case 'N' : + case 'R' : + code.append(symb); + break; + case 'K' : + if (n > 0) { // not initial + if (!isPreviousChar(local, n, 'C')) { + code.append(symb); + } + } else { + code.append(symb); // initial K + } + break ; + case 'P' : + if (isNextChar(local,n,'H')) { + // PH -> F + code.append('F'); + } else { + code.append(symb); + } + break ; + case 'Q' : + code.append('K'); + break; + case 'S' : + if (regionMatch(local,n,"SH") || + regionMatch(local,n,"SIO") || + regionMatch(local,n,"SIA")) { + code.append('X'); + } else { + code.append('S'); + } + break; + case 'T' : + if (regionMatch(local,n,"TIA") || + regionMatch(local,n,"TIO")) { + code.append('X'); + break; + } + if (regionMatch(local,n,"TCH")) { + // Silent if in "TCH" + break; + } + // substitute numeral 0 for TH (resembles theta after all) + if (regionMatch(local,n,"TH")) { + code.append('0'); + } else { + code.append('T'); + } + break ; + case 'V' : + code.append('F'); break ; + case 'W' : case 'Y' : // silent if not followed by vowel + if (!isLastChar(wdsz,n) && + isVowel(local,n+1)) { + code.append(symb); + } + break ; + case 'X' : + code.append('K'); code.append('S'); + break ; + case 'Z' : + code.append('S'); break ; + } // end switch + n++ ; + } // end else from symb != 'C' + if (code.length() > this.getMaxCodeLen()) { + code.setLength(this.getMaxCodeLen()); + } + } + return code.toString(); + } + + private boolean isVowel(StringBuffer string, int index) { + return (this.vowels.indexOf(string.charAt(index)) >= 0); + } + + private boolean isPreviousChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index > 0 && + index < string.length() ) { + matches = string.charAt(index - 1) == c; + } + return matches; + } + + private boolean isNextChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index >= 0 && + index < string.length() - 1 ) { + matches = string.charAt(index + 1) == c; + } + return matches; + } + + private 
boolean regionMatch(StringBuffer string, int index, String test) { + boolean matches = false; + if( index >= 0 && + (index + test.length() - 1) < string.length() ) { + String substring = string.substring( index, index + test.length()); + matches = substring.equals( test ); + } + return matches; + } + + private boolean isLastChar(int wdsz, int n) { + return n + 1 == wdsz; + } + + + /** + * Encodes an Object using the metaphone algorithm. This method + * is provided in order to satisfy the requirements of the + * Encoder interface, and will throw an EncoderException if the + * supplied object is not of type java.lang.String. + * + * @param pObject Object to encode + * @return An object (or type java.lang.String) containing the + * metaphone code which corresponds to the String supplied. + * @throws EncoderException if the parameter supplied is not + * of type java.lang.String + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof java.lang.String)) { + throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); + } + return metaphone((String) pObject); + } + + /** + * Encodes a String using the Metaphone algorithm. + * + * @param pString String object to encode + * @return The metaphone code corresponding to the String supplied + */ + public String encode(String pString) { + return metaphone(pString); + } + + /** + * Tests is the metaphones of two strings are identical. + * + * @param str1 First of two strings to compare + * @param str2 Second of two strings to compare + * @return true if the metaphones of these strings are identical, + * false otherwise. + */ + public boolean isMetaphoneEqual(String str1, String str2) { + return metaphone(str1).equals(metaphone(str2)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { return this.maxCodeLen; } + + /** + * Sets the maxCodeLen. 
+ * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } + +} diff --git a/src/org/apache/commons/codec/language/RefinedSoundex.java b/src/org/apache/commons/codec/language/RefinedSoundex.java new file mode 100644 index 0000000..dbf60fe --- /dev/null +++ b/src/org/apache/commons/codec/language/RefinedSoundex.java @@ -0,0 +1,186 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a Refined Soundex value. A refined soundex code is + * optimized for spell checking words. Soundex method originally developed by + * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>. + * + * @author Apache Software Foundation + * @version $Id: RefinedSoundex.java,v 1.21 2004/06/05 18:32:04 ggregory Exp $ + */ +public class RefinedSoundex implements StringEncoder { + + /** + * This static variable contains an instance of the RefinedSoundex using + * the US_ENGLISH mapping. + */ + public static final RefinedSoundex US_ENGLISH = new RefinedSoundex(); + + /** + * RefinedSoundex is *refined* for a number of reasons one being that the + * mappings have been altered. This implementation contains default + * mappings for US English. 
+ */ + public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray(); + + /** + * Every letter of the alphabet is "mapped" to a numerical value. This char + * array holds the values to which each letter is mapped. This + * implementation contains a default map for US_ENGLISH + */ + private char[] soundexMapping; + + /** + * Creates an instance of the RefinedSoundex object using the default US + * English mapping. + */ + public RefinedSoundex() { + this(US_ENGLISH_MAPPING); + } + + /** + * Creates a refined soundex instance using a custom mapping. This + * constructor can be used to customize the mapping, and/or possibly + * provide an internationalized mapping for a non-Western character set. + * + * @param mapping + * Mapping array to use when finding the corresponding code for + * a given character + */ + public RefinedSoundex(char[] mapping) { + this.soundexMapping = mapping; + } + + // BEGIN android-note + // Removed @see reference to SoundexUtils below, since the class isn't + // public. + // END android-note + /** + * Returns the number of characters in the two encoded Strings that are the + * same. This return value ranges from 0 to the length of the shortest + * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for + * example) indicates strong similarity or identical values. For refined + * Soundex, the return value can be greater than 4. + * + * @param s1 + * A String that will be encoded and compared. + * @param s2 + * A String that will be encoded and compared. + * @return The number of characters in the two encoded Strings that are the + * same from 0 to to the length of the shortest encoded String. 
+ * + * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> + * MS T-SQL DIFFERENCE</a> + * + * @throws EncoderException + * if an error occurs encoding one of the strings + * @since 1.3 + */ + public int difference(String s1, String s2) throws EncoderException { + return SoundexUtils.difference(this, s1, s2); + } + + /** + * Encodes an Object using the refined soundex algorithm. This method is + * provided in order to satisfy the requirements of the Encoder interface, + * and will throw an EncoderException if the supplied object is not of type + * java.lang.String. + * + * @param pObject + * Object to encode + * @return An object (or type java.lang.String) containing the refined + * soundex code which corresponds to the String supplied. + * @throws EncoderException + * if the parameter supplied is not of type java.lang.String + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof java.lang.String)) { + throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String"); + } + return soundex((String) pObject); + } + + /** + * Encodes a String using the refined soundex algorithm. + * + * @param pString + * A String object to encode + * @return A Soundex code corresponding to the String supplied + */ + public String encode(String pString) { + return soundex(pString); + } + + /** + * Returns the mapping code for a given character. The mapping codes are + * maintained in an internal char array named soundexMapping, and the + * default values of these mappings are US English. + * + * @param c + * char to get mapping for + * @return A character (really a numeral) to return for the given char + */ + char getMappingCode(char c) { + if (!Character.isLetter(c)) { + return 0; + } + return this.soundexMapping[Character.toUpperCase(c) - 'A']; + } + + /** + * Retreives the Refined Soundex code for a given String object. 
+ * + * @param str + * String to encode using the Refined Soundex algorithm + * @return A soundex code for the String supplied + */ + public String soundex(String str) { + if (str == null) { + return null; + } + str = SoundexUtils.clean(str); + if (str.length() == 0) { + return str; + } + + StringBuffer sBuf = new StringBuffer(); + sBuf.append(str.charAt(0)); + + char last, current; + last = '*'; + + for (int i = 0; i < str.length(); i++) { + + current = getMappingCode(str.charAt(i)); + if (current == last) { + continue; + } else if (current != 0) { + sBuf.append(current); + } + + last = current; + + } + + return sBuf.toString(); + } +} diff --git a/src/org/apache/commons/codec/language/Soundex.java b/src/org/apache/commons/codec/language/Soundex.java new file mode 100644 index 0000000..61ce440 --- /dev/null +++ b/src/org/apache/commons/codec/language/Soundex.java @@ -0,0 +1,274 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a + * general purpose scheme to find word with similar phonemes. 
+ * + * @author Apache Software Foundation + * @version $Id: Soundex.java,v 1.26 2004/07/07 23:15:24 ggregory Exp $ + */ +public class Soundex implements StringEncoder { + + /** + * An instance of Soundex using the US_ENGLISH_MAPPING mapping. + * + * @see #US_ENGLISH_MAPPING + */ + public static final Soundex US_ENGLISH = new Soundex(); + + /** + * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position + * means do not encode. + * <p> + * (This constant is provided as both an implementation convenience and to allow Javadoc to pick + * up the value for the constant values page.) + * </p> + * + * @see #US_ENGLISH_MAPPING + */ + public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; + + /** + * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position + * means do not encode. + * + * @see Soundex#Soundex(char[]) + */ + public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); + + // BEGIN android-note + // Removed @see reference to SoundexUtils below, since the class isn't + // public. + // END android-note + /** + * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This + * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or + * identical values. + * + * @param s1 + * A String that will be encoded and compared. + * @param s2 + * A String that will be encoded and compared. + * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 
+ * + * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS + * T-SQL DIFFERENCE </a> + * + * @throws EncoderException + * if an error occurs encoding one of the strings + * @since 1.3 + */ + public int difference(String s1, String s2) throws EncoderException { + return SoundexUtils.difference(this, s1, s2); + } + + /** + * The maximum length of a Soundex code - Soundex codes are only four characters by definition. + * + * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. + */ + private int maxLength = 4; + + /** + * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each + * letter is mapped. This implementation contains a default map for US_ENGLISH + */ + private char[] soundexMapping; + + /** + * Creates an instance using US_ENGLISH_MAPPING + * + * @see Soundex#Soundex(char[]) + * @see Soundex#US_ENGLISH_MAPPING + */ + public Soundex() { + this(US_ENGLISH_MAPPING); + } + + /** + * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized + * mapping for a non-Western character set. + * + * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each + * letter is mapped. This implementation contains a default map for US_ENGLISH + * + * @param mapping + * Mapping array to use when finding the corresponding code for a given character + */ + public Soundex(char[] mapping) { + this.setSoundexMapping(mapping); + } + + /** + * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of + * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 
+ * + * @param pObject + * Object to encode + * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String + * supplied. + * @throws EncoderException + * if the parameter supplied is not of type java.lang.String + * @throws IllegalArgumentException + * if a character is not mapped + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof String)) { + throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); + } + return soundex((String) pObject); + } + + /** + * Encodes a String using the soundex algorithm. + * + * @param pString + * A String object to encode + * @return A Soundex code corresponding to the String supplied + * @throws IllegalArgumentException + * if a character is not mapped + */ + public String encode(String pString) { + return soundex(pString); + } + + /** + * Used internally by the SoundEx algorithm. + * + * Consonants from the same code group separated by W or H are treated as one. + * + * @param str + * the cleaned working string to encode (in upper case). + * @param index + * the character position to encode + * @return Mapping code for a particular character + * @throws IllegalArgumentException + * if the character is not mapped + */ + private char getMappingCode(String str, int index) { + char mappedChar = this.map(str.charAt(index)); + // HW rule check + if (index > 1 && mappedChar != '0') { + char hwChar = str.charAt(index - 1); + if ('H' == hwChar || 'W' == hwChar) { + char preHWChar = str.charAt(index - 2); + char firstCode = this.map(preHWChar); + if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { + return 0; + } + } + } + return mappedChar; + } + + /** + * Returns the maxLength. Standard Soundex + * + * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 
+ * @return int + */ + public int getMaxLength() { + return this.maxLength; + } + + /** + * Returns the soundex mapping. + * + * @return soundexMapping. + */ + private char[] getSoundexMapping() { + return this.soundexMapping; + } + + /** + * Maps the given upper-case character to it's Soudex code. + * + * @param ch + * An upper-case character. + * @return A Soundex code. + * @throws IllegalArgumentException + * Thrown if <code>ch</code> is not mapped. + */ + private char map(char ch) { + int index = ch - 'A'; + if (index < 0 || index >= this.getSoundexMapping().length) { + throw new IllegalArgumentException("The character is not mapped: " + ch); + } + return this.getSoundexMapping()[index]; + } + + /** + * Sets the maxLength. + * + * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. + * @param maxLength + * The maxLength to set + */ + public void setMaxLength(int maxLength) { + this.maxLength = maxLength; + } + + /** + * Sets the soundexMapping. + * + * @param soundexMapping + * The soundexMapping to set. + */ + private void setSoundexMapping(char[] soundexMapping) { + this.soundexMapping = soundexMapping; + } + + /** + * Retreives the Soundex code for a given String object. 
+ * + * @param str + * String to encode using the Soundex algorithm + * @return A soundex code for the String supplied + * @throws IllegalArgumentException + * if a character is not mapped + */ + public String soundex(String str) { + if (str == null) { + return null; + } + str = SoundexUtils.clean(str); + if (str.length() == 0) { + return str; + } + char out[] = {'0', '0', '0', '0'}; + char last, mapped; + int incount = 1, count = 1; + out[0] = str.charAt(0); + last = getMappingCode(str, 0); + while ((incount < str.length()) && (count < out.length)) { + mapped = getMappingCode(str, incount++); + if (mapped != 0) { + if ((mapped != '0') && (mapped != last)) { + out[count++] = mapped; + } + last = mapped; + } + } + return new String(out); + } + +} diff --git a/src/org/apache/commons/codec/language/SoundexUtils.java b/src/org/apache/commons/codec/language/SoundexUtils.java new file mode 100644 index 0000000..48f2d87 --- /dev/null +++ b/src/org/apache/commons/codec/language/SoundexUtils.java @@ -0,0 +1,122 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes. 
+ * + * @author Apache Software Foundation + * @version $Id: SoundexUtils.java,v 1.5 2004/03/17 18:31:35 ggregory Exp $ + * @since 1.3 + */ +final class SoundexUtils { + + /** + * Cleans up the input string before Soundex processing by only returning + * upper case letters. + * + * @param str + * The String to clean. + * @return A clean String. + */ + static String clean(String str) { + if (str == null || str.length() == 0) { + return str; + } + int len = str.length(); + char[] chars = new char[len]; + int count = 0; + for (int i = 0; i < len; i++) { + if (Character.isLetter(str.charAt(i))) { + chars[count++] = str.charAt(i); + } + } + if (count == len) { + return str.toUpperCase(); + } + return new String(chars, 0, count).toUpperCase(); + } + + /** + * Encodes the Strings and returns the number of characters in the two + * encoded Strings that are the same. + * <ul> + * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates + * little or no similarity, and 4 indicates strong similarity or identical + * values.</li> + * <li>For refined Soundex, the return value can be greater than 4.</li> + * </ul> + * + * @param encoder + * The encoder to use to encode the Strings. + * @param s1 + * A String that will be encoded and compared. + * @param s2 + * A String that will be encoded and compared. + * @return The number of characters in the two Soundex encoded Strings that + * are the same. + * + * @see #differenceEncoded(String,String) + * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> + * MS T-SQL DIFFERENCE</a> + * + * @throws EncoderException + * if an error occurs encoding one of the strings + */ + static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException { + return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); + } + + /** + * Returns the number of characters in the two Soundex encoded Strings that + * are the same. 
+ * <ul> + * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates + * little or no similarity, and 4 indicates strong similarity or identical + * values.</li> + * <li>For refined Soundex, the return value can be greater than 4.</li> + * </ul> + * + * @param es1 + * An encoded String. + * @param es2 + * An encoded String. + * @return The number of characters in the two Soundex encoded Strings that + * are the same. + * + * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> + * MS T-SQL DIFFERENCE</a> + */ + static int differenceEncoded(String es1, String es2) { + + if (es1 == null || es2 == null) { + return 0; + } + int lengthToMatch = Math.min(es1.length(), es2.length()); + int diff = 0; + for (int i = 0; i < lengthToMatch; i++) { + if (es1.charAt(i) == es2.charAt(i)) { + diff++; + } + } + return diff; + } + +} diff --git a/src/org/apache/commons/codec/language/package.html b/src/org/apache/commons/codec/language/package.html new file mode 100644 index 0000000..fab8e4c --- /dev/null +++ b/src/org/apache/commons/codec/language/package.html @@ -0,0 +1,20 @@ +<!-- +Copyright 2003-2004 The Apache Software Foundation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--> +<html> + <body> + Language and phonetic encoders. 
+ </body> +</html> diff --git a/src/org/apache/commons/codec/net/BCodec.java b/src/org/apache/commons/codec/net/BCodec.java new file mode 100644 index 0000000..b164100 --- /dev/null +++ b/src/org/apache/commons/codec/net/BCodec.java @@ -0,0 +1,207 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.net; + +import java.io.UnsupportedEncodingException; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringDecoder; +import org.apache.commons.codec.StringEncoder; +import org.apache.commons.codec.binary.Base64; + +/** + * <p> + * Identical to the Base64 encoding defined by <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC + * 1521</a> and allows a character set to be specified. + * </p> + * + * <p> + * <a href="http://www.ietf.org/rfc/rfc1522.txt">RFC 1522</a> describes techniques to allow the encoding of non-ASCII + * text in various portions of a RFC 822 [2] message header, in a manner which is unlikely to confuse existing message + * handling software. 
+ * </p> + * + * @see <a href="http://www.ietf.org/rfc/rfc1522.txt">MIME (Multipurpose Internet Mail Extensions) Part Two: Message + * Header Extensions for Non-ASCII Text</a> + * + * @author Apache Software Foundation + * @since 1.3 + * @version $Id: BCodec.java,v 1.5 2004/04/13 22:46:37 ggregory Exp $ + */ +public class BCodec extends RFC1522Codec implements StringEncoder, StringDecoder { + /** + * The default charset used for string decoding and encoding. + */ + private String charset = StringEncodings.UTF8; + + /** + * Default constructor. + */ + public BCodec() { + super(); + } + + /** + * Constructor which allows for the selection of a default charset + * + * @param charset + * the default string charset to use. + * + * @see <a href="http://java.sun.com/j2se/1.3/docs/api/java/lang/package-summary.html#charenc">JRE character + * encoding names</a> + */ + public BCodec(final String charset) { + super(); + this.charset = charset; + } + + protected String getEncoding() { + return "B"; + } + + protected byte[] doEncoding(byte[] bytes) throws EncoderException { + if (bytes == null) { + return null; + } + return Base64.encodeBase64(bytes); + } + + protected byte[] doDecoding(byte[] bytes) throws DecoderException { + if (bytes == null) { + return null; + } + return Base64.decodeBase64(bytes); + } + + /** + * Encodes a string into its Base64 form using the specified charset. Unsafe characters are escaped. + * + * @param value + * string to convert to Base64 form + * @param charset + * the charset for pString + * @return Base64 string + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. 
+ */ + public String encode(final String value, final String charset) throws EncoderException { + if (value == null) { + return null; + } + try { + return encodeText(value, charset); + } catch (UnsupportedEncodingException e) { + throw new EncoderException(e.getMessage()); + } + } + + /** + * Encodes a string into its Base64 form using the default charset. Unsafe characters are escaped. + * + * @param value + * string to convert to Base64 form + * @return Base64 string + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. + */ + public String encode(String value) throws EncoderException { + if (value == null) { + return null; + } + return encode(value, getDefaultCharset()); + } + + /** + * Decodes a Base64 string into its original form. Escaped characters are converted back to their original + * representation. + * + * @param value + * Base64 string to convert into its original form + * + * @return original string + * + * @throws DecoderException + * A decoder exception is thrown if a failure condition is encountered during the decode process. + */ + public String decode(String value) throws DecoderException { + if (value == null) { + return null; + } + try { + return decodeText(value); + } catch (UnsupportedEncodingException e) { + throw new DecoderException(e.getMessage()); + } + } + + /** + * Encodes an object into its Base64 form using the default charset. Unsafe characters are escaped. + * + * @param value + * object to convert to Base64 form + * @return Base64 object + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. 
+ */ + public Object encode(Object value) throws EncoderException { + if (value == null) { + return null; + } else if (value instanceof String) { + return encode((String) value); + } else { + throw new EncoderException("Objects of type " + + value.getClass().getName() + + " cannot be encoded using BCodec"); + } + } + + /** + * Decodes a Base64 object into its original form. Escaped characters are converted back to their original + * representation. + * + * @param value + * Base64 object to convert into its original form + * + * @return original object + * + * @throws DecoderException + * A decoder exception is thrown if a failure condition is encountered during the decode process. + */ + public Object decode(Object value) throws DecoderException { + if (value == null) { + return null; + } else if (value instanceof String) { + return decode((String) value); + } else { + throw new DecoderException("Objects of type " + + value.getClass().getName() + + " cannot be decoded using BCodec"); + } + } + + /** + * The default charset used for string decoding and encoding. + * + * @return the default string charset. + */ + public String getDefaultCharset() { + return this.charset; + } +} diff --git a/src/org/apache/commons/codec/net/QCodec.java b/src/org/apache/commons/codec/net/QCodec.java new file mode 100644 index 0000000..5736080 --- /dev/null +++ b/src/org/apache/commons/codec/net/QCodec.java @@ -0,0 +1,309 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.net;
+
+import java.io.UnsupportedEncodingException;
+import java.util.BitSet;
+
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringDecoder;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * <p>
+ * Similar to the Quoted-Printable content-transfer-encoding defined in <a
+ * href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a> and designed to allow text containing mostly ASCII
+ * characters to be decipherable on an ASCII terminal without decoding.
+ * </p>
+ *
+ * <p>
+ * <a href="http://www.ietf.org/rfc/rfc1522.txt">RFC 1522</a> describes techniques to allow the encoding of non-ASCII
+ * text in various portions of a RFC 822 [2] message header, in a manner which is unlikely to confuse existing message
+ * handling software.
+ * </p>
+ *
+ * @see <a href="http://www.ietf.org/rfc/rfc1522.txt">MIME (Multipurpose Internet Mail Extensions) Part Two: Message
+ *          Header Extensions for Non-ASCII Text</a>
+ *
+ * @author Apache Software Foundation
+ * @since 1.3
+ * @version $Id: QCodec.java,v 1.6 2004/05/24 00:24:32 ggregory Exp $
+ */
+public class QCodec extends RFC1522Codec implements StringEncoder, StringDecoder {
+    /**
+     * The default charset used for string decoding and encoding. Initialised to
+     * <code>StringEncodings.UTF8</code>.
+     */
+    private String charset = StringEncodings.UTF8;
+
+    /**
+     * BitSet of printable characters as defined in RFC 1522.
+ */ + private static final BitSet PRINTABLE_CHARS = new BitSet(256); + // Static initializer for printable chars collection + static { + // alpha characters + PRINTABLE_CHARS.set(' '); + PRINTABLE_CHARS.set('!'); + PRINTABLE_CHARS.set('"'); + PRINTABLE_CHARS.set('#'); + PRINTABLE_CHARS.set('$'); + PRINTABLE_CHARS.set('%'); + PRINTABLE_CHARS.set('&'); + PRINTABLE_CHARS.set('\''); + PRINTABLE_CHARS.set('('); + PRINTABLE_CHARS.set(')'); + PRINTABLE_CHARS.set('*'); + PRINTABLE_CHARS.set('+'); + PRINTABLE_CHARS.set(','); + PRINTABLE_CHARS.set('-'); + PRINTABLE_CHARS.set('.'); + PRINTABLE_CHARS.set('/'); + for (int i = '0'; i <= '9'; i++) { + PRINTABLE_CHARS.set(i); + } + PRINTABLE_CHARS.set(':'); + PRINTABLE_CHARS.set(';'); + PRINTABLE_CHARS.set('<'); + PRINTABLE_CHARS.set('>'); + PRINTABLE_CHARS.set('@'); + for (int i = 'A'; i <= 'Z'; i++) { + PRINTABLE_CHARS.set(i); + } + PRINTABLE_CHARS.set('['); + PRINTABLE_CHARS.set('\\'); + PRINTABLE_CHARS.set(']'); + PRINTABLE_CHARS.set('^'); + PRINTABLE_CHARS.set('`'); + for (int i = 'a'; i <= 'z'; i++) { + PRINTABLE_CHARS.set(i); + } + PRINTABLE_CHARS.set('{'); + PRINTABLE_CHARS.set('|'); + PRINTABLE_CHARS.set('}'); + PRINTABLE_CHARS.set('~'); + } + + private static byte BLANK = 32; + + private static byte UNDERSCORE = 95; + + private boolean encodeBlanks = false; + + /** + * Default constructor. + */ + public QCodec() { + super(); + } + + /** + * Constructor which allows for the selection of a default charset + * + * @param charset + * the default string charset to use. 
+ * + * @see <a href="http://java.sun.com/j2se/1.3/docs/api/java/lang/package-summary.html#charenc">JRE character + * encoding names</a> + */ + public QCodec(final String charset) { + super(); + this.charset = charset; + } + + protected String getEncoding() { + return "Q"; + } + + protected byte[] doEncoding(byte[] bytes) throws EncoderException { + if (bytes == null) { + return null; + } + byte[] data = QuotedPrintableCodec.encodeQuotedPrintable(PRINTABLE_CHARS, bytes); + if (this.encodeBlanks) { + for (int i = 0; i < data.length; i++) { + if (data[i] == BLANK) { + data[i] = UNDERSCORE; + } + } + } + return data; + } + + protected byte[] doDecoding(byte[] bytes) throws DecoderException { + if (bytes == null) { + return null; + } + boolean hasUnderscores = false; + for (int i = 0; i < bytes.length; i++) { + if (bytes[i] == UNDERSCORE) { + hasUnderscores = true; + break; + } + } + if (hasUnderscores) { + byte[] tmp = new byte[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + byte b = bytes[i]; + if (b != UNDERSCORE) { + tmp[i] = b; + } else { + tmp[i] = BLANK; + } + } + return QuotedPrintableCodec.decodeQuotedPrintable(tmp); + } + return QuotedPrintableCodec.decodeQuotedPrintable(bytes); + } + + /** + * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. + * + * @param pString + * string to convert to quoted-printable form + * @param charset + * the charset for pString + * @return quoted-printable string + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. + */ + public String encode(final String pString, final String charset) throws EncoderException { + if (pString == null) { + return null; + } + try { + return encodeText(pString, charset); + } catch (UnsupportedEncodingException e) { + throw new EncoderException(e.getMessage()); + } + } + + /** + * Encodes a string into its quoted-printable form using the default charset. 
Unsafe characters are escaped. + * + * @param pString + * string to convert to quoted-printable form + * @return quoted-printable string + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. + */ + public String encode(String pString) throws EncoderException { + if (pString == null) { + return null; + } + return encode(pString, getDefaultCharset()); + } + + /** + * Decodes a quoted-printable string into its original form. Escaped characters are converted back to their original + * representation. + * + * @param pString + * quoted-printable string to convert into its original form + * + * @return original string + * + * @throws DecoderException + * A decoder exception is thrown if a failure condition is encountered during the decode process. + */ + public String decode(String pString) throws DecoderException { + if (pString == null) { + return null; + } + try { + return decodeText(pString); + } catch (UnsupportedEncodingException e) { + throw new DecoderException(e.getMessage()); + } + } + + /** + * Encodes an object into its quoted-printable form using the default charset. Unsafe characters are escaped. + * + * @param pObject + * object to convert to quoted-printable form + * @return quoted-printable object + * + * @throws EncoderException + * thrown if a failure condition is encountered during the encoding process. + */ + public Object encode(Object pObject) throws EncoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof String) { + return encode((String) pObject); + } else { + throw new EncoderException("Objects of type " + + pObject.getClass().getName() + + " cannot be encoded using Q codec"); + } + } + + /** + * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original + * representation. 
+ * + * @param pObject + * quoted-printable object to convert into its original form + * + * @return original object + * + * @throws DecoderException + * A decoder exception is thrown if a failure condition is encountered during the decode process. + */ + public Object decode(Object pObject) throws DecoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof String) { + return decode((String) pObject); + } else { + throw new DecoderException("Objects of type " + + pObject.getClass().getName() + + " cannot be decoded using Q codec"); + } + } + + /** + * The default charset used for string decoding and encoding. + * + * @return the default string charset. + */ + public String getDefaultCharset() { + return this.charset; + } + + /** + * Tests if optional tranformation of SPACE characters is to be used + * + * @return <code>true</code> if SPACE characters are to be transformed, <code>false</code> otherwise + */ + public boolean isEncodeBlanks() { + return this.encodeBlanks; + } + + /** + * Defines whether optional tranformation of SPACE characters is to be used + * + * @param b + * <code>true</code> if SPACE characters are to be transformed, <code>false</code> otherwise + */ + public void setEncodeBlanks(boolean b) { + this.encodeBlanks = b; + } +} diff --git a/src/org/apache/commons/codec/net/QuotedPrintableCodec.java b/src/org/apache/commons/codec/net/QuotedPrintableCodec.java new file mode 100644 index 0000000..c2fcd27 --- /dev/null +++ b/src/org/apache/commons/codec/net/QuotedPrintableCodec.java @@ -0,0 +1,387 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.net; + +import java.io.ByteArrayOutputStream; +import java.io.UnsupportedEncodingException; +import java.util.BitSet; +import org.apache.commons.codec.BinaryDecoder; +import org.apache.commons.codec.BinaryEncoder; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringDecoder; +import org.apache.commons.codec.StringEncoder; + +/** + * <p> + * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 </a>. + * </p> + * <p> + * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to + * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are + * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the + * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable + * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping + * gateway. + * </p> + * + * <p> + * Note: + * </p> + * <p> + * Rules #3, #4, and #5 of the quoted-printable spec are not implemented yet because the complete quoted-printable spec + * does not lend itself well into the byte[] oriented codec framework. Complete the codec once the steamable codec + * framework is ready. 
The motivation behind providing the codec in a partial form is that it can already come in handy + * for those applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance Q codec. + * </p> + * + * @see <a href="http://www.ietf.org/rfc/rfc1521.txt"> RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: + * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a> + * + * @author Apache Software Foundation + * @since 1.3 + * @version $Id: QuotedPrintableCodec.java,v 1.7 2004/04/09 22:21:07 ggregory Exp $ + */ +public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { + /** + * The default charset used for string decoding and encoding. + */ + private String charset = StringEncodings.UTF8; + + /** + * BitSet of printable characters as defined in RFC 1521. + */ + private static final BitSet PRINTABLE_CHARS = new BitSet(256); + + private static byte ESCAPE_CHAR = '='; + + private static byte TAB = 9; + + private static byte SPACE = 32; + // Static initializer for printable chars collection + static { + // alpha characters + for (int i = 33; i <= 60; i++) { + PRINTABLE_CHARS.set(i); + } + for (int i = 62; i <= 126; i++) { + PRINTABLE_CHARS.set(i); + } + PRINTABLE_CHARS.set(TAB); + PRINTABLE_CHARS.set(SPACE); + } + + /** + * Default constructor. + */ + public QuotedPrintableCodec() { + super(); + } + + /** + * Constructor which allows for the selection of a default charset + * + * @param charset + * the default string charset to use. + */ + public QuotedPrintableCodec(String charset) { + super(); + this.charset = charset; + } + + /** + * Encodes byte into its quoted-printable representation. 
+ * + * @param b + * byte to encode + * @param buffer + * the buffer to write to + */ + private static final void encodeQuotedPrintable(int b, ByteArrayOutputStream buffer) { + buffer.write(ESCAPE_CHAR); + char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)); + char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16)); + buffer.write(hex1); + buffer.write(hex2); + } + + /** + * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521 and is suitable for encoding binary data and unformatted text. + * </p> + * + * @param printable + * bitset of characters deemed quoted-printable + * @param bytes + * array of bytes to be encoded + * @return array of bytes containing quoted-printable data + */ + public static final byte[] encodeQuotedPrintable(BitSet printable, byte[] bytes) { + if (bytes == null) { + return null; + } + if (printable == null) { + printable = PRINTABLE_CHARS; + } + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + for (int i = 0; i < bytes.length; i++) { + int b = bytes[i]; + if (b < 0) { + b = 256 + b; + } + if (printable.get(b)) { + buffer.write(b); + } else { + encodeQuotedPrintable(b, buffer); + } + } + return buffer.toByteArray(); + } + + /** + * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted + * back to their original representation. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521. 
+ * </p> + * + * @param bytes + * array of quoted-printable characters + * @return array of original bytes + * @throws DecoderException + * Thrown if quoted-printable decoding is unsuccessful + */ + public static final byte[] decodeQuotedPrintable(byte[] bytes) throws DecoderException { + if (bytes == null) { + return null; + } + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + for (int i = 0; i < bytes.length; i++) { + int b = bytes[i]; + if (b == ESCAPE_CHAR) { + try { + int u = Character.digit((char) bytes[++i], 16); + int l = Character.digit((char) bytes[++i], 16); + if (u == -1 || l == -1) { + throw new DecoderException("Invalid quoted-printable encoding"); + } + buffer.write((char) ((u << 4) + l)); + } catch (ArrayIndexOutOfBoundsException e) { + throw new DecoderException("Invalid quoted-printable encoding"); + } + } else { + buffer.write(b); + } + } + return buffer.toByteArray(); + } + + /** + * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521 and is suitable for encoding binary data and unformatted text. + * </p> + * + * @param bytes + * array of bytes to be encoded + * @return array of bytes containing quoted-printable data + */ + public byte[] encode(byte[] bytes) { + return encodeQuotedPrintable(PRINTABLE_CHARS, bytes); + } + + /** + * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted + * back to their original representation. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521. 
+ * </p> + * + * @param bytes + * array of quoted-printable characters + * @return array of original bytes + * @throws DecoderException + * Thrown if quoted-printable decoding is unsuccessful + */ + public byte[] decode(byte[] bytes) throws DecoderException { + return decodeQuotedPrintable(bytes); + } + + /** + * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521 and is suitable for encoding binary data. + * </p> + * + * @param pString + * string to convert to quoted-printable form + * @return quoted-printable string + * + * @throws EncoderException + * Thrown if quoted-printable encoding is unsuccessful + * + * @see #getDefaultCharset() + */ + public String encode(String pString) throws EncoderException { + if (pString == null) { + return null; + } + try { + return encode(pString, getDefaultCharset()); + } catch (UnsupportedEncodingException e) { + throw new EncoderException(e.getMessage()); + } + } + + /** + * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters + * are converted back to their original representation. + * + * @param pString + * quoted-printable string to convert into its original form + * @param charset + * the original string charset + * @return original string + * @throws DecoderException + * Thrown if quoted-printable decoding is unsuccessful + * @throws UnsupportedEncodingException + * Thrown if charset is not supported + */ + public String decode(String pString, String charset) throws DecoderException, UnsupportedEncodingException { + if (pString == null) { + return null; + } + return new String(decode(pString.getBytes(StringEncodings.US_ASCII)), charset); + } + + /** + * Decodes a quoted-printable string into its original form using the default string charset. 
Escaped characters are + * converted back to their original representation. + * + * @param pString + * quoted-printable string to convert into its original form + * @return original string + * @throws DecoderException + * Thrown if quoted-printable decoding is unsuccessful + * @throws UnsupportedEncodingException + * Thrown if charset is not supported + * @see #getDefaultCharset() + */ + public String decode(String pString) throws DecoderException { + if (pString == null) { + return null; + } + try { + return decode(pString, getDefaultCharset()); + } catch (UnsupportedEncodingException e) { + throw new DecoderException(e.getMessage()); + } + } + + /** + * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped. + * + * @param pObject + * string to convert to a quoted-printable form + * @return quoted-printable object + * @throws EncoderException + * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is + * unsuccessful + */ + public Object encode(Object pObject) throws EncoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof byte[]) { + return encode((byte[]) pObject); + } else if (pObject instanceof String) { + return encode((String) pObject); + } else { + throw new EncoderException("Objects of type " + + pObject.getClass().getName() + + " cannot be quoted-printable encoded"); + } + } + + /** + * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original + * representation. 
+ * + * @param pObject + * quoted-printable object to convert into its original form + * @return original object + * @throws DecoderException + * Thrown if quoted-printable decoding is not applicable to objects of this type if decoding is + * unsuccessful + */ + public Object decode(Object pObject) throws DecoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof byte[]) { + return decode((byte[]) pObject); + } else if (pObject instanceof String) { + return decode((String) pObject); + } else { + throw new DecoderException("Objects of type " + + pObject.getClass().getName() + + " cannot be quoted-printable decoded"); + } + } + + /** + * Returns the default charset used for string decoding and encoding. + * + * @return the default string charset. + */ + public String getDefaultCharset() { + return this.charset; + } + + /** + * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. + * + * <p> + * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in + * RFC 1521 and is suitable for encoding binary data and unformatted text. + * </p> + * + * @param pString + * string to convert to quoted-printable form + * @param charset + * the charset for pString + * @return quoted-printable string + * + * @throws UnsupportedEncodingException + * Thrown if the charset is not supported + */ + public String encode(String pString, String charset) throws UnsupportedEncodingException { + if (pString == null) { + return null; + } + return new String(encode(pString.getBytes(charset)), StringEncodings.US_ASCII); + } +} diff --git a/src/org/apache/commons/codec/net/RFC1522Codec.java b/src/org/apache/commons/codec/net/RFC1522Codec.java new file mode 100644 index 0000000..0acf921 --- /dev/null +++ b/src/org/apache/commons/codec/net/RFC1522Codec.java @@ -0,0 +1,161 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.net; + +import java.io.UnsupportedEncodingException; + +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; + +/** + * <p> + * Implements methods common to all codecs defined in RFC 1522. + * </p> + * + * <p> + * <a href="http://www.ietf.org/rfc/rfc1522.txt">RFC 1522</a> + * describes techniques to allow the encoding of non-ASCII text in + * various portions of a RFC 822 [2] message header, in a manner which + * is unlikely to confuse existing message handling software. + * </p> + + * @see <a href="http://www.ietf.org/rfc/rfc1522.txt"> + * MIME (Multipurpose Internet Mail Extensions) Part Two: + * Message Header Extensions for Non-ASCII Text</a> + * </p> + * + * @author Apache Software Foundation + * @since 1.3 + * @version $Id: RFC1522Codec.java,v 1.2 2004/04/09 22:21:43 ggregory Exp $ + */ +abstract class RFC1522Codec { + + /** + * Applies an RFC 1522 compliant encoding scheme to the given string of text with the + * given charset. This method constructs the "encoded-word" header common to all the + * RFC 1522 codecs and then invokes {@link #doEncoding(byte [])} method of a concrete + * class to perform the specific enconding. 
+ * + * @param text a string to encode + * @param charset a charset to be used + * + * @return RFC 1522 compliant "encoded-word" + * + * @throws EncoderException thrown if there is an error conidition during the Encoding + * process. + * @throws UnsupportedEncodingException thrown if charset is not supported + * + * @see <a href="http://java.sun.com/j2se/1.3/docs/api/java/lang/package-summary.html#charenc">JRE character + * encoding names</a> + */ + protected String encodeText(final String text, final String charset) + throws EncoderException, UnsupportedEncodingException + { + if (text == null) { + return null; + } + StringBuffer buffer = new StringBuffer(); + buffer.append("=?"); + buffer.append(charset); + buffer.append('?'); + buffer.append(getEncoding()); + buffer.append('?'); + byte [] rawdata = doEncoding(text.getBytes(charset)); + buffer.append(new String(rawdata, StringEncodings.US_ASCII)); + buffer.append("?="); + return buffer.toString(); + } + + /** + * Applies an RFC 1522 compliant decoding scheme to the given string of text. This method + * processes the "encoded-word" header common to all the RFC 1522 codecs and then invokes + * {@link #doEncoding(byte [])} method of a concrete class to perform the specific deconding. + * + * @param text a string to decode + * + * @throws DecoderException thrown if there is an error conidition during the Decoding + * process. 
+ * @throws UnsupportedEncodingException thrown if charset specified in the "encoded-word" + * header is not supported + */ + protected String decodeText(final String text) + throws DecoderException, UnsupportedEncodingException + { + if (text == null) { + return null; + } + if ((!text.startsWith("=?")) || (!text.endsWith("?="))) { + throw new DecoderException("RFC 1522 violation: malformed encoded content"); + } + int termnator = text.length() - 2; + int from = 2; + int to = text.indexOf("?", from); + if ((to == -1) || (to == termnator)) { + throw new DecoderException("RFC 1522 violation: charset token not found"); + } + String charset = text.substring(from, to); + if (charset.equals("")) { + throw new DecoderException("RFC 1522 violation: charset not specified"); + } + from = to + 1; + to = text.indexOf("?", from); + if ((to == -1) || (to == termnator)) { + throw new DecoderException("RFC 1522 violation: encoding token not found"); + } + String encoding = text.substring(from, to); + if (!getEncoding().equalsIgnoreCase(encoding)) { + throw new DecoderException("This codec cannot decode " + + encoding + " encoded content"); + } + from = to + 1; + to = text.indexOf("?", from); + byte[] data = text.substring(from, to).getBytes(StringEncodings.US_ASCII); + data = doDecoding(data); + return new String(data, charset); + } + + /** + * Returns the codec name (referred to as encoding in the RFC 1522) + * + * @return name of the codec + */ + protected abstract String getEncoding(); + + /** + * Encodes an array of bytes using the defined encoding scheme + * + * @param bytes Data to be encoded + * + * @return A byte array containing the encoded data + * + * @throws EncoderException thrown if the Encoder encounters a failure condition + * during the encoding process. 
+ */ + protected abstract byte[] doEncoding(byte[] bytes) throws EncoderException; + + /** + * Decodes an array of bytes using the defined encoding scheme + * + * @param bytes Data to be decoded + * + * @return a byte array that contains decoded data + * + * @throws DecoderException A decoder exception is thrown if a Decoder encounters a + * failure condition during the decode process. + */ + protected abstract byte[] doDecoding(byte[] bytes) throws DecoderException; +} diff --git a/src/org/apache/commons/codec/net/StringEncodings.java b/src/org/apache/commons/codec/net/StringEncodings.java new file mode 100644 index 0000000..e7f6bb8 --- /dev/null +++ b/src/org/apache/commons/codec/net/StringEncodings.java @@ -0,0 +1,52 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.net; + +/** + * String encodings used in this package. + * + * @author Apache Software Foundation + * @since 1.3 + * @version $Id: StringEncodings.java,v 1.2 2004/04/09 22:21:07 ggregory Exp $ + */ +interface StringEncodings { + /** + * <p> + * Seven-bit ASCII, also known as ISO646-US, also known as the Basic Latin block of the Unicode character set. + * </p> + * <p> + * Every implementation of the Java platform is required to support this character encoding. 
+ * </p> + * + * @see <a href="http://java.sun.com/j2se/1.3/docs/api/java/lang/package-summary.html#charenc">JRE character + * encoding names</a> + */ + String US_ASCII = "US-ASCII"; + + /** + * <p> + * Eight-bit Unicode Transformation Format. + * </p> + * <p> + * Every implementation of the Java platform is required to support this character encoding. + * </p> + * + * @see <a href="http://java.sun.com/j2se/1.3/docs/api/java/lang/package-summary.html#charenc">JRE character + * encoding names</a> + */ + String UTF8 = "UTF-8"; +} diff --git a/src/org/apache/commons/codec/net/URLCodec.java b/src/org/apache/commons/codec/net/URLCodec.java new file mode 100644 index 0000000..1bc3507 --- /dev/null +++ b/src/org/apache/commons/codec/net/URLCodec.java @@ -0,0 +1,364 @@ +/* + * Copyright 2001-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.codec.net; + +import java.io.ByteArrayOutputStream; +import java.io.UnsupportedEncodingException; +import java.util.BitSet; + +import org.apache.commons.codec.BinaryDecoder; +import org.apache.commons.codec.BinaryEncoder; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringDecoder; +import org.apache.commons.codec.StringEncoder; + +/** + * <p>Implements the 'www-form-urlencoded' encoding scheme, + * also misleadingly known as URL encoding.</p> + * + * <p>For more detailed information please refer to + * <a href="http://www.w3.org/TR/html4/interact/forms.html#h-17.13.4.1"> + * Chapter 17.13.4 'Form content types'</a> of the + * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification<a></p> + * + * <p> + * This codec is meant to be a replacement for standard Java classes + * {@link java.net.URLEncoder} and {@link java.net.URLDecoder} + * on older Java platforms, as these classes in Java versions below + * 1.4 rely on the platform's default charset encoding. + * </p> + * + * @author Apache Software Foundation + * @since 1.2 + * @version $Id: URLCodec.java,v 1.19 2004/03/29 07:59:00 ggregory Exp $ + */ +public class URLCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { + + /** + * The default charset used for string decoding and encoding. + */ + protected String charset = StringEncodings.UTF8; + + protected static byte ESCAPE_CHAR = '%'; + /** + * BitSet of www-form-url safe characters. 
+ */ + protected static final BitSet WWW_FORM_URL = new BitSet(256); + + // Static initializer for www_form_url + static { + // alpha characters + for (int i = 'a'; i <= 'z'; i++) { + WWW_FORM_URL.set(i); + } + for (int i = 'A'; i <= 'Z'; i++) { + WWW_FORM_URL.set(i); + } + // numeric characters + for (int i = '0'; i <= '9'; i++) { + WWW_FORM_URL.set(i); + } + // special chars + WWW_FORM_URL.set('-'); + WWW_FORM_URL.set('_'); + WWW_FORM_URL.set('.'); + WWW_FORM_URL.set('*'); + // blank to be replaced with + + WWW_FORM_URL.set(' '); + } + + + /** + * Default constructor. + */ + public URLCodec() { + super(); + } + + /** + * Constructor which allows for the selection of a default charset + * + * @param charset the default string charset to use. + */ + public URLCodec(String charset) { + super(); + this.charset = charset; + } + + /** + * Encodes an array of bytes into an array of URL safe 7-bit + * characters. Unsafe characters are escaped. + * + * @param urlsafe bitset of characters deemed URL safe + * @param bytes array of bytes to convert to URL safe characters + * @return array of bytes containing URL safe characters + */ + public static final byte[] encodeUrl(BitSet urlsafe, byte[] bytes) + { + if (bytes == null) { + return null; + } + if (urlsafe == null) { + urlsafe = WWW_FORM_URL; + } + + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + for (int i = 0; i < bytes.length; i++) { + int b = bytes[i]; + if (b < 0) { + b = 256 + b; + } + if (urlsafe.get(b)) { + if (b == ' ') { + b = '+'; + } + buffer.write(b); + } else { + buffer.write('%'); + char hex1 = Character.toUpperCase( + Character.forDigit((b >> 4) & 0xF, 16)); + char hex2 = Character.toUpperCase( + Character.forDigit(b & 0xF, 16)); + buffer.write(hex1); + buffer.write(hex2); + } + } + return buffer.toByteArray(); + } + + + /** + * Decodes an array of URL safe 7-bit characters into an array of + * original bytes. Escaped characters are converted back to their + * original representation. 
+ * + * @param bytes array of URL safe characters + * @return array of original bytes + * @throws DecoderException Thrown if URL decoding is unsuccessful + */ + public static final byte[] decodeUrl(byte[] bytes) + throws DecoderException + { + if (bytes == null) { + return null; + } + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + for (int i = 0; i < bytes.length; i++) { + int b = bytes[i]; + if (b == '+') { + buffer.write(' '); + } else if (b == '%') { + try { + int u = Character.digit((char)bytes[++i], 16); + int l = Character.digit((char)bytes[++i], 16); + if (u == -1 || l == -1) { + throw new DecoderException("Invalid URL encoding"); + } + buffer.write((char)((u << 4) + l)); + } catch(ArrayIndexOutOfBoundsException e) { + throw new DecoderException("Invalid URL encoding"); + } + } else { + buffer.write(b); + } + } + return buffer.toByteArray(); + } + + + /** + * Encodes an array of bytes into an array of URL safe 7-bit + * characters. Unsafe characters are escaped. + * + * @param bytes array of bytes to convert to URL safe characters + * @return array of bytes containing URL safe characters + */ + public byte[] encode(byte[] bytes) { + return encodeUrl(WWW_FORM_URL, bytes); + } + + + /** + * Decodes an array of URL safe 7-bit characters into an array of + * original bytes. Escaped characters are converted back to their + * original representation. + * + * @param bytes array of URL safe characters + * @return array of original bytes + * @throws DecoderException Thrown if URL decoding is unsuccessful + */ + public byte[] decode(byte[] bytes) throws DecoderException { + return decodeUrl(bytes); + } + + + /** + * Encodes a string into its URL safe form using the specified + * string charset. Unsafe characters are escaped. 
+ * + * @param pString string to convert to a URL safe form + * @param charset the charset for pString + * @return URL safe string + * @throws UnsupportedEncodingException Thrown if charset is not + * supported + */ + public String encode(String pString, String charset) + throws UnsupportedEncodingException + { + if (pString == null) { + return null; + } + return new String(encode(pString.getBytes(charset)), StringEncodings.US_ASCII); + } + + + /** + * Encodes a string into its URL safe form using the default string + * charset. Unsafe characters are escaped. + * + * @param pString string to convert to a URL safe form + * @return URL safe string + * @throws EncoderException Thrown if URL encoding is unsuccessful + * + * @see #getDefaultCharset() + */ + public String encode(String pString) throws EncoderException { + if (pString == null) { + return null; + } + try { + return encode(pString, getDefaultCharset()); + } catch(UnsupportedEncodingException e) { + throw new EncoderException(e.getMessage()); + } + } + + + /** + * Decodes a URL safe string into its original form using the + * specified encoding. Escaped characters are converted back + * to their original representation. + * + * @param pString URL safe string to convert into its original form + * @param charset the original string charset + * @return original string + * @throws DecoderException Thrown if URL decoding is unsuccessful + * @throws UnsupportedEncodingException Thrown if charset is not + * supported + */ + public String decode(String pString, String charset) + throws DecoderException, UnsupportedEncodingException + { + if (pString == null) { + return null; + } + return new String(decode(pString.getBytes(StringEncodings.US_ASCII)), charset); + } + + + /** + * Decodes a URL safe string into its original form using the default + * string charset. Escaped characters are converted back to their + * original representation. 
+ * + * @param pString URL safe string to convert into its original form + * @return original string + * @throws DecoderException Thrown if URL decoding is unsuccessful + * + * @see #getDefaultCharset() + */ + public String decode(String pString) throws DecoderException { + if (pString == null) { + return null; + } + try { + return decode(pString, getDefaultCharset()); + } catch(UnsupportedEncodingException e) { + throw new DecoderException(e.getMessage()); + } + } + + /** + * Encodes an object into its URL safe form. Unsafe characters are + * escaped. + * + * @param pObject string to convert to a URL safe form + * @return URL safe object + * @throws EncoderException Thrown if URL encoding is not + * applicable to objects of this type or + * if encoding is unsuccessful + */ + public Object encode(Object pObject) throws EncoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof byte[]) { + return encode((byte[])pObject); + } else if (pObject instanceof String) { + return encode((String)pObject); + } else { + throw new EncoderException("Objects of type " + + pObject.getClass().getName() + " cannot be URL encoded"); + + } + } + + /** + * Decodes a URL safe object into its original form. Escaped + * characters are converted back to their original representation. 
+ * + * @param pObject URL safe object to convert into its original form + * @return original object + * @throws DecoderException Thrown if URL decoding is not + * applicable to objects of this type + * if decoding is unsuccessful + */ + public Object decode(Object pObject) throws DecoderException { + if (pObject == null) { + return null; + } else if (pObject instanceof byte[]) { + return decode((byte[])pObject); + } else if (pObject instanceof String) { + return decode((String)pObject); + } else { + throw new DecoderException("Objects of type " + + pObject.getClass().getName() + " cannot be URL decoded"); + + } + } + + /** + * The <code>String</code> encoding used for decoding and encoding. + * + * @return Returns the encoding. + * + * @deprecated use #getDefaultCharset() + */ + public String getEncoding() { + return this.charset; + } + + /** + * The default charset used for string decoding and encoding. + * + * @return the default string charset. + */ + public String getDefaultCharset() { + return this.charset; + } + +} diff --git a/src/org/apache/commons/codec/net/package.html b/src/org/apache/commons/codec/net/package.html new file mode 100644 index 0000000..4607c57 --- /dev/null +++ b/src/org/apache/commons/codec/net/package.html @@ -0,0 +1,22 @@ +<!-- +Copyright 2003-2004 The Apache Software Foundation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--> +<html> + <body> + <p> + Network related encoding and decoding. 
+ </p> + </body> +</html> diff --git a/src/org/apache/commons/codec/overview.html b/src/org/apache/commons/codec/overview.html new file mode 100644 index 0000000..6b6f6c9 --- /dev/null +++ b/src/org/apache/commons/codec/overview.html @@ -0,0 +1,28 @@ +<!-- +Copyright 2003-2004 The Apache Software Foundation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--> +<!-- $Id: overview.html,v 1.6 2004/05/17 17:06:10 ggregory Exp $ --> +<html> +<body> +<p> +This document is the API specification for the Apache Jakarta Commons Codec Library, version 1.3. +</p> +<p> +This library requires a JRE version of 1.2.2 or greater. +The hypertext links originating from this document point to Sun's version 1.3 API as the 1.2.2 API documentation +is no longer on-line. +</p> +</body> +</html> diff --git a/src/org/apache/commons/codec/package.html b/src/org/apache/commons/codec/package.html new file mode 100644 index 0000000..b7ccf03 --- /dev/null +++ b/src/org/apache/commons/codec/package.html @@ -0,0 +1,99 @@ +<!-- +Copyright 2003-2004 The Apache Software Foundation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +--> +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + </head> + <body> + <p>A small set of interfaces used by + the various implementations in the sub-packages.</p> + + <p>Definitive implementations of commonly used encoders and decoders.</p> + + <p>Codec is currently comprised of a modest set of utilities and a + simple framework for String encoding and decoding in three categories: + Binary Encoders, Language Encoders, and Network Encoders. </p> + + <h4><a name="Common Encoders">Binary Encoders</a></h4> + + <table border="1" width="100%" cellspacing="2" cellpadding="3"> + <tbody> + <tr> + <td> + <a href="binary/Base64.html"> + org.apache.commons.codec.binary.Base64</a> + </td> + <td> + Provides Base64 content-transfer-encoding as defined in + <a href="http://www.ietf.org/rfc/rfc2045.txt"> RFC 2045</a> + </td> + <td>Production</td> + </tr> + <tr> + <td> + <a href="binary/Hex.html"> + org.apache.commons.codec.binary.Hex</a> + </td> + <td> + Converts an array of bytes into an array of characters + representing the hexadecimal values of each byte in order + </td> + <td>Production</td> + </tr> + </tbody> + </table> + <h4> + <a name="Language Encoders">Language Encoders</a> + </h4> + <p> + Codec contains a number of commonly used language and phonetic + encoders + </p> + <table border="1" width="100%" cellspacing="2" cellpadding="3"> + <tbody> + <tr> + <td> + <a href="#">org.apache.commons.codec.language.Soundex</a> + </td> + <td>Implementation of the Soundex algorithm.</td> + <td>Production</td> + </tr> + <tr> + <td> + <a href="#">org.apache.commons.codec.language.Metaphone</a> + </td> + <td>Implementation of the Metaphone algorithm.</td> + <td>Production</td> + </tr> + </tbody> + </table> + <h4><a name="Network_Encoders">Network Encoders</a></h4> + <h4> </h4> + <p> Codec contains network related encoders </p> + <table border="1"
width="100%" cellspacing="2" cellpadding="3"> + <tbody> + <tr> + <td> + <a href="#">org.apache.commons.codec.net.URLCodec</a> + </td> + <td>Implements the 'www-form-urlencoded' encoding scheme.</td> + <td>Production</td> + </tr> + </tbody> + </table> + <br> + </body> +</html> |