summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorElliott Hughes <enh@google.com>2011-02-02 17:15:53 -0800
committerElliott Hughes <enh@google.com>2011-02-02 17:15:53 -0800
commit2981b5e8cf7c19dfd85b2088b18b7a6146825317 (patch)
tree4516640b70d8797fde2b009d15b5a1af978abfd0
parent8c20515fc3a6152a4e5bdee57f2d136f82e5f625 (diff)
downloadlibcore-2981b5e8cf7c19dfd85b2088b18b7a6146825317.zip
libcore-2981b5e8cf7c19dfd85b2088b18b7a6146825317.tar.gz
libcore-2981b5e8cf7c19dfd85b2088b18b7a6146825317.tar.bz2
Improve CharsetDecoder's quality of implementation, like CharsetEncoder.
As I suspected, CharsetDecoder doesn't cope with a character's bytes being split across multiple writes any more than CharsetEncoder could cope with halves of a surrogate pair being split across multiple writes. This seems much more likely to have harmed applications than CharsetEncoder (since surrogates are rare). Anyway, ICU does the right thing here too, so the fix is basically the same. I've also gone through the decoder/encoder code a bit to bring them more in line with each other. Bug: 3410124 Change-Id: I151d043e474161e324361cddfc73188ba73fd59c
-rw-r--r--luni/src/main/java/libcore/icu/CharsetDecoderICU.java101
-rw-r--r--luni/src/main/java/libcore/icu/CharsetEncoderICU.java33
-rw-r--r--luni/src/main/native/NativeConverter.cpp37
-rw-r--r--luni/src/test/java/libcore/java/nio/charset/CharsetDecoderTest.java (renamed from luni/src/test/java/libcore/java/nio/charset/OldCharsetDecoderTest.java)48
4 files changed, 93 insertions, 126 deletions
diff --git a/luni/src/main/java/libcore/icu/CharsetDecoderICU.java b/luni/src/main/java/libcore/icu/CharsetDecoderICU.java
index e6b892e..d42368e 100644
--- a/luni/src/main/java/libcore/icu/CharsetDecoderICU.java
+++ b/luni/src/main/java/libcore/icu/CharsetDecoderICU.java
@@ -28,14 +28,12 @@ public final class CharsetDecoderICU extends CharsetDecoder {
private static final int INPUT_OFFSET = 0;
private static final int OUTPUT_OFFSET = 1;
private static final int INVALID_BYTES = 2;
- private static final int INPUT_HELD = 3;
/*
* data[INPUT_OFFSET] = on input contains the start of input and on output the number of input bytes consumed
* data[OUTPUT_OFFSET] = on input contains the start of output and on output the number of output chars written
* data[INVALID_BYTES] = number of invalid bytes
- * data[INPUT_HELD] = number of input bytes held in the converter's state
*/
- private int[] data = new int[4];
+ private int[] data = new int[3];
/* handle to the ICU converter that is opened */
private long converterHandle = 0;
@@ -51,7 +49,6 @@ public final class CharsetDecoderICU extends CharsetDecoder {
private int inEnd;
private int outEnd;
private int ec;
- private int savedInputHeldLen;
public static CharsetDecoderICU newInstance(Charset cs, String icuCanonicalName) {
// This complexity is necessary to ensure that even if the constructor, superclass
@@ -76,38 +73,22 @@ public final class CharsetDecoderICU extends CharsetDecoder {
this.converterHandle = address;
}
- /**
- * Sets this decoders replacement string. Substitutes the string in input if an
- * unmappable or illegal sequence is encountered
- * @param newReplacement to replace the error bytes with
- * @stable ICU 2.4
- */
- protected void implReplaceWith(String newReplacement) {
+ @Override protected void implReplaceWith(String newReplacement) {
if (converterHandle > 0) {
- if (newReplacement.length() > NativeConverter.getMaxBytesPerChar(converterHandle)) {
- throw new IllegalArgumentException();
+ // TODO: is this the right test? we're providing characters.
+ int max = NativeConverter.getMaxBytesPerChar(converterHandle);
+ if (newReplacement.length() > max) {
+ throw new IllegalArgumentException("replacement length (" + newReplacement.length() + ") > maximum (" + max + ")");
}
updateCallback();
}
}
- /**
- * Sets the action to be taken if an illegal sequence is encountered
- * @param newAction action to be taken
- * @exception IllegalArgumentException
- * @stable ICU 2.4
- */
- protected final void implOnMalformedInput(CodingErrorAction newAction) {
+ @Override protected final void implOnMalformedInput(CodingErrorAction newAction) {
updateCallback();
}
- /**
- * Sets the action to be taken if an illegal sequence is encountered
- * @param newAction action to be taken
- * @exception IllegalArgumentException
- * @stable ICU 2.4
- */
- protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
+ @Override protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
updateCallback();
}
@@ -118,6 +99,20 @@ public final class CharsetDecoderICU extends CharsetDecoder {
}
}
+ @Override protected void implReset() {
+ NativeConverter.resetByteToChar(converterHandle);
+ data[INPUT_OFFSET] = 0;
+ data[OUTPUT_OFFSET] = 0;
+ data[INVALID_BYTES] = 0;
+ output = null;
+ input = null;
+ allocatedInput = null;
+ allocatedOutput = null;
+ ec = 0;
+ inEnd = 0;
+ outEnd = 0;
+ }
+
@Override protected final CoderResult implFlush(CharBuffer out) {
try {
// ICU needs to see an empty input.
@@ -132,7 +127,7 @@ public final class CharsetDecoderICU extends CharsetDecoder {
if (ErrorCode.isFailure(ec)) {
if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
return CoderResult.OVERFLOW;
- } else if (ec == ErrorCode.U_TRUNCATED_CHAR_FOUND) {//CSDL: add this truncated character error handling
+ } else if (ec == ErrorCode.U_TRUNCATED_CHAR_FOUND) {
if (data[INPUT_OFFSET] > 0) {
return CoderResult.malformedForLength(data[INPUT_OFFSET]);
}
@@ -142,28 +137,11 @@ public final class CharsetDecoderICU extends CharsetDecoder {
}
return CoderResult.UNDERFLOW;
} finally {
- /* save the flushed data */
setPosition(out);
implReset();
}
}
- @Override protected void implReset() {
- NativeConverter.resetByteToChar(converterHandle);
- data[INPUT_OFFSET] = 0;
- data[OUTPUT_OFFSET] = 0;
- data[INVALID_BYTES] = 0;
- data[INPUT_HELD] = 0;
- savedInputHeldLen = 0;
- output = null;
- input = null;
- allocatedInput = null;
- allocatedOutput = null;
- ec = 0;
- inEnd = 0;
- outEnd = 0;
- }
-
@Override protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
if (!in.hasRemaining()) {
return CoderResult.UNDERFLOW;
@@ -171,12 +149,9 @@ public final class CharsetDecoderICU extends CharsetDecoder {
data[INPUT_OFFSET] = getArray(in);
data[OUTPUT_OFFSET]= getArray(out);
- data[INPUT_HELD] = 0;
- try{
+ try {
ec = NativeConverter.decode(converterHandle, input, inEnd, output, outEnd, data, false);
-
- // Return an error.
if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
return CoderResult.OVERFLOW;
} else if (ec == ErrorCode.U_INVALID_CHAR_FOUND) {
@@ -208,13 +183,11 @@ public final class CharsetDecoderICU extends CharsetDecoder {
return out.arrayOffset() + out.position();
} else {
outEnd = out.remaining();
- if (allocatedOutput == null || (outEnd > allocatedOutput.length)) {
+ if (allocatedOutput == null || outEnd > allocatedOutput.length) {
allocatedOutput = new char[outEnd];
}
+ // The array's start position is 0.
output = allocatedOutput;
- //since the new
- // buffer start position
- // is 0
return 0;
}
}
@@ -223,22 +196,19 @@ public final class CharsetDecoderICU extends CharsetDecoder {
if (in.hasArray()) {
input = in.array();
inEnd = in.arrayOffset() + in.limit();
- return in.arrayOffset() + in.position() + savedInputHeldLen;/*exclude the number fo bytes held in previous conversion*/
+ return in.arrayOffset() + in.position();
} else {
inEnd = in.remaining();
- if (allocatedInput == null || (inEnd > allocatedInput.length)) {
+ if (allocatedInput == null || inEnd > allocatedInput.length) {
allocatedInput = new byte[inEnd];
}
- input = allocatedInput;
- // save the current position
+ // Copy the input buffer into the allocated array.
int pos = in.position();
- in.get(input,0,inEnd);
- // reset the position
+ in.get(allocatedInput, 0, inEnd);
in.position(pos);
- // the start position
- // of the new buffer
- // is whatever is savedInputLen
- return savedInputHeldLen;
+ // The array's start position is 0.
+ input = allocatedInput;
+ return 0;
}
}
@@ -253,10 +223,7 @@ public final class CharsetDecoderICU extends CharsetDecoder {
}
private void setPosition(ByteBuffer in) {
- // ok was there input held in the previous invocation of decodeLoop
- // that resulted in output in this invocation?
- in.position(in.position() + data[INPUT_OFFSET] + savedInputHeldLen - data[INPUT_HELD]);
- savedInputHeldLen = data[INPUT_HELD];
+ in.position(in.position() + data[INPUT_OFFSET]);
// release reference to input array, which may not be ours
input = null;
}
diff --git a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
index f76826e..15b2788 100644
--- a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
+++ b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
@@ -49,6 +49,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
* data[INVALID_CHARS] = number of invalid chars
*/
private int[] data = new int[3];
+
/* handle to the ICU converter that is opened */
private long converterHandle=0;
@@ -101,8 +102,9 @@ public final class CharsetEncoderICU extends CharsetEncoder {
@Override protected void implReplaceWith(byte[] newReplacement) {
if (converterHandle != 0) {
- if (newReplacement.length > NativeConverter.getMaxBytesPerChar(converterHandle)) {
- throw new IllegalArgumentException("Number of replacement Bytes are greater than max bytes per char");
+ int max = NativeConverter.getMaxBytesPerChar(converterHandle);
+ if (newReplacement.length > max) {
+ throw new IllegalArgumentException("replacement length (" + newReplacement.length + ") > maximum (" + max + ")");
}
updateCallback();
}
@@ -128,6 +130,13 @@ public final class CharsetEncoderICU extends CharsetEncoder {
data[INPUT_OFFSET] = 0;
data[OUTPUT_OFFSET] = 0;
data[INVALID_CHARS] = 0;
+ output = null;
+ input = null;
+ allocatedInput = null;
+ allocatedOutput = null;
+ ec = 0;
+ inEnd = 0;
+ outEnd = 0;
}
@Override protected CoderResult implFlush(ByteBuffer out) {
@@ -144,7 +153,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
if (ErrorCode.isFailure(ec)) {
if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
return CoderResult.OVERFLOW;
- } else if (ec == ErrorCode.U_TRUNCATED_CHAR_FOUND) {//CSDL: add this truncated character error handling
+ } else if (ec == ErrorCode.U_TRUNCATED_CHAR_FOUND) {
if (data[INPUT_OFFSET] > 0) {
return CoderResult.malformedForLength(data[INPUT_OFFSET]);
}
@@ -176,15 +185,14 @@ public final class CharsetEncoderICU extends CharsetEncoder {
} else if (ec == ErrorCode.U_INVALID_CHAR_FOUND) {
return CoderResult.unmappableForLength(data[INVALID_CHARS]);
} else if (ec == ErrorCode.U_ILLEGAL_CHAR_FOUND) {
- // in.position(in.position() - 1);
return CoderResult.malformedForLength(data[INVALID_CHARS]);
} else {
throw new AssertionError("unexpected failure: " + ec);
}
}
+ // Decoding succeeded: give us more data.
return CoderResult.UNDERFLOW;
} finally {
- /* save state */
setPosition(in);
setPosition(out);
}
@@ -214,7 +222,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
return out.arrayOffset() + out.position();
} else {
outEnd = out.remaining();
- if (allocatedOutput == null || (outEnd > allocatedOutput.length)) {
+ if (allocatedOutput == null || outEnd > allocatedOutput.length) {
allocatedOutput = new byte[outEnd];
}
// The array's start position is 0
@@ -245,10 +253,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
private void setPosition(ByteBuffer out) {
if (out.hasArray()) {
- // in getArray method we accessed the
- // array backing the buffer directly and wrote to
- // it, so just just set the position and return.
- // This is done to avoid the creation of temp array.
out.position(out.position() + data[OUTPUT_OFFSET] - out.arrayOffset());
} else {
out.put(output, 0, data[OUTPUT_OFFSET]);
@@ -258,14 +262,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
}
private void setPosition(CharBuffer in) {
- // Slightly rewired original code to make it cleaner. Also
- // added a fix for the problem where input characters got
- // lost when invalid characters were encountered. Not sure
- // what happens when data[INVALID_CHARS] is > 1, though,
- // since we never saw that happening.
- int len = in.position() + data[INPUT_OFFSET];
- len -= data[INVALID_CHARS]; // Otherwise position becomes wrong.
- in.position(len);
+ in.position(in.position() + data[INPUT_OFFSET] - data[INVALID_CHARS]);
// release reference to input array, which may not be ours
input = null;
}
diff --git a/luni/src/main/native/NativeConverter.cpp b/luni/src/main/native/NativeConverter.cpp
index ecb39f7..ff7b0d0 100644
--- a/luni/src/main/native/NativeConverter.cpp
+++ b/luni/src/main/native/NativeConverter.cpp
@@ -110,12 +110,12 @@ static jint NativeConverter_encode(JNIEnv* env, jclass, jlong address,
// If there was an error, count the problematic characters.
if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND) {
- int8_t len = 32;
+ int8_t invalidUCharCount = 32;
UChar invalidUChars[32];
UErrorCode minorErrorCode = U_ZERO_ERROR;
- ucnv_getInvalidUChars(cnv, invalidUChars, &len, &minorErrorCode);
+ ucnv_getInvalidUChars(cnv, invalidUChars, &invalidUCharCount, &minorErrorCode);
if (U_SUCCESS(minorErrorCode)) {
- myData[2] = len;
+ myData[2] = invalidUCharCount;
}
}
return errorCode;
@@ -154,18 +154,14 @@ static jint NativeConverter_decode(JNIEnv* env, jclass, jlong address,
*sourceOffset = mySource - reinterpret_cast<const char*>(uSource.get()) - *sourceOffset;
*targetOffset = cTarget - uTarget.get() - *targetOffset;
- // Check how much more input is necessary to complete what's in the converter's internal buffer.
- UErrorCode minorErrorCode = U_ZERO_ERROR;
- jint pending = ucnv_toUCountPending(cnv, &minorErrorCode);
- myData[3] = pending;
-
// If there was an error, count the problematic bytes.
if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND) {
- int8_t len = 32;
- char invalidChars[32] = {'\0'};
- ucnv_getInvalidChars(cnv, invalidChars, &len, &minorErrorCode);
+ int8_t invalidByteCount = 32;
+ char invalidBytes[32] = {'\0'};
+ UErrorCode minorErrorCode = U_ZERO_ERROR;
+ ucnv_getInvalidChars(cnv, invalidBytes, &invalidByteCount, &minorErrorCode);
if (U_SUCCESS(minorErrorCode)) {
- myData[2] = len;
+ myData[2] = invalidByteCount;
}
}
@@ -376,12 +372,9 @@ static void encoderReplaceCallback(const void* rawContext,
static UConverterFromUCallback getFromUCallback(int32_t mode) {
switch(mode) {
- case NativeConverter_REPORT:
- return UCNV_FROM_U_CALLBACK_STOP;
- case NativeConverter_IGNORE:
- return UCNV_FROM_U_CALLBACK_SKIP;
- case NativeConverter_REPLACE:
- return encoderReplaceCallback;
+ case NativeConverter_IGNORE: return UCNV_FROM_U_CALLBACK_SKIP;
+ case NativeConverter_REPLACE: return encoderReplaceCallback;
+ case NativeConverter_REPORT: return UCNV_FROM_U_CALLBACK_STOP;
}
abort();
}
@@ -396,12 +389,12 @@ static jint NativeConverter_setCallbackEncode(JNIEnv* env, jclass, jlong address
const void* fromUOldContext = NULL;
ucnv_getFromUCallBack(cnv, &fromUOldAction, const_cast<const void**>(&fromUOldContext));
- /* fromUOldContext can only be DecodeCallbackContext since
+ /* fromUOldContext can only be EncoderCallbackContext since
* the converter created is private data for the decoder
* and callbacks can only be set via this method!
*/
- EncoderCallbackContext* fromUNewContext=NULL;
- UConverterFromUCallback fromUNewAction=NULL;
+ EncoderCallbackContext* fromUNewContext = NULL;
+ UConverterFromUCallback fromUNewAction = NULL;
if (fromUOldContext == NULL) {
fromUNewContext = new EncoderCallbackContext;
fromUNewAction = CHARSET_ENCODER_CALLBACK;
@@ -487,7 +480,7 @@ static jint NativeConverter_setCallbackDecode(JNIEnv* env, jclass, jlong address
const void* toUOldContext;
ucnv_getToUCallBack(cnv, &toUOldAction, &toUOldContext);
- /* toUOldContext can only be DecodeCallbackContext since
+ /* toUOldContext can only be DecoderCallbackContext since
* the converter created is private data for the decoder
* and callbacks can only be set via this method!
*/
diff --git a/luni/src/test/java/libcore/java/nio/charset/OldCharsetDecoderTest.java b/luni/src/test/java/libcore/java/nio/charset/CharsetDecoderTest.java
index 8f37f63..e874a1b 100644
--- a/luni/src/test/java/libcore/java/nio/charset/OldCharsetDecoderTest.java
+++ b/luni/src/test/java/libcore/java/nio/charset/CharsetDecoderTest.java
@@ -25,11 +25,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
-public class OldCharsetDecoderTest extends junit.framework.TestCase {
- private static final String CHARSET = "UTF-16";
-
- private static final String SAMPLE_STRING = "Android";
-
+public class CharsetDecoderTest extends junit.framework.TestCase {
// None of the harmony or jtreg tests actually check that replaceWith does the right thing!
public void test_replaceWith() throws Exception {
CharsetDecoder d = Charset.forName("UTF-16").newDecoder();
@@ -42,8 +38,8 @@ public class OldCharsetDecoderTest extends junit.framework.TestCase {
// http://code.google.com/p/android/issues/detail?id=4237
public void test_ByteArray_decode_no_offset() throws Exception {
- CharsetDecoder decoder = getCharsetDecoderUnderTest();
- byte[] arr = getEncodedByteArrayFixture();
+ CharsetDecoder decoder = Charset.forName("UTF-16").newDecoder();
+ byte[] arr = encode("UTF-16", "Android");
ByteBuffer inBuffer = ByteBuffer.wrap(arr, 0, arr.length).slice();
CharBuffer outBuffer = CharBuffer.allocate(arr.length);
decoder.reset();
@@ -51,13 +47,13 @@ public class OldCharsetDecoderTest extends junit.framework.TestCase {
assertFalse(coderResult.toString(), coderResult.isError());
decoder.flush(outBuffer);
outBuffer.flip();
- assertEquals(SAMPLE_STRING, outBuffer.toString().trim());
+ assertEquals("Android", outBuffer.toString().trim());
}
// http://code.google.com/p/android/issues/detail?id=4237
public void test_ByteArray_decode_with_offset() throws Exception {
- CharsetDecoder decoder = getCharsetDecoderUnderTest();
- byte[] arr = getEncodedByteArrayFixture();
+ CharsetDecoder decoder = Charset.forName("UTF-16").newDecoder();
+ byte[] arr = encode("UTF-16", "Android");
arr = prependByteToByteArray(arr, new Integer(1).byteValue());
int offset = 1;
ByteBuffer inBuffer = ByteBuffer.wrap(arr, offset, arr.length - offset).slice();
@@ -67,17 +63,17 @@ public class OldCharsetDecoderTest extends junit.framework.TestCase {
assertFalse(coderResult.toString(), coderResult.isError());
decoder.flush(outBuffer);
outBuffer.flip();
- assertEquals(SAMPLE_STRING, outBuffer.toString().trim());
+ assertEquals("Android", outBuffer.toString().trim());
}
// http://code.google.com/p/android/issues/detail?id=4237
public void test_ByteArray_decode_with_offset_using_facade_method() throws Exception {
- CharsetDecoder decoder = getCharsetDecoderUnderTest();
- byte[] arr = getEncodedByteArrayFixture();
+ CharsetDecoder decoder = Charset.forName("UTF-16").newDecoder();
+ byte[] arr = encode("UTF-16", "Android");
arr = prependByteToByteArray(arr, new Integer(1).byteValue());
int offset = 1;
CharBuffer outBuffer = decoder.decode(ByteBuffer.wrap(arr, offset, arr.length - offset));
- assertEquals(SAMPLE_STRING, outBuffer.toString().trim());
+ assertEquals("Android", outBuffer.toString().trim());
}
private static byte[] prependByteToByteArray(byte[] arr, byte b) {
@@ -87,12 +83,26 @@ public class OldCharsetDecoderTest extends junit.framework.TestCase {
return result;
}
- private static CharsetDecoder getCharsetDecoderUnderTest() {
- return Charset.forName(CHARSET).newDecoder();
+ private static byte[] encode(String charsetName, String s) throws Exception {
+ CharsetEncoder encoder = Charset.forName(charsetName).newEncoder();
+ return encoder.encode(CharBuffer.wrap(s)).array();
}
- private byte[] getEncodedByteArrayFixture() throws CharacterCodingException {
- CharsetEncoder encoder = Charset.forName(CHARSET).newEncoder();
- return encoder.encode(CharBuffer.wrap(SAMPLE_STRING)).array();
+ public void testUtf8BytesSplitAcrossMultipleWrites() throws Exception {
+ CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
+ CharBuffer cb = CharBuffer.allocate(128);
+ CoderResult cr;
+ cr = decoder.decode(ByteBuffer.wrap(new byte[] { (byte) 0xe2 }), cb, false);
+ assertEquals(CoderResult.UNDERFLOW, cr);
+ cr = decoder.decode(ByteBuffer.wrap(new byte[] { (byte) 0x98 }), cb, false);
+ assertEquals(CoderResult.UNDERFLOW, cr);
+ cr = decoder.decode(ByteBuffer.wrap(new byte[] { (byte) 0x83 }), cb, false);
+ assertEquals(CoderResult.UNDERFLOW, cr);
+ cr = decoder.decode(ByteBuffer.wrap(new byte[0]), cb, true);
+ assertEquals(CoderResult.UNDERFLOW, cr);
+ cr = decoder.flush(cb);
+ assertEquals(CoderResult.UNDERFLOW, cr);
+ assertEquals(1, cb.position());
+ assertEquals('\u2603', cb.get(0));
}
}