summary | refs | log | tree | commit | diff | stats
path: root/luni/src/main
diff options
context:
space:
mode:
author: Elliott Hughes <enh@google.com> 2011-02-01 13:58:06 -0800
committer: Android (Google) Code Review <android-gerrit@google.com> 2011-02-01 13:58:06 -0800
commit: aee004a114565d7b1f3464507ec26829b8230d98 (patch)
tree: f233f7e96e90d3c5946dc1ad743e14c6bb9ac0f7 /luni/src/main
parent: 6a75005c0547634e5179829c61eb03209197ceda (diff)
parent: 33604713c5c70f9e6cad61dee6eb628db666bb22 (diff)
download: libcore-aee004a114565d7b1f3464507ec26829b8230d98.zip
libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.gz
libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.bz2
Merge "Improve CharsetEncoder to handle surrogates gracefully." into dalvik-dev
Diffstat (limited to 'luni/src/main')
-rw-r--r-- luni/src/main/java/java/io/OutputStreamWriter.java 42
-rw-r--r-- luni/src/main/java/libcore/icu/CharsetEncoderICU.java 141
-rw-r--r-- luni/src/main/java/libcore/icu/NativeConverter.java 15
-rw-r--r-- luni/src/main/native/NativeConverter.cpp 35
4 files changed, 47 insertions, 186 deletions
diff --git a/luni/src/main/java/java/io/OutputStreamWriter.java b/luni/src/main/java/java/io/OutputStreamWriter.java
index c29885e..62f825d 100644
--- a/luni/src/main/java/java/io/OutputStreamWriter.java
+++ b/luni/src/main/java/java/io/OutputStreamWriter.java
@@ -43,7 +43,6 @@ public class OutputStreamWriter extends Writer {
private CharsetEncoder encoder;
private ByteBuffer bytes = ByteBuffer.allocate(8192);
- private CharBuffer underflowChars;
/**
* Constructs a new OutputStreamWriter using {@code out} as the target
@@ -127,8 +126,8 @@ public class OutputStreamWriter extends Writer {
* Closes this writer. This implementation flushes the buffer as well as the
* target stream. The target stream is then closed and the resources for the
* buffer and converter are released.
- * <p>
- * Only the first invocation of this method has any effect. Subsequent calls
+ *
+ * <p>Only the first invocation of this method has any effect. Subsequent calls
* do nothing.
*
* @throws IOException
@@ -176,31 +175,12 @@ public class OutputStreamWriter extends Writer {
}
private void convert(CharBuffer chars) throws IOException {
- // Do we have anything left over from the previous write?
- if (underflowChars != null) {
- // Move the first character from 'chars' into 'underflowChars' and try to encode that.
- if (chars.hasRemaining()) {
- underflowChars.put(chars.get());
- underflowChars.flip();
- CharBuffer cb = underflowChars;
- underflowChars = null;
- convert(cb);
- }
- }
-
while (true) {
CoderResult result = encoder.encode(chars, bytes, false);
if (result.isOverflow()) {
// Make room and try again.
flushBytes(false);
continue;
- } else if (result.isUnderflow() && chars.remaining() > 0) {
- // Stash any remaining chars. This probably means we've seen half a surrogate
- // pair in CharBuffer and need to see the next char before we know what to do.
- // Believe it or not, CharsetEncoder doesn't keep that character as part of its
- // internal state.
- underflowChars = CharBuffer.allocate(chars.remaining() + 1);
- underflowChars.put(chars);
} else if (result.isError()) {
result.throwException();
}
@@ -209,8 +189,10 @@ public class OutputStreamWriter extends Writer {
}
private void drainEncoder() throws IOException {
- // TODO: is there any case where underflowChars is non-null and passing it to encode would
- // make any difference?
+ // Strictly speaking, I think it's part of the CharsetEncoder contract that you call
+ // encode with endOfInput true before flushing. Our ICU-based implementations don't
+ // actually need this, and you'd hope that any reasonable implementation wouldn't either.
+ // CharsetEncoder.encode doesn't actually pass the boolean through to encodeLoop anyway!
CharBuffer chars = CharBuffer.allocate(0);
while (true) {
CoderResult result = encoder.encode(chars, bytes, true);
@@ -224,7 +206,8 @@ public class OutputStreamWriter extends Writer {
}
// Some encoders (such as ISO-2022-JP) have stuff to write out after all the
- // characters (such as shifting back into a default state).
+ // characters (such as shifting back into a default state). In our implementation,
+ // this is actually the first time ICU is told that we've run out of input.
CoderResult result = encoder.flush(bytes);
while (!result.isUnderflow()) {
if (result.isOverflow()) {
@@ -243,11 +226,10 @@ public class OutputStreamWriter extends Writer {
}
/**
- * Gets the name of the encoding that is used to convert characters to
- * bytes.
- *
- * @return the string describing the converter or {@code null} if this
- * writer is closed.
+ * Returns the historical name of the encoding used by this writer to convert characters to
+ * bytes, or null if this writer has been closed. Most callers should probably keep
+ * track of the String or Charset they passed in; this method may not return the same
+ * name.
*/
public String getEncoding() {
if (encoder == null) {
diff --git a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
index 3b42f85..84c6a74 100644
--- a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
+++ b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
@@ -22,6 +22,7 @@ import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.HashMap;
import java.util.Map;
+import libcore.base.EmptyArray;
public final class CharsetEncoderICU extends CharsetEncoder {
private static final Map<String, byte[]> DEFAULT_REPLACEMENTS = new HashMap<String, byte[]>();
@@ -42,14 +43,12 @@ public final class CharsetEncoderICU extends CharsetEncoder {
private static final int INPUT_OFFSET = 0;
private static final int OUTPUT_OFFSET = 1;
private static final int INVALID_CHARS = 2;
- private static final int INPUT_HELD = 3;
/*
* data[INPUT_OFFSET] = on input contains the start of input and on output the number of input chars consumed
* data[OUTPUT_OFFSET] = on input contains the start of output and on output the number of output bytes written
* data[INVALID_CHARS] = number of invalid chars
- * data[INPUT_HELD] = number of input chars held in the converter's state
*/
- private int[] data = new int[4];
+ private int[] data = new int[3];
/* handle to the ICU converter that is opened */
private long converterHandle=0;
@@ -64,7 +63,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
private int inEnd;
private int outEnd;
private int ec;
- private int savedInputHeldLen;
public static CharsetEncoderICU newInstance(Charset cs, String icuCanonicalName) {
// This complexity is necessary to ensure that even if the constructor, superclass
@@ -101,13 +99,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
this.converterHandle = address;
}
- /**
- * Sets this encoders replacement string. Substitutes the string in output if an
- * unmappable or illegal sequence is encountered
- * @param newReplacement to replace the error chars with
- * @stable ICU 2.4
- */
- protected void implReplaceWith(byte[] newReplacement) {
+ @Override protected void implReplaceWith(byte[] newReplacement) {
if (converterHandle != 0) {
if (newReplacement.length > NativeConverter.getMaxBytesPerChar(converterHandle)) {
throw new IllegalArgumentException("Number of replacement Bytes are greater than max bytes per char");
@@ -116,23 +108,11 @@ public final class CharsetEncoderICU extends CharsetEncoder {
}
}
- /**
- * Sets the action to be taken if an illegal sequence is encountered
- * @param newAction action to be taken
- * @exception IllegalArgumentException
- * @stable ICU 2.4
- */
- protected void implOnMalformedInput(CodingErrorAction newAction) {
+ @Override protected void implOnMalformedInput(CodingErrorAction newAction) {
updateCallback();
}
- /**
- * Sets the action to be taken if an illegal sequence is encountered
- * @param newAction action to be taken
- * @exception IllegalArgumentException
- * @stable ICU 2.4
- */
- protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
+ @Override protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
updateCallback();
}
@@ -143,24 +123,23 @@ public final class CharsetEncoderICU extends CharsetEncoder {
}
}
- /**
- * Flushes any characters saved in the converter's internal buffer and
- * resets the converter.
- * @param out action to be taken
- * @return result of flushing action and completes the decoding all input.
- * Returns CoderResult.UNDERFLOW if the action succeeds.
- * @stable ICU 2.4
- */
- protected CoderResult implFlush(ByteBuffer out) {
+ @Override protected void implReset() {
+ NativeConverter.resetCharToByte(converterHandle);
+ data[INPUT_OFFSET] = 0;
+ data[OUTPUT_OFFSET] = 0;
+ data[INVALID_CHARS] = 0;
+ }
+
+ @Override protected CoderResult implFlush(ByteBuffer out) {
try {
+ // ICU needs to see an empty input.
+ input = EmptyArray.CHAR;
+ data[INPUT_OFFSET] = 0;
+
data[OUTPUT_OFFSET] = getArray(out);
- ec = NativeConverter.flushCharToByte(converterHandle,/* Handle to ICU Converter */
- output, /* output array of chars */
- outEnd, /* output index+1 to be written */
- data /* contains data, inOff,outOff */
- );
+ data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors.
- /* If we don't have room for the output, throw an exception*/
+ ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, true);
if (ErrorCode.isFailure(ec)) {
if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
return CoderResult.OVERFLOW;
@@ -179,51 +158,18 @@ public final class CharsetEncoderICU extends CharsetEncoder {
}
}
- /**
- * Resets the from Unicode mode of converter
- * @stable ICU 2.4
- */
- protected void implReset() {
- NativeConverter.resetCharToByte(converterHandle);
- data[INPUT_OFFSET] = 0;
- data[OUTPUT_OFFSET] = 0;
- data[INVALID_CHARS] = 0;
- data[INPUT_HELD] = 0;
- savedInputHeldLen = 0;
- }
-
- /**
- * Encodes one or more chars. The default behavior of the
- * converter is stop and report if an error in input stream is encountered.
- * To set different behavior use @see CharsetEncoder.onMalformedInput()
- * @param in buffer to decode
- * @param out buffer to populate with decoded result
- * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
- * action succeeds or more input is needed for completing the decoding action.
- * @stable ICU 2.4
- */
- protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
+ @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
if (!in.hasRemaining()) {
return CoderResult.UNDERFLOW;
}
data[INPUT_OFFSET] = getArray(in);
data[OUTPUT_OFFSET]= getArray(out);
- data[INPUT_HELD] = 0;
data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors.
try {
- /* do the conversion */
- ec = NativeConverter.encode(converterHandle,/* Handle to ICU Converter */
- input, /* input array of bytes */
- inEnd, /* last index+1 to be converted */
- output, /* output array of chars */
- outEnd, /* output index+1 to be written */
- data, /* contains data, inOff,outOff */
- false /* don't flush the data */
- );
+ ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, false);
if (ErrorCode.isFailure(ec)) {
- /* If we don't have room for the output return error */
if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
return CoderResult.OVERFLOW;
} else if (ec == ErrorCode.U_INVALID_CHAR_FOUND) {
@@ -231,6 +177,8 @@ public final class CharsetEncoderICU extends CharsetEncoder {
} else if (ec == ErrorCode.U_ILLEGAL_CHAR_FOUND) {
// in.position(in.position() - 1);
return CoderResult.malformedForLength(data[INVALID_CHARS]);
+ } else {
+ throw new AssertionError("unexpected failure: " + ec);
}
}
return CoderResult.UNDERFLOW;
@@ -249,11 +197,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
return NativeConverter.canEncode(converterHandle, codePoint);
}
- /**
- * Releases the system resources by cleanly closing ICU converter opened
- * @exception Throwable exception thrown by super class' finalize method
- * @stable ICU 2.4
- */
@Override protected void finalize() throws Throwable {
try {
NativeConverter.closeConverter(converterHandle);
@@ -263,9 +206,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
}
}
- //------------------------------------------
- // private utility methods
- //------------------------------------------
private int getArray(ByteBuffer out) {
if (out.hasArray()) {
output = out.array();
@@ -276,10 +216,8 @@ public final class CharsetEncoderICU extends CharsetEncoder {
if (allocatedOutput == null || (outEnd > allocatedOutput.length)) {
allocatedOutput = new byte[outEnd];
}
+ // The array's start position is 0
output = allocatedOutput;
- //since the new
- // buffer start position
- // is 0
return 0;
}
}
@@ -288,27 +226,23 @@ public final class CharsetEncoderICU extends CharsetEncoder {
if (in.hasArray()) {
input = in.array();
inEnd = in.arrayOffset() + in.limit();
- return in.arrayOffset() + in.position() + savedInputHeldLen;/*exclude the number fo bytes held in previous conversion*/
+ return in.arrayOffset() + in.position();
} else {
inEnd = in.remaining();
- if (allocatedInput == null || (inEnd > allocatedInput.length)) {
+ if (allocatedInput == null || inEnd > allocatedInput.length) {
allocatedInput = new char[inEnd];
}
- input = allocatedInput;
- // save the current position
+ // Copy the input buffer into the allocated array.
int pos = in.position();
- in.get(input,0,inEnd);
- // reset the position
+ in.get(allocatedInput, 0, inEnd);
in.position(pos);
- // the start position
- // of the new buffer
- // is whatever is savedInputLen
- return savedInputHeldLen;
+ // The array's start position is 0
+ input = allocatedInput;
+ return 0;
}
-
}
- private void setPosition(ByteBuffer out) {
+ private void setPosition(ByteBuffer out) {
if (out.hasArray()) {
// in getArray method we accessed the
// array backing the buffer directly and wrote to
@@ -321,23 +255,16 @@ public final class CharsetEncoderICU extends CharsetEncoder {
// release reference to output array, which may not be ours
output = null;
}
- private void setPosition(CharBuffer in){
+ private void setPosition(CharBuffer in) {
// Slightly rewired original code to make it cleaner. Also
// added a fix for the problem where input characters got
// lost when invalid characters were encountered. Not sure
// what happens when data[INVALID_CHARS] is > 1, though,
// since we never saw that happening.
- int len = in.position() + data[INPUT_OFFSET] + savedInputHeldLen;
+ int len = in.position() + data[INPUT_OFFSET];
len -= data[INVALID_CHARS]; // Otherwise position becomes wrong.
in.position(len);
- savedInputHeldLen = data[INPUT_HELD];
- // was there input held in the previous invocation of encodeLoop
- // that resulted in output in this invocation?
- if(!(data[OUTPUT_OFFSET]>0 && savedInputHeldLen>0)){
- in.position(in.position() - savedInputHeldLen);
- }
-
// release reference to input array, which may not be ours
input = null;
}
diff --git a/luni/src/main/java/libcore/icu/NativeConverter.java b/luni/src/main/java/libcore/icu/NativeConverter.java
index 6165c61..2aab65f 100644
--- a/luni/src/main/java/libcore/icu/NativeConverter.java
+++ b/luni/src/main/java/libcore/icu/NativeConverter.java
@@ -66,21 +66,6 @@ public final class NativeConverter {
* Writes any remaining output to the output buffer and resets the
* converter to its initial state.
*
- * @param converterHandle Address of converter object created by C code
- * @param output byte array to receive flushed output.
- * @param outEnd stop writing to output array at this offset (exclusive).
- * @return int error code returned by ICU
- * @param data integer array containing the following data
- * data[0] = inputOffset
- * data[1] = outputOffset
- * @internal ICU 2.4
- */
- public static native int flushCharToByte(long converterHandle, byte[] output, int outEnd, int[] data);
-
- /**
- * Writes any remaining output to the output buffer and resets the
- * converter to its initial state.
- *
* @param converterHandle Address of converter object created by the native code
* @param output char array to receive flushed output.
* @param outEnd stop writing to output array at this offset (exclusive).
diff --git a/luni/src/main/native/NativeConverter.cpp b/luni/src/main/native/NativeConverter.cpp
index 7587fc6..9679f57 100644
--- a/luni/src/main/native/NativeConverter.cpp
+++ b/luni/src/main/native/NativeConverter.cpp
@@ -108,17 +108,11 @@ static jint NativeConverter_encode(JNIEnv* env, jclass, jlong address,
*sourceOffset = (mySource - uSource.get()) - *sourceOffset;
*targetOffset = (reinterpret_cast<jbyte*>(cTarget) - uTarget.get()) - *targetOffset;
- // Check how much more input is necessary to complete what's in the converter's internal buffer.
- UErrorCode minorErrorCode = U_ZERO_ERROR;
- int32_t pending = ucnv_fromUCountPending(cnv, &minorErrorCode);
- if (U_SUCCESS(minorErrorCode)) {
- myData[3] = pending;
- }
-
// If there was an error, count the problematic characters.
if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND) {
int8_t len = 32;
UChar invalidUChars[32];
+ UErrorCode minorErrorCode = U_ZERO_ERROR;
ucnv_getInvalidUChars(cnv, invalidUChars, &len, &minorErrorCode);
if (U_SUCCESS(minorErrorCode)) {
myData[2] = len;
@@ -233,32 +227,6 @@ static jint NativeConverter_flushByteToChar(JNIEnv* env, jclass, jlong address,
return errorCode;
}
-static jint NativeConverter_flushCharToByte(JNIEnv* env, jclass, jlong address,
- jbyteArray target, jint targetEnd, jintArray data) {
- UConverter* cnv = toUConverter(address);
- if (cnv == NULL) {
- return U_ILLEGAL_ARGUMENT_ERROR;
- }
- ScopedByteArrayRW uTarget(env, target);
- if (uTarget.get() == NULL) {
- return U_ILLEGAL_ARGUMENT_ERROR;
- }
- ScopedIntArrayRW myData(env, data);
- if (myData.get() == NULL) {
- return U_ILLEGAL_ARGUMENT_ERROR;
- }
- jchar source = '\0';
- jint* targetOffset = &myData[1];
- const jchar* mySource = &source;
- const UChar* mySourceLimit= &source;
- char* cTarget = reinterpret_cast<char*>(uTarget.get() + *targetOffset);
- const char* cTargetLimit = reinterpret_cast<char*>(uTarget.get() + targetEnd);
- UErrorCode errorCode = U_ZERO_ERROR;
- ucnv_fromUnicode(cnv, &cTarget, cTargetLimit, &mySource, mySourceLimit, NULL, TRUE, &errorCode);
- *targetOffset = reinterpret_cast<jbyte*>(cTarget) - uTarget.get() - *targetOffset;
- return errorCode;
-}
-
static jboolean NativeConverter_canEncode(JNIEnv*, jclass, jlong address, jint codeUnit) {
UErrorCode errorCode = U_ZERO_ERROR;
UConverter* cnv = toUConverter(address);
@@ -671,7 +639,6 @@ static JNINativeMethod gMethods[] = {
NATIVE_METHOD(NativeConverter, decode, "(J[BI[CI[IZ)I"),
NATIVE_METHOD(NativeConverter, encode, "(J[CI[BI[IZ)I"),
NATIVE_METHOD(NativeConverter, flushByteToChar, "(J[CI[I)I"),
- NATIVE_METHOD(NativeConverter, flushCharToByte, "(J[BI[I)I"),
NATIVE_METHOD(NativeConverter, getAvailableCharsetNames, "()[Ljava/lang/String;"),
NATIVE_METHOD(NativeConverter, getAveBytesPerChar, "(J)F"),
NATIVE_METHOD(NativeConverter, getAveCharsPerByte, "(J)F"),