Merge "Improve CharsetEncoder to handle surrogates gracefully." into dalvik-dev

author: Elliott Hughes <enh@google.com> 2011-02-01 13:58:06 -0800
committer: Android (Google) Code Review <android-gerrit@google.com> 2011-02-01 13:58:06 -0800
commit: aee004a114565d7b1f3464507ec26829b8230d98 (patch)
tree: f233f7e96e90d3c5946dc1ad743e14c6bb9ac0f7 /luni/src/main
parent: 6a75005c0547634e5179829c61eb03209197ceda (diff)
parent: 33604713c5c70f9e6cad61dee6eb628db666bb22 (diff)
download: libcore-aee004a114565d7b1f3464507ec26829b8230d98.zip
libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.gz
libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.bz2
4 files changed, 47 insertions, 186 deletions
diff --git a/luni/src/main/java/java/io/OutputStreamWriter.java b/luni/src/main/java/java/io/OutputStreamWriter.java
index c29885e..62f825d 100644
--- a/luni/src/main/java/java/io/OutputStreamWriter.java
+++ b/luni/src/main/java/java/io/OutputStreamWriter.java
@@ -43,7 +43,6 @@ public class OutputStreamWriter extends Writer {
     private CharsetEncoder encoder;
 
     private ByteBuffer bytes = ByteBuffer.allocate(8192);
-    private CharBuffer underflowChars;
 
     /**
      * Constructs a new OutputStreamWriter using {@code out} as the target
@@ -127,8 +126,8 @@ public class OutputStreamWriter extends Writer {
      * Closes this writer. This implementation flushes the buffer as well as the
      * target stream. The target stream is then closed and the resources for the
      * buffer and converter are released.
-     * <p>
-     * Only the first invocation of this method has any effect. Subsequent calls
+     *
+     * <p>Only the first invocation of this method has any effect. Subsequent calls
      * do nothing.
      *
      * @throws IOException
@@ -176,31 +175,12 @@ public class OutputStreamWriter extends Writer {
     }
 
     private void convert(CharBuffer chars) throws IOException {
-        // Do we have anything left over from the previous write?
-        if (underflowChars != null) {
-            // Move the first character from 'chars' into 'underflowChars' and try to encode that.
-            if (chars.hasRemaining()) {
-                underflowChars.put(chars.get());
-                underflowChars.flip();
-                CharBuffer cb = underflowChars;
-                underflowChars = null;
-                convert(cb);
-            }
-        }
-
         while (true) {
             CoderResult result = encoder.encode(chars, bytes, false);
             if (result.isOverflow()) {
                 // Make room and try again.
                 flushBytes(false);
                 continue;
-            } else if (result.isUnderflow() && chars.remaining() > 0) {
-                // Stash any remaining chars. This probably means we've seen half a surrogate
-                // pair in CharBuffer and need to see the next char before we know what to do.
-                // Believe it or not, CharsetEncoder doesn't keep that character as part of its
-                // internal state.
-                underflowChars = CharBuffer.allocate(chars.remaining() + 1);
-                underflowChars.put(chars);
             } else if (result.isError()) {
                 result.throwException();
             }
@@ -209,8 +189,10 @@ public class OutputStreamWriter extends Writer {
     }
 
     private void drainEncoder() throws IOException {
-        // TODO: is there any case where underflowChars is non-null and passing it to encode would
-        // make any difference?
+        // Strictly speaking, I think it's part of the CharsetEncoder contract that you call
+        // encode with endOfInput true before flushing. Our ICU-based implementations don't
+        // actually need this, and you'd hope that any reasonable implementation wouldn't either.
+        // CharsetEncoder.encode doesn't actually pass the boolean through to encodeLoop anyway!
         CharBuffer chars = CharBuffer.allocate(0);
         while (true) {
             CoderResult result = encoder.encode(chars, bytes, true);
@@ -224,7 +206,8 @@ public class OutputStreamWriter extends Writer {
         }
 
         // Some encoders (such as ISO-2022-JP) have stuff to write out after all the
-        // characters (such as shifting back into a default state).
+        // characters (such as shifting back into a default state). In our implementation,
+        // this is actually the first time ICU is told that we've run out of input.
         CoderResult result = encoder.flush(bytes);
         while (!result.isUnderflow()) {
             if (result.isOverflow()) {
@@ -243,11 +226,10 @@ public class OutputStreamWriter extends Writer {
     }
 
     /**
-     * Gets the name of the encoding that is used to convert characters to
-     * bytes.
-     *
-     * @return the string describing the converter or {@code null} if this
-     *         writer is closed.
+     * Returns the historical name of the encoding used by this writer to convert characters to
+     * bytes, or null if this writer has been closed. Most callers should probably keep
+     * track of the String or Charset they passed in; this method may not return the same
+     * name.
      */
     public String getEncoding() {
         if (encoder == null) {
diff --git a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
index 3b42f85..84c6a74 100644
--- a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
+++ b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java
@@ -22,6 +22,7 @@ import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
 import java.util.HashMap;
 import java.util.Map;
+import libcore.base.EmptyArray;
 
 public final class CharsetEncoderICU extends CharsetEncoder {
     private static final Map<String, byte[]> DEFAULT_REPLACEMENTS = new HashMap<String, byte[]>();
@@ -42,14 +43,12 @@ public final class CharsetEncoderICU extends CharsetEncoder {
     private static final int INPUT_OFFSET = 0;
     private static final int OUTPUT_OFFSET = 1;
     private static final int INVALID_CHARS = 2;
-    private static final int INPUT_HELD = 3;
     /*
      * data[INPUT_OFFSET]   = on input contains the start of input and on output the number of input chars consumed
      * data[OUTPUT_OFFSET]  = on input contains the start of output and on output the number of output bytes written
      * data[INVALID_CHARS]  = number of invalid chars
-     * data[INPUT_HELD]     = number of input chars held in the converter's state
      */
-    private int[] data = new int[4];
+    private int[] data = new int[3];
     /* handle to the ICU converter that is opened */
     private long converterHandle=0;
 
@@ -64,7 +63,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
     private int inEnd;
     private int outEnd;
     private int ec;
-    private int savedInputHeldLen;
 
     public static CharsetEncoderICU newInstance(Charset cs, String icuCanonicalName) {
         // This complexity is necessary to ensure that even if the constructor, superclass
@@ -101,13 +99,7 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         this.converterHandle = address;
     }
 
-    /**
-     * Sets this encoders replacement string. Substitutes the string in output if an
-     * unmappable or illegal sequence is encountered
-     * @param newReplacement to replace the error chars with
-     * @stable ICU 2.4
-     */
-    protected void implReplaceWith(byte[] newReplacement) {
+    @Override protected void implReplaceWith(byte[] newReplacement) {
         if (converterHandle != 0) {
             if (newReplacement.length > NativeConverter.getMaxBytesPerChar(converterHandle)) {
                 throw new IllegalArgumentException("Number of replacement Bytes are greater than max bytes per char");
@@ -116,23 +108,11 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         }
     }
 
-    /**
-     * Sets the action to be taken if an illegal sequence is encountered
-     * @param newAction action to be taken
-     * @exception IllegalArgumentException
-     * @stable ICU 2.4
-     */
-    protected void implOnMalformedInput(CodingErrorAction newAction) {
+    @Override protected void implOnMalformedInput(CodingErrorAction newAction) {
         updateCallback();
     }
 
-    /**
-     * Sets the action to be taken if an illegal sequence is encountered
-     * @param newAction action to be taken
-     * @exception IllegalArgumentException
-     * @stable ICU 2.4
-     */
-    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
+    @Override protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
         updateCallback();
     }
 
@@ -143,24 +123,23 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         }
     }
 
-    /**
-     * Flushes any characters saved in the converter's internal buffer and
-     * resets the converter.
-     * @param out action to be taken
-     * @return result of flushing action and completes the decoding all input.
-     *       Returns CoderResult.UNDERFLOW if the action succeeds.
-     * @stable ICU 2.4
-     */
-    protected CoderResult implFlush(ByteBuffer out) {
+    @Override protected void implReset() {
+        NativeConverter.resetCharToByte(converterHandle);
+        data[INPUT_OFFSET] = 0;
+        data[OUTPUT_OFFSET] = 0;
+        data[INVALID_CHARS] = 0;
+    }
+
+    @Override protected CoderResult implFlush(ByteBuffer out) {
         try {
+            // ICU needs to see an empty input.
+            input = EmptyArray.CHAR;
+            data[INPUT_OFFSET] = 0;
+
             data[OUTPUT_OFFSET] = getArray(out);
-            ec = NativeConverter.flushCharToByte(converterHandle,/* Handle to ICU Converter */
-                                                 output, /* output array of chars */
-                                                 outEnd, /* output index+1 to be written */
-                                                 data /* contains data, inOff,outOff */
-                                                );
+            data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors.
 
-            /* If we don't have room for the output, throw an exception*/
+            ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, true);
             if (ErrorCode.isFailure(ec)) {
                 if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
                     return CoderResult.OVERFLOW;
@@ -179,51 +158,18 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         }
     }
 
-    /**
-     * Resets the from Unicode mode of converter
-     * @stable ICU 2.4
-     */
-    protected void implReset() {
-        NativeConverter.resetCharToByte(converterHandle);
-        data[INPUT_OFFSET] = 0;
-        data[OUTPUT_OFFSET] = 0;
-        data[INVALID_CHARS] = 0;
-        data[INPUT_HELD] = 0;
-        savedInputHeldLen = 0;
-    }
-
-    /**
-     * Encodes one or more chars. The default behavior of the
-     * converter is stop and report if an error in input stream is encountered.
-     * To set different behavior use @see CharsetEncoder.onMalformedInput()
-     * @param in buffer to decode
-     * @param out buffer to populate with decoded result
-     * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
-     *       action succeeds or more input is needed for completing the decoding action.
-     * @stable ICU 2.4
-     */
-    protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
+    @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
         if (!in.hasRemaining()) {
             return CoderResult.UNDERFLOW;
         }
 
         data[INPUT_OFFSET] = getArray(in);
         data[OUTPUT_OFFSET]= getArray(out);
-        data[INPUT_HELD] = 0;
         data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors.
 
         try {
-            /* do the conversion */
-            ec = NativeConverter.encode(converterHandle,/* Handle to ICU Converter */
-                                        input, /* input array of bytes */
-                                        inEnd, /* last index+1 to be converted */
-                                        output, /* output array of chars */
-                                        outEnd, /* output index+1 to be written */
-                                        data, /* contains data, inOff,outOff */
-                                        false /* don't flush the data */
-                                        );
+            ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, false);
             if (ErrorCode.isFailure(ec)) {
-                /* If we don't have room for the output return error */
                 if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) {
                     return CoderResult.OVERFLOW;
                 } else if (ec == ErrorCode.U_INVALID_CHAR_FOUND) {
@@ -231,6 +177,8 @@ public final class CharsetEncoderICU extends CharsetEncoder {
                 } else if (ec == ErrorCode.U_ILLEGAL_CHAR_FOUND) {
                     // in.position(in.position() - 1);
                     return CoderResult.malformedForLength(data[INVALID_CHARS]);
+                } else {
+                    throw new AssertionError("unexpected failure: " + ec);
                 }
             }
             return CoderResult.UNDERFLOW;
@@ -249,11 +197,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         return NativeConverter.canEncode(converterHandle, codePoint);
     }
 
-    /**
-     * Releases the system resources by cleanly closing ICU converter opened
-     * @exception Throwable exception thrown by super class' finalize method
-     * @stable ICU 2.4
-     */
     @Override protected void finalize() throws Throwable {
         try {
             NativeConverter.closeConverter(converterHandle);
@@ -263,9 +206,6 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         }
     }
 
-    //------------------------------------------
-    // private utility methods
-    //------------------------------------------
     private int getArray(ByteBuffer out) {
         if (out.hasArray()) {
             output = out.array();
@@ -276,10 +216,8 @@ public final class CharsetEncoderICU extends CharsetEncoder {
             if (allocatedOutput == null || (outEnd > allocatedOutput.length)) {
                 allocatedOutput = new byte[outEnd];
             }
+            // The array's start position is 0
             output = allocatedOutput;
-            //since the new
-            // buffer start position
-            // is 0
             return 0;
         }
     }
@@ -288,27 +226,23 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         if (in.hasArray()) {
             input = in.array();
             inEnd = in.arrayOffset() + in.limit();
-            return in.arrayOffset() + in.position() + savedInputHeldLen;/*exclude the number fo bytes held in previous conversion*/
+            return in.arrayOffset() + in.position();
         } else {
             inEnd = in.remaining();
-            if (allocatedInput == null || (inEnd > allocatedInput.length)) {
+            if (allocatedInput == null || inEnd > allocatedInput.length) {
                 allocatedInput = new char[inEnd];
             }
-            input = allocatedInput;
-            // save the current position
+            // Copy the input buffer into the allocated array.
             int pos = in.position();
-            in.get(input,0,inEnd);
-            // reset the position
+            in.get(allocatedInput, 0, inEnd);
             in.position(pos);
-            // the start position
-            // of the new buffer
-            // is whatever is savedInputLen
-            return savedInputHeldLen;
+            // The array's start position is 0
+            input = allocatedInput;
+            return 0;
         }
-
     }
-    private void setPosition(ByteBuffer out) {
 
+    private void setPosition(ByteBuffer out) {
         if (out.hasArray()) {
             // in getArray method we accessed the
             // array backing the buffer directly and wrote to
@@ -321,23 +255,16 @@ public final class CharsetEncoderICU extends CharsetEncoder {
         // release reference to output array, which may not be ours
         output = null;
     }
-    private void setPosition(CharBuffer in){
 
+    private void setPosition(CharBuffer in) {
         // Slightly rewired original code to make it cleaner. Also
         // added a fix for the problem where input characters got
         // lost when invalid characters were encountered. Not sure
         // what happens when data[INVALID_CHARS] is > 1, though,
         // since we never saw that happening.
-        int len = in.position() + data[INPUT_OFFSET] + savedInputHeldLen;
+        int len = in.position() + data[INPUT_OFFSET];
         len -= data[INVALID_CHARS]; // Otherwise position becomes wrong.
         in.position(len);
-        savedInputHeldLen = data[INPUT_HELD];
-        // was there input held in the previous invocation of encodeLoop
-        // that resulted in output in this invocation?
-        if(!(data[OUTPUT_OFFSET]>0 && savedInputHeldLen>0)){
-            in.position(in.position() - savedInputHeldLen);
-        }
-
         // release reference to input array, which may not be ours
         input = null;
     }
diff --git a/luni/src/main/java/libcore/icu/NativeConverter.java b/luni/src/main/java/libcore/icu/NativeConverter.java
index 6165c61..2aab65f 100644
--- a/luni/src/main/java/libcore/icu/NativeConverter.java
+++ b/luni/src/main/java/libcore/icu/NativeConverter.java
@@ -66,21 +66,6 @@ public final class NativeConverter {
      * Writes any remaining output to the output buffer and resets the
      * converter to its initial state.
      *
-     * @param converterHandle Address of converter object created by C code
-     * @param output byte array to receive flushed output.
-     * @param outEnd stop writing to output array at this offset (exclusive).
-     * @return int error code returned by ICU
-     * @param data integer array containing the following data
-     *        data[0] = inputOffset
-     *        data[1] = outputOffset
-     * @internal ICU 2.4
-     */
-    public static native int flushCharToByte(long converterHandle, byte[] output, int outEnd, int[] data);
-
-    /**
-     * Writes any remaining output to the output buffer and resets the
-     * converter to its initial state.
-     *
      * @param converterHandle Address of converter object created by the native code
      * @param output char array to receive flushed output.
      * @param outEnd stop writing to output array at this offset (exclusive).
diff --git a/luni/src/main/native/NativeConverter.cpp b/luni/src/main/native/NativeConverter.cpp
index 7587fc6..9679f57 100644
--- a/luni/src/main/native/NativeConverter.cpp
+++ b/luni/src/main/native/NativeConverter.cpp
@@ -108,17 +108,11 @@ static jint NativeConverter_encode(JNIEnv* env, jclass, jlong address,
     *sourceOffset = (mySource - uSource.get()) - *sourceOffset;
     *targetOffset = (reinterpret_cast<jbyte*>(cTarget) - uTarget.get()) - *targetOffset;
 
-    // Check how much more input is necessary to complete what's in the converter's internal buffer.
-    UErrorCode minorErrorCode = U_ZERO_ERROR;
-    int32_t pending = ucnv_fromUCountPending(cnv, &minorErrorCode);
-    if (U_SUCCESS(minorErrorCode)) {
-        myData[3] = pending;
-    }
-
     // If there was an error, count the problematic characters.
     if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND) {
         int8_t len = 32;
         UChar invalidUChars[32];
+        UErrorCode minorErrorCode = U_ZERO_ERROR;
         ucnv_getInvalidUChars(cnv, invalidUChars, &len, &minorErrorCode);
         if (U_SUCCESS(minorErrorCode)) {
             myData[2] = len;
@@ -233,32 +227,6 @@ static jint NativeConverter_flushByteToChar(JNIEnv* env, jclass, jlong address,
     return errorCode;
 }
 
-static jint NativeConverter_flushCharToByte(JNIEnv* env, jclass, jlong address,
-        jbyteArray target, jint targetEnd, jintArray data) {
-    UConverter* cnv = toUConverter(address);
-    if (cnv == NULL) {
-        return U_ILLEGAL_ARGUMENT_ERROR;
-    }
-    ScopedByteArrayRW uTarget(env, target);
-    if (uTarget.get() == NULL) {
-        return U_ILLEGAL_ARGUMENT_ERROR;
-    }
-    ScopedIntArrayRW myData(env, data);
-    if (myData.get() == NULL) {
-        return U_ILLEGAL_ARGUMENT_ERROR;
-    }
-    jchar source = '\0';
-    jint* targetOffset = &myData[1];
-    const jchar* mySource = &source;
-    const UChar* mySourceLimit= &source;
-    char* cTarget = reinterpret_cast<char*>(uTarget.get() + *targetOffset);
-    const char* cTargetLimit = reinterpret_cast<char*>(uTarget.get() + targetEnd);
-    UErrorCode errorCode = U_ZERO_ERROR;
-    ucnv_fromUnicode(cnv, &cTarget, cTargetLimit, &mySource, mySourceLimit, NULL, TRUE, &errorCode);
-    *targetOffset = reinterpret_cast<jbyte*>(cTarget) - uTarget.get() - *targetOffset;
-    return errorCode;
-}
-
 static jboolean NativeConverter_canEncode(JNIEnv*, jclass, jlong address, jint codeUnit) {
     UErrorCode errorCode = U_ZERO_ERROR;
     UConverter* cnv = toUConverter(address);
@@ -671,7 +639,6 @@ static JNINativeMethod gMethods[] = {
     NATIVE_METHOD(NativeConverter, decode, "(J[BI[CI[IZ)I"),
     NATIVE_METHOD(NativeConverter, encode, "(J[CI[BI[IZ)I"),
     NATIVE_METHOD(NativeConverter, flushByteToChar, "(J[CI[I)I"),
-    NATIVE_METHOD(NativeConverter, flushCharToByte, "(J[BI[I)I"),
     NATIVE_METHOD(NativeConverter, getAvailableCharsetNames, "()[Ljava/lang/String;"),
     NATIVE_METHOD(NativeConverter, getAveBytesPerChar, "(J)F"),
     NATIVE_METHOD(NativeConverter, getAveCharsPerByte, "(J)F"),
author	Elliott Hughes <enh@google.com>	2011-02-01 13:58:06 -0800
committer	Android (Google) Code Review <android-gerrit@google.com>	2011-02-01 13:58:06 -0800
commit	aee004a114565d7b1f3464507ec26829b8230d98 (patch)
tree	f233f7e96e90d3c5946dc1ad743e14c6bb9ac0f7 /luni/src/main
parent	6a75005c0547634e5179829c61eb03209197ceda (diff)
parent	33604713c5c70f9e6cad61dee6eb628db666bb22 (diff)
download	libcore-aee004a114565d7b1f3464507ec26829b8230d98.zip libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.gz libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.bz2