diff options
author | Elliott Hughes <enh@google.com> | 2011-02-01 13:58:06 -0800 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2011-02-01 13:58:06 -0800 |
commit | aee004a114565d7b1f3464507ec26829b8230d98 (patch) | |
tree | f233f7e96e90d3c5946dc1ad743e14c6bb9ac0f7 /luni/src/main | |
parent | 6a75005c0547634e5179829c61eb03209197ceda (diff) | |
parent | 33604713c5c70f9e6cad61dee6eb628db666bb22 (diff) | |
download | libcore-aee004a114565d7b1f3464507ec26829b8230d98.zip libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.gz libcore-aee004a114565d7b1f3464507ec26829b8230d98.tar.bz2 |
Merge "Improve CharsetEncoder to handle surrogates gracefully." into dalvik-dev
Diffstat (limited to 'luni/src/main')
-rw-r--r-- | luni/src/main/java/java/io/OutputStreamWriter.java | 42 | ||||
-rw-r--r-- | luni/src/main/java/libcore/icu/CharsetEncoderICU.java | 141 | ||||
-rw-r--r-- | luni/src/main/java/libcore/icu/NativeConverter.java | 15 | ||||
-rw-r--r-- | luni/src/main/native/NativeConverter.cpp | 35 |
4 files changed, 47 insertions, 186 deletions
diff --git a/luni/src/main/java/java/io/OutputStreamWriter.java b/luni/src/main/java/java/io/OutputStreamWriter.java index c29885e..62f825d 100644 --- a/luni/src/main/java/java/io/OutputStreamWriter.java +++ b/luni/src/main/java/java/io/OutputStreamWriter.java @@ -43,7 +43,6 @@ public class OutputStreamWriter extends Writer { private CharsetEncoder encoder; private ByteBuffer bytes = ByteBuffer.allocate(8192); - private CharBuffer underflowChars; /** * Constructs a new OutputStreamWriter using {@code out} as the target @@ -127,8 +126,8 @@ public class OutputStreamWriter extends Writer { * Closes this writer. This implementation flushes the buffer as well as the * target stream. The target stream is then closed and the resources for the * buffer and converter are released. - * <p> - * Only the first invocation of this method has any effect. Subsequent calls + * + * <p>Only the first invocation of this method has any effect. Subsequent calls * do nothing. * * @throws IOException @@ -176,31 +175,12 @@ public class OutputStreamWriter extends Writer { } private void convert(CharBuffer chars) throws IOException { - // Do we have anything left over from the previous write? - if (underflowChars != null) { - // Move the first character from 'chars' into 'underflowChars' and try to encode that. - if (chars.hasRemaining()) { - underflowChars.put(chars.get()); - underflowChars.flip(); - CharBuffer cb = underflowChars; - underflowChars = null; - convert(cb); - } - } - while (true) { CoderResult result = encoder.encode(chars, bytes, false); if (result.isOverflow()) { // Make room and try again. flushBytes(false); continue; - } else if (result.isUnderflow() && chars.remaining() > 0) { - // Stash any remaining chars. This probably means we've seen half a surrogate - // pair in CharBuffer and need to see the next char before we know what to do. - // Believe it or not, CharsetEncoder doesn't keep that character as part of its - // internal state. - underflowChars = CharBuffer.allocate(chars.remaining() + 1); - underflowChars.put(chars); } else if (result.isError()) { result.throwException(); } @@ -209,8 +189,10 @@ public class OutputStreamWriter extends Writer { } private void drainEncoder() throws IOException { - // TODO: is there any case where underflowChars is non-null and passing it to encode would - // make any difference? + // Strictly speaking, I think it's part of the CharsetEncoder contract that you call + // encode with endOfInput true before flushing. Our ICU-based implementations don't + // actually need this, and you'd hope that any reasonable implementation wouldn't either. + // CharsetEncoder.encode doesn't actually pass the boolean through to encodeLoop anyway! CharBuffer chars = CharBuffer.allocate(0); while (true) { CoderResult result = encoder.encode(chars, bytes, true); @@ -224,7 +206,8 @@ public class OutputStreamWriter extends Writer { } // Some encoders (such as ISO-2022-JP) have stuff to write out after all the - // characters (such as shifting back into a default state). + // characters (such as shifting back into a default state). In our implementation, + // this is actually the first time ICU is told that we've run out of input. CoderResult result = encoder.flush(bytes); while (!result.isUnderflow()) { if (result.isOverflow()) { @@ -243,11 +226,10 @@ public class OutputStreamWriter extends Writer { } /** - * Gets the name of the encoding that is used to convert characters to - * bytes. - * - * @return the string describing the converter or {@code null} if this - * writer is closed. + * Returns the historical name of the encoding used by this writer to convert characters to + * bytes, or null if this writer has been closed. Most callers should probably keep + * track of the String or Charset they passed in; this method may not return the same + * name. */ public String getEncoding() { if (encoder == null) { diff --git a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java index 3b42f85..84c6a74 100644 --- a/luni/src/main/java/libcore/icu/CharsetEncoderICU.java +++ b/luni/src/main/java/libcore/icu/CharsetEncoderICU.java @@ -22,6 +22,7 @@ import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.util.HashMap; import java.util.Map; +import libcore.base.EmptyArray; public final class CharsetEncoderICU extends CharsetEncoder { private static final Map<String, byte[]> DEFAULT_REPLACEMENTS = new HashMap<String, byte[]>(); @@ -42,14 +43,12 @@ public final class CharsetEncoderICU extends CharsetEncoder { private static final int INPUT_OFFSET = 0; private static final int OUTPUT_OFFSET = 1; private static final int INVALID_CHARS = 2; - private static final int INPUT_HELD = 3; /* * data[INPUT_OFFSET] = on input contains the start of input and on output the number of input chars consumed * data[OUTPUT_OFFSET] = on input contains the start of output and on output the number of output bytes written * data[INVALID_CHARS] = number of invalid chars - * data[INPUT_HELD] = number of input chars held in the converter's state */ - private int[] data = new int[4]; + private int[] data = new int[3]; /* handle to the ICU converter that is opened */ private long converterHandle=0; @@ -64,7 +63,6 @@ public final class CharsetEncoderICU extends CharsetEncoder { private int inEnd; private int outEnd; private int ec; - private int savedInputHeldLen; public static CharsetEncoderICU newInstance(Charset cs, String icuCanonicalName) { // This complexity is necessary to ensure that even if the constructor, superclass @@ -101,13 +99,7 @@ public final class CharsetEncoderICU extends CharsetEncoder { this.converterHandle = address; } - /** - * Sets this encoders replacement string. Substitutes the string in output if an - * unmappable or illegal sequence is encountered - * @param newReplacement to replace the error chars with - * @stable ICU 2.4 - */ - protected void implReplaceWith(byte[] newReplacement) { + @Override protected void implReplaceWith(byte[] newReplacement) { if (converterHandle != 0) { if (newReplacement.length > NativeConverter.getMaxBytesPerChar(converterHandle)) { throw new IllegalArgumentException("Number of replacement Bytes are greater than max bytes per char"); @@ -116,23 +108,11 @@ public final class CharsetEncoderICU extends CharsetEncoder { } } - /** - * Sets the action to be taken if an illegal sequence is encountered - * @param newAction action to be taken - * @exception IllegalArgumentException - * @stable ICU 2.4 - */ - protected void implOnMalformedInput(CodingErrorAction newAction) { + @Override protected void implOnMalformedInput(CodingErrorAction newAction) { updateCallback(); } - /** - * Sets the action to be taken if an illegal sequence is encountered - * @param newAction action to be taken - * @exception IllegalArgumentException - * @stable ICU 2.4 - */ - protected void implOnUnmappableCharacter(CodingErrorAction newAction) { + @Override protected void implOnUnmappableCharacter(CodingErrorAction newAction) { updateCallback(); } @@ -143,24 +123,23 @@ public final class CharsetEncoderICU extends CharsetEncoder { } } - /** - * Flushes any characters saved in the converter's internal buffer and - * resets the converter. - * @param out action to be taken - * @return result of flushing action and completes the decoding all input. - * Returns CoderResult.UNDERFLOW if the action succeeds. - * @stable ICU 2.4 - */ - protected CoderResult implFlush(ByteBuffer out) { + @Override protected void implReset() { + NativeConverter.resetCharToByte(converterHandle); + data[INPUT_OFFSET] = 0; + data[OUTPUT_OFFSET] = 0; + data[INVALID_CHARS] = 0; + } + + @Override protected CoderResult implFlush(ByteBuffer out) { try { + // ICU needs to see an empty input. + input = EmptyArray.CHAR; + data[INPUT_OFFSET] = 0; + data[OUTPUT_OFFSET] = getArray(out); - ec = NativeConverter.flushCharToByte(converterHandle,/* Handle to ICU Converter */ - output, /* output array of chars */ - outEnd, /* output index+1 to be written */ - data /* contains data, inOff,outOff */ - ); + data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors. - /* If we don't have room for the output, throw an exception*/ + ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, true); if (ErrorCode.isFailure(ec)) { if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) { return CoderResult.OVERFLOW; @@ -179,51 +158,18 @@ public final class CharsetEncoderICU extends CharsetEncoder { } } - /** - * Resets the from Unicode mode of converter - * @stable ICU 2.4 - */ - protected void implReset() { - NativeConverter.resetCharToByte(converterHandle); - data[INPUT_OFFSET] = 0; - data[OUTPUT_OFFSET] = 0; - data[INVALID_CHARS] = 0; - data[INPUT_HELD] = 0; - savedInputHeldLen = 0; - } - - /** - * Encodes one or more chars. The default behavior of the - * converter is stop and report if an error in input stream is encountered. - * To set different behavior use @see CharsetEncoder.onMalformedInput() - * @param in buffer to decode - * @param out buffer to populate with decoded result - * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding - * action succeeds or more input is needed for completing the decoding action. - * @stable ICU 2.4 - */ - protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { + @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { if (!in.hasRemaining()) { return CoderResult.UNDERFLOW; } data[INPUT_OFFSET] = getArray(in); data[OUTPUT_OFFSET]= getArray(out); - data[INPUT_HELD] = 0; data[INVALID_CHARS] = 0; // Make sure we don't see earlier errors. try { - /* do the conversion */ - ec = NativeConverter.encode(converterHandle,/* Handle to ICU Converter */ - input, /* input array of bytes */ - inEnd, /* last index+1 to be converted */ - output, /* output array of chars */ - outEnd, /* output index+1 to be written */ - data, /* contains data, inOff,outOff */ - false /* don't flush the data */ - ); + ec = NativeConverter.encode(converterHandle, input, inEnd, output, outEnd, data, false); if (ErrorCode.isFailure(ec)) { - /* If we don't have room for the output return error */ if (ec == ErrorCode.U_BUFFER_OVERFLOW_ERROR) { return CoderResult.OVERFLOW; } else if (ec == ErrorCode.U_INVALID_CHAR_FOUND) { @@ -231,6 +177,8 @@ public final class CharsetEncoderICU extends CharsetEncoder { } else if (ec == ErrorCode.U_ILLEGAL_CHAR_FOUND) { // in.position(in.position() - 1); return CoderResult.malformedForLength(data[INVALID_CHARS]); + } else { + throw new AssertionError("unexpected failure: " + ec); } } return CoderResult.UNDERFLOW; @@ -249,11 +197,6 @@ public final class CharsetEncoderICU extends CharsetEncoder { return NativeConverter.canEncode(converterHandle, codePoint); } - /** - * Releases the system resources by cleanly closing ICU converter opened - * @exception Throwable exception thrown by super class' finalize method - * @stable ICU 2.4 - */ @Override protected void finalize() throws Throwable { try { NativeConverter.closeConverter(converterHandle); @@ -263,9 +206,6 @@ public final class CharsetEncoderICU extends CharsetEncoder { } } - //------------------------------------------ - // private utility methods - //------------------------------------------ private int getArray(ByteBuffer out) { if (out.hasArray()) { output = out.array(); @@ -276,10 +216,8 @@ public final class CharsetEncoderICU extends CharsetEncoder { if (allocatedOutput == null || (outEnd > allocatedOutput.length)) { allocatedOutput = new byte[outEnd]; } + // The array's start position is 0 output = allocatedOutput; - //since the new - // buffer start position - // is 0 return 0; } } @@ -288,27 +226,23 @@ public final class CharsetEncoderICU extends CharsetEncoder { if (in.hasArray()) { input = in.array(); inEnd = in.arrayOffset() + in.limit(); - return in.arrayOffset() + in.position() + savedInputHeldLen;/*exclude the number fo bytes held in previous conversion*/ + return in.arrayOffset() + in.position(); } else { inEnd = in.remaining(); - if (allocatedInput == null || (inEnd > allocatedInput.length)) { + if (allocatedInput == null || inEnd > allocatedInput.length) { allocatedInput = new char[inEnd]; } - input = allocatedInput; - // save the current position + // Copy the input buffer into the allocated array. int pos = in.position(); - in.get(input,0,inEnd); - // reset the position + in.get(allocatedInput, 0, inEnd); in.position(pos); - // the start position - // of the new buffer - // is whatever is savedInputLen - return savedInputHeldLen; + // The array's start position is 0 + input = allocatedInput; + return 0; } - } - private void setPosition(ByteBuffer out) { + private void setPosition(ByteBuffer out) { if (out.hasArray()) { // in getArray method we accessed the // array backing the buffer directly and wrote to @@ -321,23 +255,16 @@ public final class CharsetEncoderICU extends CharsetEncoder { // release reference to output array, which may not be ours output = null; } - private void setPosition(CharBuffer in){ + private void setPosition(CharBuffer in) { // Slightly rewired original code to make it cleaner. Also // added a fix for the problem where input characters got // lost when invalid characters were encountered. Not sure // what happens when data[INVALID_CHARS] is > 1, though, // since we never saw that happening. - int len = in.position() + data[INPUT_OFFSET] + savedInputHeldLen; + int len = in.position() + data[INPUT_OFFSET]; len -= data[INVALID_CHARS]; // Otherwise position becomes wrong. in.position(len); - savedInputHeldLen = data[INPUT_HELD]; - // was there input held in the previous invocation of encodeLoop - // that resulted in output in this invocation? - if(!(data[OUTPUT_OFFSET]>0 && savedInputHeldLen>0)){ - in.position(in.position() - savedInputHeldLen); - } - // release reference to input array, which may not be ours input = null; } diff --git a/luni/src/main/java/libcore/icu/NativeConverter.java b/luni/src/main/java/libcore/icu/NativeConverter.java index 6165c61..2aab65f 100644 --- a/luni/src/main/java/libcore/icu/NativeConverter.java +++ b/luni/src/main/java/libcore/icu/NativeConverter.java @@ -66,21 +66,6 @@ public final class NativeConverter { * Writes any remaining output to the output buffer and resets the * converter to its initial state. * - * @param converterHandle Address of converter object created by C code - * @param output byte array to receive flushed output. - * @param outEnd stop writing to output array at this offset (exclusive). - * @return int error code returned by ICU - * @param data integer array containing the following data - * data[0] = inputOffset - * data[1] = outputOffset - * @internal ICU 2.4 - */ - public static native int flushCharToByte(long converterHandle, byte[] output, int outEnd, int[] data); - - /** - * Writes any remaining output to the output buffer and resets the - * converter to its initial state. - * * @param converterHandle Address of converter object created by the native code * @param output char array to receive flushed output. * @param outEnd stop writing to output array at this offset (exclusive). diff --git a/luni/src/main/native/NativeConverter.cpp b/luni/src/main/native/NativeConverter.cpp index 7587fc6..9679f57 100644 --- a/luni/src/main/native/NativeConverter.cpp +++ b/luni/src/main/native/NativeConverter.cpp @@ -108,17 +108,11 @@ static jint NativeConverter_encode(JNIEnv* env, jclass, jlong address, *sourceOffset = (mySource - uSource.get()) - *sourceOffset; *targetOffset = (reinterpret_cast<jbyte*>(cTarget) - uTarget.get()) - *targetOffset; - // Check how much more input is necessary to complete what's in the converter's internal buffer. - UErrorCode minorErrorCode = U_ZERO_ERROR; - int32_t pending = ucnv_fromUCountPending(cnv, &minorErrorCode); - if (U_SUCCESS(minorErrorCode)) { - myData[3] = pending; - } - // If there was an error, count the problematic characters. if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND) { int8_t len = 32; UChar invalidUChars[32]; + UErrorCode minorErrorCode = U_ZERO_ERROR; ucnv_getInvalidUChars(cnv, invalidUChars, &len, &minorErrorCode); if (U_SUCCESS(minorErrorCode)) { myData[2] = len; @@ -233,32 +227,6 @@ static jint NativeConverter_flushByteToChar(JNIEnv* env, jclass, jlong address, return errorCode; } -static jint NativeConverter_flushCharToByte(JNIEnv* env, jclass, jlong address, - jbyteArray target, jint targetEnd, jintArray data) { - UConverter* cnv = toUConverter(address); - if (cnv == NULL) { - return U_ILLEGAL_ARGUMENT_ERROR; - } - ScopedByteArrayRW uTarget(env, target); - if (uTarget.get() == NULL) { - return U_ILLEGAL_ARGUMENT_ERROR; - } - ScopedIntArrayRW myData(env, data); - if (myData.get() == NULL) { - return U_ILLEGAL_ARGUMENT_ERROR; - } - jchar source = '\0'; - jint* targetOffset = &myData[1]; - const jchar* mySource = &source; - const UChar* mySourceLimit= &source; - char* cTarget = reinterpret_cast<char*>(uTarget.get() + *targetOffset); - const char* cTargetLimit = reinterpret_cast<char*>(uTarget.get() + targetEnd); - UErrorCode errorCode = U_ZERO_ERROR; - ucnv_fromUnicode(cnv, &cTarget, cTargetLimit, &mySource, mySourceLimit, NULL, TRUE, &errorCode); - *targetOffset = reinterpret_cast<jbyte*>(cTarget) - uTarget.get() - *targetOffset; - return errorCode; -} - static jboolean NativeConverter_canEncode(JNIEnv*, jclass, jlong address, jint codeUnit) { UErrorCode errorCode = U_ZERO_ERROR; UConverter* cnv = toUConverter(address); @@ -671,7 +639,6 @@ static JNINativeMethod gMethods[] = { NATIVE_METHOD(NativeConverter, decode, "(J[BI[CI[IZ)I"), NATIVE_METHOD(NativeConverter, encode, "(J[CI[BI[IZ)I"), NATIVE_METHOD(NativeConverter, flushByteToChar, "(J[CI[I)I"), - NATIVE_METHOD(NativeConverter, flushCharToByte, "(J[BI[I)I"), NATIVE_METHOD(NativeConverter, getAvailableCharsetNames, "()[Ljava/lang/String;"), NATIVE_METHOD(NativeConverter, getAveBytesPerChar, "(J)F"), NATIVE_METHOD(NativeConverter, getAveCharsPerByte, "(J)F"), |