diff options
-rw-r--r-- | luni/src/test/java/libcore/xml/DomTest.java | 31 |
-rw-r--r-- | luni/src/test/java/libcore/xml/PullParserTest.java | 27 |
-rw-r--r-- | xml/src/main/java/org/kxml2/io/KXmlParser.java | 169 |
3 files changed, 146 insertions, 81 deletions
diff --git a/luni/src/test/java/libcore/xml/DomTest.java b/luni/src/test/java/libcore/xml/DomTest.java index 4b97c06..f27a1a4 100644 --- a/luni/src/test/java/libcore/xml/DomTest.java +++ b/luni/src/test/java/libcore/xml/DomTest.java @@ -17,6 +17,7 @@ package libcore.xml; import dalvik.annotation.KnownFailure; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -1656,6 +1657,36 @@ public class DomTest extends TestCase { assertNull(text.getNextSibling()); } + public void testBomAndByteInput() throws Exception { + byte[] xml = { + (byte) 0xef, (byte) 0xbb, (byte) 0xbf, + '<', 'i', 'n', 'p', 'u', 't', '/', '>' + }; + document = builder.parse(new InputSource(new ByteArrayInputStream(xml))); + assertEquals("input", document.getDocumentElement().getNodeName()); + } + + public void testBomAndByteInputWithExplicitCharset() throws Exception { + byte[] xml = { + (byte) 0xef, (byte) 0xbb, (byte) 0xbf, + '<', 'i', 'n', 'p', 'u', 't', '/', '>' + }; + InputSource inputSource = new InputSource(new ByteArrayInputStream(xml)); + inputSource.setEncoding("UTF-8"); + document = builder.parse(inputSource); + assertEquals("input", document.getDocumentElement().getNodeName()); + } + + public void testBomAndCharacterInput() throws Exception { + InputSource inputSource = new InputSource(new StringReader("\ufeff<input/>")); + inputSource.setEncoding("UTF-8"); + try { + builder.parse(inputSource); + fail(); + } catch (SAXException expected) { + } + } + private class RecordingHandler implements UserDataHandler { final Set<String> calls = new HashSet<String>(); public void handle(short operation, String key, Object data, Node src, Node dst) { diff --git a/luni/src/test/java/libcore/xml/PullParserTest.java b/luni/src/test/java/libcore/xml/PullParserTest.java index 06a40fd..fa00fe5 100644 --- a/luni/src/test/java/libcore/xml/PullParserTest.java +++ b/luni/src/test/java/libcore/xml/PullParserTest.java @@ -17,7 +17,6 @@ package 
libcore.xml; import java.io.ByteArrayInputStream; -import java.io.IOException; import java.io.StringReader; import junit.framework.TestCase; import org.xmlpull.v1.XmlPullParser; @@ -695,6 +694,32 @@ public abstract class PullParserTest extends TestCase { assertParseFailure("not xml"); } + public void testBomAndByteInput() throws Exception { + byte[] xml = "\ufeff<?xml version='1.0'?><input/>".getBytes("UTF-8"); + XmlPullParser parser = newPullParser(); + parser.setInput(new ByteArrayInputStream(xml), null); + assertEquals(XmlPullParser.START_TAG, parser.next()); + assertEquals("input", parser.getName()); + assertEquals(XmlPullParser.END_TAG, parser.next()); + assertEquals("input", parser.getName()); + assertEquals(XmlPullParser.END_DOCUMENT, parser.next()); + } + + public void testBomAndByteInputWithExplicitCharset() throws Exception { + byte[] xml = "\ufeff<?xml version='1.0'?><input/>".getBytes("UTF-8"); + XmlPullParser parser = newPullParser(); + parser.setInput(new ByteArrayInputStream(xml), "UTF-8"); + assertEquals(XmlPullParser.START_TAG, parser.next()); + assertEquals("input", parser.getName()); + assertEquals(XmlPullParser.END_TAG, parser.next()); + assertEquals("input", parser.getName()); + assertEquals(XmlPullParser.END_DOCUMENT, parser.next()); + } + + public void testBomAndCharacterInput() throws Exception { + assertParseFailure("\ufeff<?xml version='1.0'?><input/>"); + } + private void assertParseFailure(String xml) throws Exception { XmlPullParser parser = newPullParser(); parser.setInput(new StringReader(xml)); diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java index 7a2d052..2be8d36 100644 --- a/xml/src/main/java/org/kxml2/io/KXmlParser.java +++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java @@ -1606,17 +1606,17 @@ public class KXmlParser implements XmlPullParser, Closeable { documentEntities = null; } - public void setInput(InputStream is, String _enc) throws XmlPullParserException { + 
public void setInput(InputStream is, String charset) throws XmlPullParserException { position = 0; limit = 0; - String enc = _enc; + boolean detectCharset = (charset == null); if (is == null) { throw new IllegalArgumentException(); } try { - if (enc == null) { + if (detectCharset) { // read the four bytes looking for an indication of the encoding in use int firstFourBytes = 0; while (limit < 4) { @@ -1630,93 +1630,102 @@ public class KXmlParser implements XmlPullParser, Closeable { if (limit == 4) { switch (firstFourBytes) { - case 0x00000FEFF: // UTF-32BE BOM - enc = "UTF-32BE"; - limit = 0; - break; - - case 0x0FFFE0000: // UTF-32LE BOM - enc = "UTF-32LE"; - limit = 0; - break; - - case 0x0000003c: // '>' in UTF-32BE - enc = "UTF-32BE"; - buffer[0] = '<'; - limit = 1; - break; + case 0x00000FEFF: // UTF-32BE BOM + charset = "UTF-32BE"; + limit = 0; + break; - case 0x03c000000: // '<' in UTF-32LE - enc = "UTF-32LE"; - buffer[0] = '<'; - limit = 1; - break; - - case 0x0003c003f: // "<?" in UTF-16BE - enc = "UTF-16BE"; - buffer[0] = '<'; - buffer[1] = '?'; - limit = 2; - break; - - case 0x03c003f00: // "<?" in UTF-16LE - enc = "UTF-16LE"; - buffer[0] = '<'; - buffer[1] = '?'; - limit = 2; - break; - - case 0x03c3f786d: // "<?xm" in ASCII etc. - while (true) { - int i = is.read(); - if (i == -1) { - break; - } - buffer[limit++] = (char) i; - if (i == '>') { - String s = new String(buffer, 0, limit); - int i0 = s.indexOf("encoding"); - if (i0 != -1) { - while (s.charAt(i0) != '"' - && s.charAt(i0) != '\'') { - i0++; - } - char deli = s.charAt(i0++); - int i1 = s.indexOf(deli, i0); - enc = s.substring(i0, i1); + case 0x0FFFE0000: // UTF-32LE BOM + charset = "UTF-32LE"; + limit = 0; + break; + + case 0x0000003c: // '<' in UTF-32BE + charset = "UTF-32BE"; + buffer[0] = '<'; + limit = 1; + break; + + case 0x03c000000: // '<' in UTF-32LE + charset = "UTF-32LE"; + buffer[0] = '<'; + limit = 1; + break; + + case 0x0003c003f: // "<?" 
in UTF-16BE + charset = "UTF-16BE"; + buffer[0] = '<'; + buffer[1] = '?'; + limit = 2; + break; + + case 0x03c003f00: // "<?" in UTF-16LE + charset = "UTF-16LE"; + buffer[0] = '<'; + buffer[1] = '?'; + limit = 2; + break; + + case 0x03c3f786d: // "<?xm" in ASCII etc. + while (true) { + int i = is.read(); + if (i == -1) { + break; + } + buffer[limit++] = (char) i; + if (i == '>') { + String s = new String(buffer, 0, limit); + int i0 = s.indexOf("encoding"); + if (i0 != -1) { + while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') { + i0++; } - break; + char deli = s.charAt(i0++); + int i1 = s.indexOf(deli, i0); + charset = s.substring(i0, i1); } + break; } - break; - - default: - // handle a byte order mark followed by something other than <? - if ((firstFourBytes & 0x0ffff0000) == 0x0FEFF0000) { - enc = "UTF-16BE"; - buffer[0] = (char) ((buffer[2] << 8) | buffer[3]); - limit = 1; - } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) { - enc = "UTF-16LE"; - buffer[0] = (char) ((buffer[3] << 8) | buffer[2]); - limit = 1; - } else if ((firstFourBytes & 0x0ffffff00) == 0x0EFBBBF00) { - enc = "UTF-8"; - buffer[0] = buffer[3]; - limit = 1; - } + } + break; + + default: + // handle a byte order mark followed by something other than <? 
+ if ((firstFourBytes & 0x0ffff0000) == 0x0feff0000) { + charset = "UTF-16BE"; + buffer[0] = (char) ((buffer[2] << 8) | buffer[3]); + limit = 1; + } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) { + charset = "UTF-16LE"; + buffer[0] = (char) ((buffer[3] << 8) | buffer[2]); + limit = 1; + } else if ((firstFourBytes & 0x0ffffff00) == 0x0efbbbf00) { + charset = "UTF-8"; + buffer[0] = buffer[3]; + limit = 1; + } } } } - if (enc == null) { - enc = "UTF-8"; + if (charset == null) { + charset = "UTF-8"; } - int sc = limit; - setInput(new InputStreamReader(is, enc)); - encoding = _enc; - limit = sc; + int savedLimit = limit; + setInput(new InputStreamReader(is, charset)); + encoding = charset; + limit = savedLimit; + + /* + * Skip the optional BOM if we didn't above. This decrements limit + * rather than incrementing position so that <?xml version='1.0'?> + * is still at character 0. + */ + if (!detectCharset && peekCharacter() == 0xfeff) { + limit--; + System.arraycopy(buffer, 1, buffer, 0, limit); + } } catch (Exception e) { throw new XmlPullParserException("Invalid stream or encoding: " + e, this, e); } |