3 files changed, 146 insertions, 81 deletions
diff --git a/luni/src/test/java/libcore/xml/DomTest.java b/luni/src/test/java/libcore/xml/DomTest.java
index 4b97c06..f27a1a4 100644
--- a/luni/src/test/java/libcore/xml/DomTest.java
+++ b/luni/src/test/java/libcore/xml/DomTest.java
@@ -17,6 +17,7 @@
 package libcore.xml;
 
 import dalvik.annotation.KnownFailure;
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
@@ -1656,6 +1657,36 @@ public class DomTest extends TestCase {
         assertNull(text.getNextSibling());
     }
 
+    // The leading 0xEF 0xBB 0xBF is the UTF-8 byte order mark; the root element follows it.
+    public void testBomAndByteInput() throws Exception {
+        byte[] bomThenXml = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf,
+                '<', 'i', 'n', 'p', 'u', 't', '/', '>' };
+        InputSource source = new InputSource(new ByteArrayInputStream(bomThenXml));
+        document = builder.parse(source);
+        assertEquals("input", document.getDocumentElement().getNodeName());
+    }
+
+    // UTF-8 BOM then <input/>, read as bytes with the encoding explicitly declared as UTF-8.
+    public void testBomAndByteInputWithExplicitCharset() throws Exception {
+        byte[] bomThenXml = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf,
+                '<', 'i', 'n', 'p', 'u', 't', '/', '>' };
+        InputSource source = new InputSource(new ByteArrayInputStream(bomThenXml));
+        source.setEncoding("UTF-8");
+        document = builder.parse(source);
+        String rootName = document.getDocumentElement().getNodeName();
+        assertEquals("input", rootName);
+    }
+
+    // A U+FEFF at the start of already-decoded character input is expected to be rejected.
+    public void testBomAndCharacterInput() throws Exception {
+        InputSource source = new InputSource(new StringReader("\ufeff<input/>"));
+        source.setEncoding("UTF-8");
+        try {
+            builder.parse(source);
+            fail();
+        } catch (SAXException expected) { }
+    }
+
     private class RecordingHandler implements UserDataHandler {
         final Set<String> calls = new HashSet<String>();
         public void handle(short operation, String key, Object data, Node src, Node dst) {
diff --git a/luni/src/test/java/libcore/xml/PullParserTest.java b/luni/src/test/java/libcore/xml/PullParserTest.java
index 06a40fd..fa00fe5 100644
--- a/luni/src/test/java/libcore/xml/PullParserTest.java
+++ b/luni/src/test/java/libcore/xml/PullParserTest.java
@@ -17,7 +17,6 @@
 package libcore.xml;
 
 import java.io.ByteArrayInputStream;
-import java.io.IOException;
 import java.io.StringReader;
 import junit.framework.TestCase;
 import org.xmlpull.v1.XmlPullParser;
@@ -695,6 +694,32 @@ public abstract class PullParserTest extends TestCase {
         assertParseFailure("not xml");
     }
 
+    public void testBomAndByteInput() throws Exception {
+        byte[] bomThenXml = "\ufeff<?xml version='1.0'?><input/>".getBytes("UTF-8");
+        XmlPullParser pullParser = newPullParser();
+        pullParser.setInput(new ByteArrayInputStream(bomThenXml), null); // no charset supplied
+        for (int tagEvent : new int[] { XmlPullParser.START_TAG, XmlPullParser.END_TAG }) {
+            assertEquals(tagEvent, pullParser.next());
+            assertEquals("input", pullParser.getName());
+        }
+        assertEquals(XmlPullParser.END_DOCUMENT, pullParser.next());
+    }
+
+    public void testBomAndByteInputWithExplicitCharset() throws Exception {
+        byte[] bomThenXml = "\ufeff<?xml version='1.0'?><input/>".getBytes("UTF-8");
+        XmlPullParser pullParser = newPullParser();
+        pullParser.setInput(new ByteArrayInputStream(bomThenXml), "UTF-8"); // charset named
+        for (int tagEvent : new int[] { XmlPullParser.START_TAG, XmlPullParser.END_TAG }) {
+            assertEquals(tagEvent, pullParser.next());
+            assertEquals("input", pullParser.getName());
+        }
+        assertEquals(XmlPullParser.END_DOCUMENT, pullParser.next());
+    }
+
+    public void testBomAndCharacterInput() throws Exception {
+        assertParseFailure("\ufeff<?xml version='1.0'?><input/>"); // U+FEFF before the prolog
+    }
+
     private void assertParseFailure(String xml) throws Exception {
         XmlPullParser parser = newPullParser();
         parser.setInput(new StringReader(xml));
diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java
index 7a2d052..2be8d36 100644
--- a/xml/src/main/java/org/kxml2/io/KXmlParser.java
+++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java
@@ -1606,17 +1606,17 @@ public class KXmlParser implements XmlPullParser, Closeable {
         documentEntities = null;
     }
 
-    public void setInput(InputStream is, String _enc) throws XmlPullParserException {
+    public void setInput(InputStream is, String charset) throws XmlPullParserException {
         position = 0;
         limit = 0;
-        String enc = _enc;
+        boolean detectCharset = (charset == null);
 
         if (is == null) {
             throw new IllegalArgumentException();
         }
 
         try {
-            if (enc == null) {
+            if (detectCharset) {
                 // read the four bytes looking for an indication of the encoding in use
                 int firstFourBytes = 0;
                 while (limit < 4) {
@@ -1630,93 +1630,102 @@ public class KXmlParser implements XmlPullParser, Closeable {
 
                 if (limit == 4) {
                     switch (firstFourBytes) {
-                        case 0x00000FEFF: // UTF-32BE BOM
-                            enc = "UTF-32BE";
-                            limit = 0;
-                            break;
-
-                        case 0x0FFFE0000: // UTF-32LE BOM
-                            enc = "UTF-32LE";
-                            limit = 0;
-                            break;
-
-                        case 0x0000003c: // '>' in UTF-32BE
-                            enc = "UTF-32BE";
-                            buffer[0] = '<';
-                            limit = 1;
-                            break;
+                    case 0x00000FEFF: // UTF-32BE BOM
+                        charset = "UTF-32BE";
+                        limit = 0;
+                        break;
 
-                        case 0x03c000000: // '<' in UTF-32LE
-                            enc = "UTF-32LE";
-                            buffer[0] = '<';
-                            limit = 1;
-                            break;
-
-                        case 0x0003c003f: // "<?" in UTF-16BE
-                            enc = "UTF-16BE";
-                            buffer[0] = '<';
-                            buffer[1] = '?';
-                            limit = 2;
-                            break;
-
-                        case 0x03c003f00: // "<?" in UTF-16LE
-                            enc = "UTF-16LE";
-                            buffer[0] = '<';
-                            buffer[1] = '?';
-                            limit = 2;
-                            break;
-
-                        case 0x03c3f786d: // "<?xm" in ASCII etc.
-                            while (true) {
-                                int i = is.read();
-                                if (i == -1) {
-                                    break;
-                                }
-                                buffer[limit++] = (char) i;
-                                if (i == '>') {
-                                    String s = new String(buffer, 0, limit);
-                                    int i0 = s.indexOf("encoding");
-                                    if (i0 != -1) {
-                                        while (s.charAt(i0) != '"'
-                                                && s.charAt(i0) != '\'') {
-                                            i0++;
-                                        }
-                                        char deli = s.charAt(i0++);
-                                        int i1 = s.indexOf(deli, i0);
-                                        enc = s.substring(i0, i1);
+                    case 0x0FFFE0000: // UTF-32LE BOM
+                        charset = "UTF-32LE";
+                        limit = 0;
+                        break;
+
+                    case 0x0000003c: // '<' in UTF-32BE
+                        charset = "UTF-32BE";
+                        buffer[0] = '<';
+                        limit = 1;
+                        break;
+
+                    case 0x03c000000: // '<' in UTF-32LE
+                        charset = "UTF-32LE";
+                        buffer[0] = '<';
+                        limit = 1;
+                        break;
+
+                    case 0x0003c003f: // "<?" in UTF-16BE
+                        charset = "UTF-16BE";
+                        buffer[0] = '<';
+                        buffer[1] = '?';
+                        limit = 2;
+                        break;
+
+                    case 0x03c003f00: // "<?" in UTF-16LE
+                        charset = "UTF-16LE";
+                        buffer[0] = '<';
+                        buffer[1] = '?';
+                        limit = 2;
+                        break;
+
+                    case 0x03c3f786d: // "<?xm" in ASCII etc.
+                        while (true) {
+                            int i = is.read();
+                            if (i == -1) {
+                                break;
+                            }
+                            buffer[limit++] = (char) i;
+                            if (i == '>') {
+                                String s = new String(buffer, 0, limit);
+                                int i0 = s.indexOf("encoding");
+                                if (i0 != -1) {
+                                    while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') {
+                                        i0++;
                                     }
-                                    break;
+                                    char deli = s.charAt(i0++);
+                                    int i1 = s.indexOf(deli, i0);
+                                    charset = s.substring(i0, i1);
                                 }
+                                break;
                             }
-                            break;
-
-                        default:
-                            // handle a byte order mark followed by something other than <?
-                            if ((firstFourBytes & 0x0ffff0000) == 0x0FEFF0000) {
-                                enc = "UTF-16BE";
-                                buffer[0] = (char) ((buffer[2] << 8) | buffer[3]);
-                                limit = 1;
-                            } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) {
-                                enc = "UTF-16LE";
-                                buffer[0] = (char) ((buffer[3] << 8) | buffer[2]);
-                                limit = 1;
-                            } else if ((firstFourBytes & 0x0ffffff00) == 0x0EFBBBF00) {
-                                enc = "UTF-8";
-                                buffer[0] = buffer[3];
-                                limit = 1;
-                            }
+                        }
+                        break;
+
+                    default:
+                        // handle a byte order mark followed by something other than <?
+                        if ((firstFourBytes & 0x0ffff0000) == 0x0feff0000) {
+                            charset = "UTF-16BE";
+                            buffer[0] = (char) ((buffer[2] << 8) | buffer[3]);
+                            limit = 1;
+                        } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) {
+                            charset = "UTF-16LE";
+                            buffer[0] = (char) ((buffer[3] << 8) | buffer[2]);
+                            limit = 1;
+                        } else if ((firstFourBytes & 0x0ffffff00) == 0x0efbbbf00) {
+                            charset = "UTF-8";
+                            buffer[0] = buffer[3];
+                            limit = 1;
+                        }
                     }
                 }
             }
 
-            if (enc == null) {
-                enc = "UTF-8";
+            if (charset == null) {
+                charset = "UTF-8";
             }
 
-            int sc = limit;
-            setInput(new InputStreamReader(is, enc));
-            encoding = _enc;
-            limit = sc;
+            int savedLimit = limit;
+            setInput(new InputStreamReader(is, charset));
+            encoding = charset;
+            limit = savedLimit;
+
+            /*
+             * Skip the optional BOM if we didn't above. This decrements limit
+             * rather than incrementing position so that <?xml version='1.0'?>
+             * is still at character 0.
+             */
+            if (!detectCharset && peekCharacter() == 0xfeff) {
+                limit--;
+                System.arraycopy(buffer, 1, buffer, 0, limit);
+            }
         } catch (Exception e) {
             throw new XmlPullParserException("Invalid stream or encoding: " + e, this, e);
         }