summaryrefslogtreecommitdiffstats
path: root/xml/src/main/java/org/kxml2
diff options
context:
space:
mode:
authorJesse Wilson <jessewilson@google.com>2011-05-16 15:37:02 -0700
committerJesse Wilson <jessewilson@google.com>2011-05-16 16:06:28 -0700
commit6ce8e6ee5da964f724d39655fba0e432cff4c3a6 (patch)
tree3cdb0fca211f0550454579e2ab25f84164fb9618 /xml/src/main/java/org/kxml2
parent20b0416aa49b46b586ecf3d0b33016217d2dca63 (diff)
downloadlibcore-6ce8e6ee5da964f724d39655fba0e432cff4c3a6.zip
libcore-6ce8e6ee5da964f724d39655fba0e432cff4c3a6.tar.gz
libcore-6ce8e6ee5da964f724d39655fba0e432cff4c3a6.tar.bz2
Skip BOM characters even with an explicit charset.
Change-Id: I697448528324cd68196d00ebf82ee8eecb72148d http://code.google.com/p/android/issues/detail?id=16892
Diffstat (limited to 'xml/src/main/java/org/kxml2')
-rw-r--r--xml/src/main/java/org/kxml2/io/KXmlParser.java169
1 files changed, 89 insertions, 80 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java
index 4b4f328..d41ced9 100644
--- a/xml/src/main/java/org/kxml2/io/KXmlParser.java
+++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java
@@ -1606,17 +1606,17 @@ public class KXmlParser implements XmlPullParser, Closeable {
documentEntities = null;
}
- public void setInput(InputStream is, String _enc) throws XmlPullParserException {
+ public void setInput(InputStream is, String charset) throws XmlPullParserException {
position = 0;
limit = 0;
- String enc = _enc;
+ boolean detectCharset = (charset == null);
if (is == null) {
throw new IllegalArgumentException();
}
try {
- if (enc == null) {
+ if (detectCharset) {
// read the four bytes looking for an indication of the encoding in use
int firstFourBytes = 0;
while (limit < 4) {
@@ -1630,93 +1630,102 @@ public class KXmlParser implements XmlPullParser, Closeable {
if (limit == 4) {
switch (firstFourBytes) {
- case 0x00000FEFF: // UTF-32BE BOM
- enc = "UTF-32BE";
- limit = 0;
- break;
-
- case 0x0FFFE0000: // UTF-32LE BOM
- enc = "UTF-32LE";
- limit = 0;
- break;
-
- case 0x0000003c: // '>' in UTF-32BE
- enc = "UTF-32BE";
- buffer[0] = '<';
- limit = 1;
- break;
+ case 0x00000FEFF: // UTF-32BE BOM
+ charset = "UTF-32BE";
+ limit = 0;
+ break;
- case 0x03c000000: // '<' in UTF-32LE
- enc = "UTF-32LE";
- buffer[0] = '<';
- limit = 1;
- break;
-
- case 0x0003c003f: // "<?" in UTF-16BE
- enc = "UTF-16BE";
- buffer[0] = '<';
- buffer[1] = '?';
- limit = 2;
- break;
-
- case 0x03c003f00: // "<?" in UTF-16LE
- enc = "UTF-16LE";
- buffer[0] = '<';
- buffer[1] = '?';
- limit = 2;
- break;
-
- case 0x03c3f786d: // "<?xm" in ASCII etc.
- while (true) {
- int i = is.read();
- if (i == -1) {
- break;
- }
- buffer[limit++] = (char) i;
- if (i == '>') {
- String s = new String(buffer, 0, limit);
- int i0 = s.indexOf("encoding");
- if (i0 != -1) {
- while (s.charAt(i0) != '"'
- && s.charAt(i0) != '\'') {
- i0++;
- }
- char deli = s.charAt(i0++);
- int i1 = s.indexOf(deli, i0);
- enc = s.substring(i0, i1);
+ case 0x0FFFE0000: // UTF-32LE BOM
+ charset = "UTF-32LE";
+ limit = 0;
+ break;
+
+ case 0x0000003c: // '<' in UTF-32BE
+ charset = "UTF-32BE";
+ buffer[0] = '<';
+ limit = 1;
+ break;
+
+ case 0x03c000000: // '<' in UTF-32LE
+ charset = "UTF-32LE";
+ buffer[0] = '<';
+ limit = 1;
+ break;
+
+ case 0x0003c003f: // "<?" in UTF-16BE
+ charset = "UTF-16BE";
+ buffer[0] = '<';
+ buffer[1] = '?';
+ limit = 2;
+ break;
+
+ case 0x03c003f00: // "<?" in UTF-16LE
+ charset = "UTF-16LE";
+ buffer[0] = '<';
+ buffer[1] = '?';
+ limit = 2;
+ break;
+
+ case 0x03c3f786d: // "<?xm" in ASCII etc.
+ while (true) {
+ int i = is.read();
+ if (i == -1) {
+ break;
+ }
+ buffer[limit++] = (char) i;
+ if (i == '>') {
+ String s = new String(buffer, 0, limit);
+ int i0 = s.indexOf("encoding");
+ if (i0 != -1) {
+ while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') {
+ i0++;
}
- break;
+ char deli = s.charAt(i0++);
+ int i1 = s.indexOf(deli, i0);
+ charset = s.substring(i0, i1);
}
+ break;
}
- break;
-
- default:
- // handle a byte order mark followed by something other than <?
- if ((firstFourBytes & 0x0ffff0000) == 0x0FEFF0000) {
- enc = "UTF-16BE";
- buffer[0] = (char) ((buffer[2] << 8) | buffer[3]);
- limit = 1;
- } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) {
- enc = "UTF-16LE";
- buffer[0] = (char) ((buffer[3] << 8) | buffer[2]);
- limit = 1;
- } else if ((firstFourBytes & 0x0ffffff00) == 0x0EFBBBF00) {
- enc = "UTF-8";
- buffer[0] = buffer[3];
- limit = 1;
- }
+ }
+ break;
+
+ default:
+ // handle a byte order mark followed by something other than <?
+ if ((firstFourBytes & 0x0ffff0000) == 0x0feff0000) {
+ charset = "UTF-16BE";
+ buffer[0] = (char) ((buffer[2] << 8) | buffer[3]);
+ limit = 1;
+ } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) {
+ charset = "UTF-16LE";
+ buffer[0] = (char) ((buffer[3] << 8) | buffer[2]);
+ limit = 1;
+ } else if ((firstFourBytes & 0x0ffffff00) == 0x0efbbbf00) {
+ charset = "UTF-8";
+ buffer[0] = buffer[3];
+ limit = 1;
+ }
}
}
}
- if (enc == null) {
- enc = "UTF-8";
+ if (charset == null) {
+ charset = "UTF-8";
}
- int sc = limit;
- setInput(new InputStreamReader(is, enc));
- encoding = _enc;
- limit = sc;
+ int savedLimit = limit;
+ setInput(new InputStreamReader(is, charset));
+ encoding = charset;
+ limit = savedLimit;
+
+ /*
+ * Skip the optional BOM if we didn't above. This decrements limit
+ * rather than incrementing position so that <?xml version='1.0'?>
+ * is still at character 0.
+ */
+ if (!detectCharset && peekCharacter() == 0xfeff) {
+ limit--;
+ System.arraycopy(buffer, 1, buffer, 0, limit);
+ }
} catch (Exception e) {
throw new XmlPullParserException("Invalid stream or encoding: " + e, this, e);
}