diff options
author | Elliott Hughes <enh@google.com> | 2014-10-13 22:56:18 -0700 |
---|---|---|
committer | Elliott Hughes <enh@google.com> | 2014-10-18 13:29:39 -0700 |
commit | 3f1a5ebc337e896977bbc75aa2dc370e97805794 (patch) | |
tree | f2507d5186f1869435457083d4f9aa24c1f216ff /xml/src | |
parent | 73f5c01dd40d1823cb86cb329a38dd8b620d20cb (diff) | |
download | libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.zip libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.tar.gz libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.tar.bz2 |
Improve support for non-BMP characters in XML.
This adds support for writing surrogate pairs out as entity references in
KXmlSerializer and for parsing non-BMP entity references in
DocumentBuilderImpl.
Emoji and XML. Two of my least favorite things together at last.
Bug: 17960630
Change-Id: If5e1001faf250e87e6eeebe3449a6ebc115789a1
Diffstat (limited to 'xml/src')
-rw-r--r-- | xml/src/main/java/org/kxml2/io/KXmlSerializer.java | 53 |
1 files changed, 38 insertions, 15 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java index 8fa2756..bfdeece 100644 --- a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java +++ b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java @@ -125,14 +125,18 @@ public class KXmlSerializer implements XmlSerializer { // otherwise generate. // Note: tab, newline, and carriage return have already been // handled above. - boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd); - if (!valid) { - reportInvalidCharacter(c); - } - if (unicode || c < 127) { - writer.write(c); + boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd); + if (allowedInXml) { + if (unicode || c < 127) { + writer.write(c); + } else { + writer.write("&#" + ((int) c) + ";"); + } + } else if (Character.isHighSurrogate(c) && i < s.length() - 1) { + writeSurrogate(c, s.charAt(i + 1)); + ++i; } else { - writer.write("&#" + ((int) c) + ";"); + reportInvalidCharacter(c); } // END android-changed } @@ -141,7 +145,7 @@ public class KXmlSerializer implements XmlSerializer { // BEGIN android-added private static void reportInvalidCharacter(char ch) { - throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")"); + throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")"); } // END android-added @@ -548,22 +552,41 @@ public class KXmlSerializer implements XmlSerializer { // BEGIN android-changed: ]]> is not allowed within a CDATA, // so break and start a new one when necessary. data = data.replace("]]>", "]]]]><![CDATA[>"); - char[] chars = data.toCharArray(); - // We also aren't allowed any invalid characters. - for (char ch : chars) { - boolean valid = (ch >= 0x20 && ch <= 0xd7ff) || + writer.write("<![CDATA["); + for (int i = 0; i < data.length(); ++i) { + char ch = data.charAt(i); + boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) || (ch == '\t' || ch == '\n' || ch == '\r') || (ch >= 0xe000 && ch <= 0xfffd); - if (!valid) { + if (allowedInCdata) { + writer.write(ch); + } else if (Character.isHighSurrogate(ch) && i < data.length() - 1) { + // Character entities aren't valid in CDATA, so break out for this. + writer.write("]]>"); + writeSurrogate(ch, data.charAt(++i)); + writer.write("<![CDATA["); + } else { reportInvalidCharacter(ch); } } - writer.write("<![CDATA["); - writer.write(chars, 0, chars.length); writer.write("]]>"); // END android-changed } + // BEGIN android-added + private void writeSurrogate(char high, char low) throws IOException { + if (!Character.isLowSurrogate(low)) { + throw new IllegalArgumentException("Bad surrogate pair (U+" + Integer.toHexString((int) high) + + " U+" + Integer.toHexString((int) low) + ")"); + } + // Java-style surrogate pairs aren't allowed in XML. We could use the > 3-byte encodings, but that + // seems likely to upset anything expecting modified UTF-8 rather than "real" UTF-8. It seems more + // conservative in a Java environment to use an entity reference instead. + int codePoint = Character.toCodePoint(high, low); + writer.write("&#" + codePoint + ";"); + } + // END android-added + public void comment(String comment) throws IOException { check(false); writer.write("<!--"); |