summary | refs | log | tree | commit | diff | stats
path: root/xml/src/main/java/org/kxml2
diff options
context:
space:
mode:
author: Elliott Hughes <enh@google.com> 2014-10-13 22:56:18 -0700
committer: Elliott Hughes <enh@google.com> 2014-10-18 13:29:39 -0700
commit: 3f1a5ebc337e896977bbc75aa2dc370e97805794 (patch)
tree: f2507d5186f1869435457083d4f9aa24c1f216ff /xml/src/main/java/org/kxml2
parent: 73f5c01dd40d1823cb86cb329a38dd8b620d20cb (diff)
download: libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.zip
libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.tar.gz
libcore-3f1a5ebc337e896977bbc75aa2dc370e97805794.tar.bz2
Improve support for non-BMP characters in XML.
This adds support for writing surrogate pairs out as entity references in KXmlSerializer and for parsing non-BMP entity references in DocumentBuilderImpl.

Emoji and XML. Two of my least favorite things together at last.

Bug: 17960630
Change-Id: If5e1001faf250e87e6eeebe3449a6ebc115789a1
Diffstat (limited to 'xml/src/main/java/org/kxml2')
-rw-r--r-- xml/src/main/java/org/kxml2/io/KXmlSerializer.java 53
1 files changed, 38 insertions, 15 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
index 8fa2756..bfdeece 100644
--- a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
+++ b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
@@ -125,14 +125,18 @@ public class KXmlSerializer implements XmlSerializer {
// otherwise generate.
// Note: tab, newline, and carriage return have already been
// handled above.
- boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
- if (!valid) {
- reportInvalidCharacter(c);
- }
- if (unicode || c < 127) {
- writer.write(c);
+ boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
+ if (allowedInXml) {
+ if (unicode || c < 127) {
+ writer.write(c);
+ } else {
+ writer.write("&#" + ((int) c) + ";");
+ }
+ } else if (Character.isHighSurrogate(c) && i < s.length() - 1) {
+ writeSurrogate(c, s.charAt(i + 1));
+ ++i;
} else {
- writer.write("&#" + ((int) c) + ";");
+ reportInvalidCharacter(c);
}
// END android-changed
}
@@ -141,7 +145,7 @@ public class KXmlSerializer implements XmlSerializer {
// BEGIN android-added
private static void reportInvalidCharacter(char ch) {
- throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")");
+ throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")");
}
// END android-added
@@ -548,22 +552,41 @@ public class KXmlSerializer implements XmlSerializer {
// BEGIN android-changed: ]]> is not allowed within a CDATA,
// so break and start a new one when necessary.
data = data.replace("]]>", "]]]]><![CDATA[>");
- char[] chars = data.toCharArray();
- // We also aren't allowed any invalid characters.
- for (char ch : chars) {
- boolean valid = (ch >= 0x20 && ch <= 0xd7ff) ||
+ writer.write("<![CDATA[");
+ for (int i = 0; i < data.length(); ++i) {
+ char ch = data.charAt(i);
+ boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) ||
(ch == '\t' || ch == '\n' || ch == '\r') ||
(ch >= 0xe000 && ch <= 0xfffd);
- if (!valid) {
+ if (allowedInCdata) {
+ writer.write(ch);
+ } else if (Character.isHighSurrogate(ch) && i < data.length() - 1) {
+ // Character entities aren't valid in CDATA, so break out for this.
+ writer.write("]]>");
+ writeSurrogate(ch, data.charAt(++i));
+ writer.write("<![CDATA[");
+ } else {
reportInvalidCharacter(ch);
}
}
- writer.write("<![CDATA[");
- writer.write(chars, 0, chars.length);
writer.write("]]>");
// END android-changed
}
+ // BEGIN android-added
+ private void writeSurrogate(char high, char low) throws IOException {
+ if (!Character.isLowSurrogate(low)) {
+ throw new IllegalArgumentException("Bad surrogate pair (U+" + Integer.toHexString((int) high) +
+ " U+" + Integer.toHexString((int) low) + ")");
+ }
+ // Java-style surrogate pairs aren't allowed in XML. We could use the > 3-byte encodings, but that
+ // seems likely to upset anything expecting modified UTF-8 rather than "real" UTF-8. It seems more
+ // conservative in a Java environment to use an entity reference instead.
+ int codePoint = Character.toCodePoint(high, low);
+ writer.write("&#" + codePoint + ";");
+ }
+ // END android-added
+
public void comment(String comment) throws IOException {
check(false);
writer.write("<!--");