summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java 10
-rw-r--r-- luni/src/test/java/libcore/xml/KxmlSerializerTest.java 75
-rw-r--r-- xml/src/main/java/org/kxml2/io/KXmlSerializer.java 53
3 files changed, 110 insertions, 28 deletions
diff --git a/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java b/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java
index d1079c8..040a012 100644
--- a/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java
+++ b/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java
@@ -416,11 +416,13 @@ class DocumentBuilderImpl extends DocumentBuilder {
private String resolveCharacterReference(String value, int base) {
try {
- int ch = Integer.parseInt(value, base);
- if (ch < 0 || ch > Character.MAX_VALUE) {
- return null;
+ int codePoint = Integer.parseInt(value, base);
+ if (Character.isBmpCodePoint(codePoint)) {
+ return String.valueOf((char) codePoint);
+ } else {
+ char[] surrogatePair = Character.toChars(codePoint);
+ return new String(surrogatePair);
}
- return String.valueOf((char) ch);
} catch (NumberFormatException ex) {
return null;
}
diff --git a/luni/src/test/java/libcore/xml/KxmlSerializerTest.java b/luni/src/test/java/libcore/xml/KxmlSerializerTest.java
index 6a75a9b..5f68a99 100644
--- a/luni/src/test/java/libcore/xml/KxmlSerializerTest.java
+++ b/luni/src/test/java/libcore/xml/KxmlSerializerTest.java
@@ -22,7 +22,9 @@ import java.io.StringWriter;
import junit.framework.TestCase;
import org.kxml2.io.KXmlSerializer;
import org.w3c.dom.Document;
+import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
import org.xmlpull.v1.XmlSerializer;
import static tests.support.Support_Xml.domOf;
@@ -87,12 +89,67 @@ public final class KxmlSerializerTest extends TestCase {
return serializer;
}
+ public String fromCodePoint(int codePoint) {
+ if (codePoint > Character.MAX_VALUE) {
+ return new String(Character.toChars(codePoint));
+ }
+ return Character.toString((char) codePoint);
+ }
+
+ // http://b/17960630
+ public void testSpeakNoEvilMonkeys() throws Exception {
+ StringWriter stringWriter = new StringWriter();
+ XmlSerializer serializer = new KXmlSerializer();
+ serializer.setOutput(stringWriter);
+ serializer.startDocument("UTF-8", null);
+ serializer.startTag(NAMESPACE, "tag");
+ serializer.attribute(NAMESPACE, "attr", "a\ud83d\ude4ab");
+ serializer.text("c\ud83d\ude4ad");
+ serializer.cdsect("e\ud83d\ude4af");
+ serializer.endTag(NAMESPACE, "tag");
+ serializer.endDocument();
+ assertXmlEquals("<tag attr=\"a&#128586;b\">" +
+ "c&#128586;d" +
+ "<![CDATA[e]]>&#128586;<![CDATA[f]]>" +
+ "</tag>", stringWriter.toString());
+
+ // Check we can parse what we just output.
+ Document doc = domOf(stringWriter.toString());
+ Node root = doc.getDocumentElement();
+ assertEquals("a\ud83d\ude4ab", root.getAttributes().getNamedItem("attr").getNodeValue());
+ Text text = (Text) root.getFirstChild();
+ assertEquals("c\ud83d\ude4ade\ud83d\ude4af", text.getNodeValue());
+ }
+
+ public void testBadSurrogates() throws Exception {
+ StringWriter stringWriter = new StringWriter();
+ XmlSerializer serializer = new KXmlSerializer();
+ serializer.setOutput(stringWriter);
+ serializer.startDocument("UTF-8", null);
+ serializer.startTag(NAMESPACE, "tag");
+ try {
+ serializer.attribute(NAMESPACE, "attr", "a\ud83d\u0040b");
+ } catch (IllegalArgumentException expected) {
+ }
+ try {
+ serializer.text("c\ud83d\u0040d");
+ } catch (IllegalArgumentException expected) {
+ }
+ try {
+ serializer.cdsect("e\ud83d\u0040f");
+ } catch (IllegalArgumentException expected) {
+ }
+ }
+
+ // Cover all the BMP code points plus a few that require us to use surrogates.
+ private static int MAX_TEST_CODE_POINT = 0x10008;
+
public void testInvalidCharactersInText() throws IOException {
XmlSerializer serializer = newSerializer();
serializer.startTag(NAMESPACE, "root");
- for (int ch = 0; ch <= 0xffff; ++ch) {
- final String s = Character.toString((char) ch);
- if (isValidXmlCodePoint(ch)) {
+ for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+ final String s = fromCodePoint(c);
+ if (isValidXmlCodePoint(c)) {
serializer.text("a" + s + "b");
} else {
try {
@@ -108,9 +165,9 @@ public final class KxmlSerializerTest extends TestCase {
public void testInvalidCharactersInAttributeValues() throws IOException {
XmlSerializer serializer = newSerializer();
serializer.startTag(NAMESPACE, "root");
- for (int ch = 0; ch <= 0xffff; ++ch) {
- final String s = Character.toString((char) ch);
- if (isValidXmlCodePoint(ch)) {
+ for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+ final String s = fromCodePoint(c);
+ if (isValidXmlCodePoint(c)) {
serializer.attribute(NAMESPACE, "a", "a" + s + "b");
} else {
try {
@@ -126,9 +183,9 @@ public final class KxmlSerializerTest extends TestCase {
public void testInvalidCharactersInCdataSections() throws IOException {
XmlSerializer serializer = newSerializer();
serializer.startTag(NAMESPACE, "root");
- for (int ch = 0; ch <= 0xffff; ++ch) {
- final String s = Character.toString((char) ch);
- if (isValidXmlCodePoint(ch)) {
+ for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+ final String s = fromCodePoint(c);
+ if (isValidXmlCodePoint(c)) {
serializer.cdsect("a" + s + "b");
} else {
try {
diff --git a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
index 8fa2756..bfdeece 100644
--- a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
+++ b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
@@ -125,14 +125,18 @@ public class KXmlSerializer implements XmlSerializer {
// otherwise generate.
// Note: tab, newline, and carriage return have already been
// handled above.
- boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
- if (!valid) {
- reportInvalidCharacter(c);
- }
- if (unicode || c < 127) {
- writer.write(c);
+ boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
+ if (allowedInXml) {
+ if (unicode || c < 127) {
+ writer.write(c);
+ } else {
+ writer.write("&#" + ((int) c) + ";");
+ }
+ } else if (Character.isHighSurrogate(c) && i < s.length() - 1) {
+ writeSurrogate(c, s.charAt(i + 1));
+ ++i;
} else {
- writer.write("&#" + ((int) c) + ";");
+ reportInvalidCharacter(c);
}
// END android-changed
}
@@ -141,7 +145,7 @@ public class KXmlSerializer implements XmlSerializer {
// BEGIN android-added
private static void reportInvalidCharacter(char ch) {
- throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")");
+ throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")");
}
// END android-added
@@ -548,22 +552,41 @@ public class KXmlSerializer implements XmlSerializer {
// BEGIN android-changed: ]]> is not allowed within a CDATA,
// so break and start a new one when necessary.
data = data.replace("]]>", "]]]]><![CDATA[>");
- char[] chars = data.toCharArray();
- // We also aren't allowed any invalid characters.
- for (char ch : chars) {
- boolean valid = (ch >= 0x20 && ch <= 0xd7ff) ||
+ writer.write("<![CDATA[");
+ for (int i = 0; i < data.length(); ++i) {
+ char ch = data.charAt(i);
+ boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) ||
(ch == '\t' || ch == '\n' || ch == '\r') ||
(ch >= 0xe000 && ch <= 0xfffd);
- if (!valid) {
+ if (allowedInCdata) {
+ writer.write(ch);
+ } else if (Character.isHighSurrogate(ch) && i < data.length() - 1) {
+ // Character entities aren't valid in CDATA, so break out for this.
+ writer.write("]]>");
+ writeSurrogate(ch, data.charAt(++i));
+ writer.write("<![CDATA[");
+ } else {
reportInvalidCharacter(ch);
}
}
- writer.write("<![CDATA[");
- writer.write(chars, 0, chars.length);
writer.write("]]>");
// END android-changed
}
+ // BEGIN android-added
+ private void writeSurrogate(char high, char low) throws IOException {
+ if (!Character.isLowSurrogate(low)) {
+ throw new IllegalArgumentException("Bad surrogate pair (U+" + Integer.toHexString((int) high) +
+ " U+" + Integer.toHexString((int) low) + ")");
+ }
+ // Java-style surrogate pairs aren't allowed in XML. We could use the > 3-byte encodings, but that
+ // seems likely to upset anything expecting modified UTF-8 rather than "real" UTF-8. It seems more
+ // conservative in a Java environment to use an entity reference instead.
+ int codePoint = Character.toCodePoint(high, low);
+ writer.write("&#" + codePoint + ";");
+ }
+ // END android-added
+
public void comment(String comment) throws IOException {
check(false);
writer.write("<!--");