diff options
3 files changed, 110 insertions, 28 deletions
diff --git a/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java b/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java index d1079c8..040a012 100644 --- a/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java +++ b/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java @@ -416,11 +416,13 @@ class DocumentBuilderImpl extends DocumentBuilder { private String resolveCharacterReference(String value, int base) { try { - int ch = Integer.parseInt(value, base); - if (ch < 0 || ch > Character.MAX_VALUE) { - return null; + int codePoint = Integer.parseInt(value, base); + if (Character.isBmpCodePoint(codePoint)) { + return String.valueOf((char) codePoint); + } else { + char[] surrogatePair = Character.toChars(codePoint); + return new String(surrogatePair); } - return String.valueOf((char) ch); } catch (NumberFormatException ex) { return null; } diff --git a/luni/src/test/java/libcore/xml/KxmlSerializerTest.java b/luni/src/test/java/libcore/xml/KxmlSerializerTest.java index 6a75a9b..5f68a99 100644 --- a/luni/src/test/java/libcore/xml/KxmlSerializerTest.java +++ b/luni/src/test/java/libcore/xml/KxmlSerializerTest.java @@ -22,7 +22,9 @@ import java.io.StringWriter; import junit.framework.TestCase; import org.kxml2.io.KXmlSerializer; import org.w3c.dom.Document; +import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.w3c.dom.Text; import org.xmlpull.v1.XmlSerializer; import static tests.support.Support_Xml.domOf; @@ -87,12 +89,67 @@ public final class KxmlSerializerTest extends TestCase { return serializer; } + public String fromCodePoint(int codePoint) { + if (codePoint > Character.MAX_VALUE) { + return new String(Character.toChars(codePoint)); + } + return Character.toString((char) codePoint); + } + + // http://b/17960630 + public void testSpeakNoEvilMonkeys() throws Exception { + StringWriter stringWriter = new StringWriter(); + XmlSerializer serializer = new KXmlSerializer(); + 
serializer.setOutput(stringWriter); + serializer.startDocument("UTF-8", null); + serializer.startTag(NAMESPACE, "tag"); + serializer.attribute(NAMESPACE, "attr", "a\ud83d\ude4ab"); + serializer.text("c\ud83d\ude4ad"); + serializer.cdsect("e\ud83d\ude4af"); + serializer.endTag(NAMESPACE, "tag"); + serializer.endDocument(); + assertXmlEquals("<tag attr=\"a&#128586;b\">" + + "c&#128586;d" + + "<![CDATA[e]]>&#128586;<![CDATA[f]]>" + + "</tag>", stringWriter.toString()); + + // Check we can parse what we just output. + Document doc = domOf(stringWriter.toString()); + Node root = doc.getDocumentElement(); + assertEquals("a\ud83d\ude4ab", root.getAttributes().getNamedItem("attr").getNodeValue()); + Text text = (Text) root.getFirstChild(); + assertEquals("c\ud83d\ude4ade\ud83d\ude4af", text.getNodeValue()); + } + + public void testBadSurrogates() throws Exception { + StringWriter stringWriter = new StringWriter(); + XmlSerializer serializer = new KXmlSerializer(); + serializer.setOutput(stringWriter); + serializer.startDocument("UTF-8", null); + serializer.startTag(NAMESPACE, "tag"); + try { + serializer.attribute(NAMESPACE, "attr", "a\ud83d\u0040b"); + } catch (IllegalArgumentException expected) { + } + try { + serializer.text("c\ud83d\u0040d"); + } catch (IllegalArgumentException expected) { + } + try { + serializer.cdsect("e\ud83d\u0040f"); + } catch (IllegalArgumentException expected) { + } + } + + // Cover all the BMP code points plus a few that require us to use surrogates. 
+ private static int MAX_TEST_CODE_POINT = 0x10008; + public void testInvalidCharactersInText() throws IOException { XmlSerializer serializer = newSerializer(); serializer.startTag(NAMESPACE, "root"); - for (int ch = 0; ch <= 0xffff; ++ch) { - final String s = Character.toString((char) ch); - if (isValidXmlCodePoint(ch)) { + for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) { + final String s = fromCodePoint(c); + if (isValidXmlCodePoint(c)) { serializer.text("a" + s + "b"); } else { try { @@ -108,9 +165,9 @@ public final class KxmlSerializerTest extends TestCase { public void testInvalidCharactersInAttributeValues() throws IOException { XmlSerializer serializer = newSerializer(); serializer.startTag(NAMESPACE, "root"); - for (int ch = 0; ch <= 0xffff; ++ch) { - final String s = Character.toString((char) ch); - if (isValidXmlCodePoint(ch)) { + for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) { + final String s = fromCodePoint(c); + if (isValidXmlCodePoint(c)) { serializer.attribute(NAMESPACE, "a", "a" + s + "b"); } else { try { @@ -126,9 +183,9 @@ public final class KxmlSerializerTest extends TestCase { public void testInvalidCharactersInCdataSections() throws IOException { XmlSerializer serializer = newSerializer(); serializer.startTag(NAMESPACE, "root"); - for (int ch = 0; ch <= 0xffff; ++ch) { - final String s = Character.toString((char) ch); - if (isValidXmlCodePoint(ch)) { + for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) { + final String s = fromCodePoint(c); + if (isValidXmlCodePoint(c)) { serializer.cdsect("a" + s + "b"); } else { try { diff --git a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java index 8fa2756..bfdeece 100644 --- a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java +++ b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java @@ -125,14 +125,18 @@ public class KXmlSerializer implements XmlSerializer { // otherwise generate. 
// Note: tab, newline, and carriage return have already been // handled above. - boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd); - if (!valid) { - reportInvalidCharacter(c); - } - if (unicode || c < 127) { - writer.write(c); + boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd); + if (allowedInXml) { + if (unicode || c < 127) { + writer.write(c); + } else { + writer.write("&#" + ((int) c) + ";"); + } + } else if (Character.isHighSurrogate(c) && i < s.length() - 1) { + writeSurrogate(c, s.charAt(i + 1)); + ++i; } else { - writer.write("&#" + ((int) c) + ";"); + reportInvalidCharacter(c); } // END android-changed } @@ -141,7 +145,7 @@ public class KXmlSerializer implements XmlSerializer { // BEGIN android-added private static void reportInvalidCharacter(char ch) { - throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")"); + throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")"); } // END android-added @@ -548,22 +552,41 @@ public class KXmlSerializer implements XmlSerializer { // BEGIN android-changed: ]]> is not allowed within a CDATA, // so break and start a new one when necessary. data = data.replace("]]>", "]]]]><![CDATA[>"); - char[] chars = data.toCharArray(); - // We also aren't allowed any invalid characters. - for (char ch : chars) { - boolean valid = (ch >= 0x20 && ch <= 0xd7ff) || + writer.write("<![CDATA["); + for (int i = 0; i < data.length(); ++i) { + char ch = data.charAt(i); + boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) || (ch == '\t' || ch == '\n' || ch == '\r') || (ch >= 0xe000 && ch <= 0xfffd); - if (!valid) { + if (allowedInCdata) { + writer.write(ch); + } else if (Character.isHighSurrogate(ch) && i < data.length() - 1) { + // Character entities aren't valid in CDATA, so break out for this. 
+ writer.write("]]>"); + writeSurrogate(ch, data.charAt(++i)); + writer.write("<![CDATA["); + } else { reportInvalidCharacter(ch); } } - writer.write("<![CDATA["); - writer.write(chars, 0, chars.length); writer.write("]]>"); // END android-changed } + // BEGIN android-added + private void writeSurrogate(char high, char low) throws IOException { + if (!Character.isLowSurrogate(low)) { + throw new IllegalArgumentException("Bad surrogate pair (U+" + Integer.toHexString((int) high) + + " U+" + Integer.toHexString((int) low) + ")"); + } + // Java-style surrogate pairs aren't allowed in XML. We could use the > 3-byte encodings, but that + // seems likely to upset anything expecting modified UTF-8 rather than "real" UTF-8. It seems more + // conservative in a Java environment to use an entity reference instead. + int codePoint = Character.toCodePoint(high, low); + writer.write("&#" + codePoint + ";"); + } + // END android-added + public void comment(String comment) throws IOException { check(false); writer.write("<!--"); |