diff options
author | Jesse Wilson <jessewilson@google.com> | 2010-03-01 17:31:53 -0800 |
---|---|---|
committer | Jesse Wilson <jessewilson@google.com> | 2010-03-01 19:03:40 -0800 |
commit | 8092253eb6a1cff91f0e4953f1387165169157b5 (patch) | |
tree | 224d9e121b4053c9a3dc480e6983a389cd61f745 | |
parent | f06779ef253298411e2151a990d6f14b2cb42ce3 (diff) | |
download | libcore-8092253eb6a1cff91f0e4953f1387165169157b5.zip libcore-8092253eb6a1cff91f0e4953f1387165169157b5.tar.gz libcore-8092253eb6a1cff91f0e4953f1387165169157b5.tar.bz2 |
Implementing almost all of Document.normalizeDocument().
This follows the rules specified by DOMConfiguration. In particular:
- replacing CDATA nodes with text
- splitting CDATA nodes
- merging text nodes
- stripping comments
- detecting invalid characters
I haven't added the normalization code for XML validation or namespaces.
10 files changed, 446 insertions, 48 deletions
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java index b28c9da..7d122f2 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java @@ -29,7 +29,7 @@ import org.w3c.dom.Node; * the DOM implementation can easily access them while maintaining the DOM tree * structure. */ -public class CDATASectionImpl extends TextImpl implements CDATASection { +public final class CDATASectionImpl extends TextImpl implements CDATASection { public CDATASectionImpl(DocumentImpl document, String data) { super(document, data); @@ -45,4 +45,43 @@ public class CDATASectionImpl extends TextImpl implements CDATASection { return Node.CDATA_SECTION_NODE; } + /** + * Splits this CDATA node into parts that do not contain a "]]>" sequence. + * Any newly created nodes will be inserted before this node. + */ + public void split() { + if (!needsSplitting()) { + return; + } + + Node parent = getParentNode(); + String[] parts = getData().split("\\]\\]>"); + parent.insertBefore(new CDATASectionImpl(document, parts[0] + "]]"), this); + for (int p = 1; p < parts.length - 1; p++) { + parent.insertBefore(new CDATASectionImpl(document, ">" + parts[p] + "]]"), this); + } + setData(">" + parts[parts.length - 1]); + } + + /** + * Returns true if this CDATA section contains the illegal character + * sequence "]]>". Such nodes must be {@link #split} before they are + * serialized. + */ + public boolean needsSplitting() { + return buffer.indexOf("]]>") != -1; + } + + /** + * Replaces this node with a semantically equivalent text node. This node + * will be removed from the DOM tree and the new node inserted in its place. + * + * @return the replacement node. 
+ */ + public TextImpl replaceWithText() { + TextImpl replacement = new TextImpl(document, getData()); + parent.insertBefore(replacement, this); + parent.removeChild(this); + return replacement; + } } diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java index 6354747..2d4c3b4 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java @@ -32,7 +32,7 @@ import org.w3c.dom.DOMException; public abstract class CharacterDataImpl extends LeafNodeImpl implements CharacterData { - private StringBuffer buffer; + protected StringBuffer buffer; CharacterDataImpl(DocumentImpl document, String data) { super(document); diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java index 2d4a9c5..5f8a4e0 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java @@ -29,7 +29,7 @@ import org.w3c.dom.Node; * the DOM implementation can easily access them while maintaining the DOM tree * structure. */ -public class CommentImpl extends CharacterDataImpl implements Comment { +public final class CommentImpl extends CharacterDataImpl implements Comment { CommentImpl(DocumentImpl document, String data) { super(document, data); @@ -45,4 +45,11 @@ public class CommentImpl extends CharacterDataImpl implements Comment { return Node.COMMENT_NODE; } + /** + * Returns true if this comment contains the illegal character sequence + * "--". Such nodes may not be serialized. 
+ */ + public boolean containsDashDash() { + return buffer.indexOf("--") != -1; + } } diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java index 2f57a4c..1a8acbc 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java @@ -16,10 +16,14 @@ package org.apache.harmony.xml.dom; +import org.apache.xml.serializer.dom3.DOMErrorImpl; import org.w3c.dom.DOMConfiguration; +import org.w3c.dom.DOMError; import org.w3c.dom.DOMErrorHandler; import org.w3c.dom.DOMException; import org.w3c.dom.DOMStringList; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; import java.util.Map; import java.util.TreeMap; @@ -368,4 +372,120 @@ public final class DOMConfigurationImpl implements DOMConfiguration { } }; } + + public void normalize(Node node) { + /* + * Since we don't validate, this code doesn't take into account the + * following "supported" parameters: datatype-normalization, entities, + * schema-location, schema-type, or validate. 
+ * + * TODO: normalize namespaces + */ + + switch (node.getNodeType()) { + case Node.CDATA_SECTION_NODE: + CDATASectionImpl cdata = (CDATASectionImpl) node; + if (cdataSections) { + if (cdata.needsSplitting()) { + if (splitCdataSections) { + cdata.split(); + report(DOMError.SEVERITY_WARNING, "cdata-sections-splitted"); + } else { + report(DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + checkTextValidity(cdata.buffer); + break; + } + node = cdata.replaceWithText(); + // fall through + + case Node.TEXT_NODE: + TextImpl text = (TextImpl) node; + text = text.minimize(); + if (text != null) { + checkTextValidity(text.buffer); + } + break; + + case Node.COMMENT_NODE: + CommentImpl comment = (CommentImpl) node; + if (!comments) { + comment.getParentNode().removeChild(comment); + break; + } + if (comment.containsDashDash()) { + report(DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + checkTextValidity(comment.buffer); + break; + + case Node.PROCESSING_INSTRUCTION_NODE: + checkTextValidity(((ProcessingInstructionImpl) node).getData()); + break; + + case Node.ATTRIBUTE_NODE: + checkTextValidity(((AttrImpl) node).getValue()); + break; + + case Node.ELEMENT_NODE: + ElementImpl element = (ElementImpl) node; + NamedNodeMap attributes = element.getAttributes(); + for (int i = 0; i < attributes.getLength(); i++) { + normalize(attributes.item(i)); + } + // fall through + + case Node.DOCUMENT_NODE: + case Node.DOCUMENT_FRAGMENT_NODE: + Node next; + for (Node child = node.getFirstChild(); child != null; child = next) { + // lookup next eagerly because normalize() may remove its subject + next = child.getNextSibling(); + normalize(child); + } + break; + + case Node.NOTATION_NODE: + case Node.DOCUMENT_TYPE_NODE: + case Node.ENTITY_NODE: + case Node.ENTITY_REFERENCE_NODE: + break; + + default: + throw new DOMException(DOMException.NOT_SUPPORTED_ERR, + "Unsupported node type " + node.getNodeType()); + } + } + + private void checkTextValidity(CharSequence s) { + if 
(wellFormed && !isValid(s)) { + report(DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + + /** + * Returns true if all of the characters in the text are permitted for use + * in XML documents. + */ + private boolean isValid(CharSequence text) { + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + // as defined by http://www.w3.org/TR/REC-xml/#charsets. + boolean valid = c == 0x9 || c == 0xA || c == 0xD + || (c >= 0x20 && c <= 0xd7ff) + || (c >= 0xe000 && c <= 0xfffd); + if (!valid) { + return false; + } + } + return true; + } + + private void report(short severity, String type) { + if (errorHandler != null) { + // TODO: abort if handleError returns false + errorHandler.handleError(new DOMErrorImpl(severity, type, type)); + } + } } diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java index 035e1bb..b009128 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java @@ -44,10 +44,10 @@ import org.w3c.dom.Text; * the DOM implementation can easily access them while maintaining the DOM tree * structure. 
*/ -public class DocumentImpl extends InnerNodeImpl implements Document { +public final class DocumentImpl extends InnerNodeImpl implements Document { private DOMImplementation domImplementation; - private DOMConfiguration domConfiguration; + private DOMConfigurationImpl domConfiguration; /* * The default values of these fields are specified by the Document @@ -369,7 +369,12 @@ public class DocumentImpl extends InnerNodeImpl implements Document { } public void normalizeDocument() { - throw new UnsupportedOperationException(); // TODO + Element root = getDocumentElement(); + if (root == null) { + return; + } + + ((DOMConfigurationImpl) getDomConfig()).normalize(root); } public Node renameNode(Node n, String namespaceURI, String qualifiedName) diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java index 9cee352..fa75e21 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java @@ -19,7 +19,6 @@ package org.apache.harmony.xml.dom; import org.w3c.dom.DOMException; import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import org.w3c.dom.Text; import java.util.ArrayList; import java.util.List; @@ -154,29 +153,14 @@ public abstract class InnerNodeImpl extends LeafNodeImpl { */ @Override public final void normalize() { - Text next = null; // null if next doesn't exist or is not a TEXT_NODE - for (int i = children.size() - 1; i >= 0; i--) { - Node node = children.get(i); + Node next; + for (Node node = getFirstChild(); node != null; node = next) { + next = node.getNextSibling(); node.normalize(); - if (node.getNodeType() != Node.TEXT_NODE) { - next = null; - continue; + if (node.getNodeType() == Node.TEXT_NODE) { + ((TextImpl) node).minimize(); } - - Text text = (Text) node; - - if (text.getLength() == 0) { - removeChild(text); - continue; - } - - if (next != null) { - 
text.appendData(next.getData()); - removeChild(next); - } - - next = text; } } diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java index 179b33c..115245d 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java @@ -30,7 +30,7 @@ import org.w3c.dom.ProcessingInstruction; * the DOM implementation can easily access them while maintaining the DOM tree * structure. */ -public class ProcessingInstructionImpl extends LeafNodeImpl implements +public final class ProcessingInstructionImpl extends LeafNodeImpl implements ProcessingInstruction { private String target; @@ -39,7 +39,7 @@ public class ProcessingInstructionImpl extends LeafNodeImpl implements ProcessingInstructionImpl(DocumentImpl document, String target, String data) { super(document); - this.target = target; + this.target = target; // TODO: validate that target is well-formed this.data = data; } diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java index 3840ef4..d39dff2 100644 --- a/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java +++ b/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java @@ -140,4 +140,33 @@ public class TextImpl extends CharacterDataImpl implements Text { ? (TextImpl) nextSibling : null; } + + /** + * Tries to remove this node using itself and the previous node as context. + * If this node's text is empty, this node is removed and null is returned. + * If the previous node exists and is a text node, this node's text will be + * appended to that node's text and this node will be removed. + * + * <p>Although this method alters the structure of the DOM tree, it does + * not alter the document's semantics. 
+ * + * @return the node holding this node's text at the end of the operation. + * Can be null if this node contained the empty string. + */ + public final TextImpl minimize() { + if (getLength() == 0) { + parent.removeChild(this); + return null; + } + + Node previous = getPreviousSibling(); + if (previous == null || previous.getNodeType() != Node.TEXT_NODE) { + return this; + } + + TextImpl previousText = (TextImpl) previous; + previousText.buffer.append(buffer); + parent.removeChild(this); + return previousText; + } } diff --git a/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java b/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java index 81eda73..3895a53 100644 --- a/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java +++ b/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java @@ -32,7 +32,7 @@ import org.w3c.dom.DOMLocator; * @xsl.usage internal
*/
-final class DOMErrorImpl implements DOMError {
+public final class DOMErrorImpl implements DOMError {
/** private data members */
@@ -70,7 +70,7 @@ final class DOMErrorImpl implements DOMError { * @param message
* @param type
*/
- DOMErrorImpl(short severity, String message, String type) {
+ public DOMErrorImpl(short severity, String message, String type) {
fSeverity = severity;
fMessage = message;
fType = type;
@@ -82,7 +82,7 @@ final class DOMErrorImpl implements DOMError { * @param type
* @param exception
*/
- DOMErrorImpl(short severity, String message, String type,
+ public DOMErrorImpl(short severity, String message, String type,
Exception exception) {
fSeverity = severity;
fMessage = message;
@@ -98,7 +98,7 @@ final class DOMErrorImpl implements DOMError { * @param relatedData
* @param location
*/
- DOMErrorImpl(short severity, String message, String type,
+ public DOMErrorImpl(short severity, String message, String type,
Exception exception, Object relatedData, DOMLocatorImpl location) {
fSeverity = severity;
fMessage = message;
diff --git a/xml/src/test/java/tests/xml/NormalizeTest.java b/xml/src/test/java/tests/xml/NormalizeTest.java index b10ea9c..6fa6c97 100644 --- a/xml/src/test/java/tests/xml/NormalizeTest.java +++ b/xml/src/test/java/tests/xml/NormalizeTest.java @@ -17,6 +17,8 @@ package tests.xml; import junit.framework.TestCase; +import org.w3c.dom.CDATASection; +import org.w3c.dom.Comment; import org.w3c.dom.DOMConfiguration; import org.w3c.dom.DOMError; import org.w3c.dom.DOMErrorHandler; @@ -25,6 +27,7 @@ import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import org.w3c.dom.ProcessingInstruction; import org.w3c.dom.Text; import org.xml.sax.InputSource; @@ -37,6 +40,7 @@ import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; /** @@ -61,26 +65,31 @@ public class NormalizeTest extends TestCase { } public void testCanonicalForm() { + assertEquals(false, domConfiguration.getParameter("canonical-form")); assertSupported("canonical-form", false); assertUnsupported("canonical-form", true); } public void testCdataSections() { + assertEquals(true, domConfiguration.getParameter("cdata-sections")); assertSupported("cdata-sections", false); assertSupported("cdata-sections", true); } public void testCheckCharacterNormalization() { + assertEquals(false, domConfiguration.getParameter("check-character-normalization")); assertSupported("check-character-normalization", false); assertUnsupported("check-character-normalization", true); } public void testComments() { + assertEquals(true, domConfiguration.getParameter("comments")); assertSupported("comments", false); assertSupported("comments", true); } public void testDatatypeNormalization() { + assertEquals(false, domConfiguration.getParameter("datatype-normalization")); assertSupported("datatype-normalization", false); assertSupported("datatype-normalization", true); @@ -95,16 
+104,19 @@ public class NormalizeTest extends TestCase { } public void testElementContentWhitespace() { + assertEquals(true, domConfiguration.getParameter("element-content-whitespace")); assertUnsupported("element-content-whitespace", false); assertSupported("element-content-whitespace", true); } public void testEntities() { + assertEquals(true, domConfiguration.getParameter("entities")); assertSupported("entities", false); assertSupported("entities", true); } public void testErrorHandler() { + assertEquals(null, domConfiguration.getParameter("error-handler")); assertSupported("error-handler", null); assertSupported("error-handler", new DOMErrorHandler() { public boolean handleError(DOMError error) { @@ -114,6 +126,7 @@ public class NormalizeTest extends TestCase { } public void testInfoset() { + assertEquals(false, domConfiguration.getParameter("infoset")); assertSupported("infoset", false); assertSupported("infoset", true); } @@ -162,21 +175,25 @@ public class NormalizeTest extends TestCase { } public void testNamespaces() { + assertEquals(true, domConfiguration.getParameter("namespaces")); assertSupported("namespaces", false); assertSupported("namespaces", true); } public void testNamespaceDeclarations() { + assertEquals(true, domConfiguration.getParameter("namespace-declarations")); assertUnsupported("namespace-declarations", false); // supported in RI 6 assertSupported("namespace-declarations", true); } public void testNormalizeCharacters() { + assertEquals(false, domConfiguration.getParameter("normalize-characters")); assertSupported("normalize-characters", false); assertUnsupported("normalize-characters", true); } public void testSchemaLocation() { + assertEquals(null, domConfiguration.getParameter("schema-location")); assertSupported("schema-location", "http://foo"); assertSupported("schema-location", null); } @@ -190,26 +207,31 @@ public class NormalizeTest extends TestCase { } public void testSchemaTypeXmlSchema() { + assertEquals(null, 
domConfiguration.getParameter("schema-type")); assertSupported("schema-type", null); assertSupported("schema-type", "http://www.w3.org/2001/XMLSchema"); } public void testSplitCdataSections() { + assertEquals(true, domConfiguration.getParameter("split-cdata-sections")); assertSupported("split-cdata-sections", false); assertSupported("split-cdata-sections", true); } public void testValidate() { + assertEquals(false, domConfiguration.getParameter("validate")); assertSupported("validate", false); assertSupported("validate", true); } public void testValidateIfSchema() { + assertEquals(false, domConfiguration.getParameter("validate-if-schema")); assertSupported("validate-if-schema", false); assertUnsupported("validate-if-schema", true); } public void testWellFormed() { + assertEquals(true, domConfiguration.getParameter("well-formed")); assertSupported("well-formed", false); assertSupported("well-formed", true); } @@ -314,30 +336,26 @@ public class NormalizeTest extends TestCase { public void testCdataSectionsNotHonoredByNodeNormalize() throws Exception { String xml = "<foo>ABC<![CDATA[DEF]]>GHI</foo>"; - document = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(new InputSource(new StringReader(xml))); - document.getDomConfig().setParameter("cdata-sections", true); + parse(xml); + domConfiguration.setParameter("cdata-sections", true); document.getDocumentElement().normalize(); assertEquals(xml, domToString(document)); - document = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(new InputSource(new StringReader(xml))); - document.getDomConfig().setParameter("cdata-sections", false); + parse(xml); + domConfiguration.setParameter("cdata-sections", false); document.getDocumentElement().normalize(); assertEquals(xml, domToString(document)); } public void testCdataSectionsHonoredByDocumentNormalize() throws Exception { String xml = "<foo>ABC<![CDATA[DEF]]>GHI</foo>"; - document = DocumentBuilderFactory.newInstance().newDocumentBuilder() - 
.parse(new InputSource(new StringReader(xml))); - document.getDomConfig().setParameter("cdata-sections", true); + parse(xml); + domConfiguration.setParameter("cdata-sections", true); document.normalizeDocument(); assertEquals(xml, domToString(document)); - document = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(new InputSource(new StringReader(xml))); - document.getDomConfig().setParameter("cdata-sections", false); + parse(xml); + domConfiguration.setParameter("cdata-sections", false); document.normalizeDocument(); String expected = xml.replace("<![CDATA[DEF]]>", "DEF"); assertEquals(expected, domToString(document)); @@ -367,6 +385,170 @@ public class NormalizeTest extends TestCase { assertChildren(document.getDocumentElement(), "<br>", "<br>", "<br>"); } + public void testRetainingComments() throws Exception { + String xml = "<foo>ABC<!-- bar -->DEF<!-- baz -->GHI</foo>"; + parse(xml); + domConfiguration.setParameter("comments", true); + document.normalizeDocument(); + assertEquals(xml, domToString(document)); + } + + public void testCommentContainingDoubleDash() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + root.appendChild(document.createComment("ABC -- DEF")); + document.normalizeDocument(); + errorRecorder.assertAllErrors(DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + + public void testStrippingComments() throws Exception { + String xml = "<foo>ABC<!-- bar -->DEF<!-- baz -->GHI</foo>"; + parse(xml); + domConfiguration.setParameter("comments", false); + document.normalizeDocument(); + assertChildren(document.getDocumentElement(), "ABCDEFGHI"); + } + + public void testSplittingCdataSectionsSplit() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + 
domConfiguration.setParameter("split-cdata-sections", true); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + root.appendChild(document.createCDATASection("ABC]]>DEF]]>GHI")); + document.normalizeDocument(); + errorRecorder.assertAllErrors(DOMError.SEVERITY_WARNING, "cdata-sections-splitted"); + assertChildren(root, "<![CDATA[ABC]]]]>", "<![CDATA[>DEF]]]]>", "<![CDATA[>GHI]]>"); + } + + public void testSplittingCdataSectionsReportError() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("split-cdata-sections", false); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + root.appendChild(document.createCDATASection("ABC]]>DEF")); + document.normalizeDocument(); + errorRecorder.assertAllErrors(DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + + public void testInvalidCharactersCdata() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("cdata-sections", true); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + CDATASection cdata = document.createCDATASection(""); + root.appendChild(cdata); + + for (int c = 0; c <= Character.MAX_VALUE; c++) { + cdata.setData(new String(new char[]{ 'A', 'B', (char) c })); + document.normalizeDocument(); + if (isValid((char) c)) { + assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors); + } else { + errorRecorder.assertAllErrors("For character " + c, + DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + } + + public void testInvalidCharactersText() throws Exception { 
+ ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + Text text = document.createTextNode(""); + root.appendChild(text); + + for (int c = 0; c <= Character.MAX_VALUE; c++) { + text.setData(new String(new char[]{ 'A', 'B', (char) c })); + document.normalizeDocument(); + if (isValid((char) c)) { + assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors); + } else { + errorRecorder.assertAllErrors("For character " + c, + DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + } + + public void testInvalidCharactersAttribute() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + + for (int c = 0; c <= Character.MAX_VALUE; c++) { + root.setAttribute("bar", new String(new char[] { 'A', 'B', (char) c})); + document.normalizeDocument(); + if (isValid((char) c)) { + assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors); + } else { + errorRecorder.assertAllErrors("For character " + c, + DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + } + + public void testInvalidCharactersComment() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + Comment comment = document.createComment(""); + root.appendChild(comment); + + for (int c = 0; c <= Character.MAX_VALUE; c++) { + comment.setData(new String(new char[] { 'A', 'B', (char) c})); + document.normalizeDocument(); + if (isValid((char) c)) { + 
assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors); + } else { + errorRecorder.assertAllErrors("For character " + c, + DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + } + + public void testInvalidCharactersProcessingInstructionData() throws Exception { + ErrorRecorder errorRecorder = new ErrorRecorder(); + domConfiguration.setParameter("error-handler", errorRecorder); + domConfiguration.setParameter("namespaces", false); + Element root = document.createElement("foo"); + document.appendChild(root); + ProcessingInstruction pi = document.createProcessingInstruction("foo", ""); + root.appendChild(pi); + + for (int c = 0; c <= Character.MAX_VALUE; c++) { + pi.setData(new String(new char[] { 'A', 'B', (char) c})); + document.normalizeDocument(); + if (isValid((char) c)) { + assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors); + } else { + errorRecorder.assertAllErrors("For character " + c, + DOMError.SEVERITY_ERROR, "wf-invalid-character"); + } + } + } + + // TODO: test for surrogates + + private boolean isValid(char c) { + // as defined by http://www.w3.org/TR/REC-xml/#charsets. + return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xd7ff) + || (c >= 0xe000 && c <= 0xfffd); + } + private Document createDocumentWithAdjacentTexts(String... texts) throws Exception { Document result = DocumentBuilderFactory.newInstance() .newDocumentBuilder().newDocument(); @@ -387,13 +569,23 @@ public class NormalizeTest extends TestCase { NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); - actual.add(node.getNodeType() == Node.TEXT_NODE - ? 
((Text) node).getData() - : "<" + node.getNodeName() + ">"); + if (node.getNodeType() == Node.TEXT_NODE) { + actual.add(((Text) node).getData()); + } else if (node.getNodeType() == Node.CDATA_SECTION_NODE) { + actual.add("<![CDATA[" + ((CDATASection) node).getData() + "]]>"); + } else { + actual.add("<" + node.getNodeName() + ">"); + } } assertEquals(Arrays.asList(texts), actual); } + private void parse(String xml) throws Exception { + document = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(new InputSource(new StringReader(xml))); + domConfiguration = document.getDomConfig(); + } + private String domToString(Document document) throws TransformerException { StringWriter writer = new StringWriter(); TransformerFactory.newInstance().newTransformer() @@ -401,4 +593,26 @@ public class NormalizeTest extends TestCase { String xml = writer.toString(); return xml.replaceFirst("<\\?xml[^?]*\\?>", ""); } + + private class ErrorRecorder implements DOMErrorHandler { + private final List<DOMError> errors = new ArrayList<DOMError>(); + + public boolean handleError(DOMError error) { + errors.add(error); + return true; + } + + public void assertAllErrors(int severity, String type) { + assertAllErrors("Expected one or more " + type + " errors", severity, type); + } + + public void assertAllErrors(String message, int severity, String type) { + assertFalse(message, errors.isEmpty()); + for (DOMError error : errors) { + assertEquals(message, severity, error.getSeverity()); + assertEquals(message, type, error.getType()); + } + errors.clear(); + } + } } |