summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jesse Wilson <jessewilson@google.com> 2010-03-01 17:31:53 -0800
committer Jesse Wilson <jessewilson@google.com> 2010-03-01 19:03:40 -0800
commit 8092253eb6a1cff91f0e4953f1387165169157b5 (patch)
tree 224d9e121b4053c9a3dc480e6983a389cd61f745
parent f06779ef253298411e2151a990d6f14b2cb42ce3 (diff)
download libcore-8092253eb6a1cff91f0e4953f1387165169157b5.zip
libcore-8092253eb6a1cff91f0e4953f1387165169157b5.tar.gz
libcore-8092253eb6a1cff91f0e4953f1387165169157b5.tar.bz2
Implementing almost all of Document.normalizeDocument().
This follows the rules specified by DOMConfiguration. In particular, it covers: replacing CDATA nodes with text; splitting CDATA nodes; merging text nodes; stripping comments; and detecting invalid characters. I haven't added the normalization code for XML validation or namespaces.
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java 41
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java 2
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java 9
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java 120
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java 11
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java 26
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java 4
-rw-r--r-- xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java 29
-rw-r--r-- xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java 8
-rw-r--r-- xml/src/test/java/tests/xml/NormalizeTest.java 244
10 files changed, 446 insertions, 48 deletions
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java
index b28c9da..7d122f2 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/CDATASectionImpl.java
@@ -29,7 +29,7 @@ import org.w3c.dom.Node;
* the DOM implementation can easily access them while maintaining the DOM tree
* structure.
*/
-public class CDATASectionImpl extends TextImpl implements CDATASection {
+public final class CDATASectionImpl extends TextImpl implements CDATASection {
public CDATASectionImpl(DocumentImpl document, String data) {
super(document, data);
@@ -45,4 +45,43 @@ public class CDATASectionImpl extends TextImpl implements CDATASection {
return Node.CDATA_SECTION_NODE;
}
+    /**
+     * Splits this CDATA node into parts that do not contain a "]]>" sequence.
+     * Any newly created nodes will be inserted before this node.
+     *
+     * <p>Each "]]>" is divided so that "]]" ends one section and ">" begins
+     * the next. Does nothing if this node does not need splitting.
+     */
+    public void split() {
+        if (!needsSplitting()) {
+            return;
+        }
+
+        Node parent = getParentNode();
+        // A negative limit keeps trailing empty strings. Without it, data that
+        // ends with "]]>" would lose its final (empty) part and the last real
+        // part would be emitted twice; data that is exactly "]]>" would yield
+        // an empty array and fail on parts[0].
+        String[] parts = getData().split("\\]\\]>", -1);
+        parent.insertBefore(new CDATASectionImpl(document, parts[0] + "]]"), this);
+        for (int p = 1; p < parts.length - 1; p++) {
+            parent.insertBefore(new CDATASectionImpl(document, ">" + parts[p] + "]]"), this);
+        }
+        setData(">" + parts[parts.length - 1]);
+    }
+
+    /**
+     * Reports whether this section's data includes "]]>", the sequence that
+     * is illegal inside a CDATA section. Sections for which this returns true
+     * must be {@link #split} before they are serialized.
+     */
+    public boolean needsSplitting() {
+        int terminator = buffer.indexOf("]]>");
+        return terminator >= 0;
+    }
+
+    /**
+     * Swaps this CDATA section for a plain text node carrying the same data.
+     * The text node is inserted immediately before this node, and this node
+     * is then removed from its parent.
+     *
+     * @return the text node that now stands in for this node.
+     */
+    public TextImpl replaceWithText() {
+        TextImpl textNode = new TextImpl(document, getData());
+        parent.insertBefore(textNode, this);
+        parent.removeChild(this);
+        return textNode;
+    }
}
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java
index 6354747..2d4c3b4 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/CharacterDataImpl.java
@@ -32,7 +32,7 @@ import org.w3c.dom.DOMException;
public abstract class CharacterDataImpl extends LeafNodeImpl implements
CharacterData {
- private StringBuffer buffer;
+ protected StringBuffer buffer;
CharacterDataImpl(DocumentImpl document, String data) {
super(document);
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java
index 2d4a9c5..5f8a4e0 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/CommentImpl.java
@@ -29,7 +29,7 @@ import org.w3c.dom.Node;
* the DOM implementation can easily access them while maintaining the DOM tree
* structure.
*/
-public class CommentImpl extends CharacterDataImpl implements Comment {
+public final class CommentImpl extends CharacterDataImpl implements Comment {
CommentImpl(DocumentImpl document, String data) {
super(document, data);
@@ -45,4 +45,11 @@ public class CommentImpl extends CharacterDataImpl implements Comment {
return Node.COMMENT_NODE;
}
+    /**
+     * Reports whether this comment's text includes "--", a sequence that is
+     * illegal inside a comment. Comments for which this returns true may not
+     * be serialized.
+     */
+    public boolean containsDashDash() {
+        int firstDashDash = buffer.indexOf("--");
+        return firstDashDash >= 0;
+    }
}
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java
index 2f57a4c..1a8acbc 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/DOMConfigurationImpl.java
@@ -16,10 +16,14 @@
package org.apache.harmony.xml.dom;
+import org.apache.xml.serializer.dom3.DOMErrorImpl;
import org.w3c.dom.DOMConfiguration;
+import org.w3c.dom.DOMError;
import org.w3c.dom.DOMErrorHandler;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMStringList;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
import java.util.Map;
import java.util.TreeMap;
@@ -368,4 +372,120 @@ public final class DOMConfigurationImpl implements DOMConfiguration {
}
};
}
+
+    /**
+     * Normalizes {@code node} and, for elements, documents and document
+     * fragments, all of its descendants, according to this configuration.
+     *
+     * <ul>
+     *   <li>CDATA sections are kept when {@code cdataSections} is set (and
+     *       split around "]]>" when {@code splitCdataSections} is set);
+     *       otherwise they are replaced with text nodes.
+     *   <li>Text nodes are removed when empty and merged into a preceding
+     *       text node.
+     *   <li>Comments are removed unless {@code comments} is set; retained
+     *       comments are checked for "--".
+     *   <li>Character data, processing instruction data and attribute values
+     *       are checked for characters that XML does not permit.
+     * </ul>
+     *
+     * <p>Problems are reported to the error handler, if one is registered.
+     *
+     * @param node the root of the subtree to normalize.
+     * @throws DOMException with code NOT_SUPPORTED_ERR if a node of an
+     *     unrecognized type is encountered.
+     */
+    public void normalize(Node node) {
+        /*
+         * Since we don't validate, this code doesn't take into account the
+         * following "supported" parameters: datatype-normalization, entities,
+         * schema-location, schema-type, or validate.
+         *
+         * TODO: normalize namespaces
+         */
+
+        switch (node.getNodeType()) {
+            case Node.CDATA_SECTION_NODE:
+                CDATASectionImpl cdata = (CDATASectionImpl) node;
+                if (cdataSections) {
+                    // Validate before splitting: split() moves the leading
+                    // parts into new siblings inserted before this node, and
+                    // those siblings are never visited by the caller's loop.
+                    // Checking afterwards would only cover the final part.
+                    checkTextValidity(cdata.buffer);
+                    if (cdata.needsSplitting()) {
+                        if (splitCdataSections) {
+                            cdata.split();
+                            report(DOMError.SEVERITY_WARNING, "cdata-sections-splitted");
+                        } else {
+                            report(DOMError.SEVERITY_ERROR, "wf-invalid-character");
+                        }
+                    }
+                    break;
+                }
+                node = cdata.replaceWithText();
+                // fall through
+
+            case Node.TEXT_NODE:
+                TextImpl text = (TextImpl) node;
+                // minimize() returns null when the node was empty and removed
+                text = text.minimize();
+                if (text != null) {
+                    checkTextValidity(text.buffer);
+                }
+                break;
+
+            case Node.COMMENT_NODE:
+                CommentImpl comment = (CommentImpl) node;
+                if (!comments) {
+                    comment.getParentNode().removeChild(comment);
+                    break;
+                }
+                if (comment.containsDashDash()) {
+                    report(DOMError.SEVERITY_ERROR, "wf-invalid-character");
+                }
+                checkTextValidity(comment.buffer);
+                break;
+
+            case Node.PROCESSING_INSTRUCTION_NODE:
+                checkTextValidity(((ProcessingInstructionImpl) node).getData());
+                break;
+
+            case Node.ATTRIBUTE_NODE:
+                checkTextValidity(((AttrImpl) node).getValue());
+                break;
+
+            case Node.ELEMENT_NODE:
+                ElementImpl element = (ElementImpl) node;
+                NamedNodeMap attributes = element.getAttributes();
+                for (int i = 0; i < attributes.getLength(); i++) {
+                    normalize(attributes.item(i));
+                }
+                // fall through
+
+            case Node.DOCUMENT_NODE:
+            case Node.DOCUMENT_FRAGMENT_NODE:
+                Node next;
+                for (Node child = node.getFirstChild(); child != null; child = next) {
+                    // lookup next eagerly because normalize() may remove its subject
+                    next = child.getNextSibling();
+                    normalize(child);
+                }
+                break;
+
+            case Node.NOTATION_NODE:
+            case Node.DOCUMENT_TYPE_NODE:
+            case Node.ENTITY_NODE:
+            case Node.ENTITY_REFERENCE_NODE:
+                break;
+
+            default:
+                throw new DOMException(DOMException.NOT_SUPPORTED_ERR,
+                        "Unsupported node type " + node.getNodeType());
+        }
+    }
+
+    /**
+     * Reports a "wf-invalid-character" error when the {@code wellFormed} flag
+     * is set and {@code s} contains a character that XML does not permit.
+     */
+    private void checkTextValidity(CharSequence s) {
+        if (!wellFormed) {
+            return;
+        }
+        if (isValid(s)) {
+            return;
+        }
+        report(DOMError.SEVERITY_ERROR, "wf-invalid-character");
+    }
+
+    /**
+     * Returns true if all of the characters in the text are permitted for use
+     * in XML documents.
+     *
+     * <p>The text is examined by Unicode code point rather than by UTF-16
+     * unit, so supplementary characters encoded as a well-formed surrogate
+     * pair are accepted. An unpaired surrogate is rejected.
+     */
+    private boolean isValid(CharSequence text) {
+        int i = 0;
+        while (i < text.length()) {
+            // yields the surrogate itself when it is not part of a valid pair
+            int c = Character.codePointAt(text, i);
+            // as defined by http://www.w3.org/TR/REC-xml/#charsets.
+            boolean valid = c == 0x9 || c == 0xA || c == 0xD
+                    || (c >= 0x20 && c <= 0xd7ff)
+                    || (c >= 0xe000 && c <= 0xfffd)
+                    || (c >= 0x10000 && c <= 0x10ffff);
+            if (!valid) {
+                return false;
+            }
+            i += Character.charCount(c);
+        }
+        return true;
+    }
+
+    /**
+     * Hands a new error of the given severity to the registered error
+     * handler. The type string is used as both the error's message and its
+     * type. Nothing happens when no handler is registered.
+     */
+    private void report(short severity, String type) {
+        if (errorHandler == null) {
+            return;
+        }
+        // TODO: abort if handleError returns false
+        DOMErrorImpl error = new DOMErrorImpl(severity, type, type);
+        errorHandler.handleError(error);
+    }
}
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java
index 035e1bb..b009128 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/DocumentImpl.java
@@ -44,10 +44,10 @@ import org.w3c.dom.Text;
* the DOM implementation can easily access them while maintaining the DOM tree
* structure.
*/
-public class DocumentImpl extends InnerNodeImpl implements Document {
+public final class DocumentImpl extends InnerNodeImpl implements Document {
private DOMImplementation domImplementation;
- private DOMConfiguration domConfiguration;
+ private DOMConfigurationImpl domConfiguration;
/*
* The default values of these fields are specified by the Document
@@ -369,7 +369,12 @@ public class DocumentImpl extends InnerNodeImpl implements Document {
}
public void normalizeDocument() {
- throw new UnsupportedOperationException(); // TODO
+ // Normalization is rooted at the document element; without one there is
+ // nothing to do.
+ Element root = getDocumentElement();
+ if (root == null) {
+ return;
+ }
+
+ // The configuration object applies the normalization rules to the
+ // subtree below the document element.
+ ((DOMConfigurationImpl) getDomConfig()).normalize(root);
}
public Node renameNode(Node n, String namespaceURI, String qualifiedName)
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java
index 9cee352..fa75e21 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/InnerNodeImpl.java
@@ -19,7 +19,6 @@ package org.apache.harmony.xml.dom;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
-import org.w3c.dom.Text;
import java.util.ArrayList;
import java.util.List;
@@ -154,29 +153,14 @@ public abstract class InnerNodeImpl extends LeafNodeImpl {
*/
@Override
public final void normalize() {
- Text next = null; // null if next doesn't exist or is not a TEXT_NODE
- for (int i = children.size() - 1; i >= 0; i--) {
- Node node = children.get(i);
+ Node next;
+ for (Node node = getFirstChild(); node != null; node = next) {
+ // Look up the next sibling before touching node: minimize() may remove
+ // node from this parent.
+ next = node.getNextSibling();
node.normalize();
- if (node.getNodeType() != Node.TEXT_NODE) {
- next = null;
- continue;
+ // Text children are dropped when empty or folded into a preceding text
+ // node; CDATA sections have a different node type and are left alone.
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ ((TextImpl) node).minimize();
}
-
- Text text = (Text) node;
-
- if (text.getLength() == 0) {
- removeChild(text);
- continue;
- }
-
- if (next != null) {
- text.appendData(next.getData());
- removeChild(next);
- }
-
- next = text;
}
}
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java
index 179b33c..115245d 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/ProcessingInstructionImpl.java
@@ -30,7 +30,7 @@ import org.w3c.dom.ProcessingInstruction;
* the DOM implementation can easily access them while maintaining the DOM tree
* structure.
*/
-public class ProcessingInstructionImpl extends LeafNodeImpl implements
+public final class ProcessingInstructionImpl extends LeafNodeImpl implements
ProcessingInstruction {
private String target;
@@ -39,7 +39,7 @@ public class ProcessingInstructionImpl extends LeafNodeImpl implements
ProcessingInstructionImpl(DocumentImpl document, String target, String data) {
super(document);
- this.target = target;
+ this.target = target; // TODO: validate that target is well-formed
this.data = data;
}
diff --git a/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java b/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java
index 3840ef4..d39dff2 100644
--- a/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java
+++ b/xml/src/main/java/org/apache/harmony/xml/dom/TextImpl.java
@@ -140,4 +140,33 @@ public class TextImpl extends CharacterDataImpl implements Text {
? (TextImpl) nextSibling
: null;
}
+
+    /**
+     * Attempts to eliminate this node, looking only at this node and its
+     * previous sibling. An empty node is removed from its parent. Otherwise,
+     * if the previous sibling is a text node, this node's text is appended to
+     * that sibling and this node is removed. In every other case this node is
+     * left untouched.
+     *
+     * <p>Although this method alters the structure of the DOM tree, it does
+     * not alter the document's semantics.
+     *
+     * @return the node holding this node's text at the end of the operation,
+     *     or null if this node contained the empty string and was removed.
+     */
+    public final TextImpl minimize() {
+        if (getLength() == 0) {
+            parent.removeChild(this);
+            return null;
+        }
+
+        Node sibling = getPreviousSibling();
+        boolean mergeable = sibling != null && sibling.getNodeType() == Node.TEXT_NODE;
+        if (mergeable) {
+            TextImpl target = (TextImpl) sibling;
+            target.buffer.append(buffer);
+            parent.removeChild(this);
+            return target;
+        }
+        return this;
+    }
}
diff --git a/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java b/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java
index 81eda73..3895a53 100644
--- a/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java
+++ b/xml/src/main/java/org/apache/xml/serializer/dom3/DOMErrorImpl.java
@@ -32,7 +32,7 @@ import org.w3c.dom.DOMLocator;
* @xsl.usage internal
*/
-final class DOMErrorImpl implements DOMError {
+public final class DOMErrorImpl implements DOMError {
/** private data members */
@@ -70,7 +70,7 @@ final class DOMErrorImpl implements DOMError {
* @param message
* @param type
*/
- DOMErrorImpl(short severity, String message, String type) {
+ public DOMErrorImpl(short severity, String message, String type) {
fSeverity = severity;
fMessage = message;
fType = type;
@@ -82,7 +82,7 @@ final class DOMErrorImpl implements DOMError {
* @param type
* @param exception
*/
- DOMErrorImpl(short severity, String message, String type,
+ public DOMErrorImpl(short severity, String message, String type,
Exception exception) {
fSeverity = severity;
fMessage = message;
@@ -98,7 +98,7 @@ final class DOMErrorImpl implements DOMError {
* @param relatedData
* @param location
*/
- DOMErrorImpl(short severity, String message, String type,
+ public DOMErrorImpl(short severity, String message, String type,
Exception exception, Object relatedData, DOMLocatorImpl location) {
fSeverity = severity;
fMessage = message;
diff --git a/xml/src/test/java/tests/xml/NormalizeTest.java b/xml/src/test/java/tests/xml/NormalizeTest.java
index b10ea9c..6fa6c97 100644
--- a/xml/src/test/java/tests/xml/NormalizeTest.java
+++ b/xml/src/test/java/tests/xml/NormalizeTest.java
@@ -17,6 +17,8 @@
package tests.xml;
import junit.framework.TestCase;
+import org.w3c.dom.CDATASection;
+import org.w3c.dom.Comment;
import org.w3c.dom.DOMConfiguration;
import org.w3c.dom.DOMError;
import org.w3c.dom.DOMErrorHandler;
@@ -25,6 +27,7 @@ import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
@@ -37,6 +40,7 @@ import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
/**
@@ -61,26 +65,31 @@ public class NormalizeTest extends TestCase {
}
public void testCanonicalForm() {
+ assertEquals(false, domConfiguration.getParameter("canonical-form"));
assertSupported("canonical-form", false);
assertUnsupported("canonical-form", true);
}
public void testCdataSections() {
+ assertEquals(true, domConfiguration.getParameter("cdata-sections"));
assertSupported("cdata-sections", false);
assertSupported("cdata-sections", true);
}
public void testCheckCharacterNormalization() {
+ assertEquals(false, domConfiguration.getParameter("check-character-normalization"));
assertSupported("check-character-normalization", false);
assertUnsupported("check-character-normalization", true);
}
public void testComments() {
+ assertEquals(true, domConfiguration.getParameter("comments"));
assertSupported("comments", false);
assertSupported("comments", true);
}
public void testDatatypeNormalization() {
+ assertEquals(false, domConfiguration.getParameter("datatype-normalization"));
assertSupported("datatype-normalization", false);
assertSupported("datatype-normalization", true);
@@ -95,16 +104,19 @@ public class NormalizeTest extends TestCase {
}
public void testElementContentWhitespace() {
+ assertEquals(true, domConfiguration.getParameter("element-content-whitespace"));
assertUnsupported("element-content-whitespace", false);
assertSupported("element-content-whitespace", true);
}
public void testEntities() {
+ assertEquals(true, domConfiguration.getParameter("entities"));
assertSupported("entities", false);
assertSupported("entities", true);
}
public void testErrorHandler() {
+ assertEquals(null, domConfiguration.getParameter("error-handler"));
assertSupported("error-handler", null);
assertSupported("error-handler", new DOMErrorHandler() {
public boolean handleError(DOMError error) {
@@ -114,6 +126,7 @@ public class NormalizeTest extends TestCase {
}
public void testInfoset() {
+ assertEquals(false, domConfiguration.getParameter("infoset"));
assertSupported("infoset", false);
assertSupported("infoset", true);
}
@@ -162,21 +175,25 @@ public class NormalizeTest extends TestCase {
}
public void testNamespaces() {
+ assertEquals(true, domConfiguration.getParameter("namespaces"));
assertSupported("namespaces", false);
assertSupported("namespaces", true);
}
public void testNamespaceDeclarations() {
+ assertEquals(true, domConfiguration.getParameter("namespace-declarations"));
assertUnsupported("namespace-declarations", false); // supported in RI 6
assertSupported("namespace-declarations", true);
}
public void testNormalizeCharacters() {
+ assertEquals(false, domConfiguration.getParameter("normalize-characters"));
assertSupported("normalize-characters", false);
assertUnsupported("normalize-characters", true);
}
public void testSchemaLocation() {
+ assertEquals(null, domConfiguration.getParameter("schema-location"));
assertSupported("schema-location", "http://foo");
assertSupported("schema-location", null);
}
@@ -190,26 +207,31 @@ public class NormalizeTest extends TestCase {
}
public void testSchemaTypeXmlSchema() {
+ assertEquals(null, domConfiguration.getParameter("schema-type"));
assertSupported("schema-type", null);
assertSupported("schema-type", "http://www.w3.org/2001/XMLSchema");
}
public void testSplitCdataSections() {
+ assertEquals(true, domConfiguration.getParameter("split-cdata-sections"));
assertSupported("split-cdata-sections", false);
assertSupported("split-cdata-sections", true);
}
public void testValidate() {
+ assertEquals(false, domConfiguration.getParameter("validate"));
assertSupported("validate", false);
assertSupported("validate", true);
}
public void testValidateIfSchema() {
+ assertEquals(false, domConfiguration.getParameter("validate-if-schema"));
assertSupported("validate-if-schema", false);
assertUnsupported("validate-if-schema", true);
}
public void testWellFormed() {
+ assertEquals(true, domConfiguration.getParameter("well-formed"));
assertSupported("well-formed", false);
assertSupported("well-formed", true);
}
@@ -314,30 +336,26 @@ public class NormalizeTest extends TestCase {
public void testCdataSectionsNotHonoredByNodeNormalize() throws Exception {
String xml = "<foo>ABC<![CDATA[DEF]]>GHI</foo>";
- document = DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .parse(new InputSource(new StringReader(xml)));
- document.getDomConfig().setParameter("cdata-sections", true);
+ parse(xml);
+ domConfiguration.setParameter("cdata-sections", true);
document.getDocumentElement().normalize();
assertEquals(xml, domToString(document));
- document = DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .parse(new InputSource(new StringReader(xml)));
- document.getDomConfig().setParameter("cdata-sections", false);
+ parse(xml);
+ domConfiguration.setParameter("cdata-sections", false);
document.getDocumentElement().normalize();
assertEquals(xml, domToString(document));
}
public void testCdataSectionsHonoredByDocumentNormalize() throws Exception {
String xml = "<foo>ABC<![CDATA[DEF]]>GHI</foo>";
- document = DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .parse(new InputSource(new StringReader(xml)));
- document.getDomConfig().setParameter("cdata-sections", true);
+ parse(xml);
+ domConfiguration.setParameter("cdata-sections", true);
document.normalizeDocument();
assertEquals(xml, domToString(document));
- document = DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .parse(new InputSource(new StringReader(xml)));
- document.getDomConfig().setParameter("cdata-sections", false);
+ parse(xml);
+ domConfiguration.setParameter("cdata-sections", false);
document.normalizeDocument();
String expected = xml.replace("<![CDATA[DEF]]>", "DEF");
assertEquals(expected, domToString(document));
@@ -367,6 +385,170 @@ public class NormalizeTest extends TestCase {
assertChildren(document.getDocumentElement(), "<br>", "<br>", "<br>");
}
+ public void testRetainingComments() throws Exception {
+ String xml = "<foo>ABC<!-- bar -->DEF<!-- baz -->GHI</foo>";
+ parse(xml);
+ domConfiguration.setParameter("comments", true);
+ document.normalizeDocument();
+ assertEquals(xml, domToString(document));
+ }
+
+ public void testCommentContainingDoubleDash() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ root.appendChild(document.createComment("ABC -- DEF"));
+ document.normalizeDocument();
+ errorRecorder.assertAllErrors(DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+
+ public void testStrippingComments() throws Exception {
+ String xml = "<foo>ABC<!-- bar -->DEF<!-- baz -->GHI</foo>";
+ parse(xml);
+ domConfiguration.setParameter("comments", false);
+ document.normalizeDocument();
+ assertChildren(document.getDocumentElement(), "ABCDEFGHI");
+ }
+
+ public void testSplittingCdataSectionsSplit() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("split-cdata-sections", true);
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ root.appendChild(document.createCDATASection("ABC]]>DEF]]>GHI"));
+ document.normalizeDocument();
+ errorRecorder.assertAllErrors(DOMError.SEVERITY_WARNING, "cdata-sections-splitted");
+ assertChildren(root, "<![CDATA[ABC]]]]>", "<![CDATA[>DEF]]]]>", "<![CDATA[>GHI]]>");
+ }
+
+ public void testSplittingCdataSectionsReportError() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("split-cdata-sections", false);
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ root.appendChild(document.createCDATASection("ABC]]>DEF"));
+ document.normalizeDocument();
+ errorRecorder.assertAllErrors(DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+
+ public void testInvalidCharactersCdata() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("cdata-sections", true);
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ CDATASection cdata = document.createCDATASection("");
+ root.appendChild(cdata);
+
+ for (int c = 0; c <= Character.MAX_VALUE; c++) {
+ cdata.setData(new String(new char[]{ 'A', 'B', (char) c }));
+ document.normalizeDocument();
+ if (isValid((char) c)) {
+ assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors);
+ } else {
+ errorRecorder.assertAllErrors("For character " + c,
+ DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+ }
+ }
+
+ public void testInvalidCharactersText() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ Text text = document.createTextNode("");
+ root.appendChild(text);
+
+ for (int c = 0; c <= Character.MAX_VALUE; c++) {
+ text.setData(new String(new char[]{ 'A', 'B', (char) c }));
+ document.normalizeDocument();
+ if (isValid((char) c)) {
+ assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors);
+ } else {
+ errorRecorder.assertAllErrors("For character " + c,
+ DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+ }
+ }
+
+ public void testInvalidCharactersAttribute() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+
+ for (int c = 0; c <= Character.MAX_VALUE; c++) {
+ root.setAttribute("bar", new String(new char[] { 'A', 'B', (char) c}));
+ document.normalizeDocument();
+ if (isValid((char) c)) {
+ assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors);
+ } else {
+ errorRecorder.assertAllErrors("For character " + c,
+ DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+ }
+ }
+
+ public void testInvalidCharactersComment() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ Comment comment = document.createComment("");
+ root.appendChild(comment);
+
+ for (int c = 0; c <= Character.MAX_VALUE; c++) {
+ comment.setData(new String(new char[] { 'A', 'B', (char) c}));
+ document.normalizeDocument();
+ if (isValid((char) c)) {
+ assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors);
+ } else {
+ errorRecorder.assertAllErrors("For character " + c,
+ DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+ }
+ }
+
+ public void testInvalidCharactersProcessingInstructionData() throws Exception {
+ ErrorRecorder errorRecorder = new ErrorRecorder();
+ domConfiguration.setParameter("error-handler", errorRecorder);
+ domConfiguration.setParameter("namespaces", false);
+ Element root = document.createElement("foo");
+ document.appendChild(root);
+ ProcessingInstruction pi = document.createProcessingInstruction("foo", "");
+ root.appendChild(pi);
+
+ for (int c = 0; c <= Character.MAX_VALUE; c++) {
+ pi.setData(new String(new char[] { 'A', 'B', (char) c}));
+ document.normalizeDocument();
+ if (isValid((char) c)) {
+ assertEquals(Collections.<DOMError>emptyList(), errorRecorder.errors);
+ } else {
+ errorRecorder.assertAllErrors("For character " + c,
+ DOMError.SEVERITY_ERROR, "wf-invalid-character");
+ }
+ }
+ }
+
+ // TODO: test for surrogates
+
+    /** Returns true if {@code c}, taken on its own, is a character XML permits. */
+    private boolean isValid(char c) {
+        // as defined by http://www.w3.org/TR/REC-xml/#charsets.
+        boolean permittedControl = c == 0x9 || c == 0xA || c == 0xD;
+        boolean belowSurrogates = c >= 0x20 && c <= 0xd7ff;
+        boolean aboveSurrogates = c >= 0xe000 && c <= 0xfffd;
+        return permittedControl || belowSurrogates || aboveSurrogates;
+    }
+
private Document createDocumentWithAdjacentTexts(String... texts) throws Exception {
Document result = DocumentBuilderFactory.newInstance()
.newDocumentBuilder().newDocument();
@@ -387,13 +569,23 @@ public class NormalizeTest extends TestCase {
NodeList nodes = element.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
- actual.add(node.getNodeType() == Node.TEXT_NODE
- ? ((Text) node).getData()
- : "<" + node.getNodeName() + ">");
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ actual.add(((Text) node).getData());
+ } else if (node.getNodeType() == Node.CDATA_SECTION_NODE) {
+ actual.add("<![CDATA[" + ((CDATASection) node).getData() + "]]>");
+ } else {
+ actual.add("<" + node.getNodeName() + ">");
+ }
}
assertEquals(Arrays.asList(texts), actual);
}
+    /**
+     * Parses {@code xml} into a fresh document and points the test's
+     * {@code document} and {@code domConfiguration} fields at it.
+     */
+    private void parse(String xml) throws Exception {
+        InputSource source = new InputSource(new StringReader(xml));
+        document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source);
+        domConfiguration = document.getDomConfig();
+    }
+
private String domToString(Document document) throws TransformerException {
StringWriter writer = new StringWriter();
TransformerFactory.newInstance().newTransformer()
@@ -401,4 +593,26 @@ public class NormalizeTest extends TestCase {
String xml = writer.toString();
return xml.replaceFirst("<\\?xml[^?]*\\?>", "");
}
+
+    /**
+     * An error handler that collects every error it is given so that tests
+     * can inspect them afterwards. It always asks the caller to continue.
+     *
+     * <p>Declared static because it uses no state of the enclosing test
+     * instance.
+     */
+    private static class ErrorRecorder implements DOMErrorHandler {
+        private final List<DOMError> errors = new ArrayList<DOMError>();
+
+        public boolean handleError(DOMError error) {
+            errors.add(error);
+            return true;
+        }
+
+        /**
+         * Asserts that at least one error was recorded and that every
+         * recorded error has the given severity and type, then clears the
+         * recorded errors.
+         */
+        public void assertAllErrors(int severity, String type) {
+            assertAllErrors("Expected one or more " + type + " errors", severity, type);
+        }
+
+        /**
+         * Same as {@link #assertAllErrors(int, String)}, using {@code message}
+         * as the assertion failure message.
+         */
+        public void assertAllErrors(String message, int severity, String type) {
+            assertFalse(message, errors.isEmpty());
+            for (DOMError error : errors) {
+                assertEquals(message, severity, error.getSeverity());
+                assertEquals(message, type, error.getType());
+            }
+            errors.clear();
+        }
+    }
}