summaryrefslogtreecommitdiffstats
path: root/xml
diff options
context:
space:
mode:
authorJesse Wilson <jessewilson@google.com>2010-11-23 17:02:32 -0800
committerJesse Wilson <jessewilson@google.com>2010-11-23 17:14:49 -0800
commit7fac047a67ee9a0295e4dc798c7c6880ccd83513 (patch)
treec6e6c29c6ba976712018f9c348c98271c1cc48ea /xml
parente8515df2fdeda9e99dce4035914481e42046beec (diff)
downloadlibcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.zip
libcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.tar.gz
libcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.tar.bz2
Implement DTD parsing in KxmlParser.
This change still has some problems: - default attribute values are not honored. - the doctype token text is lost - entity values are not reparsed - use of parameter entities is ignored but should cause a failure Change-Id: Idd543846840aea481730e690e63212164555cdf1 http://b/3090550
Diffstat (limited to 'xml')
-rw-r--r--xml/src/main/java/org/kxml2/io/KXmlParser.java478
1 files changed, 406 insertions, 72 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java
index 4f41fe0..8c0b3b1 100644
--- a/xml/src/main/java/org/kxml2/io/KXmlParser.java
+++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java
@@ -37,6 +37,11 @@ import org.xmlpull.v1.XmlPullParserException;
*/
public class KXmlParser implements XmlPullParser {
+ private static final int ELEMENTDECL = 11;
+ private static final int ENTITYDECL = 12;
+ private static final int ATTLISTDECL = 13;
+ private static final int NOTATIONDECL = 14;
+ private static final int PARAMETER_ENTITY_REF = 15;
private static final char[] START_COMMENT = { '<', '!', '-', '-' };
private static final char[] END_COMMENT = { '-', '-', '>' };
private static final char[] COMMENT_DOUBLE_DASH = { '-', '-' };
@@ -45,7 +50,19 @@ public class KXmlParser implements XmlPullParser {
private static final char[] START_PROCESSING_INSTRUCTION = { '<', '?' };
private static final char[] END_PROCESSING_INSTRUCTION = { '?', '>' };
private static final char[] START_DOCTYPE = { '<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E' };
- // no END_DOCTYPE because doctype must be parsed
+ private static final char[] SYSTEM = { 'S', 'Y', 'S', 'T', 'E', 'M' };
+ private static final char[] PUBLIC = { 'P', 'U', 'B', 'L', 'I', 'C' };
+ private static final char[] START_ELEMENT = { '<', '!', 'E', 'L', 'E', 'M', 'E', 'N', 'T' };
+ private static final char[] START_ATTLIST = { '<', '!', 'A', 'T', 'T', 'L', 'I', 'S', 'T' };
+ private static final char[] START_ENTITY = { '<', '!', 'E', 'N', 'T', 'I', 'T', 'Y' };
+ private static final char[] START_NOTATION = { '<', '!', 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N' };
+ private static final char[] EMPTY = new char[] { 'E', 'M', 'P', 'T', 'Y' };
+ private static final char[] ANY = new char[]{ 'A', 'N', 'Y' };
+ private static final char[] NDATA = new char[]{ 'N', 'D', 'A', 'T', 'A' };
+ private static final char[] NOTATION = new char[]{ 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N' };
+ private static final char[] REQUIRED = new char[] { 'R', 'E', 'Q', 'U', 'I', 'R', 'E', 'D' };
+ private static final char[] IMPLIED = new char[] { 'I', 'M', 'P', 'L', 'I', 'E', 'D' };
+ private static final char[] FIXED = new char[] { 'F', 'I', 'X', 'E', 'D' };
static final private String UNEXPECTED_EOF = "Unexpected EOF";
static final private String ILLEGAL_TYPE = "Wrong event type";
@@ -285,11 +302,11 @@ public class KXmlParser implements XmlPullParser {
}
}
- type = peekType();
+ type = peekType(false);
if (type == XML_DECLARATION) {
readXmlDeclaration();
- type = peekType();
+ type = peekType(false);
}
text = null;
@@ -322,13 +339,13 @@ public class KXmlParser implements XmlPullParser {
case ENTITY_REF:
if (justOneToken) {
StringBuilder entityTextBuilder = new StringBuilder();
- readEntity(entityTextBuilder, true);
+ readEntity(entityTextBuilder, true, ValueContext.TEXT);
text = entityTextBuilder.toString();
break;
}
// fall-through
case TEXT:
- text = readValue('<', !justOneToken, false);
+ text = readValue('<', !justOneToken, ValueContext.TEXT);
if (depth == 0 && isWhitespace) {
type = IGNORABLE_WHITESPACE;
}
@@ -356,11 +373,14 @@ public class KXmlParser implements XmlPullParser {
}
break;
case DOCDECL:
- String doctype = readDoctype(justOneToken);
+ readDoctype();
if (justOneToken) {
- text = doctype;
+ text = ""; // TODO: support capturing the doctype text
}
break;
+
+ default:
+ throw new XmlPullParserException("Unexpected token", this, null);
}
if (justOneToken) {
@@ -376,7 +396,7 @@ public class KXmlParser implements XmlPullParser {
* report this as text, even if it was a CDATA block or entity
* reference.
*/
- int peek = peekType();
+ int peek = peekType(false);
if (text != null && !text.isEmpty() && peek < TEXT) {
type = TEXT;
return type;
@@ -504,52 +524,337 @@ public class KXmlParser implements XmlPullParser {
return commentText;
}
- private String readDoctype(boolean returnText) throws IOException, XmlPullParserException {
+ /**
+ * Read the document's DTD. Although this parser is non-validating, the DTD
+ * must be parsed to capture entity values and default attribute values.
+ */
+ private void readDoctype() throws IOException, XmlPullParserException {
read(START_DOCTYPE);
+ skip();
+ readName();
+ readExternalId(true);
+ skip();
+ if (peekCharacter() == '[') {
+ readInternalSubset();
+ }
+ skip();
+ read('>');
+ }
- int start = position;
- StringBuilder result = null;
- int nesting = 1;
- boolean quoted = false;
+ /**
+ * Reads an external ID of one of these two forms:
+ * SYSTEM "quoted system name"
+ * PUBLIC "quoted public id" "quoted system name"
+ *
+ * If the system name is not required, this also supports lone public IDs of
+ * this form:
+ * PUBLIC "quoted public id"
+ *
+ * Returns true if any ID was read.
+ */
+ private boolean readExternalId(boolean requireSystemName)
+ throws IOException, XmlPullParserException {
+ skip();
+ int c = peekCharacter();
+
+ if (c == 'S') {
+ read(SYSTEM);
+ } else if (c == 'P') {
+ read(PUBLIC);
+ skip();
+ readQuotedId();
+ } else {
+ return false;
+ }
+
+ skip();
+
+ if (!requireSystemName) {
+ int delimiter = peekCharacter();
+ if (delimiter != '"' && delimiter != '\'') {
+ return true; // no system name!
+ }
+ }
+
+ readQuotedId();
+ return true;
+ }
+
+ /**
+ * Reads a quoted string, performing no entity escaping of the contents.
+ */
+ private void readQuotedId() throws IOException, XmlPullParserException {
+ int quote = peekCharacter();
+ if (quote != '"' && quote != '\'') {
+ throw new XmlPullParserException("Expected a quoted string", this, null);
+ }
+ position++;
+ while (peekCharacter() != quote) {
+ position++;
+ }
+ position++;
+ }
+
+ private void readInternalSubset() throws IOException, XmlPullParserException {
+ read('[');
while (true) {
- if (position >= limit) {
- if (start < position && returnText) {
- if (result == null) {
- result = new StringBuilder();
- }
- result.append(buffer, start, position - start);
- }
- if (!fillBuffer(1)) {
- checkRelaxed(UNEXPECTED_EOF);
- return null;
+ skip();
+ if (peekCharacter() == ']') {
+ position++;
+ return;
+ }
+
+ int declarationType = peekType(true);
+ switch (declarationType) {
+ case ELEMENTDECL:
+ readElementDeclaration();
+ break;
+
+ case ATTLISTDECL:
+ readAttributeListDeclaration();
+ break;
+
+ case ENTITYDECL:
+ readEntityDeclaration();
+ break;
+
+ case NOTATIONDECL:
+ readNotationDeclaration();
+ break;
+
+ case PROCESSING_INSTRUCTION:
+ read(START_PROCESSING_INSTRUCTION);
+ readUntil(END_PROCESSING_INSTRUCTION, false);
+ break;
+
+ case COMMENT:
+ readComment(false);
+ break;
+
+ case PARAMETER_ENTITY_REF:
+ throw new XmlPullParserException(
+ "Parameter entity references are not supported", this, null);
+
+ default:
+ throw new XmlPullParserException("Unexpected token", this, null);
+ }
+ }
+ }
+
+ /**
+ * Read an element declaration. This contains a name and a content spec.
+ * <!ELEMENT foo EMPTY >
+ * <!ELEMENT foo (bar?,(baz|quux)) >
+ * <!ELEMENT foo (#PCDATA|bar)* >
+ */
+ private void readElementDeclaration() throws IOException, XmlPullParserException {
+ read(START_ELEMENT);
+ skip();
+ readName();
+ readContentSpec();
+ skip();
+ read('>');
+ }
+
+ /**
+ * Read an element content spec. This is a regular expression-like pattern
+ * of names or other content specs. The following operators are supported:
+ * sequence: (a,b,c)
+ * choice: (a|b|c)
+ * optional: a?
+ * one or more: a+
+ * any number: a*
+ *
+ * The special name '#PCDATA' is permitted but only if it is the first
+ * element of the first group:
+ * (#PCDATA|a|b)
+ *
+ * The top-level element must be either a choice, a sequence, or one of the
+ * special names EMPTY and ANY.
+ */
+ private void readContentSpec() throws IOException, XmlPullParserException {
+ // this implementation is very lenient; it scans for balanced parens only
+ skip();
+ int c = peekCharacter();
+ if (c == '(') {
+ int depth = 0;
+ do {
+ if (c == '(') {
+ depth++;
+ } else if (c == ')') {
+ depth--;
}
- start = position;
+ position++;
+ c = peekCharacter();
+ } while (depth > 0);
+
+ if (c == '*' || c == '?' || c == '+') {
+ position++;
+ }
+ } else if (c == EMPTY[0]) {
+ read(EMPTY);
+ } else if (c == ANY[0]) {
+ read(ANY);
+ } else {
+ throw new XmlPullParserException("Expected element content spec", this, null);
+ }
+ }
+
+ /**
+ * Reads an attribute list declaration such as the following:
+ * <!ATTLIST foo
+ * bar CDATA #IMPLIED
+ * quux (a|b|c) "c"
+ * baz NOTATION (a|b|c) #FIXED "c">
+ *
+ * Each attribute has a name, type and default.
+ *
+ * Types are one of the built-in types (CDATA, ID, IDREF, IDREFS, ENTITY,
+ * ENTITIES, NMTOKEN, or NMTOKENS), an enumerated type "(list|of|options)"
+ * or NOTATION followed by an enumerated type.
+ *
+ * The default is either #REQUIRED, #IMPLIED, #FIXED, a quoted value, or
+ * #FIXED with a quoted value.
+ */
+ private void readAttributeListDeclaration() throws IOException, XmlPullParserException {
+ read(START_ATTLIST);
+ skip();
+ String elementName = readName();
+
+ while (true) {
+ skip();
+ int c = peekCharacter();
+ if (c == '>') {
+ position++;
+ return;
}
- char i = buffer[position++];
+ // attribute name
+ String attributeName = readName();
- if (i == '\'') {
- quoted = !quoted; // TODO: should this include a double quote as well?
- } else if (i == '<') {
- if (!quoted) {
- nesting++;
+ // attribute type
+ skip();
+ if (position + 1 >= limit && !fillBuffer(2)) {
+ throw new XmlPullParserException("Malformed attribute list", this, null);
+ }
+ if (buffer[position] == NOTATION[0] && buffer[position + 1] == NOTATION[1]) {
+ read(NOTATION);
+ skip();
+ }
+ c = peekCharacter();
+ if (c == '(') {
+ position++;
+ while (true) {
+ skip();
+ readName();
+ skip();
+ c = peekCharacter();
+ if (c == ')') {
+ position++;
+ break;
+ } else if (c == '|') {
+ position++;
+ } else {
+ throw new XmlPullParserException("Malformed attribute type", this, null);
+ }
}
- } else if (i == '>') {
- if (!quoted && --nesting == 0) {
- break;
+ } else {
+ readName();
+ }
+
+ // default value
+ skip();
+ c = peekCharacter();
+ if (c == '#') {
+ position++;
+ c = peekCharacter();
+ if (c == 'R') {
+ read(REQUIRED);
+ } else if (c == 'I') {
+ read(IMPLIED);
+ } else if (c == 'F') {
+ read(FIXED);
+ } else {
+ throw new XmlPullParserException("Malformed attribute type", this, null);
}
+ skip();
+ c = peekCharacter();
}
+ if (c == '"' || c == '\'') {
+ position++;
+ // TODO: does this do escaping correctly?
+ String value = readValue((char) c, true, ValueContext.ATTRIBUTE);
+ position++;
+ defineAttributeDefault(elementName, attributeName, value);
+ }
+ }
+ }
+
+ private void defineAttributeDefault(String elementName, String attributeName, String value) {
+ // TODO: stash this attribute so we can recall it later
+ }
+
+ /**
+ * Read an entity declaration. The value of internal entities are inline:
+ * <!ENTITY foo "bar">
+ *
+ * The values of external entities must be retrieved by URL or path:
+ * <!ENTITY foo SYSTEM "http://host/file">
+ * <!ENTITY foo PUBLIC "-//Android//Foo//EN" "http://host/file">
+ * <!ENTITY foo SYSTEM "../file.png" NDATA png>
+ *
+ * Entities may be general or parameterized. Parameterized entities are
+ * marked by a percent sign. Such entities may only be used in the DTD:
+ * <!ENTITY % foo "bar">
+ */
+ private void readEntityDeclaration() throws IOException, XmlPullParserException {
+ read(START_ENTITY);
+ boolean generalEntity = true;
+
+ skip();
+ if (peekCharacter() == '%') {
+ generalEntity = false;
+ position++;
+ skip();
}
- if (!returnText) {
- return null;
- } else if (result == null) {
- return stringPool.get(buffer, start, position - start - 1); // omit the '>'
+ String name = readName();
+
+ skip();
+ int quote = peekCharacter();
+ if (quote == '"' || quote == '\'') {
+ position++;
+ String value = readValue((char) quote, true, ValueContext.ENTITY_DECLARATION);
+ position++;
+ if (generalEntity) {
+ defineEntityReplacementText(name, value); // TODO: test parameter and general entity
+ }
+ } else if (readExternalId(true)) {
+ skip();
+ if (peekCharacter() == NDATA[0]) {
+ read(NDATA);
+ skip();
+ readName();
+ }
} else {
- result.append(buffer, start, position - start - 1); // omit the '>'
- return result.toString();
+ throw new XmlPullParserException("Expected entity value or external ID", this, null);
}
+
+ skip();
+ read('>');
+ }
+
+ private void readNotationDeclaration() throws IOException, XmlPullParserException {
+ read(START_NOTATION);
+ skip();
+ readName();
+ if (!readExternalId(false)) {
+ throw new XmlPullParserException(
+ "Expected external ID or public ID for notation", this, null);
+ }
+ skip();
+ read('>');
}
private void readEndTag() throws IOException, XmlPullParserException {
@@ -580,46 +885,61 @@ public class KXmlParser implements XmlPullParser {
/**
* Returns the type of the next token.
*/
- private int peekType() throws IOException, XmlPullParserException {
+ private int peekType(boolean inDeclaration) throws IOException, XmlPullParserException {
if (position >= limit && !fillBuffer(1)) {
return END_DOCUMENT;
}
- if (buffer[position] == '&') {
- return ENTITY_REF;
-
- } else if (buffer[position] == '<') {
- if (position + 2 >= limit && !fillBuffer(3)) {
+ switch (buffer[position]) {
+ case '&':
+ return ENTITY_REF; // &
+ case '<':
+ if (position + 3 >= limit && !fillBuffer(4)) {
throw new XmlPullParserException("Dangling <", this, null);
}
- if (buffer[position + 1] == '/') {
- return END_TAG;
- } else if (buffer[position + 1] == '?') {
+ switch (buffer[position + 1]) {
+ case '/':
+ return END_TAG; // </
+ case '?':
// we're looking for "<?xml " with case insensitivity
if ((position + 5 < limit || fillBuffer(6))
&& (buffer[position + 2] == 'x' || buffer[position + 2] == 'X')
&& (buffer[position + 3] == 'm' || buffer[position + 3] == 'M')
&& (buffer[position + 4] == 'l' || buffer[position + 4] == 'L')
&& (buffer[position + 5] == ' ')) {
- return XML_DECLARATION;
+ return XML_DECLARATION; // <?xml
} else {
- return PROCESSING_INSTRUCTION;
+ return PROCESSING_INSTRUCTION; // <?
}
- } else if (buffer[position + 1] == '!') {
- if (buffer[position + 2] == START_DOCTYPE[2]) {
- return DOCDECL;
- } else if (buffer[position + 2] == START_CDATA[2]) {
- return CDSECT;
- } else if (buffer[position + 2] == START_COMMENT[2]) {
- return COMMENT;
- } else {
- throw new XmlPullParserException("Unexpected <!", this, null);
+ case '!':
+ switch (buffer[position + 2]) {
+ case 'D':
+ return DOCDECL; // <!D
+ case '[':
+ return CDSECT; // <![
+ case '-':
+ return COMMENT; // <!-
+ case 'E':
+ switch (buffer[position + 3]) {
+ case 'L':
+ return ELEMENTDECL; // <!EL
+ case 'N':
+ return ENTITYDECL; // <!EN
+ default:
+ throw new XmlPullParserException("Unexpected <!", this, null);
+ }
+ case 'A':
+ return ATTLISTDECL; // <!A
+ case 'N':
+ return NOTATIONDECL; // <!N
}
- } else {
- return START_TAG;
+ default:
+ return START_TAG; // <
}
- } else {
+ case '%':
+ return inDeclaration ? PARAMETER_ENTITY_REF : TEXT;
+ default:
return TEXT;
}
}
@@ -695,7 +1015,7 @@ public class KXmlParser implements XmlPullParser {
throw new XmlPullParserException("attr value delimiter missing!", this, null);
}
- attributes[i] = readValue(delimiter, true, true);
+ attributes[i] = readValue(delimiter, true, ValueContext.ATTRIBUTE);
if (delimiter != ' ') {
position++; // end quote
@@ -736,7 +1056,7 @@ public class KXmlParser implements XmlPullParser {
* resolved entity to {@code out}. If the entity cannot be read or resolved,
* {@code out} will contain the partial entity reference.
*/
- private void readEntity(StringBuilder out, boolean isEntityToken)
+ private void readEntity(StringBuilder out, boolean isEntityToken, ValueContext valueContext)
throws IOException, XmlPullParserException {
int start = out.length();
@@ -793,6 +1113,8 @@ public class KXmlParser implements XmlPullParser {
} catch (IllegalArgumentException invalidCodePoint) {
throw new XmlPullParserException("Invalid character reference: &" + code);
}
+ } else if (valueContext == ValueContext.ENTITY_DECLARATION) {
+ // keep the unresolved &code; in the text
} else if ((resolved = entityMap.get(code)) != null) {
out.delete(start, out.length());
out.append(resolved);
@@ -807,15 +1129,27 @@ public class KXmlParser implements XmlPullParser {
}
/**
+ * Where a value is found impacts how that value is interpreted. For
+ * example, in attributes, "\n" must be replaced with a space character. In
+ * text, "]]>" is forbidden. In entity declarations, named references are
+ * not resolved.
+ */
+ enum ValueContext {
+ ATTRIBUTE,
+ TEXT,
+ ENTITY_DECLARATION
+ }
+
+ /**
* Returns the current text or attribute value. This also has the side
* effect of setting isWhitespace to false if a non-whitespace character is
* encountered.
*
- * @param delimiter {@code >} for text, {@code "} and {@code '} for quoted
+ * @param delimiter {@code <} for text, {@code "} and {@code '} for quoted
* attributes, or a space for unquoted attributes.
*/
private String readValue(char delimiter, boolean resolveEntities,
- boolean inAttributeValue) throws IOException, XmlPullParserException {
+ ValueContext valueContext) throws IOException, XmlPullParserException {
/*
* This method returns all of the characters from the current position
@@ -842,7 +1176,7 @@ public class KXmlParser implements XmlPullParser {
StringBuilder result = null;
// if a text section was already started, prefix the start
- if (!inAttributeValue && text != null) {
+ if (valueContext == ValueContext.TEXT && text != null) {
result = new StringBuilder();
result.append(text);
}
@@ -876,10 +1210,10 @@ public class KXmlParser implements XmlPullParser {
}
if (c != '\r'
- && (c != '\n' || !inAttributeValue)
+ && (c != '\n' || valueContext != ValueContext.ATTRIBUTE)
&& c != '&'
&& c != '<'
- && (c != ']' || inAttributeValue)) {
+ && (c != ']' || valueContext != ValueContext.TEXT)) {
isWhitespace &= (c <= ' ');
position++;
continue;
@@ -898,19 +1232,19 @@ public class KXmlParser implements XmlPullParser {
if ((position + 1 < limit || fillBuffer(2)) && buffer[position + 1] == '\n') {
position++;
}
- c = inAttributeValue ? ' ' : '\n';
+ c = (valueContext == ValueContext.ATTRIBUTE) ? ' ' : '\n';
} else if (c == '\n') {
c = ' ';
} else if (c == '&') {
isWhitespace = false; // TODO: what if the entity resolves to whitespace?
- readEntity(result, false);
+ readEntity(result, false, valueContext);
start = position;
continue;
} else if (c == '<') {
- if (inAttributeValue) {
+ if (valueContext == ValueContext.ATTRIBUTE) {
checkRelaxed("Illegal: \"<\" inside attribute value");
}
isWhitespace = false;