diff options
author | Jesse Wilson <jessewilson@google.com> | 2010-11-23 17:02:32 -0800 |
---|---|---|
committer | Jesse Wilson <jessewilson@google.com> | 2010-11-23 17:14:49 -0800 |
commit | 7fac047a67ee9a0295e4dc798c7c6880ccd83513 (patch) | |
tree | c6e6c29c6ba976712018f9c348c98271c1cc48ea /xml/src/main/java/org | |
parent | e8515df2fdeda9e99dce4035914481e42046beec (diff) | |
download | libcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.zip libcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.tar.gz libcore-7fac047a67ee9a0295e4dc798c7c6880ccd83513.tar.bz2 |
Implement DTD parsing in KxmlParser.
This change still has some problems:
- default attribute values are not honored.
- the doctype token text is lost
- entity values are not reparsed
- use of parameter entities is ignored but should cause a failure
Change-Id: Idd543846840aea481730e690e63212164555cdf1
http://b/3090550
Diffstat (limited to 'xml/src/main/java/org')
-rw-r--r-- | xml/src/main/java/org/kxml2/io/KXmlParser.java | 478 |
1 files changed, 406 insertions, 72 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java index 4f41fe0..8c0b3b1 100644 --- a/xml/src/main/java/org/kxml2/io/KXmlParser.java +++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java @@ -37,6 +37,11 @@ import org.xmlpull.v1.XmlPullParserException; */ public class KXmlParser implements XmlPullParser { + private static final int ELEMENTDECL = 11; + private static final int ENTITYDECL = 12; + private static final int ATTLISTDECL = 13; + private static final int NOTATIONDECL = 14; + private static final int PARAMETER_ENTITY_REF = 15; private static final char[] START_COMMENT = { '<', '!', '-', '-' }; private static final char[] END_COMMENT = { '-', '-', '>' }; private static final char[] COMMENT_DOUBLE_DASH = { '-', '-' }; @@ -45,7 +50,19 @@ public class KXmlParser implements XmlPullParser { private static final char[] START_PROCESSING_INSTRUCTION = { '<', '?' }; private static final char[] END_PROCESSING_INSTRUCTION = { '?', '>' }; private static final char[] START_DOCTYPE = { '<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E' }; - // no END_DOCTYPE because doctype must be parsed + private static final char[] SYSTEM = { 'S', 'Y', 'S', 'T', 'E', 'M' }; + private static final char[] PUBLIC = { 'P', 'U', 'B', 'L', 'I', 'C' }; + private static final char[] START_ELEMENT = { '<', '!', 'E', 'L', 'E', 'M', 'E', 'N', 'T' }; + private static final char[] START_ATTLIST = { '<', '!', 'A', 'T', 'T', 'L', 'I', 'S', 'T' }; + private static final char[] START_ENTITY = { '<', '!', 'E', 'N', 'T', 'I', 'T', 'Y' }; + private static final char[] START_NOTATION = { '<', '!', 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N' }; + private static final char[] EMPTY = new char[] { 'E', 'M', 'P', 'T', 'Y' }; + private static final char[] ANY = new char[]{ 'A', 'N', 'Y' }; + private static final char[] NDATA = new char[]{ 'N', 'D', 'A', 'T', 'A' }; + private static final char[] NOTATION = new char[]{ 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N' }; + private static final char[] REQUIRED = new char[] { 'R', 'E', 'Q', 'U', 'I', 'R', 'E', 'D' }; + private static final char[] IMPLIED = new char[] { 'I', 'M', 'P', 'L', 'I', 'E', 'D' }; + private static final char[] FIXED = new char[] { 'F', 'I', 'X', 'E', 'D' }; static final private String UNEXPECTED_EOF = "Unexpected EOF"; static final private String ILLEGAL_TYPE = "Wrong event type"; @@ -285,11 +302,11 @@ public class KXmlParser implements XmlPullParser { } } - type = peekType(); + type = peekType(false); if (type == XML_DECLARATION) { readXmlDeclaration(); - type = peekType(); + type = peekType(false); } text = null; @@ -322,13 +339,13 @@ public class KXmlParser implements XmlPullParser { case ENTITY_REF: if (justOneToken) { StringBuilder entityTextBuilder = new StringBuilder(); - readEntity(entityTextBuilder, true); + readEntity(entityTextBuilder, true, ValueContext.TEXT); text = entityTextBuilder.toString(); break; } // fall-through case TEXT: - text = readValue('<', !justOneToken, false); + text = readValue('<', !justOneToken, ValueContext.TEXT); if (depth == 0 && isWhitespace) { type = IGNORABLE_WHITESPACE; } @@ -356,11 +373,14 @@ public class KXmlParser implements XmlPullParser { } break; case DOCDECL: - String doctype = readDoctype(justOneToken); + readDoctype(); if (justOneToken) { - text = doctype; + text = ""; // TODO: support capturing the doctype text } break; + + default: + throw new XmlPullParserException("Unexpected token", this, null); } if (justOneToken) { @@ -376,7 +396,7 @@ public class KXmlParser implements XmlPullParser { * report this as text, even if it was a CDATA block or entity * reference. */ - int peek = peekType(); + int peek = peekType(false); if (text != null && !text.isEmpty() && peek < TEXT) { type = TEXT; return type; @@ -504,52 +524,337 @@ public class KXmlParser implements XmlPullParser { return commentText; } - private String readDoctype(boolean returnText) throws IOException, XmlPullParserException { + /** + * Read the document's DTD. Although this parser is non-validating, the DTD + * must be parsed to capture entity values and default attribute values. + */ + private void readDoctype() throws IOException, XmlPullParserException { read(START_DOCTYPE); + skip(); + readName(); + readExternalId(true); + skip(); + if (peekCharacter() == '[') { + readInternalSubset(); + } + skip(); + read('>'); + } - int start = position; - StringBuilder result = null; - int nesting = 1; - boolean quoted = false; + /** + * Reads an external ID of one of these two forms: + * SYSTEM "quoted system name" + * PUBLIC "quoted public id" "quoted system name" + * + * If the system name is not required, this also supports lone public IDs of + * this form: + * PUBLIC "quoted public id" + * + * Returns true if any ID was read. + */ + private boolean readExternalId(boolean requireSystemName) + throws IOException, XmlPullParserException { + skip(); + int c = peekCharacter(); + + if (c == 'S') { + read(SYSTEM); + } else if (c == 'P') { + read(PUBLIC); + skip(); + readQuotedId(); + } else { + return false; + } + + skip(); + + if (!requireSystemName) { + int delimiter = peekCharacter(); + if (delimiter != '"' && delimiter != '\'') { + return true; // no system name! + } + } + + readQuotedId(); + return true; + } + + /** + * Reads a quoted string, performing no entity escaping of the contents. + */ + private void readQuotedId() throws IOException, XmlPullParserException { + int quote = peekCharacter(); + if (quote != '"' && quote != '\'') { + throw new XmlPullParserException("Expected a quoted string", this, null); + } + position++; + while (peekCharacter() != quote) { + position++; + } + position++; + } + + private void readInternalSubset() throws IOException, XmlPullParserException { + read('['); while (true) { - if (position >= limit) { - if (start < position && returnText) { - if (result == null) { - result = new StringBuilder(); - } - result.append(buffer, start, position - start); - } - if (!fillBuffer(1)) { - checkRelaxed(UNEXPECTED_EOF); - return null; + skip(); + if (peekCharacter() == ']') { + position++; + return; + } + + int declarationType = peekType(true); + switch (declarationType) { + case ELEMENTDECL: + readElementDeclaration(); + break; + + case ATTLISTDECL: + readAttributeListDeclaration(); + break; + + case ENTITYDECL: + readEntityDeclaration(); + break; + + case NOTATIONDECL: + readNotationDeclaration(); + break; + + case PROCESSING_INSTRUCTION: + read(START_PROCESSING_INSTRUCTION); + readUntil(END_PROCESSING_INSTRUCTION, false); + break; + + case COMMENT: + readComment(false); + break; + + case PARAMETER_ENTITY_REF: + throw new XmlPullParserException( + "Parameter entity references are not supported", this, null); + + default: + throw new XmlPullParserException("Unexpected token", this, null); + } + } + } + + /** + * Read an element declaration. This contains a name and a content spec. + * <!ELEMENT foo EMPTY > + * <!ELEMENT foo (bar?,(baz|quux)) > + * <!ELEMENT foo (#PCDATA|bar)* > + */ + private void readElementDeclaration() throws IOException, XmlPullParserException { + read(START_ELEMENT); + skip(); + readName(); + readContentSpec(); + skip(); + read('>'); + } + + /** + * Read an element content spec. This is a regular expression-like pattern + * of names or other content specs. The following operators are supported: + * sequence: (a,b,c) + * choice: (a|b|c) + * optional: a? + * one or more: a+ + * any number: a* + * + * The special name '#PCDATA' is permitted but only if it is the first + * element of the first group: + * (#PCDATA|a|b) + * + * The top-level element must be either a choice, a sequence, or one of the + * special names EMPTY and ANY. + */ + private void readContentSpec() throws IOException, XmlPullParserException { + // this implementation is very lenient; it scans for balanced parens only + skip(); + int c = peekCharacter(); + if (c == '(') { + int depth = 0; + do { + if (c == '(') { + depth++; + } else if (c == ')') { + depth--; } - start = position; + position++; + c = peekCharacter(); + } while (depth > 0); + + if (c == '*' || c == '?' || c == '+') { + position++; + } + } else if (c == EMPTY[0]) { + read(EMPTY); + } else if (c == ANY[0]) { + read(ANY); + } else { + throw new XmlPullParserException("Expected element content spec", this, null); + } + } + + /** + * Reads an attribute list declaration such as the following: + * <!ATTLIST foo + * bar CDATA #IMPLIED + * quux (a|b|c) "c" + * baz NOTATION (a|b|c) #FIXED "c"> + * + * Each attribute has a name, type and default. + * + * Types are one of the built-in types (CDATA, ID, IDREF, IDREFS, ENTITY, + * ENTITIES, NMTOKEN, or NMTOKENS), an enumerated type "(list|of|options)" + * or NOTATION followed by an enumerated type. + * + * The default is either #REQUIRED, #IMPLIED, #FIXED, a quoted value, or + * #FIXED with a quoted value. + */ + private void readAttributeListDeclaration() throws IOException, XmlPullParserException { + read(START_ATTLIST); + skip(); + String elementName = readName(); + + while (true) { + skip(); + int c = peekCharacter(); + if (c == '>') { + position++; + return; } - char i = buffer[position++]; + // attribute name + String attributeName = readName(); - if (i == '\'') { - quoted = !quoted; // TODO: should this include a double quote as well? - } else if (i == '<') { - if (!quoted) { - nesting++; + // attribute type + skip(); + if (position + 1 >= limit && !fillBuffer(2)) { + throw new XmlPullParserException("Malformed attribute list", this, null); + } + if (buffer[position] == NOTATION[0] && buffer[position + 1] == NOTATION[1]) { + read(NOTATION); + skip(); + } + c = peekCharacter(); + if (c == '(') { + position++; + while (true) { + skip(); + readName(); + skip(); + c = peekCharacter(); + if (c == ')') { + position++; + break; + } else if (c == '|') { + position++; + } else { + throw new XmlPullParserException("Malformed attribute type", this, null); + } } - } else if (i == '>') { - if (!quoted && --nesting == 0) { - break; + } else { + readName(); + } + + // default value + skip(); + c = peekCharacter(); + if (c == '#') { + position++; + c = peekCharacter(); + if (c == 'R') { + read(REQUIRED); + } else if (c == 'I') { + read(IMPLIED); + } else if (c == 'F') { + read(FIXED); + } else { + throw new XmlPullParserException("Malformed attribute type", this, null); } + skip(); + c = peekCharacter(); } + if (c == '"' || c == '\'') { + position++; + // TODO: does this do escaping correctly? + String value = readValue((char) c, true, ValueContext.ATTRIBUTE); + position++; + defineAttributeDefault(elementName, attributeName, value); + } + } + } + + private void defineAttributeDefault(String elementName, String attributeName, String value) { + // TODO: stash this attribute so we can recall it later + } + + /** + * Read an entity declaration. The value of internal entities are inline: + * <!ENTITY foo "bar"> + * + * The values of external entities must be retrieved by URL or path: + * <!ENTITY foo SYSTEM "http://host/file"> + * <!ENTITY foo PUBLIC "-//Android//Foo//EN" "http://host/file"> + * <!ENTITY foo SYSTEM "../file.png" NDATA png> + * + * Entities may be general or parameterized. Parameterized entities are + * marked by a percent sign. Such entities may only be used in the DTD: + * <!ENTITY % foo "bar"> + */ + private void readEntityDeclaration() throws IOException, XmlPullParserException { + read(START_ENTITY); + boolean generalEntity = true; + + skip(); + if (peekCharacter() == '%') { + generalEntity = false; + position++; + skip(); } - if (!returnText) { - return null; - } else if (result == null) { - return stringPool.get(buffer, start, position - start - 1); // omit the '>' + String name = readName(); + + skip(); + int quote = peekCharacter(); + if (quote == '"' || quote == '\'') { + position++; + String value = readValue((char) quote, true, ValueContext.ENTITY_DECLARATION); + position++; + if (generalEntity) { + defineEntityReplacementText(name, value); // TODO: test parameter and general entity + } + } else if (readExternalId(true)) { + skip(); + if (peekCharacter() == NDATA[0]) { + read(NDATA); + skip(); + readName(); + } } else { - result.append(buffer, start, position - start - 1); // omit the '>' - return result.toString(); + throw new XmlPullParserException("Expected entity value or external ID", this, null); } + + skip(); + read('>'); + } + + private void readNotationDeclaration() throws IOException, XmlPullParserException { + read(START_NOTATION); + skip(); + readName(); + if (!readExternalId(false)) { + throw new XmlPullParserException( + "Expected external ID or public ID for notation", this, null); + } + skip(); + read('>'); } private void readEndTag() throws IOException, XmlPullParserException { @@ -580,46 +885,61 @@ public class KXmlParser implements XmlPullParser { /** * Returns the type of the next token. */ - private int peekType() throws IOException, XmlPullParserException { + private int peekType(boolean inDeclaration) throws IOException, XmlPullParserException { if (position >= limit && !fillBuffer(1)) { return END_DOCUMENT; } - if (buffer[position] == '&') { - return ENTITY_REF; - - } else if (buffer[position] == '<') { - if (position + 2 >= limit && !fillBuffer(3)) { + switch (buffer[position]) { + case '&': + return ENTITY_REF; // & + case '<': + if (position + 3 >= limit && !fillBuffer(4)) { throw new XmlPullParserException("Dangling <", this, null); } - if (buffer[position + 1] == '/') { - return END_TAG; - } else if (buffer[position + 1] == '?') { + switch (buffer[position + 1]) { + case '/': + return END_TAG; // </ + case '?': // we're looking for "<?xml " with case insensitivity if ((position + 5 < limit || fillBuffer(6)) && (buffer[position + 2] == 'x' || buffer[position + 2] == 'X') && (buffer[position + 3] == 'm' || buffer[position + 3] == 'M') && (buffer[position + 4] == 'l' || buffer[position + 4] == 'L') && (buffer[position + 5] == ' ')) { - return XML_DECLARATION; + return XML_DECLARATION; // <?xml } else { - return PROCESSING_INSTRUCTION; + return PROCESSING_INSTRUCTION; // <? } - } else if (buffer[position + 1] == '!') { - if (buffer[position + 2] == START_DOCTYPE[2]) { - return DOCDECL; - } else if (buffer[position + 2] == START_CDATA[2]) { - return CDSECT; - } else if (buffer[position + 2] == START_COMMENT[2]) { - return COMMENT; - } else { - throw new XmlPullParserException("Unexpected <!", this, null); + case '!': + switch (buffer[position + 2]) { + case 'D': + return DOCDECL; // <!D + case '[': + return CDSECT; // <![ + case '-': + return COMMENT; // <!- + case 'E': + switch (buffer[position + 3]) { + case 'L': + return ELEMENTDECL; // <!EL + case 'N': + return ENTITYDECL; // <!EN + default: + throw new XmlPullParserException("Unexpected <!", this, null); + } + case 'A': + return ATTLISTDECL; // <!A + case 'N': + return NOTATIONDECL; // <!N } - } else { - return START_TAG; + default: + return START_TAG; // < } - } else { + case '%': + return inDeclaration ? PARAMETER_ENTITY_REF : TEXT; + default: return TEXT; } } @@ -695,7 +1015,7 @@ public class KXmlParser implements XmlPullParser { throw new XmlPullParserException("attr value delimiter missing!", this, null); } - attributes[i] = readValue(delimiter, true, true); + attributes[i] = readValue(delimiter, true, ValueContext.ATTRIBUTE); if (delimiter != ' ') { position++; // end quote @@ -736,7 +1056,7 @@ public class KXmlParser implements XmlPullParser { * resolved entity to {@code out}. If the entity cannot be read or resolved, * {@code out} will contain the partial entity reference. */ - private void readEntity(StringBuilder out, boolean isEntityToken) + private void readEntity(StringBuilder out, boolean isEntityToken, ValueContext valueContext) throws IOException, XmlPullParserException { int start = out.length(); @@ -793,6 +1113,8 @@ public class KXmlParser implements XmlPullParser { } catch (IllegalArgumentException invalidCodePoint) { throw new XmlPullParserException("Invalid character reference: &" + code); } + } else if (valueContext == ValueContext.ENTITY_DECLARATION) { + // keep the unresolved &code; in the text } else if ((resolved = entityMap.get(code)) != null) { out.delete(start, out.length()); out.append(resolved); @@ -807,15 +1129,27 @@ public class KXmlParser implements XmlPullParser { } /** + * Where a value is found impacts how that value is interpreted. For + * example, in attributes, "\n" must be replaced with a space character. In + * text, "]]>" is forbidden. In entity declarations, named references are + * not resolved. + */ + enum ValueContext { + ATTRIBUTE, + TEXT, + ENTITY_DECLARATION + } + + /** * Returns the current text or attribute value. This also has the side * effect of setting isWhitespace to false if a non-whitespace character is * encountered. * - * @param delimiter {@code >} for text, {@code "} and {@code '} for quoted + * @param delimiter {@code <} for text, {@code "} and {@code '} for quoted * attributes, or a space for unquoted attributes. */ private String readValue(char delimiter, boolean resolveEntities, - boolean inAttributeValue) throws IOException, XmlPullParserException { + ValueContext valueContext) throws IOException, XmlPullParserException { /* * This method returns all of the characters from the current position @@ -842,7 +1176,7 @@ public class KXmlParser implements XmlPullParser { StringBuilder result = null; // if a text section was already started, prefix the start - if (!inAttributeValue && text != null) { + if (valueContext == ValueContext.TEXT && text != null) { result = new StringBuilder(); result.append(text); } @@ -876,10 +1210,10 @@ public class KXmlParser implements XmlPullParser { } if (c != '\r' - && (c != '\n' || !inAttributeValue) + && (c != '\n' || valueContext != ValueContext.ATTRIBUTE) && c != '&' && c != '<' - && (c != ']' || inAttributeValue)) { + && (c != ']' || valueContext != ValueContext.TEXT)) { isWhitespace &= (c <= ' '); position++; continue; @@ -898,19 +1232,19 @@ public class KXmlParser implements XmlPullParser { if ((position + 1 < limit || fillBuffer(2)) && buffer[position + 1] == '\n') { position++; } - c = inAttributeValue ? ' ' : '\n'; + c = (valueContext == ValueContext.ATTRIBUTE) ? ' ' : '\n'; } else if (c == '\n') { c = ' '; } else if (c == '&') { isWhitespace = false; // TODO: what if the entity resolves to whitespace? - readEntity(result, false); + readEntity(result, false, valueContext); start = position; continue; } else if (c == '<') { - if (inAttributeValue) { + if (valueContext == ValueContext.ATTRIBUTE) { checkRelaxed("Illegal: \"<\" inside attribute value"); } isWhitespace = false; |