diff options
author | Jesse Wilson <jessewilson@google.com> | 2010-11-13 08:34:48 -0800 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2010-11-13 08:34:48 -0800 |
commit | 4ab2cec301baf9704b1235aa50b544e8d7f53124 (patch) | |
tree | 9c3126e2fd8597522974b196df5a82cd9c7d2110 /xml/src | |
parent | 87987208b0e225bc2022190e57b01e8d57d29193 (diff) | |
parent | fda724de28fe86804e6ef6a0afd7ae5be1529083 (diff) | |
download | libcore-4ab2cec301baf9704b1235aa50b544e8d7f53124.zip libcore-4ab2cec301baf9704b1235aa50b544e8d7f53124.tar.gz libcore-4ab2cec301baf9704b1235aa50b544e8d7f53124.tar.bz2 |
Merge "Optimize KxmlParser." into dalvik-dev
Diffstat (limited to 'xml/src')
-rw-r--r-- | xml/src/main/java/org/kxml2/io/KXmlParser.java | 1185 | ||||
-rw-r--r-- | xml/src/main/java/org/xmlpull/v1/XmlPullParser.java | 2 |
2 files changed, 664 insertions, 523 deletions
diff --git a/xml/src/main/java/org/kxml2/io/KXmlParser.java b/xml/src/main/java/org/kxml2/io/KXmlParser.java index 3ee5e43..9ca555b 100644 --- a/xml/src/main/java/org/kxml2/io/KXmlParser.java +++ b/xml/src/main/java/org/kxml2/io/KXmlParser.java @@ -36,13 +36,21 @@ import org.xmlpull.v1.XmlPullParserException; */ public class KXmlParser implements XmlPullParser { - private Object location; + private static final char[] START_COMMENT = { '<', '!', '-', '-' }; + private static final char[] END_COMMENT = { '-', '-', '>' }; + private static final char[] START_CDATA = { '<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[' }; + private static final char[] END_CDATA = { ']', ']', '>' }; + private static final char[] START_PROCESSING_INSTRUCTION = { '<', '?' }; + private static final char[] END_PROCESSING_INSTRUCTION = { '?', '>' }; + private static final char[] START_DOCTYPE = { '<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E' }; + // no END_DOCTYPE because doctype must be parsed + static final private String UNEXPECTED_EOF = "Unexpected EOF"; static final private String ILLEGAL_TYPE = "Wrong event type"; - static final private int LEGACY = 999; - static final private int XML_DECL = 998; + static final private int XML_DECLARATION = 998; // general + private String location; private String version; private Boolean standalone; @@ -60,33 +68,31 @@ public class KXmlParser implements XmlPullParser { private Reader reader; private String encoding; - private char[] srcBuf; - - private int srcPos; - private int srcCount; - - private int line; - private int column; - - // txtbuffer - - /** Target buffer for storing incoming text (including aggregated resolved entities) */ - private char[] txtBuf = new char[128]; - /** Write position */ - private int txtPos; + private final char[] buffer = new char[8192]; + private int position = 0; + private int limit = 0; + + /* + * Track the number of newlines and columns preceding the current buffer. To + * compute the line and column of a position in the buffer, compute the line + * and column in the buffer and add the preceding values. + */ + private int bufferStartLine; + private int bufferStartColumn; - // Event-related + // the current token private int type; private boolean isWhitespace; private String namespace; private String prefix; private String name; + private String text; private boolean degenerated; private int attributeCount; - /** + /* * The current element's attributes arranged in groups of 4: * i + 0 = attribute namespace URI * i + 1 = attribute namespace prefix @@ -97,21 +103,9 @@ public class KXmlParser implements XmlPullParser { private String error; - /** - * A separate peek buffer seems simpler than managing wrap around in the first level read - * buffer - */ - private int[] peek = new int[2]; - private int peekCount; - private boolean wasCR; - private boolean unresolved; private boolean token; - public KXmlParser() { - srcBuf = new char[8192]; - } - /** * Retains namespace attributes like {@code xmlns="http://foo"} or {@code xmlns:foo="http:foo"} * in pulled elements. Most applications will only be interested in the effective namespaces of @@ -161,7 +155,7 @@ public class KXmlParser implements XmlPullParser { nspStack[j + 1] = attributes[i + 3]; if (attrName != null && attributes[i + 3].isEmpty()) { - error("illegal empty namespace"); + checkRelaxed("illegal empty namespace"); } if (keepNamespaceAttributes) { @@ -213,7 +207,7 @@ public class KXmlParser implements XmlPullParser { int cut = name.indexOf(':'); if (cut == 0) { - error("illegal tag name: " + name); + checkRelaxed("illegal tag name: " + name); } if (cut != -1) { @@ -225,7 +219,7 @@ public class KXmlParser implements XmlPullParser { if (this.namespace == null) { if (prefix != null) { - error("undefined prefix: " + prefix); + checkRelaxed("undefined prefix: " + prefix); } this.namespace = NO_NAMESPACE; } @@ -242,21 +236,13 @@ public class KXmlParser implements XmlPullParser { return bigger; } - private void error(String desc) throws XmlPullParserException { - if (relaxed) { - if (error == null) { - error = "ERR: " + desc; - } - } else { - exception(desc); + private void checkRelaxed(String errorMessage) throws XmlPullParserException { + if (!relaxed) { + throw new XmlPullParserException(errorMessage, this, null); + } + if (error == null) { + error = "Error: " + errorMessage; } - } - - private void exception(String desc) throws XmlPullParserException { - throw new XmlPullParserException( - desc.length() < 100 ? desc : desc.substring(0, 100) + "\n", - this, - null); } /** @@ -265,7 +251,7 @@ public class KXmlParser implements XmlPullParser { */ private void nextImpl() throws IOException, XmlPullParserException { if (reader == null) { - exception("No Input specified"); + throw new XmlPullParserException("setInput() must be called first.", this, null); } if (type == END_TAG) { @@ -285,9 +271,7 @@ public class KXmlParser implements XmlPullParser { } if (error != null) { - for (int i = 0; i < error.length(); i++) { - push(error.charAt(i)); - } + text = error; error = null; type = COMMENT; return; @@ -301,229 +285,243 @@ public class KXmlParser implements XmlPullParser { switch (type) { - case ENTITY_REF: - pushEntity(); - return; - - case START_TAG: - parseStartTag(false); - return; - - case END_TAG: - parseEndTag(); - return; - - case END_DOCUMENT: - return; - - case TEXT: - pushText('<', !token, false); - if (depth == 0) { - if (isWhitespace) { - type = IGNORABLE_WHITESPACE; - } - } + case ENTITY_REF: + if (token) { + StringBuilder entityTextBuilder = new StringBuilder(); + readEntity(entityTextBuilder); + text = entityTextBuilder.toString(); return; + } + // fall-through + case TEXT: + text = readValue('<', !token, false); + if (depth == 0 && isWhitespace) { + type = IGNORABLE_WHITESPACE; + } + return; - default: - type = parseLegacy(token); - if (type != XML_DECL) { - return; - } - } - } - } + case START_TAG: + text = null; // TODO: fix next()/nextToken() so this is handled there + parseStartTag(false); + return; - private int parseLegacy(boolean push) throws IOException, XmlPullParserException { - String req = ""; - int term; - int result; - int prev = 0; + case END_TAG: + readEndTag(); + return; - read(); // < - int c = read(); + case END_DOCUMENT: + return; - if (c == '?') { - if ((peek(0) == 'x' || peek(0) == 'X') - && (peek(1) == 'm' || peek(1) == 'M')) { + case XML_DECLARATION: + readXmlDeclaration(); + continue; - if (push) { - push(peek(0)); - push(peek(1)); + case PROCESSING_INSTRUCTION: + read(START_PROCESSING_INSTRUCTION); + if (token) { + text = readUntil(END_PROCESSING_INSTRUCTION, true); + } else { + readUntil(END_PROCESSING_INSTRUCTION, false); } - read(); - read(); - - if ((peek(0) == 'l' || peek(0) == 'L') && peek(1) <= ' ') { - - if (line != 1 || column > 4) { - error("PI must not start with xml"); - } - - parseStartTag(true); - - if (attributeCount < 1 || !"version".equals(attributes[2])) { - error("version expected"); - } + return; - version = attributes[3]; + case DOCDECL: + readDoctype(token); + return; - int pos = 1; + case CDSECT: + String oldText = text; + read(START_CDATA); + text = readUntil(END_CDATA, true); + if (oldText != null) { + text = oldText + text; // TODO: fix next()/nextToken() so this is handled there + } + return; - if (pos < attributeCount - && "encoding".equals(attributes[2 + 4])) { - encoding = attributes[3 + 4]; - pos++; - } + case COMMENT: + read(START_COMMENT); + if (token) { + text = readUntil(END_COMMENT, true); + } else { + readUntil(END_COMMENT, false); + } + return; + } + } + } - if (pos < attributeCount - && "standalone".equals(attributes[4 * pos + 2])) { - String st = attributes[3 + 4 * pos]; - if ("yes".equals(st)) { - standalone = new Boolean(true); - } else if ("no".equals(st)) { - standalone = new Boolean(false); - } else { - error("illegal standalone value: " + st); - } - pos++; - } + /** + * Reads text until the specified delimiter is encountered. Consumes the + * text and the delimiter. + * + * @param returnText true to return the read text excluding the delimiter; + * false to return null. + */ + private String readUntil(char[] delimiter, boolean returnText) + throws IOException, XmlPullParserException { + int previous = -1; + int start = position; + StringBuilder result = null; - if (pos != attributeCount) { - error("illegal xmldecl"); + search: + while (true) { + if (position + delimiter.length >= limit) { + if (start < position && returnText) { + if (result == null) { + result = new StringBuilder(); } - - isWhitespace = true; - txtPos = 0; - - return XML_DECL; + result.append(buffer, start, position - start); } + if (!fillBuffer(delimiter.length)) { + checkRelaxed(UNEXPECTED_EOF); + type = COMMENT; + return null; + } + start = position; } - term = '?'; - result = PROCESSING_INSTRUCTION; - } else if (c == '!') { - if (peek(0) == '-') { - result = COMMENT; - req = "--"; - term = '-'; - } else if (peek(0) == '[') { - result = CDSECT; - req = "[CDATA["; - term = ']'; - push = true; - } else { - result = DOCDECL; - req = "DOCTYPE"; - term = -1; + // TODO: replace with Arrays.equals(buffer, position, delimiter, 0, delimiter.length) + // when the VM has better method inlining + for (int i = 0; i < delimiter.length; i++) { + if (buffer[position + i] != delimiter[i]) { + previous = buffer[position]; + position++; + continue search; + } } - } else { - error("illegal: <" + c); - return COMMENT; + + break; } - for (int i = 0; i < req.length(); i++) { - read(req.charAt(i)); + if (delimiter == END_COMMENT && previous == '-') { + checkRelaxed("illegal comment delimiter: --->"); } - if (result == DOCDECL) { - parseDoctype(push); + int end = position; + position += delimiter.length; + + if (!returnText) { + return null; + } else if (result == null) { + return new String(buffer, start, end - start); } else { - while (true) { - c = read(); - if (c == -1) { - error(UNEXPECTED_EOF); - return COMMENT; - } + result.append(buffer, start, end - start); + return result.toString(); + } + } - if (push) { - push(c); - } + /** + * Returns true if an XML declaration was read. + */ + private boolean readXmlDeclaration() throws IOException, XmlPullParserException { + if (bufferStartLine != 0 || bufferStartColumn != 0 || position != 0) { + checkRelaxed("processing instructions must not start with xml"); + } - if ((term == '?' || c == term) - && peek(0) == term - && peek(1) == '>') { - break; - } + read(START_PROCESSING_INSTRUCTION); + parseStartTag(true); - prev = c; - } + if (attributeCount < 1 || !"version".equals(attributes[2])) { + checkRelaxed("version expected"); + } - if (term == '-' && prev == '-' && !relaxed) { - error("illegal comment delimiter: --->"); - } + version = attributes[3]; + + int pos = 1; - read(); - read(); + if (pos < attributeCount && "encoding".equals(attributes[2 + 4])) { + encoding = attributes[3 + 4]; + pos++; + } - if (push && term != '?') { - txtPos--; + if (pos < attributeCount && "standalone".equals(attributes[4 * pos + 2])) { + String st = attributes[3 + 4 * pos]; + if ("yes".equals(st)) { + standalone = Boolean.TRUE; + } else if ("no".equals(st)) { + standalone = Boolean.FALSE; + } else { + checkRelaxed("illegal standalone value: " + st); } + pos++; + } + if (pos != attributeCount) { + checkRelaxed("unexpected attributes in XML declaration"); } - return result; + + isWhitespace = true; + text = null; + return true; } - /** - * precondition: <! consumed - */ - private void parseDoctype(boolean push) throws IOException, XmlPullParserException { + private void readDoctype(boolean assignText) throws IOException, XmlPullParserException { + read(START_DOCTYPE); + + int start = position; + StringBuilder result = null; int nesting = 1; boolean quoted = false; while (true) { - int i = read(); - switch (i) { - - case -1: - error(UNEXPECTED_EOF); + if (position >= limit) { + if (start < position && assignText) { + if (result == null) { + result = new StringBuilder(); + } + result.append(buffer, start, position - start); + } + if (!fillBuffer(1)) { + checkRelaxed(UNEXPECTED_EOF); return; + } + start = position; + } - case '\'': - quoted = !quoted; - break; - - case '<': - if (!quoted) { - nesting++; - } - break; + char i = buffer[position++]; - case '>': - if (!quoted) { - if ((--nesting) == 0) { - return; - } - } + if (i == '\'') { + quoted = !quoted; // TODO: should this include a double quote as well? + } else if (i == '<') { + if (!quoted) { + nesting++; + } + } else if (i == '>') { + if (!quoted && --nesting == 0) { break; + } } - if (push) { - push(i); + } + + if (assignText) { + if (result == null) { + text = new String(buffer, start, position - start - 1); // omit the '>' + } else { + result.append(buffer, start, position - start - 1); // omit the '>' + text = result.toString(); } } } - /** - * precondition: </ consumed - */ - private void parseEndTag() throws IOException, XmlPullParserException { - read(); // '<' - read(); // '/' - name = readName(); + private void readEndTag() throws IOException, XmlPullParserException { + read('<'); + read('/'); + name = readName(); // TODO: pass the expected name in as a hint? skip(); read('>'); - int sp = (depth - 1) << 2; + int sp = (depth - 1) * 4; if (depth == 0) { - error("element stack empty"); + checkRelaxed("read end tag " + name + " with no tags open"); type = COMMENT; return; } if (!relaxed) { if (!name.equals(elementStack[sp + 3])) { - error("expected: /" + elementStack[sp + 3] + " read: " + name); + throw new XmlPullParserException( + "expected: /" + elementStack[sp + 3] + " read: " + name, this, null); } namespace = elementStack[sp]; @@ -532,41 +530,51 @@ public class KXmlParser implements XmlPullParser { } } - private int peekType() throws IOException { - switch (peek(0)) { - case -1: - return END_DOCUMENT; - case '&': - return ENTITY_REF; - case '<': - switch (peek(1)) { - case '/': - return END_TAG; - case '?': - case '!': - return LEGACY; - default: - return START_TAG; - } - default: - return TEXT; + /** + * Returns the type of the next token. + */ + private int peekType() throws IOException, XmlPullParserException { + if (position >= limit && !fillBuffer(1)) { + return END_DOCUMENT; } - } - private String get(int pos) { - return new String(txtBuf, pos, txtPos - pos); - } + if (buffer[position] == '&') { + return ENTITY_REF; - private void push(int c) { - isWhitespace &= c <= ' '; + } else if (buffer[position] == '<') { + if (position + 2 >= limit && !fillBuffer(3)) { + throw new XmlPullParserException("Dangling <", this, null); + } - if (txtPos == txtBuf.length) { - char[] bigger = new char[txtPos * 4 / 3 + 4]; - System.arraycopy(txtBuf, 0, bigger, 0, txtPos); - txtBuf = bigger; + if (buffer[position + 1] == '/') { + return END_TAG; + } else if (buffer[position + 1] == '?') { + // we're looking for "<?xml " with case insensitivity + if ((position + 5 < limit || fillBuffer(6)) + && (buffer[position + 2] == 'x' || buffer[position + 2] == 'X') + && (buffer[position + 3] == 'm' || buffer[position + 3] == 'M') + && (buffer[position + 4] == 'l' || buffer[position + 4] == 'L') + && (buffer[position + 5] == ' ')) { + return XML_DECLARATION; + } else { + return PROCESSING_INSTRUCTION; + } + } else if (buffer[position + 1] == '!') { + if (buffer[position + 2] == START_DOCTYPE[2]) { + return DOCDECL; + } else if (buffer[position + 2] == START_CDATA[2]) { + return CDSECT; + } else if (buffer[position + 2] == START_COMMENT[2]) { + return COMMENT; + } else { + throw new XmlPullParserException("Unexpected <!", this, null); + } + } else { + return START_TAG; + } + } else { + return TEXT; } - - txtBuf[txtPos++] = (char) c; } /** @@ -574,7 +582,7 @@ public class KXmlParser implements XmlPullParser { */ private void parseStartTag(boolean xmldecl) throws IOException, XmlPullParserException { if (!xmldecl) { - read(); + read('<'); } name = readName(); attributeCount = 0; @@ -582,84 +590,78 @@ public class KXmlParser implements XmlPullParser { while (true) { skip(); - int c = peek(0); + if (position >= limit && !fillBuffer(1)) { + checkRelaxed(UNEXPECTED_EOF); + return; + } + + int c = buffer[position]; if (xmldecl) { if (c == '?') { - read(); + position++; read('>'); return; } } else { if (c == '/') { degenerated = true; - read(); + position++; skip(); read('>'); break; - } - - if (c == '>' && !xmldecl) { - read(); + } else if (c == '>') { + position++; break; } } - if (c == -1) { - error(UNEXPECTED_EOF); - return; - } - String attrName = readName(); - if (attrName.length() == 0) { - error("attr name expected"); - break; - } - - int i = (attributeCount++) << 2; - + int i = (attributeCount++) * 4; attributes = ensureCapacity(attributes, i + 4); - attributes[i++] = ""; attributes[i++] = null; attributes[i++] = attrName; skip(); + if (position >= limit && !fillBuffer(1)) { + checkRelaxed(UNEXPECTED_EOF); + return; + } + + if (buffer[position] == '=') { + position++; - if (peek(0) != '=') { - if (!relaxed) { - error("Attr.value missing f. " + attrName); - } - attributes[i] = attrName; - } else { - read('='); skip(); - int delimiter = peek(0); + if (position >= limit && !fillBuffer(1)) { + checkRelaxed(UNEXPECTED_EOF); + return; + } + char delimiter = buffer[position]; - if (delimiter != '\'' && delimiter != '"') { - if (!relaxed) { - error("attr value delimiter missing!"); - } + if (delimiter == '\'' || delimiter == '"') { + position++; + } else if (relaxed) { delimiter = ' '; } else { - read(); + throw new XmlPullParserException("attr value delimiter missing!", this, null); } - int p = txtPos; - pushText(delimiter, true, true); - - attributes[i] = get(p); - txtPos = p; + attributes[i] = readValue(delimiter, true, true); if (delimiter != ' ') { - read(); // skip endquote + position++; // end quote } + } else if (relaxed) { + attributes[i] = attrName; + } else { + checkRelaxed("Attr.value missing f. " + attrName); + attributes[i] = attrName; } } - int sp = depth++ << 2; - + int sp = depth++ * 4; elementStack = ensureCapacity(elementStack, sp + 4); elementStack[sp + 3] = name; @@ -683,217 +685,352 @@ public class KXmlParser implements XmlPullParser { } /** - * result: isWhitespace; if the setName parameter is set, the name of the entity is stored in - * "name" + * Reads an entity reference from the buffer, resolves it, and writes the + * resolved entity to {@code out}. If the entity cannot be read or resolved, + * {@code out} will contain the partial entity reference. */ - private void pushEntity() throws IOException, XmlPullParserException { - push(read()); // & + private void readEntity(StringBuilder out) throws IOException, XmlPullParserException { + int start = out.length(); + + if (buffer[position++] != '&') { + throw new AssertionError(); + } - int pos = txtPos; + out.append('&'); while (true) { - int c = peek(0); + int c = peekCharacter(); + if (c == ';') { - read(); + position++; break; - } - if (c < 128 && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') - && c != '_' && c != '-' && c != '#') { - if (!relaxed) { - error("unterminated entity ref"); - } + } else if (c >= 128 + || (c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || c == '-' + || c == '#') { + position++; + out.append((char) c); + + } else if (relaxed) { + // intentionally leave the partial reference in 'out' return; - } - push(read()); + } else { + throw new XmlPullParserException("unterminated entity ref", this, null); + } } - String code = get(pos); - txtPos = pos - 1; + String code = out.substring(start + 1); + out.delete(start, out.length()); + if (token && type == ENTITY_REF) { name = code; } if (code.charAt(0) == '#') { + // TODO: check IndexOutOfBoundsException? + // TODO: save an intermediate string for 'code' if unneeded? int c = code.charAt(1) == 'x' ? Integer.parseInt(code.substring(2), 16) : Integer.parseInt(code.substring(1)); - push(c); + // TODO: set unresolved to false? + out.append((char) c); return; } - String result = entityMap.get(code); - - unresolved = result == null; + String resolved = entityMap.get(code); + if (resolved != null) { + unresolved = false; + out.append(resolved); + return; + } - if (unresolved) { - if (!token) { - error("unresolved: &" + code + ";"); - } - } else { - for (int i = 0; i < result.length(); i++) { - push(result.charAt(i)); - } + unresolved = true; + if (!token) { + checkRelaxed("unresolved: &" + code + ";"); + // TODO: should the &code; show up in the text in relaxed mode? } } /** - * types: '<': parse to any token (for nextToken ()) '"': parse to quote ' ': parse to - * whitespace or '>' + * Returns the current text or attribute value. This also has the side + * effect of setting isWhitespace to false if a non-whitespace character is + * encountered. + * + * @param delimiter {@code >} for text, {@code "} and {@code '} for quoted + * attributes, or a space for unquoted attributes. */ - private void pushText(int delimiter, boolean resolveEntities, boolean inAttributeValue) - throws IOException, XmlPullParserException { + private String readValue(char delimiter, boolean resolveEntities, + boolean inAttributeValue) throws IOException, XmlPullParserException { + + /* + * This method returns all of the characters from the current position + * through to an appropriate delimiter. + * + * If we're lucky (which we usually are), we'll return a single slice of + * the buffer. This fast path avoids allocating a string builder. + * + * There are 5 unlucky characters we could encounter: + * - "&": entities must be resolved. + * - "<": this isn't permitted in attributes unless relaxed. + * - "]": this requires a lookahead to defend against the forbidden + * CDATA section delimiter "]]>". + * - "\r": If a "\r" is followed by a "\n", we discard the "\r". If it + * isn't followed by "\n", we replace "\r" with either a "\n" + * in text nodes or a space in attribute values. + * - "\n": In attribute values, "\n" must be replaced with a space. + * + * We could also get unlucky by needing to refill the buffer midway + * through the text. + */ + + int start = position; + StringBuilder result = null; + + // if a text section was already started, prefix the start + if (text != null) { + result = new StringBuilder(); + result.append(text); + } + + while (true) { - int next = peek(0); - int cbrCount = 0; + /* + * Make sure we have at least a single character to read from the + * buffer. This mutates the buffer, so save the partial result + * to the slow path string builder first. + */ + if (position >= limit) { + if (start < position) { + if (result == null) { + result = new StringBuilder(); + } + result.append(buffer, start, position - start); + } + if (!fillBuffer(1)) { + return result != null ? result.toString() : ""; + } + start = position; + } - while (next != -1 && next != delimiter) { // covers eof, '<', '"' + char c = buffer[position]; - if (delimiter == ' ' && (next <= ' ' || next == '>')) { + if (c == delimiter + || (delimiter == ' ' && (c <= ' ' || c == '>')) + || c == '&' && !resolveEntities) { break; } - if (next == '&') { - if (!resolveEntities) { - break; - } - - pushEntity(); - } else if (next == '<' && inAttributeValue) { - error("Illegal: \"<\" inside attribute value"); - } else if (next == '\n' && type == START_TAG) { - read(); - push(' '); - } else { - push(read()); + if (c != '\r' + && (c != '\n' || !inAttributeValue) + && c != '&' + && c != '<' + && (c != ']' || inAttributeValue)) { + isWhitespace &= (c <= ' '); + position++; + continue; } /* - * "]]>" is allowed in attribute values, but is not allowed in - * regular text between markup. + * We've encountered an unlucky character! Convert from fast + * path to slow path if we haven't done so already. */ - final boolean allowCloseCdata = inAttributeValue; - if (!allowCloseCdata && (next == '>' && cbrCount >= 2 && delimiter != ']')) { - error("Illegal: \"]]>\" outside CDATA section"); + if (result == null) { + result = new StringBuilder(); } + result.append(buffer, start, position - start); + + if (c == '\r') { + if ((position + 1 < limit || fillBuffer(2)) && buffer[position + 1] == '\n') { + position++; + } + c = inAttributeValue ? ' ' : '\n'; + + } else if (c == '\n') { + c = ' '; + + } else if (c == '&') { + isWhitespace = false; // TODO: what if the entity resolves to whitespace? + readEntity(result); + start = position; + continue; + + } else if (c == '<') { + if (inAttributeValue) { + checkRelaxed("Illegal: \"<\" inside attribute value"); + } + isWhitespace = false; + + } else if (c == ']') { + if ((position + 2 < limit || fillBuffer(3)) + && buffer[position + 1] == ']' && buffer[position + 2] == '>') { + checkRelaxed("Illegal: \"]]>\" outside CDATA section"); + } + isWhitespace = false; - if (next == ']') { - cbrCount++; } else { - cbrCount = 0; + throw new AssertionError(); } - next = peek(0); + position++; + result.append(c); + start = position; } - } - private void read(char c) throws IOException, XmlPullParserException { - int a = read(); - if (a != c) { - error("expected: '" + c + "' actual: '" + ((char) a) + "'"); + if (result == null) { + return new String(buffer, start, position - start); + } else { + result.append(buffer, start, position - start); + return result.toString(); } } - private int read() throws IOException { - int result; + private void read(char expected) throws IOException, XmlPullParserException { + int c = peekCharacter(); + if (c != expected) { + checkRelaxed("expected: '" + expected + "' actual: '" + ((char) c) + "'"); + } + position++; + } - if (peekCount == 0) { - result = peek(0); - } else { - result = peek[0]; - peek[0] = peek[1]; + private void read(char[] chars) throws IOException, XmlPullParserException { + if (position + chars.length >= limit && !fillBuffer(chars.length)) { + checkRelaxed("expected: '" + new String(chars) + "' but was EOF"); + return; } - peekCount--; - column++; + // TODO: replace with Arrays.equals(buffer, position, delimiter, 0, delimiter.length) + // when the VM has better method inlining + for (int i = 0; i < chars.length; i++) { + if (buffer[position + i] != chars[i]) { + checkRelaxed("expected: \"" + new String(chars) + "\" but was \"" + + new String(buffer, position, chars.length) + "...\""); + } + } - if (result == '\n') { + position += chars.length; + } - line++; - column = 1; + private int peekCharacter() throws IOException, XmlPullParserException { + if (position < limit || fillBuffer(1)) { + return buffer[position]; } - - return result; + return -1; } /** - * Does never read more than needed + * Returns true once {@code limit - position >= minimum}. If the data is + * exhausted before that many characters are available, this returns + * false. */ - private int peek(int pos) throws IOException { - while (pos >= peekCount) { - int nw; - if (srcBuf.length <= 1) { - nw = reader.read(); - } else if (srcPos < srcCount) { - nw = srcBuf[srcPos++]; + private boolean fillBuffer(int minimum) throws IOException { + // Before clobbering the old characters, update where buffer starts + for (int i = 0; i < position; i++) { + if (buffer[i] == '\n') { + bufferStartLine++; + bufferStartColumn = 0; } else { - srcCount = reader.read(srcBuf, 0, srcBuf.length); - if (srcCount <= 0) { - nw = -1; - } else { - nw = srcBuf[0]; - } - - srcPos = 1; + bufferStartColumn++; } + } - if (nw == '\r') { - wasCR = true; - peek[peekCount++] = '\n'; - } else { - if (nw == '\n') { - if (!wasCR) { - peek[peekCount++] = '\n'; - } - } else { - peek[peekCount++] = nw; - } + if (limit != position) { + limit -= position; + System.arraycopy(buffer, position, buffer, 0, limit); + } else { + limit = 0; + } - wasCR = false; + position = 0; + int total; + while ((total = reader.read(buffer, limit, buffer.length - limit)) != -1) { + limit += total; + if (limit >= minimum) { + return true; } } - - return peek[pos]; + return false; } + /** + * Returns an element or attribute name. This is always non-empty for + * non-relaxed parsers. + */ private String readName() throws IOException, XmlPullParserException { - int pos = txtPos; - int c = peek(0); - if ((c < 'a' || c > 'z') - && (c < 'A' || c > 'Z') - && c != '_' - && c != ':' - && c < 0x0c0 - && !relaxed) { - error("name expected"); + if (position >= limit && !fillBuffer(1)) { + checkRelaxed("name expected"); + return ""; } - do { - push(read()); - c = peek(0); - } - while ((c >= 'a' && c <= 'z') + int start = position; + StringBuilder result = null; + + // read the first character + char c = buffer[position]; + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') || c == '_' - || c == '-' || c == ':' - || c == '.' - || c >= 0x0b7); + || c >= '\u00c0' // TODO: check the XML spec + || relaxed) { + position++; + } else { + checkRelaxed("name expected"); + return ""; + } - String result = get(pos); - txtPos = pos; - return result; + while (true) { + /* + * Make sure we have at least a single character to read from the + * buffer. This mutates the buffer, so save the partial result + * to the slow path string builder first. + */ + if (position >= limit) { + if (result == null) { + result = new StringBuilder(); + } + result.append(buffer, start, position - start); + if (!fillBuffer(1)) { + return result.toString(); + } + start = position; + } + + // read another character + c = buffer[position]; + if ((c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || c == '-' + || c == ':' + || c == '.' + || c >= '\u00b7') { // TODO: check the XML spec + position++; + continue; + } + + // we encountered a non-name character. done! + if (result == null) { + return new String(buffer, start, position - start); + } else { + result.append(buffer, start, position - start); + return result.toString(); + } + } } private void skip() throws IOException { - while (true) { - int c = peek(0); - if (c > ' ' || c == -1) { + while (position < limit || fillBuffer(1)) { + int c = buffer[position]; + if (c > ' ') { break; } - read(); + position++; } } @@ -902,8 +1039,6 @@ public class KXmlParser implements XmlPullParser { public void setInput(Reader reader) throws XmlPullParserException { this.reader = reader; - line = 1; - column = 0; type = START_DOCUMENT; name = null; namespace = null; @@ -917,9 +1052,10 @@ public class KXmlParser implements XmlPullParser { return; } - srcPos = 0; - srcCount = 0; - peekCount = 0; + position = 0; + limit = 0; + bufferStartLine = 0; + bufferStartColumn = 0; depth = 0; entityMap = new HashMap<String, String>(); @@ -931,8 +1067,8 @@ public class KXmlParser implements XmlPullParser { } public void setInput(InputStream is, String _enc) throws XmlPullParserException { - srcPos = 0; - srcCount = 0; + position = 0; + limit = 0; String enc = _enc; if (is == null) { @@ -941,66 +1077,64 @@ public class KXmlParser implements XmlPullParser { try { if (enc == null) { - // read four bytes - - int chk = 0; - - while (srcCount < 4) { + // read the four bytes looking for an indication of the encoding in use + int firstFourBytes = 0; + while (limit < 4) { int i = is.read(); if (i == -1) { break; } - chk = (chk << 8) | i; - srcBuf[srcCount++] = (char) i; + firstFourBytes = (firstFourBytes << 8) | i; + buffer[limit++] = (char) i; } - if (srcCount == 4) { - switch (chk) { - case 0x00000FEFF: + if (limit == 4) { + switch (firstFourBytes) { + case 0x00000FEFF: // UTF-32BE BOM enc = "UTF-32BE"; - srcCount = 0; + limit = 0; break; - case 0x0FFFE0000: + case 0x0FFFE0000: // UTF-32LE BOM enc = "UTF-32LE"; - srcCount = 0; + limit = 0; break; - case 0x03c: + case 0x0000003c: // '>' in UTF-32BE enc = "UTF-32BE"; - srcBuf[0] = '<'; - srcCount = 1; + buffer[0] = '<'; + limit = 1; break; - case 0x03c000000: + case 0x03c000000: // '<' in UTF-32LE enc = "UTF-32LE"; - srcBuf[0] = '<'; - srcCount = 1; + buffer[0] = '<'; + limit = 1; break; - case 0x0003c003f: + case 0x0003c003f: // "<?" in UTF-16BE enc = "UTF-16BE"; - srcBuf[0] = '<'; - srcBuf[1] = '?'; - srcCount = 2; + buffer[0] = '<'; + buffer[1] = '?'; + limit = 2; break; - case 0x03c003f00: + case 0x03c003f00: // "<?" in UTF-16LE enc = "UTF-16LE"; - srcBuf[0] = '<'; - srcBuf[1] = '?'; - srcCount = 2; + buffer[0] = '<'; + buffer[1] = '?'; + limit = 2; break; - case 0x03c3f786d: + case 0x03c3f786d: // "<?xm" in ASCII etc. while (true) { int i = is.read(); if (i == -1) { break; } - srcBuf[srcCount++] = (char) i; + buffer[limit++] = (char) i; if (i == '>') { - String s = new String(srcBuf, 0, srcCount); + String s = new String(buffer, 0, limit); int i0 = s.indexOf("encoding"); if (i0 != -1) { while (s.charAt(i0) != '"' @@ -1016,20 +1150,19 @@ public class KXmlParser implements XmlPullParser { } default: - if ((chk & 0x0ffff0000) == 0x0FEFF0000) { + // handle a byte order mark followed by something other than <? + if ((firstFourBytes & 0x0ffff0000) == 0x0FEFF0000) { enc = "UTF-16BE"; - srcBuf[0] = - (char) ((srcBuf[2] << 8) | srcBuf[3]); - srcCount = 1; - } else if ((chk & 0x0ffff0000) == 0x0fffe0000) { + buffer[0] = (char) ((buffer[2] << 8) | buffer[3]); + limit = 1; + } else if ((firstFourBytes & 0x0ffff0000) == 0x0fffe0000) { enc = "UTF-16LE"; - srcBuf[0] = - (char) ((srcBuf[3] << 8) | srcBuf[2]); - srcCount = 1; - } else if ((chk & 0x0ffffff00) == 0x0EFBBBF00) { + buffer[0] = (char) ((buffer[3] << 8) | buffer[2]); + limit = 1; + } else if ((firstFourBytes & 0x0ffffff00) == 0x0EFBBBF00) { enc = "UTF-8"; - srcBuf[0] = srcBuf[3]; - srcCount = 1; + buffer[0] = buffer[3]; + limit = 1; } } } @@ -1039,15 +1172,12 @@ public class KXmlParser implements XmlPullParser { enc = "UTF-8"; } - int sc = srcCount; + int sc = limit; setInput(new InputStreamReader(is, enc)); encoding = _enc; - srcCount = sc; + limit = sc; } catch (Exception e) { - throw new XmlPullParserException( - "Invalid stream or encoding: " + e.toString(), - this, - e); + throw new XmlPullParserException("Invalid stream or encoding: " + e, this, e); } } @@ -1094,11 +1224,11 @@ public class KXmlParser implements XmlPullParser { } public String getNamespacePrefix(int pos) { - return nspStack[pos << 1]; + return nspStack[pos * 2]; } public String getNamespaceUri(int pos) { - return nspStack[(pos << 1) + 1]; + return nspStack[(pos * 2) + 1]; } public String getNamespace(String prefix) { @@ -1144,12 +1274,11 @@ public class KXmlParser implements XmlPullParser { } buf.append(name); - int cnt = attributeCount << 2; + int cnt = attributeCount * 4; for (int i = 0; i < cnt; i += 4) { buf.append(' '); if (attributes[i + 1] != null) { - buf.append( - "{" + attributes[i] + "}" + attributes[i + 1] + ":"); + buf.append("{" + attributes[i] + "}" + attributes[i + 1] + ":"); } buf.append(attributes[i + 2] + "='" + attributes[i + 3] + "'"); } @@ -1169,7 +1298,7 @@ public class KXmlParser implements XmlPullParser { buf.append(text); } - buf.append("@" + line + ":" + column); + buf.append("@" + getLineNumber() + ":" + getColumnNumber()); if (location != null) { buf.append(" in "); buf.append(location); @@ -1181,40 +1310,55 @@ public class KXmlParser implements XmlPullParser { } public int getLineNumber() { - return line; + int result = bufferStartLine; + for (int i = 0; i < position; i++) { + if (buffer[i] == '\n') { + result++; + } + } + return result + 1; // the first line is '1' } public int getColumnNumber() { - return column; + int result = bufferStartColumn; + for (int i = 0; i < position; i++) { + if (buffer[i] == '\n') { + result = 0; + } else { + result++; + } + } + return result + 1; // the first column is '1' } public boolean isWhitespace() throws XmlPullParserException { if (type != TEXT && type != IGNORABLE_WHITESPACE && type != CDSECT) { - exception(ILLEGAL_TYPE); + throw new XmlPullParserException(ILLEGAL_TYPE, this, null); } return isWhitespace; } public String getText() { - return type < TEXT - || (type == ENTITY_REF && unresolved) ? null : get(0); + if (type < TEXT || (type == ENTITY_REF && unresolved)) { + return null; + } else if (text == null) { + return ""; + } else { + return text; + } } public char[] getTextCharacters(int[] poslen) { - if (type >= TEXT) { - if (type == ENTITY_REF) { - poslen[0] = 0; - poslen[1] = name.length(); - return name.toCharArray(); - } - poslen[0] = 0; - poslen[1] = txtPos; - return txtBuf; - } - - poslen[0] = -1; - poslen[1] = -1; - return null; + String text = getText(); + if (text == null) { + poslen[0] = -1; + poslen[1] = -1; + return null; + } + char[] result = text.toCharArray(); + poslen[0] = 0; + poslen[1] = result.length; + return result; } public String getNamespace() { @@ -1231,7 +1375,7 @@ public class KXmlParser implements XmlPullParser { public boolean isEmptyElementTag() throws XmlPullParserException { if (type != START_TAG) { - exception(ILLEGAL_TYPE); + throw new XmlPullParserException(ILLEGAL_TYPE, this, null); } return degenerated; } @@ -1252,33 +1396,32 @@ public class KXmlParser implements XmlPullParser { if (index >= attributeCount) { throw new IndexOutOfBoundsException(); } - return attributes[index << 2]; + return attributes[index * 4]; } public String getAttributeName(int index) { if (index >= attributeCount) { throw new IndexOutOfBoundsException(); } - return attributes[(index << 2) + 2]; + return attributes[(index * 4) + 2]; } public String getAttributePrefix(int index) { if (index >= attributeCount) { throw new IndexOutOfBoundsException(); } - return attributes[(index << 2) + 1]; + return attributes[(index * 4) + 1]; } public String getAttributeValue(int index) { if (index >= attributeCount) { throw new IndexOutOfBoundsException(); } - return attributes[(index << 2) + 3]; + return attributes[(index * 4) + 3]; } public String getAttributeValue(String namespace, String name) { - - for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) { + for (int i = (attributeCount * 4) - 4; i >= 0; i -= 4) { if (attributes[i + 2].equals(name) && (namespace == null || attributes[i].equals(namespace))) { return attributes[i + 3]; @@ -1293,8 +1436,7 @@ public class KXmlParser implements XmlPullParser { } public int next() throws XmlPullParserException, IOException { - - txtPos = 0; + text = null; isWhitespace = true; int minType = 9999; token = false; @@ -1304,9 +1446,7 @@ public class KXmlParser implements XmlPullParser { if (type < minType) { minType = type; } - // if (curr <= TEXT) type = curr; - } - while (minType > ENTITY_REF // ignorable + } while (minType > ENTITY_REF // ignorable || (minType >= TEXT && peekType() >= TEXT)); type = minType; @@ -1319,7 +1459,7 @@ public class KXmlParser implements XmlPullParser { public int nextToken() throws XmlPullParserException, IOException { isWhitespace = true; - txtPos = 0; + text = null; token = true; nextImpl(); @@ -1335,7 +1475,7 @@ public class KXmlParser implements XmlPullParser { } if (type != END_TAG && type != START_TAG) { - exception("unexpected type"); + throw new XmlPullParserException("unexpected type", this, null); } return type; @@ -1347,14 +1487,14 @@ public class KXmlParser implements XmlPullParser { if (type != this.type || (namespace != null && !namespace.equals(getNamespace())) || (name != null && !name.equals(getName()))) { - exception( - "expected: " + TYPES[type] + " {" + namespace + "}" + name); + throw new XmlPullParserException( + "expected: " + TYPES[type] + " {" + namespace + "}" + name, this, null); } } public String nextText() throws XmlPullParserException, IOException { if (type != START_TAG) { - exception("precondition: START_TAG"); + throw new XmlPullParserException("precondition: START_TAG", this, null); } next(); @@ -1368,7 +1508,7 @@ public class KXmlParser implements XmlPullParser { } if (type != END_TAG) { - exception("END_TAG expected"); + throw new XmlPullParserException("END_TAG expected", this, null); } return result; @@ -1378,15 +1518,16 @@ public class KXmlParser implements XmlPullParser { if (XmlPullParser.FEATURE_PROCESS_NAMESPACES.equals(feature)) { processNsp = value; } else if (isProp(feature, false, "relaxed")) { + // "http://xmlpull.org/v1/doc/features.html#relaxed" relaxed = value; } else { - exception("unsupported feature: " + feature); + throw new XmlPullParserException("unsupported feature: " + feature, this, null); } } public void setProperty(String property, Object value) throws XmlPullParserException { if (isProp(property, true, "location")) { - location = value; + location = String.valueOf(value); } else { throw new XmlPullParserException("unsupported property: " + property); } diff --git a/xml/src/main/java/org/xmlpull/v1/XmlPullParser.java b/xml/src/main/java/org/xmlpull/v1/XmlPullParser.java index b2f5e39..48c95a9 100644 --- a/xml/src/main/java/org/xmlpull/v1/XmlPullParser.java +++ b/xml/src/main/java/org/xmlpull/v1/XmlPullParser.java @@ -59,7 +59,7 @@ import java.io.Reader; * getProperty("<a href="http://xmlpull.org/v1/doc/properties.html#xmldecl-version">http://xmlpull.org/v1/doc/properties.html#xmldecl-version</a>") * returns String ("1.0") or null if XMLDecl was not read or if property is not supported * <li><b>standalone</b>: - * getProperty("<a href="http://xmlpull.org/v1/doc/features.html#xmldecl-standalone">http://xmlpull.org/v1/doc/features.html#xmldecl-standalone</a>") + * getProperty("<a href="http://xmlpull.org/v1/doc/properties.html#xmldecl-standalone">http://xmlpull.org/v1/doc/properties.html#xmldecl-standalone</a>") * returns Boolean: null if there was no standalone declaration * or if property is not supported * otherwise returns Boolean(true) if standalone="yes" and Boolean(false) when standalone="no" |