summaryrefslogtreecommitdiffstats
path: root/WebCore/html/HTML5Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/html/HTML5Lexer.cpp')
-rw-r--r--WebCore/html/HTML5Lexer.cpp1319
1 files changed, 1319 insertions, 0 deletions
diff --git a/WebCore/html/HTML5Lexer.cpp b/WebCore/html/HTML5Lexer.cpp
new file mode 100644
index 0000000..44eae83
--- /dev/null
+++ b/WebCore/html/HTML5Lexer.cpp
@@ -0,0 +1,1319 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "HTML5Lexer.h"
+
+#include "AtomicString.h"
+#include "HTML5Token.h"
+#include "HTMLNames.h"
+#include "NotImplemented.h"
+#include <wtf/CurrentTime.h>
+#include <wtf/UnusedParam.h>
+#include <wtf/text/CString.h>
+#include <wtf/unicode/Unicode.h>
+
+
+// Use __GNUC__ instead of PLATFORM(GCC) to stay consistent with the gperf generated c file
+#ifdef __GNUC__
+// The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
+#include "HTMLEntityNames.c"
+#else
+// Not inlined for non-GCC compilers
+struct Entity {
+ const char* name;
+ int code;
+};
+const struct Entity* findEntity(register const char* str, register unsigned int len);
+#endif
+
+using namespace WTF;
+
+namespace WebCore {
+
+using namespace HTMLNames;
+
+inline UChar toLowerCase(UChar cc)
+{
+ ASSERT(cc >= 'A' && cc <= 'Z');
+ const int lowerCaseOffset = 0x20;
+ return cc + lowerCaseOffset;
+}
+
+HTML5Lexer::HTML5Lexer()
+{
+ reset();
+}
+
+HTML5Lexer::~HTML5Lexer()
+{
+}
+
+void HTML5Lexer::reset()
+{
+ m_state = DataState;
+ m_token = 0;
+ m_emitPending = false;
+ m_additionalAllowedCharacter = '\0';
+}
+
+static inline bool isWhitespace(UChar c)
+{
+ return c == ' ' || c == '\n' || c == '\r' || c == '\t';
+}
+
+static inline unsigned legalEntityFor(unsigned value)
+{
+ // FIXME There is a table for more exceptions in the HTML5 specification.
+ if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
+ return 0xFFFD;
+ return value;
+}
+
+unsigned HTML5Lexer::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
+{
+ enum EntityState {
+ Initial,
+ NumberType,
+ MaybeHex,
+ Hex,
+ Decimal,
+ Named
+ };
+ EntityState entityState = Initial;
+ unsigned result = 0;
+ Vector<UChar, 10> seenChars;
+ Vector<char, 10> entityName;
+
+ while (!source.isEmpty()) {
+ UChar cc = *source;
+ seenChars.append(cc);
+ switch (entityState) {
+ case Initial:
+ if (isWhitespace(cc) || cc == '<' || cc == '&')
+ return 0;
+ else if (cc == '#')
+ entityState = NumberType;
+ else if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
+ entityName.append(cc);
+ entityState = Named;
+ } else
+ return 0;
+ break;
+ case NumberType:
+ if (cc == 'x' || cc == 'X')
+ entityState = MaybeHex;
+ else if (cc >= '0' && cc <= '9') {
+ entityState = Decimal;
+ result = cc - '0';
+ } else {
+ source.push('#');
+ return 0;
+ }
+ break;
+ case MaybeHex:
+ if (cc >= '0' && cc <= '9')
+ result = cc - '0';
+ else if (cc >= 'a' && cc <= 'f')
+ result = 10 + cc - 'a';
+ else if (cc >= 'A' && cc <= 'F')
+ result = 10 + cc - 'A';
+ else {
+ source.push('#');
+ source.push(seenChars[1]);
+ return 0;
+ }
+ entityState = Hex;
+ break;
+ case Hex:
+ if (cc >= '0' && cc <= '9')
+ result = result * 16 + cc - '0';
+ else if (cc >= 'a' && cc <= 'f')
+ result = result * 16 + 10 + cc - 'a';
+ else if (cc >= 'A' && cc <= 'F')
+ result = result * 16 + 10 + cc - 'A';
+ else if (cc == ';') {
+ source.advance();
+ return legalEntityFor(result);
+ } else
+ return legalEntityFor(result);
+ break;
+ case Decimal:
+ if (cc >= '0' && cc <= '9')
+ result = result * 10 + cc - '0';
+ else if (cc == ';') {
+ source.advance();
+ return legalEntityFor(result);
+ } else
+ return legalEntityFor(result);
+ break;
+ case Named:
+ // This is the attribute only version, generic version matches somewhat differently
+ while (entityName.size() <= 8) {
+ if (cc == ';') {
+ const Entity* entity = findEntity(entityName.data(), entityName.size());
+ if (entity) {
+ source.advance();
+ return entity->code;
+ }
+ break;
+ }
+ if (!(cc >= 'a' && cc <= 'z') && !(cc >= 'A' && cc <= 'Z') && !(cc >= '0' && cc <= '9')) {
+ const Entity* entity = findEntity(entityName.data(), entityName.size());
+ if (entity)
+ return entity->code;
+ break;
+ }
+ entityName.append(cc);
+ source.advance();
+ if (source.isEmpty())
+ goto outOfCharacters;
+ cc = *source;
+ seenChars.append(cc);
+ }
+ if (seenChars.size() == 2)
+ source.push(seenChars[0]);
+ else if (seenChars.size() == 3) {
+ source.push(seenChars[0]);
+ source.push(seenChars[1]);
+ } else
+ source.prepend(SegmentedString(String(seenChars.data(), seenChars.size() - 1)));
+ return 0;
+ }
+ source.advance();
+ }
+outOfCharacters:
+ notEnoughCharacters = true;
+ source.prepend(SegmentedString(String(seenChars.data(), seenChars.size())));
+ return 0;
+}
+
+bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
+{
+ // If we have a token in progress, then we're supposed to be called back
+ // with the same token so we can finish it.
+ ASSERT(!m_token || m_token == &token || token.type() == HTML5Token::Uninitialized);
+ m_token = &token;
+
+ if (!m_bufferedEndTagName.isEmpty()) {
+ // We started an end tag during our last iteration.
+ m_token->beginEndTag(m_bufferedEndTagName);
+ m_bufferedEndTagName.clear();
+ if (m_state == DataState) {
+ // We're back in the data state, so we must be done with the tag.
+ return true;
+ }
+ }
+
+ // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
+ // FIXME: This while should stop as soon as we have a token to return.
+ while (!source.isEmpty()) {
+ UChar cc = *source;
+ switch (m_state) {
+ case DataState: {
+ if (cc == '&')
+ m_state = CharacterReferenceInDataState;
+ else if (cc == '<') {
+ if (m_token->type() == HTML5Token::Character) {
+ // We have a bunch of character tokens queued up that we
+ // are emitting lazily here.
+ return true;
+ }
+ m_state = TagOpenState;
+ } else
+ emitCharacter(cc);
+ break;
+ }
+ case CharacterReferenceInDataState: {
+ notImplemented();
+ break;
+ }
+ case RCDATAState: {
+ if (cc == '&')
+ m_state = CharacterReferenceInRCDATAState;
+ else if (cc == '<')
+ m_state = RCDATALessThanSignState;
+ else
+ emitCharacter(cc);
+ break;
+ }
+ case CharacterReferenceInRCDATAState: {
+ notImplemented();
+ break;
+ }
+ case RAWTEXTState: {
+ if (cc == '<')
+ m_state = RAWTEXTLessThanSignState;
+ else
+ emitCharacter(cc);
+ break;
+ }
+ case ScriptDataState: {
+ if (cc == '<')
+ m_state = ScriptDataLessThanSignState;
+ else
+ emitCharacter(cc);
+ break;
+ }
+ case PLAINTEXTState: {
+ emitCharacter(cc);
+ break;
+ }
+ case TagOpenState: {
+ if (cc == '!')
+ m_state = MarkupDeclarationOpenState;
+ else if (cc == '/')
+ m_state = EndTagOpenState;
+ else if (cc >= 'A' && cc <= 'Z') {
+ m_token->beginStartTag(toLowerCase(cc));
+ m_state = TagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ m_token->beginStartTag(cc);
+ m_state = TagNameState;
+ } else if (cc == '?') {
+ emitParseError();
+ m_state = BogusCommentState;
+ } else {
+ emitParseError();
+ m_state = DataState;
+ emitCharacter('<');
+ continue;
+ }
+ break;
+ }
+ case EndTagOpenState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ m_token->beginEndTag(toLowerCase(cc));
+ m_state = TagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ m_token->beginEndTag(cc);
+ m_state = TagNameState;
+ } else if (cc == '>') {
+ emitParseError();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ m_state = DataState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case TagNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeAttributeNameState;
+ else if (cc == '/')
+ m_state = SelfClosingStartTagState;
+ else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ m_token->appendToName(toLowerCase(cc));
+ else
+ m_token->appendToName(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case RCDATALessThanSignState: {
+ if (cc == '/') {
+ m_temporaryBuffer.clear();
+ m_state = RCDATAEndTagOpenState;
+ } else {
+ emitCharacter('<');
+ m_state = RCDATAState;
+ continue;
+ }
+ break;
+ }
+ case RCDATAEndTagOpenState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ notImplemented();
+ m_state = RCDATAEndTagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ notImplemented();
+ m_state = RCDATAEndTagNameState;
+ emitCharacter('<');
+ emitCharacter('/');
+ m_state = RCDATAState;
+ continue;
+ }
+ break;
+ }
+ case RCDATAEndTagNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
+ notImplemented();
+ m_state = BeforeAttributeNameState;
+ } else if (cc == '/') {
+ notImplemented();
+ m_state = SelfClosingStartTagState;
+ } else if (cc == '>') {
+ notImplemented();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else if (cc >= 'a' && cc <= 'z')
+ notImplemented();
+ else {
+ emitCharacter('<');
+ emitCharacter('/');
+ notImplemented();
+ m_state = RCDATAState;
+ continue;
+ }
+ break;
+ }
+ case RAWTEXTLessThanSignState: {
+ if (cc == '/') {
+ m_temporaryBuffer.clear();
+ m_state = RAWTEXTEndTagOpenState;
+ } else {
+ emitCharacter('<');
+ m_state = RAWTEXTState;
+ continue;
+ }
+ break;
+ }
+ case RAWTEXTEndTagOpenState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ notImplemented();
+ m_state = RAWTEXTEndTagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ notImplemented();
+ m_state = RAWTEXTEndTagNameState;
+ } else {
+ emitCharacter('<');
+ emitCharacter('/');
+ m_state = RAWTEXTState;
+ continue;
+ }
+ break;
+ }
+ case RAWTEXTEndTagNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
+ notImplemented();
+ m_state = BeforeAttributeNameState;
+ } else if (cc == '/') {
+ notImplemented();
+ m_state = SelfClosingStartTagState;
+ } else if (cc == '>') {
+ notImplemented();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else if (cc >= 'a' && cc <= 'z')
+ notImplemented();
+ else {
+ emitCharacter('<');
+ emitCharacter('/');
+ notImplemented();
+ m_state = RAWTEXTState;
+ continue;
+ }
+ }
+ case ScriptDataLessThanSignState: {
+ if (cc == '/') {
+ m_temporaryBuffer.clear();
+ ASSERT(m_bufferedEndTagName.isEmpty());
+ m_state = ScriptDataEndTagOpenState;
+ } else if (cc == '!') {
+ emitCharacter('<');
+ emitCharacter('!');
+ m_state = ScriptDataEscapeStartState;
+ } else {
+ emitCharacter('<');
+ m_state = ScriptDataState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEndTagOpenState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ m_temporaryBuffer.append(cc);
+ m_bufferedEndTagName.append(toLowerCase(cc));
+ m_state = ScriptDataEndTagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ m_temporaryBuffer.append(cc);
+ m_bufferedEndTagName.append(cc);
+ m_state = ScriptDataEndTagNameState;
+ } else {
+ emitCharacter('<');
+ emitCharacter('/');
+ m_state = ScriptDataState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEndTagNameState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ m_temporaryBuffer.append(cc);
+ m_bufferedEndTagName.append(toLowerCase(cc));
+ } else if (cc >= 'a' && cc <= 'z') {
+ m_temporaryBuffer.append(cc);
+ m_bufferedEndTagName.append(cc);
+ } else {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
+ if (isAppropriateEndTag()) {
+ m_state = BeforeAttributeNameState;
+ ASSERT(m_token->type() == HTML5Token::Character);
+ emitCurrentToken();
+ break;
+ }
+ } else if (cc == '/') {
+ if (isAppropriateEndTag()) {
+ m_state = SelfClosingStartTagState;
+ ASSERT(m_token->type() == HTML5Token::Character);
+ emitCurrentToken();
+ break;
+ }
+ } else if (cc == '>') {
+ if (isAppropriateEndTag()) {
+ m_state = DataState;
+ ASSERT(m_token->type() == HTML5Token::Character);
+ emitCurrentToken();
+ break;
+ }
+ }
+ emitCharacter('<');
+ emitCharacter('/');
+ m_token->appendToCharacter(m_temporaryBuffer);
+ m_bufferedEndTagName.clear();
+ m_state = ScriptDataState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEscapeStartState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapeStartDashState;
+ } else {
+ m_state = ScriptDataState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEscapeStartDashState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapedDashDashState;
+ } else {
+ m_state = ScriptDataState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEscapedState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapedDashState;
+ } else if (cc == '<')
+ m_state = ScriptDataEscapedLessThanSignState;
+ else
+ emitCharacter(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataEscapedDashState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapedDashDashState;
+ } else if (cc == '<')
+ m_state = ScriptDataEscapedLessThanSignState;
+ else {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapedState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataEscapedDashDashState: {
+ if (cc == '-')
+ emitCharacter(cc);
+ else if (cc == '<')
+ m_state = ScriptDataEscapedLessThanSignState;
+ else if (cc == '>') {
+ emitCharacter(cc);
+ m_state = ScriptDataState;
+ } else {
+ emitCharacter(cc);
+ m_state = ScriptDataEscapedState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataEscapedLessThanSignState: {
+ if (cc == '/') {
+ m_temporaryBuffer.clear();
+ m_state = ScriptDataEscapedEndTagOpenState;
+ } else if (cc >= 'A' && cc <= 'Z') {
+ notImplemented();
+ m_state = ScriptDataDoubleEscapeStartState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ notImplemented();
+ m_state = ScriptDataDoubleEscapeStartState;
+ } else {
+ emitCharacter('<');
+ m_state = ScriptDataEscapedState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEscapedEndTagOpenState: {
+ if (cc >= 'A' && cc <= 'Z') {
+ notImplemented();
+ m_state = ScriptDataEscapedEndTagNameState;
+ } else if (cc >= 'a' && cc <= 'z') {
+ notImplemented();
+ m_state = ScriptDataEscapedEndTagNameState;
+ } else {
+ emitCharacter('<');
+ emitCharacter('/');
+ m_state = ScriptDataEscapedState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataEscapedEndTagNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
+ notImplemented();
+ m_state = BeforeAttributeNameState;
+ } else if (cc == '/') {
+ notImplemented();
+ m_state = SelfClosingStartTagState;
+ } else if (cc == '>') {
+ notImplemented();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else if (cc >= 'a' && cc <= 'z')
+ notImplemented();
+ else {
+ emitCharacter('<');
+ emitCharacter('/');
+ notImplemented();
+ m_state = ScriptDataEscapedState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataDoubleEscapeStartState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
+ emitCharacter(cc);
+ if (temporaryBufferIs("string"))
+ m_state = ScriptDataDoubleEscapedState;
+ else
+ m_state = ScriptDataEscapedState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else if (cc >= 'a' && cc <= 'z')
+ notImplemented();
+ else {
+ m_state = ScriptDataEscapedState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataDoubleEscapedState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedDashState;
+ } else if (cc == '<') {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedLessThanSignState;
+ } else
+ emitCharacter(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataDoubleEscapedDashState: {
+ if (cc == '-') {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedDashDashState;
+ } else if (cc == '<') {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedLessThanSignState;
+ } else {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataDoubleEscapedDashDashState: {
+ if (cc == '-')
+ emitCharacter(cc);
+ else if (cc == '<') {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedLessThanSignState;
+ } else if (cc == '>') {
+ emitCharacter(cc);
+ m_state = ScriptDataState;
+ } else {
+ emitCharacter(cc);
+ m_state = ScriptDataDoubleEscapedState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case ScriptDataDoubleEscapedLessThanSignState: {
+ if (cc == '/') {
+ emitCharacter(cc);
+ m_temporaryBuffer.clear();
+ m_state = ScriptDataDoubleEscapeEndState;
+ } else {
+ m_state = ScriptDataDoubleEscapedState;
+ continue;
+ }
+ break;
+ }
+ case ScriptDataDoubleEscapeEndState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '/' || cc == '>') {
+ emitCharacter(cc);
+ if (temporaryBufferIs("string"))
+ m_state = ScriptDataEscapedState;
+ else
+ m_state = ScriptDataDoubleEscapedState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else if (cc >= 'a' && cc <= 'z')
+ notImplemented();
+ else {
+ m_state = ScriptDataDoubleEscapedState;
+ continue;
+ }
+ break;
+ }
+ case BeforeAttributeNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '/')
+ m_state = SelfClosingStartTagState;
+ else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z') {
+ m_token->addNewAttribute();
+ m_token->appendToAttributeName(toLowerCase(cc));
+ m_state = AttributeNameState;
+ } else {
+ if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
+ emitParseError();
+ m_token->addNewAttribute();
+ m_token->appendToAttributeName(cc);
+ m_state = AttributeNameState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AttributeNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = AfterAttributeNameState;
+ else if (cc == '/')
+ m_state = SelfClosingStartTagState;
+ else if (cc == '=')
+ m_state = BeforeAttributeValueState;
+ else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ m_token->appendToAttributeName(toLowerCase(cc));
+ else {
+ if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
+ emitParseError();
+ m_token->appendToAttributeName(cc);
+ m_state = AttributeNameState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterAttributeNameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '/')
+ m_state = SelfClosingStartTagState;
+ else if (cc == '=')
+ m_state = BeforeAttributeValueState;
+ else if (cc == '=') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z') {
+ m_token->addNewAttribute();
+ m_token->appendToAttributeName(toLowerCase(cc));
+ m_state = AttributeNameState;
+ } else {
+ if (cc == '"' || cc == '\'' || cc == '<')
+ emitParseError();
+ m_token->addNewAttribute();
+ m_token->appendToAttributeName(cc);
+ m_state = AttributeNameState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BeforeAttributeValueState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '"')
+ m_state = AttributeValueDoubleQuotedState;
+ else if (cc == '&') {
+ m_state = AttributeValueUnquotedState;
+ continue;
+ } else if (cc == '\'')
+ m_state = AttributeValueSingleQuotedState;
+ else if (cc == '>') {
+ emitParseError();
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ if (cc == '<' || cc == '=' || cc == '`')
+ emitParseError();
+ m_token->appendToAttributeValue(cc);
+ m_state = AttributeValueUnquotedState;
+ }
+ break;
+ }
+ case AttributeValueDoubleQuotedState: {
+ if (cc == '"')
+ m_state = AfterAttributeValueQuotedState;
+ else if (cc == '&') {
+ m_state = CharacterReferenceInAttributeValueState;
+ m_additionalAllowedCharacter = '"';
+ } else
+ m_token->appendToAttributeValue(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AttributeValueSingleQuotedState: {
+ if (cc == '\'')
+ m_state = AfterAttributeValueQuotedState;
+ else if (cc == '&') {
+ m_state = CharacterReferenceInAttributeValueState;
+ m_additionalAllowedCharacter = '\'';
+ } else
+ m_token->appendToAttributeValue(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AttributeValueUnquotedState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeAttributeNameState;
+ else if (cc == '&') {
+ m_state = CharacterReferenceInAttributeValueState;
+ m_additionalAllowedCharacter = '>';
+ } else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
+ emitParseError();
+ m_token->appendToAttributeValue(cc);
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CharacterReferenceInAttributeValueState: {
+ notImplemented();
+ break;
+ }
+ case AfterAttributeValueQuotedState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeAttributeNameState;
+ else if (cc == '/')
+ m_state = SelfClosingStartTagState;
+ else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ m_state = BeforeAttributeNameState;
+ continue;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case SelfClosingStartTagState: {
+ if (cc == '>') {
+ notImplemented();
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ m_state = BeforeAttributeNameState;
+ continue;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BogusCommentState: {
+ notImplemented();
+ m_state = DataState;
+ break;
+ }
+ case MarkupDeclarationOpenState: {
+ DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
+ if (cc == '-') {
+ SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
+ if (result == SegmentedString::DidMatch) {
+ source.advanceAndASSERT('-');
+ source.advanceAndASSERT('-');
+ m_token->beginComment();
+ m_state = CommentStartState;
+ continue;
+ } else if (result == SegmentedString::NotEnoughCharacters)
+ return false; // We need to wait for more characters to arrive.
+ }
+ notImplemented();
+ break;
+ }
+ case CommentStartState: {
+ if (cc == '-')
+ m_state = CommentStartDashState;
+ else if (cc == '>') {
+ emitParseError();
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentStartDashState: {
+ if (cc == '-')
+ m_state = CommentEndState;
+ else if (cc == '>') {
+ emitParseError();
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ m_token->appendToComment('-');
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentState: {
+ if (cc == '-')
+ m_state = CommentEndDashState;
+ else
+ m_token->appendToComment(cc);
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentEndDashState: {
+ if (cc == '-')
+ m_state = CommentEndState;
+ else {
+ m_token->appendToComment('-');
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentEndState: {
+ if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
+ emitParseError();
+ m_token->appendToComment('-');
+ m_token->appendToComment('-');
+ m_token->appendToComment(cc);
+ m_state = CommentEndSpaceState;
+ } else if (cc == '!') {
+ emitParseError();
+ m_state = CommentEndBangState;
+ } else if (cc == '-') {
+ emitParseError();
+ m_token->appendToComment('-');
+ m_token->appendToComment(cc);
+ } else {
+ emitParseError();
+ m_token->appendToComment('-');
+ m_token->appendToComment('-');
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentEndBangState: {
+ if (cc == '-') {
+ m_token->appendToComment('-');
+ m_token->appendToComment('-');
+ m_token->appendToComment('!');
+ m_state = CommentEndDashState;
+ } else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ m_token->appendToComment('-');
+ m_token->appendToComment('-');
+ m_token->appendToComment('!');
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CommentEndSpaceState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_token->appendToComment(cc);
+ else if (cc == '-')
+ m_state = CommentEndDashState;
+ else if (cc == '>') {
+ emitCurrentToken();
+ m_state = DataState;
+ } else {
+ m_token->appendToComment(cc);
+ m_state = CommentState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPEState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeDOCTYPENameState;
+ else {
+ emitParseError();
+ m_state = BeforeDOCTYPENameState;
+ continue;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BeforeDOCTYPENameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc >= 'A' && cc <= 'Z') {
+ notImplemented();
+ m_state = DOCTYPENameState;
+ } else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ notImplemented();
+ m_state = DOCTYPENameState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPENameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = AfterDOCTYPENameState;
+ else if (cc == '>') {
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else if (cc >= 'A' && cc <= 'Z')
+ notImplemented();
+ else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterDOCTYPENameState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '>') {
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterDOCTYPEPublicKeywordState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeDOCTYPEPublicIdentifierState;
+ else if (cc == '"') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPEPublicIdentifierSingleQuotedState;
+ } else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BeforeDOCTYPEPublicIdentifierState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '"') {
+ notImplemented();
+ m_state = DOCTYPEPublicIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ notImplemented();
+ m_state = DOCTYPEPublicIdentifierSingleQuotedState;
+ } else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPEPublicIdentifierDoubleQuotedState: {
+ if (cc == '"')
+ m_state = AfterDOCTYPEPublicIdentifierState;
+ else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPEPublicIdentifierSingleQuotedState: {
+ if (cc == '\'')
+ m_state = AfterDOCTYPEPublicIdentifierState;
+ else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterDOCTYPEPublicIdentifierState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
+ else if (cc == '>') {
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else if (cc == '"') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierSingleQuotedState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BetweenDOCTYPEPublicAndSystemIdentifiersState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BetweenDOCTYPEPublicAndSystemIdentifiersState;
+ else if (cc == '>') {
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else if (cc == '"') {
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierSingleQuotedState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterDOCTYPESystemKeywordState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ m_state = BeforeDOCTYPESystemIdentifierState;
+ else if (cc == '"') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ emitParseError();
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierSingleQuotedState;
+ } else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BeforeDOCTYPESystemIdentifierState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '"') {
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierDoubleQuotedState;
+ } else if (cc == '\'') {
+ notImplemented();
+ m_state = DOCTYPESystemIdentifierSingleQuotedState;
+ } else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ notImplemented();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPESystemIdentifierDoubleQuotedState: {
+ if (cc == '"')
+ m_state = AfterDOCTYPESystemIdentifierState;
+ else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case DOCTYPESystemIdentifierSingleQuotedState: {
+ if (cc == '\'')
+ m_state = AfterDOCTYPESystemIdentifierState;
+ else if (cc == '>') {
+ emitParseError();
+ notImplemented();
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else
+ notImplemented();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case AfterDOCTYPESystemIdentifierState: {
+ if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
+ break;
+ else if (cc == '>') {
+ emitCurrentDoctypeToken();
+ m_state = DataState;
+ } else {
+ emitParseError();
+ m_state = BogusDOCTYPEState;
+ }
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case BogusDOCTYPEState: {
+ if (cc == '>')
+ emitCurrentDoctypeToken();
+ // FIXME: Handle EOF properly.
+ break;
+ }
+ case CDATASectionState: {
+ notImplemented();
+ break;
+ }
+ case TokenizingCharacterReferencesState: {
+ notImplemented();
+ break;
+ }
+ }
+ source.advance();
+ if (m_emitPending) {
+ m_emitPending = false;
+ return true;
+ }
+ }
+ // We've reached the end of the input stream. If we have a character
+ // token buffered, we should emit it.
+ return m_token->type() == HTML5Token::Character;
+}
+
+inline bool HTML5Lexer::temporaryBufferIs(const char*)
+{
+ notImplemented();
+ return true;
+}
+
+inline bool HTML5Lexer::isAppropriateEndTag()
+{
+ if (m_bufferedEndTagName.size() != m_appropriateEndTagName.length())
+ return false;
+ const UChar* appropriate = m_appropriateEndTagName.characters();
+ const UChar* actual = m_bufferedEndTagName.data();
+ // FIXME: Is there a higher-level function we should be calling here?
+ return !memcmp(appropriate, actual, m_bufferedEndTagName.size() * sizeof(UChar));
+}
+
+inline void HTML5Lexer::emitCharacter(UChar character)
+{
+ if (m_token->type() != HTML5Token::Character) {
+ m_token->beginCharacter(character);
+ return;
+ }
+ m_token->appendToCharacter(character);
+}
+
+inline void HTML5Lexer::emitParseError()
+{
+ notImplemented();
+}
+
+inline void HTML5Lexer::emitCurrentToken()
+{
+ m_emitPending = true;
+ if (m_token->type() == HTML5Token::StartTag)
+ m_appropriateEndTagName = m_token->name();
+}
+
+inline void HTML5Lexer::emitCurrentDoctypeToken()
+{
+}
+
+}