summaryrefslogtreecommitdiffstats
path: root/WebCore/html/HTMLTreeBuilder.cpp
diff options
context:
space:
mode:
authorKristian Monsen <kristianm@google.com>2010-06-28 16:42:48 +0100
committerKristian Monsen <kristianm@google.com>2010-07-02 10:29:56 +0100
commit06ea8e899e48f1f2f396b70e63fae369f2f23232 (patch)
tree20c1428cd05c76f32394ab354ea35ed99acd86d8 /WebCore/html/HTMLTreeBuilder.cpp
parent72aad67af14193199e29cdd5c4ddc095a8b9a8a8 (diff)
downloadexternal_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.zip
external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.gz
external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.bz2
Merge WebKit at r61871: Initial merge by git.
Change-Id: I6cff43abca9cc4782e088a469ad4f03f166a65d5
Diffstat (limited to 'WebCore/html/HTMLTreeBuilder.cpp')
-rw-r--r--WebCore/html/HTMLTreeBuilder.cpp593
1 files changed, 593 insertions, 0 deletions
diff --git a/WebCore/html/HTMLTreeBuilder.cpp b/WebCore/html/HTMLTreeBuilder.cpp
new file mode 100644
index 0000000..6e40fd7
--- /dev/null
+++ b/WebCore/html/HTMLTreeBuilder.cpp
@@ -0,0 +1,593 @@
+/*
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "HTMLTreeBuilder.h"
+
+#include "DocumentFragment.h"
+#include "Element.h"
+#include "Frame.h"
+#include "HTMLTokenizer.h"
+#include "HTMLToken.h"
+#include "HTMLDocument.h"
+#include "LegacyHTMLDocumentParser.h"
+#include "HTMLNames.h"
+#include "LegacyHTMLTreeBuilder.h"
+#include "NotImplemented.h"
+#include "ScriptController.h"
+#include <wtf/UnusedParam.h>
+
+namespace WebCore {
+
+using namespace HTMLNames;
+
+static const int uninitializedLineNumberValue = -1; // Sentinel stored in the script start-line members while no line number has been recorded.
+
+namespace {
+
+// Returns true when |cc| is one of the five characters the tree builder
+// treats as whitespace: tab, line feed, form feed, carriage return or space.
+inline bool isTreeBuilderWhiteSpace(UChar cc)
+{
+    switch (cc) {
+    case '\t':
+    case '\x0A':
+    case '\x0C':
+    case '\x0D':
+    case ' ':
+        return true;
+    default:
+        return false;
+    }
+}
+
+} // namespace
+
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors) // Builder for a whole document.
+ : m_framesetOk(true)
+ , m_document(document)
+ , m_reportErrors(reportErrors)
+ , m_isPaused(false) // Starts unpaused; handleScriptEndTag() is what sets the pause.
+ , m_insertionMode(InitialMode)
+ , m_tokenizer(tokenizer)
+ , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(document, reportErrors)) // Tokens are forwarded to this legacy builder by passTokenToLegacyParser().
+ , m_lastScriptElementStartLine(uninitializedLineNumberValue) // No <script> start tag has been seen yet.
+ , m_scriptToProcessStartLine(uninitializedLineNumberValue) // No script is waiting to be taken by the caller.
+ , m_fragmentScriptingPermission(FragmentScriptingAllowed)
+{
+}
+
+// FIXME: Member variables should be grouped into self-initializing structs to
+// minimize code duplication between these constructors.
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission) // Builder for a document fragment.
+ : m_framesetOk(true)
+ , m_document(fragment->document()) // |fragment| is dereferenced here, so it must not be null.
+ , m_reportErrors(false) // FIXME: Why not report errors in fragments?
+ , m_isPaused(false)
+ , m_insertionMode(InitialMode)
+ , m_tokenizer(tokenizer)
+ , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(fragment, scriptingPermission)) // Tokens are forwarded to this legacy builder by passTokenToLegacyParser().
+ , m_lastScriptElementStartLine(uninitializedLineNumberValue) // No <script> start tag has been seen yet.
+ , m_scriptToProcessStartLine(uninitializedLineNumberValue) // No script is waiting to be taken by the caller.
+ , m_fragmentScriptingPermission(scriptingPermission) // Consulted when a </script> end tag is handled.
+{
+}
+
+HTMLTreeBuilder::~HTMLTreeBuilder()
+{ // Intentionally empty: no explicit cleanup is performed here.
+}
+
+// Fills |oldStyleToken| (the structure handed to the legacy tree builder)
+// from the new-style |token|. Only tag, comment and character tokens can be
+// converted; every other token type trips ASSERT_NOT_REACHED().
+static void convertToOldStyle(const AtomicHTMLToken& token, Token& oldStyleToken)
+{
+    switch (token.type()) {
+    case HTMLToken::Character:
+        // Text is represented as a pseudo tag named textAtom carrying the characters.
+        oldStyleToken.tagName = textAtom;
+        oldStyleToken.text = token.characters().impl();
+        return;
+    case HTMLToken::Comment:
+        // Comments are represented as a pseudo tag named commentAtom.
+        oldStyleToken.tagName = commentAtom;
+        oldStyleToken.text = token.comment().impl();
+        return;
+    case HTMLToken::StartTag:
+    case HTMLToken::EndTag:
+        // Start and end tags share one representation, told apart by beginTag.
+        oldStyleToken.beginTag = token.type() == HTMLToken::StartTag;
+        oldStyleToken.selfClosingTag = token.selfClosing();
+        oldStyleToken.tagName = token.name();
+        oldStyleToken.attrs = token.attributes();
+        return;
+    case HTMLToken::EndOfFile:
+        ASSERT_NOT_REACHED();
+        notImplemented();
+        return;
+    case HTMLToken::Uninitialized:
+    case HTMLToken::DOCTYPE:
+        ASSERT_NOT_REACHED();
+        return;
+    }
+}
+
+void HTMLTreeBuilder::handleScriptStartTag()
+{
+ notImplemented(); // The HTML fragment case is not handled yet.
+ m_tokenizer->setState(HTMLTokenizer::ScriptDataState); // Switch the tokenizer to the script data state.
+ notImplemented(); // Saving the insertion mode is not done yet.
+}
+
+// Records |scriptElement| as the script the caller has to process and pauses
+// the tree builder until the caller collects it via takeScriptToProcess().
+void HTMLTreeBuilder::handleScriptEndTag(Element* scriptElement, int scriptStartLine)
+{
+    // A script that is still pending at this point means the caller never
+    // called takeScriptToProcess() for the previous one.
+    ASSERT(!m_scriptToProcess);
+    ASSERT(m_scriptToProcessStartLine == uninitializedLineNumberValue);
+    notImplemented(); // Save insertion mode and insertion point?
+
+    // The lexer counts lines from 0 while ScriptSourceCode expects them to be
+    // counted from 1, so shift by one before the value reaches HTMLScriptRunner.
+    m_scriptToProcessStartLine = scriptStartLine + 1;
+    m_scriptToProcess = scriptElement;
+
+    // Stop parsing until the caller has dealt with the script.
+    m_isPaused = true;
+}
+
+// Hands the pending script element to the caller, reports its start line
+// through |scriptStartLine| and clears the pending state.
+PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine)
+{
+    // Pass the recorded line number out and reset our copy to the sentinel.
+    scriptStartLine = m_scriptToProcessStartLine;
+    m_scriptToProcessStartLine = uninitializedLineNumberValue;
+
+    // The HTML5 spec is written as though scripts run inside the tree builder.
+    // We instead pause to leave the tree builder and resume right before the
+    // script runs; callers may pause us again while processing the script.
+    m_isPaused = false;
+
+    return m_scriptToProcess.release();
+}
+
+// Returns the tokenizer state to use after a start tag named |tagName|.
+// Tags with no special content model leave |state| unchanged.
+HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame)
+{
+    if (tagName == plaintextTag)
+        return HTMLTokenizer::PLAINTEXTState;
+
+    if (tagName == titleTag || tagName == textareaTag)
+        return HTMLTokenizer::RCDATAState;
+
+    // <noscript> is raw text only while scripting is enabled for the frame.
+    if (tagName == noscriptTag)
+        return isScriptingFlagEnabled(frame) ? HTMLTokenizer::RAWTEXTState : state;
+
+    const bool isRawTextTag = tagName == styleTag
+        || tagName == iframeTag
+        || tagName == xmpTag
+        || tagName == noembedTag
+        || tagName == noframesTag;
+    return isRawTextTag ? HTMLTokenizer::RAWTEXTState : state;
+}
+
+// Feeds |token| to the legacy tree builder and performs the tokenizer and
+// script bookkeeping around it. Returns the node produced by the legacy
+// builder, or 0 for DOCTYPE tokens.
+PassRefPtr<Node> HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token)
+{
+    // DOCTYPE tokens have a dedicated legacy structure and never yield a node.
+    if (token.type() == HTMLToken::DOCTYPE) {
+        DoctypeToken legacyDoctype;
+        legacyDoctype.m_name.append(token.name().data(), token.name().size());
+        legacyDoctype.m_publicID = token.publicIdentifier();
+        legacyDoctype.m_systemID = token.systemIdentifier();
+        legacyDoctype.m_forceQuirks = token.forceQuirks();
+
+        m_legacyTreeBuilder->parseDoctypeToken(&legacyDoctype);
+        return 0;
+    }
+
+    // For now, every other token is translated into an old-style token for testing.
+    Token oldStyleToken;
+    AtomicHTMLToken atomicToken(token);
+    convertToOldStyle(atomicToken, oldStyleToken);
+
+    RefPtr<Node> result = m_legacyTreeBuilder->parseToken(&oldStyleToken);
+
+    switch (token.type()) {
+    case HTMLToken::StartTag:
+        // This work is supposed to be done by the parser, but while the old
+        // parser is in use we have to do it manually.
+        if (oldStyleToken.tagName == scriptTag) {
+            handleScriptStartTag();
+            m_lastScriptElement = static_pointer_cast<Element>(result);
+            m_lastScriptElementStartLine = m_tokenizer->lineNumber();
+        } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag)
+            m_tokenizer->skipLeadingNewLineForListing();
+        else
+            m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame()));
+        break;
+    case HTMLToken::EndTag: {
+        if (oldStyleToken.tagName == framesetTag) {
+            setInsertionMode(AfterFramesetMode);
+            break;
+        }
+
+        // Only a </script> that closes a <script> we recorded needs more work.
+        const bool isScriptEndTag = oldStyleToken.tagName == scriptTag;
+        if (!isScriptEndTag || !m_lastScriptElement)
+            break;
+
+        ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue);
+        if (m_fragmentScriptingPermission == FragmentScriptingNotAllowed) {
+            // FIXME: This is a horrible hack for platform/Pasteboard.
+            // Clear the <script> tag when using the Parser to create
+            // a DocumentFragment for pasting so that javascript content
+            // does not show up in pasted HTML.
+            m_lastScriptElement->removeChildren();
+        } else if (insertionMode() != AfterFramesetMode)
+            handleScriptEndTag(m_lastScriptElement.get(), m_lastScriptElementStartLine);
+
+        // Forget the script element whether or not it was handed off.
+        m_lastScriptElement = 0;
+        m_lastScriptElementStartLine = uninitializedLineNumberValue;
+        break;
+    }
+    default:
+        break;
+    }
+
+    return result.release();
+}
+
+// Entry point for tokens coming from the tokenizer. Currently every token is
+// routed through the legacy parser; the code after the first return is the
+// not-yet-enabled new tree builder path.
+PassRefPtr<Node> HTMLTreeBuilder::constructTreeFromToken(HTMLToken& rawToken)
+{
+    // The always-true condition makes MSVC ignore the unreachable code below.
+    if (true)
+        return passTokenToLegacyParser(rawToken);
+
+    AtomicHTMLToken token(rawToken);
+
+    if (token.type() != HTMLToken::Character)
+        return processToken(token);
+
+    // HTML5 expects the tokenizer to call the parser once per emitted
+    // character, whereas we receive characters in batches. To keep the
+    // first-pass parser simple, processToken follows the spec and handles a
+    // single character per call, so the batch is unrolled here.
+    // FIXME: Calling processToken for each character is probably slow.
+    StringImpl* characters = token.characters().impl();
+    unsigned index = 0;
+    while (index < characters->length()) {
+        processToken(token, (*characters)[index]);
+        ++index;
+    }
+    return 0; // FIXME: Should we be returning the Text node?
+}
+
+// Processes |token| according to the current insertion mode, following the
+// HTML5 tree construction rules for the modes up to and including "after
+// head". For Character tokens |cc| is the single character being processed
+// (constructTreeFromToken calls this once per character). Returns the node
+// that was inserted, or 0 when nothing was inserted. When a mode decides the
+// token belongs to a later mode, it switches mode and jumps back to
+// reprocessToken. The remaining insertion modes are not implemented yet.
+PassRefPtr<Node> HTMLTreeBuilder::processToken(AtomicHTMLToken& token, UChar cc)
+{
+reprocessToken:
+    switch (insertionMode()) {
+    case InitialMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::DOCTYPE:
+            return insertDoctype(token);
+        case HTMLToken::Comment:
+            return insertComment(token);
+        case HTMLToken::Character:
+            if (isTreeBuilderWhiteSpace(cc))
+                return 0;
+            break;
+        case HTMLToken::StartTag:
+        case HTMLToken::EndTag:
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        notImplemented();
+        parseError(token);
+        setInsertionMode(BeforeHTMLMode);
+        goto reprocessToken;
+    }
+    case BeforeHTMLMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::DOCTYPE:
+            parseError(token);
+            return 0;
+        case HTMLToken::Comment:
+            return insertComment(token);
+        case HTMLToken::Character:
+            if (isTreeBuilderWhiteSpace(cc))
+                return 0;
+            break;
+        case HTMLToken::StartTag:
+            if (token.name() == htmlTag) {
+                notImplemented();
+                setInsertionMode(BeforeHeadMode);
+                return 0;
+            }
+            break;
+        case HTMLToken::EndTag:
+            if (token.name() == headTag || token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag)
+                break;
+            parseError(token);
+            return 0;
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        notImplemented();
+        setInsertionMode(BeforeHeadMode);
+        goto reprocessToken;
+    }
+    case BeforeHeadMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::Character:
+            if (isTreeBuilderWhiteSpace(cc))
+                return 0;
+            break;
+        case HTMLToken::Comment:
+            return insertComment(token);
+        case HTMLToken::DOCTYPE:
+            parseError(token);
+            return 0;
+        case HTMLToken::StartTag:
+            if (token.name() == htmlTag) {
+                notImplemented();
+                return 0;
+            }
+            if (token.name() == headTag) {
+                m_headElement = insertElement(token);
+                setInsertionMode(InHeadMode);
+                return m_headElement;
+            }
+            break;
+        case HTMLToken::EndTag:
+            // </head>, </body>, </html> and </br> imply a <head> start tag
+            // (handled after the switch); any other end tag is ignored.
+            if (token.name() == headTag || token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag)
+                break;
+            parseError(token);
+            return 0;
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        // Act as if a <head> start tag had been seen, then reprocess the token.
+        AtomicHTMLToken fakeHead(HTMLToken::StartTag, headTag.localName());
+        processToken(fakeHead);
+        goto reprocessToken;
+    }
+    case InHeadMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::Character:
+            // Only whitespace is inserted while in <head>. Any other
+            // character closes the head (fake </head> below) and is then
+            // reprocessed in the next mode.
+            if (isTreeBuilderWhiteSpace(cc)) {
+                insertCharacter(cc);
+                return 0;
+            }
+            break;
+        case HTMLToken::Comment:
+            return insertComment(token);
+        case HTMLToken::DOCTYPE:
+            parseError(token);
+            return 0;
+        case HTMLToken::StartTag:
+            if (token.name() == htmlTag) {
+                notImplemented();
+                return 0;
+            }
+            // FIXME: Atomize "command".
+            if (token.name() == baseTag || token.name() == "command" || token.name() == linkTag) {
+                PassRefPtr<Node> node = insertElement(token);
+                m_openElements.pop();
+                notImplemented();
+                return node;
+            }
+            if (token.name() == metaTag) {
+                PassRefPtr<Node> node = insertElement(token);
+                m_openElements.pop();
+                notImplemented();
+                return node;
+            }
+            if (token.name() == titleTag)
+                return insertGenericRCDATAElement(token);
+            if (token.name() == noscriptTag) {
+                if (isScriptingFlagEnabled(m_document->frame()))
+                    return insertGenericRawTextElement(token);
+                PassRefPtr<Node> node = insertElement(token);
+                setInsertionMode(InHeadNoscriptMode);
+                return node;
+            }
+            if (token.name() == noframesTag || token.name() == styleTag)
+                return insertGenericRawTextElement(token);
+            if (token.name() == scriptTag)
+                return insertScriptElement(token);
+            if (token.name() == headTag) {
+                notImplemented();
+                return 0;
+            }
+            break;
+        case HTMLToken::EndTag:
+            if (token.name() == headTag) {
+                ASSERT(m_openElements.top()->tagQName() == headTag);
+                m_openElements.pop();
+                setInsertionMode(AfterHeadMode);
+                return 0;
+            }
+            if (token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag)
+                break;
+            parseError(token);
+            return 0;
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        // Act as if a </head> end tag had been seen, then reprocess the token.
+        AtomicHTMLToken fakeHead(HTMLToken::EndTag, headTag.localName());
+        processToken(fakeHead);
+        goto reprocessToken;
+    }
+    case InHeadNoscriptMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::DOCTYPE:
+            parseError(token);
+            return 0;
+        case HTMLToken::StartTag:
+            if (token.name() == htmlTag) {
+                notImplemented();
+                return 0;
+            }
+            if (token.name() == linkTag || token.name() == metaTag || token.name() == noframesTag || token.name() == styleTag) {
+                notImplemented();
+                return 0;
+            }
+            // A nested <head> or <noscript> start tag is a parse error and is
+            // ignored (<html> was already handled above).
+            if (token.name() == headTag || token.name() == noscriptTag) {
+                parseError(token);
+                return 0;
+            }
+            break;
+        case HTMLToken::EndTag:
+            if (token.name() == noscriptTag) {
+                ASSERT(m_openElements.top()->tagQName() == noscriptTag);
+                m_openElements.pop();
+                ASSERT(m_openElements.top()->tagQName() == headTag);
+                setInsertionMode(InHeadMode);
+                return 0;
+            }
+            if (token.name() == brTag)
+                break;
+            parseError(token);
+            return 0;
+        case HTMLToken::Character:
+            notImplemented();
+            break;
+        case HTMLToken::Comment:
+            notImplemented();
+            return 0;
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        // Act as if a </noscript> end tag had been seen, then reprocess the token.
+        AtomicHTMLToken fakeNoscript(HTMLToken::EndTag, noscriptTag.localName());
+        processToken(fakeNoscript);
+        goto reprocessToken;
+    }
+    case AfterHeadMode: {
+        switch (token.type()) {
+        case HTMLToken::Uninitialized:
+            ASSERT_NOT_REACHED();
+            break;
+        case HTMLToken::Character:
+            if (isTreeBuilderWhiteSpace(cc)) {
+                insertCharacter(cc);
+                return 0;
+            }
+            break;
+        case HTMLToken::Comment:
+            return insertComment(token);
+        case HTMLToken::DOCTYPE:
+            parseError(token);
+            return 0;
+        case HTMLToken::StartTag:
+            if (token.name() == htmlTag) {
+                notImplemented();
+                return 0;
+            }
+            if (token.name() == bodyTag) {
+                m_framesetOk = false;
+                PassRefPtr<Node> node = insertElement(token);
+                // Leaving this mode is required: the implied-<body> path
+                // below reprocesses the token and would otherwise loop
+                // forever in AfterHeadMode.
+                setInsertionMode(InBodyMode);
+                return node;
+            }
+            if (token.name() == framesetTag) {
+                PassRefPtr<Node> node = insertElement(token);
+                setInsertionMode(InFramesetMode);
+                return node;
+            }
+            if (token.name() == baseTag || token.name() == linkTag || token.name() == metaTag || token.name() == noframesTag || token.name() == scriptTag || token.name() == styleTag || token.name() == titleTag) {
+                parseError(token);
+                ASSERT(m_headElement);
+                // Temporarily re-open <head> so the token can be handled with
+                // the "in head" rules (not implemented yet).
+                m_openElements.push(m_headElement.get());
+                notImplemented();
+                m_openElements.remove(m_headElement.get());
+                // The token is fully handled here; it must not fall through
+                // to the implied <body> below.
+                return 0;
+            }
+            if (token.name() == headTag) {
+                parseError(token);
+                return 0;
+            }
+            break;
+        case HTMLToken::EndTag:
+            if (token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag)
+                break;
+            parseError(token);
+            return 0;
+        case HTMLToken::EndOfFile:
+            break;
+        }
+        // Act as if a <body> start tag had been seen, restore the frameset-ok
+        // flag that the fake tag cleared, then reprocess the token.
+        AtomicHTMLToken fakeBody(HTMLToken::StartTag, bodyTag.localName());
+        processToken(fakeBody);
+        m_framesetOk = true;
+        goto reprocessToken;
+    }
+    case InBodyMode:
+    case TextMode:
+    case InTableMode:
+    case InTableTextMode:
+    case InCaptionMode:
+    case InColumnGroupMode:
+    case InTableBodyMode:
+    case InRowMode:
+    case InCellMode:
+    case InSelectMode:
+    case InSelectInTableMode:
+    case InForeignContentMode:
+    case AfterBodyMode:
+    case InFramesetMode:
+    case AfterFramesetMode:
+    case AfterAfterBodyMode:
+    case AfterAfterFramesetMode:
+        ASSERT_NOT_REACHED();
+    }
+
+    // Implementation coming in the next patch.
+    return 0;
+}
+
+PassRefPtr<Node> HTMLTreeBuilder::insertDoctype(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::DOCTYPE); // Stub: only asserts that a DOCTYPE token was passed.
+ return 0; // No node is created yet.
+}
+
+PassRefPtr<Node> HTMLTreeBuilder::insertComment(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::Comment); // Stub: only asserts that a Comment token was passed.
+ return 0; // No node is created yet.
+}
+
+PassRefPtr<Element> HTMLTreeBuilder::insertElement(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); // Stub: only asserts that a StartTag token was passed.
+ return 0; // No element is created yet.
+}
+
+void HTMLTreeBuilder::insertCharacter(UChar cc)
+{
+ ASSERT_UNUSED(cc, cc); // Stub: only asserts that the character is non-zero; nothing is inserted yet.
+}
+
+PassRefPtr<Node> HTMLTreeBuilder::insertGenericRCDATAElement(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); // Stub: only asserts that a StartTag token was passed.
+ return 0; // No node is created yet.
+}
+
+PassRefPtr<Node> HTMLTreeBuilder::insertGenericRawTextElement(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); // Stub: only asserts that a StartTag token was passed.
+ return 0; // No node is created yet.
+}
+
+PassRefPtr<Node> HTMLTreeBuilder::insertScriptElement(AtomicHTMLToken& token)
+{
+ ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); // Stub: only asserts that a StartTag token was passed.
+ return 0; // No node is created yet.
+}
+
+void HTMLTreeBuilder::finished()
+{
+ // We would call m_document->finishedParsing() here ourselves, but
+ // m_legacyTreeBuilder->finished() already does it for us.
+ m_legacyTreeBuilder->finished();
+}
+
+// Returns whether scripts can execute in |frame|. A null frame, or a frame
+// without a script controller, counts as scripting being disabled.
+bool HTMLTreeBuilder::isScriptingFlagEnabled(Frame* frame)
+{
+    ScriptController* scriptController = frame ? frame->script() : 0;
+    return scriptController && scriptController->canExecuteScripts(NotAboutToExecuteScript);
+}
+
+}