diff options
author | Kristian Monsen <kristianm@google.com> | 2010-06-28 16:42:48 +0100 |
---|---|---|
committer | Kristian Monsen <kristianm@google.com> | 2010-07-02 10:29:56 +0100 |
commit | 06ea8e899e48f1f2f396b70e63fae369f2f23232 (patch) | |
tree | 20c1428cd05c76f32394ab354ea35ed99acd86d8 /WebCore/html/HTMLTreeBuilder.cpp | |
parent | 72aad67af14193199e29cdd5c4ddc095a8b9a8a8 (diff) | |
download | external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.zip external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.gz external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.bz2 |
Merge WebKit at r61871: Initial merge by git.
Change-Id: I6cff43abca9cc4782e088a469ad4f03f166a65d5
Diffstat (limited to 'WebCore/html/HTMLTreeBuilder.cpp')
-rw-r--r-- | WebCore/html/HTMLTreeBuilder.cpp | 593 |
1 files changed, 593 insertions, 0 deletions
diff --git a/WebCore/html/HTMLTreeBuilder.cpp b/WebCore/html/HTMLTreeBuilder.cpp new file mode 100644 index 0000000..6e40fd7 --- /dev/null +++ b/WebCore/html/HTMLTreeBuilder.cpp @@ -0,0 +1,593 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLTreeBuilder.h" + +#include "DocumentFragment.h" +#include "Element.h" +#include "Frame.h" +#include "HTMLTokenizer.h" +#include "HTMLToken.h" +#include "HTMLDocument.h" +#include "LegacyHTMLDocumentParser.h" +#include "HTMLNames.h" +#include "LegacyHTMLTreeBuilder.h" +#include "NotImplemented.h" +#include "ScriptController.h" +#include <wtf/UnusedParam.h> + +namespace WebCore { + +using namespace HTMLNames; + +static const int uninitializedLineNumberValue = -1; + +namespace { + +inline bool isTreeBuilderWhiteSpace(UChar cc) +{ + return cc == '\t' || cc == '\x0A' || cc == '\x0C' || cc == '\x0D' || cc == ' '; +} + +} // namespace + +HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors) + : m_framesetOk(true) + , m_document(document) + , m_reportErrors(reportErrors) + , m_isPaused(false) + , m_insertionMode(InitialMode) + , m_tokenizer(tokenizer) + , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(document, reportErrors)) + , m_lastScriptElementStartLine(uninitializedLineNumberValue) + , m_scriptToProcessStartLine(uninitializedLineNumberValue) + , m_fragmentScriptingPermission(FragmentScriptingAllowed) +{ +} + +// FIXME: Member variables should be grouped into self-initializing structs to +// minimize code duplication between these constructors. +HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission) + : m_framesetOk(true) + , m_document(fragment->document()) + , m_reportErrors(false) // FIXME: Why not report errors in fragments? + , m_isPaused(false) + , m_insertionMode(InitialMode) + , m_tokenizer(tokenizer) + , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(fragment, scriptingPermission)) + , m_lastScriptElementStartLine(uninitializedLineNumberValue) + , m_scriptToProcessStartLine(uninitializedLineNumberValue) + , m_fragmentScriptingPermission(scriptingPermission) +{ +} + +HTMLTreeBuilder::~HTMLTreeBuilder() +{ +} + +static void convertToOldStyle(const AtomicHTMLToken& token, Token& oldStyleToken) +{ + switch (token.type()) { + case HTMLToken::Uninitialized: + case HTMLToken::DOCTYPE: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::EndOfFile: + ASSERT_NOT_REACHED(); + notImplemented(); + break; + case HTMLToken::StartTag: + case HTMLToken::EndTag: { + oldStyleToken.beginTag = (token.type() == HTMLToken::StartTag); + oldStyleToken.selfClosingTag = token.selfClosing(); + oldStyleToken.tagName = token.name(); + oldStyleToken.attrs = token.attributes(); + break; + } + case HTMLToken::Comment: + oldStyleToken.tagName = commentAtom; + oldStyleToken.text = token.comment().impl(); + break; + case HTMLToken::Character: + oldStyleToken.tagName = textAtom; + oldStyleToken.text = token.characters().impl(); + break; + } +} + +void HTMLTreeBuilder::handleScriptStartTag() +{ + notImplemented(); // The HTML frgment case? + m_tokenizer->setState(HTMLTokenizer::ScriptDataState); + notImplemented(); // Save insertion mode. +} + +void HTMLTreeBuilder::handleScriptEndTag(Element* scriptElement, int scriptStartLine) +{ + ASSERT(!m_scriptToProcess); // Caller never called takeScriptToProcess! + ASSERT(m_scriptToProcessStartLine == uninitializedLineNumberValue); // Caller never called takeScriptToProcess! + notImplemented(); // Save insertion mode and insertion point? + + // Pause ourselves so that parsing stops until the script can be processed by the caller. + m_isPaused = true; + m_scriptToProcess = scriptElement; + // Lexer line numbers are 0-based, ScriptSourceCode expects 1-based lines, + // so we convert here before passing the line number off to HTMLScriptRunner. + m_scriptToProcessStartLine = scriptStartLine + 1; +} + +PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine) +{ + // Unpause ourselves, callers may pause us again when processing the script. + // The HTML5 spec is written as though scripts are executed inside the tree + // builder. We pause the parser to exit the tree builder, and then resume + // before running scripts. + m_isPaused = false; + scriptStartLine = m_scriptToProcessStartLine; + m_scriptToProcessStartLine = uninitializedLineNumberValue; + return m_scriptToProcess.release(); +} + +HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame) +{ + if (tagName == textareaTag || tagName == titleTag) + return HTMLTokenizer::RCDATAState; + + if (tagName == styleTag || tagName == iframeTag || tagName == xmpTag || tagName == noembedTag + || tagName == noframesTag || (tagName == noscriptTag && isScriptingFlagEnabled(frame))) + return HTMLTokenizer::RAWTEXTState; + + if (tagName == plaintextTag) + return HTMLTokenizer::PLAINTEXTState; + + return state; +} + +PassRefPtr<Node> HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token) +{ + if (token.type() == HTMLToken::DOCTYPE) { + DoctypeToken doctypeToken; + doctypeToken.m_name.append(token.name().data(), token.name().size()); + doctypeToken.m_publicID = token.publicIdentifier(); + doctypeToken.m_systemID = token.systemIdentifier(); + doctypeToken.m_forceQuirks = token.forceQuirks(); + + m_legacyTreeBuilder->parseDoctypeToken(&doctypeToken); + return 0; + } + + // For now, we translate into an old-style token for testing. + Token oldStyleToken; + AtomicHTMLToken atomicToken(token); + convertToOldStyle(atomicToken, oldStyleToken); + + RefPtr<Node> result = m_legacyTreeBuilder->parseToken(&oldStyleToken); + if (token.type() == HTMLToken::StartTag) { + // This work is supposed to be done by the parser, but + // when using the old parser for we have to do this manually. + if (oldStyleToken.tagName == scriptTag) { + handleScriptStartTag(); + m_lastScriptElement = static_pointer_cast<Element>(result); + m_lastScriptElementStartLine = m_tokenizer->lineNumber(); + } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag) + m_tokenizer->skipLeadingNewLineForListing(); + else + m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame())); + } else if (token.type() == HTMLToken::EndTag) { + if (oldStyleToken.tagName == scriptTag) { + if (m_lastScriptElement) { + ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue); + if (m_fragmentScriptingPermission == FragmentScriptingNotAllowed) { + // FIXME: This is a horrible hack for platform/Pasteboard. + // Clear the <script> tag when using the Parser to create + // a DocumentFragment for pasting so that javascript content + // does not show up in pasted HTML. + m_lastScriptElement->removeChildren(); + } else if (insertionMode() != AfterFramesetMode) + handleScriptEndTag(m_lastScriptElement.get(), m_lastScriptElementStartLine); + m_lastScriptElement = 0; + m_lastScriptElementStartLine = uninitializedLineNumberValue; + } + } else if (oldStyleToken.tagName == framesetTag) + setInsertionMode(AfterFramesetMode); + } + return result.release(); +} + +PassRefPtr<Node> HTMLTreeBuilder::constructTreeFromToken(HTMLToken& rawToken) +{ + // Make MSVC ignore our unreachable code for now. + if (true) + return passTokenToLegacyParser(rawToken); + + AtomicHTMLToken token(rawToken); + + // HTML5 expects the tokenizer to call the parser every time a character is + // emitted. We instead collect characters and call the parser with a batch. + // In order to make our first-pass parser code simple, processToken matches + // the spec in only handling one character at a time. + if (token.type() == HTMLToken::Character) { + StringImpl* characters = token.characters().impl(); + // FIXME: Calling processToken for each character is probably slow. + for (unsigned i = 0; i < characters->length(); ++i) + processToken(token, (*characters)[i]); + return 0; // FIXME: Should we be returning the Text node? + } + return processToken(token); +} + +PassRefPtr<Node> HTMLTreeBuilder::processToken(AtomicHTMLToken& token, UChar cc) +{ +reprocessToken: + switch (insertionMode()) { + case InitialMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::DOCTYPE: + return insertDoctype(token); + case HTMLToken::Comment: + return insertComment(token); + case HTMLToken::Character: + if (isTreeBuilderWhiteSpace(cc)) + return 0; + break; + case HTMLToken::StartTag: + case HTMLToken::EndTag: + case HTMLToken::EndOfFile: + break; + } + notImplemented(); + parseError(token); + setInsertionMode(BeforeHTMLMode); + goto reprocessToken; + } + case BeforeHTMLMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::DOCTYPE: + parseError(token); + return 0; + case HTMLToken::Comment: + return insertComment(token); + case HTMLToken::Character: + if (isTreeBuilderWhiteSpace(cc)) + return 0; + break; + case HTMLToken::StartTag: + if (token.name() == htmlTag) { + notImplemented(); + setInsertionMode(BeforeHeadMode); + return 0; + } + break; + case HTMLToken::EndTag: + if (token.name() == headTag || token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag) + break; + parseError(token); + return 0; + case HTMLToken::EndOfFile: + break; + } + notImplemented(); + setInsertionMode(BeforeHeadMode); + goto reprocessToken; + } + case BeforeHeadMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::Character: + if (isTreeBuilderWhiteSpace(cc)) + return 0; + break; + case HTMLToken::Comment: + return insertComment(token); + case HTMLToken::DOCTYPE: + parseError(token); + return 0; + case HTMLToken::StartTag: + if (token.name() == htmlTag) { + notImplemented(); + return 0; + } + if (token.name() == headTag) { + m_headElement = insertElement(token); + setInsertionMode(InHeadMode); + return m_headElement; + } + break; + case HTMLToken::EndTag: + if (token.name() == headTag || token.name() == bodyTag || token.name() == brTag) { + AtomicHTMLToken fakeHead(HTMLToken::StartTag, headTag.localName()); + processToken(fakeHead); + goto reprocessToken; + } + parseError(token); + return 0; + case HTMLToken::EndOfFile: + break; + } + AtomicHTMLToken fakeHead(HTMLToken::StartTag, headTag.localName()); + processToken(fakeHead); + goto reprocessToken; + } + case InHeadMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::Character: + insertCharacter(cc); + break; + case HTMLToken::Comment: + return insertComment(token); + case HTMLToken::DOCTYPE: + parseError(token); + return 0; + case HTMLToken::StartTag: + if (token.name() == htmlTag) { + notImplemented(); + return 0; + } + // FIXME: Atomize "command". + if (token.name() == baseTag || token.name() == "command" || token.name() == linkTag) { + PassRefPtr<Node> node = insertElement(token); + m_openElements.pop(); + notImplemented(); + return node; + } + if (token.name() == metaTag) { + PassRefPtr<Node> node = insertElement(token); + m_openElements.pop(); + notImplemented(); + return node; + } + if (token.name() == titleTag) + return insertGenericRCDATAElement(token); + if (token.name() == noscriptTag) { + if (isScriptingFlagEnabled(m_document->frame())) + return insertGenericRawTextElement(token); + PassRefPtr<Node> node = insertElement(token); + setInsertionMode(InHeadNoscriptMode); + return node; + } + if (token.name() == noframesTag || token.name() == styleTag) + return insertGenericRawTextElement(token); + if (token.name() == scriptTag) + return insertScriptElement(token); + if (token.name() == headTag) { + notImplemented(); + return 0; + } + break; + case HTMLToken::EndTag: + if (token.name() == headTag) { + ASSERT(m_openElements.top()->tagQName() == headTag); + m_openElements.pop(); + setInsertionMode(AfterHeadMode); + return 0; + } + if (token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag) + break; + parseError(token); + return 0; + break; + case HTMLToken::EndOfFile: + break; + } + AtomicHTMLToken fakeHead(HTMLToken::EndTag, headTag.localName()); + processToken(fakeHead); + goto reprocessToken; + } + case InHeadNoscriptMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::DOCTYPE: + parseError(token); + return 0; + case HTMLToken::StartTag: + if (token.name() == htmlTag) { + notImplemented(); + return 0; + } + if (token.name() == linkTag || token.name() == metaTag || token.name() == noframesTag || token.name() == styleTag) { + notImplemented(); + return 0; + } + if (token.name() == htmlTag || token.name() == noscriptTag) { + parseError(token); + return 0; + } + break; + case HTMLToken::EndTag: + if (token.name() == noscriptTag) { + ASSERT(m_openElements.top()->tagQName() == noscriptTag); + m_openElements.pop(); + ASSERT(m_openElements.top()->tagQName() == headTag); + setInsertionMode(InHeadMode); + return 0; + } + if (token.name() == brTag) + break; + parseError(token); + return 0; + case HTMLToken::Character: + notImplemented(); + break; + case HTMLToken::Comment: + notImplemented(); + return 0; + case HTMLToken::EndOfFile: + break; + } + AtomicHTMLToken fakeNoscript(HTMLToken::EndTag, noscriptTag.localName()); + processToken(fakeNoscript); + goto reprocessToken; + } + case AfterHeadMode: { + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::Character: + if (isTreeBuilderWhiteSpace(cc)) { + insertCharacter(cc); + return 0; + } + break; + case HTMLToken::Comment: + return insertComment(token); + case HTMLToken::DOCTYPE: + parseError(token); + return 0; + case HTMLToken::StartTag: + if (token.name() == htmlTag) { + notImplemented(); + return 0; + } + if (token.name() == bodyTag) { + m_framesetOk = false; + return insertElement(token); + } + if (token.name() == framesetTag) { + PassRefPtr<Node> node = insertElement(token); + setInsertionMode(InFramesetMode); + return node; + } + if (token.name() == baseTag || token.name() == linkTag || token.name() == metaTag || token.name() == noframesTag || token.name() == scriptTag || token.name() == styleTag || token.name() == titleTag) { + parseError(token); + ASSERT(m_headElement); + m_openElements.push(m_headElement.get()); + notImplemented(); + m_openElements.remove(m_headElement.get()); + } + if (token.name() == headTag) { + parseError(token); + return 0; + } + break; + case HTMLToken::EndTag: + if (token.name() == bodyTag || token.name() == htmlTag || token.name() == brTag) + break; + parseError(token); + return 0; + case HTMLToken::EndOfFile: + break; + } + AtomicHTMLToken fakeBody(HTMLToken::StartTag, bodyTag.localName()); + processToken(fakeBody); + m_framesetOk = true; + goto reprocessToken; + } + case InBodyMode: + case TextMode: + case InTableMode: + case InTableTextMode: + case InCaptionMode: + case InColumnGroupMode: + case InTableBodyMode: + case InRowMode: + case InCellMode: + case InSelectMode: + case InSelectInTableMode: + case InForeignContentMode: + case AfterBodyMode: + case InFramesetMode: + case AfterFramesetMode: + case AfterAfterBodyMode: + case AfterAfterFramesetMode: + ASSERT_NOT_REACHED(); + } + + // Implementation coming in the next patch. + return 0; +} + +PassRefPtr<Node> HTMLTreeBuilder::insertDoctype(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::DOCTYPE); + return 0; +} + +PassRefPtr<Node> HTMLTreeBuilder::insertComment(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::Comment); + return 0; +} + +PassRefPtr<Element> HTMLTreeBuilder::insertElement(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); + return 0; +} + +void HTMLTreeBuilder::insertCharacter(UChar cc) +{ + ASSERT_UNUSED(cc, cc); +} + +PassRefPtr<Node> HTMLTreeBuilder::insertGenericRCDATAElement(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); + return 0; +} + +PassRefPtr<Node> HTMLTreeBuilder::insertGenericRawTextElement(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); + return 0; +} + +PassRefPtr<Node> HTMLTreeBuilder::insertScriptElement(AtomicHTMLToken& token) +{ + ASSERT_UNUSED(token, token.type() == HTMLToken::StartTag); + return 0; +} + +void HTMLTreeBuilder::finished() +{ + // We should call m_document->finishedParsing() here, except + // m_legacyTreeBuilder->finished() does it for us. + m_legacyTreeBuilder->finished(); +} + +bool HTMLTreeBuilder::isScriptingFlagEnabled(Frame* frame) +{ + if (!frame) + return false; + if (ScriptController* scriptController = frame->script()) + return scriptController->canExecuteScripts(NotAboutToExecuteScript); + return false; +} + +} |