/* * Copyright (C) 2010 Google, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "HTMLTreeBuilder.h" #include "Comment.h" #include "DocumentFragment.h" #include "DocumentType.h" #include "Element.h" #include "Frame.h" #include "HTMLDocument.h" #include "HTMLElementFactory.h" #include "HTMLHtmlElement.h" #include "HTMLNames.h" #include "HTMLScriptElement.h" #include "HTMLToken.h" #include "HTMLTokenizer.h" #include "LegacyHTMLDocumentParser.h" #include "LegacyHTMLTreeBuilder.h" #include "NotImplemented.h" #if ENABLE(SVG) #include "SVGNames.h" #endif #include "ScriptController.h" #include "Settings.h" #include "Text.h" #include namespace WebCore { using namespace HTMLNames; static const int uninitializedLineNumberValue = -1; namespace { inline bool isTreeBuilderWhiteSpace(UChar cc) { return cc == '\t' || cc == '\x0A' || cc == '\x0C' || cc == '\x0D' || cc == ' '; } bool shouldUseLegacyTreeBuilder(Document* document) { return !document->settings() || !document->settings()->html5TreeBuilderEnabled(); } bool isNumberedHeaderTag(const AtomicString& tagName) { return tagName == h1Tag || tagName == h2Tag || tagName == h3Tag || tagName == h4Tag || tagName == h5Tag || tagName == h6Tag; } bool isTableBodyContextTag(const AtomicString& tagName) { return tagName == tbodyTag || tagName == tfootTag || tagName == theadTag; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#special bool isSpecialTag(const AtomicString& tagName) { return tagName == addressTag || tagName == articleTag || tagName == asideTag || tagName == baseTag || tagName == basefontTag || tagName == "bgsound" || tagName == blockquoteTag || tagName == bodyTag || tagName == brTag || tagName == buttonTag || tagName == centerTag || tagName == colTag || tagName == colgroupTag || tagName == "command" || tagName == ddTag || tagName == "details" || tagName == dirTag || tagName == divTag || tagName == dlTag || tagName == dtTag || tagName == embedTag || tagName == fieldsetTag || tagName == "figure" || tagName == footerTag || tagName == formTag || tagName == frameTag || tagName == framesetTag || isNumberedHeaderTag(tagName) || tagName == headTag || tagName == headerTag || tagName == hgroupTag || tagName == hrTag || tagName == iframeTag || tagName == imgTag || tagName == inputTag || tagName == isindexTag || tagName == liTag || tagName == linkTag || tagName == listingTag || tagName == menuTag || tagName == metaTag || tagName == navTag || tagName == noembedTag || tagName == noframesTag || tagName == noscriptTag || tagName == olTag || tagName == pTag || tagName == paramTag || tagName == plaintextTag || tagName == preTag || tagName == scriptTag || tagName == sectionTag || tagName == selectTag || tagName == styleTag || isTableBodyContextTag(tagName) || tagName == textareaTag || tagName == titleTag || tagName == trTag || tagName == ulTag || tagName == wbrTag || tagName == xmpTag; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#scoping // Same as isScopingTag in LegacyHTMLTreeBuilder.cpp // and isScopeMarker in HTMLElementStack.cpp bool isScopingTag(const AtomicString& tagName) { return tagName == appletTag || tagName == buttonTag || tagName == captionTag #if ENABLE(SVG_FOREIGN_OBJECT) || tagName == SVGNames::foreignObjectTag #endif || tagName == htmlTag || tagName == marqueeTag || tagName == objectTag || tagName == tableTag || tagName == tdTag || tagName == thTag; } bool isNonAnchorFormattingTag(const AtomicString& tagName) { return tagName == bTag || tagName == bigTag || tagName == codeTag || tagName == emTag || tagName == fontTag || tagName == iTag || tagName == nobrTag || tagName == sTag || tagName == smallTag || tagName == strikeTag || tagName == strongTag || tagName == ttTag || tagName == uTag; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#formatting bool isFormattingTag(const AtomicString& tagName) { return tagName == aTag || isNonAnchorFormattingTag(tagName); } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#phrasing bool isPhrasingTag(const AtomicString& tagName) { return !isSpecialTag(tagName) && !isScopingTag(tagName) && !isFormattingTag(tagName); } } // namespace HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors) : m_framesetOk(true) , m_document(document) , m_reportErrors(reportErrors) , m_isPaused(false) , m_insertionMode(InitialMode) , m_originalInsertionMode(InitialMode) , m_tokenizer(tokenizer) , m_legacyTreeBuilder(shouldUseLegacyTreeBuilder(document) ? new LegacyHTMLTreeBuilder(document, reportErrors) : 0) , m_lastScriptElementStartLine(uninitializedLineNumberValue) , m_scriptToProcessStartLine(uninitializedLineNumberValue) , m_fragmentScriptingPermission(FragmentScriptingAllowed) , m_isParsingFragment(false) { } // FIXME: Member variables should be grouped into self-initializing structs to // minimize code duplication between these constructors. HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission) : m_framesetOk(true) , m_document(fragment->document()) , m_reportErrors(false) // FIXME: Why not report errors in fragments? , m_isPaused(false) , m_insertionMode(InitialMode) , m_originalInsertionMode(InitialMode) , m_tokenizer(tokenizer) , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(fragment, scriptingPermission)) , m_lastScriptElementStartLine(uninitializedLineNumberValue) , m_scriptToProcessStartLine(uninitializedLineNumberValue) , m_fragmentScriptingPermission(scriptingPermission) , m_isParsingFragment(true) { } HTMLTreeBuilder::~HTMLTreeBuilder() { } static void convertToOldStyle(const AtomicHTMLToken& token, Token& oldStyleToken) { switch (token.type()) { case HTMLToken::Uninitialized: case HTMLToken::DOCTYPE: ASSERT_NOT_REACHED(); break; case HTMLToken::EndOfFile: ASSERT_NOT_REACHED(); notImplemented(); break; case HTMLToken::StartTag: case HTMLToken::EndTag: { oldStyleToken.beginTag = (token.type() == HTMLToken::StartTag); oldStyleToken.selfClosingTag = token.selfClosing(); oldStyleToken.tagName = token.name(); oldStyleToken.attrs = token.attributes(); break; } case HTMLToken::Comment: oldStyleToken.tagName = commentAtom; oldStyleToken.text = token.comment().impl(); break; case HTMLToken::Character: oldStyleToken.tagName = textAtom; oldStyleToken.text = token.characters().impl(); break; } } void HTMLTreeBuilder::handleScriptStartTag() { notImplemented(); // The HTML frgment case? m_tokenizer->setState(HTMLTokenizer::ScriptDataState); notImplemented(); // Save insertion mode. } void HTMLTreeBuilder::handleScriptEndTag(Element* scriptElement, int scriptStartLine) { ASSERT(!m_scriptToProcess); // Caller never called takeScriptToProcess! ASSERT(m_scriptToProcessStartLine == uninitializedLineNumberValue); // Caller never called takeScriptToProcess! notImplemented(); // Save insertion mode and insertion point? // Pause ourselves so that parsing stops until the script can be processed by the caller. m_isPaused = true; m_scriptToProcess = scriptElement; // Lexer line numbers are 0-based, ScriptSourceCode expects 1-based lines, // so we convert here before passing the line number off to HTMLScriptRunner. m_scriptToProcessStartLine = scriptStartLine + 1; } PassRefPtr HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine) { // Unpause ourselves, callers may pause us again when processing the script. // The HTML5 spec is written as though scripts are executed inside the tree // builder. We pause the parser to exit the tree builder, and then resume // before running scripts. m_isPaused = false; scriptStartLine = m_scriptToProcessStartLine; m_scriptToProcessStartLine = uninitializedLineNumberValue; return m_scriptToProcess.release(); } HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame) { if (tagName == textareaTag || tagName == titleTag) return HTMLTokenizer::RCDATAState; if (tagName == styleTag || tagName == iframeTag || tagName == xmpTag || tagName == noembedTag || tagName == noframesTag || (tagName == noscriptTag && isScriptingFlagEnabled(frame))) return HTMLTokenizer::RAWTEXTState; if (tagName == plaintextTag) return HTMLTokenizer::PLAINTEXTState; return state; } void HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token) { if (token.type() == HTMLToken::DOCTYPE) { DoctypeToken doctypeToken; doctypeToken.m_name.append(token.name().data(), token.name().size()); doctypeToken.m_publicID = token.publicIdentifier(); doctypeToken.m_systemID = token.systemIdentifier(); doctypeToken.m_forceQuirks = token.forceQuirks(); m_legacyTreeBuilder->parseDoctypeToken(&doctypeToken); return; } if (token.type() == HTMLToken::EndOfFile) return; // For now, we translate into an old-style token for testing. Token oldStyleToken; AtomicHTMLToken atomicToken(token); convertToOldStyle(atomicToken, oldStyleToken); RefPtr result = m_legacyTreeBuilder->parseToken(&oldStyleToken); if (token.type() == HTMLToken::StartTag) { // This work is supposed to be done by the parser, but // when using the old parser for we have to do this manually. if (oldStyleToken.tagName == scriptTag) { handleScriptStartTag(); m_lastScriptElement = static_pointer_cast(result); m_lastScriptElementStartLine = m_tokenizer->lineNumber(); } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag) m_tokenizer->skipLeadingNewLineForListing(); else m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame())); } else if (token.type() == HTMLToken::EndTag) { if (oldStyleToken.tagName == scriptTag) { if (m_lastScriptElement) { ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue); if (m_fragmentScriptingPermission == FragmentScriptingNotAllowed) { // FIXME: This is a horrible hack for platform/Pasteboard. // Clear the