/* * Copyright (C) 2010 Google, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "HTMLTreeBuilder.h" #include "Comment.h" #include "DocumentFragment.h" #include "DocumentType.h" #include "Element.h" #include "Frame.h" #include "HTMLDocument.h" #include "HTMLElementFactory.h" #include "HTMLFormElement.h" #include "HTMLHtmlElement.h" #include "HTMLNames.h" #include "HTMLScriptElement.h" #include "HTMLToken.h" #include "HTMLTokenizer.h" #include "LegacyHTMLDocumentParser.h" #include "LegacyHTMLTreeBuilder.h" #include "LocalizedStrings.h" #include "MathMLNames.h" #include "NotImplemented.h" #include "SVGNames.h" #include "ScriptController.h" #include "Settings.h" #include "Text.h" #include "XLinkNames.h" #include "XMLNSNames.h" #include "XMLNames.h" #include namespace WebCore { using namespace HTMLNames; static const int uninitializedLineNumberValue = -1; namespace { inline bool isTreeBuilderWhitepace(UChar c) { // FIXME: Consider branch permutations. return c == '\t' || c == '\x0A' || c == '\x0C' || c == '\x0D' || c == ' '; } inline bool isTreeBuilderWhitepaceOrReplacementCharacter(UChar c) { return isTreeBuilderWhitepace(c) || c == 0xFFFD; } template inline bool isAllSpecialCharacters(const String& string) { const UChar* characters = string.characters(); const unsigned length = string.length(); for (unsigned i = 0; i < length; ++i) { if (!isSpecialCharacter(characters[i])) return false; } return true; } inline bool isAllWhitespace(const String& string) { return isAllSpecialCharacters(string); } inline bool isAllWhitespaceOrReplacementCharacters(const String& string) { return isAllSpecialCharacters(string); } bool shouldUseLegacyTreeBuilder(Document* document) { return !document->settings() || !document->settings()->html5TreeBuilderEnabled(); } bool isNumberedHeaderTag(const AtomicString& tagName) { return tagName == h1Tag || tagName == h2Tag || tagName == h3Tag || tagName == h4Tag || tagName == h5Tag || tagName == h6Tag; } bool isCaptionColOrColgroupTag(const AtomicString& tagName) { return tagName == captionTag || tagName == colTag || tagName == colgroupTag; } bool isTableCellContextTag(const AtomicString& tagName) { return tagName == thTag || tagName == tdTag; } bool isTableBodyContextTag(const AtomicString& tagName) { return tagName == tbodyTag || tagName == tfootTag || tagName == theadTag; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#special bool isSpecialTag(const AtomicString& tagName) { return tagName == addressTag || tagName == articleTag || tagName == asideTag || tagName == baseTag || tagName == basefontTag || tagName == bgsoundTag || tagName == blockquoteTag || tagName == bodyTag || tagName == brTag || tagName == buttonTag || tagName == centerTag || tagName == colTag || tagName == colgroupTag || tagName == commandTag || tagName == ddTag || tagName == detailsTag || tagName == dirTag || tagName == divTag || tagName == dlTag || tagName == dtTag || tagName == embedTag || tagName == fieldsetTag || tagName == figureTag || tagName == footerTag || tagName == formTag || tagName == frameTag || tagName == framesetTag || isNumberedHeaderTag(tagName) || tagName == headTag || tagName == headerTag || tagName == hgroupTag || tagName == hrTag || tagName == iframeTag || tagName == imgTag || tagName == inputTag || tagName == isindexTag || tagName == liTag || tagName == linkTag || tagName == listingTag || tagName == menuTag || tagName == metaTag || tagName == navTag || tagName == noembedTag || tagName == noframesTag || tagName == noscriptTag || tagName == olTag || tagName == pTag || tagName == paramTag || tagName == plaintextTag || tagName == preTag || tagName == scriptTag || tagName == sectionTag || tagName == selectTag || tagName == styleTag || isTableBodyContextTag(tagName) || tagName == textareaTag || tagName == titleTag || tagName == trTag || tagName == ulTag || tagName == wbrTag || tagName == xmpTag; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#scoping // Same as isScopingTag in LegacyHTMLTreeBuilder.cpp // and isScopeMarker in HTMLElementStack.cpp bool isScopingTag(const AtomicString& tagName) { return tagName == appletTag || tagName == captionTag || tagName == SVGNames::foreignObjectTag || tagName == htmlTag || tagName == marqueeTag || tagName == objectTag || tagName == tableTag || isTableCellContextTag(tagName); } bool isNonAnchorNonNobrFormattingTag(const AtomicString& tagName) { return tagName == bTag || tagName == bigTag || tagName == codeTag || tagName == emTag || tagName == fontTag || tagName == iTag || tagName == sTag || tagName == smallTag || tagName == strikeTag || tagName == strongTag || tagName == ttTag || tagName == uTag; } bool isNonAnchorFormattingTag(const AtomicString& tagName) { return tagName == nobrTag || isNonAnchorNonNobrFormattingTag(tagName); } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#formatting bool isFormattingTag(const AtomicString& tagName) { return tagName == aTag || isNonAnchorFormattingTag(tagName); } // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#phrasing bool isPhrasingTag(const AtomicString& tagName) { return !isSpecialTag(tagName) && !isScopingTag(tagName) && !isFormattingTag(tagName); } bool isNotFormattingAndNotPhrasing(const Element* element) { // The spec often says "node is not in the formatting category, and is not // in the phrasing category". !phrasing && !formatting == scoping || special // scoping || special is easier to compute. // FIXME: localName() is wrong for non-html content. const AtomicString& tagName = element->localName(); return isScopingTag(tagName) || isSpecialTag(tagName); } HTMLFormElement* closestFormAncestor(Element* element) { while (element) { if (element->hasTagName(formTag)) return static_cast(element); Node* parent = element->parent(); if (!parent || !parent->isElementNode()) return 0; element = static_cast(parent); } return 0; } // FIXME: This belongs on ContainerNode, where it could avoid the double ref // by directly releasing into the Vector. Such an implementation would need to // be careful not to send mutation events. void takeChildrenFromNode(ContainerNode* container, Vector >& children) { for (Node* child = container->firstChild(); child; child = child->nextSibling()) children.append(child); container->removeAllChildren(); } } // namespace class HTMLTreeBuilder::ExternalCharacterTokenBuffer : public Noncopyable { public: explicit ExternalCharacterTokenBuffer(AtomicHTMLToken& token) : m_current(token.characters().data()) , m_end(m_current + token.characters().size()) { ASSERT(!isEmpty()); } explicit ExternalCharacterTokenBuffer(const String& string) : m_current(string.characters()) , m_end(m_current + string.length()) { ASSERT(!isEmpty()); } ~ExternalCharacterTokenBuffer() { ASSERT(isEmpty()); } bool isEmpty() const { return m_current == m_end; } void skipLeadingWhitespace() { ASSERT(!isEmpty()); while (isTreeBuilderWhitepace(*m_current)) { if (++m_current == m_end) return; } } String takeLeadingWhitespace() { ASSERT(!isEmpty()); const UChar* start = m_current; skipLeadingWhitespace(); if (start == m_current) return String(); return String(start, m_current - start); } String takeRemaining() { ASSERT(!isEmpty()); const UChar* start = m_current; m_current = m_end; return String(start, m_current - start); } void giveRemainingTo(Vector& recipient) { recipient.append(m_current, m_end - m_current); m_current = m_end; } String takeRemainingWhitespace() { ASSERT(!isEmpty()); Vector whitespace; do { UChar cc = *m_current++; if (isTreeBuilderWhitepace(cc)) whitespace.append(cc); } while (m_current < m_end); // Returning the null string when there aren't any whitespace // characters is slightly cleaner semantically because we don't want // to insert a text node (as opposed to inserting an empty text node). if (whitespace.isEmpty()) return String(); return String::adopt(whitespace); } private: const UChar* m_current; const UChar* m_end; }; HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors) : m_framesetOk(true) , m_document(document) , m_tree(document, FragmentScriptingAllowed, false) , m_reportErrors(reportErrors) , m_isPaused(false) , m_insertionMode(InitialMode) , m_originalInsertionMode(InitialMode) , m_secondaryInsertionMode(InitialMode) , m_tokenizer(tokenizer) , m_lastScriptElementStartLine(uninitializedLineNumberValue) , m_scriptToProcessStartLine(uninitializedLineNumberValue) { } // FIXME: Member variables should be grouped into self-initializing structs to // minimize code duplication between these constructors. HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) : m_framesetOk(true) , m_fragmentContext(fragment, contextElement, scriptingPermission, shouldUseLegacyTreeBuilder(fragment->document())) , m_document(m_fragmentContext.document()) , m_tree(m_document, scriptingPermission, true) , m_reportErrors(false) // FIXME: Why not report errors in fragments? , m_isPaused(false) , m_insertionMode(InitialMode) , m_originalInsertionMode(InitialMode) , m_secondaryInsertionMode(InitialMode) , m_tokenizer(tokenizer) , m_legacyTreeBuilder(shouldUseLegacyTreeBuilder(fragment->document()) ? new LegacyHTMLTreeBuilder(fragment, scriptingPermission) : 0) , m_lastScriptElementStartLine(uninitializedLineNumberValue) , m_scriptToProcessStartLine(uninitializedLineNumberValue) { if (shouldUseLegacyTreeBuilder(fragment->document())) return; // This is steps 2-6 of the HTML5 Fragment Case parsing algorithm: // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case if (contextElement) m_document->setParseMode(contextElement->document()->parseMode()); processFakeStartTag(htmlTag); resetInsertionModeAppropriately(); m_tree.setForm(closestFormAncestor(contextElement)); } HTMLTreeBuilder::~HTMLTreeBuilder() { } HTMLTreeBuilder::FragmentParsingContext::FragmentParsingContext() : m_fragment(0) , m_contextElement(0) , m_usingLegacyTreeBuilder(false) , m_scriptingPermission(FragmentScriptingAllowed) { } HTMLTreeBuilder::FragmentParsingContext::FragmentParsingContext(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission, bool legacyMode) : m_dummyDocumentForFragmentParsing(legacyMode ? 0 : HTMLDocument::create(0, KURL())) , m_fragment(fragment) , m_contextElement(contextElement) , m_usingLegacyTreeBuilder(legacyMode) , m_scriptingPermission(scriptingPermission) { } Document* HTMLTreeBuilder::FragmentParsingContext::document() const { ASSERT(m_fragment); if (m_usingLegacyTreeBuilder) return m_fragment->document(); return m_dummyDocumentForFragmentParsing.get(); } void HTMLTreeBuilder::FragmentParsingContext::finished() { // Populate the DocumentFragment with the parsed content now that we're done. ContainerNode* root = m_dummyDocumentForFragmentParsing.get(); if (m_contextElement) root = m_dummyDocumentForFragmentParsing->documentElement(); Vector > children; takeChildrenFromNode(root, children); for (unsigned i = 0; i < children.size(); ++i) { ExceptionCode ec = 0; // FIXME: We need a parser-safe (no events) version of adoptNode. RefPtr child = m_fragment->document()->adoptNode(children[i].release(), ec); ASSERT(!ec); m_fragment->parserAddChild(child.release()); } } HTMLTreeBuilder::FragmentParsingContext::~FragmentParsingContext() { } static void convertToOldStyle(AtomicHTMLToken& token, Token& oldStyleToken) { switch (token.type()) { case HTMLToken::Uninitialized: case HTMLToken::DOCTYPE: ASSERT_NOT_REACHED(); break; case HTMLToken::EndOfFile: ASSERT_NOT_REACHED(); notImplemented(); break; case HTMLToken::StartTag: case HTMLToken::EndTag: { oldStyleToken.beginTag = (token.type() == HTMLToken::StartTag); // The LegacyHTMLTreeBuilder seems to work better if we lie here and // say that tags are never self closing. As a wise man once said: // "You can't handle the truth!" oldStyleToken.selfClosingTag = false; oldStyleToken.tagName = token.name(); oldStyleToken.attrs = token.takeAtributes(); break; } case HTMLToken::Comment: oldStyleToken.tagName = commentAtom; oldStyleToken.text = token.comment().impl(); break; case HTMLToken::Character: oldStyleToken.tagName = textAtom; oldStyleToken.text = StringImpl::create(token.characters().data(), token.characters().size()); break; } } PassRefPtr HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine) { // Unpause ourselves, callers may pause us again when processing the script. // The HTML5 spec is written as though scripts are executed inside the tree // builder. We pause the parser to exit the tree builder, and then resume // before running scripts. m_isPaused = false; scriptStartLine = m_scriptToProcessStartLine; m_scriptToProcessStartLine = uninitializedLineNumberValue; return m_scriptToProcess.release(); } HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame) { if (tagName == textareaTag || tagName == titleTag) return HTMLTokenizer::RCDATAState; if (tagName == styleTag || tagName == iframeTag || tagName == xmpTag || (tagName == noembedTag && pluginsEnabled(frame)) || tagName == noframesTag || (tagName == noscriptTag && scriptEnabled(frame))) return HTMLTokenizer::RAWTEXTState; if (tagName == plaintextTag) return HTMLTokenizer::PLAINTEXTState; return state; } void HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token) { if (token.type() == HTMLToken::DOCTYPE) { DoctypeToken doctypeToken; doctypeToken.m_name.append(token.name().data(), token.name().size()); doctypeToken.m_publicID = token.publicIdentifier(); doctypeToken.m_systemID = token.systemIdentifier(); doctypeToken.m_forceQuirks = token.forceQuirks(); m_legacyTreeBuilder->parseDoctypeToken(&doctypeToken); return; } if (token.type() == HTMLToken::EndOfFile) return; // For now, we translate into an old-style token for testing. Token oldStyleToken; AtomicHTMLToken atomicToken(token); convertToOldStyle(atomicToken, oldStyleToken); RefPtr result = m_legacyTreeBuilder->parseToken(&oldStyleToken); if (token.type() == HTMLToken::StartTag) { // This work is supposed to be done by the parser, but // when using the old parser for we have to do this manually. if (oldStyleToken.tagName == scriptTag) { m_tokenizer->setState(HTMLTokenizer::ScriptDataState); m_lastScriptElement = static_pointer_cast(result); m_lastScriptElementStartLine = m_tokenizer->lineNumber(); } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag) m_tokenizer->setSkipLeadingNewLineForListing(true); else m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame())); } else if (token.type() == HTMLToken::EndTag) { if (oldStyleToken.tagName == scriptTag) { if (m_lastScriptElement) { ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue); if (m_fragmentContext.scriptingPermission() == FragmentScriptingNotAllowed) { // FIXME: This is a horrible hack for platform/Pasteboard. // Clear the