diff options
author | Steve Block <steveblock@google.com> | 2011-05-06 11:45:16 +0100 |
---|---|---|
committer | Steve Block <steveblock@google.com> | 2011-05-12 13:44:10 +0100 |
commit | cad810f21b803229eb11403f9209855525a25d57 (patch) | |
tree | 29a6fd0279be608e0fe9ffe9841f722f0f4e4269 /Source/WebCore/html/parser | |
parent | 121b0cf4517156d0ac5111caf9830c51b69bae8f (diff) | |
download | external_webkit-cad810f21b803229eb11403f9209855525a25d57.zip external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.gz external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.bz2 |
Merge WebKit at r75315: Initial merge by git.
Change-Id: I570314b346ce101c935ed22a626b48c2af266b84
Diffstat (limited to 'Source/WebCore/html/parser')
41 files changed, 13215 insertions, 0 deletions
diff --git a/Source/WebCore/html/parser/CSSPreloadScanner.cpp b/Source/WebCore/html/parser/CSSPreloadScanner.cpp new file mode 100644 index 0000000..23364f9 --- /dev/null +++ b/Source/WebCore/html/parser/CSSPreloadScanner.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2008, 2010 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "CSSPreloadScanner.h" + +#include "CachedCSSStyleSheet.h" +#include "CachedResourceLoader.h" +#include "Document.h" +#include "HTMLParserIdioms.h" +#include "HTMLToken.h" + +namespace WebCore { + +CSSPreloadScanner::CSSPreloadScanner(Document* document) + : m_state(Initial) + , m_document(document) +{ +} + +void CSSPreloadScanner::reset() +{ + m_state = Initial; + m_rule.clear(); + m_ruleValue.clear(); +} + +void CSSPreloadScanner::scan(const HTMLToken& token, bool scanningBody) +{ + m_scanningBody = scanningBody; + + const HTMLToken::DataVector& characters = token.characters(); + for (HTMLToken::DataVector::const_iterator iter = characters.begin(); iter != characters.end(); ++iter) + tokenize(*iter); +} + +inline void CSSPreloadScanner::tokenize(UChar c) +{ + // We are just interested in @import rules, no need for real tokenization here + // Searching for other types of resources is probably low payoff. + switch (m_state) { + case Initial: + if (c == '@') + m_state = RuleStart; + else if (c == '/') + m_state = MaybeComment; + break; + case MaybeComment: + if (c == '*') + m_state = Comment; + else + m_state = Initial; + break; + case Comment: + if (c == '*') + m_state = MaybeCommentEnd; + break; + case MaybeCommentEnd: + if (c == '/') + m_state = Initial; + else if (c == '*') + ; + else + m_state = Comment; + break; + case RuleStart: + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { + m_rule.clear(); + m_ruleValue.clear(); + m_rule.append(c); + m_state = Rule; + } else + m_state = Initial; + break; + case Rule: + if (isHTMLSpace(c)) + m_state = AfterRule; + else if (c == ';') + m_state = Initial; + else + m_rule.append(c); + break; + case AfterRule: + if (isHTMLSpace(c)) + ; + else if (c == ';') + m_state = Initial; + else { + m_state = RuleValue; + m_ruleValue.append(c); + } + break; + case RuleValue: + if (isHTMLSpace(c)) + m_state = AfterRuleValue; + else if (c == ';') { + emitRule(); + m_state = Initial; + } else + 
m_ruleValue.append(c); + break; + case AfterRuleValue: + if (isHTMLSpace(c)) + ; + else if (c == ';') { + emitRule(); + m_state = Initial; + } else { + // FIXME: media rules + m_state = Initial; + } + break; + } +} + +static String parseCSSStringOrURL(const UChar* characters, size_t length) +{ + size_t offset = 0; + size_t reducedLength = length; + + while (reducedLength && isHTMLSpace(characters[offset])) { + ++offset; + --reducedLength; + } + while (reducedLength && isHTMLSpace(characters[offset + reducedLength - 1])) + --reducedLength; + + if (reducedLength >= 5 + && (characters[offset] == 'u' || characters[offset] == 'U') + && (characters[offset + 1] == 'r' || characters[offset + 1] == 'R') + && (characters[offset + 2] == 'l' || characters[offset + 2] == 'L') + && characters[offset + 3] == '(' + && characters[offset + reducedLength - 1] == ')') { + offset += 4; + reducedLength -= 5; + } + + while (reducedLength && isHTMLSpace(characters[offset])) { + ++offset; + --reducedLength; + } + while (reducedLength && isHTMLSpace(characters[offset + reducedLength - 1])) + --reducedLength; + + if (reducedLength < 2 || characters[offset] != characters[offset + reducedLength - 1] || !(characters[offset] == '\'' || characters[offset] == '"')) + return String(); + offset++; + reducedLength -= 2; + + while (reducedLength && isHTMLSpace(characters[offset])) { + ++offset; + --reducedLength; + } + while (reducedLength && isHTMLSpace(characters[offset + reducedLength - 1])) + --reducedLength; + + return String(characters + offset, reducedLength); +} + +void CSSPreloadScanner::emitRule() +{ + if (equalIgnoringCase("import", m_rule.data(), m_rule.size())) { + String value = parseCSSStringOrURL(m_ruleValue.data(), m_ruleValue.size()); + if (!value.isEmpty()) + m_document->cachedResourceLoader()->preload(CachedResource::CSSStyleSheet, value, String(), m_scanningBody); + } + m_rule.clear(); + m_ruleValue.clear(); +} + +} diff --git a/Source/WebCore/html/parser/CSSPreloadScanner.h 
b/Source/WebCore/html/parser/CSSPreloadScanner.h new file mode 100644 index 0000000..7ac282f --- /dev/null +++ b/Source/WebCore/html/parser/CSSPreloadScanner.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef CSSPreloadScanner_h +#define CSSPreloadScanner_h + +#include "PlatformString.h" +#include <wtf/Vector.h> + +namespace WebCore { + +class Document; +class HTMLToken; + +class CSSPreloadScanner : public Noncopyable { +public: + CSSPreloadScanner(Document*); + + void reset(); + void scan(const HTMLToken&, bool scanningBody); + +private: + enum State { + Initial, + MaybeComment, + Comment, + MaybeCommentEnd, + RuleStart, + Rule, + AfterRule, + RuleValue, + AfterRuleValue + }; + + inline void tokenize(UChar c); + void emitRule(); + + State m_state; + Vector<UChar, 16> m_rule; + Vector<UChar> m_ruleValue; + + bool m_scanningBody; + Document* m_document; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLConstructionSite.cpp b/Source/WebCore/html/parser/HTMLConstructionSite.cpp new file mode 100644 index 0000000..c46b9b9 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLConstructionSite.cpp @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLTreeBuilder.h" + +#include "Comment.h" +#include "DocumentFragment.h" +#include "DocumentType.h" +#include "Element.h" +#include "Frame.h" +#include "HTMLDocument.h" +#include "HTMLElementFactory.h" +#include "HTMLFormElement.h" +#include "HTMLHtmlElement.h" +#include "HTMLNames.h" +#include "HTMLScriptElement.h" +#include "HTMLToken.h" +#include "HTMLTokenizer.h" +#include "LocalizedStrings.h" +#if ENABLE(MATHML) +#include "MathMLNames.h" +#endif +#include "NotImplemented.h" +#if ENABLE(SVG) +#include "SVGNames.h" +#endif +#include "ScriptController.h" +#include "Settings.h" +#include "Text.h" +#include <wtf/UnusedParam.h> + +namespace WebCore { + +using namespace HTMLNames; + +namespace { + +bool hasImpliedEndTag(Element* element) +{ + return element->hasTagName(ddTag) + || element->hasTagName(dtTag) + || element->hasTagName(liTag) + || element->hasTagName(optionTag) + || element->hasTagName(optgroupTag) + || element->hasTagName(pTag) + || element->hasTagName(rpTag) + || element->hasTagName(rtTag); +} + +bool causesFosterParenting(const QualifiedName& tagName) +{ + return tagName == tableTag + || tagName == tbodyTag + || tagName == tfootTag + || tagName == theadTag + || tagName == trTag; +} + +} // namespace + +template<typename ChildType> +PassRefPtr<ChildType> HTMLConstructionSite::attach(ContainerNode* parent, PassRefPtr<ChildType> prpChild) +{ + RefPtr<ChildType> child = prpChild; + + // FIXME: It's confusing that 
HTMLConstructionSite::attach does the magic + // redirection to the foster parent but HTMLConstructionSite::attachAtSite + // doesn't. It feels like we're missing a concept somehow. + if (shouldFosterParent()) { + fosterParent(child.get()); + ASSERT(child->attached() || !child->parentNode() || !child->parentNode()->attached()); + return child.release(); + } + + parent->parserAddChild(child); + + // An event handler (DOM Mutation, beforeload, et al.) could have removed + // the child, in which case we shouldn't try attaching it. + if (!child->parentNode()) + return child.release(); + + // It's slightly unfortunate that we need to hold a reference to child + // here to call attach(). We should investigate whether we can rely on + // |parent| to hold a ref at this point. In the common case (at least + // for elements), however, we'll get to use this ref in the stack of + // open elements. + if (parent->attached() && !child->attached()) + child->attach(); + return child.release(); +} + +void HTMLConstructionSite::attachAtSite(const AttachmentSite& site, PassRefPtr<Node> prpChild) +{ + // FIXME: It's unfortunate that we need to hold a reference to child + // here to call attach(). We should investigate whether we can rely on + // |site.parent| to hold a ref at this point. + RefPtr<Node> child = prpChild; + + if (site.nextChild) + site.parent->parserInsertBefore(child, site.nextChild); + else + site.parent->parserAddChild(child); + + // JavaScript run from beforeload (or DOM Mutation or event handlers) + // might have removed the child, in which case we should not attach it. 
+ if (child->parentNode() && site.parent->attached() && !child->attached()) + child->attach(); +} + +HTMLConstructionSite::HTMLConstructionSite(Document* document, FragmentScriptingPermission scriptingPermission, bool isParsingFragment) + : m_document(document) + , m_fragmentScriptingPermission(scriptingPermission) + , m_isParsingFragment(isParsingFragment) + , m_redirectAttachToFosterParent(false) +{ +} + +HTMLConstructionSite::~HTMLConstructionSite() +{ +} + +void HTMLConstructionSite::detach() +{ + m_document = 0; +} + +void HTMLConstructionSite::setForm(HTMLFormElement* form) +{ + // This method should only be needed for HTMLTreeBuilder in the fragment case. + ASSERT(!m_form); + m_form = form; +} + +PassRefPtr<HTMLFormElement> HTMLConstructionSite::takeForm() +{ + return m_form.release(); +} + +void HTMLConstructionSite::dispatchDocumentElementAvailableIfNeeded() +{ + ASSERT(m_document); + if (m_document->frame() && !m_isParsingFragment) + m_document->frame()->loader()->dispatchDocumentElementAvailable(); +} + +void HTMLConstructionSite::insertHTMLHtmlStartTagBeforeHTML(AtomicHTMLToken& token) +{ + RefPtr<HTMLHtmlElement> element = HTMLHtmlElement::create(m_document); + element->setAttributeMap(token.takeAtributes(), m_fragmentScriptingPermission); + m_openElements.pushHTMLHtmlElement(attach<Element>(m_document, element.get())); +#if ENABLE(OFFLINE_WEB_APPLICATIONS) + element->insertedByParser(); +#endif + dispatchDocumentElementAvailableIfNeeded(); +} + +void HTMLConstructionSite::mergeAttributesFromTokenIntoElement(AtomicHTMLToken& token, Element* element) +{ + if (!token.attributes()) + return; + + NamedNodeMap* attributes = element->attributes(false); + for (unsigned i = 0; i < token.attributes()->length(); ++i) { + Attribute* attribute = token.attributes()->attributeItem(i); + if (!attributes->getAttributeItem(attribute->name())) + element->setAttribute(attribute->name(), attribute->value()); + } +} + +void 
HTMLConstructionSite::insertHTMLHtmlStartTagInBody(AtomicHTMLToken& token) +{ + // FIXME: parse error + mergeAttributesFromTokenIntoElement(token, m_openElements.htmlElement()); +} + +void HTMLConstructionSite::insertHTMLBodyStartTagInBody(AtomicHTMLToken& token) +{ + // FIXME: parse error + mergeAttributesFromTokenIntoElement(token, m_openElements.bodyElement()); +} + +void HTMLConstructionSite::insertDoctype(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::DOCTYPE); + attach(m_document, DocumentType::create(m_document, token.name(), String::adopt(token.publicIdentifier()), String::adopt(token.systemIdentifier()))); + + if (token.forceQuirks()) + m_document->setCompatibilityMode(Document::QuirksMode); + else + m_document->setCompatibilityModeFromDoctype(); +} + +void HTMLConstructionSite::insertComment(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::Comment); + attach(currentElement(), Comment::create(currentElement()->document(), token.comment())); +} + +void HTMLConstructionSite::insertCommentOnDocument(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::Comment); + attach(m_document, Comment::create(m_document, token.comment())); +} + +void HTMLConstructionSite::insertCommentOnHTMLHtmlElement(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::Comment); + Element* parent = m_openElements.htmlElement(); + attach(parent, Comment::create(parent->document(), token.comment())); +} + +PassRefPtr<Element> HTMLConstructionSite::attachToCurrent(PassRefPtr<Element> child) +{ + return attach(currentElement(), child); +} + +void HTMLConstructionSite::insertHTMLHeadElement(AtomicHTMLToken& token) +{ + ASSERT(!shouldFosterParent()); + m_head = attachToCurrent(createHTMLElement(token)); + m_openElements.pushHTMLHeadElement(m_head); +} + +void HTMLConstructionSite::insertHTMLBodyElement(AtomicHTMLToken& token) +{ + ASSERT(!shouldFosterParent()); + m_openElements.pushHTMLBodyElement(attachToCurrent(createHTMLElement(token))); 
+} + +void HTMLConstructionSite::insertHTMLFormElement(AtomicHTMLToken& token, bool isDemoted) +{ + RefPtr<Element> element = createHTMLElement(token); + ASSERT(element->hasTagName(formTag)); + RefPtr<HTMLFormElement> form = static_pointer_cast<HTMLFormElement>(element.release()); + form->setDemoted(isDemoted); + m_openElements.push(attachToCurrent(form.release())); + ASSERT(currentElement()->isHTMLElement()); + ASSERT(currentElement()->hasTagName(formTag)); + m_form = static_cast<HTMLFormElement*>(currentElement()); +} + +void HTMLConstructionSite::insertHTMLElement(AtomicHTMLToken& token) +{ + m_openElements.push(attachToCurrent(createHTMLElement(token))); +} + +void HTMLConstructionSite::insertSelfClosingHTMLElement(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + RefPtr<Element> element = attachToCurrent(createHTMLElement(token)); + // Normally HTMLElementStack is responsible for calling finishParsingChildren, + // but self-closing elements are never in the element stack so the stack + // doesn't get a chance to tell them that we're done parsing their children. + element->finishParsingChildren(); + // FIXME: Do we want to acknowledge the token's self-closing flag? + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#acknowledge-self-closing-flag +} + +void HTMLConstructionSite::insertFormattingElement(AtomicHTMLToken& token) +{ + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-stack-of-open-elements + // Possible active formatting elements include: + // a, b, big, code, em, font, i, nobr, s, small, strike, strong, tt, and u. 
+ insertHTMLElement(token); + m_activeFormattingElements.append(currentElement()); +} + +void HTMLConstructionSite::insertScriptElement(AtomicHTMLToken& token) +{ + RefPtr<HTMLScriptElement> element = HTMLScriptElement::create(scriptTag, currentElement()->document(), true); + if (m_fragmentScriptingPermission == FragmentScriptingAllowed) + element->setAttributeMap(token.takeAtributes(), m_fragmentScriptingPermission); + m_openElements.push(attachToCurrent(element.release())); +} + +void HTMLConstructionSite::insertForeignElement(AtomicHTMLToken& token, const AtomicString& namespaceURI) +{ + ASSERT(token.type() == HTMLToken::StartTag); + notImplemented(); // parseError when xmlns or xmlns:xlink are wrong. + + RefPtr<Element> element = attachToCurrent(createElement(token, namespaceURI)); + if (!token.selfClosing()) + m_openElements.push(element); +} + +void HTMLConstructionSite::insertTextNode(const String& characters) +{ + AttachmentSite site; + site.parent = currentElement(); + site.nextChild = 0; + if (shouldFosterParent()) + findFosterSite(site); + + Node* previousChild = site.nextChild ? site.nextChild->previousSibling() : site.parent->lastChild(); + if (previousChild && previousChild->isTextNode()) { + // FIXME: We're only supposed to append to this text node if it + // was the last text node inserted by the parser. 
+ CharacterData* textNode = static_cast<CharacterData*>(previousChild); + textNode->parserAppendData(characters); + return; + } + + attachAtSite(site, Text::create(site.parent->document(), characters)); +} + +PassRefPtr<Element> HTMLConstructionSite::createElement(AtomicHTMLToken& token, const AtomicString& namespaceURI) +{ + QualifiedName tagName(nullAtom, token.name(), namespaceURI); + RefPtr<Element> element = currentElement()->document()->createElement(tagName, true); + element->setAttributeMap(token.takeAtributes(), m_fragmentScriptingPermission); + return element.release(); +} + +PassRefPtr<Element> HTMLConstructionSite::createHTMLElement(AtomicHTMLToken& token) +{ + QualifiedName tagName(nullAtom, token.name(), xhtmlNamespaceURI); + // FIXME: This can't use HTMLConstructionSite::createElement because we + // have to pass the current form element. We should rework form association + // to occur after construction to allow better code sharing here. + RefPtr<Element> element = HTMLElementFactory::createHTMLElement(tagName, currentElement()->document(), form(), true); + element->setAttributeMap(token.takeAtributes(), m_fragmentScriptingPermission); + ASSERT(element->isHTMLElement()); + return element.release(); +} + +PassRefPtr<Element> HTMLConstructionSite::createHTMLElementFromElementRecord(HTMLElementStack::ElementRecord* record) +{ + return createHTMLElementFromSavedElement(record->element()); +} + +namespace { + +PassRefPtr<NamedNodeMap> cloneAttributes(Element* element) +{ + NamedNodeMap* attributes = element->attributes(true); + if (!attributes) + return 0; + + RefPtr<NamedNodeMap> newAttributes = NamedNodeMap::create(); + for (size_t i = 0; i < attributes->length(); ++i) { + Attribute* attribute = attributes->attributeItem(i); + RefPtr<Attribute> clone = Attribute::createMapped(attribute->name(), attribute->value()); + newAttributes->addAttribute(clone); + } + return newAttributes.release(); +} + +} + +PassRefPtr<Element> 
HTMLConstructionSite::createHTMLElementFromSavedElement(Element* element) +{ + // FIXME: This method is wrong. We should be using the original token. + // Using an Element* causes us to fail examples like this: + // <b id="1"><p><script>document.getElementById("1").id = "2"</script></p>TEXT</b> + // When reconstructTheActiveFormattingElements calls this method to open + // a second <b> tag to wrap TEXT, it will have id "2", even though the HTML5 + // spec implies it should be "1". Minefield matches the HTML5 spec here. + + ASSERT(element->isHTMLElement()); // otherwise localName() might be wrong. + AtomicHTMLToken fakeToken(HTMLToken::StartTag, element->localName(), cloneAttributes(element)); + return createHTMLElement(fakeToken); +} + +bool HTMLConstructionSite::indexOfFirstUnopenFormattingElement(unsigned& firstUnopenElementIndex) const +{ + if (m_activeFormattingElements.isEmpty()) + return false; + unsigned index = m_activeFormattingElements.size(); + do { + --index; + const HTMLFormattingElementList::Entry& entry = m_activeFormattingElements.at(index); + if (entry.isMarker() || m_openElements.contains(entry.element())) { + firstUnopenElementIndex = index + 1; + return firstUnopenElementIndex < m_activeFormattingElements.size(); + } + } while (index); + firstUnopenElementIndex = index; + return true; +} + +void HTMLConstructionSite::reconstructTheActiveFormattingElements() +{ + unsigned firstUnopenElementIndex; + if (!indexOfFirstUnopenFormattingElement(firstUnopenElementIndex)) + return; + + unsigned unopenEntryIndex = firstUnopenElementIndex; + ASSERT(unopenEntryIndex < m_activeFormattingElements.size()); + for (; unopenEntryIndex < m_activeFormattingElements.size(); ++unopenEntryIndex) { + HTMLFormattingElementList::Entry& unopenedEntry = m_activeFormattingElements.at(unopenEntryIndex); + RefPtr<Element> reconstructed = createHTMLElementFromSavedElement(unopenedEntry.element()); + m_openElements.push(attachToCurrent(reconstructed.release())); + 
unopenedEntry.replaceElement(currentElement()); + } +} + +void HTMLConstructionSite::generateImpliedEndTagsWithExclusion(const AtomicString& tagName) +{ + while (hasImpliedEndTag(currentElement()) && !currentElement()->hasLocalName(tagName)) + m_openElements.pop(); +} + +void HTMLConstructionSite::generateImpliedEndTags() +{ + while (hasImpliedEndTag(currentElement())) + m_openElements.pop(); +} + +void HTMLConstructionSite::findFosterSite(AttachmentSite& site) +{ + HTMLElementStack::ElementRecord* lastTableElementRecord = m_openElements.topmost(tableTag.localName()); + if (lastTableElementRecord) { + Element* lastTableElement = lastTableElementRecord->element(); + if (ContainerNode* parent = lastTableElement->parentNode()) { + site.parent = parent; + site.nextChild = lastTableElement; + return; + } + site.parent = lastTableElementRecord->next()->element(); + site.nextChild = 0; + return; + } + // Fragment case + site.parent = m_openElements.bottom(); // <html> element + site.nextChild = 0; +} + +bool HTMLConstructionSite::shouldFosterParent() const +{ + return m_redirectAttachToFosterParent + && causesFosterParenting(currentElement()->tagQName()); +} + +void HTMLConstructionSite::fosterParent(Node* node) +{ + AttachmentSite site; + findFosterSite(site); + attachAtSite(site, node); +} + +} diff --git a/Source/WebCore/html/parser/HTMLConstructionSite.h b/Source/WebCore/html/parser/HTMLConstructionSite.h new file mode 100644 index 0000000..8b09bf5 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLConstructionSite.h @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLConstructionSite_h +#define HTMLConstructionSite_h + +#include "FragmentScriptingPermission.h" +#include "HTMLElementStack.h" +#include "HTMLFormattingElementList.h" +#include "NotImplemented.h" +#include <wtf/Noncopyable.h> +#include <wtf/PassRefPtr.h> +#include <wtf/RefPtr.h> + +namespace WebCore { + +class AtomicHTMLToken; +class Document; +class Element; + +class HTMLConstructionSite : public Noncopyable { +public: + HTMLConstructionSite(Document*, FragmentScriptingPermission, bool isParsingFragment); + ~HTMLConstructionSite(); + + void detach(); + + void insertDoctype(AtomicHTMLToken&); + void insertComment(AtomicHTMLToken&); + void insertCommentOnDocument(AtomicHTMLToken&); + void insertCommentOnHTMLHtmlElement(AtomicHTMLToken&); + void insertHTMLElement(AtomicHTMLToken&); + void insertSelfClosingHTMLElement(AtomicHTMLToken&); + void insertFormattingElement(AtomicHTMLToken&); + void insertHTMLHeadElement(AtomicHTMLToken&); + void insertHTMLBodyElement(AtomicHTMLToken&); + void 
insertHTMLFormElement(AtomicHTMLToken&, bool isDemoted = false); + void insertScriptElement(AtomicHTMLToken&); + void insertTextNode(const String&); + void insertForeignElement(AtomicHTMLToken&, const AtomicString& namespaceURI); + + void insertHTMLHtmlStartTagBeforeHTML(AtomicHTMLToken&); + void insertHTMLHtmlStartTagInBody(AtomicHTMLToken&); + void insertHTMLBodyStartTagInBody(AtomicHTMLToken&); + + PassRefPtr<Element> createHTMLElement(AtomicHTMLToken&); + PassRefPtr<Element> createHTMLElementFromElementRecord(HTMLElementStack::ElementRecord*); + + bool shouldFosterParent() const; + void fosterParent(Node*); + + bool indexOfFirstUnopenFormattingElement(unsigned& firstUnopenElementIndex) const; + void reconstructTheActiveFormattingElements(); + + void generateImpliedEndTags(); + void generateImpliedEndTagsWithExclusion(const AtomicString& tagName); + + Element* currentElement() const { return m_openElements.top(); } + Element* oneBelowTop() const { return m_openElements.oneBelowTop(); } + + HTMLElementStack* openElements() const { return &m_openElements; } + HTMLFormattingElementList* activeFormattingElements() const { return &m_activeFormattingElements; } + + Element* head() const { return m_head.get(); } + + void setForm(HTMLFormElement*); + HTMLFormElement* form() const { return m_form.get(); } + PassRefPtr<HTMLFormElement> takeForm(); + + class RedirectToFosterParentGuard : public Noncopyable { + public: + RedirectToFosterParentGuard(HTMLConstructionSite& tree) + : m_tree(tree) + , m_wasRedirectingBefore(tree.m_redirectAttachToFosterParent) + { + m_tree.m_redirectAttachToFosterParent = true; + } + + ~RedirectToFosterParentGuard() + { + m_tree.m_redirectAttachToFosterParent = m_wasRedirectingBefore; + } + + private: + HTMLConstructionSite& m_tree; + bool m_wasRedirectingBefore; + }; + +private: + struct AttachmentSite { + ContainerNode* parent; + Node* nextChild; + }; + + template<typename ChildType> + PassRefPtr<ChildType> attach(ContainerNode* parent, 
PassRefPtr<ChildType> child); + PassRefPtr<Element> attachToCurrent(PassRefPtr<Element>); + + void attachAtSite(const AttachmentSite&, PassRefPtr<Node> child); + void findFosterSite(AttachmentSite&); + + PassRefPtr<Element> createHTMLElementFromSavedElement(Element*); + PassRefPtr<Element> createElement(AtomicHTMLToken&, const AtomicString& namespaceURI); + + void mergeAttributesFromTokenIntoElement(AtomicHTMLToken&, Element*); + void dispatchDocumentElementAvailableIfNeeded(); + + Document* m_document; + RefPtr<Element> m_head; + RefPtr<HTMLFormElement> m_form; + mutable HTMLElementStack m_openElements; + mutable HTMLFormattingElementList m_activeFormattingElements; + + FragmentScriptingPermission m_fragmentScriptingPermission; + bool m_isParsingFragment; + + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-intable + // In the "in table" insertion mode, we sometimes get into a state where + // "whenever a node would be inserted into the current node, it must instead + // be foster parented." This flag tracks whether we're in that state. + bool m_redirectAttachToFosterParent; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLDocumentParser.cpp b/Source/WebCore/html/parser/HTMLDocumentParser.cpp new file mode 100644 index 0000000..93e1309 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLDocumentParser.cpp @@ -0,0 +1,549 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLDocumentParser.h" + +#include "DocumentFragment.h" +#include "Element.h" +#include "Frame.h" +#include "HTMLNames.h" +#include "HTMLParserScheduler.h" +#include "HTMLTokenizer.h" +#include "HTMLPreloadScanner.h" +#include "HTMLScriptRunner.h" +#include "HTMLTreeBuilder.h" +#include "HTMLDocument.h" +#include "InspectorInstrumentation.h" +#include "NestingLevelIncrementer.h" +#include "Settings.h" +#include "XSSAuditor.h" +#include <wtf/CurrentTime.h> + +#ifdef ANDROID_INSTRUMENT +#include "TimeCounter.h" +#endif + +namespace WebCore { + +using namespace HTMLNames; + +namespace { + +// This is a direct transcription of step 4 from: +// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case +HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors) +{ + if (!contextElement) + return HTMLTokenizer::DataState; + + const QualifiedName& contextTag = contextElement->tagQName(); + + if (contextTag.matches(titleTag) || contextTag.matches(textareaTag)) + return HTMLTokenizer::RCDATAState; + if (contextTag.matches(styleTag) + || contextTag.matches(xmpTag) + || contextTag.matches(iframeTag) + || 
(contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame())) + || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame())) + || contextTag.matches(noframesTag)) + return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState; + if (contextTag.matches(scriptTag)) + return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState; + if (contextTag.matches(plaintextTag)) + return HTMLTokenizer::PLAINTEXTState; + return HTMLTokenizer::DataState; +} + +} // namespace + +HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors) + : ScriptableDocumentParser(document) + , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document))) + , m_scriptRunner(HTMLScriptRunner::create(document, this)) + , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document))) + , m_parserScheduler(HTMLParserScheduler::create(this)) + , m_endWasDelayed(false) + , m_writeNestingLevel(0) +{ +} + +// FIXME: Member variables should be grouped into self-initializing structs to +// minimize code duplication between these constructors. +HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) + : ScriptableDocumentParser(fragment->document()) + , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document()))) + , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document()))) + , m_endWasDelayed(false) + , m_writeNestingLevel(0) +{ + bool reportErrors = false; // For now document fragment parsing never reports errors. 
+ m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors)); +} + +HTMLDocumentParser::~HTMLDocumentParser() +{ + ASSERT(!m_parserScheduler); + ASSERT(!m_writeNestingLevel); + ASSERT(!m_preloadScanner); +} + +void HTMLDocumentParser::detach() +{ + DocumentParser::detach(); + if (m_scriptRunner) + m_scriptRunner->detach(); + m_treeBuilder->detach(); + // FIXME: It seems wrong that we would have a preload scanner here. + // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do. + m_preloadScanner.clear(); + m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. +} + +void HTMLDocumentParser::stopParsing() +{ + DocumentParser::stopParsing(); + m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. +} + +// This kicks off "Once the user agent stops parsing" as described by: +// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end +void HTMLDocumentParser::prepareToStopParsing() +{ + ASSERT(!hasInsertionPoint()); + + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. + RefPtr<HTMLDocumentParser> protect(this); + + // NOTE: This pump should only ever emit buffered character tokens, + // so ForceSynchronous vs. AllowYield should be meaningless. + pumpTokenizerIfPossible(ForceSynchronous); + + if (isStopped()) + return; + + DocumentParser::prepareToStopParsing(); + + // We will not have a scriptRunner when parsing a DocumentFragment. + if (m_scriptRunner) + document()->setReadyState(Document::Interactive); + + attemptToRunDeferredScriptsAndEnd(); +} + +bool HTMLDocumentParser::processingData() const +{ + return isScheduledForResume() || inWrite(); +} + +void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode) +{ + if (isStopped() || m_treeBuilder->isPaused()) + return; + + // Once a resume is scheduled, HTMLParserScheduler controls when we next pump. 
+ if (isScheduledForResume()) { + ASSERT(mode == AllowYield); + return; + } + + pumpTokenizer(mode); +} + +bool HTMLDocumentParser::isScheduledForResume() const +{ + return m_parserScheduler && m_parserScheduler->isScheduledForResume(); +} + +// Used by HTMLParserScheduler +void HTMLDocumentParser::resumeParsingAfterYield() +{ + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. + RefPtr<HTMLDocumentParser> protect(this); + + // We should never be here unless we can pump immediately. Call pumpTokenizer() + // directly so that ASSERTS will fire if we're wrong. + pumpTokenizer(AllowYield); + endIfDelayed(); +} + +bool HTMLDocumentParser::runScriptsForPausedTreeBuilder() +{ + ASSERT(m_treeBuilder->isPaused()); + + TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition(); + RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition); + // We will not have a scriptRunner when parsing a DocumentFragment. + if (!m_scriptRunner) + return true; + return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition); +} + +void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode) +{ + ASSERT(!isStopped()); + ASSERT(!m_treeBuilder->isPaused()); + ASSERT(!isScheduledForResume()); + // ASSERT that this object is both attached to the Document and protected. + ASSERT(refCount() >= 2); + + // We tell the InspectorInstrumentation about every pump, even if we + // end up pumping nothing. It can filter out empty pumps itself. + // FIXME: m_input.current().length() is only accurate if we + // end up parsing the whole buffer in this pump. We should pass how + // much we parsed as part of didWriteHTML instead of willWriteHTML. 
+ InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber()); + + HTMLParserScheduler::PumpSession session; + // FIXME: This loop body is now too long and needs cleanup. + // NOTE(review): ForceSynchronous short-circuits the scheduler check below. The + // DocumentFragment constructor never creates m_parserScheduler, so this + // presumably relies on fragment parsing never pumping with AllowYield -- confirm. + while (mode == ForceSynchronous || m_parserScheduler->shouldContinueParsing(session)) { + // FIXME: It's wrong for the HTMLDocumentParser to reach back to the + // Frame, but this approach is how the old parser handled + // stopping when the page assigns window.location. What really + // should happen is that assigning window.location causes the + // parser to stop parsing cleanly. The problem is we're not + // prepared to do that at every point where we run JavaScript. + if (!m_treeBuilder->isParsingFragment() + && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending()) + break; + if (!m_tokenizer->nextToken(m_input.current(), m_token)) + break; + + m_treeBuilder->constructTreeFromToken(m_token); + m_token.clear(); + + // JavaScript may have stopped or detached the parser. + if (isStopped()) + return; + + // The parser will pause itself when waiting on a script to load or run. + if (!m_treeBuilder->isPaused()) + continue; + + // If we're paused waiting for a script, we try to execute scripts before continuing. + bool shouldContinueParsing = runScriptsForPausedTreeBuilder(); + m_treeBuilder->setPaused(!shouldContinueParsing); + + // JavaScript may have stopped or detached the parser. + if (isStopped()) + return; + + if (!shouldContinueParsing) + break; + } + + // Ensure we haven't been totally deref'ed after pumping. Any caller of this + // function should be holding a RefPtr to this to ensure we weren't deleted.
+ ASSERT(refCount() >= 1); + + if (isWaitingForScripts()) { + ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState); + if (!m_preloadScanner) { + m_preloadScanner.set(new HTMLPreloadScanner(document())); + m_preloadScanner->appendToEnd(m_input.current()); + } + m_preloadScanner->scan(); + } + + InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber()); +} + +bool HTMLDocumentParser::hasInsertionPoint() +{ + return m_input.hasInsertionPoint(); +} + +void HTMLDocumentParser::insert(const SegmentedString& source) +{ + if (isStopped()) + return; + +#ifdef ANDROID_INSTRUMENT + android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter); +#endif + + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. + RefPtr<HTMLDocumentParser> protect(this); + + { + NestingLevelIncrementer nestingLevelIncrementer(m_writeNestingLevel); + + SegmentedString excludedLineNumberSource(source); + excludedLineNumberSource.setExcludeLineNumbers(); + m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource); + pumpTokenizerIfPossible(ForceSynchronous); + } + + endIfDelayed(); +} + +void HTMLDocumentParser::append(const SegmentedString& source) +{ + if (isStopped()) + return; + + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. + RefPtr<HTMLDocumentParser> protect(this); + + { + NestingLevelIncrementer nestingLevelIncrementer(m_writeNestingLevel); + + m_input.appendToEnd(source); + if (m_preloadScanner) + m_preloadScanner->appendToEnd(source); + + if (m_writeNestingLevel > 1) { + // We've gotten data off the network in a nested write. + // We don't want to consume any more of the input stream now. Do + // not worry. We'll consume this data in a less-nested write(). 
+#ifdef ANDROID_INSTRUMENT + android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); +#endif + return; + } + + pumpTokenizerIfPossible(AllowYield); + } + + endIfDelayed(); +#ifdef ANDROID_INSTRUMENT + android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); +#endif +} + +void HTMLDocumentParser::end() +{ + ASSERT(!isDetached()); + ASSERT(!isScheduledForResume()); + + // Informs the rest of WebCore that parsing is really finished (and deletes this). + m_treeBuilder->finished(); +} + +// Runs the scripts that are waiting for parsing to finish (when there is a +// script runner) and then calls end(). end() is skipped for now if +// executeScriptsWaitingForParsing() returns false. +void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd() +{ + ASSERT(isStopping()); + ASSERT(!hasInsertionPoint()); + if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing()) + return; + end(); +} + +void HTMLDocumentParser::attemptToEnd() +{ + // finish() indicates we will not receive any more data. If we are waiting on + // an external script to load, we can't finish parsing quite yet. + + // Remember that the end was postponed so that endIfDelayed() can retry it. + if (shouldDelayEnd()) { + m_endWasDelayed = true; + return; + } + prepareToStopParsing(); +} + +// Completes an end that attemptToEnd() postponed, once shouldDelayEnd() no +// longer holds. +void HTMLDocumentParser::endIfDelayed() +{ + // If we've already been detached, don't bother ending. + if (isDetached()) + return; + + if (!m_endWasDelayed || shouldDelayEnd()) + return; + + m_endWasDelayed = false; + prepareToStopParsing(); +} + +void HTMLDocumentParser::finish() +{ + // FIXME: We should ASSERT(!m_parserStopped) here, since it does not + // make sense to call any methods on DocumentParser once it's been stopped. + // However, FrameLoader::stop calls Document::finishParsing unconditionally + // which in turn calls m_parser->finish(). + + // We're not going to get any more data off the network, so we tell the + // input stream we've reached the end of file. finish() can be called more + // than once, if the first time does not call end().
+ if (!m_input.haveSeenEndOfFile()) + m_input.markEndOfFile(); + attemptToEnd(); +} + +bool HTMLDocumentParser::finishWasCalled() +{ + return m_input.haveSeenEndOfFile(); +} + +// This function is virtual and just for the DocumentParser interface. +bool HTMLDocumentParser::isExecutingScript() const +{ + return inScriptExecution(); +} + +// This function is non-virtual and used throughout the implementation. +bool HTMLDocumentParser::inScriptExecution() const +{ + if (!m_scriptRunner) + return false; + return m_scriptRunner->isExecutingScript(); +} + +int HTMLDocumentParser::lineNumber() const +{ + return m_tokenizer->lineNumber(); +} + +TextPosition0 HTMLDocumentParser::textPosition() const +{ + const SegmentedString& currentString = m_input.current(); + WTF::ZeroBasedNumber line = currentString.currentLine(); + WTF::ZeroBasedNumber column = currentString.currentColumn(); + ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt()); + + return TextPosition0(line, column); +} + +bool HTMLDocumentParser::isWaitingForScripts() const +{ + return m_treeBuilder->isPaused(); +} + +// Discards the preload scanner, pumps the tokenizer again and then finishes a +// delayed end if one is pending. +void HTMLDocumentParser::resumeParsingAfterScriptExecution() +{ + ASSERT(!inScriptExecution()); + ASSERT(!m_treeBuilder->isPaused()); + + m_preloadScanner.clear(); + pumpTokenizerIfPossible(AllowYield); + endIfDelayed(); +} + +void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript) +{ + ASSERT(!cachedScript->isLoaded()); + // addClient would call notifyFinished if the load were complete. + // Callers do not expect to be re-entered from this call, so they should + // not pass an already-loaded CachedResource.
+ cachedScript->addClient(this); +} + +void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript) +{ + cachedScript->removeClient(this); +} + +bool HTMLDocumentParser::shouldLoadExternalScriptFromSrc(const AtomicString& srcValue) +{ + if (!xssAuditor()) + return true; + return xssAuditor()->canLoadExternalScriptFromSrc(srcValue); +} + +void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource) +{ + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. + RefPtr<HTMLDocumentParser> protect(this); + + ASSERT(m_scriptRunner); + ASSERT(!inScriptExecution()); + if (isStopping()) { + attemptToRunDeferredScriptsAndEnd(); + return; + } + + ASSERT(m_treeBuilder->isPaused()); + // Note: We only ever wait on one script at a time, so we always know this + // is the one we were waiting on and can un-pause the tree builder. + m_treeBuilder->setPaused(false); + bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource); + m_treeBuilder->setPaused(!shouldContinueParsing); + if (shouldContinueParsing) + resumeParsingAfterScriptExecution(); +} + +void HTMLDocumentParser::executeScriptsWaitingForStylesheets() +{ + // Document only calls this when the Document owns the DocumentParser + // so this will not be called in the DocumentFragment case. + ASSERT(m_scriptRunner); + // Ignore calls unless we have a script blocking the parser waiting on a + // stylesheet load. Otherwise we are currently parsing and this + // is a re-entrant call from encountering a </ style> tag. + if (!m_scriptRunner->hasScriptsWaitingForStylesheets()) + return; + + // pumpTokenizer can cause this parser to be detached from the Document, + // but we need to ensure it isn't deleted yet. 
+ RefPtr<HTMLDocumentParser> protect(this); + + ASSERT(!m_scriptRunner->isExecutingScript()); + ASSERT(m_treeBuilder->isPaused()); + // Note: We only ever wait on one script at a time, so we always know this + // is the one we were waiting on and can un-pause the tree builder. + m_treeBuilder->setPaused(false); + bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets(); + m_treeBuilder->setPaused(!shouldContinueParsing); + if (shouldContinueParsing) + resumeParsingAfterScriptExecution(); +} + +ScriptController* HTMLDocumentParser::script() const +{ + return document()->frame() ? document()->frame()->script() : 0; +} + +void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) +{ + RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission); + parser->insert(source); // Use insert() so that the parser will not yield. + parser->finish(); + ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151> + parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction. +} + +bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document) +{ + ASSERT(document); + return document->settings() && document->settings()->usePreHTML5ParserQuirks(); +} + +void HTMLDocumentParser::suspendScheduledTasks() +{ + if (m_parserScheduler) + m_parserScheduler->suspend(); +} + +void HTMLDocumentParser::resumeScheduledTasks() +{ + if (m_parserScheduler) + m_parserScheduler->resume(); +} + +} diff --git a/Source/WebCore/html/parser/HTMLDocumentParser.h b/Source/WebCore/html/parser/HTMLDocumentParser.h new file mode 100644 index 0000000..80ca727 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLDocumentParser.h @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLDocumentParser_h +#define HTMLDocumentParser_h + +#include "CachedResourceClient.h" +#include "FragmentScriptingPermission.h" +#include "HTMLInputStream.h" +#include "HTMLScriptRunnerHost.h" +#include "HTMLToken.h" +#include "ScriptableDocumentParser.h" +#include "SegmentedString.h" +#include "Timer.h" +#include <wtf/OwnPtr.h> + +namespace WebCore { + +class Document; +class DocumentFragment; +class HTMLDocument; +class HTMLParserScheduler; +class HTMLTokenizer; +class HTMLScriptRunner; +class HTMLTreeBuilder; +class HTMLPreloadScanner; +class ScriptController; +class ScriptSourceCode; + +class HTMLDocumentParser : public ScriptableDocumentParser, HTMLScriptRunnerHost, CachedResourceClient { +public: + static PassRefPtr<HTMLDocumentParser> create(HTMLDocument* document, bool reportErrors) + { + return adoptRef(new HTMLDocumentParser(document, reportErrors)); + } + static PassRefPtr<HTMLDocumentParser> create(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission permission) + { + return adoptRef(new HTMLDocumentParser(fragment, contextElement, permission)); + } + + virtual ~HTMLDocumentParser(); + + // Exposed for HTMLParserScheduler + void resumeParsingAfterYield(); + + static void parseDocumentFragment(const String&, DocumentFragment*, Element* contextElement, FragmentScriptingPermission = FragmentScriptingAllowed); + + static bool usePreHTML5ParserQuirks(Document*); + + HTMLTokenizer* tokenizer() const { return m_tokenizer.get(); } + + virtual TextPosition0 textPosition() const; + virtual void suspendScheduledTasks(); + virtual void resumeScheduledTasks(); + +protected: + virtual void insert(const SegmentedString&); + virtual void append(const SegmentedString&); + virtual void finish(); + + HTMLDocumentParser(HTMLDocument*, bool reportErrors); + HTMLDocumentParser(DocumentFragment*, Element* contextElement, FragmentScriptingPermission); + + HTMLTreeBuilder* treeBuilder() const { return m_treeBuilder.get(); } + 
+private: + // DocumentParser + virtual void detach(); + virtual bool hasInsertionPoint(); + virtual bool finishWasCalled(); + virtual bool processingData() const; + virtual void prepareToStopParsing(); + virtual void stopParsing(); + virtual bool isWaitingForScripts() const; + virtual bool isExecutingScript() const; + virtual void executeScriptsWaitingForStylesheets(); + virtual int lineNumber() const; + + // HTMLScriptRunnerHost + virtual void watchForLoad(CachedResource*); + virtual void stopWatchingForLoad(CachedResource*); + virtual bool shouldLoadExternalScriptFromSrc(const AtomicString&); + virtual HTMLInputStream& inputStream() { return m_input; } + + // CachedResourceClient + virtual void notifyFinished(CachedResource*); + + // Selects how pumpTokenizer() loops: with AllowYield it asks the + // HTMLParserScheduler before each token whether to continue; with + // ForceSynchronous it never consults the scheduler. + enum SynchronousMode { + AllowYield, + ForceSynchronous, + }; + void pumpTokenizer(SynchronousMode); + void pumpTokenizerIfPossible(SynchronousMode); + + bool runScriptsForPausedTreeBuilder(); + void resumeParsingAfterScriptExecution(); + + // NOTE(review): begin() has no definition in the HTMLDocumentParser.cpp added + // by this patch -- presumably a leftover declaration; confirm before use. + void begin(); + void attemptToEnd(); + void endIfDelayed(); + void attemptToRunDeferredScriptsAndEnd(); + void end(); + + bool isScheduledForResume() const; + bool inScriptExecution() const; + bool inWrite() const { return m_writeNestingLevel > 0; } + bool shouldDelayEnd() const { return inWrite() || isWaitingForScripts() || inScriptExecution() || isScheduledForResume(); } + + ScriptController* script() const; + + HTMLInputStream m_input; + + // We hold m_token here because it might be partially complete.
+ HTMLToken m_token; + + OwnPtr<HTMLTokenizer> m_tokenizer; + OwnPtr<HTMLScriptRunner> m_scriptRunner; + OwnPtr<HTMLTreeBuilder> m_treeBuilder; + OwnPtr<HTMLPreloadScanner> m_preloadScanner; + OwnPtr<HTMLParserScheduler> m_parserScheduler; + + bool m_endWasDelayed; + unsigned m_writeNestingLevel; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLElementStack.cpp b/Source/WebCore/html/parser/HTMLElementStack.cpp new file mode 100644 index 0000000..6aab0f7 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLElementStack.cpp @@ -0,0 +1,569 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "HTMLElementStack.h" + +#include "Element.h" +#include "HTMLNames.h" +#include "MathMLNames.h" +#include "SVGNames.h" +#include <wtf/PassOwnPtr.h> + +namespace WebCore { + +using namespace HTMLNames; + +namespace { + +inline bool isNumberedHeaderElement(Element* element) +{ + return element->hasTagName(h1Tag) + || element->hasTagName(h2Tag) + || element->hasTagName(h3Tag) + || element->hasTagName(h4Tag) + || element->hasTagName(h5Tag) + || element->hasTagName(h6Tag); +} + +inline bool isScopeMarker(Element* element) +{ + return element->hasTagName(appletTag) + || element->hasTagName(captionTag) + || element->hasTagName(htmlTag) + || element->hasTagName(marqueeTag) + || element->hasTagName(objectTag) + || element->hasTagName(tableTag) + || element->hasTagName(tdTag) + || element->hasTagName(thTag) + || element->hasTagName(MathMLNames::miTag) + || element->hasTagName(MathMLNames::moTag) + || element->hasTagName(MathMLNames::mnTag) + || element->hasTagName(MathMLNames::msTag) + || element->hasTagName(MathMLNames::mtextTag) + || element->hasTagName(MathMLNames::annotation_xmlTag) + || element->hasTagName(SVGNames::foreignObjectTag) + || element->hasTagName(SVGNames::descTag) + || element->hasTagName(SVGNames::titleTag); +} + +inline bool isListItemScopeMarker(Element* element) +{ + return isScopeMarker(element) + || element->hasTagName(olTag) + || element->hasTagName(ulTag); +} + +inline bool isTableScopeMarker(Element* element) +{ + return element->hasTagName(tableTag) + || element->hasTagName(htmlTag); +} + +inline bool isTableBodyScopeMarker(Element* element) +{ + return element->hasTagName(tbodyTag) + || element->hasTagName(tfootTag) + || element->hasTagName(theadTag) + || element->hasTagName(htmlTag); +} + +inline bool isTableRowScopeMarker(Element* element) +{ + return element->hasTagName(trTag) + || element->hasTagName(htmlTag); +} + +inline bool isForeignContentScopeMarker(Element* element) +{ + return 
element->hasTagName(MathMLNames::miTag) + || element->hasTagName(MathMLNames::moTag) + || element->hasTagName(MathMLNames::mnTag) + || element->hasTagName(MathMLNames::msTag) + || element->hasTagName(MathMLNames::mtextTag) + || element->hasTagName(SVGNames::foreignObjectTag) + || element->hasTagName(SVGNames::descTag) + || element->hasTagName(SVGNames::titleTag) + || element->namespaceURI() == HTMLNames::xhtmlNamespaceURI; +} + +inline bool isButtonScopeMarker(Element* element) +{ + return isScopeMarker(element) + || element->hasTagName(buttonTag); +} + +inline bool isSelectScopeMarker(Element* element) +{ + return !element->hasTagName(optgroupTag) + && !element->hasTagName(optionTag); +} + +} + +HTMLElementStack::ElementRecord::ElementRecord(PassRefPtr<Element> element, PassOwnPtr<ElementRecord> next) + : m_element(element) + , m_next(next) +{ + ASSERT(m_element); +} + +HTMLElementStack::ElementRecord::~ElementRecord() +{ +} + +void HTMLElementStack::ElementRecord::replaceElement(PassRefPtr<Element> element) +{ + ASSERT(element); + // FIXME: Should this call finishParsingChildren? + m_element = element; +} + +// Returns true if |other| is reached by following next() from this record; a +// record is never above itself. +bool HTMLElementStack::ElementRecord::isAbove(ElementRecord* other) const +{ + for (ElementRecord* below = next(); below; below = below->next()) { + if (below == other) + return true; + } + return false; +} + +HTMLElementStack::HTMLElementStack() + : m_htmlElement(0) + , m_headElement(0) + , m_bodyElement(0) +{ +} + +HTMLElementStack::~HTMLElementStack() +{ +} + +bool HTMLElementStack::hasOnlyOneElement() const +{ + return !topRecord()->next(); +} + +bool HTMLElementStack::secondElementIsHTMLBodyElement() const +{ + // This is used in the fragment case of <body> and <frameset> in the "in body" + // insertion mode.
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody + ASSERT(m_htmlElement); + // If we have a body element, it must always be the second element on the + // stack, as we always start with an html element, and any other element + // would cause the implicit creation of a body element. + return !!m_bodyElement; +} + +void HTMLElementStack::popHTMLHeadElement() +{ + ASSERT(top() == m_headElement); + m_headElement = 0; + popCommon(); +} + +void HTMLElementStack::popHTMLBodyElement() +{ + ASSERT(top() == m_bodyElement); + m_bodyElement = 0; + popCommon(); +} + +// Clears the cached html/head/body pointers, then unwinds the stack from the +// top, calling finishParsingChildren() on each element before releasing its +// record. +void HTMLElementStack::popAll() +{ + m_htmlElement = 0; + m_headElement = 0; + m_bodyElement = 0; + while (m_top) { + top()->finishParsingChildren(); + m_top = m_top->releaseNext(); + } +} + +void HTMLElementStack::pop() +{ + ASSERT(!top()->hasTagName(HTMLNames::headTag)); + popCommon(); +} + +void HTMLElementStack::popUntil(const AtomicString& tagName) +{ + while (!top()->hasLocalName(tagName)) { + // pop() will ASSERT at <body> if callers fail to check that there is an + // element with localName |tagName| on the stack of open elements. + // NOTE(review): pop() as written asserts when the top is <head>, not + // <body> -- confirm which element the sentence above is meant to name.
+ pop(); + } +} + +void HTMLElementStack::popUntilPopped(const AtomicString& tagName) +{ + popUntil(tagName); + pop(); +} + +void HTMLElementStack::popUntilNumberedHeaderElementPopped() +{ + while (!isNumberedHeaderElement(top())) + pop(); + pop(); +} + +void HTMLElementStack::popUntil(Element* element) +{ + while (top() != element) + pop(); +} + +void HTMLElementStack::popUntilPopped(Element* element) +{ + popUntil(element); + pop(); +} + +void HTMLElementStack::popUntilTableScopeMarker() +{ + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#clear-the-stack-back-to-a-table-context + while (!isTableScopeMarker(top())) + pop(); +} + +void HTMLElementStack::popUntilTableBodyScopeMarker() +{ + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#clear-the-stack-back-to-a-table-body-context + while (!isTableBodyScopeMarker(top())) + pop(); +} + +void HTMLElementStack::popUntilTableRowScopeMarker() +{ + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#clear-the-stack-back-to-a-table-row-context + while (!isTableRowScopeMarker(top())) + pop(); +} + +void HTMLElementStack::popUntilForeignContentScopeMarker() +{ + while (!isForeignContentScopeMarker(top())) + pop(); +} + +void HTMLElementStack::pushHTMLHtmlElement(PassRefPtr<Element> element) +{ + ASSERT(!m_top); // <html> should always be the bottom of the stack. 
+ ASSERT(element->hasTagName(HTMLNames::htmlTag)); + ASSERT(!m_htmlElement); + m_htmlElement = element.get(); + pushCommon(element); +} + +void HTMLElementStack::pushHTMLHeadElement(PassRefPtr<Element> element) +{ + ASSERT(element->hasTagName(HTMLNames::headTag)); + ASSERT(!m_headElement); + m_headElement = element.get(); + pushCommon(element); +} + +void HTMLElementStack::pushHTMLBodyElement(PassRefPtr<Element> element) +{ + ASSERT(element->hasTagName(HTMLNames::bodyTag)); + ASSERT(!m_bodyElement); + m_bodyElement = element.get(); + pushCommon(element); +} + +void HTMLElementStack::push(PassRefPtr<Element> element) +{ + ASSERT(!element->hasTagName(HTMLNames::htmlTag)); + ASSERT(!element->hasTagName(HTMLNames::headTag)); + ASSERT(!element->hasTagName(HTMLNames::bodyTag)); + ASSERT(m_htmlElement); + pushCommon(element); +} + +void HTMLElementStack::insertAbove(PassRefPtr<Element> element, ElementRecord* recordBelow) +{ + ASSERT(element); + ASSERT(recordBelow); + ASSERT(m_top); + ASSERT(!element->hasTagName(HTMLNames::htmlTag)); + ASSERT(!element->hasTagName(HTMLNames::headTag)); + ASSERT(!element->hasTagName(HTMLNames::bodyTag)); + ASSERT(m_htmlElement); + if (recordBelow == m_top) { + push(element); + return; + } + + for (ElementRecord* recordAbove = m_top.get(); recordAbove; recordAbove = recordAbove->next()) { + if (recordAbove->next() != recordBelow) + continue; + + recordAbove->setNext(adoptPtr(new ElementRecord(element, recordAbove->releaseNext()))); + recordAbove->next()->element()->beginParsingChildren(); + return; + } + ASSERT_NOT_REACHED(); +} + +HTMLElementStack::ElementRecord* HTMLElementStack::topRecord() const +{ + ASSERT(m_top); + return m_top.get(); +} + +Element* HTMLElementStack::oneBelowTop() const +{ + // We should never be calling this if it could be 0. 
+ ASSERT(m_top); + ASSERT(m_top->next()); + return m_top->next()->element(); +} + +Element* HTMLElementStack::bottom() const +{ + return htmlElement(); +} + +void HTMLElementStack::removeHTMLHeadElement(Element* element) +{ + ASSERT(m_headElement == element); + if (m_top->element() == element) { + popHTMLHeadElement(); + return; + } + m_headElement = 0; + removeNonTopCommon(element); +} + +void HTMLElementStack::remove(Element* element) +{ + ASSERT(!element->hasTagName(HTMLNames::headTag)); + if (m_top->element() == element) { + pop(); + return; + } + removeNonTopCommon(element); +} + +HTMLElementStack::ElementRecord* HTMLElementStack::find(Element* element) const +{ + for (ElementRecord* pos = m_top.get(); pos; pos = pos->next()) { + if (pos->element() == element) + return pos; + } + return 0; +} + +HTMLElementStack::ElementRecord* HTMLElementStack::topmost(const AtomicString& tagName) const +{ + for (ElementRecord* pos = m_top.get(); pos; pos = pos->next()) { + if (pos->element()->hasLocalName(tagName)) + return pos; + } + return 0; +} + +bool HTMLElementStack::contains(Element* element) const +{ + return !!find(element); +} + +bool HTMLElementStack::contains(const AtomicString& tagName) const +{ + return !!topmost(tagName); +} + +template <bool isMarker(Element*)> +bool inScopeCommon(HTMLElementStack::ElementRecord* top, const AtomicString& targetTag) +{ + for (HTMLElementStack::ElementRecord* pos = top; pos; pos = pos->next()) { + Element* element = pos->element(); + if (element->hasLocalName(targetTag)) + return true; + if (isMarker(element)) + return false; + } + ASSERT_NOT_REACHED(); // <html> is always on the stack and is a scope marker. 
+ return false; +} + +bool HTMLElementStack::hasOnlyHTMLElementsInScope() const +{ + for (ElementRecord* record = m_top.get(); record; record = record->next()) { + Element* element = record->element(); + if (element->namespaceURI() != xhtmlNamespaceURI) + return false; + if (isScopeMarker(element)) + return true; + } + ASSERT_NOT_REACHED(); // <html> is always on the stack and is a scope marker. + return true; +} + +bool HTMLElementStack::hasNumberedHeaderElementInScope() const +{ + for (ElementRecord* record = m_top.get(); record; record = record->next()) { + Element* element = record->element(); + if (isNumberedHeaderElement(element)) + return true; + if (isScopeMarker(element)) + return false; + } + ASSERT_NOT_REACHED(); // <html> is always on the stack and is a scope marker. + return false; +} + +bool HTMLElementStack::inScope(Element* targetElement) const +{ + for (ElementRecord* pos = m_top.get(); pos; pos = pos->next()) { + Element* element = pos->element(); + if (element == targetElement) + return true; + if (isScopeMarker(element)) + return false; + } + ASSERT_NOT_REACHED(); // <html> is always on the stack and is a scope marker. + return false; +} + +bool HTMLElementStack::inScope(const AtomicString& targetTag) const +{ + return inScopeCommon<isScopeMarker>(m_top.get(), targetTag); +} + +bool HTMLElementStack::inScope(const QualifiedName& tagName) const +{ + // FIXME: Is localName() right for non-html elements? + return inScope(tagName.localName()); +} + +bool HTMLElementStack::inListItemScope(const AtomicString& targetTag) const +{ + return inScopeCommon<isListItemScopeMarker>(m_top.get(), targetTag); +} + +bool HTMLElementStack::inListItemScope(const QualifiedName& tagName) const +{ + // FIXME: Is localName() right for non-html elements? 
+ return inListItemScope(tagName.localName()); +} + +bool HTMLElementStack::inTableScope(const AtomicString& targetTag) const +{ + return inScopeCommon<isTableScopeMarker>(m_top.get(), targetTag); +} + +bool HTMLElementStack::inTableScope(const QualifiedName& tagName) const +{ + // FIXME: Is localName() right for non-html elements? + return inTableScope(tagName.localName()); +} + +bool HTMLElementStack::inButtonScope(const AtomicString& targetTag) const +{ + return inScopeCommon<isButtonScopeMarker>(m_top.get(), targetTag); +} + +bool HTMLElementStack::inButtonScope(const QualifiedName& tagName) const +{ + // FIXME: Is localName() right for non-html elements? + return inButtonScope(tagName.localName()); +} + +bool HTMLElementStack::inSelectScope(const AtomicString& targetTag) const +{ + return inScopeCommon<isSelectScopeMarker>(m_top.get(), targetTag); +} + +bool HTMLElementStack::inSelectScope(const QualifiedName& tagName) const +{ + // FIXME: Is localName() right for non-html elements? 
+ return inSelectScope(tagName.localName()); +} + +Element* HTMLElementStack::htmlElement() const +{ + ASSERT(m_htmlElement); + return m_htmlElement; +} + +Element* HTMLElementStack::headElement() const +{ + ASSERT(m_headElement); + return m_headElement; +} + +Element* HTMLElementStack::bodyElement() const +{ + ASSERT(m_bodyElement); + return m_bodyElement; +} + +void HTMLElementStack::pushCommon(PassRefPtr<Element> element) +{ + ASSERT(m_htmlElement); + m_top = adoptPtr(new ElementRecord(element, m_top.release())); + top()->beginParsingChildren(); +} + +void HTMLElementStack::popCommon() +{ + ASSERT(!top()->hasTagName(HTMLNames::htmlTag)); + ASSERT(!top()->hasTagName(HTMLNames::headTag) || !m_headElement); + ASSERT(!top()->hasTagName(HTMLNames::bodyTag) || !m_bodyElement); + top()->finishParsingChildren(); + m_top = m_top->releaseNext(); +} + +void HTMLElementStack::removeNonTopCommon(Element* element) +{ + ASSERT(!element->hasTagName(HTMLNames::htmlTag)); + ASSERT(!element->hasTagName(HTMLNames::bodyTag)); + ASSERT(top() != element); + for (ElementRecord* pos = m_top.get(); pos; pos = pos->next()) { + if (pos->next()->element() == element) { + // FIXME: Is it OK to call finishParsingChildren() + // when the children aren't actually finished? + element->finishParsingChildren(); + pos->setNext(pos->next()->releaseNext()); + return; + } + } + ASSERT_NOT_REACHED(); +} + +#ifndef NDEBUG + +void HTMLElementStack::show() +{ + for (ElementRecord* record = m_top.get(); record; record = record->next()) + record->element()->showNode(); +} + +#endif + +} diff --git a/Source/WebCore/html/parser/HTMLElementStack.h b/Source/WebCore/html/parser/HTMLElementStack.h new file mode 100644 index 0000000..8a8e160 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLElementStack.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLElementStack_h +#define HTMLElementStack_h + +#include <wtf/Forward.h> +#include <wtf/Noncopyable.h> +#include <wtf/OwnPtr.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/RefPtr.h> + +namespace WebCore { + +class Element; +class QualifiedName; + +// NOTE: The HTML5 spec uses a backwards (grows downward) stack. We're using +// more standard (grows upwards) stack terminology here. 
+class HTMLElementStack : public Noncopyable { +public: + HTMLElementStack(); + ~HTMLElementStack(); + + class ElementRecord : public Noncopyable { + public: + ~ElementRecord(); // Public for ~PassOwnPtr() + + Element* element() const { return m_element.get(); } + void replaceElement(PassRefPtr<Element>); + + bool isAbove(ElementRecord*) const; + + ElementRecord* next() const { return m_next.get(); } + + private: + friend class HTMLElementStack; + + ElementRecord(PassRefPtr<Element>, PassOwnPtr<ElementRecord>); + + PassOwnPtr<ElementRecord> releaseNext() { return m_next.release(); } + void setNext(PassOwnPtr<ElementRecord> next) { m_next = next; } + + RefPtr<Element> m_element; + OwnPtr<ElementRecord> m_next; + }; + + // Inlining this function is a (small) performance win on the parsing + // benchmark. + Element* top() const + { + ASSERT(m_top->element()); + return m_top->element(); + } + + Element* oneBelowTop() const; + ElementRecord* topRecord() const; + Element* bottom() const; + ElementRecord* find(Element*) const; + ElementRecord* topmost(const AtomicString& tagName) const; + + void insertAbove(PassRefPtr<Element>, ElementRecord*); + + void push(PassRefPtr<Element>); + void pushHTMLHtmlElement(PassRefPtr<Element>); + void pushHTMLHeadElement(PassRefPtr<Element>); + void pushHTMLBodyElement(PassRefPtr<Element>); + + void pop(); + void popUntil(const AtomicString& tagName); + void popUntil(Element*); + void popUntilPopped(const AtomicString& tagName); + void popUntilPopped(Element*); + void popUntilNumberedHeaderElementPopped(); + void popUntilTableScopeMarker(); // "clear the stack back to a table context" in the spec. + void popUntilTableBodyScopeMarker(); // "clear the stack back to a table body context" in the spec. + void popUntilTableRowScopeMarker(); // "clear the stack back to a table row context" in the spec. 
+ void popUntilForeignContentScopeMarker(); + void popHTMLHeadElement(); + void popHTMLBodyElement(); + void popAll(); + + void remove(Element*); + void removeHTMLHeadElement(Element*); + + bool contains(Element*) const; + bool contains(const AtomicString& tagName) const; + + bool inScope(Element*) const; + bool inScope(const AtomicString& tagName) const; + bool inScope(const QualifiedName&) const; + bool inListItemScope(const AtomicString& tagName) const; + bool inListItemScope(const QualifiedName&) const; + bool inTableScope(const AtomicString& tagName) const; + bool inTableScope(const QualifiedName&) const; + bool inButtonScope(const AtomicString& tagName) const; + bool inButtonScope(const QualifiedName&) const; + bool inSelectScope(const AtomicString& tagName) const; + bool inSelectScope(const QualifiedName&) const; + + bool hasOnlyHTMLElementsInScope() const; + bool hasNumberedHeaderElementInScope() const; + + bool hasOnlyOneElement() const; + bool secondElementIsHTMLBodyElement() const; + + Element* htmlElement() const; + Element* headElement() const; + Element* bodyElement() const; + +#ifndef NDEBUG + void show(); +#endif + +private: + void pushCommon(PassRefPtr<Element>); + void popCommon(); + void removeNonTopCommon(Element*); + + OwnPtr<ElementRecord> m_top; + + // We remember <html>, <head> and <body> as they are pushed. Their + // ElementRecords keep them alive. <html> is never popped. + // FIXME: We don't currently require type-specific information about + // these elements so we haven't yet bothered to plumb the types all the + // way down through createElement, etc. 
+ Element* m_htmlElement; + Element* m_headElement; + Element* m_bodyElement; +}; + +} // namespace WebCore + +#endif // HTMLElementStack_h diff --git a/Source/WebCore/html/parser/HTMLEntityNames.in b/Source/WebCore/html/parser/HTMLEntityNames.in new file mode 100644 index 0000000..2d42ab2 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntityNames.in @@ -0,0 +1,2138 @@ +"AElig;","U+000C6" +"AElig","U+000C6" +"AMP;","U+00026" +"AMP","U+00026" +"Aacute;","U+000C1" +"Aacute","U+000C1" +"Abreve;","U+00102" +"Acirc;","U+000C2" +"Acirc","U+000C2" +"Acy;","U+00410" +"Afr;","U+1D504" +"Agrave;","U+000C0" +"Agrave","U+000C0" +"Alpha;","U+00391" +"Amacr;","U+00100" +"And;","U+02A53" +"Aogon;","U+00104" +"Aopf;","U+1D538" +"ApplyFunction;","U+02061" +"Aring;","U+000C5" +"Aring","U+000C5" +"Ascr;","U+1D49C" +"Assign;","U+02254" +"Atilde;","U+000C3" +"Atilde","U+000C3" +"Auml;","U+000C4" +"Auml","U+000C4" +"Backslash;","U+02216" +"Barv;","U+02AE7" +"Barwed;","U+02306" +"Bcy;","U+00411" +"Because;","U+02235" +"Bernoullis;","U+0212C" +"Beta;","U+00392" +"Bfr;","U+1D505" +"Bopf;","U+1D539" +"Breve;","U+002D8" +"Bscr;","U+0212C" +"Bumpeq;","U+0224E" +"CHcy;","U+00427" +"COPY;","U+000A9" +"COPY","U+000A9" +"Cacute;","U+00106" +"Cap;","U+022D2" +"CapitalDifferentialD;","U+02145" +"Cayleys;","U+0212D" +"Ccaron;","U+0010C" +"Ccedil;","U+000C7" +"Ccedil","U+000C7" +"Ccirc;","U+00108" +"Cconint;","U+02230" +"Cdot;","U+0010A" +"Cedilla;","U+000B8" +"CenterDot;","U+000B7" +"Cfr;","U+0212D" +"Chi;","U+003A7" +"CircleDot;","U+02299" +"CircleMinus;","U+02296" +"CirclePlus;","U+02295" +"CircleTimes;","U+02297" +"ClockwiseContourIntegral;","U+02232" +"CloseCurlyDoubleQuote;","U+0201D" +"CloseCurlyQuote;","U+02019" +"Colon;","U+02237" +"Colone;","U+02A74" +"Congruent;","U+02261" +"Conint;","U+0222F" +"ContourIntegral;","U+0222E" +"Copf;","U+02102" +"Coproduct;","U+02210" +"CounterClockwiseContourIntegral;","U+02233" +"Cross;","U+02A2F" +"Cscr;","U+1D49E" +"Cup;","U+022D3" 
+"CupCap;","U+0224D" +"DD;","U+02145" +"DDotrahd;","U+02911" +"DJcy;","U+00402" +"DScy;","U+00405" +"DZcy;","U+0040F" +"Dagger;","U+02021" +"Darr;","U+021A1" +"Dashv;","U+02AE4" +"Dcaron;","U+0010E" +"Dcy;","U+00414" +"Del;","U+02207" +"Delta;","U+00394" +"Dfr;","U+1D507" +"DiacriticalAcute;","U+000B4" +"DiacriticalDot;","U+002D9" +"DiacriticalDoubleAcute;","U+002DD" +"DiacriticalGrave;","U+00060" +"DiacriticalTilde;","U+002DC" +"Diamond;","U+022C4" +"DifferentialD;","U+02146" +"Dopf;","U+1D53B" +"Dot;","U+000A8" +"DotDot;","U+020DC" +"DotEqual;","U+02250" +"DoubleContourIntegral;","U+0222F" +"DoubleDot;","U+000A8" +"DoubleDownArrow;","U+021D3" +"DoubleLeftArrow;","U+021D0" +"DoubleLeftRightArrow;","U+021D4" +"DoubleLeftTee;","U+02AE4" +"DoubleLongLeftArrow;","U+027F8" +"DoubleLongLeftRightArrow;","U+027FA" +"DoubleLongRightArrow;","U+027F9" +"DoubleRightArrow;","U+021D2" +"DoubleRightTee;","U+022A8" +"DoubleUpArrow;","U+021D1" +"DoubleUpDownArrow;","U+021D5" +"DoubleVerticalBar;","U+02225" +"DownArrow;","U+02193" +"DownArrowBar;","U+02913" +"DownArrowUpArrow;","U+021F5" +"DownBreve;","U+00311" +"DownLeftRightVector;","U+02950" +"DownLeftTeeVector;","U+0295E" +"DownLeftVector;","U+021BD" +"DownLeftVectorBar;","U+02956" +"DownRightTeeVector;","U+0295F" +"DownRightVector;","U+021C1" +"DownRightVectorBar;","U+02957" +"DownTee;","U+022A4" +"DownTeeArrow;","U+021A7" +"Downarrow;","U+021D3" +"Dscr;","U+1D49F" +"Dstrok;","U+00110" +"ENG;","U+0014A" +"ETH;","U+000D0" +"ETH","U+000D0" +"Eacute;","U+000C9" +"Eacute","U+000C9" +"Ecaron;","U+0011A" +"Ecirc;","U+000CA" +"Ecirc","U+000CA" +"Ecy;","U+0042D" +"Edot;","U+00116" +"Efr;","U+1D508" +"Egrave;","U+000C8" +"Egrave","U+000C8" +"Element;","U+02208" +"Emacr;","U+00112" +"EmptySmallSquare;","U+025FB" +"EmptyVerySmallSquare;","U+025AB" +"Eogon;","U+00118" +"Eopf;","U+1D53C" +"Epsilon;","U+00395" +"Equal;","U+02A75" +"EqualTilde;","U+02242" +"Equilibrium;","U+021CC" +"Escr;","U+02130" +"Esim;","U+02A73" +"Eta;","U+00397" 
+"Euml;","U+000CB" +"Euml","U+000CB" +"Exists;","U+02203" +"ExponentialE;","U+02147" +"Fcy;","U+00424" +"Ffr;","U+1D509" +"FilledSmallSquare;","U+025FC" +"FilledVerySmallSquare;","U+025AA" +"Fopf;","U+1D53D" +"ForAll;","U+02200" +"Fouriertrf;","U+02131" +"Fscr;","U+02131" +"GJcy;","U+00403" +"GT;","U+0003E" +"GT","U+0003E" +"Gamma;","U+00393" +"Gammad;","U+003DC" +"Gbreve;","U+0011E" +"Gcedil;","U+00122" +"Gcirc;","U+0011C" +"Gcy;","U+00413" +"Gdot;","U+00120" +"Gfr;","U+1D50A" +"Gg;","U+022D9" +"Gopf;","U+1D53E" +"GreaterEqual;","U+02265" +"GreaterEqualLess;","U+022DB" +"GreaterFullEqual;","U+02267" +"GreaterGreater;","U+02AA2" +"GreaterLess;","U+02277" +"GreaterSlantEqual;","U+02A7E" +"GreaterTilde;","U+02273" +"Gscr;","U+1D4A2" +"Gt;","U+0226B" +"HARDcy;","U+0042A" +"Hacek;","U+002C7" +"Hat;","U+0005E" +"Hcirc;","U+00124" +"Hfr;","U+0210C" +"HilbertSpace;","U+0210B" +"Hopf;","U+0210D" +"HorizontalLine;","U+02500" +"Hscr;","U+0210B" +"Hstrok;","U+00126" +"HumpDownHump;","U+0224E" +"HumpEqual;","U+0224F" +"IEcy;","U+00415" +"IJlig;","U+00132" +"IOcy;","U+00401" +"Iacute;","U+000CD" +"Iacute","U+000CD" +"Icirc;","U+000CE" +"Icirc","U+000CE" +"Icy;","U+00418" +"Idot;","U+00130" +"Ifr;","U+02111" +"Igrave;","U+000CC" +"Igrave","U+000CC" +"Im;","U+02111" +"Imacr;","U+0012A" +"ImaginaryI;","U+02148" +"Implies;","U+021D2" +"Int;","U+0222C" +"Integral;","U+0222B" +"Intersection;","U+022C2" +"InvisibleComma;","U+02063" +"InvisibleTimes;","U+02062" +"Iogon;","U+0012E" +"Iopf;","U+1D540" +"Iota;","U+00399" +"Iscr;","U+02110" +"Itilde;","U+00128" +"Iukcy;","U+00406" +"Iuml;","U+000CF" +"Iuml","U+000CF" +"Jcirc;","U+00134" +"Jcy;","U+00419" +"Jfr;","U+1D50D" +"Jopf;","U+1D541" +"Jscr;","U+1D4A5" +"Jsercy;","U+00408" +"Jukcy;","U+00404" +"KHcy;","U+00425" +"KJcy;","U+0040C" +"Kappa;","U+0039A" +"Kcedil;","U+00136" +"Kcy;","U+0041A" +"Kfr;","U+1D50E" +"Kopf;","U+1D542" +"Kscr;","U+1D4A6" +"LJcy;","U+00409" +"LT;","U+0003C" +"LT","U+0003C" +"Lacute;","U+00139" 
+"Lambda;","U+0039B" +"Lang;","U+027EA" +"Laplacetrf;","U+02112" +"Larr;","U+0219E" +"Lcaron;","U+0013D" +"Lcedil;","U+0013B" +"Lcy;","U+0041B" +"LeftAngleBracket;","U+027E8" +"LeftArrow;","U+02190" +"LeftArrowBar;","U+021E4" +"LeftArrowRightArrow;","U+021C6" +"LeftCeiling;","U+02308" +"LeftDoubleBracket;","U+027E6" +"LeftDownTeeVector;","U+02961" +"LeftDownVector;","U+021C3" +"LeftDownVectorBar;","U+02959" +"LeftFloor;","U+0230A" +"LeftRightArrow;","U+02194" +"LeftRightVector;","U+0294E" +"LeftTee;","U+022A3" +"LeftTeeArrow;","U+021A4" +"LeftTeeVector;","U+0295A" +"LeftTriangle;","U+022B2" +"LeftTriangleBar;","U+029CF" +"LeftTriangleEqual;","U+022B4" +"LeftUpDownVector;","U+02951" +"LeftUpTeeVector;","U+02960" +"LeftUpVector;","U+021BF" +"LeftUpVectorBar;","U+02958" +"LeftVector;","U+021BC" +"LeftVectorBar;","U+02952" +"Leftarrow;","U+021D0" +"Leftrightarrow;","U+021D4" +"LessEqualGreater;","U+022DA" +"LessFullEqual;","U+02266" +"LessGreater;","U+02276" +"LessLess;","U+02AA1" +"LessSlantEqual;","U+02A7D" +"LessTilde;","U+02272" +"Lfr;","U+1D50F" +"Ll;","U+022D8" +"Lleftarrow;","U+021DA" +"Lmidot;","U+0013F" +"LongLeftArrow;","U+027F5" +"LongLeftRightArrow;","U+027F7" +"LongRightArrow;","U+027F6" +"Longleftarrow;","U+027F8" +"Longleftrightarrow;","U+027FA" +"Longrightarrow;","U+027F9" +"Lopf;","U+1D543" +"LowerLeftArrow;","U+02199" +"LowerRightArrow;","U+02198" +"Lscr;","U+02112" +"Lsh;","U+021B0" +"Lstrok;","U+00141" +"Lt;","U+0226A" +"Map;","U+02905" +"Mcy;","U+0041C" +"MediumSpace;","U+0205F" +"Mellintrf;","U+02133" +"Mfr;","U+1D510" +"MinusPlus;","U+02213" +"Mopf;","U+1D544" +"Mscr;","U+02133" +"Mu;","U+0039C" +"NJcy;","U+0040A" +"Nacute;","U+00143" +"Ncaron;","U+00147" +"Ncedil;","U+00145" +"Ncy;","U+0041D" +"NegativeMediumSpace;","U+0200B" +"NegativeThickSpace;","U+0200B" +"NegativeThinSpace;","U+0200B" +"NegativeVeryThinSpace;","U+0200B" +"NestedGreaterGreater;","U+0226B" +"NestedLessLess;","U+0226A" +"NewLine;","U+0000A" +"Nfr;","U+1D511" 
+"NoBreak;","U+02060" +"NonBreakingSpace;","U+000A0" +"Nopf;","U+02115" +"Not;","U+02AEC" +"NotCongruent;","U+02262" +"NotCupCap;","U+0226D" +"NotDoubleVerticalBar;","U+02226" +"NotElement;","U+02209" +"NotEqual;","U+02260" +"NotExists;","U+02204" +"NotGreater;","U+0226F" +"NotGreaterEqual;","U+02271" +"NotGreaterLess;","U+02279" +"NotGreaterTilde;","U+02275" +"NotLeftTriangle;","U+022EA" +"NotLeftTriangleEqual;","U+022EC" +"NotLess;","U+0226E" +"NotLessEqual;","U+02270" +"NotLessGreater;","U+02278" +"NotLessTilde;","U+02274" +"NotPrecedes;","U+02280" +"NotPrecedesSlantEqual;","U+022E0" +"NotReverseElement;","U+0220C" +"NotRightTriangle;","U+022EB" +"NotRightTriangleEqual;","U+022ED" +"NotSquareSubsetEqual;","U+022E2" +"NotSquareSupersetEqual;","U+022E3" +"NotSubsetEqual;","U+02288" +"NotSucceeds;","U+02281" +"NotSucceedsSlantEqual;","U+022E1" +"NotSupersetEqual;","U+02289" +"NotTilde;","U+02241" +"NotTildeEqual;","U+02244" +"NotTildeFullEqual;","U+02247" +"NotTildeTilde;","U+02249" +"NotVerticalBar;","U+02224" +"Nscr;","U+1D4A9" +"Ntilde;","U+000D1" +"Ntilde","U+000D1" +"Nu;","U+0039D" +"OElig;","U+00152" +"Oacute;","U+000D3" +"Oacute","U+000D3" +"Ocirc;","U+000D4" +"Ocirc","U+000D4" +"Ocy;","U+0041E" +"Odblac;","U+00150" +"Ofr;","U+1D512" +"Ograve;","U+000D2" +"Ograve","U+000D2" +"Omacr;","U+0014C" +"Omega;","U+003A9" +"Omicron;","U+0039F" +"Oopf;","U+1D546" +"OpenCurlyDoubleQuote;","U+0201C" +"OpenCurlyQuote;","U+02018" +"Or;","U+02A54" +"Oscr;","U+1D4AA" +"Oslash;","U+000D8" +"Oslash","U+000D8" +"Otilde;","U+000D5" +"Otilde","U+000D5" +"Otimes;","U+02A37" +"Ouml;","U+000D6" +"Ouml","U+000D6" +"OverBar;","U+0203E" +"OverBrace;","U+023DE" +"OverBracket;","U+023B4" +"OverParenthesis;","U+023DC" +"PartialD;","U+02202" +"Pcy;","U+0041F" +"Pfr;","U+1D513" +"Phi;","U+003A6" +"Pi;","U+003A0" +"PlusMinus;","U+000B1" +"Poincareplane;","U+0210C" +"Popf;","U+02119" +"Pr;","U+02ABB" +"Precedes;","U+0227A" +"PrecedesEqual;","U+02AAF" +"PrecedesSlantEqual;","U+0227C" 
+"PrecedesTilde;","U+0227E" +"Prime;","U+02033" +"Product;","U+0220F" +"Proportion;","U+02237" +"Proportional;","U+0221D" +"Pscr;","U+1D4AB" +"Psi;","U+003A8" +"QUOT;","U+00022" +"QUOT","U+00022" +"Qfr;","U+1D514" +"Qopf;","U+0211A" +"Qscr;","U+1D4AC" +"RBarr;","U+02910" +"REG;","U+000AE" +"REG","U+000AE" +"Racute;","U+00154" +"Rang;","U+027EB" +"Rarr;","U+021A0" +"Rarrtl;","U+02916" +"Rcaron;","U+00158" +"Rcedil;","U+00156" +"Rcy;","U+00420" +"Re;","U+0211C" +"ReverseElement;","U+0220B" +"ReverseEquilibrium;","U+021CB" +"ReverseUpEquilibrium;","U+0296F" +"Rfr;","U+0211C" +"Rho;","U+003A1" +"RightAngleBracket;","U+027E9" +"RightArrow;","U+02192" +"RightArrowBar;","U+021E5" +"RightArrowLeftArrow;","U+021C4" +"RightCeiling;","U+02309" +"RightDoubleBracket;","U+027E7" +"RightDownTeeVector;","U+0295D" +"RightDownVector;","U+021C2" +"RightDownVectorBar;","U+02955" +"RightFloor;","U+0230B" +"RightTee;","U+022A2" +"RightTeeArrow;","U+021A6" +"RightTeeVector;","U+0295B" +"RightTriangle;","U+022B3" +"RightTriangleBar;","U+029D0" +"RightTriangleEqual;","U+022B5" +"RightUpDownVector;","U+0294F" +"RightUpTeeVector;","U+0295C" +"RightUpVector;","U+021BE" +"RightUpVectorBar;","U+02954" +"RightVector;","U+021C0" +"RightVectorBar;","U+02953" +"Rightarrow;","U+021D2" +"Ropf;","U+0211D" +"RoundImplies;","U+02970" +"Rrightarrow;","U+021DB" +"Rscr;","U+0211B" +"Rsh;","U+021B1" +"RuleDelayed;","U+029F4" +"SHCHcy;","U+00429" +"SHcy;","U+00428" +"SOFTcy;","U+0042C" +"Sacute;","U+0015A" +"Sc;","U+02ABC" +"Scaron;","U+00160" +"Scedil;","U+0015E" +"Scirc;","U+0015C" +"Scy;","U+00421" +"Sfr;","U+1D516" +"ShortDownArrow;","U+02193" +"ShortLeftArrow;","U+02190" +"ShortRightArrow;","U+02192" +"ShortUpArrow;","U+02191" +"Sigma;","U+003A3" +"SmallCircle;","U+02218" +"Sopf;","U+1D54A" +"Sqrt;","U+0221A" +"Square;","U+025A1" +"SquareIntersection;","U+02293" +"SquareSubset;","U+0228F" +"SquareSubsetEqual;","U+02291" +"SquareSuperset;","U+02290" +"SquareSupersetEqual;","U+02292" 
+"SquareUnion;","U+02294" +"Sscr;","U+1D4AE" +"Star;","U+022C6" +"Sub;","U+022D0" +"Subset;","U+022D0" +"SubsetEqual;","U+02286" +"Succeeds;","U+0227B" +"SucceedsEqual;","U+02AB0" +"SucceedsSlantEqual;","U+0227D" +"SucceedsTilde;","U+0227F" +"SuchThat;","U+0220B" +"Sum;","U+02211" +"Sup;","U+022D1" +"Superset;","U+02283" +"SupersetEqual;","U+02287" +"Supset;","U+022D1" +"THORN;","U+000DE" +"THORN","U+000DE" +"TRADE;","U+02122" +"TSHcy;","U+0040B" +"TScy;","U+00426" +"Tab;","U+00009" +"Tau;","U+003A4" +"Tcaron;","U+00164" +"Tcedil;","U+00162" +"Tcy;","U+00422" +"Tfr;","U+1D517" +"Therefore;","U+02234" +"Theta;","U+00398" +"ThinSpace;","U+02009" +"Tilde;","U+0223C" +"TildeEqual;","U+02243" +"TildeFullEqual;","U+02245" +"TildeTilde;","U+02248" +"Topf;","U+1D54B" +"TripleDot;","U+020DB" +"Tscr;","U+1D4AF" +"Tstrok;","U+00166" +"Uacute;","U+000DA" +"Uacute","U+000DA" +"Uarr;","U+0219F" +"Uarrocir;","U+02949" +"Ubrcy;","U+0040E" +"Ubreve;","U+0016C" +"Ucirc;","U+000DB" +"Ucirc","U+000DB" +"Ucy;","U+00423" +"Udblac;","U+00170" +"Ufr;","U+1D518" +"Ugrave;","U+000D9" +"Ugrave","U+000D9" +"Umacr;","U+0016A" +"UnderBar;","U+0005F" +"UnderBrace;","U+023DF" +"UnderBracket;","U+023B5" +"UnderParenthesis;","U+023DD" +"Union;","U+022C3" +"UnionPlus;","U+0228E" +"Uogon;","U+00172" +"Uopf;","U+1D54C" +"UpArrow;","U+02191" +"UpArrowBar;","U+02912" +"UpArrowDownArrow;","U+021C5" +"UpDownArrow;","U+02195" +"UpEquilibrium;","U+0296E" +"UpTee;","U+022A5" +"UpTeeArrow;","U+021A5" +"Uparrow;","U+021D1" +"Updownarrow;","U+021D5" +"UpperLeftArrow;","U+02196" +"UpperRightArrow;","U+02197" +"Upsi;","U+003D2" +"Upsilon;","U+003A5" +"Uring;","U+0016E" +"Uscr;","U+1D4B0" +"Utilde;","U+00168" +"Uuml;","U+000DC" +"Uuml","U+000DC" +"VDash;","U+022AB" +"Vbar;","U+02AEB" +"Vcy;","U+00412" +"Vdash;","U+022A9" +"Vdashl;","U+02AE6" +"Vee;","U+022C1" +"Verbar;","U+02016" +"Vert;","U+02016" +"VerticalBar;","U+02223" +"VerticalLine;","U+0007C" +"VerticalSeparator;","U+02758" +"VerticalTilde;","U+02240" 
+"VeryThinSpace;","U+0200A" +"Vfr;","U+1D519" +"Vopf;","U+1D54D" +"Vscr;","U+1D4B1" +"Vvdash;","U+022AA" +"Wcirc;","U+00174" +"Wedge;","U+022C0" +"Wfr;","U+1D51A" +"Wopf;","U+1D54E" +"Wscr;","U+1D4B2" +"Xfr;","U+1D51B" +"Xi;","U+0039E" +"Xopf;","U+1D54F" +"Xscr;","U+1D4B3" +"YAcy;","U+0042F" +"YIcy;","U+00407" +"YUcy;","U+0042E" +"Yacute;","U+000DD" +"Yacute","U+000DD" +"Ycirc;","U+00176" +"Ycy;","U+0042B" +"Yfr;","U+1D51C" +"Yopf;","U+1D550" +"Yscr;","U+1D4B4" +"Yuml;","U+00178" +"ZHcy;","U+00416" +"Zacute;","U+00179" +"Zcaron;","U+0017D" +"Zcy;","U+00417" +"Zdot;","U+0017B" +"ZeroWidthSpace;","U+0200B" +"Zeta;","U+00396" +"Zfr;","U+02128" +"Zopf;","U+02124" +"Zscr;","U+1D4B5" +"aacute;","U+000E1" +"aacute","U+000E1" +"abreve;","U+00103" +"ac;","U+0223E" +"acd;","U+0223F" +"acirc;","U+000E2" +"acirc","U+000E2" +"acute;","U+000B4" +"acute","U+000B4" +"acy;","U+00430" +"aelig;","U+000E6" +"aelig","U+000E6" +"af;","U+02061" +"afr;","U+1D51E" +"agrave;","U+000E0" +"agrave","U+000E0" +"alefsym;","U+02135" +"aleph;","U+02135" +"alpha;","U+003B1" +"amacr;","U+00101" +"amalg;","U+02A3F" +"amp;","U+00026" +"amp","U+00026" +"and;","U+02227" +"andand;","U+02A55" +"andd;","U+02A5C" +"andslope;","U+02A58" +"andv;","U+02A5A" +"ang;","U+02220" +"ange;","U+029A4" +"angle;","U+02220" +"angmsd;","U+02221" +"angmsdaa;","U+029A8" +"angmsdab;","U+029A9" +"angmsdac;","U+029AA" +"angmsdad;","U+029AB" +"angmsdae;","U+029AC" +"angmsdaf;","U+029AD" +"angmsdag;","U+029AE" +"angmsdah;","U+029AF" +"angrt;","U+0221F" +"angrtvb;","U+022BE" +"angrtvbd;","U+0299D" +"angsph;","U+02222" +"angst;","U+000C5" +"angzarr;","U+0237C" +"aogon;","U+00105" +"aopf;","U+1D552" +"ap;","U+02248" +"apE;","U+02A70" +"apacir;","U+02A6F" +"ape;","U+0224A" +"apid;","U+0224B" +"apos;","U+00027" +"approx;","U+02248" +"approxeq;","U+0224A" +"aring;","U+000E5" +"aring","U+000E5" +"ascr;","U+1D4B6" +"ast;","U+0002A" +"asymp;","U+02248" +"asympeq;","U+0224D" +"atilde;","U+000E3" +"atilde","U+000E3" +"auml;","U+000E4" 
+"auml","U+000E4" +"awconint;","U+02233" +"awint;","U+02A11" +"bNot;","U+02AED" +"backcong;","U+0224C" +"backepsilon;","U+003F6" +"backprime;","U+02035" +"backsim;","U+0223D" +"backsimeq;","U+022CD" +"barvee;","U+022BD" +"barwed;","U+02305" +"barwedge;","U+02305" +"bbrk;","U+023B5" +"bbrktbrk;","U+023B6" +"bcong;","U+0224C" +"bcy;","U+00431" +"bdquo;","U+0201E" +"becaus;","U+02235" +"because;","U+02235" +"bemptyv;","U+029B0" +"bepsi;","U+003F6" +"bernou;","U+0212C" +"beta;","U+003B2" +"beth;","U+02136" +"between;","U+0226C" +"bfr;","U+1D51F" +"bigcap;","U+022C2" +"bigcirc;","U+025EF" +"bigcup;","U+022C3" +"bigodot;","U+02A00" +"bigoplus;","U+02A01" +"bigotimes;","U+02A02" +"bigsqcup;","U+02A06" +"bigstar;","U+02605" +"bigtriangledown;","U+025BD" +"bigtriangleup;","U+025B3" +"biguplus;","U+02A04" +"bigvee;","U+022C1" +"bigwedge;","U+022C0" +"bkarow;","U+0290D" +"blacklozenge;","U+029EB" +"blacksquare;","U+025AA" +"blacktriangle;","U+025B4" +"blacktriangledown;","U+025BE" +"blacktriangleleft;","U+025C2" +"blacktriangleright;","U+025B8" +"blank;","U+02423" +"blk12;","U+02592" +"blk14;","U+02591" +"blk34;","U+02593" +"block;","U+02588" +"bnot;","U+02310" +"bopf;","U+1D553" +"bot;","U+022A5" +"bottom;","U+022A5" +"bowtie;","U+022C8" +"boxDL;","U+02557" +"boxDR;","U+02554" +"boxDl;","U+02556" +"boxDr;","U+02553" +"boxH;","U+02550" +"boxHD;","U+02566" +"boxHU;","U+02569" +"boxHd;","U+02564" +"boxHu;","U+02567" +"boxUL;","U+0255D" +"boxUR;","U+0255A" +"boxUl;","U+0255C" +"boxUr;","U+02559" +"boxV;","U+02551" +"boxVH;","U+0256C" +"boxVL;","U+02563" +"boxVR;","U+02560" +"boxVh;","U+0256B" +"boxVl;","U+02562" +"boxVr;","U+0255F" +"boxbox;","U+029C9" +"boxdL;","U+02555" +"boxdR;","U+02552" +"boxdl;","U+02510" +"boxdr;","U+0250C" +"boxh;","U+02500" +"boxhD;","U+02565" +"boxhU;","U+02568" +"boxhd;","U+0252C" +"boxhu;","U+02534" +"boxminus;","U+0229F" +"boxplus;","U+0229E" +"boxtimes;","U+022A0" +"boxuL;","U+0255B" +"boxuR;","U+02558" +"boxul;","U+02518" +"boxur;","U+02514" 
+"boxv;","U+02502" +"boxvH;","U+0256A" +"boxvL;","U+02561" +"boxvR;","U+0255E" +"boxvh;","U+0253C" +"boxvl;","U+02524" +"boxvr;","U+0251C" +"bprime;","U+02035" +"breve;","U+002D8" +"brvbar;","U+000A6" +"brvbar","U+000A6" +"bscr;","U+1D4B7" +"bsemi;","U+0204F" +"bsim;","U+0223D" +"bsime;","U+022CD" +"bsol;","U+0005C" +"bsolb;","U+029C5" +"bsolhsub;","U+027C8" +"bull;","U+02022" +"bullet;","U+02022" +"bump;","U+0224E" +"bumpE;","U+02AAE" +"bumpe;","U+0224F" +"bumpeq;","U+0224F" +"cacute;","U+00107" +"cap;","U+02229" +"capand;","U+02A44" +"capbrcup;","U+02A49" +"capcap;","U+02A4B" +"capcup;","U+02A47" +"capdot;","U+02A40" +"caret;","U+02041" +"caron;","U+002C7" +"ccaps;","U+02A4D" +"ccaron;","U+0010D" +"ccedil;","U+000E7" +"ccedil","U+000E7" +"ccirc;","U+00109" +"ccups;","U+02A4C" +"ccupssm;","U+02A50" +"cdot;","U+0010B" +"cedil;","U+000B8" +"cedil","U+000B8" +"cemptyv;","U+029B2" +"cent;","U+000A2" +"cent","U+000A2" +"centerdot;","U+000B7" +"cfr;","U+1D520" +"chcy;","U+00447" +"check;","U+02713" +"checkmark;","U+02713" +"chi;","U+003C7" +"cir;","U+025CB" +"cirE;","U+029C3" +"circ;","U+002C6" +"circeq;","U+02257" +"circlearrowleft;","U+021BA" +"circlearrowright;","U+021BB" +"circledR;","U+000AE" +"circledS;","U+024C8" +"circledast;","U+0229B" +"circledcirc;","U+0229A" +"circleddash;","U+0229D" +"cire;","U+02257" +"cirfnint;","U+02A10" +"cirmid;","U+02AEF" +"cirscir;","U+029C2" +"clubs;","U+02663" +"clubsuit;","U+02663" +"colon;","U+0003A" +"colone;","U+02254" +"coloneq;","U+02254" +"comma;","U+0002C" +"commat;","U+00040" +"comp;","U+02201" +"compfn;","U+02218" +"complement;","U+02201" +"complexes;","U+02102" +"cong;","U+02245" +"congdot;","U+02A6D" +"conint;","U+0222E" +"copf;","U+1D554" +"coprod;","U+02210" +"copy;","U+000A9" +"copy","U+000A9" +"copysr;","U+02117" +"crarr;","U+021B5" +"cross;","U+02717" +"cscr;","U+1D4B8" +"csub;","U+02ACF" +"csube;","U+02AD1" +"csup;","U+02AD0" +"csupe;","U+02AD2" +"ctdot;","U+022EF" +"cudarrl;","U+02938" +"cudarrr;","U+02935" 
+"cuepr;","U+022DE" +"cuesc;","U+022DF" +"cularr;","U+021B6" +"cularrp;","U+0293D" +"cup;","U+0222A" +"cupbrcap;","U+02A48" +"cupcap;","U+02A46" +"cupcup;","U+02A4A" +"cupdot;","U+0228D" +"cupor;","U+02A45" +"curarr;","U+021B7" +"curarrm;","U+0293C" +"curlyeqprec;","U+022DE" +"curlyeqsucc;","U+022DF" +"curlyvee;","U+022CE" +"curlywedge;","U+022CF" +"curren;","U+000A4" +"curren","U+000A4" +"curvearrowleft;","U+021B6" +"curvearrowright;","U+021B7" +"cuvee;","U+022CE" +"cuwed;","U+022CF" +"cwconint;","U+02232" +"cwint;","U+02231" +"cylcty;","U+0232D" +"dArr;","U+021D3" +"dHar;","U+02965" +"dagger;","U+02020" +"daleth;","U+02138" +"darr;","U+02193" +"dash;","U+02010" +"dashv;","U+022A3" +"dbkarow;","U+0290F" +"dblac;","U+002DD" +"dcaron;","U+0010F" +"dcy;","U+00434" +"dd;","U+02146" +"ddagger;","U+02021" +"ddarr;","U+021CA" +"ddotseq;","U+02A77" +"deg;","U+000B0" +"deg","U+000B0" +"delta;","U+003B4" +"demptyv;","U+029B1" +"dfisht;","U+0297F" +"dfr;","U+1D521" +"dharl;","U+021C3" +"dharr;","U+021C2" +"diam;","U+022C4" +"diamond;","U+022C4" +"diamondsuit;","U+02666" +"diams;","U+02666" +"die;","U+000A8" +"digamma;","U+003DD" +"disin;","U+022F2" +"div;","U+000F7" +"divide;","U+000F7" +"divide","U+000F7" +"divideontimes;","U+022C7" +"divonx;","U+022C7" +"djcy;","U+00452" +"dlcorn;","U+0231E" +"dlcrop;","U+0230D" +"dollar;","U+00024" +"dopf;","U+1D555" +"dot;","U+002D9" +"doteq;","U+02250" +"doteqdot;","U+02251" +"dotminus;","U+02238" +"dotplus;","U+02214" +"dotsquare;","U+022A1" +"doublebarwedge;","U+02306" +"downarrow;","U+02193" +"downdownarrows;","U+021CA" +"downharpoonleft;","U+021C3" +"downharpoonright;","U+021C2" +"drbkarow;","U+02910" +"drcorn;","U+0231F" +"drcrop;","U+0230C" +"dscr;","U+1D4B9" +"dscy;","U+00455" +"dsol;","U+029F6" +"dstrok;","U+00111" +"dtdot;","U+022F1" +"dtri;","U+025BF" +"dtrif;","U+025BE" +"duarr;","U+021F5" +"duhar;","U+0296F" +"dwangle;","U+029A6" +"dzcy;","U+0045F" +"dzigrarr;","U+027FF" +"eDDot;","U+02A77" +"eDot;","U+02251" 
+"eacute;","U+000E9" +"eacute","U+000E9" +"easter;","U+02A6E" +"ecaron;","U+0011B" +"ecir;","U+02256" +"ecirc;","U+000EA" +"ecirc","U+000EA" +"ecolon;","U+02255" +"ecy;","U+0044D" +"edot;","U+00117" +"ee;","U+02147" +"efDot;","U+02252" +"efr;","U+1D522" +"eg;","U+02A9A" +"egrave;","U+000E8" +"egrave","U+000E8" +"egs;","U+02A96" +"egsdot;","U+02A98" +"el;","U+02A99" +"elinters;","U+023E7" +"ell;","U+02113" +"els;","U+02A95" +"elsdot;","U+02A97" +"emacr;","U+00113" +"empty;","U+02205" +"emptyset;","U+02205" +"emptyv;","U+02205" +"emsp13;","U+02004" +"emsp14;","U+02005" +"emsp;","U+02003" +"eng;","U+0014B" +"ensp;","U+02002" +"eogon;","U+00119" +"eopf;","U+1D556" +"epar;","U+022D5" +"eparsl;","U+029E3" +"eplus;","U+02A71" +"epsi;","U+003B5" +"epsilon;","U+003B5" +"epsiv;","U+003F5" +"eqcirc;","U+02256" +"eqcolon;","U+02255" +"eqsim;","U+02242" +"eqslantgtr;","U+02A96" +"eqslantless;","U+02A95" +"equals;","U+0003D" +"equest;","U+0225F" +"equiv;","U+02261" +"equivDD;","U+02A78" +"eqvparsl;","U+029E5" +"erDot;","U+02253" +"erarr;","U+02971" +"escr;","U+0212F" +"esdot;","U+02250" +"esim;","U+02242" +"eta;","U+003B7" +"eth;","U+000F0" +"eth","U+000F0" +"euml;","U+000EB" +"euml","U+000EB" +"euro;","U+020AC" +"excl;","U+00021" +"exist;","U+02203" +"expectation;","U+02130" +"exponentiale;","U+02147" +"fallingdotseq;","U+02252" +"fcy;","U+00444" +"female;","U+02640" +"ffilig;","U+0FB03" +"fflig;","U+0FB00" +"ffllig;","U+0FB04" +"ffr;","U+1D523" +"filig;","U+0FB01" +"flat;","U+0266D" +"fllig;","U+0FB02" +"fltns;","U+025B1" +"fnof;","U+00192" +"fopf;","U+1D557" +"forall;","U+02200" +"fork;","U+022D4" +"forkv;","U+02AD9" +"fpartint;","U+02A0D" +"frac12;","U+000BD" +"frac12","U+000BD" +"frac13;","U+02153" +"frac14;","U+000BC" +"frac14","U+000BC" +"frac15;","U+02155" +"frac16;","U+02159" +"frac18;","U+0215B" +"frac23;","U+02154" +"frac25;","U+02156" +"frac34;","U+000BE" +"frac34","U+000BE" +"frac35;","U+02157" +"frac38;","U+0215C" +"frac45;","U+02158" +"frac56;","U+0215A" 
+"frac58;","U+0215D" +"frac78;","U+0215E" +"frasl;","U+02044" +"frown;","U+02322" +"fscr;","U+1D4BB" +"gE;","U+02267" +"gEl;","U+02A8C" +"gacute;","U+001F5" +"gamma;","U+003B3" +"gammad;","U+003DD" +"gap;","U+02A86" +"gbreve;","U+0011F" +"gcirc;","U+0011D" +"gcy;","U+00433" +"gdot;","U+00121" +"ge;","U+02265" +"gel;","U+022DB" +"geq;","U+02265" +"geqq;","U+02267" +"geqslant;","U+02A7E" +"ges;","U+02A7E" +"gescc;","U+02AA9" +"gesdot;","U+02A80" +"gesdoto;","U+02A82" +"gesdotol;","U+02A84" +"gesles;","U+02A94" +"gfr;","U+1D524" +"gg;","U+0226B" +"ggg;","U+022D9" +"gimel;","U+02137" +"gjcy;","U+00453" +"gl;","U+02277" +"glE;","U+02A92" +"gla;","U+02AA5" +"glj;","U+02AA4" +"gnE;","U+02269" +"gnap;","U+02A8A" +"gnapprox;","U+02A8A" +"gne;","U+02A88" +"gneq;","U+02A88" +"gneqq;","U+02269" +"gnsim;","U+022E7" +"gopf;","U+1D558" +"grave;","U+00060" +"gscr;","U+0210A" +"gsim;","U+02273" +"gsime;","U+02A8E" +"gsiml;","U+02A90" +"gt;","U+0003E" +"gt","U+0003E" +"gtcc;","U+02AA7" +"gtcir;","U+02A7A" +"gtdot;","U+022D7" +"gtlPar;","U+02995" +"gtquest;","U+02A7C" +"gtrapprox;","U+02A86" +"gtrarr;","U+02978" +"gtrdot;","U+022D7" +"gtreqless;","U+022DB" +"gtreqqless;","U+02A8C" +"gtrless;","U+02277" +"gtrsim;","U+02273" +"hArr;","U+021D4" +"hairsp;","U+0200A" +"half;","U+000BD" +"hamilt;","U+0210B" +"hardcy;","U+0044A" +"harr;","U+02194" +"harrcir;","U+02948" +"harrw;","U+021AD" +"hbar;","U+0210F" +"hcirc;","U+00125" +"hearts;","U+02665" +"heartsuit;","U+02665" +"hellip;","U+02026" +"hercon;","U+022B9" +"hfr;","U+1D525" +"hksearow;","U+02925" +"hkswarow;","U+02926" +"hoarr;","U+021FF" +"homtht;","U+0223B" +"hookleftarrow;","U+021A9" +"hookrightarrow;","U+021AA" +"hopf;","U+1D559" +"horbar;","U+02015" +"hscr;","U+1D4BD" +"hslash;","U+0210F" +"hstrok;","U+00127" +"hybull;","U+02043" +"hyphen;","U+02010" +"iacute;","U+000ED" +"iacute","U+000ED" +"ic;","U+02063" +"icirc;","U+000EE" +"icirc","U+000EE" +"icy;","U+00438" +"iecy;","U+00435" +"iexcl;","U+000A1" +"iexcl","U+000A1" 
+"iff;","U+021D4" +"ifr;","U+1D526" +"igrave;","U+000EC" +"igrave","U+000EC" +"ii;","U+02148" +"iiiint;","U+02A0C" +"iiint;","U+0222D" +"iinfin;","U+029DC" +"iiota;","U+02129" +"ijlig;","U+00133" +"imacr;","U+0012B" +"image;","U+02111" +"imagline;","U+02110" +"imagpart;","U+02111" +"imath;","U+00131" +"imof;","U+022B7" +"imped;","U+001B5" +"in;","U+02208" +"incare;","U+02105" +"infin;","U+0221E" +"infintie;","U+029DD" +"inodot;","U+00131" +"int;","U+0222B" +"intcal;","U+022BA" +"integers;","U+02124" +"intercal;","U+022BA" +"intlarhk;","U+02A17" +"intprod;","U+02A3C" +"iocy;","U+00451" +"iogon;","U+0012F" +"iopf;","U+1D55A" +"iota;","U+003B9" +"iprod;","U+02A3C" +"iquest;","U+000BF" +"iquest","U+000BF" +"iscr;","U+1D4BE" +"isin;","U+02208" +"isinE;","U+022F9" +"isindot;","U+022F5" +"isins;","U+022F4" +"isinsv;","U+022F3" +"isinv;","U+02208" +"it;","U+02062" +"itilde;","U+00129" +"iukcy;","U+00456" +"iuml;","U+000EF" +"iuml","U+000EF" +"jcirc;","U+00135" +"jcy;","U+00439" +"jfr;","U+1D527" +"jmath;","U+00237" +"jopf;","U+1D55B" +"jscr;","U+1D4BF" +"jsercy;","U+00458" +"jukcy;","U+00454" +"kappa;","U+003BA" +"kappav;","U+003F0" +"kcedil;","U+00137" +"kcy;","U+0043A" +"kfr;","U+1D528" +"kgreen;","U+00138" +"khcy;","U+00445" +"kjcy;","U+0045C" +"kopf;","U+1D55C" +"kscr;","U+1D4C0" +"lAarr;","U+021DA" +"lArr;","U+021D0" +"lAtail;","U+0291B" +"lBarr;","U+0290E" +"lE;","U+02266" +"lEg;","U+02A8B" +"lHar;","U+02962" +"lacute;","U+0013A" +"laemptyv;","U+029B4" +"lagran;","U+02112" +"lambda;","U+003BB" +"lang;","U+027E8" +"langd;","U+02991" +"langle;","U+027E8" +"lap;","U+02A85" +"laquo;","U+000AB" +"laquo","U+000AB" +"larr;","U+02190" +"larrb;","U+021E4" +"larrbfs;","U+0291F" +"larrfs;","U+0291D" +"larrhk;","U+021A9" +"larrlp;","U+021AB" +"larrpl;","U+02939" +"larrsim;","U+02973" +"larrtl;","U+021A2" +"lat;","U+02AAB" +"latail;","U+02919" +"late;","U+02AAD" +"lbarr;","U+0290C" +"lbbrk;","U+02772" +"lbrace;","U+0007B" +"lbrack;","U+0005B" +"lbrke;","U+0298B" 
+"lbrksld;","U+0298F" +"lbrkslu;","U+0298D" +"lcaron;","U+0013E" +"lcedil;","U+0013C" +"lceil;","U+02308" +"lcub;","U+0007B" +"lcy;","U+0043B" +"ldca;","U+02936" +"ldquo;","U+0201C" +"ldquor;","U+0201E" +"ldrdhar;","U+02967" +"ldrushar;","U+0294B" +"ldsh;","U+021B2" +"le;","U+02264" +"leftarrow;","U+02190" +"leftarrowtail;","U+021A2" +"leftharpoondown;","U+021BD" +"leftharpoonup;","U+021BC" +"leftleftarrows;","U+021C7" +"leftrightarrow;","U+02194" +"leftrightarrows;","U+021C6" +"leftrightharpoons;","U+021CB" +"leftrightsquigarrow;","U+021AD" +"leftthreetimes;","U+022CB" +"leg;","U+022DA" +"leq;","U+02264" +"leqq;","U+02266" +"leqslant;","U+02A7D" +"les;","U+02A7D" +"lescc;","U+02AA8" +"lesdot;","U+02A7F" +"lesdoto;","U+02A81" +"lesdotor;","U+02A83" +"lesges;","U+02A93" +"lessapprox;","U+02A85" +"lessdot;","U+022D6" +"lesseqgtr;","U+022DA" +"lesseqqgtr;","U+02A8B" +"lessgtr;","U+02276" +"lesssim;","U+02272" +"lfisht;","U+0297C" +"lfloor;","U+0230A" +"lfr;","U+1D529" +"lg;","U+02276" +"lgE;","U+02A91" +"lhard;","U+021BD" +"lharu;","U+021BC" +"lharul;","U+0296A" +"lhblk;","U+02584" +"ljcy;","U+00459" +"ll;","U+0226A" +"llarr;","U+021C7" +"llcorner;","U+0231E" +"llhard;","U+0296B" +"lltri;","U+025FA" +"lmidot;","U+00140" +"lmoust;","U+023B0" +"lmoustache;","U+023B0" +"lnE;","U+02268" +"lnap;","U+02A89" +"lnapprox;","U+02A89" +"lne;","U+02A87" +"lneq;","U+02A87" +"lneqq;","U+02268" +"lnsim;","U+022E6" +"loang;","U+027EC" +"loarr;","U+021FD" +"lobrk;","U+027E6" +"longleftarrow;","U+027F5" +"longleftrightarrow;","U+027F7" +"longmapsto;","U+027FC" +"longrightarrow;","U+027F6" +"looparrowleft;","U+021AB" +"looparrowright;","U+021AC" +"lopar;","U+02985" +"lopf;","U+1D55D" +"loplus;","U+02A2D" +"lotimes;","U+02A34" +"lowast;","U+02217" +"lowbar;","U+0005F" +"loz;","U+025CA" +"lozenge;","U+025CA" +"lozf;","U+029EB" +"lpar;","U+00028" +"lparlt;","U+02993" +"lrarr;","U+021C6" +"lrcorner;","U+0231F" +"lrhar;","U+021CB" +"lrhard;","U+0296D" +"lrm;","U+0200E" +"lrtri;","U+022BF" 
+"lsaquo;","U+02039" +"lscr;","U+1D4C1" +"lsh;","U+021B0" +"lsim;","U+02272" +"lsime;","U+02A8D" +"lsimg;","U+02A8F" +"lsqb;","U+0005B" +"lsquo;","U+02018" +"lsquor;","U+0201A" +"lstrok;","U+00142" +"lt;","U+0003C" +"lt","U+0003C" +"ltcc;","U+02AA6" +"ltcir;","U+02A79" +"ltdot;","U+022D6" +"lthree;","U+022CB" +"ltimes;","U+022C9" +"ltlarr;","U+02976" +"ltquest;","U+02A7B" +"ltrPar;","U+02996" +"ltri;","U+025C3" +"ltrie;","U+022B4" +"ltrif;","U+025C2" +"lurdshar;","U+0294A" +"luruhar;","U+02966" +"mDDot;","U+0223A" +"macr;","U+000AF" +"macr","U+000AF" +"male;","U+02642" +"malt;","U+02720" +"maltese;","U+02720" +"map;","U+021A6" +"mapsto;","U+021A6" +"mapstodown;","U+021A7" +"mapstoleft;","U+021A4" +"mapstoup;","U+021A5" +"marker;","U+025AE" +"mcomma;","U+02A29" +"mcy;","U+0043C" +"mdash;","U+02014" +"measuredangle;","U+02221" +"mfr;","U+1D52A" +"mho;","U+02127" +"micro;","U+000B5" +"micro","U+000B5" +"mid;","U+02223" +"midast;","U+0002A" +"midcir;","U+02AF0" +"middot;","U+000B7" +"middot","U+000B7" +"minus;","U+02212" +"minusb;","U+0229F" +"minusd;","U+02238" +"minusdu;","U+02A2A" +"mlcp;","U+02ADB" +"mldr;","U+02026" +"mnplus;","U+02213" +"models;","U+022A7" +"mopf;","U+1D55E" +"mp;","U+02213" +"mscr;","U+1D4C2" +"mstpos;","U+0223E" +"mu;","U+003BC" +"multimap;","U+022B8" +"mumap;","U+022B8" +"nLeftarrow;","U+021CD" +"nLeftrightarrow;","U+021CE" +"nRightarrow;","U+021CF" +"nVDash;","U+022AF" +"nVdash;","U+022AE" +"nabla;","U+02207" +"nacute;","U+00144" +"nap;","U+02249" +"napos;","U+00149" +"napprox;","U+02249" +"natur;","U+0266E" +"natural;","U+0266E" +"naturals;","U+02115" +"nbsp;","U+000A0" +"nbsp","U+000A0" +"ncap;","U+02A43" +"ncaron;","U+00148" +"ncedil;","U+00146" +"ncong;","U+02247" +"ncup;","U+02A42" +"ncy;","U+0043D" +"ndash;","U+02013" +"ne;","U+02260" +"neArr;","U+021D7" +"nearhk;","U+02924" +"nearr;","U+02197" +"nearrow;","U+02197" +"nequiv;","U+02262" +"nesear;","U+02928" +"nexist;","U+02204" +"nexists;","U+02204" +"nfr;","U+1D52B" +"nge;","U+02271" 
+"ngeq;","U+02271" +"ngsim;","U+02275" +"ngt;","U+0226F" +"ngtr;","U+0226F" +"nhArr;","U+021CE" +"nharr;","U+021AE" +"nhpar;","U+02AF2" +"ni;","U+0220B" +"nis;","U+022FC" +"nisd;","U+022FA" +"niv;","U+0220B" +"njcy;","U+0045A" +"nlArr;","U+021CD" +"nlarr;","U+0219A" +"nldr;","U+02025" +"nle;","U+02270" +"nleftarrow;","U+0219A" +"nleftrightarrow;","U+021AE" +"nleq;","U+02270" +"nless;","U+0226E" +"nlsim;","U+02274" +"nlt;","U+0226E" +"nltri;","U+022EA" +"nltrie;","U+022EC" +"nmid;","U+02224" +"nopf;","U+1D55F" +"not;","U+000AC" +"not","U+000AC" +"notin;","U+02209" +"notinva;","U+02209" +"notinvb;","U+022F7" +"notinvc;","U+022F6" +"notni;","U+0220C" +"notniva;","U+0220C" +"notnivb;","U+022FE" +"notnivc;","U+022FD" +"npar;","U+02226" +"nparallel;","U+02226" +"npolint;","U+02A14" +"npr;","U+02280" +"nprcue;","U+022E0" +"nprec;","U+02280" +"nrArr;","U+021CF" +"nrarr;","U+0219B" +"nrightarrow;","U+0219B" +"nrtri;","U+022EB" +"nrtrie;","U+022ED" +"nsc;","U+02281" +"nsccue;","U+022E1" +"nscr;","U+1D4C3" +"nshortmid;","U+02224" +"nshortparallel;","U+02226" +"nsim;","U+02241" +"nsime;","U+02244" +"nsimeq;","U+02244" +"nsmid;","U+02224" +"nspar;","U+02226" +"nsqsube;","U+022E2" +"nsqsupe;","U+022E3" +"nsub;","U+02284" +"nsube;","U+02288" +"nsubseteq;","U+02288" +"nsucc;","U+02281" +"nsup;","U+02285" +"nsupe;","U+02289" +"nsupseteq;","U+02289" +"ntgl;","U+02279" +"ntilde;","U+000F1" +"ntilde","U+000F1" +"ntlg;","U+02278" +"ntriangleleft;","U+022EA" +"ntrianglelefteq;","U+022EC" +"ntriangleright;","U+022EB" +"ntrianglerighteq;","U+022ED" +"nu;","U+003BD" +"num;","U+00023" +"numero;","U+02116" +"numsp;","U+02007" +"nvDash;","U+022AD" +"nvHarr;","U+02904" +"nvdash;","U+022AC" +"nvinfin;","U+029DE" +"nvlArr;","U+02902" +"nvrArr;","U+02903" +"nwArr;","U+021D6" +"nwarhk;","U+02923" +"nwarr;","U+02196" +"nwarrow;","U+02196" +"nwnear;","U+02927" +"oS;","U+024C8" +"oacute;","U+000F3" +"oacute","U+000F3" +"oast;","U+0229B" +"ocir;","U+0229A" +"ocirc;","U+000F4" +"ocirc","U+000F4" 
+"ocy;","U+0043E" +"odash;","U+0229D" +"odblac;","U+00151" +"odiv;","U+02A38" +"odot;","U+02299" +"odsold;","U+029BC" +"oelig;","U+00153" +"ofcir;","U+029BF" +"ofr;","U+1D52C" +"ogon;","U+002DB" +"ograve;","U+000F2" +"ograve","U+000F2" +"ogt;","U+029C1" +"ohbar;","U+029B5" +"ohm;","U+003A9" +"oint;","U+0222E" +"olarr;","U+021BA" +"olcir;","U+029BE" +"olcross;","U+029BB" +"oline;","U+0203E" +"olt;","U+029C0" +"omacr;","U+0014D" +"omega;","U+003C9" +"omicron;","U+003BF" +"omid;","U+029B6" +"ominus;","U+02296" +"oopf;","U+1D560" +"opar;","U+029B7" +"operp;","U+029B9" +"oplus;","U+02295" +"or;","U+02228" +"orarr;","U+021BB" +"ord;","U+02A5D" +"order;","U+02134" +"orderof;","U+02134" +"ordf;","U+000AA" +"ordf","U+000AA" +"ordm;","U+000BA" +"ordm","U+000BA" +"origof;","U+022B6" +"oror;","U+02A56" +"orslope;","U+02A57" +"orv;","U+02A5B" +"oscr;","U+02134" +"oslash;","U+000F8" +"oslash","U+000F8" +"osol;","U+02298" +"otilde;","U+000F5" +"otilde","U+000F5" +"otimes;","U+02297" +"otimesas;","U+02A36" +"ouml;","U+000F6" +"ouml","U+000F6" +"ovbar;","U+0233D" +"par;","U+02225" +"para;","U+000B6" +"para","U+000B6" +"parallel;","U+02225" +"parsim;","U+02AF3" +"parsl;","U+02AFD" +"part;","U+02202" +"pcy;","U+0043F" +"percnt;","U+00025" +"period;","U+0002E" +"permil;","U+02030" +"perp;","U+022A5" +"pertenk;","U+02031" +"pfr;","U+1D52D" +"phi;","U+003C6" +"phiv;","U+003D5" +"phmmat;","U+02133" +"phone;","U+0260E" +"pi;","U+003C0" +"pitchfork;","U+022D4" +"piv;","U+003D6" +"planck;","U+0210F" +"planckh;","U+0210E" +"plankv;","U+0210F" +"plus;","U+0002B" +"plusacir;","U+02A23" +"plusb;","U+0229E" +"pluscir;","U+02A22" +"plusdo;","U+02214" +"plusdu;","U+02A25" +"pluse;","U+02A72" +"plusmn;","U+000B1" +"plusmn","U+000B1" +"plussim;","U+02A26" +"plustwo;","U+02A27" +"pm;","U+000B1" +"pointint;","U+02A15" +"popf;","U+1D561" +"pound;","U+000A3" +"pound","U+000A3" +"pr;","U+0227A" +"prE;","U+02AB3" +"prap;","U+02AB7" +"prcue;","U+0227C" +"pre;","U+02AAF" +"prec;","U+0227A" 
+"precapprox;","U+02AB7" +"preccurlyeq;","U+0227C" +"preceq;","U+02AAF" +"precnapprox;","U+02AB9" +"precneqq;","U+02AB5" +"precnsim;","U+022E8" +"precsim;","U+0227E" +"prime;","U+02032" +"primes;","U+02119" +"prnE;","U+02AB5" +"prnap;","U+02AB9" +"prnsim;","U+022E8" +"prod;","U+0220F" +"profalar;","U+0232E" +"profline;","U+02312" +"profsurf;","U+02313" +"prop;","U+0221D" +"propto;","U+0221D" +"prsim;","U+0227E" +"prurel;","U+022B0" +"pscr;","U+1D4C5" +"psi;","U+003C8" +"puncsp;","U+02008" +"qfr;","U+1D52E" +"qint;","U+02A0C" +"qopf;","U+1D562" +"qprime;","U+02057" +"qscr;","U+1D4C6" +"quaternions;","U+0210D" +"quatint;","U+02A16" +"quest;","U+0003F" +"questeq;","U+0225F" +"quot;","U+00022" +"quot","U+00022" +"rAarr;","U+021DB" +"rArr;","U+021D2" +"rAtail;","U+0291C" +"rBarr;","U+0290F" +"rHar;","U+02964" +"racute;","U+00155" +"radic;","U+0221A" +"raemptyv;","U+029B3" +"rang;","U+027E9" +"rangd;","U+02992" +"range;","U+029A5" +"rangle;","U+027E9" +"raquo;","U+000BB" +"raquo","U+000BB" +"rarr;","U+02192" +"rarrap;","U+02975" +"rarrb;","U+021E5" +"rarrbfs;","U+02920" +"rarrc;","U+02933" +"rarrfs;","U+0291E" +"rarrhk;","U+021AA" +"rarrlp;","U+021AC" +"rarrpl;","U+02945" +"rarrsim;","U+02974" +"rarrtl;","U+021A3" +"rarrw;","U+0219D" +"ratail;","U+0291A" +"ratio;","U+02236" +"rationals;","U+0211A" +"rbarr;","U+0290D" +"rbbrk;","U+02773" +"rbrace;","U+0007D" +"rbrack;","U+0005D" +"rbrke;","U+0298C" +"rbrksld;","U+0298E" +"rbrkslu;","U+02990" +"rcaron;","U+00159" +"rcedil;","U+00157" +"rceil;","U+02309" +"rcub;","U+0007D" +"rcy;","U+00440" +"rdca;","U+02937" +"rdldhar;","U+02969" +"rdquo;","U+0201D" +"rdquor;","U+0201D" +"rdsh;","U+021B3" +"real;","U+0211C" +"realine;","U+0211B" +"realpart;","U+0211C" +"reals;","U+0211D" +"rect;","U+025AD" +"reg;","U+000AE" +"reg","U+000AE" +"rfisht;","U+0297D" +"rfloor;","U+0230B" +"rfr;","U+1D52F" +"rhard;","U+021C1" +"rharu;","U+021C0" +"rharul;","U+0296C" +"rho;","U+003C1" +"rhov;","U+003F1" +"rightarrow;","U+02192" 
+"rightarrowtail;","U+021A3" +"rightharpoondown;","U+021C1" +"rightharpoonup;","U+021C0" +"rightleftarrows;","U+021C4" +"rightleftharpoons;","U+021CC" +"rightrightarrows;","U+021C9" +"rightsquigarrow;","U+0219D" +"rightthreetimes;","U+022CC" +"ring;","U+002DA" +"risingdotseq;","U+02253" +"rlarr;","U+021C4" +"rlhar;","U+021CC" +"rlm;","U+0200F" +"rmoust;","U+023B1" +"rmoustache;","U+023B1" +"rnmid;","U+02AEE" +"roang;","U+027ED" +"roarr;","U+021FE" +"robrk;","U+027E7" +"ropar;","U+02986" +"ropf;","U+1D563" +"roplus;","U+02A2E" +"rotimes;","U+02A35" +"rpar;","U+00029" +"rpargt;","U+02994" +"rppolint;","U+02A12" +"rrarr;","U+021C9" +"rsaquo;","U+0203A" +"rscr;","U+1D4C7" +"rsh;","U+021B1" +"rsqb;","U+0005D" +"rsquo;","U+02019" +"rsquor;","U+02019" +"rthree;","U+022CC" +"rtimes;","U+022CA" +"rtri;","U+025B9" +"rtrie;","U+022B5" +"rtrif;","U+025B8" +"rtriltri;","U+029CE" +"ruluhar;","U+02968" +"rx;","U+0211E" +"sacute;","U+0015B" +"sbquo;","U+0201A" +"sc;","U+0227B" +"scE;","U+02AB4" +"scap;","U+02AB8" +"scaron;","U+00161" +"sccue;","U+0227D" +"sce;","U+02AB0" +"scedil;","U+0015F" +"scirc;","U+0015D" +"scnE;","U+02AB6" +"scnap;","U+02ABA" +"scnsim;","U+022E9" +"scpolint;","U+02A13" +"scsim;","U+0227F" +"scy;","U+00441" +"sdot;","U+022C5" +"sdotb;","U+022A1" +"sdote;","U+02A66" +"seArr;","U+021D8" +"searhk;","U+02925" +"searr;","U+02198" +"searrow;","U+02198" +"sect;","U+000A7" +"sect","U+000A7" +"semi;","U+0003B" +"seswar;","U+02929" +"setminus;","U+02216" +"setmn;","U+02216" +"sext;","U+02736" +"sfr;","U+1D530" +"sfrown;","U+02322" +"sharp;","U+0266F" +"shchcy;","U+00449" +"shcy;","U+00448" +"shortmid;","U+02223" +"shortparallel;","U+02225" +"shy;","U+000AD " +"shy","U+000AD " +"sigma;","U+003C3" +"sigmaf;","U+003C2" +"sigmav;","U+003C2" +"sim;","U+0223C" +"simdot;","U+02A6A" +"sime;","U+02243" +"simeq;","U+02243" +"simg;","U+02A9E" +"simgE;","U+02AA0" +"siml;","U+02A9D" +"simlE;","U+02A9F" +"simne;","U+02246" +"simplus;","U+02A24" +"simrarr;","U+02972" 
+"slarr;","U+02190" +"smallsetminus;","U+02216" +"smashp;","U+02A33" +"smeparsl;","U+029E4" +"smid;","U+02223" +"smile;","U+02323" +"smt;","U+02AAA" +"smte;","U+02AAC" +"softcy;","U+0044C" +"sol;","U+0002F" +"solb;","U+029C4" +"solbar;","U+0233F" +"sopf;","U+1D564" +"spades;","U+02660" +"spadesuit;","U+02660" +"spar;","U+02225" +"sqcap;","U+02293" +"sqcup;","U+02294" +"sqsub;","U+0228F" +"sqsube;","U+02291" +"sqsubset;","U+0228F" +"sqsubseteq;","U+02291" +"sqsup;","U+02290" +"sqsupe;","U+02292" +"sqsupset;","U+02290" +"sqsupseteq;","U+02292" +"squ;","U+025A1" +"square;","U+025A1" +"squarf;","U+025AA" +"squf;","U+025AA" +"srarr;","U+02192" +"sscr;","U+1D4C8" +"ssetmn;","U+02216" +"ssmile;","U+02323" +"sstarf;","U+022C6" +"star;","U+02606" +"starf;","U+02605" +"straightepsilon;","U+003F5" +"straightphi;","U+003D5" +"strns;","U+000AF" +"sub;","U+02282" +"subE;","U+02AC5" +"subdot;","U+02ABD" +"sube;","U+02286" +"subedot;","U+02AC3" +"submult;","U+02AC1" +"subnE;","U+02ACB" +"subne;","U+0228A" +"subplus;","U+02ABF" +"subrarr;","U+02979" +"subset;","U+02282" +"subseteq;","U+02286" +"subseteqq;","U+02AC5" +"subsetneq;","U+0228A" +"subsetneqq;","U+02ACB" +"subsim;","U+02AC7" +"subsub;","U+02AD5" +"subsup;","U+02AD3" +"succ;","U+0227B" +"succapprox;","U+02AB8" +"succcurlyeq;","U+0227D" +"succeq;","U+02AB0" +"succnapprox;","U+02ABA" +"succneqq;","U+02AB6" +"succnsim;","U+022E9" +"succsim;","U+0227F" +"sum;","U+02211" +"sung;","U+0266A" +"sup1;","U+000B9" +"sup1","U+000B9" +"sup2;","U+000B2" +"sup2","U+000B2" +"sup3;","U+000B3" +"sup3","U+000B3" +"sup;","U+02283" +"supE;","U+02AC6" +"supdot;","U+02ABE" +"supdsub;","U+02AD8" +"supe;","U+02287" +"supedot;","U+02AC4" +"suphsol;","U+027C9" +"suphsub;","U+02AD7" +"suplarr;","U+0297B" +"supmult;","U+02AC2" +"supnE;","U+02ACC" +"supne;","U+0228B" +"supplus;","U+02AC0" +"supset;","U+02283" +"supseteq;","U+02287" +"supseteqq;","U+02AC6" +"supsetneq;","U+0228B" +"supsetneqq;","U+02ACC" +"supsim;","U+02AC8" +"supsub;","U+02AD4" 
+"supsup;","U+02AD6" +"swArr;","U+021D9" +"swarhk;","U+02926" +"swarr;","U+02199" +"swarrow;","U+02199" +"swnwar;","U+0292A" +"szlig;","U+000DF" +"szlig","U+000DF" +"target;","U+02316" +"tau;","U+003C4" +"tbrk;","U+023B4" +"tcaron;","U+00165" +"tcedil;","U+00163" +"tcy;","U+00442" +"tdot;","U+020DB" +"telrec;","U+02315" +"tfr;","U+1D531" +"there4;","U+02234" +"therefore;","U+02234" +"theta;","U+003B8" +"thetasym;","U+003D1" +"thetav;","U+003D1" +"thickapprox;","U+02248" +"thicksim;","U+0223C" +"thinsp;","U+02009" +"thkap;","U+02248" +"thksim;","U+0223C" +"thorn;","U+000FE" +"thorn","U+000FE" +"tilde;","U+002DC" +"times;","U+000D7" +"times","U+000D7" +"timesb;","U+022A0" +"timesbar;","U+02A31" +"timesd;","U+02A30" +"tint;","U+0222D" +"toea;","U+02928" +"top;","U+022A4" +"topbot;","U+02336" +"topcir;","U+02AF1" +"topf;","U+1D565" +"topfork;","U+02ADA" +"tosa;","U+02929" +"tprime;","U+02034" +"trade;","U+02122" +"triangle;","U+025B5" +"triangledown;","U+025BF" +"triangleleft;","U+025C3" +"trianglelefteq;","U+022B4" +"triangleq;","U+0225C" +"triangleright;","U+025B9" +"trianglerighteq;","U+022B5" +"tridot;","U+025EC" +"trie;","U+0225C" +"triminus;","U+02A3A" +"triplus;","U+02A39" +"trisb;","U+029CD" +"tritime;","U+02A3B" +"trpezium;","U+023E2" +"tscr;","U+1D4C9" +"tscy;","U+00446" +"tshcy;","U+0045B" +"tstrok;","U+00167" +"twixt;","U+0226C" +"twoheadleftarrow;","U+0219E" +"twoheadrightarrow;","U+021A0" +"uArr;","U+021D1" +"uHar;","U+02963" +"uacute;","U+000FA" +"uacute","U+000FA" +"uarr;","U+02191" +"ubrcy;","U+0045E" +"ubreve;","U+0016D" +"ucirc;","U+000FB" +"ucirc","U+000FB" +"ucy;","U+00443" +"udarr;","U+021C5" +"udblac;","U+00171" +"udhar;","U+0296E" +"ufisht;","U+0297E" +"ufr;","U+1D532" +"ugrave;","U+000F9" +"ugrave","U+000F9" +"uharl;","U+021BF" +"uharr;","U+021BE" +"uhblk;","U+02580" +"ulcorn;","U+0231C" +"ulcorner;","U+0231C" +"ulcrop;","U+0230F" +"ultri;","U+025F8" +"umacr;","U+0016B" +"uml;","U+000A8" +"uml","U+000A8" +"uogon;","U+00173" +"uopf;","U+1D566" 
+"uparrow;","U+02191" +"updownarrow;","U+02195" +"upharpoonleft;","U+021BF" +"upharpoonright;","U+021BE" +"uplus;","U+0228E" +"upsi;","U+003C5" +"upsih;","U+003D2" +"upsilon;","U+003C5" +"upuparrows;","U+021C8" +"urcorn;","U+0231D" +"urcorner;","U+0231D" +"urcrop;","U+0230E" +"uring;","U+0016F" +"urtri;","U+025F9" +"uscr;","U+1D4CA" +"utdot;","U+022F0" +"utilde;","U+00169" +"utri;","U+025B5" +"utrif;","U+025B4" +"uuarr;","U+021C8" +"uuml;","U+000FC" +"uuml","U+000FC" +"uwangle;","U+029A7" +"vArr;","U+021D5" +"vBar;","U+02AE8" +"vBarv;","U+02AE9" +"vDash;","U+022A8" +"vangrt;","U+0299C" +"varepsilon;","U+003F5" +"varkappa;","U+003F0" +"varnothing;","U+02205" +"varphi;","U+003D5" +"varpi;","U+003D6" +"varpropto;","U+0221D" +"varr;","U+02195" +"varrho;","U+003F1" +"varsigma;","U+003C2" +"vartheta;","U+003D1" +"vartriangleleft;","U+022B2" +"vartriangleright;","U+022B3" +"vcy;","U+00432" +"vdash;","U+022A2" +"vee;","U+02228" +"veebar;","U+022BB" +"veeeq;","U+0225A" +"vellip;","U+022EE" +"verbar;","U+0007C" +"vert;","U+0007C" +"vfr;","U+1D533" +"vltri;","U+022B2" +"vopf;","U+1D567" +"vprop;","U+0221D" +"vrtri;","U+022B3" +"vscr;","U+1D4CB" +"vzigzag;","U+0299A" +"wcirc;","U+00175" +"wedbar;","U+02A5F" +"wedge;","U+02227" +"wedgeq;","U+02259" +"weierp;","U+02118" +"wfr;","U+1D534" +"wopf;","U+1D568" +"wp;","U+02118" +"wr;","U+02240" +"wreath;","U+02240" +"wscr;","U+1D4CC" +"xcap;","U+022C2" +"xcirc;","U+025EF" +"xcup;","U+022C3" +"xdtri;","U+025BD" +"xfr;","U+1D535" +"xhArr;","U+027FA" +"xharr;","U+027F7" +"xi;","U+003BE" +"xlArr;","U+027F8" +"xlarr;","U+027F5" +"xmap;","U+027FC" +"xnis;","U+022FB" +"xodot;","U+02A00" +"xopf;","U+1D569" +"xoplus;","U+02A01" +"xotime;","U+02A02" +"xrArr;","U+027F9" +"xrarr;","U+027F6" +"xscr;","U+1D4CD" +"xsqcup;","U+02A06" +"xuplus;","U+02A04" +"xutri;","U+025B3" +"xvee;","U+022C1" +"xwedge;","U+022C0" +"yacute;","U+000FD" +"yacute","U+000FD" +"yacy;","U+0044F" +"ycirc;","U+00177" +"ycy;","U+0044B" +"yen;","U+000A5" +"yen","U+000A5" 
+"yfr;","U+1D536" +"yicy;","U+00457" +"yopf;","U+1D56A" +"yscr;","U+1D4CE" +"yucy;","U+0044E" +"yuml;","U+000FF" +"yuml","U+000FF" +"zacute;","U+0017A" +"zcaron;","U+0017E" +"zcy;","U+00437" +"zdot;","U+0017C" +"zeetrf;","U+02128" +"zeta;","U+003B6" +"zfr;","U+1D537" +"zhcy;","U+00436" +"zigrarr;","U+021DD" +"zopf;","U+1D56B" +"zscr;","U+1D4CF" +"zwj;","U+0200D" +"zwnj;","U+0200C" diff --git a/Source/WebCore/html/parser/HTMLEntityParser.cpp b/Source/WebCore/html/parser/HTMLEntityParser.cpp new file mode 100644 index 0000000..6a422b8 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntityParser.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLEntityParser.h" + +#include "HTMLEntitySearch.h" +#include "HTMLEntityTable.h" +#include <wtf/Vector.h> + +using namespace WTF; + +namespace WebCore { + +namespace { + +static const UChar windowsLatin1ExtensionArray[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F +}; + +inline UChar adjustEntity(UChar32 value) +{ + if ((value & ~0x1F) != 0x0080) + return value; + return windowsLatin1ExtensionArray[value - 0x80]; +} + +inline UChar32 legalEntityFor(UChar32 value) +{ + // FIXME: A number of specific entity values generate parse errors. 
+ if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) + return 0xFFFD; + if (U_IS_BMP(value)) + return adjustEntity(value); + return value; +} + +inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity) +{ + if (U_IS_BMP(value)) { + UChar character = static_cast<UChar>(value); + ASSERT(character == value); + decodedEntity.append(character); + return true; + } + decodedEntity.append(U16_LEAD(value)); + decodedEntity.append(U16_TRAIL(value)); + return true; +} + +inline bool isHexDigit(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); +} + +inline bool isAlphaNumeric(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); +} + +void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters) +{ + if (consumedCharacters.size() == 1) + source.push(consumedCharacters[0]); + else if (consumedCharacters.size() == 2) { + source.push(consumedCharacters[0]); + source.push(consumedCharacters[1]); + } else + source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size()))); +} + +} + +bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) +{ + ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); + ASSERT(!notEnoughCharacters); + ASSERT(decodedEntity.isEmpty()); + + enum EntityState { + Initial, + Number, + MaybeHexLowerCaseX, + MaybeHexUpperCaseX, + Hex, + Decimal, + Named + }; + EntityState entityState = Initial; + UChar32 result = 0; + Vector<UChar, 10> consumedCharacters; + + while (!source.isEmpty()) { + UChar cc = *source; + switch (entityState) { + case Initial: { + if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') + return false; + if 
(additionalAllowedCharacter && cc == additionalAllowedCharacter) + return false; + if (cc == '#') { + entityState = Number; + break; + } + if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { + entityState = Named; + continue; + } + return false; + } + case Number: { + if (cc == 'x') { + entityState = MaybeHexLowerCaseX; + break; + } + if (cc == 'X') { + entityState = MaybeHexUpperCaseX; + break; + } + if (cc >= '0' && cc <= '9') { + entityState = Decimal; + continue; + } + source.push('#'); + return false; + } + case MaybeHexLowerCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('x'); + return false; + } + case MaybeHexUpperCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('X'); + return false; + } + case Hex: { + if (cc >= '0' && cc <= '9') + result = result * 16 + cc - '0'; + else if (cc >= 'a' && cc <= 'f') + result = result * 16 + 10 + cc - 'a'; + else if (cc >= 'A' && cc <= 'F') + result = result * 16 + 10 + cc - 'A'; + else { + if (cc == ';') + source.advanceAndASSERT(cc); + return convertToUTF16(legalEntityFor(result), decodedEntity); + } + break; + } + case Decimal: { + if (cc >= '0' && cc <= '9') + result = result * 10 + cc - '0'; + else { + if (cc == ';') + source.advanceAndASSERT(cc); + return convertToUTF16(legalEntityFor(result), decodedEntity); + } + break; + } + case Named: { + HTMLEntitySearch entitySearch; + while (!source.isEmpty()) { + cc = *source; + entitySearch.advance(cc); + if (!entitySearch.isEntityPrefix()) + break; + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + notEnoughCharacters = source.isEmpty(); + if (notEnoughCharacters) { + // We can't an entity because there might be a longer entity + // that we could match if we had more data. 
+ unconsumeCharacters(source, consumedCharacters); + return false; + } + if (!entitySearch.mostRecentMatch()) { + ASSERT(!entitySearch.currentValue()); + unconsumeCharacters(source, consumedCharacters); + return false; + } + if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { + // We've consumed too many characters. We need to walk the + // source back to the point at which we had consumed an + // actual entity. + unconsumeCharacters(source, consumedCharacters); + consumedCharacters.clear(); + const int length = entitySearch.mostRecentMatch()->length; + const UChar* reference = entitySearch.mostRecentMatch()->entity; + for (int i = 0; i < length; ++i) { + cc = *source; + ASSERT_UNUSED(reference, cc == *reference++); + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + ASSERT(!source.isEmpty()); + } + cc = *source; + } + if (entitySearch.mostRecentMatch()->lastCharacter() == ';' + || !additionalAllowedCharacter + || !(isAlphaNumeric(cc) || cc == '=')) { + return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); + } + unconsumeCharacters(source, consumedCharacters); + return false; + } + } + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + ASSERT(source.isEmpty()); + notEnoughCharacters = true; + unconsumeCharacters(source, consumedCharacters); + return false; +} + +UChar decodeNamedEntity(const char* name) +{ + HTMLEntitySearch search; + while (*name) { + search.advance(*name++); + if (!search.isEntityPrefix()) + return 0; + } + search.advance(';'); + UChar32 entityValue = search.currentValue(); + if (U16_LENGTH(entityValue) != 1) { + // Callers need to move off this API if the entity table has values + // which do no fit in a 16 bit UChar! 
+ ASSERT_NOT_REACHED(); + return 0; + } + return static_cast<UChar>(entityValue); +} + +} // namespace WebCore diff --git a/Source/WebCore/html/parser/HTMLEntityParser.h b/Source/WebCore/html/parser/HTMLEntityParser.h new file mode 100644 index 0000000..f02e849 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntityParser.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLEntityParser_h +#define HTMLEntityParser_h + +#include "SegmentedString.h" + +namespace WebCore { + +bool consumeHTMLEntity(SegmentedString&, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0'); + +// Used by the XML parser. Not suitable for use in HTML parsing. Use consumeHTMLEntity instead. +UChar decodeNamedEntity(const char*); + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLEntitySearch.cpp b/Source/WebCore/html/parser/HTMLEntitySearch.cpp new file mode 100644 index 0000000..56fb91a --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntitySearch.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLEntitySearch.h" + +#include "HTMLEntityTable.h" + +namespace WebCore { + +namespace { + +const HTMLEntityTableEntry* halfway(const HTMLEntityTableEntry* left, const HTMLEntityTableEntry* right) +{ + return &left[(right - left) / 2]; +} + +} + +HTMLEntitySearch::HTMLEntitySearch() + : m_currentLength(0) + , m_currentValue(0) + , m_mostRecentMatch(0) + , m_first(HTMLEntityTable::firstEntry()) + , m_last(HTMLEntityTable::lastEntry()) +{ +} + +HTMLEntitySearch::CompareResult HTMLEntitySearch::compare(const HTMLEntityTableEntry* entry, UChar nextCharacter) const +{ + if (entry->length < m_currentLength + 1) + return Before; + UChar entryNextCharacter = entry->entity[m_currentLength]; + if (entryNextCharacter == nextCharacter) + return Prefix; + return entryNextCharacter < nextCharacter ? 
Before : After; +} + +const HTMLEntityTableEntry* HTMLEntitySearch::findFirst(UChar nextCharacter) const +{ + const HTMLEntityTableEntry* left = m_first; + const HTMLEntityTableEntry* right = m_last; + if (left == right) + return left; + CompareResult result = compare(left, nextCharacter); + if (result == Prefix) + return left; + if (result == After) + return right; + while (left + 1 < right) { + const HTMLEntityTableEntry* probe = halfway(left, right); + result = compare(probe, nextCharacter); + if (result == Before) + left = probe; + else { + ASSERT(result == After || result == Prefix); + right = probe; + } + } + ASSERT(left + 1 == right); + return right; +} + +const HTMLEntityTableEntry* HTMLEntitySearch::findLast(UChar nextCharacter) const +{ + const HTMLEntityTableEntry* left = m_first; + const HTMLEntityTableEntry* right = m_last; + if (left == right) + return right; + CompareResult result = compare(right, nextCharacter); + if (result == Prefix) + return right; + if (result == Before) + return left; + while (left + 1 < right) { + const HTMLEntityTableEntry* probe = halfway(left, right); + result = compare(probe, nextCharacter); + if (result == After) + right = probe; + else { + ASSERT(result == Before || result == Prefix); + left = probe; + } + } + ASSERT(left + 1 == right); + return left; +} + +void HTMLEntitySearch::advance(UChar nextCharacter) +{ + ASSERT(isEntityPrefix()); + if (!m_currentLength) { + m_first = HTMLEntityTable::firstEntryStartingWith(nextCharacter); + m_last = HTMLEntityTable::lastEntryStartingWith(nextCharacter); + if (!m_first || !m_last) + return fail(); + } else { + m_first = findFirst(nextCharacter); + m_last = findLast(nextCharacter); + if (m_first == m_last && compare(m_first, nextCharacter) != Prefix) + return fail(); + } + ++m_currentLength; + if (m_first->length != m_currentLength) { + m_currentValue = 0; + return; + } + m_mostRecentMatch = m_first; + m_currentValue = m_mostRecentMatch->value; +} + +} diff --git 
a/Source/WebCore/html/parser/HTMLEntitySearch.h b/Source/WebCore/html/parser/HTMLEntitySearch.h new file mode 100644 index 0000000..0c66318 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntitySearch.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLEntitySearch_h +#define HTMLEntitySearch_h + +#include "PlatformString.h" + +namespace WebCore { + +struct HTMLEntityTableEntry; + +class HTMLEntitySearch { +public: + HTMLEntitySearch(); + + void advance(UChar); + + bool isEntityPrefix() const { return !!m_first; } + UChar32 currentValue() const { return m_currentValue; } + int currentLength() const { return m_currentLength; } + + const HTMLEntityTableEntry* mostRecentMatch() const { return m_mostRecentMatch; } + +private: + enum CompareResult { + Before, + Prefix, + After, + }; + + CompareResult compare(const HTMLEntityTableEntry*, UChar) const; + const HTMLEntityTableEntry* findFirst(UChar) const; + const HTMLEntityTableEntry* findLast(UChar) const; + + void fail() + { + m_currentValue = 0; + m_first = 0; + m_last = 0; + } + + int m_currentLength; + UChar32 m_currentValue; + + const HTMLEntityTableEntry* m_mostRecentMatch; + const HTMLEntityTableEntry* m_first; + const HTMLEntityTableEntry* m_last; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLEntityTable.h b/Source/WebCore/html/parser/HTMLEntityTable.h new file mode 100644 index 0000000..3b9ab4e --- /dev/null +++ b/Source/WebCore/html/parser/HTMLEntityTable.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLEntityTable_h +#define HTMLEntityTable_h + +#include "PlatformString.h" + +namespace WebCore { + +struct HTMLEntityTableEntry { + UChar lastCharacter() const { return entity[length - 1]; } + + const UChar* entity; + int length; + UChar32 value; +}; + +class HTMLEntityTable { +public: + static const HTMLEntityTableEntry* firstEntry(); + static const HTMLEntityTableEntry* lastEntry(); + + static const HTMLEntityTableEntry* firstEntryStartingWith(UChar); + static const HTMLEntityTableEntry* lastEntryStartingWith(UChar); +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLFormattingElementList.cpp b/Source/WebCore/html/parser/HTMLFormattingElementList.cpp new file mode 100644 index 0000000..22bf03e --- /dev/null +++ b/Source/WebCore/html/parser/HTMLFormattingElementList.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLFormattingElementList.h" + +#include "Element.h" +#include "NotImplemented.h" + +namespace WebCore { + +HTMLFormattingElementList::HTMLFormattingElementList() +{ +} + +HTMLFormattingElementList::~HTMLFormattingElementList() +{ +} + +Element* HTMLFormattingElementList::closestElementInScopeWithName(const AtomicString& targetName) +{ + for (unsigned i = 1; i <= m_entries.size(); ++i) { + const Entry& entry = m_entries[m_entries.size() - i]; + if (entry.isMarker()) + return 0; + if (entry.element()->hasLocalName(targetName)) + return entry.element(); + } + return 0; +} + +bool HTMLFormattingElementList::contains(Element* element) +{ + return !!find(element); +} + +HTMLFormattingElementList::Entry* HTMLFormattingElementList::find(Element* element) +{ + size_t index = m_entries.reverseFind(element); + if (index != notFound) { + // This is somewhat of a hack, and is why this method can't be const. 
+ return &m_entries[index]; + } + return 0; +} + +HTMLFormattingElementList::Bookmark HTMLFormattingElementList::bookmarkFor(Element* element) +{ + size_t index = m_entries.reverseFind(element); + ASSERT(index != notFound); + return Bookmark(&at(index)); +} + +void HTMLFormattingElementList::swapTo(Element* oldElement, Element* newElement, const Bookmark& bookmark) +{ + ASSERT(contains(oldElement)); + ASSERT(!contains(newElement)); + if (!bookmark.hasBeenMoved()) { + ASSERT(bookmark.mark()->element() == oldElement); + bookmark.mark()->replaceElement(newElement); + return; + } + size_t index = bookmark.mark() - first(); + ASSERT(index < size()); + m_entries.insert(index + 1, newElement); + remove(oldElement); +} + +void HTMLFormattingElementList::append(Element* element) +{ + m_entries.append(element); +} + +void HTMLFormattingElementList::remove(Element* element) +{ + size_t index = m_entries.reverseFind(element); + if (index != notFound) + m_entries.remove(index); +} + +void HTMLFormattingElementList::appendMarker() +{ + m_entries.append(Entry::MarkerEntry); +} + +void HTMLFormattingElementList::clearToLastMarker() +{ + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#clear-the-list-of-active-formatting-elements-up-to-the-last-marker + while (m_entries.size()) { + bool shouldStop = m_entries.last().isMarker(); + m_entries.removeLast(); + if (shouldStop) + break; + } +} + +#ifndef NDEBUG + +void HTMLFormattingElementList::show() +{ + for (unsigned i = 1; i <= m_entries.size(); ++i) { + const Entry& entry = m_entries[m_entries.size() - i]; + if (entry.isMarker()) + fprintf(stderr, "marker\n"); + else + entry.element()->showNode(); + } +} + +#endif + +} diff --git a/Source/WebCore/html/parser/HTMLFormattingElementList.h b/Source/WebCore/html/parser/HTMLFormattingElementList.h new file mode 100644 index 0000000..aca05bb --- /dev/null +++ b/Source/WebCore/html/parser/HTMLFormattingElementList.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2010 
Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLFormattingElementList_h +#define HTMLFormattingElementList_h + +#include <wtf/Forward.h> +#include <wtf/RefPtr.h> +#include <wtf/Vector.h> + +namespace WebCore { + +class Element; + +// This may end up merged into HTMLElementStack. +class HTMLFormattingElementList : public Noncopyable { +public: + HTMLFormattingElementList(); + ~HTMLFormattingElementList(); + + // Ideally Entry would be private, but HTMLTreeBuilder has to coordinate + // between the HTMLFormattingElementList and HTMLElementStack and needs + // access to Entry::isMarker() and Entry::replaceElement() to do so. 
+ class Entry { + public: + // Inline because they're hot and Vector<T> uses them. + explicit Entry(Element* element) + : m_element(element) + { + ASSERT(element); + } + enum MarkerEntryType { MarkerEntry }; + Entry(MarkerEntryType) + : m_element(0) + { + } + ~Entry() {} + + bool isMarker() const { return !m_element; } + + Element* element() const + { + // The fact that !m_element == isMarker() is an implementation detail; + // callers should check isMarker() before calling element(). + ASSERT(m_element); + return m_element.get(); + } + void replaceElement(PassRefPtr<Element> element) { m_element = element; } + + // Needed for use with Vector. These are super-hot and must be inline. + bool operator==(Element* element) const { return m_element == element; } + bool operator!=(Element* element) const { return m_element != element; } + + private: + RefPtr<Element> m_element; + }; + + // Remembers an Entry position in the list. moveToAfter() repoints the + // mark and records that it has been moved; swapTo() uses that flag to + // choose between replacing the element in place and re-inserting it. + class Bookmark { + public: + Bookmark(Entry* entry) + : m_hasBeenMoved(false) + , m_mark(entry) + { + } + + void moveToAfter(Entry* before) + { + m_hasBeenMoved = true; + m_mark = before; + } + + bool hasBeenMoved() const { return m_hasBeenMoved; } + Entry* mark() const { return m_mark; } + + private: + bool m_hasBeenMoved; + Entry* m_mark; + }; + + bool isEmpty() const { return !size(); } + size_t size() const { return m_entries.size(); } + + Element* closestElementInScopeWithName(const AtomicString&); + + Entry* find(Element*); + bool contains(Element*); + void append(Element*); + void remove(Element*); + + Bookmark bookmarkFor(Element*); + void swapTo(Element* oldElement, Element* newElement, const Bookmark&); + + void appendMarker(); + // clearToLastMarker also clears the marker (per the HTML5 spec). 
+ void clearToLastMarker(); + + const Entry& at(size_t i) const { return m_entries[i]; } + Entry& at(size_t i) { return m_entries[i]; } + +#ifndef NDEBUG + void show(); +#endif + +private: + Entry* first() { return &at(0); } + + Vector<Entry> m_entries; +}; + +} + +#endif // HTMLFormattingElementList_h diff --git a/Source/WebCore/html/parser/HTMLInputStream.h b/Source/WebCore/html/parser/HTMLInputStream.h new file mode 100644 index 0000000..1bfbaf9 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLInputStream.h @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLInputStream_h +#define HTMLInputStream_h + +#include "SegmentedString.h" + +namespace WebCore { + +// The InputStream is made up of a sequence of SegmentedStrings: +// +// [--current--][--next--][--next--] ... [--next--] +// /\ (also called m_last) +// L_ current insertion point +// +// The current segmented string is stored in InputStream. Each of the +// afterInsertionPoint buffers is stored in InsertionPointRecords on the +// stack. +// +// We remove characters from the "current" string in the InputStream. +// document.write() will add characters at the current insertion point, +// which appends them to the "current" string. +// +// m_last is a pointer to the last of the afterInsertionPoint strings. +// The network adds data at the end of the InputStream, which appends +// them to the "last" string. +class HTMLInputStream : public Noncopyable { +public: + HTMLInputStream() + : m_last(&m_first) + { + } + + void appendToEnd(const SegmentedString& string) + { + m_last->append(string); + } + + void insertAtCurrentInsertionPoint(const SegmentedString& string) + { + m_first.append(string); + } + + bool hasInsertionPoint() const + { + if (&m_first != m_last) + return true; + if (!haveSeenEndOfFile()) { + // FIXME: Somehow we need to understand the difference between + // input streams that are coming off the network and streams that + // were created with document.open(). In the latter case, we always + // have an insertion point at the end of the stream until someone + // calls document.close(). + return true; + } + return false; + } + + void markEndOfFile() + { + // FIXME: This should use InputStreamPreprocessor::endOfFileMarker + // once InputStreamPreprocessor is split off into its own header. 
+ static const UChar endOfFileMarker = 0; + m_last->append(SegmentedString(String(&endOfFileMarker, 1))); + m_last->close(); + } + + bool haveSeenEndOfFile() const + { + return m_last->isClosed(); + } + + SegmentedString& current() { return m_first; } + const SegmentedString& current() const { return m_first; } + + void splitInto(SegmentedString& next) + { + next = m_first; + m_first = SegmentedString(); + if (m_last == &m_first) { + // We used to only have one SegmentedString in the InputStream + // but now we have two. That means m_first is no longer also + // the m_last string, |next| is now the last one. + m_last = &next; + } + } + + void mergeFrom(SegmentedString& next) + { + m_first.append(next); + if (m_last == &next) { + // The string |next| used to be the last SegmentedString in + // the InputStream. Now that it's been merged into m_first, + // that makes m_first the last one. + m_last = &m_first; + } + if (next.isClosed()) { + // We also need to merge the "closed" state from next to + // m_first. Arguably, this work could be done in append(). + m_first.close(); + } + } + +private: + SegmentedString m_first; + SegmentedString* m_last; +}; + +class InsertionPointRecord : public Noncopyable { +public: + explicit InsertionPointRecord(HTMLInputStream& inputStream) + : m_inputStream(&inputStream) + { + m_line = m_inputStream->current().currentLine(); + m_column = m_inputStream->current().currentColumn(); + m_inputStream->splitInto(m_next); + // We 'fork' current position and use it for the generated script part. + // This is a bit weird, because generated part does not have positions within an HTML document. + m_inputStream->current().setCurrentPosition(m_line, m_column, 0); + } + + ~InsertionPointRecord() + { + // Some inserted text may have remained in input stream. E.g. if script has written "&" or "<table", + // it stays in buffer because it cannot be properly tokenized before we see next part. 
+ int unparsedRemainderLength = m_inputStream->current().length(); + m_inputStream->mergeFrom(m_next); + // We restore position for the character that goes right after unparsed remainder. + m_inputStream->current().setCurrentPosition(m_line, m_column, unparsedRemainderLength); + } + +private: + HTMLInputStream* m_inputStream; + SegmentedString m_next; + WTF::ZeroBasedNumber m_line; + WTF::ZeroBasedNumber m_column; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp new file mode 100644 index 0000000..eac7d28 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLMetaCharsetParser.h" + +#include "HTMLNames.h" +#include "HTMLParserIdioms.h" +#include "HTMLTokenizer.h" +#include "PlatformString.h" +#include "TextCodec.h" +#include "TextEncodingRegistry.h" + +using namespace WTF; + +namespace WebCore { + +using namespace HTMLNames; + +HTMLMetaCharsetParser::HTMLMetaCharsetParser() + : m_tokenizer(HTMLTokenizer::create(false)) // No pre-HTML5 parser quirks. + , m_assumedCodec(newTextCodec(Latin1Encoding())) + , m_inHeadSection(true) + , m_doneChecking(false) +{ +} + +HTMLMetaCharsetParser::~HTMLMetaCharsetParser() +{ +} + +static const char charsetString[] = "charset"; +static const size_t charsetLength = sizeof("charset") - 1; + +String HTMLMetaCharsetParser::extractCharset(const String& value) +{ + size_t pos = 0; + unsigned length = value.length(); + + while (pos < length) { + pos = value.find(charsetString, pos, false); + if (pos == notFound) + break; + + pos += charsetLength; + + // Skip whitespace. 
+ while (pos < length && value[pos] <= ' ') + ++pos; + + if (value[pos] != '=') + continue; + + ++pos; + + while (pos < length && value[pos] <= ' ') + ++pos; + + char quoteMark = 0; + if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { + quoteMark = static_cast<char>(value[pos++]); + ASSERT(!(quoteMark & 0x80)); + } + + if (pos == length) + break; + + unsigned end = pos; + while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';'))) + ++end; + + if (quoteMark && (end == length)) + break; // Close quote not found. + + return value.substring(pos, end - pos); + } + + return ""; +} + +bool HTMLMetaCharsetParser::processMeta() +{ + bool gotPragma = false; + Mode mode = None; + String charset; + + const HTMLToken::AttributeList& attributes = m_token.attributes(); + for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); + iter != attributes.end(); ++iter) { + AtomicString attributeName(iter->m_name.data(), iter->m_name.size()); + String attributeValue(iter->m_value.data(), iter->m_value.size()); + + if (attributeName == http_equivAttr) { + if (equalIgnoringCase(attributeValue, "content-type")) + gotPragma = true; + } else if (charset.isEmpty()) { + if (attributeName == charsetAttr) { + charset = attributeValue; + mode = Charset; + } else if (attributeName == contentAttr) { + charset = extractCharset(attributeValue); + if (charset.length()) + mode = Pragma; + } + } + } + + if (mode == Charset || (mode == Pragma && gotPragma)) { + m_encoding = TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); + if (m_encoding.isValid()) + return true; + } + + return false; +} + +static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. 
+ +// Scans the next |length| bytes of |data| for a <meta> charset declaration. +// Returns true once scanning is finished, whether or not an encoding was found. +bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) +{ + if (m_doneChecking) + return true; + + ASSERT(!m_encoding.isValid()); + + // We still don't have an encoding, and are in the head. + // The following tags are allowed in <head>: + // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE + + // We stop scanning when a tag that is not permitted in <head> + // is seen, rather than when </head> is seen, because that more closely + // matches behavior in other browsers; more details in + // <http://bugs.webkit.org/show_bug.cgi?id=3590>. + + // Additionally, we ignore things that look like tags in <title>, <script> + // and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>, + // <http://bugs.webkit.org/show_bug.cgi?id=12165> and + // <http://bugs.webkit.org/show_bug.cgi?id=12389>. + + // Since many sites have charset declarations after <body> or other tags + // that are disallowed in <head>, we don't bail out until we've checked at + // least bytesToCheckUnconditionally bytes of input. 
+ + m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); + + while (m_tokenizer->nextToken(m_input, m_token)) { + bool end = m_token.type() == HTMLToken::EndTag; + if (end || m_token.type() == HTMLToken::StartTag) { + AtomicString tagName(m_token.name().data(), m_token.name().size()); + if (!end) { + m_tokenizer->updateStateFor(tagName, 0); + if (tagName == metaTag && processMeta()) { + m_doneChecking = true; + return true; + } + } + + if (tagName != scriptTag && tagName != noscriptTag + && tagName != styleTag && tagName != linkTag + && tagName != metaTag && tagName != objectTag + && tagName != titleTag && tagName != baseTag + && (end || tagName != htmlTag) && (end || tagName != headTag)) { + m_inHeadSection = false; + } + } + + if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { + m_doneChecking = true; + return true; + } + + m_token.clear(); + } + + return false; +} + +} diff --git a/Source/WebCore/html/parser/HTMLMetaCharsetParser.h b/Source/WebCore/html/parser/HTMLMetaCharsetParser.h new file mode 100644 index 0000000..c3136f5 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLMetaCharsetParser.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLMetaCharsetParser_h +#define HTMLMetaCharsetParser_h + +#include "HTMLToken.h" +#include "SegmentedString.h" +#include "TextEncoding.h" +#include <wtf/Noncopyable.h> + +namespace WebCore { + +class HTMLTokenizer; +class TextCodec; + +class HTMLMetaCharsetParser : public Noncopyable { +public: + static PassOwnPtr<HTMLMetaCharsetParser> create() { return adoptPtr(new HTMLMetaCharsetParser()); } + + ~HTMLMetaCharsetParser(); + + // Returns true if done checking, regardless of whether an encoding is found. + bool checkForMetaCharset(const char*, size_t); + + const TextEncoding& encoding() { return m_encoding; } + +private: + HTMLMetaCharsetParser(); + + bool processMeta(); + String extractCharset(const String&); + + enum Mode { + None, + Charset, + Pragma, + }; + + OwnPtr<HTMLTokenizer> m_tokenizer; + OwnPtr<TextCodec> m_assumedCodec; + SegmentedString m_input; + HTMLToken m_token; + bool m_inHeadSection; + + bool m_doneChecking; + TextEncoding m_encoding; +}; + +} +#endif diff --git a/Source/WebCore/html/parser/HTMLParserIdioms.cpp b/Source/WebCore/html/parser/HTMLParserIdioms.cpp new file mode 100644 index 0000000..91ff8d3 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLParserIdioms.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2010 Apple Inc.
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLParserIdioms.h" + +#include <wtf/MathExtras.h> +#include <wtf/dtoa.h> +#include <wtf/text/AtomicString.h> + +namespace WebCore { + +String stripLeadingAndTrailingHTMLSpaces(const String& string) +{ + const UChar* characters = string.characters(); + unsigned length = string.length(); + + unsigned numLeadingSpaces; + for (numLeadingSpaces = 0; numLeadingSpaces < length; ++numLeadingSpaces) { + if (isNotHTMLSpace(characters[numLeadingSpaces])) + break; + } + + if (numLeadingSpaces == length) + return string.isNull() ? 
string : emptyAtom.string(); + + unsigned numTrailingSpaces; + for (numTrailingSpaces = 0; numTrailingSpaces < length; ++numTrailingSpaces) { + if (isNotHTMLSpace(characters[length - numTrailingSpaces - 1])) + break; + } + + ASSERT(numLeadingSpaces + numTrailingSpaces < length); + + return string.substring(numLeadingSpaces, length - (numLeadingSpaces + numTrailingSpaces)); +} + +String serializeForNumberType(double number) +{ + // According to HTML5, "the best representation of the number n as a floating + // point number" is a string produced by applying ToString() to n. + NumberToStringBuffer buffer; + unsigned length = numberToString(number, buffer); + return String(buffer, length); +} + +bool parseToDoubleForNumberType(const String& string, double* result) +{ + // See HTML5 2.4.4.3 `Real numbers.' + + // String::toDouble() accepts leading + and whitespace characters, which are not valid here. + UChar firstCharacter = string[0]; + if (firstCharacter != '-' && !isASCIIDigit(firstCharacter)) + return false; + + bool valid = false; + double value = string.toDouble(&valid); + if (!valid) + return false; + + // NaN and infinity are considered valid by String::toDouble, but not valid here. + if (!isfinite(value)) + return false; + + // Numbers are considered finite IEEE 754 single-precision floating point values. + // See HTML5 2.4.4.3 `Real numbers.' + if (-FLT_MAX > value || value > FLT_MAX) + return false; + + if (result) { + // The following expression converts -0 to +0. + *result = value ? 
value : 0; + } + + return true; +} + +bool parseToDoubleForNumberTypeWithDecimalPlaces(const String& string, double *result, unsigned *decimalPlaces) +{ + if (decimalPlaces) + *decimalPlaces = 0; + + if (!parseToDoubleForNumberType(string, result)) + return false; + + if (!decimalPlaces) + return true; + + size_t dotIndex = string.find('.'); + size_t eIndex = string.find('e'); + if (eIndex == notFound) + eIndex = string.find('E'); + + unsigned baseDecimalPlaces = 0; + if (dotIndex != notFound) { + if (eIndex == notFound) + baseDecimalPlaces = string.length() - dotIndex - 1; + else + baseDecimalPlaces = eIndex - dotIndex - 1; + } + + int exponent = 0; + if (eIndex != notFound) { + unsigned cursor = eIndex + 1, cursorSaved; + int digit, exponentSign; + int32_t exponent32; + size_t length = string.length(); + + // Not using String.toInt() in order to perform the same computation as dtoa() does. + exponentSign = 0; + switch (digit = string[cursor]) { + case '-': + exponentSign = 1; + case '+': + digit = string[++cursor]; + } + if (digit >= '0' && digit <= '9') { + while (cursor < length && digit == '0') + digit = string[++cursor]; + if (digit > '0' && digit <= '9') { + exponent32 = digit - '0'; + cursorSaved = cursor; + while (cursor < length && (digit = string[++cursor]) >= '0' && digit <= '9') + exponent32 = (10 * exponent32) + digit - '0'; + if (cursor - cursorSaved > 8 || exponent32 > 19999) + /* Avoid confusion from exponents + * so large that e might overflow. 
+ */ + exponent = 19999; /* safe for 16 bit ints */ + else + exponent = static_cast<int>(exponent32); + if (exponentSign) + exponent = -exponent; + } else + exponent = 0; + } + } + + int intDecimalPlaces = baseDecimalPlaces - exponent; + if (intDecimalPlaces < 0) + *decimalPlaces = 0; + else if (intDecimalPlaces > 19999) + *decimalPlaces = 19999; + else + *decimalPlaces = static_cast<unsigned>(intDecimalPlaces); + + return true; +} + +// http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers +// Parses |input| following the HTML "rules for parsing integers": optional leading +// HTML spaces, an optional '+' or '-' sign, then a run of ASCII digits (anything +// after the digit run is ignored). On success stores the result in |value| and +// returns true; on failure returns false and leaves |value| untouched. +bool parseHTMLInteger(const String& input, int& value) +{ + // Step 1 + // Step 2 + const UChar* position = input.characters(); + const UChar* end = position + input.length(); + + // Step 3 + int sign = 1; + + // Step 4 + while (position < end) { + if (!isHTMLSpace(*position)) + break; + ++position; + } + + // Step 5 + if (position == end) + return false; + ASSERT(position < end); + + // Step 6 + if (*position == '-') { + sign = -1; + ++position; + } else if (*position == '+') + ++position; + if (position == end) + return false; + ASSERT(position < end); + + // Step 7 + if (!isASCIIDigit(*position)) + return false; + + // Step 8 + Vector<UChar, 16> digits; + while (position < end) { + if (!isASCIIDigit(*position)) + break; + digits.append(*position++); + } + + // Step 9 + // charactersToIntStrict() signals failure (e.g. a digit run that does not fit + // in an int) through its optional |ok| out-parameter. Propagate that as a parse + // failure instead of unconditionally reporting success with a bogus value. + // NOTE(review): the magnitude is parsed before the sign is applied, so INT_MIN + // itself is presumably reported as out of range -- confirm if that matters. + bool ok = false; + int magnitude = charactersToIntStrict(digits.data(), digits.size(), &ok); + if (!ok) + return false; + value = sign * magnitude; + return true; +} + +} diff --git a/Source/WebCore/html/parser/HTMLParserIdioms.h b/Source/WebCore/html/parser/HTMLParserIdioms.h new file mode 100644 index 0000000..4e8e58f --- /dev/null +++ b/Source/WebCore/html/parser/HTMLParserIdioms.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLParserIdioms_h +#define HTMLParserIdioms_h + +#include <wtf/Forward.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +// Space characters as defined by the HTML specification. +bool isHTMLSpace(UChar); +bool isNotHTMLSpace(UChar); + +// Strip leading and trailing whitespace as defined by the HTML specification. +String stripLeadingAndTrailingHTMLSpaces(const String&); + +// An implementation of the HTML specification's algorithm to convert a number to a string for number and range types. +String serializeForNumberType(double); + +// Convert the specified string to a double. If the conversion fails, the return value is false. +// Leading or trailing illegal characters cause failure, as does passing an empty string. +// The double* parameter may be 0 to check if the string can be parsed without getting the result. 
+bool parseToDoubleForNumberType(const String&, double*); +bool parseToDoubleForNumberTypeWithDecimalPlaces(const String&, double*, unsigned*); + +// http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers +bool parseHTMLInteger(const String&, int&); + +// Inline implementations of some of the functions declared above. + +inline bool isHTMLSpace(UChar character) +{ + // Histogram from Apple's page load test combined with some ad hoc browsing some other test suites. + // + // 82%: 216330 non-space characters, all > U+0020 + // 11%: 30017 plain space characters, U+0020 + // 5%: 12099 newline characters, U+000A + // 2%: 5346 tab characters, U+0009 + // + // No other characters seen. No U+000C or U+000D, and no other control characters. + // Accordingly, we check for non-spaces first, then space, then newline, then tab, then the other characters. + + return character <= ' ' && (character == ' ' || character == '\n' || character == '\t' || character == '\r' || character == '\f'); +} + +inline bool isNotHTMLSpace(UChar character) +{ + return !isHTMLSpace(character); +} + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLParserScheduler.cpp b/Source/WebCore/html/parser/HTMLParserScheduler.cpp new file mode 100644 index 0000000..56db1aa --- /dev/null +++ b/Source/WebCore/html/parser/HTMLParserScheduler.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLParserScheduler.h" + +#include "FrameView.h" // Only for isLayoutTimerActive +#include "HTMLDocumentParser.h" +#include "Document.h" + +// defaultParserChunkSize is used to define how many tokens the parser will +// process before checking against parserTimeLimit and possibly yielding. +// This is a performance optimization to prevent checking after every token. +static const int defaultParserChunkSize = 4096; + +// defaultParserTimeLimit is the number of seconds the parser will run in one write() call +// before yielding. Inline <script> execution can cause it to exceed the limit. +// FIXME: We would like this value to be 0.2. +static const double defaultParserTimeLimit = 0.500; + +namespace WebCore { + +static double parserTimeLimit(Page* page) +{ + // We're using the poorly named customHTMLTokenizerTimeDelay setting.
+ if (page && page->hasCustomHTMLTokenizerTimeDelay()) + return page->customHTMLTokenizerTimeDelay(); + return defaultParserTimeLimit; +} + +static int parserChunkSize(Page* page) +{ + // FIXME: We may need to divide the value from customHTMLTokenizerChunkSize + // by some constant to translate from the "character" based behavior of the + // old LegacyHTMLDocumentParser to the token-based behavior of this parser. + if (page && page->hasCustomHTMLTokenizerChunkSize()) + return page->customHTMLTokenizerChunkSize(); + return defaultParserChunkSize; +} + +HTMLParserScheduler::HTMLParserScheduler(HTMLDocumentParser* parser) + : m_parser(parser) + , m_parserTimeLimit(parserTimeLimit(m_parser->document()->page())) + , m_parserChunkSize(parserChunkSize(m_parser->document()->page())) + , m_continueNextChunkTimer(this, &HTMLParserScheduler::continueNextChunkTimerFired) + , m_isSuspendedWithActiveTimer(false) +{ +} + +HTMLParserScheduler::~HTMLParserScheduler() +{ + m_continueNextChunkTimer.stop(); +} + +// FIXME: This belongs on Document. +static bool isLayoutTimerActive(Document* doc) +{ + ASSERT(doc); + return doc->view() && doc->view()->layoutPending() && !doc->minimumLayoutDelay(); +} + +void HTMLParserScheduler::continueNextChunkTimerFired(Timer<HTMLParserScheduler>* timer) +{ + ASSERT_UNUSED(timer, timer == &m_continueNextChunkTimer); + // FIXME: The timer class should handle timer priorities instead of this code. + // If a layout is scheduled, wait again to let the layout timer run first. 
+ if (isLayoutTimerActive(m_parser->document())) { + m_continueNextChunkTimer.startOneShot(0); + return; + } + m_parser->resumeParsingAfterYield(); +} + +void HTMLParserScheduler::suspend() +{ + ASSERT(!m_isSuspendedWithActiveTimer); + if (!m_continueNextChunkTimer.isActive()) + return; + m_isSuspendedWithActiveTimer = true; + m_continueNextChunkTimer.stop(); +} + +void HTMLParserScheduler::resume() +{ + ASSERT(!m_continueNextChunkTimer.isActive()); + if (!m_isSuspendedWithActiveTimer) + return; + m_isSuspendedWithActiveTimer = false; + m_continueNextChunkTimer.startOneShot(0); +} + +} diff --git a/Source/WebCore/html/parser/HTMLParserScheduler.h b/Source/WebCore/html/parser/HTMLParserScheduler.h new file mode 100644 index 0000000..3a20b2b --- /dev/null +++ b/Source/WebCore/html/parser/HTMLParserScheduler.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLParserScheduler_h +#define HTMLParserScheduler_h + +#include "Timer.h" +#include <wtf/CurrentTime.h> +#include <wtf/Noncopyable.h> +#include <wtf/PassOwnPtr.h> + +namespace WebCore { + +class HTMLDocumentParser; + +class HTMLParserScheduler : public Noncopyable { +public: + static PassOwnPtr<HTMLParserScheduler> create(HTMLDocumentParser* parser) + { + return adoptPtr(new HTMLParserScheduler(parser)); + } + ~HTMLParserScheduler(); + + struct PumpSession { + PumpSession() + : processedTokens(0) + , startTime(currentTime()) + { + } + + int processedTokens; + double startTime; + }; + + // Inline as this is called after every token in the parser. + bool shouldContinueParsing(PumpSession& session) + { + if (session.processedTokens > m_parserChunkSize) { + session.processedTokens = 0; + double elapsedTime = currentTime() - session.startTime; + if (elapsedTime > m_parserTimeLimit) { + // Schedule the parser to continue and yield from the parser. 
+ m_continueNextChunkTimer.startOneShot(0); + return false; + } + } + + ++session.processedTokens; + return true; + } + + bool isScheduledForResume() const { return m_isSuspendedWithActiveTimer || m_continueNextChunkTimer.isActive(); } + + void suspend(); + void resume(); + +private: + HTMLParserScheduler(HTMLDocumentParser*); + + void continueNextChunkTimerFired(Timer<HTMLParserScheduler>*); + + HTMLDocumentParser* m_parser; + + double m_parserTimeLimit; + int m_parserChunkSize; + Timer<HTMLParserScheduler> m_continueNextChunkTimer; + bool m_isSuspendedWithActiveTimer; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLPreloadScanner.cpp b/Source/WebCore/html/parser/HTMLPreloadScanner.cpp new file mode 100644 index 0000000..d23542f --- /dev/null +++ b/Source/WebCore/html/parser/HTMLPreloadScanner.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLPreloadScanner.h" + +#include "CachedResourceLoader.h" +#include "Document.h" +#include "HTMLDocumentParser.h" +#include "HTMLTokenizer.h" +#include "HTMLLinkElement.h" +#include "HTMLNames.h" +#include "HTMLParserIdioms.h" +#include "MediaList.h" +#include "MediaQueryEvaluator.h" + +namespace WebCore { + +using namespace HTMLNames; + +namespace { + +class PreloadTask { +public: + PreloadTask(const HTMLToken& token) + : m_tagName(token.name().data(), token.name().size()) + , m_linkIsStyleSheet(false) + , m_linkMediaAttributeIsScreen(true) + { + processAttributes(token.attributes()); + } + + void processAttributes(const HTMLToken::AttributeList& attributes) + { + if (m_tagName != scriptTag && m_tagName != imgTag && m_tagName != linkTag) + return; + + for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); + iter != attributes.end(); ++iter) { + AtomicString attributeName(iter->m_name.data(), iter->m_name.size()); + String attributeValue(iter->m_value.data(), iter->m_value.size()); + + if (attributeName == charsetAttr) + m_charset = attributeValue; + + if (m_tagName == scriptTag || m_tagName == imgTag) { + if (attributeName == srcAttr) + setUrlToLoad(attributeValue); + } else if (m_tagName == linkTag) { + if (attributeName == hrefAttr) + setUrlToLoad(attributeValue); + else if (attributeName == relAttr) + m_linkIsStyleSheet = relAttributeIsStyleSheet(attributeValue); + else if (attributeName == mediaAttr) + 
m_linkMediaAttributeIsScreen = linkMediaAttributeIsScreen(attributeValue); + } + } + } + + static bool relAttributeIsStyleSheet(const String& attributeValue) + { + HTMLLinkElement::RelAttribute rel; + HTMLLinkElement::tokenizeRelAttribute(attributeValue, rel); + return rel.m_isStyleSheet && !rel.m_isAlternate && !rel.m_isIcon && !rel.m_isDNSPrefetch; + } + + static bool linkMediaAttributeIsScreen(const String& attributeValue) + { + if (attributeValue.isEmpty()) + return true; + RefPtr<MediaList> mediaList = MediaList::createAllowingDescriptionSyntax(attributeValue); + + // Only preload screen media stylesheets. Used this way, the evaluator evaluates to true for any + // rules containing complex queries (full evaluation is possible but it requires a frame and a style selector which + // may be problematic here). + MediaQueryEvaluator mediaQueryEvaluator("screen"); + return mediaQueryEvaluator.eval(mediaList.get()); + } + + void setUrlToLoad(const String& attributeValue) + { + // We only respect the first src/href, per HTML5: + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#attribute-name-state + if (!m_urlToLoad.isEmpty()) + return; + m_urlToLoad = stripLeadingAndTrailingHTMLSpaces(attributeValue); + } + + void preload(Document* document, bool scanningBody) + { + if (m_urlToLoad.isEmpty()) + return; + + CachedResourceLoader* cachedResourceLoader = document->cachedResourceLoader(); + if (m_tagName == scriptTag) + cachedResourceLoader->preload(CachedResource::Script, m_urlToLoad, m_charset, scanningBody); + else if (m_tagName == imgTag) + cachedResourceLoader->preload(CachedResource::ImageResource, m_urlToLoad, String(), scanningBody); + else if (m_tagName == linkTag && m_linkIsStyleSheet && m_linkMediaAttributeIsScreen) + cachedResourceLoader->preload(CachedResource::CSSStyleSheet, m_urlToLoad, m_charset, scanningBody); + } + + const AtomicString& tagName() const { return m_tagName; } + +private: + AtomicString m_tagName; + String 
m_urlToLoad; + String m_charset; + bool m_linkIsStyleSheet; + bool m_linkMediaAttributeIsScreen; +}; + +} // namespace + +HTMLPreloadScanner::HTMLPreloadScanner(Document* document) + : m_document(document) + , m_cssScanner(document) + , m_tokenizer(HTMLTokenizer::create(HTMLDocumentParser::usePreHTML5ParserQuirks(document))) + , m_bodySeen(false) + , m_inStyle(false) +{ +} + +void HTMLPreloadScanner::appendToEnd(const SegmentedString& source) +{ + m_source.append(source); +} + +void HTMLPreloadScanner::scan() +{ + // FIXME: We should save and re-use these tokens in HTMLDocumentParser if + // the pending script doesn't end up calling document.write. + while (m_tokenizer->nextToken(m_source, m_token)) { + processToken(); + m_token.clear(); + } +} + +void HTMLPreloadScanner::processToken() +{ + if (m_inStyle) { + if (m_token.type() == HTMLToken::Character) + m_cssScanner.scan(m_token, scanningBody()); + else if (m_token.type() == HTMLToken::EndTag) { + m_inStyle = false; + m_cssScanner.reset(); + } + } + + if (m_token.type() != HTMLToken::StartTag) + return; + + PreloadTask task(m_token); + m_tokenizer->updateStateFor(task.tagName(), m_document->frame()); + + if (task.tagName() == bodyTag) + m_bodySeen = true; + + if (task.tagName() == styleTag) + m_inStyle = true; + + task.preload(m_document, scanningBody()); +} + +bool HTMLPreloadScanner::scanningBody() const +{ + return m_document->body() || m_bodySeen; +} + +} diff --git a/Source/WebCore/html/parser/HTMLPreloadScanner.h b/Source/WebCore/html/parser/HTMLPreloadScanner.h new file mode 100644 index 0000000..94a90e6 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLPreloadScanner.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2010 Google Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLPreloadScanner_h +#define HTMLPreloadScanner_h + +#include "CSSPreloadScanner.h" +#include "HTMLToken.h" +#include "SegmentedString.h" +#include <wtf/Noncopyable.h> + +namespace WebCore { + +class Document; +class HTMLToken; +class HTMLTokenizer; +class SegmentedString; + +class HTMLPreloadScanner : public Noncopyable { +public: + HTMLPreloadScanner(Document*); + + void appendToEnd(const SegmentedString&); + void scan(); + +private: + void processToken(); + bool scanningBody() const; + + Document* m_document; + SegmentedString m_source; + CSSPreloadScanner m_cssScanner; + OwnPtr<HTMLTokenizer> m_tokenizer; + HTMLToken m_token; + bool m_bodySeen; + bool m_inStyle; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLScriptRunner.cpp b/Source/WebCore/html/parser/HTMLScriptRunner.cpp new file mode 100644 index 0000000..2fe1d30 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLScriptRunner.cpp @@ -0,0 +1,321 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLScriptRunner.h" + +#include "Attribute.h" +#include "CachedScript.h" +#include "CachedResourceLoader.h" +#include "Element.h" +#include "Event.h" +#include "Frame.h" +#include "HTMLInputStream.h" +#include "HTMLNames.h" +#include "HTMLScriptRunnerHost.h" +#include "IgnoreDestructiveWriteCountIncrementer.h" +#include "NestingLevelIncrementer.h" +#include "NotImplemented.h" +#include "ScriptElement.h" +#include "ScriptSourceCode.h" + +namespace WebCore { + +using namespace HTMLNames; + +HTMLScriptRunner::HTMLScriptRunner(Document* document, HTMLScriptRunnerHost* host) + : m_document(document) + , m_host(host) + , m_scriptNestingLevel(0) + , m_hasScriptsWaitingForStylesheets(false) +{ + ASSERT(m_host); +} + +HTMLScriptRunner::~HTMLScriptRunner() +{ + // FIXME: Should we be passed a "done loading/parsing" callback sooner than destruction? 
+ if (m_parsingBlockingScript.cachedScript() && m_parsingBlockingScript.watchingForLoad()) + stopWatchingForLoad(m_parsingBlockingScript); + + while (!m_scriptsToExecuteAfterParsing.isEmpty()) { + PendingScript pendingScript = m_scriptsToExecuteAfterParsing.takeFirst(); + if (pendingScript.cachedScript() && pendingScript.watchingForLoad()) + stopWatchingForLoad(pendingScript); + } +} + +void HTMLScriptRunner::detach() +{ + m_document = 0; +} + +static KURL documentURLForScriptExecution(Document* document) +{ + if (!document || !document->frame()) + return KURL(); + + // Use the URL of the currently active document for this frame. + return document->frame()->document()->url(); +} + +inline PassRefPtr<Event> createScriptLoadEvent() +{ + return Event::create(eventNames().loadEvent, false, false); +} + +inline PassRefPtr<Event> createScriptErrorEvent() +{ + return Event::create(eventNames().errorEvent, true, false); +} + +ScriptSourceCode HTMLScriptRunner::sourceFromPendingScript(const PendingScript& script, bool& errorOccurred) const +{ + if (script.cachedScript()) { + errorOccurred = script.cachedScript()->errorOccurred(); + ASSERT(script.cachedScript()->isLoaded()); + return ScriptSourceCode(script.cachedScript()); + } + errorOccurred = false; + return ScriptSourceCode(script.element()->textContent(), documentURLForScriptExecution(m_document), script.startingPosition()); +} + +bool HTMLScriptRunner::isPendingScriptReady(const PendingScript& script) +{ + m_hasScriptsWaitingForStylesheets = !m_document->haveStylesheetsLoaded(); + if (m_hasScriptsWaitingForStylesheets) + return false; + if (script.cachedScript() && !script.cachedScript()->isLoaded()) + return false; + return true; +} + +void HTMLScriptRunner::executeParsingBlockingScript() +{ + ASSERT(m_document); + ASSERT(!m_scriptNestingLevel); + ASSERT(m_document->haveStylesheetsLoaded()); + ASSERT(isPendingScriptReady(m_parsingBlockingScript)); + + InsertionPointRecord insertionPointRecord(m_host->inputStream()); + 
executePendingScriptAndDispatchEvent(m_parsingBlockingScript); +} + +void HTMLScriptRunner::executePendingScriptAndDispatchEvent(PendingScript& pendingScript) +{ + bool errorOccurred = false; + ScriptSourceCode sourceCode = sourceFromPendingScript(pendingScript, errorOccurred); + + // Stop watching loads before executeScript to prevent recursion if the script reloads itself. + if (pendingScript.cachedScript() && pendingScript.watchingForLoad()) + stopWatchingForLoad(pendingScript); + + // Clear the pending script before possible rentrancy from executeScript() + RefPtr<Element> element = pendingScript.releaseElementAndClear(); + if (ScriptElement* scriptElement = toScriptElement(element.get())) { + NestingLevelIncrementer nestingLevelIncrementer(m_scriptNestingLevel); + IgnoreDestructiveWriteCountIncrementer ignoreDestructiveWriteCountIncrementer(m_document); + if (errorOccurred) + element->dispatchEvent(createScriptErrorEvent()); + else { + ASSERT(isExecutingScript()); + scriptElement->executeScript(sourceCode); + element->dispatchEvent(createScriptLoadEvent()); + } + } + ASSERT(!m_scriptNestingLevel); +} + +void HTMLScriptRunner::watchForLoad(PendingScript& pendingScript) +{ + ASSERT(!pendingScript.watchingForLoad()); + m_host->watchForLoad(pendingScript.cachedScript()); + pendingScript.setWatchingForLoad(true); +} + +void HTMLScriptRunner::stopWatchingForLoad(PendingScript& pendingScript) +{ + ASSERT(pendingScript.watchingForLoad()); + m_host->stopWatchingForLoad(pendingScript.cachedScript()); + pendingScript.setWatchingForLoad(false); +} + +// This function should match 10.2.5.11 "An end tag whose tag name is 'script'" +// Script handling lives outside the tree builder to keep the each class simple. +bool HTMLScriptRunner::execute(PassRefPtr<Element> scriptElement, const TextPosition1& scriptStartPosition) +{ + ASSERT(scriptElement); + // FIXME: If scripting is disabled, always just return true; + + // Try to execute the script given to us. 
+ runScript(scriptElement.get(), scriptStartPosition); + + if (haveParsingBlockingScript()) { + if (m_scriptNestingLevel) + return false; // Block the parser. Unwind to the outermost HTMLScriptRunner::execute before continuing parsing. + if (!executeParsingBlockingScripts()) + return false; // We still have a parsing blocking script, block the parser. + } + return true; // Scripts executed as expected, continue parsing. +} + +bool HTMLScriptRunner::haveParsingBlockingScript() const +{ + return !!m_parsingBlockingScript.element(); +} + +bool HTMLScriptRunner::executeParsingBlockingScripts() +{ + while (haveParsingBlockingScript()) { + // We only really need to check once. + if (!isPendingScriptReady(m_parsingBlockingScript)) + return false; + executeParsingBlockingScript(); + } + return true; +} + +bool HTMLScriptRunner::executeScriptsWaitingForLoad(CachedResource* cachedScript) +{ + ASSERT(!m_scriptNestingLevel); + ASSERT(haveParsingBlockingScript()); + ASSERT_UNUSED(cachedScript, m_parsingBlockingScript.cachedScript() == cachedScript); + ASSERT(m_parsingBlockingScript.cachedScript()->isLoaded()); + return executeParsingBlockingScripts(); +} + +bool HTMLScriptRunner::executeScriptsWaitingForStylesheets() +{ + ASSERT(m_document); + // Callers should check hasScriptsWaitingForStylesheets() before calling + // to prevent parser or script re-entry during </style> parsing. 
+ ASSERT(hasScriptsWaitingForStylesheets()); + ASSERT(!m_scriptNestingLevel); + ASSERT(m_document->haveStylesheetsLoaded()); + return executeParsingBlockingScripts(); +} + +bool HTMLScriptRunner::executeScriptsWaitingForParsing() +{ + while (!m_scriptsToExecuteAfterParsing.isEmpty()) { + ASSERT(!m_scriptNestingLevel); + ASSERT(!haveParsingBlockingScript()); + ASSERT(m_scriptsToExecuteAfterParsing.first().cachedScript()); + if (!m_scriptsToExecuteAfterParsing.first().cachedScript()->isLoaded()) { + watchForLoad(m_scriptsToExecuteAfterParsing.first()); + return false; + } + PendingScript first = m_scriptsToExecuteAfterParsing.takeFirst(); + executePendingScriptAndDispatchEvent(first); + if (!m_document) + return false; + } + return true; +} + +void HTMLScriptRunner::requestParsingBlockingScript(Element* element) +{ + if (!requestPendingScript(m_parsingBlockingScript, element)) + return; + + ASSERT(m_parsingBlockingScript.cachedScript()); + + // We only care about a load callback if cachedScript is not already + // in the cache. Callers will attempt to run the m_parsingBlockingScript + // if possible before returning control to the parser. + if (!m_parsingBlockingScript.cachedScript()->isLoaded()) + watchForLoad(m_parsingBlockingScript); +} + +void HTMLScriptRunner::requestDeferredScript(Element* element) +{ + PendingScript pendingScript; + if (!requestPendingScript(pendingScript, element)) + return; + + ASSERT(pendingScript.cachedScript()); + m_scriptsToExecuteAfterParsing.append(pendingScript); +} + +bool HTMLScriptRunner::requestPendingScript(PendingScript& pendingScript, Element* script) const +{ + ASSERT(!pendingScript.element()); + const AtomicString& srcValue = script->getAttribute(srcAttr); + // Allow the host to disllow script loads (using the XSSAuditor, etc.) + if (!m_host->shouldLoadExternalScriptFromSrc(srcValue)) + return false; + // FIXME: We need to resolve the url relative to the element. 
+ if (!script->dispatchBeforeLoadEvent(srcValue)) + return false; + pendingScript.setElement(script); + // This should correctly return 0 for empty or invalid srcValues. + CachedScript* cachedScript = m_document->cachedResourceLoader()->requestScript(srcValue, toScriptElement(script)->scriptCharset()); + if (!cachedScript) { + notImplemented(); // Dispatch error event. + return false; + } + pendingScript.setCachedScript(cachedScript); + return true; +} + +// This method is meant to match the HTML5 definition of "running a script" +// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#running-a-script +void HTMLScriptRunner::runScript(Element* script, const TextPosition1& scriptStartPosition) +{ + ASSERT(m_document); + ASSERT(!haveParsingBlockingScript()); + { + InsertionPointRecord insertionPointRecord(m_host->inputStream()); + NestingLevelIncrementer nestingLevelIncrementer(m_scriptNestingLevel); + + ScriptElement* scriptElement = toScriptElement(script); + ASSERT(scriptElement); + if (!scriptElement->shouldExecuteAsJavaScript()) + return; + + if (script->hasAttribute(srcAttr)) { + if (script->hasAttribute(asyncAttr)) // Async takes precendence over defer. + return; // Asynchronous scripts handle themselves. + + if (script->hasAttribute(deferAttr)) + requestDeferredScript(script); + else + requestParsingBlockingScript(script); + } else if (!m_document->haveStylesheetsLoaded() && m_scriptNestingLevel == 1) { + // Block inline script execution on stylesheet load, unless we are in document.write(). + // The latter case can only happen if a script both triggers a stylesheet load + // and writes an inline script. Since write is blocking we have to execute the + // written script immediately, ignoring the pending sheets. 
+ m_parsingBlockingScript.setElement(script); + m_parsingBlockingScript.setStartingPosition(scriptStartPosition); + } else { + ASSERT(isExecutingScript()); + ScriptSourceCode sourceCode(script->textContent(), documentURLForScriptExecution(m_document), scriptStartPosition); + scriptElement->executeScript(sourceCode); + } + } +} + +} diff --git a/Source/WebCore/html/parser/HTMLScriptRunner.h b/Source/WebCore/html/parser/HTMLScriptRunner.h new file mode 100644 index 0000000..6cf74d8 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLScriptRunner.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HTMLScriptRunner_h
#define HTMLScriptRunner_h

#include "PendingScript.h"
#include <wtf/Deque.h>
#include <wtf/text/TextPosition.h>
#include <wtf/Noncopyable.h>
#include <wtf/PassRefPtr.h>

namespace WebCore {

class CachedResource;
class CachedScript;
class Document;
class Element;
class Frame;
class HTMLScriptRunnerHost;
class ScriptSourceCode;

// Decides when the <script> elements handed over by the HTML parser run:
// immediately, once their resource and pending stylesheets have loaded
// (parsing-blocking), or after parsing has finished (deferred).
// Every bool-returning execute* method reports whether the parser may
// continue (true) or must stay blocked (false).
class HTMLScriptRunner : public Noncopyable {
public:
    static PassOwnPtr<HTMLScriptRunner> create(Document* document, HTMLScriptRunnerHost* host)
    {
        return adoptPtr(new HTMLScriptRunner(document, host));
    }
    ~HTMLScriptRunner();

    // Clears the document pointer; see executeScriptsWaitingForParsing().
    void detach();

    // Processes the passed in script and any pending scripts if possible.
    bool execute(PassRefPtr<Element> scriptToProcess, const TextPosition1& scriptStartPosition);

    // Resumes execution after the parsing-blocking script's resource loaded.
    bool executeScriptsWaitingForLoad(CachedResource*);
    bool hasScriptsWaitingForStylesheets() const { return m_hasScriptsWaitingForStylesheets; }
    // Resumes execution after pending stylesheets loaded. Callers should check
    // hasScriptsWaitingForStylesheets() first.
    bool executeScriptsWaitingForStylesheets();
    // Runs the queued deferred scripts in order.
    bool executeScriptsWaitingForParsing();

    // True while at least one script is being run by this runner.
    bool isExecutingScript() const { return !!m_scriptNestingLevel; }

private:
    HTMLScriptRunner(Document*, HTMLScriptRunnerHost*);

    Frame* frame() const;

    void executeParsingBlockingScript();
    void executePendingScriptAndDispatchEvent(PendingScript&);
    bool haveParsingBlockingScript() const;
    bool executeParsingBlockingScripts();

    void requestParsingBlockingScript(Element*);
    void requestDeferredScript(Element*);
    bool requestPendingScript(PendingScript&, Element*) const;

    void runScript(Element*, const TextPosition1& scriptStartPosition);

    // Helpers for dealing with HTMLScriptRunnerHost
    void watchForLoad(PendingScript&);
    void stopWatchingForLoad(PendingScript&);
    bool isPendingScriptReady(const PendingScript&);
    ScriptSourceCode sourceFromPendingScript(const PendingScript&, bool& errorOccurred) const;

    // Raw pointers; m_document is reset to 0 by detach().
    Document* m_document;
    HTMLScriptRunnerHost* m_host;
    // The script the parser is currently blocked on, if any.
    PendingScript m_parsingBlockingScript;
    Deque<PendingScript> m_scriptsToExecuteAfterParsing; // http://www.whatwg.org/specs/web-apps/current-work/#list-of-scripts-that-will-execute-when-the-document-has-finished-parsing
    // Non-zero while scripts are being run by this runner; see isExecutingScript().
    unsigned m_scriptNestingLevel;

    // We only want stylesheet loads to trigger script execution if script
    // execution is currently stopped due to stylesheet loads, otherwise we'd
    // cause nested script execution when parsing <style> tags since </style>
    // tags can cause Document to call executeScriptsWaitingForStylesheets.
    bool m_hasScriptsWaitingForStylesheets;
};

}

#endif
#ifndef HTMLScriptRunnerHost_h
#define HTMLScriptRunnerHost_h

#include <wtf/Forward.h>

namespace WebCore {

class CachedResource;
class Element;
class HTMLInputStream;
class ScriptSourceCode;

// Callback interface through which HTMLScriptRunner talks to its owner:
// registering for resource-load notifications, vetoing external script loads,
// and reaching the parser's input stream.
class HTMLScriptRunnerHost {
public:
    virtual ~HTMLScriptRunnerHost() { }

    // Implementors should call cachedResource->addClient() here or soon after.
    virtual void watchForLoad(CachedResource*) = 0;
    // Implementors must call cachedResource->removeClient() immediately.
    virtual void stopWatchingForLoad(CachedResource*) = 0;

    // Implementors can block certain script loads (for XSSAuditor, etc.)
    // Returning false makes the runner skip the load.
    virtual bool shouldLoadExternalScriptFromSrc(const AtomicString&) = 0;
    // The input stream on which the runner records insertion points while
    // scripts run.
    virtual HTMLInputStream& inputStream() = 0;
};

}

#endif
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLToken_h +#define HTMLToken_h + +#include "NamedNodeMap.h" +#include <wtf/Noncopyable.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/Vector.h> + +namespace WebCore { + +class HTMLToken : public Noncopyable { +public: + enum Type { + Uninitialized, + DOCTYPE, + StartTag, + EndTag, + Comment, + Character, + EndOfFile, + }; + + class Range { + public: + int m_start; + int m_end; + }; + + class Attribute { + public: + Range m_nameRange; + Range m_valueRange; + WTF::Vector<UChar, 32> m_name; + WTF::Vector<UChar, 32> m_value; + }; + + typedef WTF::Vector<Attribute, 10> AttributeList; + typedef WTF::Vector<UChar, 1024> DataVector; + + HTMLToken() { clear(); } + + void clear(int startIndex = 0) + { + m_type = Uninitialized; + m_range.m_start = startIndex; + m_range.m_end = startIndex; + m_data.clear(); + } + + int startIndex() const { return m_range.m_start; } + int endIndex() const { return m_range.m_end; } + + void end(int endIndex) + { + m_range.m_end = endIndex; + } + + void makeEndOfFile() 
+ { + ASSERT(m_type == Uninitialized); + m_type = EndOfFile; + } + + void beginStartTag(UChar character) + { + ASSERT(character); + ASSERT(m_type == Uninitialized); + m_type = StartTag; + m_selfClosing = false; + m_currentAttribute = 0; + m_attributes.clear(); + + m_data.append(character); + } + + template<typename T> + void beginEndTag(T characters) + { + ASSERT(m_type == Uninitialized); + m_type = EndTag; + m_selfClosing = false; + m_currentAttribute = 0; + m_attributes.clear(); + + m_data.append(characters); + } + + // Starting a character token works slightly differently than starting + // other types of tokens because we want to save a per-character branch. + void ensureIsCharacterToken() + { + ASSERT(m_type == Uninitialized || m_type == Character); + m_type = Character; + } + + void beginComment() + { + ASSERT(m_type == Uninitialized); + m_type = Comment; + } + + void beginDOCTYPE() + { + ASSERT(m_type == Uninitialized); + m_type = DOCTYPE; + m_doctypeData = adoptPtr(new DoctypeData()); + } + + void beginDOCTYPE(UChar character) + { + ASSERT(character); + beginDOCTYPE(); + m_data.append(character); + } + + void appendToName(UChar character) + { + ASSERT(character); + ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); + m_data.append(character); + } + + template<typename T> + void appendToCharacter(T characters) + { + ASSERT(m_type == Character); + m_data.append(characters); + } + + void appendToComment(UChar character) + { + ASSERT(character); + ASSERT(m_type == Comment); + m_data.append(character); + } + + void addNewAttribute() + { + ASSERT(m_type == StartTag || m_type == EndTag); + m_attributes.grow(m_attributes.size() + 1); + m_currentAttribute = &m_attributes.last(); +#ifndef NDEBUG + m_currentAttribute->m_nameRange.m_start = 0; + m_currentAttribute->m_nameRange.m_end = 0; + m_currentAttribute->m_valueRange.m_start = 0; + m_currentAttribute->m_valueRange.m_end = 0; +#endif + } + + void beginAttributeName(int index) + { + 
m_currentAttribute->m_nameRange.m_start = index; + } + + void endAttributeName(int index) + { + m_currentAttribute->m_nameRange.m_end = index; + m_currentAttribute->m_valueRange.m_start = index; + m_currentAttribute->m_valueRange.m_end = index; + } + + void beginAttributeValue(int index) + { + m_currentAttribute->m_valueRange.m_start = index; +#ifndef NDEBUG + m_currentAttribute->m_valueRange.m_end = 0; +#endif + } + + void endAttributeValue(int index) + { + m_currentAttribute->m_valueRange.m_end = index; + } + + void appendToAttributeName(UChar character) + { + ASSERT(character); + ASSERT(m_type == StartTag || m_type == EndTag); + ASSERT(m_currentAttribute->m_nameRange.m_start); + m_currentAttribute->m_name.append(character); + } + + void appendToAttributeValue(UChar character) + { + ASSERT(character); + ASSERT(m_type == StartTag || m_type == EndTag); + ASSERT(m_currentAttribute->m_valueRange.m_start); + m_currentAttribute->m_value.append(character); + } + + Type type() const { return m_type; } + + bool selfClosing() const + { + ASSERT(m_type == StartTag || m_type == EndTag); + return m_selfClosing; + } + + void setSelfClosing() + { + ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); + m_selfClosing = true; + } + + const AttributeList& attributes() const + { + ASSERT(m_type == StartTag || m_type == EndTag); + return m_attributes; + } + + const DataVector& name() const + { + ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); + return m_data; + } + + const DataVector& characters() const + { + ASSERT(m_type == Character); + return m_data; + } + + const DataVector& comment() const + { + ASSERT(m_type == Comment); + return m_data; + } + + // FIXME: Distinguish between a missing public identifer and an empty one. + const WTF::Vector<UChar>& publicIdentifier() const + { + ASSERT(m_type == DOCTYPE); + return m_doctypeData->m_publicIdentifier; + } + + // FIXME: Distinguish between a missing system identifer and an empty one. 
+ const WTF::Vector<UChar>& systemIdentifier() const + { + ASSERT(m_type == DOCTYPE); + return m_doctypeData->m_systemIdentifier; + } + + void setPublicIdentifierToEmptyString() + { + ASSERT(m_type == DOCTYPE); + m_doctypeData->m_hasPublicIdentifier = true; + m_doctypeData->m_publicIdentifier.clear(); + } + + void setSystemIdentifierToEmptyString() + { + ASSERT(m_type == DOCTYPE); + m_doctypeData->m_hasSystemIdentifier = true; + m_doctypeData->m_systemIdentifier.clear(); + } + + bool forceQuirks() const + { + ASSERT(m_type == DOCTYPE); + return m_doctypeData->m_forceQuirks; + } + + void setForceQuirks() + { + ASSERT(m_type == DOCTYPE); + m_doctypeData->m_forceQuirks = true; + } + + void appendToPublicIdentifier(UChar character) + { + ASSERT(character); + ASSERT(m_type == DOCTYPE); + ASSERT(m_doctypeData->m_hasPublicIdentifier); + m_doctypeData->m_publicIdentifier.append(character); + } + + void appendToSystemIdentifier(UChar character) + { + ASSERT(character); + ASSERT(m_type == DOCTYPE); + ASSERT(m_doctypeData->m_hasSystemIdentifier); + m_doctypeData->m_systemIdentifier.append(character); + } + +private: + // FIXME: I'm not sure what the final relationship between HTMLToken and + // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll + // want to end up with a cleaner interface between the two classes. + friend class AtomicHTMLToken; + + class DoctypeData : public Noncopyable { + public: + DoctypeData() + : m_hasPublicIdentifier(false) + , m_hasSystemIdentifier(false) + , m_forceQuirks(false) + { + } + + bool m_hasPublicIdentifier; + bool m_hasSystemIdentifier; + bool m_forceQuirks; + WTF::Vector<UChar> m_publicIdentifier; + WTF::Vector<UChar> m_systemIdentifier; + }; + + Type m_type; + + // Which characters from the input stream are represented by this token. 
+ Range m_range; + + // "name" for DOCTYPE, StartTag, and EndTag + // "characters" for Character + // "data" for Comment + DataVector m_data; + + // For DOCTYPE + OwnPtr<DoctypeData> m_doctypeData; + + // For StartTag and EndTag + bool m_selfClosing; + AttributeList m_attributes; + + // A pointer into m_attributes used during lexing. + Attribute* m_currentAttribute; +}; + +// FIXME: This class should eventually be named HTMLToken once we move the +// exiting HTMLToken to be internal to the HTMLTokenizer. +class AtomicHTMLToken : public Noncopyable { +public: + AtomicHTMLToken(HTMLToken& token) + : m_type(token.type()) + { + switch (m_type) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::DOCTYPE: + m_name = AtomicString(token.name().data(), token.name().size()); + m_doctypeData = token.m_doctypeData.release(); + break; + case HTMLToken::EndOfFile: + break; + case HTMLToken::StartTag: + case HTMLToken::EndTag: { + m_selfClosing = token.selfClosing(); + m_name = AtomicString(token.name().data(), token.name().size()); + const HTMLToken::AttributeList& attributes = token.attributes(); + for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); + iter != attributes.end(); ++iter) { + if (!iter->m_name.isEmpty()) { + String name(iter->m_name.data(), iter->m_name.size()); + String value(iter->m_value.data(), iter->m_value.size()); + ASSERT(iter->m_nameRange.m_start); + ASSERT(iter->m_nameRange.m_end); + ASSERT(iter->m_valueRange.m_start); + ASSERT(iter->m_valueRange.m_end); + RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value); + if (!m_attributes) { + m_attributes = NamedNodeMap::create(); + // Reserving capacity here improves the parser + // benchmark. It might be worth experimenting with + // the constant to see where the optimal point is. 
+ m_attributes->reserveInitialCapacity(10); + } + m_attributes->insertAttribute(mappedAttribute.release(), false); + } + } + break; + } + case HTMLToken::Comment: + m_data = String(token.comment().data(), token.comment().size()); + break; + case HTMLToken::Character: + m_externalCharacters = &token.characters(); + break; + } + } + + AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0) + : m_type(type) + , m_name(name) + , m_attributes(attributes) + { + ASSERT(usesName()); + } + + HTMLToken::Type type() const { return m_type; } + + const AtomicString& name() const + { + ASSERT(usesName()); + return m_name; + } + + void setName(const AtomicString& name) + { + ASSERT(usesName()); + m_name = name; + } + + bool selfClosing() const + { + ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); + return m_selfClosing; + } + + Attribute* getAttributeItem(const QualifiedName& attributeName) + { + ASSERT(usesAttributes()); + if (!m_attributes) + return 0; + return m_attributes->getAttributeItem(attributeName); + } + + NamedNodeMap* attributes() const + { + ASSERT(usesAttributes()); + return m_attributes.get(); + } + + PassRefPtr<NamedNodeMap> takeAtributes() + { + ASSERT(usesAttributes()); + return m_attributes.release(); + } + + const HTMLToken::DataVector& characters() const + { + ASSERT(m_type == HTMLToken::Character); + return *m_externalCharacters; + } + + const String& comment() const + { + ASSERT(m_type == HTMLToken::Comment); + return m_data; + } + + // FIXME: Distinguish between a missing public identifer and an empty one. + WTF::Vector<UChar>& publicIdentifier() const + { + ASSERT(m_type == HTMLToken::DOCTYPE); + return m_doctypeData->m_publicIdentifier; + } + + // FIXME: Distinguish between a missing system identifer and an empty one. 
+ WTF::Vector<UChar>& systemIdentifier() const + { + ASSERT(m_type == HTMLToken::DOCTYPE); + return m_doctypeData->m_systemIdentifier; + } + + bool forceQuirks() const + { + ASSERT(m_type == HTMLToken::DOCTYPE); + return m_doctypeData->m_forceQuirks; + } + +private: + HTMLToken::Type m_type; + + bool usesName() const + { + return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; + } + + bool usesAttributes() const + { + return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; + } + + // "name" for DOCTYPE, StartTag, and EndTag + AtomicString m_name; + + // "data" for Comment + String m_data; + + // "characters" for Character + // + // We don't want to copy the the characters out of the HTMLToken, so we + // keep a pointer to its buffer instead. This buffer is owned by the + // HTMLToken and causes a lifetime dependence between these objects. + // + // FIXME: Add a mechanism for "internalizing" the characters when the + // HTMLToken is destructed. + const HTMLToken::DataVector* m_externalCharacters; + + // For DOCTYPE + OwnPtr<HTMLToken::DoctypeData> m_doctypeData; + + // For StartTag and EndTag + bool m_selfClosing; + + RefPtr<NamedNodeMap> m_attributes; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLTokenizer.cpp b/Source/WebCore/html/parser/HTMLTokenizer.cpp new file mode 100644 index 0000000..305fca2 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLTokenizer.cpp @@ -0,0 +1,1698 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLTokenizer.h" + +#include "HTMLEntityParser.h" +#include "HTMLToken.h" +#include "HTMLTreeBuilder.h" +#include "HTMLNames.h" +#include "NotImplemented.h" +#include <wtf/ASCIICType.h> +#include <wtf/CurrentTime.h> +#include <wtf/UnusedParam.h> +#include <wtf/text/AtomicString.h> +#include <wtf/text/CString.h> +#include <wtf/unicode/Unicode.h> + +using namespace WTF; + +namespace WebCore { + +using namespace HTMLNames; + +const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0; + +namespace { + +inline UChar toLowerCase(UChar cc) +{ + ASSERT(isASCIIUpper(cc)); + const int lowerCaseOffset = 0x20; + return cc + lowerCaseOffset; +} + +inline bool isTokenizerWhitespace(UChar cc) +{ + return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C'; +} + +inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters) +{ + while (*expectedCharacters) + source.advanceAndASSERTIgnoringCase(*expectedCharacters++); +} + 
+inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters) +{ + while (*expectedCharacters) + source.advanceAndASSERT(*expectedCharacters++); +} + +inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string) +{ + if (vector.size() != string.length()) + return false; + const UChar* stringData = string.characters(); + const UChar* vectorData = vector.data(); + // FIXME: Is there a higher-level function we should be calling here? + return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar)); +} + +inline bool isEndTagBufferingState(HTMLTokenizer::State state) +{ + switch (state) { + case HTMLTokenizer::RCDATAEndTagOpenState: + case HTMLTokenizer::RCDATAEndTagNameState: + case HTMLTokenizer::RAWTEXTEndTagOpenState: + case HTMLTokenizer::RAWTEXTEndTagNameState: + case HTMLTokenizer::ScriptDataEndTagOpenState: + case HTMLTokenizer::ScriptDataEndTagNameState: + case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: + case HTMLTokenizer::ScriptDataEscapedEndTagNameState: + return true; + default: + return false; + } +} + +} + +HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks) + : m_inputStreamPreprocessor(this) + , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks) +{ + reset(); +} + +HTMLTokenizer::~HTMLTokenizer() +{ +} + +void HTMLTokenizer::reset() +{ + m_state = DataState; + m_token = 0; + m_lineNumber = 0; + m_skipLeadingNewLineForListing = false; + m_forceNullCharacterReplacement = false; + m_shouldAllowCDATA = false; + m_additionalAllowedCharacter = '\0'; +} + +inline bool HTMLTokenizer::processEntity(SegmentedString& source) +{ + bool notEnoughCharacters = false; + Vector<UChar, 16> decodedEntity; + bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); + if (notEnoughCharacters) + return false; + if (!success) { + ASSERT(decodedEntity.isEmpty()); + bufferCharacter('&'); + } else { + Vector<UChar>::const_iterator iter = decodedEntity.begin(); + for (; iter != 
decodedEntity.end(); ++iter) + bufferCharacter(*iter); + } + return true; +} + +#if COMPILER(MSVC) +// We need to disable the "unreachable code" warning because we want to assert +// that some code points aren't reached in the state machine. +#pragma warning(disable: 4702) +#endif + +#define BEGIN_STATE(stateName) case stateName: stateName: +#define END_STATE() ASSERT_NOT_REACHED(); break; + +// We use this macro when the HTML5 spec says "reconsume the current input +// character in the <mumble> state." +#define RECONSUME_IN(stateName) \ + do { \ + m_state = stateName; \ + goto stateName; \ + } while (false) + +// We use this macro when the HTML5 spec says "consume the next input +// character ... and switch to the <mumble> state." +#define ADVANCE_TO(stateName) \ + do { \ + m_state = stateName; \ + if (!m_inputStreamPreprocessor.advance(source, m_lineNumber)) \ + return haveBufferedCharacterToken(); \ + cc = m_inputStreamPreprocessor.nextInputCharacter(); \ + goto stateName; \ + } while (false) + +// Sometimes there's more complicated logic in the spec that separates when +// we consume the next input character and when we switch to a particular +// state. We handle those cases by advancing the source directly and using +// this macro to switch to the indicated state. +#define SWITCH_TO(stateName) \ + do { \ + m_state = stateName; \ + if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \ + return haveBufferedCharacterToken(); \ + cc = m_inputStreamPreprocessor.nextInputCharacter(); \ + goto stateName; \ + } while (false) + + +inline void HTMLTokenizer::saveEndTagNameIfNeeded() +{ + ASSERT(m_token->type() != HTMLToken::Uninitialized); + if (m_token->type() == HTMLToken::StartTag) + m_appropriateEndTagName = m_token->name(); +} + +// We use this function when the HTML5 spec says "Emit the current <mumble> +// token. Switch to the <mumble> state." 
We use the word "resume" instead of +// switch to indicate that this macro actually returns and that we'll end up +// in the state when we "resume" (i.e., are called again). +bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state) +{ + m_state = state; + source.advance(m_lineNumber); + saveEndTagNameIfNeeded(); + return true; +} + +// Identical to emitAndResumeIn, except does not advance. +bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state) +{ + m_state = state; + saveEndTagNameIfNeeded(); + return true; +} + +// Used to emit the EndOfFile token. +// Check if we have buffered characters to emit first before emitting the EOF. +bool HTMLTokenizer::emitEndOfFile(SegmentedString& source) +{ + if (haveBufferedCharacterToken()) + return true; + m_state = DataState; + source.advance(m_lineNumber); + m_token->clear(); + m_token->makeEndOfFile(); + return true; +} + +bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) +{ + ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); + source.advance(m_lineNumber); + if (m_token->type() == HTMLToken::Character) + return true; + m_token->beginEndTag(m_bufferedEndTagName); + m_bufferedEndTagName.clear(); + return false; +} + +#define FLUSH_AND_ADVANCE_TO(stateName) \ + do { \ + m_state = stateName; \ + if (flushBufferedEndTag(source)) \ + return true; \ + if (source.isEmpty() \ + || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \ + return haveBufferedCharacterToken(); \ + cc = m_inputStreamPreprocessor.nextInputCharacter(); \ + goto stateName; \ + } while (false) + +bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state) +{ + m_state = state; + flushBufferedEndTag(source); + return true; +} + +bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) +{ + // If we have a token in progress, then we're supposed to be called back + // with the same token so we can finish it. 
+ ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); + m_token = &token; + + if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { + // FIXME: This should call flushBufferedEndTag(). + // We started an end tag during our last iteration. + m_token->beginEndTag(m_bufferedEndTagName); + m_bufferedEndTagName.clear(); + if (m_state == DataState) { + // We're back in the data state, so we must be done with the tag. + return true; + } + } + + if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) + return haveBufferedCharacterToken(); + UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); + + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody + // Note that this logic is different than the generic \r\n collapsing + // handled in the input stream preprocessor. This logic is here as an + // "authoring convenience" so folks can write: + // + // <pre> + // lorem ipsum + // lorem ipsum + // </pre> + // + // without getting an extra newline at the start of their <pre> element. + if (m_skipLeadingNewLineForListing) { + m_skipLeadingNewLineForListing = false; + if (cc == '\n') { + if (m_state == DataState) + ADVANCE_TO(DataState); + if (m_state == RCDATAState) + ADVANCE_TO(RCDATAState); + // When parsing text/plain documents, we run the tokenizer in the + // PLAINTEXTState and ignore m_skipLeadingNewLineForListing. + ASSERT(m_state == PLAINTEXTState); + } + } + + // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 + switch (m_state) { + BEGIN_STATE(DataState) { + if (cc == '&') + ADVANCE_TO(CharacterReferenceInDataState); + else if (cc == '<') { + if (m_token->type() == HTMLToken::Character) { + // We have a bunch of character tokens queued up that we + // are emitting lazily here. 
+ return true; + } + ADVANCE_TO(TagOpenState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitEndOfFile(source); + else { + bufferCharacter(cc); + ADVANCE_TO(DataState); + } + } + END_STATE() + + BEGIN_STATE(CharacterReferenceInDataState) { + if (!processEntity(source)) + return haveBufferedCharacterToken(); + SWITCH_TO(DataState); + } + END_STATE() + + BEGIN_STATE(RCDATAState) { + if (cc == '&') + ADVANCE_TO(CharacterReferenceInRCDATAState); + else if (cc == '<') + ADVANCE_TO(RCDATALessThanSignState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitEndOfFile(source); + else { + bufferCharacter(cc); + ADVANCE_TO(RCDATAState); + } + } + END_STATE() + + BEGIN_STATE(CharacterReferenceInRCDATAState) { + if (!processEntity(source)) + return haveBufferedCharacterToken(); + SWITCH_TO(RCDATAState); + } + END_STATE() + + BEGIN_STATE(RAWTEXTState) { + if (cc == '<') + ADVANCE_TO(RAWTEXTLessThanSignState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitEndOfFile(source); + else { + bufferCharacter(cc); + ADVANCE_TO(RAWTEXTState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataState) { + if (cc == '<') + ADVANCE_TO(ScriptDataLessThanSignState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitEndOfFile(source); + else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataState); + } + } + END_STATE() + + BEGIN_STATE(PLAINTEXTState) { + if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitEndOfFile(source); + else + bufferCharacter(cc); + ADVANCE_TO(PLAINTEXTState); + } + END_STATE() + + BEGIN_STATE(TagOpenState) { + if (cc == '!') + ADVANCE_TO(MarkupDeclarationOpenState); + else if (cc == '/') + ADVANCE_TO(EndTagOpenState); + else if (isASCIIUpper(cc)) { + m_token->beginStartTag(toLowerCase(cc)); + ADVANCE_TO(TagNameState); + } else if (isASCIILower(cc)) { + m_token->beginStartTag(cc); + ADVANCE_TO(TagNameState); + } else if (cc == '?') { + parseError(); + // The spec 
consumes the current character before switching + // to the bogus comment state, but it's easier to implement + // if we reconsume the current character. + RECONSUME_IN(BogusCommentState); + } else { + parseError(); + bufferCharacter('<'); + RECONSUME_IN(DataState); + } + } + END_STATE() + + BEGIN_STATE(EndTagOpenState) { + if (isASCIIUpper(cc)) { + m_token->beginEndTag(toLowerCase(cc)); + ADVANCE_TO(TagNameState); + } else if (isASCIILower(cc)) { + m_token->beginEndTag(cc); + ADVANCE_TO(TagNameState); + } else if (cc == '>') { + parseError(); + ADVANCE_TO(DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + bufferCharacter('<'); + bufferCharacter('/'); + RECONSUME_IN(DataState); + } else { + parseError(); + RECONSUME_IN(BogusCommentState); + } + } + END_STATE() + + BEGIN_STATE(TagNameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeAttributeNameState); + else if (cc == '/') + ADVANCE_TO(SelfClosingStartTagState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (m_usePreHTML5ParserQuirks && cc == '<') + return emitAndReconsumeIn(source, DataState); + else if (isASCIIUpper(cc)) { + m_token->appendToName(toLowerCase(cc)); + ADVANCE_TO(TagNameState); + } if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + m_token->appendToName(cc); + ADVANCE_TO(TagNameState); + } + } + END_STATE() + + BEGIN_STATE(RCDATALessThanSignState) { + if (cc == '/') { + m_temporaryBuffer.clear(); + ASSERT(m_bufferedEndTagName.isEmpty()); + ADVANCE_TO(RCDATAEndTagOpenState); + } else { + bufferCharacter('<'); + RECONSUME_IN(RCDATAState); + } + } + END_STATE() + + BEGIN_STATE(RCDATAEndTagOpenState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(RCDATAEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(RCDATAEndTagNameState); 
+ } else { + bufferCharacter('<'); + bufferCharacter('/'); + RECONSUME_IN(RCDATAState); + } + } + END_STATE() + + BEGIN_STATE(RCDATAEndTagNameState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(RCDATAEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(RCDATAEndTagNameState); + } else { + if (isTokenizerWhitespace(cc)) { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); + } else if (cc == '/') { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); + } else if (cc == '>') { + if (isAppropriateEndTag()) + return flushEmitAndResumeIn(source, DataState); + } + bufferCharacter('<'); + bufferCharacter('/'); + m_token->appendToCharacter(m_temporaryBuffer); + m_bufferedEndTagName.clear(); + RECONSUME_IN(RCDATAState); + } + } + END_STATE() + + BEGIN_STATE(RAWTEXTLessThanSignState) { + if (cc == '/') { + m_temporaryBuffer.clear(); + ASSERT(m_bufferedEndTagName.isEmpty()); + ADVANCE_TO(RAWTEXTEndTagOpenState); + } else { + bufferCharacter('<'); + RECONSUME_IN(RAWTEXTState); + } + } + END_STATE() + + BEGIN_STATE(RAWTEXTEndTagOpenState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(RAWTEXTEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(RAWTEXTEndTagNameState); + } else { + bufferCharacter('<'); + bufferCharacter('/'); + RECONSUME_IN(RAWTEXTState); + } + } + END_STATE() + + BEGIN_STATE(RAWTEXTEndTagNameState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(RAWTEXTEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(RAWTEXTEndTagNameState); + } else { + if (isTokenizerWhitespace(cc)) { + if (isAppropriateEndTag()) + 
FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); + } else if (cc == '/') { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); + } else if (cc == '>') { + if (isAppropriateEndTag()) + return flushEmitAndResumeIn(source, DataState); + } + bufferCharacter('<'); + bufferCharacter('/'); + m_token->appendToCharacter(m_temporaryBuffer); + m_bufferedEndTagName.clear(); + RECONSUME_IN(RAWTEXTState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataLessThanSignState) { + if (cc == '/') { + m_temporaryBuffer.clear(); + ASSERT(m_bufferedEndTagName.isEmpty()); + ADVANCE_TO(ScriptDataEndTagOpenState); + } else if (cc == '!') { + bufferCharacter('<'); + bufferCharacter('!'); + ADVANCE_TO(ScriptDataEscapeStartState); + } else { + bufferCharacter('<'); + RECONSUME_IN(ScriptDataState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEndTagOpenState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(ScriptDataEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(ScriptDataEndTagNameState); + } else { + bufferCharacter('<'); + bufferCharacter('/'); + RECONSUME_IN(ScriptDataState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEndTagNameState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(ScriptDataEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(ScriptDataEndTagNameState); + } else { + if (isTokenizerWhitespace(cc)) { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); + } else if (cc == '/') { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); + } else if (cc == '>') { + if (isAppropriateEndTag()) + return flushEmitAndResumeIn(source, DataState); + } + bufferCharacter('<'); + bufferCharacter('/'); + 
m_token->appendToCharacter(m_temporaryBuffer); + m_bufferedEndTagName.clear(); + RECONSUME_IN(ScriptDataState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapeStartState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapeStartDashState); + } else + RECONSUME_IN(ScriptDataState); + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapeStartDashState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedDashDashState); + } else + RECONSUME_IN(ScriptDataState); + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedDashState); + } else if (cc == '<') + ADVANCE_TO(ScriptDataEscapedLessThanSignState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedDashState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedDashDashState); + } else if (cc == '<') + ADVANCE_TO(ScriptDataEscapedLessThanSignState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedDashDashState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedDashDashState); + } else if (cc == '<') + ADVANCE_TO(ScriptDataEscapedLessThanSignState); + else if (cc == '>') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataState); + } if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedLessThanSignState) { + if (cc == '/') { + m_temporaryBuffer.clear(); + ASSERT(m_bufferedEndTagName.isEmpty()); + 
ADVANCE_TO(ScriptDataEscapedEndTagOpenState); + } else if (isASCIIUpper(cc)) { + bufferCharacter('<'); + bufferCharacter(cc); + m_temporaryBuffer.clear(); + m_temporaryBuffer.append(toLowerCase(cc)); + ADVANCE_TO(ScriptDataDoubleEscapeStartState); + } else if (isASCIILower(cc)) { + bufferCharacter('<'); + bufferCharacter(cc); + m_temporaryBuffer.clear(); + m_temporaryBuffer.append(cc); + ADVANCE_TO(ScriptDataDoubleEscapeStartState); + } else { + bufferCharacter('<'); + RECONSUME_IN(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(ScriptDataEscapedEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(ScriptDataEscapedEndTagNameState); + } else { + bufferCharacter('<'); + bufferCharacter('/'); + RECONSUME_IN(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataEscapedEndTagNameState) { + if (isASCIIUpper(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(toLowerCase(cc)); + ADVANCE_TO(ScriptDataEscapedEndTagNameState); + } else if (isASCIILower(cc)) { + m_temporaryBuffer.append(cc); + addToPossibleEndTag(cc); + ADVANCE_TO(ScriptDataEscapedEndTagNameState); + } else { + if (isTokenizerWhitespace(cc)) { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); + } else if (cc == '/') { + if (isAppropriateEndTag()) + FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); + } else if (cc == '>') { + if (isAppropriateEndTag()) + return flushEmitAndResumeIn(source, DataState); + } + bufferCharacter('<'); + bufferCharacter('/'); + m_token->appendToCharacter(m_temporaryBuffer); + m_bufferedEndTagName.clear(); + RECONSUME_IN(ScriptDataEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapeStartState) { + if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { + bufferCharacter(cc); + 
if (temporaryBufferIs(scriptTag.localName())) + ADVANCE_TO(ScriptDataDoubleEscapedState); + else + ADVANCE_TO(ScriptDataEscapedState); + } else if (isASCIIUpper(cc)) { + bufferCharacter(cc); + m_temporaryBuffer.append(toLowerCase(cc)); + ADVANCE_TO(ScriptDataDoubleEscapeStartState); + } else if (isASCIILower(cc)) { + bufferCharacter(cc); + m_temporaryBuffer.append(cc); + ADVANCE_TO(ScriptDataDoubleEscapeStartState); + } else + RECONSUME_IN(ScriptDataEscapedState); + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapedState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedDashState); + } else if (cc == '<') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapedDashState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); + } else if (cc == '<') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { + if (cc == '-') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); + } else if (cc == '<') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); + } else if (cc == '>') { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + bufferCharacter(cc); + ADVANCE_TO(ScriptDataDoubleEscapedState); + } + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { + if 
(cc == '/') { + bufferCharacter(cc); + m_temporaryBuffer.clear(); + ADVANCE_TO(ScriptDataDoubleEscapeEndState); + } else + RECONSUME_IN(ScriptDataDoubleEscapedState); + } + END_STATE() + + BEGIN_STATE(ScriptDataDoubleEscapeEndState) { + if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { + bufferCharacter(cc); + if (temporaryBufferIs(scriptTag.localName())) + ADVANCE_TO(ScriptDataEscapedState); + else + ADVANCE_TO(ScriptDataDoubleEscapedState); + } else if (isASCIIUpper(cc)) { + bufferCharacter(cc); + m_temporaryBuffer.append(toLowerCase(cc)); + ADVANCE_TO(ScriptDataDoubleEscapeEndState); + } else if (isASCIILower(cc)) { + bufferCharacter(cc); + m_temporaryBuffer.append(cc); + ADVANCE_TO(ScriptDataDoubleEscapeEndState); + } else + RECONSUME_IN(ScriptDataDoubleEscapedState); + } + END_STATE() + + BEGIN_STATE(BeforeAttributeNameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeAttributeNameState); + else if (cc == '/') + ADVANCE_TO(SelfClosingStartTagState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (m_usePreHTML5ParserQuirks && cc == '<') + return emitAndReconsumeIn(source, DataState); + else if (isASCIIUpper(cc)) { + m_token->addNewAttribute(); + m_token->beginAttributeName(source.numberOfCharactersConsumed()); + m_token->appendToAttributeName(toLowerCase(cc)); + ADVANCE_TO(AttributeNameState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') + parseError(); + m_token->addNewAttribute(); + m_token->beginAttributeName(source.numberOfCharactersConsumed()); + m_token->appendToAttributeName(cc); + ADVANCE_TO(AttributeNameState); + } + } + END_STATE() + + BEGIN_STATE(AttributeNameState) { + if (isTokenizerWhitespace(cc)) { + m_token->endAttributeName(source.numberOfCharactersConsumed()); + ADVANCE_TO(AfterAttributeNameState); + } else if (cc == '/') { + 
m_token->endAttributeName(source.numberOfCharactersConsumed()); + ADVANCE_TO(SelfClosingStartTagState); + } else if (cc == '=') { + m_token->endAttributeName(source.numberOfCharactersConsumed()); + ADVANCE_TO(BeforeAttributeValueState); + } else if (cc == '>') { + m_token->endAttributeName(source.numberOfCharactersConsumed()); + return emitAndResumeIn(source, DataState); + } else if (m_usePreHTML5ParserQuirks && cc == '<') { + m_token->endAttributeName(source.numberOfCharactersConsumed()); + return emitAndReconsumeIn(source, DataState); + } else if (isASCIIUpper(cc)) { + m_token->appendToAttributeName(toLowerCase(cc)); + ADVANCE_TO(AttributeNameState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->endAttributeName(source.numberOfCharactersConsumed()); + RECONSUME_IN(DataState); + } else { + if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') + parseError(); + m_token->appendToAttributeName(cc); + ADVANCE_TO(AttributeNameState); + } + } + END_STATE() + + BEGIN_STATE(AfterAttributeNameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(AfterAttributeNameState); + else if (cc == '/') + ADVANCE_TO(SelfClosingStartTagState); + else if (cc == '=') + ADVANCE_TO(BeforeAttributeValueState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (m_usePreHTML5ParserQuirks && cc == '<') + return emitAndReconsumeIn(source, DataState); + else if (isASCIIUpper(cc)) { + m_token->addNewAttribute(); + m_token->beginAttributeName(source.numberOfCharactersConsumed()); + m_token->appendToAttributeName(toLowerCase(cc)); + ADVANCE_TO(AttributeNameState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + if (cc == '"' || cc == '\'' || cc == '<') + parseError(); + m_token->addNewAttribute(); + m_token->beginAttributeName(source.numberOfCharactersConsumed()); + m_token->appendToAttributeName(cc); + ADVANCE_TO(AttributeNameState); + } + } + END_STATE() 
+ + BEGIN_STATE(BeforeAttributeValueState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeAttributeValueState); + else if (cc == '"') { + m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); + ADVANCE_TO(AttributeValueDoubleQuotedState); + } else if (cc == '&') { + m_token->beginAttributeValue(source.numberOfCharactersConsumed()); + RECONSUME_IN(AttributeValueUnquotedState); + } else if (cc == '\'') { + m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); + ADVANCE_TO(AttributeValueSingleQuotedState); + } else if (cc == '>') { + parseError(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + if (cc == '<' || cc == '=' || cc == '`') + parseError(); + m_token->beginAttributeValue(source.numberOfCharactersConsumed()); + m_token->appendToAttributeValue(cc); + ADVANCE_TO(AttributeValueUnquotedState); + } + } + END_STATE() + + BEGIN_STATE(AttributeValueDoubleQuotedState) { + if (cc == '"') { + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + ADVANCE_TO(AfterAttributeValueQuotedState); + } else if (cc == '&') { + m_additionalAllowedCharacter = '"'; + ADVANCE_TO(CharacterReferenceInAttributeValueState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + RECONSUME_IN(DataState); + } else { + m_token->appendToAttributeValue(cc); + ADVANCE_TO(AttributeValueDoubleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(AttributeValueSingleQuotedState) { + if (cc == '\'') { + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + ADVANCE_TO(AfterAttributeValueQuotedState); + } else if (cc == '&') { + m_additionalAllowedCharacter = '\''; + ADVANCE_TO(CharacterReferenceInAttributeValueState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + 
m_token->endAttributeValue(source.numberOfCharactersConsumed()); + RECONSUME_IN(DataState); + } else { + m_token->appendToAttributeValue(cc); + ADVANCE_TO(AttributeValueSingleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(AttributeValueUnquotedState) { + if (isTokenizerWhitespace(cc)) { + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + ADVANCE_TO(BeforeAttributeNameState); + } else if (cc == '&') { + m_additionalAllowedCharacter = '>'; + ADVANCE_TO(CharacterReferenceInAttributeValueState); + } else if (cc == '>') { + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->endAttributeValue(source.numberOfCharactersConsumed()); + RECONSUME_IN(DataState); + } else { + if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') + parseError(); + m_token->appendToAttributeValue(cc); + ADVANCE_TO(AttributeValueUnquotedState); + } + } + END_STATE() + + BEGIN_STATE(CharacterReferenceInAttributeValueState) { + bool notEnoughCharacters = false; + Vector<UChar, 16> decodedEntity; + bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); + if (notEnoughCharacters) + return haveBufferedCharacterToken(); + if (!success) { + ASSERT(decodedEntity.isEmpty()); + m_token->appendToAttributeValue('&'); + } else { + Vector<UChar>::const_iterator iter = decodedEntity.begin(); + for (; iter != decodedEntity.end(); ++iter) + m_token->appendToAttributeValue(*iter); + } + // We're supposed to switch back to the attribute value state that + // we were in when we were switched into this state. Rather than + // keeping track of this explictly, we observe that the previous + // state can be determined by m_additionalAllowedCharacter. 
+ if (m_additionalAllowedCharacter == '"') + SWITCH_TO(AttributeValueDoubleQuotedState); + else if (m_additionalAllowedCharacter == '\'') + SWITCH_TO(AttributeValueSingleQuotedState); + else if (m_additionalAllowedCharacter == '>') + SWITCH_TO(AttributeValueUnquotedState); + else + ASSERT_NOT_REACHED(); + } + END_STATE() + + BEGIN_STATE(AfterAttributeValueQuotedState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeAttributeNameState); + else if (cc == '/') + ADVANCE_TO(SelfClosingStartTagState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (m_usePreHTML5ParserQuirks && cc == '<') + return emitAndReconsumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + parseError(); + RECONSUME_IN(BeforeAttributeNameState); + } + } + END_STATE() + + BEGIN_STATE(SelfClosingStartTagState) { + if (cc == '>') { + m_token->setSelfClosing(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + RECONSUME_IN(DataState); + } else { + parseError(); + RECONSUME_IN(BeforeAttributeNameState); + } + } + END_STATE() + + BEGIN_STATE(BogusCommentState) { + m_token->beginComment(); + RECONSUME_IN(ContinueBogusCommentState); + } + END_STATE() + + BEGIN_STATE(ContinueBogusCommentState) { + if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + return emitAndReconsumeIn(source, DataState); + else { + m_token->appendToComment(cc); + ADVANCE_TO(ContinueBogusCommentState); + } + } + END_STATE() + + BEGIN_STATE(MarkupDeclarationOpenState) { + DEFINE_STATIC_LOCAL(String, dashDashString, ("--")); + DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype")); + DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA[")); + if (cc == '-') { + SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); + if (result == 
SegmentedString::DidMatch) { + source.advanceAndASSERT('-'); + source.advanceAndASSERT('-'); + m_token->beginComment(); + SWITCH_TO(CommentStartState); + } else if (result == SegmentedString::NotEnoughCharacters) + return haveBufferedCharacterToken(); + } else if (cc == 'D' || cc == 'd') { + SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); + if (result == SegmentedString::DidMatch) { + advanceStringAndASSERTIgnoringCase(source, "doctype"); + SWITCH_TO(DOCTYPEState); + } else if (result == SegmentedString::NotEnoughCharacters) + return haveBufferedCharacterToken(); + } else if (cc == '[' && shouldAllowCDATA()) { + SegmentedString::LookAheadResult result = source.lookAhead(cdataString); + if (result == SegmentedString::DidMatch) { + advanceStringAndASSERT(source, "[CDATA["); + SWITCH_TO(CDATASectionState); + } else if (result == SegmentedString::NotEnoughCharacters) + return haveBufferedCharacterToken(); + } + parseError(); + RECONSUME_IN(BogusCommentState); + } + END_STATE() + + BEGIN_STATE(CommentStartState) { + if (cc == '-') + ADVANCE_TO(CommentStartDashState); + else if (cc == '>') { + parseError(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(CommentStartDashState) { + if (cc == '-') + ADVANCE_TO(CommentEndState); + else if (cc == '>') { + parseError(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToComment('-'); + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(CommentState) { + if (cc == '-') + ADVANCE_TO(CommentEndDashState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + 
parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(CommentEndDashState) { + if (cc == '-') + ADVANCE_TO(CommentEndState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToComment('-'); + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(CommentEndState) { + if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == '!') { + parseError(); + ADVANCE_TO(CommentEndBangState); + } else if (cc == '-') { + parseError(); + m_token->appendToComment('-'); + ADVANCE_TO(CommentEndState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->appendToComment('-'); + m_token->appendToComment('-'); + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(CommentEndBangState) { + if (cc == '-') { + m_token->appendToComment('-'); + m_token->appendToComment('-'); + m_token->appendToComment('!'); + ADVANCE_TO(CommentEndDashState); + } else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToComment('-'); + m_token->appendToComment('-'); + m_token->appendToComment('!'); + m_token->appendToComment(cc); + ADVANCE_TO(CommentState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPEState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPENameState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->beginDOCTYPE(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + 
RECONSUME_IN(BeforeDOCTYPENameState); + } + } + END_STATE() + + BEGIN_STATE(BeforeDOCTYPENameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPENameState); + else if (isASCIIUpper(cc)) { + m_token->beginDOCTYPE(toLowerCase(cc)); + ADVANCE_TO(DOCTYPENameState); + } else if (cc == '>') { + parseError(); + m_token->beginDOCTYPE(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->beginDOCTYPE(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->beginDOCTYPE(cc); + ADVANCE_TO(DOCTYPENameState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPENameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(AfterDOCTYPENameState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (isASCIIUpper(cc)) { + m_token->appendToName(toLowerCase(cc)); + ADVANCE_TO(DOCTYPENameState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToName(cc); + ADVANCE_TO(DOCTYPENameState); + } + } + END_STATE() + + BEGIN_STATE(AfterDOCTYPENameState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(AfterDOCTYPENameState); + if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + DEFINE_STATIC_LOCAL(String, publicString, ("public")); + DEFINE_STATIC_LOCAL(String, systemString, ("system")); + if (cc == 'P' || cc == 'p') { + SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); + if (result == SegmentedString::DidMatch) { + advanceStringAndASSERTIgnoringCase(source, "public"); + SWITCH_TO(AfterDOCTYPEPublicKeywordState); + } else if (result == 
SegmentedString::NotEnoughCharacters) + return haveBufferedCharacterToken(); + } else if (cc == 'S' || cc == 's') { + SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); + if (result == SegmentedString::DidMatch) { + advanceStringAndASSERTIgnoringCase(source, "system"); + SWITCH_TO(AfterDOCTYPESystemKeywordState); + } else if (result == SegmentedString::NotEnoughCharacters) + return haveBufferedCharacterToken(); + } + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); + else if (cc == '"') { + parseError(); + m_token->setPublicIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); + } else if (cc == '\'') { + parseError(); + m_token->setPublicIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); + } else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); + else if (cc == '"') { + m_token->setPublicIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); + } else if (cc == '\'') { + m_token->setPublicIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); + } else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return 
emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { + if (cc == '"') + ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); + else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToPublicIdentifier(cc); + ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { + if (cc == '\'') + ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); + else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToPublicIdentifier(cc); + ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == '"') { + parseError(); + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); + } else if (cc == '\'') { + parseError(); + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + 
BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == '"') { + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); + } else if (cc == '\'') { + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(AfterDOCTYPESystemKeywordState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); + else if (cc == '"') { + parseError(); + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); + } else if (cc == '\'') { + parseError(); + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); + } else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); + if (cc == '"') { + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); + } else if (cc == '\'') { + m_token->setSystemIdentifierToEmptyString(); + ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); + } else if (cc == '>') { + parseError(); + 
m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + m_token->setForceQuirks(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { + if (cc == '"') + ADVANCE_TO(AfterDOCTYPESystemIdentifierState); + else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToSystemIdentifier(cc); + ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { + if (cc == '\'') + ADVANCE_TO(AfterDOCTYPESystemIdentifierState); + else if (cc == '>') { + parseError(); + m_token->setForceQuirks(); + return emitAndResumeIn(source, DataState); + } else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + m_token->appendToSystemIdentifier(cc); + ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); + } + } + END_STATE() + + BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { + if (isTokenizerWhitespace(cc)) + ADVANCE_TO(AfterDOCTYPESystemIdentifierState); + else if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) { + parseError(); + m_token->setForceQuirks(); + return emitAndReconsumeIn(source, DataState); + } else { + parseError(); + ADVANCE_TO(BogusDOCTYPEState); + } + } + END_STATE() + + BEGIN_STATE(BogusDOCTYPEState) { + if (cc == '>') + return emitAndResumeIn(source, DataState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + return 
emitAndReconsumeIn(source, DataState); + ADVANCE_TO(BogusDOCTYPEState); + } + END_STATE() + + BEGIN_STATE(CDATASectionState) { + if (cc == ']') + ADVANCE_TO(CDATASectionRightSquareBracketState); + else if (cc == InputStreamPreprocessor::endOfFileMarker) + RECONSUME_IN(DataState); + else { + bufferCharacter(cc); + ADVANCE_TO(CDATASectionState); + } + } + END_STATE() + + BEGIN_STATE(CDATASectionRightSquareBracketState) { + if (cc == ']') + ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); + else { + bufferCharacter(']'); + RECONSUME_IN(CDATASectionState); + } + } + + BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { + if (cc == '>') + ADVANCE_TO(DataState); + else { + bufferCharacter(']'); + bufferCharacter(']'); + RECONSUME_IN(CDATASectionState); + } + } + END_STATE() + + } + + ASSERT_NOT_REACHED(); + return false; +} + +void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame) +{ + if (tagName == textareaTag || tagName == titleTag) + setState(RCDATAState); + else if (tagName == plaintextTag) + setState(PLAINTEXTState); + else if (tagName == scriptTag) + setState(ScriptDataState); + else if (tagName == styleTag + || tagName == iframeTag + || tagName == xmpTag + || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame)) + || tagName == noframesTag + || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame))) + setState(RAWTEXTState); +} + +inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) +{ + return vectorEqualsString(m_temporaryBuffer, expectedString); +} + +inline void HTMLTokenizer::addToPossibleEndTag(UChar cc) +{ + ASSERT(isEndTagBufferingState(m_state)); + m_bufferedEndTagName.append(cc); +} + +inline bool HTMLTokenizer::isAppropriateEndTag() +{ + return m_bufferedEndTagName == m_appropriateEndTagName; +} + +inline void HTMLTokenizer::bufferCharacter(UChar character) +{ + ASSERT(character != InputStreamPreprocessor::endOfFileMarker); + m_token->ensureIsCharacterToken(); + 
m_token->appendToCharacter(character); +} + +inline void HTMLTokenizer::parseError() +{ + notImplemented(); +} + +inline bool HTMLTokenizer::haveBufferedCharacterToken() +{ + return m_token->type() == HTMLToken::Character; +} + +} diff --git a/Source/WebCore/html/parser/HTMLTokenizer.h b/Source/WebCore/html/parser/HTMLTokenizer.h new file mode 100644 index 0000000..f16b049 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLTokenizer.h @@ -0,0 +1,316 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HTMLTokenizer_h +#define HTMLTokenizer_h + +#include "SegmentedString.h" +#include <wtf/Noncopyable.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/Vector.h> +#include <wtf/text/AtomicString.h> + +namespace WebCore { + +class Element; +class Frame; +class HTMLToken; + +class HTMLTokenizer : public Noncopyable { +public: + enum State { + DataState, + CharacterReferenceInDataState, + RCDATAState, + CharacterReferenceInRCDATAState, + RAWTEXTState, + ScriptDataState, + PLAINTEXTState, + TagOpenState, + EndTagOpenState, + TagNameState, + RCDATALessThanSignState, + RCDATAEndTagOpenState, + RCDATAEndTagNameState, + RAWTEXTLessThanSignState, + RAWTEXTEndTagOpenState, + RAWTEXTEndTagNameState, + ScriptDataLessThanSignState, + ScriptDataEndTagOpenState, + ScriptDataEndTagNameState, + ScriptDataEscapeStartState, + ScriptDataEscapeStartDashState, + ScriptDataEscapedState, + ScriptDataEscapedDashState, + ScriptDataEscapedDashDashState, + ScriptDataEscapedLessThanSignState, + ScriptDataEscapedEndTagOpenState, + ScriptDataEscapedEndTagNameState, + ScriptDataDoubleEscapeStartState, + ScriptDataDoubleEscapedState, + ScriptDataDoubleEscapedDashState, + ScriptDataDoubleEscapedDashDashState, + ScriptDataDoubleEscapedLessThanSignState, + ScriptDataDoubleEscapeEndState, + BeforeAttributeNameState, + AttributeNameState, + AfterAttributeNameState, + BeforeAttributeValueState, + AttributeValueDoubleQuotedState, + AttributeValueSingleQuotedState, + AttributeValueUnquotedState, + CharacterReferenceInAttributeValueState, + AfterAttributeValueQuotedState, + SelfClosingStartTagState, + BogusCommentState, + // The ContinueBogusCommentState is not in the HTML5 spec, but we use + // it internally to keep track of whether we've started the bogus + // comment token yet. 
+ ContinueBogusCommentState, + MarkupDeclarationOpenState, + CommentStartState, + CommentStartDashState, + CommentState, + CommentEndDashState, + CommentEndState, + CommentEndBangState, + DOCTYPEState, + BeforeDOCTYPENameState, + DOCTYPENameState, + AfterDOCTYPENameState, + AfterDOCTYPEPublicKeywordState, + BeforeDOCTYPEPublicIdentifierState, + DOCTYPEPublicIdentifierDoubleQuotedState, + DOCTYPEPublicIdentifierSingleQuotedState, + AfterDOCTYPEPublicIdentifierState, + BetweenDOCTYPEPublicAndSystemIdentifiersState, + AfterDOCTYPESystemKeywordState, + BeforeDOCTYPESystemIdentifierState, + DOCTYPESystemIdentifierDoubleQuotedState, + DOCTYPESystemIdentifierSingleQuotedState, + AfterDOCTYPESystemIdentifierState, + BogusDOCTYPEState, + CDATASectionState, + // These CDATA states are not in the HTML5 spec, but we use them internally. + CDATASectionRightSquareBracketState, + CDATASectionDoubleRightSquareBracketState, + }; + + static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); } + ~HTMLTokenizer(); + + void reset(); + + // This function returns true if it emits a token. Otherwise, callers + // must provide the same (in progress) token on the next call (unless + // they call reset() first). + bool nextToken(SegmentedString&, HTMLToken&); + + int lineNumber() const { return m_lineNumber; } + int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior. + + State state() const { return m_state; } + void setState(State state) { m_state = state; } + + // Updates the tokenizer's state according to the given tag name. This is + // an approximation of how the tree builder would update the tokenizer's + // state. This method is useful for approximating HTML tokenization. To + // get exactly the correct tokenization, you need the real tree builder. 
+ // + // The main failures in the approximation are as follows: + // + // * The first set of character tokens emitted for a <pre> element might + // contain an extra leading newline. + // * The replacement of U+0000 with U+FFFD will not be sensitive to the + // tree builder's insertion mode. + // * CDATA sections in foreign content will be tokenized as bogus comments + // instead of as character tokens. + // + void updateStateFor(const AtomicString& tagName, Frame*); + + // Hack to skip leading newline in <pre>/<listing> for authoring ease. + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody + void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; } + + bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } + void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } + + bool shouldAllowCDATA() const { return m_shouldAllowCDATA; } + void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; } + + bool shouldSkipNullCharacters() const + { + return !m_forceNullCharacterReplacement + && (m_state == DataState + || m_state == RCDATAState + || m_state == RAWTEXTState + || m_state == PLAINTEXTState); + } + +private: + // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream + class InputStreamPreprocessor : public Noncopyable { + public: + InputStreamPreprocessor(HTMLTokenizer* tokenizer) + : m_tokenizer(tokenizer) + , m_nextInputCharacter('\0') + , m_skipNextNewLine(false) + { + } + + UChar nextInputCharacter() const { return m_nextInputCharacter; } + + // Returns whether we succeeded in peeking at the next character. + // The only way we can fail to peek is if there are no more + // characters in |source| (after collapsing \r\n, etc). 
+ ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber) + { + PeekAgain: + m_nextInputCharacter = *source; + + // Every branch in this function is expensive, so we have a + // fast-reject branch for characters that don't require special + // handling. Please run the parser benchmark whenever you touch + // this function. It's very hot. + static const UChar specialCharacterMask = '\n' | '\r' | '\0'; + if (m_nextInputCharacter & ~specialCharacterMask) { + m_skipNextNewLine = false; + return true; + } + + if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { + m_skipNextNewLine = false; + source.advancePastNewline(lineNumber); + if (source.isEmpty()) + return false; + m_nextInputCharacter = *source; + } + if (m_nextInputCharacter == '\r') { + m_nextInputCharacter = '\n'; + m_skipNextNewLine = true; + } else { + m_skipNextNewLine = false; + // FIXME: The spec indicates that the surrogate pair range as well as + // a number of specific character values are parse errors and should be replaced + // by the replacement character. We suspect this is a problem with the spec as doing + // that filtering breaks surrogate pair handling and causes us not to match Minefield. + if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) { + if (m_tokenizer->shouldSkipNullCharacters()) { + source.advancePastNonNewline(); + if (source.isEmpty()) + return false; + goto PeekAgain; + } + m_nextInputCharacter = 0xFFFD; + } + } + return true; + } + + // Returns whether there are more characters in |source| after advancing. 
+ bool advance(SegmentedString& source, int& lineNumber) + { + source.advance(lineNumber); + if (source.isEmpty()) + return false; + return peek(source, lineNumber); + } + + static const UChar endOfFileMarker; + + private: + bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const + { + return source.isClosed() && source.length() == 1; + } + + HTMLTokenizer* m_tokenizer; + + // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character + UChar m_nextInputCharacter; + bool m_skipNextNewLine; + }; + + HTMLTokenizer(bool usePreHTML5ParserQuirks); + + inline bool processEntity(SegmentedString&); + + inline void parseError(); + inline void bufferCharacter(UChar); + inline void bufferCodePoint(unsigned); + + inline bool emitAndResumeIn(SegmentedString&, State); + inline bool emitAndReconsumeIn(SegmentedString&, State); + inline bool emitEndOfFile(SegmentedString&); + inline bool flushEmitAndResumeIn(SegmentedString&, State); + + // Return whether we need to emit a character token before dealing with + // the buffered end tag. + inline bool flushBufferedEndTag(SegmentedString&); + inline bool temporaryBufferIs(const String&); + + // Sometimes we speculatively consume input characters and we don't + // know whether they represent end tags or RCDATA, etc. These + // functions help manage this state. + inline void addToPossibleEndTag(UChar cc); + inline void saveEndTagNameIfNeeded(); + inline bool isAppropriateEndTag(); + + inline bool haveBufferedCharacterToken(); + + State m_state; + + Vector<UChar, 32> m_appropriateEndTagName; + + // m_token is owned by the caller. If nextToken is not on the stack, + // this member might be pointing to unallocated memory. 
+ HTMLToken* m_token; + int m_lineNumber; + + bool m_skipLeadingNewLineForListing; + bool m_forceNullCharacterReplacement; + bool m_shouldAllowCDATA; + + // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer + Vector<UChar, 32> m_temporaryBuffer; + + // We occasionally want to emit both a character token and an end tag + // token (e.g., when lexing script). We buffer the name of the end tag + // token here so we remember it next time we re-enter the tokenizer. + Vector<UChar, 32> m_bufferedEndTagName; + + // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character + UChar m_additionalAllowedCharacter; + + // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream + InputStreamPreprocessor m_inputStreamPreprocessor; + + bool m_usePreHTML5ParserQuirks; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLTreeBuilder.cpp b/Source/WebCore/html/parser/HTMLTreeBuilder.cpp new file mode 100644 index 0000000..02713e5 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLTreeBuilder.cpp @@ -0,0 +1,2822 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLTreeBuilder.h" + +#include "CharacterNames.h" +#include "Comment.h" +#include "DocumentFragment.h" +#include "DocumentType.h" +#include "Frame.h" +#include "HTMLDocument.h" +#include "HTMLDocumentParser.h" +#include "HTMLElementFactory.h" +#include "HTMLFormElement.h" +#include "HTMLHtmlElement.h" +#include "HTMLNames.h" +#include "HTMLParserIdioms.h" +#include "HTMLScriptElement.h" +#include "HTMLToken.h" +#include "HTMLTokenizer.h" +#include "LocalizedStrings.h" +#include "MathMLNames.h" +#include "NotImplemented.h" +#include "SVGNames.h" +#include "ScriptController.h" +#include "Text.h" +#include "XLinkNames.h" +#include "XMLNSNames.h" +#include "XMLNames.h" + +namespace WebCore { + +using namespace HTMLNames; + +static const int uninitializedLineNumberValue = -1; + +static TextPosition1 uninitializedPositionValue1() +{ + return TextPosition1(WTF::OneBasedNumber::fromOneBasedInt(-1), WTF::OneBasedNumber::base()); +} + +namespace { + +inline bool isHTMLSpaceOrReplacementCharacter(UChar character) +{ + return isHTMLSpace(character) || character == replacementCharacter; +} + +inline bool isAllWhitespace(const String& string) +{ + return string.isAllSpecialCharacters<isHTMLSpace>(); +} + +inline bool isAllWhitespaceOrReplacementCharacters(const String& string) +{ + return string.isAllSpecialCharacters<isHTMLSpaceOrReplacementCharacter>(); +} + +bool isNumberedHeaderTag(const AtomicString& tagName) +{ + return tagName == h1Tag + 
|| tagName == h2Tag + || tagName == h3Tag + || tagName == h4Tag + || tagName == h5Tag + || tagName == h6Tag; +} + +bool isCaptionColOrColgroupTag(const AtomicString& tagName) +{ + return tagName == captionTag + || tagName == colTag + || tagName == colgroupTag; +} + +bool isTableCellContextTag(const AtomicString& tagName) +{ + return tagName == thTag || tagName == tdTag; +} + +bool isTableBodyContextTag(const AtomicString& tagName) +{ + return tagName == tbodyTag + || tagName == tfootTag + || tagName == theadTag; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#special +bool isSpecialNode(Node* node) +{ + if (node->hasTagName(MathMLNames::miTag) + || node->hasTagName(MathMLNames::moTag) + || node->hasTagName(MathMLNames::mnTag) + || node->hasTagName(MathMLNames::msTag) + || node->hasTagName(MathMLNames::mtextTag) + || node->hasTagName(MathMLNames::annotation_xmlTag) + || node->hasTagName(SVGNames::foreignObjectTag) + || node->hasTagName(SVGNames::descTag) + || node->hasTagName(SVGNames::titleTag)) + return true; + if (node->namespaceURI() != xhtmlNamespaceURI) + return false; + const AtomicString& tagName = node->localName(); + return tagName == addressTag + || tagName == appletTag + || tagName == areaTag + || tagName == articleTag + || tagName == asideTag + || tagName == baseTag + || tagName == basefontTag + || tagName == bgsoundTag + || tagName == blockquoteTag + || tagName == bodyTag + || tagName == brTag + || tagName == buttonTag + || tagName == captionTag + || tagName == centerTag + || tagName == colTag + || tagName == colgroupTag + || tagName == commandTag + || tagName == ddTag + || tagName == detailsTag + || tagName == dirTag + || tagName == divTag + || tagName == dlTag + || tagName == dtTag + || tagName == embedTag + || tagName == fieldsetTag + || tagName == figcaptionTag + || tagName == figureTag + || tagName == footerTag + || tagName == formTag + || tagName == frameTag + || tagName == framesetTag + || 
isNumberedHeaderTag(tagName) + || tagName == headTag + || tagName == headerTag + || tagName == hgroupTag + || tagName == hrTag + || tagName == htmlTag + || tagName == iframeTag + || tagName == imgTag + || tagName == inputTag + || tagName == isindexTag + || tagName == liTag + || tagName == linkTag + || tagName == listingTag + || tagName == marqueeTag + || tagName == menuTag + || tagName == metaTag + || tagName == navTag + || tagName == noembedTag + || tagName == noframesTag + || tagName == noscriptTag + || tagName == objectTag + || tagName == olTag + || tagName == pTag + || tagName == paramTag + || tagName == plaintextTag + || tagName == preTag + || tagName == scriptTag + || tagName == sectionTag + || tagName == selectTag + || tagName == styleTag + || tagName == summaryTag + || tagName == tableTag + || isTableBodyContextTag(tagName) + || tagName == tdTag + || tagName == textareaTag + || tagName == thTag + || tagName == titleTag + || tagName == trTag + || tagName == ulTag + || tagName == wbrTag + || tagName == xmpTag; +} + +bool isNonAnchorNonNobrFormattingTag(const AtomicString& tagName) +{ + return tagName == bTag + || tagName == bigTag + || tagName == codeTag + || tagName == emTag + || tagName == fontTag + || tagName == iTag + || tagName == sTag + || tagName == smallTag + || tagName == strikeTag + || tagName == strongTag + || tagName == ttTag + || tagName == uTag; +} + +bool isNonAnchorFormattingTag(const AtomicString& tagName) +{ + return tagName == nobrTag + || isNonAnchorNonNobrFormattingTag(tagName); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#formatting +bool isFormattingTag(const AtomicString& tagName) +{ + return tagName == aTag || isNonAnchorFormattingTag(tagName); +} + +HTMLFormElement* closestFormAncestor(Element* element) +{ + while (element) { + if (element->hasTagName(formTag)) + return static_cast<HTMLFormElement*>(element); + ContainerNode* parent = element->parentNode(); + if (!parent || 
!parent->isElementNode()) + return 0; + element = static_cast<Element*>(parent); + } + return 0; +} + +} // namespace + +class HTMLTreeBuilder::ExternalCharacterTokenBuffer : public Noncopyable { +public: + explicit ExternalCharacterTokenBuffer(AtomicHTMLToken& token) + : m_current(token.characters().data()) + , m_end(m_current + token.characters().size()) + { + ASSERT(!isEmpty()); + } + + explicit ExternalCharacterTokenBuffer(const String& string) + : m_current(string.characters()) + , m_end(m_current + string.length()) + { + ASSERT(!isEmpty()); + } + + ~ExternalCharacterTokenBuffer() + { + ASSERT(isEmpty()); + } + + bool isEmpty() const { return m_current == m_end; } + + void skipLeadingWhitespace() + { + skipLeading<isHTMLSpace>(); + } + + String takeLeadingWhitespace() + { + return takeLeading<isHTMLSpace>(); + } + + String takeLeadingNonWhitespace() + { + return takeLeading<isNotHTMLSpace>(); + } + + String takeRemaining() + { + ASSERT(!isEmpty()); + const UChar* start = m_current; + m_current = m_end; + return String(start, m_current - start); + } + + void giveRemainingTo(Vector<UChar>& recipient) + { + recipient.append(m_current, m_end - m_current); + m_current = m_end; + } + + String takeRemainingWhitespace() + { + ASSERT(!isEmpty()); + Vector<UChar> whitespace; + do { + UChar cc = *m_current++; + if (isHTMLSpace(cc)) + whitespace.append(cc); + } while (m_current < m_end); + // Returning the null string when there aren't any whitespace + // characters is slightly cleaner semantically because we don't want + // to insert a text node (as opposed to inserting an empty text node). 
+ if (whitespace.isEmpty()) + return String(); + return String::adopt(whitespace); + } + +private: + template<bool characterPredicate(UChar)> + void skipLeading() + { + ASSERT(!isEmpty()); + while (characterPredicate(*m_current)) { + if (++m_current == m_end) + return; + } + } + + template<bool characterPredicate(UChar)> + String takeLeading() + { + ASSERT(!isEmpty()); + const UChar* start = m_current; + skipLeading<characterPredicate>(); + if (start == m_current) + return String(); + return String(start, m_current - start); + } + + const UChar* m_current; + const UChar* m_end; +}; + + +HTMLTreeBuilder::HTMLTreeBuilder(HTMLDocumentParser* parser, HTMLDocument* document, bool reportErrors, bool usePreHTML5ParserQuirks) + : m_framesetOk(true) + , m_document(document) + , m_tree(document, FragmentScriptingAllowed, false) + , m_reportErrors(reportErrors) + , m_isPaused(false) + , m_insertionMode(InitialMode) + , m_originalInsertionMode(InitialMode) + , m_parser(parser) + , m_scriptToProcessStartPosition(uninitializedPositionValue1()) + , m_lastScriptElementStartPosition(TextPosition0::belowRangePosition()) + , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks) + , m_hasPendingForeignInsertionModeSteps(false) +{ +} + +// FIXME: Member variables should be grouped into self-initializing structs to +// minimize code duplication between these constructors. +HTMLTreeBuilder::HTMLTreeBuilder(HTMLDocumentParser* parser, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission, bool usePreHTML5ParserQuirks) + : m_framesetOk(true) + , m_fragmentContext(fragment, contextElement, scriptingPermission) + , m_document(m_fragmentContext.document()) + , m_tree(m_document, scriptingPermission, true) + , m_reportErrors(false) // FIXME: Why not report errors in fragments? 
+ , m_isPaused(false) + , m_insertionMode(InitialMode) + , m_originalInsertionMode(InitialMode) + , m_parser(parser) + , m_scriptToProcessStartPosition(uninitializedPositionValue1()) + , m_lastScriptElementStartPosition(TextPosition0::belowRangePosition()) + , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks) + , m_hasPendingForeignInsertionModeSteps(false) +{ + if (contextElement) { + // Steps 4.2-4.6 of the HTML5 Fragment Case parsing algorithm: + // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case + m_document->setCompatibilityMode(contextElement->document()->compatibilityMode()); + processFakeStartTag(htmlTag); + resetInsertionModeAppropriately(); + m_tree.setForm(closestFormAncestor(contextElement)); + } +} + +HTMLTreeBuilder::~HTMLTreeBuilder() +{ +} + +void HTMLTreeBuilder::detach() +{ + // This call makes little sense in fragment mode, but for consistency + // DocumentParser expects detach() to always be called before it's destroyed. + m_document = 0; + // HTMLConstructionSite might be on the callstack when detach() is called + // otherwise we'd just call m_tree.clear() here instead. 
+ m_tree.detach(); +} + +HTMLTreeBuilder::FragmentParsingContext::FragmentParsingContext() + : m_fragment(0) + , m_contextElement(0) + , m_scriptingPermission(FragmentScriptingAllowed) +{ +} + +HTMLTreeBuilder::FragmentParsingContext::FragmentParsingContext(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) + : m_dummyDocumentForFragmentParsing(HTMLDocument::create(0, KURL(), fragment->document()->baseURI())) + , m_fragment(fragment) + , m_contextElement(contextElement) + , m_scriptingPermission(scriptingPermission) +{ + m_dummyDocumentForFragmentParsing->setCompatibilityMode(fragment->document()->compatibilityMode()); +} + +Document* HTMLTreeBuilder::FragmentParsingContext::document() const +{ + ASSERT(m_fragment); + return m_dummyDocumentForFragmentParsing.get(); +} + +void HTMLTreeBuilder::FragmentParsingContext::finished() +{ + // Populate the DocumentFragment with the parsed content now that we're done. + ContainerNode* root = m_dummyDocumentForFragmentParsing.get(); + if (m_contextElement) + root = m_dummyDocumentForFragmentParsing->documentElement(); + m_fragment->takeAllChildrenFrom(root); +} + +HTMLTreeBuilder::FragmentParsingContext::~FragmentParsingContext() +{ +} + +PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(TextPosition1& scriptStartPosition) +{ + // Unpause ourselves, callers may pause us again when processing the script. + // The HTML5 spec is written as though scripts are executed inside the tree + // builder. We pause the parser to exit the tree builder, and then resume + // before running scripts. 
+ m_isPaused = false; + scriptStartPosition = m_scriptToProcessStartPosition; + m_scriptToProcessStartPosition = uninitializedPositionValue1(); + return m_scriptToProcess.release(); +} + +void HTMLTreeBuilder::constructTreeFromToken(HTMLToken& rawToken) +{ + AtomicHTMLToken token(rawToken); + constructTreeFromAtomicToken(token); +} + +void HTMLTreeBuilder::constructTreeFromAtomicToken(AtomicHTMLToken& token) +{ + processToken(token); + + // Swallowing U+0000 characters isn't in the HTML5 spec, but turning all + // the U+0000 characters into replacement characters has compatibility + // problems. + m_parser->tokenizer()->setForceNullCharacterReplacement(m_insertionMode == TextMode || m_insertionMode == InForeignContentMode); + m_parser->tokenizer()->setShouldAllowCDATA(m_insertionMode == InForeignContentMode && m_tree.currentElement()->namespaceURI() != xhtmlNamespaceURI); +} + +void HTMLTreeBuilder::processToken(AtomicHTMLToken& token) +{ + switch (token.type()) { + case HTMLToken::Uninitialized: + ASSERT_NOT_REACHED(); + break; + case HTMLToken::DOCTYPE: + processDoctypeToken(token); + break; + case HTMLToken::StartTag: + processStartTag(token); + break; + case HTMLToken::EndTag: + processEndTag(token); + break; + case HTMLToken::Comment: + processComment(token); + return; + case HTMLToken::Character: + processCharacter(token); + break; + case HTMLToken::EndOfFile: + processEndOfFile(token); + break; + } +} + +void HTMLTreeBuilder::processDoctypeToken(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::DOCTYPE); + if (m_insertionMode == InitialMode) { + m_tree.insertDoctype(token); + setInsertionMode(BeforeHTMLMode); + return; + } + if (m_insertionMode == InTableTextMode) { + defaultForInTableText(); + processDoctypeToken(token); + return; + } + parseError(token); +} + +void HTMLTreeBuilder::processFakeStartTag(const QualifiedName& tagName, PassRefPtr<NamedNodeMap> attributes) +{ + // FIXME: We'll need a fancier conversion than just "localName" for 
SVG/MathML tags. + AtomicHTMLToken fakeToken(HTMLToken::StartTag, tagName.localName(), attributes); + processStartTag(fakeToken); +} + +void HTMLTreeBuilder::processFakeEndTag(const QualifiedName& tagName) +{ + // FIXME: We'll need a fancier conversion than just "localName" for SVG/MathML tags. + AtomicHTMLToken fakeToken(HTMLToken::EndTag, tagName.localName()); + processEndTag(fakeToken); +} + +void HTMLTreeBuilder::processFakeCharacters(const String& characters) +{ + ASSERT(!characters.isEmpty()); + ExternalCharacterTokenBuffer buffer(characters); + processCharacterBuffer(buffer); +} + +void HTMLTreeBuilder::processFakePEndTagIfPInButtonScope() +{ + if (!m_tree.openElements()->inButtonScope(pTag.localName())) + return; + AtomicHTMLToken endP(HTMLToken::EndTag, pTag.localName()); + processEndTag(endP); +} + +PassRefPtr<NamedNodeMap> HTMLTreeBuilder::attributesForIsindexInput(AtomicHTMLToken& token) +{ + RefPtr<NamedNodeMap> attributes = token.takeAtributes(); + if (!attributes) + attributes = NamedNodeMap::create(); + else { + attributes->removeAttribute(nameAttr); + attributes->removeAttribute(actionAttr); + attributes->removeAttribute(promptAttr); + } + + RefPtr<Attribute> mappedAttribute = Attribute::createMapped(nameAttr, isindexTag.localName()); + attributes->insertAttribute(mappedAttribute.release(), false); + return attributes.release(); +} + +void HTMLTreeBuilder::processIsindexStartTagForInBody(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + ASSERT(token.name() == isindexTag); + parseError(token); + if (m_tree.form()) + return; + notImplemented(); // Acknowledge self-closing flag + processFakeStartTag(formTag); + RefPtr<Attribute> actionAttribute = token.getAttributeItem(actionAttr); + if (actionAttribute) { + ASSERT(m_tree.currentElement()->hasTagName(formTag)); + m_tree.currentElement()->setAttribute(actionAttr, actionAttribute->value()); + } + processFakeStartTag(hrTag); + processFakeStartTag(labelTag); + RefPtr<Attribute> 
promptAttribute = token.getAttributeItem(promptAttr); + if (promptAttribute) + processFakeCharacters(promptAttribute->value()); + else + processFakeCharacters(searchableIndexIntroduction()); + processFakeStartTag(inputTag, attributesForIsindexInput(token)); + notImplemented(); // This second set of characters may be needed by non-english locales. + processFakeEndTag(labelTag); + processFakeStartTag(hrTag); + processFakeEndTag(formTag); +} + +namespace { + +bool isLi(const Element* element) +{ + return element->hasTagName(liTag); +} + +bool isDdOrDt(const Element* element) +{ + return element->hasTagName(ddTag) + || element->hasTagName(dtTag); +} + +} + +template <bool shouldClose(const Element*)> +void HTMLTreeBuilder::processCloseWhenNestedTag(AtomicHTMLToken& token) +{ + m_framesetOk = false; + HTMLElementStack::ElementRecord* nodeRecord = m_tree.openElements()->topRecord(); + while (1) { + Element* node = nodeRecord->element(); + if (shouldClose(node)) { + processFakeEndTag(node->tagQName()); + break; + } + if (isSpecialNode(node) && !node->hasTagName(addressTag) && !node->hasTagName(divTag) && !node->hasTagName(pTag)) + break; + nodeRecord = nodeRecord->next(); + } + processFakePEndTagIfPInButtonScope(); + m_tree.insertHTMLElement(token); +} + +namespace { + +typedef HashMap<AtomicString, QualifiedName> PrefixedNameToQualifiedNameMap; + +void mapLoweredLocalNameToName(PrefixedNameToQualifiedNameMap* map, QualifiedName** names, size_t length) +{ + for (size_t i = 0; i < length; ++i) { + const QualifiedName& name = *names[i]; + const AtomicString& localName = name.localName(); + AtomicString loweredLocalName = localName.lower(); + if (loweredLocalName != localName) + map->add(loweredLocalName, name); + } +} + +void adjustSVGTagNameCase(AtomicHTMLToken& token) +{ + static PrefixedNameToQualifiedNameMap* caseMap = 0; + if (!caseMap) { + caseMap = new PrefixedNameToQualifiedNameMap; + size_t length = 0; + QualifiedName** svgTags = SVGNames::getSVGTags(&length); + 
mapLoweredLocalNameToName(caseMap, svgTags, length); + } + + const QualifiedName& casedName = caseMap->get(token.name()); + if (casedName.localName().isNull()) + return; + token.setName(casedName.localName()); +} + +template<QualifiedName** getAttrs(size_t* length)> +void adjustAttributes(AtomicHTMLToken& token) +{ + static PrefixedNameToQualifiedNameMap* caseMap = 0; + if (!caseMap) { + caseMap = new PrefixedNameToQualifiedNameMap; + size_t length = 0; + QualifiedName** attrs = getAttrs(&length); + mapLoweredLocalNameToName(caseMap, attrs, length); + } + + NamedNodeMap* attributes = token.attributes(); + if (!attributes) + return; + + for (unsigned x = 0; x < attributes->length(); ++x) { + Attribute* attribute = attributes->attributeItem(x); + const QualifiedName& casedName = caseMap->get(attribute->localName()); + if (!casedName.localName().isNull()) + attribute->parserSetName(casedName); + } +} + +void adjustSVGAttributes(AtomicHTMLToken& token) +{ + adjustAttributes<SVGNames::getSVGAttrs>(token); +} + +void adjustMathMLAttributes(AtomicHTMLToken& token) +{ + adjustAttributes<MathMLNames::getMathMLAttrs>(token); +} + +void addNamesWithPrefix(PrefixedNameToQualifiedNameMap* map, const AtomicString& prefix, QualifiedName** names, size_t length) +{ + for (size_t i = 0; i < length; ++i) { + QualifiedName* name = names[i]; + const AtomicString& localName = name->localName(); + AtomicString prefixColonLocalName(prefix + ":" + localName); + QualifiedName nameWithPrefix(prefix, localName, name->namespaceURI()); + map->add(prefixColonLocalName, nameWithPrefix); + } +} + +void adjustForeignAttributes(AtomicHTMLToken& token) +{ + static PrefixedNameToQualifiedNameMap* map = 0; + if (!map) { + map = new PrefixedNameToQualifiedNameMap; + size_t length = 0; + QualifiedName** attrs = XLinkNames::getXLinkAttrs(&length); + addNamesWithPrefix(map, "xlink", attrs, length); + + attrs = XMLNames::getXMLAttrs(&length); + addNamesWithPrefix(map, "xml", attrs, length); + + 
map->add("xmlns", XMLNSNames::xmlnsAttr); + map->add("xmlns:xlink", QualifiedName("xmlns", "xlink", XMLNSNames::xmlnsNamespaceURI)); + } + + NamedNodeMap* attributes = token.attributes(); + if (!attributes) + return; + + for (unsigned x = 0; x < attributes->length(); ++x) { + Attribute* attribute = attributes->attributeItem(x); + const QualifiedName& name = map->get(attribute->localName()); + if (!name.localName().isNull()) + attribute->parserSetName(name); + } +} + +} + +void HTMLTreeBuilder::processStartTagForInBody(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == baseTag + || token.name() == basefontTag + || token.name() == bgsoundTag + || token.name() == commandTag + || token.name() == linkTag + || token.name() == metaTag + || token.name() == noframesTag + || token.name() == scriptTag + || token.name() == styleTag + || token.name() == titleTag) { + bool didProcess = processStartTagForInHead(token); + ASSERT_UNUSED(didProcess, didProcess); + return; + } + if (token.name() == bodyTag) { + if (!m_tree.openElements()->secondElementIsHTMLBodyElement() || m_tree.openElements()->hasOnlyOneElement()) { + ASSERT(isParsingFragment()); + return; + } + m_tree.insertHTMLBodyStartTagInBody(token); + return; + } + if (token.name() == framesetTag) { + parseError(token); + if (!m_tree.openElements()->secondElementIsHTMLBodyElement() || m_tree.openElements()->hasOnlyOneElement()) { + ASSERT(isParsingFragment()); + return; + } + if (!m_framesetOk) + return; + ExceptionCode ec = 0; + m_tree.openElements()->bodyElement()->remove(ec); + ASSERT(!ec); + m_tree.openElements()->popUntil(m_tree.openElements()->bodyElement()); + m_tree.openElements()->popHTMLBodyElement(); + ASSERT(m_tree.openElements()->top() == m_tree.openElements()->htmlElement()); + m_tree.insertHTMLElement(token); + setInsertionMode(InFramesetMode); + return; + } + if 
(token.name() == addressTag + || token.name() == articleTag + || token.name() == asideTag + || token.name() == blockquoteTag + || token.name() == centerTag + || token.name() == detailsTag + || token.name() == dirTag + || token.name() == divTag + || token.name() == dlTag + || token.name() == fieldsetTag + || token.name() == figcaptionTag + || token.name() == figureTag + || token.name() == footerTag + || token.name() == headerTag + || token.name() == hgroupTag + || token.name() == menuTag + || token.name() == navTag + || token.name() == olTag + || token.name() == pTag + || token.name() == sectionTag + || token.name() == summaryTag + || token.name() == ulTag) { + processFakePEndTagIfPInButtonScope(); + m_tree.insertHTMLElement(token); + return; + } + if (isNumberedHeaderTag(token.name())) { + processFakePEndTagIfPInButtonScope(); + if (isNumberedHeaderTag(m_tree.currentElement()->localName())) { + parseError(token); + m_tree.openElements()->pop(); + } + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == preTag || token.name() == listingTag) { + processFakePEndTagIfPInButtonScope(); + m_tree.insertHTMLElement(token); + m_parser->tokenizer()->setSkipLeadingNewLineForListing(true); + m_framesetOk = false; + return; + } + if (token.name() == formTag) { + if (m_tree.form()) { + parseError(token); + return; + } + processFakePEndTagIfPInButtonScope(); + m_tree.insertHTMLFormElement(token); + return; + } + if (token.name() == liTag) { + processCloseWhenNestedTag<isLi>(token); + return; + } + if (token.name() == ddTag || token.name() == dtTag) { + processCloseWhenNestedTag<isDdOrDt>(token); + return; + } + if (token.name() == plaintextTag) { + processFakePEndTagIfPInButtonScope(); + m_tree.insertHTMLElement(token); + m_parser->tokenizer()->setState(HTMLTokenizer::PLAINTEXTState); + return; + } + if (token.name() == buttonTag) { + if (m_tree.openElements()->inScope(buttonTag)) { + parseError(token); + processFakeEndTag(buttonTag); + reprocessStartTag(token); 
// FIXME: Could we just fall through here? + return; + } + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertHTMLElement(token); + m_framesetOk = false; + return; + } + if (token.name() == aTag) { + Element* activeATag = m_tree.activeFormattingElements()->closestElementInScopeWithName(aTag.localName()); + if (activeATag) { + parseError(token); + processFakeEndTag(aTag); + m_tree.activeFormattingElements()->remove(activeATag); + if (m_tree.openElements()->contains(activeATag)) + m_tree.openElements()->remove(activeATag); + } + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertFormattingElement(token); + return; + } + if (isNonAnchorNonNobrFormattingTag(token.name())) { + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertFormattingElement(token); + return; + } + if (token.name() == nobrTag) { + m_tree.reconstructTheActiveFormattingElements(); + if (m_tree.openElements()->inScope(nobrTag)) { + parseError(token); + processFakeEndTag(nobrTag); + m_tree.reconstructTheActiveFormattingElements(); + } + m_tree.insertFormattingElement(token); + return; + } + if (token.name() == appletTag + || token.name() == marqueeTag + || token.name() == objectTag) { + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertHTMLElement(token); + m_tree.activeFormattingElements()->appendMarker(); + m_framesetOk = false; + return; + } + if (token.name() == tableTag) { + if (!m_document->inQuirksMode() && m_tree.openElements()->inButtonScope(pTag)) + processFakeEndTag(pTag); + m_tree.insertHTMLElement(token); + m_framesetOk = false; + setInsertionMode(InTableMode); + return; + } + if (token.name() == imageTag) { + parseError(token); + // Apparently we're not supposed to ask. + token.setName(imgTag.localName()); + prepareToReprocessToken(); + // Note the fall through to the imgTag handling below! 
+ } + if (token.name() == areaTag + || token.name() == brTag + || token.name() == embedTag + || token.name() == imgTag + || token.name() == keygenTag + || token.name() == wbrTag) { + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertSelfClosingHTMLElement(token); + m_framesetOk = false; + return; + } + if (token.name() == inputTag) { + RefPtr<Attribute> typeAttribute = token.getAttributeItem(typeAttr); + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertSelfClosingHTMLElement(token); + if (!typeAttribute || !equalIgnoringCase(typeAttribute->value(), "hidden")) + m_framesetOk = false; + return; + } + if (token.name() == paramTag + || token.name() == sourceTag + || token.name() == trackTag) { + m_tree.insertSelfClosingHTMLElement(token); + return; + } + if (token.name() == hrTag) { + processFakePEndTagIfPInButtonScope(); + m_tree.insertSelfClosingHTMLElement(token); + m_framesetOk = false; + return; + } + if (token.name() == isindexTag) { + processIsindexStartTagForInBody(token); + return; + } + if (token.name() == textareaTag) { + m_tree.insertHTMLElement(token); + m_parser->tokenizer()->setSkipLeadingNewLineForListing(true); + m_parser->tokenizer()->setState(HTMLTokenizer::RCDATAState); + m_originalInsertionMode = m_insertionMode; + m_framesetOk = false; + setInsertionMode(TextMode); + return; + } + if (token.name() == xmpTag) { + processFakePEndTagIfPInButtonScope(); + m_tree.reconstructTheActiveFormattingElements(); + m_framesetOk = false; + processGenericRawTextStartTag(token); + return; + } + if (token.name() == iframeTag) { + m_framesetOk = false; + processGenericRawTextStartTag(token); + return; + } + if (token.name() == noembedTag && pluginsEnabled(m_document->frame())) { + processGenericRawTextStartTag(token); + return; + } + if (token.name() == noscriptTag && scriptEnabled(m_document->frame())) { + processGenericRawTextStartTag(token); + return; + } + if (token.name() == selectTag) { + 
m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertHTMLElement(token); + m_framesetOk = false; + if (m_insertionMode == InTableMode + || m_insertionMode == InCaptionMode + || m_insertionMode == InColumnGroupMode + || m_insertionMode == InTableBodyMode + || m_insertionMode == InRowMode + || m_insertionMode == InCellMode) + setInsertionMode(InSelectInTableMode); + else + setInsertionMode(InSelectMode); + return; + } + if (token.name() == optgroupTag || token.name() == optionTag) { + if (m_tree.openElements()->inScope(optionTag.localName())) { + AtomicHTMLToken endOption(HTMLToken::EndTag, optionTag.localName()); + processEndTag(endOption); + } + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == rpTag || token.name() == rtTag) { + if (m_tree.openElements()->inScope(rubyTag.localName())) { + m_tree.generateImpliedEndTags(); + if (!m_tree.currentElement()->hasTagName(rubyTag)) { + parseError(token); + m_tree.openElements()->popUntil(rubyTag.localName()); + } + } + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == MathMLNames::mathTag.localName()) { + m_tree.reconstructTheActiveFormattingElements(); + adjustMathMLAttributes(token); + adjustForeignAttributes(token); + m_tree.insertForeignElement(token, MathMLNames::mathmlNamespaceURI); + if (m_insertionMode != InForeignContentMode) + setInsertionMode(InForeignContentMode); + return; + } + if (token.name() == SVGNames::svgTag.localName()) { + m_tree.reconstructTheActiveFormattingElements(); + adjustSVGAttributes(token); + adjustForeignAttributes(token); + m_tree.insertForeignElement(token, SVGNames::svgNamespaceURI); + if (m_insertionMode != InForeignContentMode) + setInsertionMode(InForeignContentMode); + return; + } + if (isCaptionColOrColgroupTag(token.name()) + || token.name() == frameTag + || token.name() == headTag + || isTableBodyContextTag(token.name()) + || isTableCellContextTag(token.name()) + || token.name() == 
trTag) { + parseError(token); + return; + } + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertHTMLElement(token); +} + +bool HTMLTreeBuilder::processColgroupEndTagForInColumnGroup() +{ + if (m_tree.currentElement() == m_tree.openElements()->htmlElement()) { + ASSERT(isParsingFragment()); + // FIXME: parse error + return false; + } + m_tree.openElements()->pop(); + setInsertionMode(InTableMode); + return true; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#close-the-cell +void HTMLTreeBuilder::closeTheCell() +{ + ASSERT(insertionMode() == InCellMode); + if (m_tree.openElements()->inTableScope(tdTag)) { + ASSERT(!m_tree.openElements()->inTableScope(thTag)); + processFakeEndTag(tdTag); + return; + } + ASSERT(m_tree.openElements()->inTableScope(thTag)); + processFakeEndTag(thTag); + ASSERT(insertionMode() == InRowMode); +} + +void HTMLTreeBuilder::processStartTagForInTable(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + if (token.name() == captionTag) { + m_tree.openElements()->popUntilTableScopeMarker(); + m_tree.activeFormattingElements()->appendMarker(); + m_tree.insertHTMLElement(token); + setInsertionMode(InCaptionMode); + return; + } + if (token.name() == colgroupTag) { + m_tree.openElements()->popUntilTableScopeMarker(); + m_tree.insertHTMLElement(token); + setInsertionMode(InColumnGroupMode); + return; + } + if (token.name() == colTag) { + processFakeStartTag(colgroupTag); + ASSERT(InColumnGroupMode); + reprocessStartTag(token); + return; + } + if (isTableBodyContextTag(token.name())) { + m_tree.openElements()->popUntilTableScopeMarker(); + m_tree.insertHTMLElement(token); + setInsertionMode(InTableBodyMode); + return; + } + if (isTableCellContextTag(token.name()) + || token.name() == trTag) { + processFakeStartTag(tbodyTag); + ASSERT(insertionMode() == InTableBodyMode); + reprocessStartTag(token); + return; + } + if (token.name() == tableTag) { + parseError(token); + if 
(!processTableEndTagForInTable()) { + ASSERT(isParsingFragment()); + return; + } + reprocessStartTag(token); + return; + } + if (token.name() == styleTag || token.name() == scriptTag) { + processStartTagForInHead(token); + return; + } + if (token.name() == inputTag) { + Attribute* typeAttribute = token.getAttributeItem(typeAttr); + if (typeAttribute && equalIgnoringCase(typeAttribute->value(), "hidden")) { + parseError(token); + m_tree.insertSelfClosingHTMLElement(token); + return; + } + // Fall through to "anything else" case. + } + if (token.name() == formTag) { + parseError(token); + if (m_tree.form()) + return; + m_tree.insertHTMLFormElement(token, true); + m_tree.openElements()->pop(); + return; + } + parseError(token); + HTMLConstructionSite::RedirectToFosterParentGuard redirecter(m_tree); + processStartTagForInBody(token); +} + +namespace { + +bool shouldProcessForeignContentUsingInBodyInsertionMode(AtomicHTMLToken& token, Element* currentElement) +{ + ASSERT(token.type() == HTMLToken::StartTag); + if (currentElement->hasTagName(MathMLNames::miTag) + || currentElement->hasTagName(MathMLNames::moTag) + || currentElement->hasTagName(MathMLNames::mnTag) + || currentElement->hasTagName(MathMLNames::msTag) + || currentElement->hasTagName(MathMLNames::mtextTag)) { + return token.name() != MathMLNames::mglyphTag + && token.name() != MathMLNames::malignmarkTag; + } + if (currentElement->hasTagName(MathMLNames::annotation_xmlTag)) + return token.name() == SVGNames::svgTag; + if (currentElement->hasTagName(SVGNames::foreignObjectTag) + || currentElement->hasTagName(SVGNames::descTag) + || currentElement->hasTagName(SVGNames::titleTag)) + return true; + return currentElement->namespaceURI() == HTMLNames::xhtmlNamespaceURI; +} + +} + +void HTMLTreeBuilder::processStartTag(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + switch (insertionMode()) { + case InitialMode: + ASSERT(insertionMode() == InitialMode); + defaultForInitial(); + // Fall 
through. + case BeforeHTMLMode: + ASSERT(insertionMode() == BeforeHTMLMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagBeforeHTML(token); + setInsertionMode(BeforeHeadMode); + return; + } + defaultForBeforeHTML(); + // Fall through. + case BeforeHeadMode: + ASSERT(insertionMode() == BeforeHeadMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == headTag) { + m_tree.insertHTMLHeadElement(token); + setInsertionMode(InHeadMode); + return; + } + defaultForBeforeHead(); + // Fall through. + case InHeadMode: + ASSERT(insertionMode() == InHeadMode); + if (processStartTagForInHead(token)) + return; + defaultForInHead(); + // Fall through. + case AfterHeadMode: + ASSERT(insertionMode() == AfterHeadMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == bodyTag) { + m_framesetOk = false; + m_tree.insertHTMLBodyElement(token); + setInsertionMode(InBodyMode); + return; + } + if (token.name() == framesetTag) { + m_tree.insertHTMLElement(token); + setInsertionMode(InFramesetMode); + return; + } + if (token.name() == baseTag + || token.name() == basefontTag + || token.name() == bgsoundTag + || token.name() == linkTag + || token.name() == metaTag + || token.name() == noframesTag + || token.name() == scriptTag + || token.name() == styleTag + || token.name() == titleTag) { + parseError(token); + ASSERT(m_tree.head()); + m_tree.openElements()->pushHTMLHeadElement(m_tree.head()); + processStartTagForInHead(token); + m_tree.openElements()->removeHTMLHeadElement(m_tree.head()); + return; + } + if (token.name() == headTag) { + parseError(token); + return; + } + defaultForAfterHead(); + // Fall through + case InBodyMode: + ASSERT(insertionMode() == InBodyMode); + processStartTagForInBody(token); + break; + case InTableMode: + ASSERT(insertionMode() == InTableMode); + processStartTagForInTable(token); + break; + case InCaptionMode: + 
ASSERT(insertionMode() == InCaptionMode); + if (isCaptionColOrColgroupTag(token.name()) + || isTableBodyContextTag(token.name()) + || isTableCellContextTag(token.name()) + || token.name() == trTag) { + parseError(token); + if (!processCaptionEndTagForInCaption()) { + ASSERT(isParsingFragment()); + return; + } + reprocessStartTag(token); + return; + } + processStartTagForInBody(token); + break; + case InColumnGroupMode: + ASSERT(insertionMode() == InColumnGroupMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == colTag) { + m_tree.insertSelfClosingHTMLElement(token); + return; + } + if (!processColgroupEndTagForInColumnGroup()) { + ASSERT(isParsingFragment()); + return; + } + reprocessStartTag(token); + break; + case InTableBodyMode: + ASSERT(insertionMode() == InTableBodyMode); + if (token.name() == trTag) { + m_tree.openElements()->popUntilTableBodyScopeMarker(); // How is there ever anything to pop? + m_tree.insertHTMLElement(token); + setInsertionMode(InRowMode); + return; + } + if (isTableCellContextTag(token.name())) { + parseError(token); + processFakeStartTag(trTag); + ASSERT(insertionMode() == InRowMode); + reprocessStartTag(token); + return; + } + if (isCaptionColOrColgroupTag(token.name()) || isTableBodyContextTag(token.name())) { + // FIXME: This is slow. 
+ if (!m_tree.openElements()->inTableScope(tbodyTag.localName()) && !m_tree.openElements()->inTableScope(theadTag.localName()) && !m_tree.openElements()->inTableScope(tfootTag.localName())) { + ASSERT(isParsingFragment()); + parseError(token); + return; + } + m_tree.openElements()->popUntilTableBodyScopeMarker(); + ASSERT(isTableBodyContextTag(m_tree.currentElement()->localName())); + processFakeEndTag(m_tree.currentElement()->tagQName()); + reprocessStartTag(token); + return; + } + processStartTagForInTable(token); + break; + case InRowMode: + ASSERT(insertionMode() == InRowMode); + if (isTableCellContextTag(token.name())) { + m_tree.openElements()->popUntilTableRowScopeMarker(); + m_tree.insertHTMLElement(token); + setInsertionMode(InCellMode); + m_tree.activeFormattingElements()->appendMarker(); + return; + } + if (token.name() == trTag + || isCaptionColOrColgroupTag(token.name()) + || isTableBodyContextTag(token.name())) { + if (!processTrEndTagForInRow()) { + ASSERT(isParsingFragment()); + return; + } + ASSERT(insertionMode() == InTableBodyMode); + reprocessStartTag(token); + return; + } + processStartTagForInTable(token); + break; + case InCellMode: + ASSERT(insertionMode() == InCellMode); + if (isCaptionColOrColgroupTag(token.name()) + || isTableCellContextTag(token.name()) + || token.name() == trTag + || isTableBodyContextTag(token.name())) { + // FIXME: This could be more efficient. 
+ if (!m_tree.openElements()->inTableScope(tdTag) && !m_tree.openElements()->inTableScope(thTag)) { + ASSERT(isParsingFragment()); + parseError(token); + return; + } + closeTheCell(); + reprocessStartTag(token); + return; + } + processStartTagForInBody(token); + break; + case AfterBodyMode: + case AfterAfterBodyMode: + ASSERT(insertionMode() == AfterBodyMode || insertionMode() == AfterAfterBodyMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + setInsertionMode(InBodyMode); + reprocessStartTag(token); + break; + case InHeadNoscriptMode: + ASSERT(insertionMode() == InHeadNoscriptMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == basefontTag + || token.name() == bgsoundTag + || token.name() == linkTag + || token.name() == metaTag + || token.name() == noframesTag + || token.name() == styleTag) { + bool didProcess = processStartTagForInHead(token); + ASSERT_UNUSED(didProcess, didProcess); + return; + } + if (token.name() == htmlTag || token.name() == noscriptTag) { + parseError(token); + return; + } + defaultForInHeadNoscript(); + processToken(token); + break; + case InFramesetMode: + ASSERT(insertionMode() == InFramesetMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == framesetTag) { + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == frameTag) { + m_tree.insertSelfClosingHTMLElement(token); + return; + } + if (token.name() == noframesTag) { + processStartTagForInHead(token); + return; + } + parseError(token); + break; + case AfterFramesetMode: + case AfterAfterFramesetMode: + ASSERT(insertionMode() == AfterFramesetMode || insertionMode() == AfterAfterFramesetMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == noframesTag) { + processStartTagForInHead(token); + return; + } + 
parseError(token); + break; + case InSelectInTableMode: + ASSERT(insertionMode() == InSelectInTableMode); + if (token.name() == captionTag + || token.name() == tableTag + || isTableBodyContextTag(token.name()) + || token.name() == trTag + || isTableCellContextTag(token.name())) { + parseError(token); + AtomicHTMLToken endSelect(HTMLToken::EndTag, selectTag.localName()); + processEndTag(endSelect); + reprocessStartTag(token); + return; + } + // Fall through + case InSelectMode: + ASSERT(insertionMode() == InSelectMode || insertionMode() == InSelectInTableMode); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return; + } + if (token.name() == optionTag) { + if (m_tree.currentElement()->hasTagName(optionTag)) { + AtomicHTMLToken endOption(HTMLToken::EndTag, optionTag.localName()); + processEndTag(endOption); + } + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == optgroupTag) { + if (m_tree.currentElement()->hasTagName(optionTag)) { + AtomicHTMLToken endOption(HTMLToken::EndTag, optionTag.localName()); + processEndTag(endOption); + } + if (m_tree.currentElement()->hasTagName(optgroupTag)) { + AtomicHTMLToken endOptgroup(HTMLToken::EndTag, optgroupTag.localName()); + processEndTag(endOptgroup); + } + m_tree.insertHTMLElement(token); + return; + } + if (token.name() == selectTag) { + parseError(token); + AtomicHTMLToken endSelect(HTMLToken::EndTag, selectTag.localName()); + processEndTag(endSelect); + return; + } + if (token.name() == inputTag + || token.name() == keygenTag + || token.name() == textareaTag) { + parseError(token); + if (!m_tree.openElements()->inSelectScope(selectTag)) { + ASSERT(isParsingFragment()); + return; + } + AtomicHTMLToken endSelect(HTMLToken::EndTag, selectTag.localName()); + processEndTag(endSelect); + reprocessStartTag(token); + return; + } + if (token.name() == scriptTag) { + bool didProcess = processStartTagForInHead(token); + ASSERT_UNUSED(didProcess, didProcess); + return; + } + break; 
+ case InTableTextMode: + defaultForInTableText(); + processStartTag(token); + break; + case InForeignContentMode: { + if (shouldProcessForeignContentUsingInBodyInsertionMode(token, m_tree.currentElement())) { + processForeignContentUsingInBodyModeAndResetMode(token); + return; + } + if (token.name() == bTag + || token.name() == bigTag + || token.name() == blockquoteTag + || token.name() == bodyTag + || token.name() == brTag + || token.name() == centerTag + || token.name() == codeTag + || token.name() == ddTag + || token.name() == divTag + || token.name() == dlTag + || token.name() == dtTag + || token.name() == emTag + || token.name() == embedTag + || isNumberedHeaderTag(token.name()) + || token.name() == headTag + || token.name() == hrTag + || token.name() == iTag + || token.name() == imgTag + || token.name() == liTag + || token.name() == listingTag + || token.name() == menuTag + || token.name() == metaTag + || token.name() == nobrTag + || token.name() == olTag + || token.name() == pTag + || token.name() == preTag + || token.name() == rubyTag + || token.name() == sTag + || token.name() == smallTag + || token.name() == spanTag + || token.name() == strongTag + || token.name() == strikeTag + || token.name() == subTag + || token.name() == supTag + || token.name() == tableTag + || token.name() == ttTag + || token.name() == uTag + || token.name() == ulTag + || token.name() == varTag + || (token.name() == fontTag && (token.getAttributeItem(colorAttr) || token.getAttributeItem(faceAttr) || token.getAttributeItem(sizeAttr)))) { + parseError(token); + m_tree.openElements()->popUntilForeignContentScopeMarker(); + resetInsertionModeAppropriately(); + reprocessStartTag(token); + return; + } + const AtomicString& currentNamespace = m_tree.currentElement()->namespaceURI(); + if (currentNamespace == MathMLNames::mathmlNamespaceURI) + adjustMathMLAttributes(token); + if (currentNamespace == SVGNames::svgNamespaceURI) { + adjustSVGTagNameCase(token); + adjustSVGAttributes(token); + 
// Closing part of the start-tag insertion-mode switch that begins on the preceding lines.
        }
        adjustForeignAttributes(token);
        m_tree.insertForeignElement(token, currentNamespace);
        break;
    }
    case TextMode:
        // This case is asserted to be unreachable for start tags.
        ASSERT_NOT_REACHED();
        break;
    }
}

// Handles a </body> end tag under the InBody rules.
// Returns false, after reporting a parse error, when no body element is in
// scope on the stack of open elements. Otherwise switches the insertion mode
// to AfterBodyMode and returns true. Nothing is popped from the stack here.
bool HTMLTreeBuilder::processBodyEndTagForInBody(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    ASSERT(token.name() == bodyTag);
    if (!m_tree.openElements()->inScope(bodyTag.localName())) {
        parseError(token);
        return false;
    }
    notImplemented(); // Emit a more specific parse error based on stack contents.
    setInsertionMode(AfterBodyMode);
    return true;
}

// Generic end-tag handling for InBody: walks the stack of open elements from
// the top record downwards looking for an element whose local name matches
// the token.
//  - On a match: implied end tags are generated, a parse error is reported if
//    the current element is not the match, and the stack is popped through
//    the matched element.
//  - On a special node (per isSpecialNode) before any match: parse error,
//    token ignored.
// NOTE(review): the loop never tests |record| for null, so it relies on a
// matching or special element always being found before the records run
// out -- confirm against HTMLElementStack.
void HTMLTreeBuilder::processAnyOtherEndTagForInBody(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    HTMLElementStack::ElementRecord* record = m_tree.openElements()->topRecord();
    while (1) {
        Element* node = record->element();
        if (node->hasLocalName(token.name())) {
            m_tree.generateImpliedEndTags();
            if (!m_tree.currentElement()->hasLocalName(token.name())) {
                parseError(token);
                // FIXME: This is either a bug in the spec, or a bug in our
                // implementation. Filed a bug with HTML5:
                // http://www.w3.org/Bugs/Public/show_bug.cgi?id=10080
                // We might have already popped the node for the token in
                // generateImpliedEndTags, just abort.
                if (!m_tree.openElements()->contains(node))
                    return;
            }
            m_tree.openElements()->popUntilPopped(node);
            return;
        }
        if (isSpecialNode(node)) {
            parseError(token);
            return;
        }
        // Not a match and not special: keep walking down the stack.
        record = record->next();
    }
}

// FIXME: This probably belongs on HTMLElementStack.
+HTMLElementStack::ElementRecord* HTMLTreeBuilder::furthestBlockForFormattingElement(Element* formattingElement) +{ + HTMLElementStack::ElementRecord* furthestBlock = 0; + HTMLElementStack::ElementRecord* record = m_tree.openElements()->topRecord(); + for (; record; record = record->next()) { + if (record->element() == formattingElement) + return furthestBlock; + if (isSpecialNode(record->element())) + furthestBlock = record; + } + ASSERT_NOT_REACHED(); + return 0; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody +void HTMLTreeBuilder::callTheAdoptionAgency(AtomicHTMLToken& token) +{ + // The adoption agency algorithm is N^2. We limit the number of iterations + // to stop from hanging the whole browser. This limit is copied from the + // legacy tree builder and might need to be tweaked in the future. + static const int adoptionAgencyIterationLimit = 10; + + for (int i = 0; i < adoptionAgencyIterationLimit; ++i) { + // 1. + Element* formattingElement = m_tree.activeFormattingElements()->closestElementInScopeWithName(token.name()); + if (!formattingElement || ((m_tree.openElements()->contains(formattingElement)) && !m_tree.openElements()->inScope(formattingElement))) { + parseError(token); + notImplemented(); // Check the stack of open elements for a more specific parse error. + return; + } + HTMLElementStack::ElementRecord* formattingElementRecord = m_tree.openElements()->find(formattingElement); + if (!formattingElementRecord) { + parseError(token); + m_tree.activeFormattingElements()->remove(formattingElement); + return; + } + if (formattingElement != m_tree.currentElement()) + parseError(token); + // 2. + HTMLElementStack::ElementRecord* furthestBlock = furthestBlockForFormattingElement(formattingElement); + // 3. + if (!furthestBlock) { + m_tree.openElements()->popUntilPopped(formattingElement); + m_tree.activeFormattingElements()->remove(formattingElement); + return; + } + // 4. 
+ ASSERT(furthestBlock->isAbove(formattingElementRecord)); + Element* commonAncestor = formattingElementRecord->next()->element(); + // 5. + HTMLFormattingElementList::Bookmark bookmark = m_tree.activeFormattingElements()->bookmarkFor(formattingElement); + // 6. + HTMLElementStack::ElementRecord* node = furthestBlock; + HTMLElementStack::ElementRecord* nextNode = node->next(); + HTMLElementStack::ElementRecord* lastNode = furthestBlock; + for (int i = 0; i < adoptionAgencyIterationLimit; ++i) { + // 6.1 + node = nextNode; + ASSERT(node); + nextNode = node->next(); // Save node->next() for the next iteration in case node is deleted in 6.2. + // 6.2 + if (!m_tree.activeFormattingElements()->contains(node->element())) { + m_tree.openElements()->remove(node->element()); + node = 0; + continue; + } + // 6.3 + if (node == formattingElementRecord) + break; + // 6.5 + RefPtr<Element> newElement = m_tree.createHTMLElementFromElementRecord(node); + HTMLFormattingElementList::Entry* nodeEntry = m_tree.activeFormattingElements()->find(node->element()); + nodeEntry->replaceElement(newElement.get()); + node->replaceElement(newElement.release()); + // 6.4 -- Intentionally out of order to handle the case where node + // was replaced in 6.5. 
+ // http://www.w3.org/Bugs/Public/show_bug.cgi?id=10096 + if (lastNode == furthestBlock) + bookmark.moveToAfter(nodeEntry); + // 6.6 + if (Element* parent = lastNode->element()->parentElement()) + parent->parserRemoveChild(lastNode->element()); + node->element()->parserAddChild(lastNode->element()); + if (lastNode->element()->parentElement()->attached() && !lastNode->element()->attached()) + lastNode->element()->lazyAttach(); + // 6.7 + lastNode = node; + } + // 7 + const AtomicString& commonAncestorTag = commonAncestor->localName(); + if (Element* parent = lastNode->element()->parentElement()) + parent->parserRemoveChild(lastNode->element()); + // FIXME: If this moves to HTMLConstructionSite, this check should use + // causesFosterParenting(tagName) instead. + if (commonAncestorTag == tableTag + || commonAncestorTag == trTag + || isTableBodyContextTag(commonAncestorTag)) + m_tree.fosterParent(lastNode->element()); + else { + commonAncestor->parserAddChild(lastNode->element()); + if (lastNode->element()->parentElement()->attached() && !lastNode->element()->attached()) + lastNode->element()->lazyAttach(); + } + // 8 + RefPtr<Element> newElement = m_tree.createHTMLElementFromElementRecord(formattingElementRecord); + // 9 + newElement->takeAllChildrenFrom(furthestBlock->element()); + // 10 + Element* furthestBlockElement = furthestBlock->element(); + // FIXME: All this creation / parserAddChild / attach business should + // be in HTMLConstructionSite. My guess is that steps 8--12 + // should all be in some HTMLConstructionSite function. + furthestBlockElement->parserAddChild(newElement); + if (furthestBlockElement->attached() && !newElement->attached()) { + // Notice that newElement might already be attached if, for example, one of the reparented + // children is a style element, which attaches itself automatically. 
// Final statements of callTheAdoptionAgency(), whose body begins on the preceding lines.
            newElement->attach();
        }
        // 11
        m_tree.activeFormattingElements()->swapTo(formattingElement, newElement.get(), bookmark);
        // 12
        m_tree.openElements()->remove(formattingElement);
        m_tree.openElements()->insertAbove(newElement, furthestBlock);
    }
}

// Chooses the insertion mode from the stack of open elements: walks the
// records from the top and sets the mode for the first element that matches
// one of the cases below. When the walk reaches openElements()->bottom(), the
// fragment context element is examined instead and |last| is set, so the
// loop ends with InBodyMode if that element matches nothing either.
void HTMLTreeBuilder::resetInsertionModeAppropriately()
{
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#reset-the-insertion-mode-appropriately
    bool last = false;
    HTMLElementStack::ElementRecord* nodeRecord = m_tree.openElements()->topRecord();
    while (1) {
        Element* node = nodeRecord->element();
        if (node == m_tree.openElements()->bottom()) {
            ASSERT(isParsingFragment());
            last = true;
            node = m_fragmentContext.contextElement();
        }
        if (node->hasTagName(selectTag)) {
            ASSERT(isParsingFragment());
            return setInsertionMode(InSelectMode);
        }
        if (node->hasTagName(tdTag) || node->hasTagName(thTag))
            return setInsertionMode(InCellMode);
        if (node->hasTagName(trTag))
            return setInsertionMode(InRowMode);
        if (node->hasTagName(tbodyTag) || node->hasTagName(theadTag) || node->hasTagName(tfootTag))
            return setInsertionMode(InTableBodyMode);
        if (node->hasTagName(captionTag))
            return setInsertionMode(InCaptionMode);
        if (node->hasTagName(colgroupTag)) {
            ASSERT(isParsingFragment());
            return setInsertionMode(InColumnGroupMode);
        }
        if (node->hasTagName(tableTag))
            return setInsertionMode(InTableMode);
        if (node->hasTagName(headTag)) {
            ASSERT(isParsingFragment());
            return setInsertionMode(InBodyMode);
        }
        if (node->hasTagName(bodyTag))
            return setInsertionMode(InBodyMode);
        if (node->hasTagName(framesetTag)) {
            ASSERT(isParsingFragment());
            return setInsertionMode(InFramesetMode);
        }
        if (node->hasTagName(htmlTag)) {
            ASSERT(isParsingFragment());
            return setInsertionMode(BeforeHeadMode);
        }
        if (node->namespaceURI() == SVGNames::svgNamespaceURI
            || node->namespaceURI() == MathMLNames::mathmlNamespaceURI)
            return setInsertionMode(InForeignContentMode);
        if (last) {
            ASSERT(isParsingFragment());
            return setInsertionMode(InBodyMode);
        }
        nodeRecord = nodeRecord->next();
    }
}

// End-tag handling for InTableBodyMode (dispatched from processEndTag).
//  - A table-body context end tag that is in table scope: pops back to the
//    table-body scope marker, pops that element and switches to InTableMode.
//  - </table>: closes the current table-body section with a fake end tag and
//    reprocesses the token.
//  - body/caption/col/colgroup/html/cell/tr end tags: parse error, ignored.
//  - Anything else is handled by the InTable rules.
void HTMLTreeBuilder::processEndTagForInTableBody(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    if (isTableBodyContextTag(token.name())) {
        if (!m_tree.openElements()->inTableScope(token.name())) {
            parseError(token);
            return;
        }
        m_tree.openElements()->popUntilTableBodyScopeMarker();
        m_tree.openElements()->pop();
        setInsertionMode(InTableMode);
        return;
    }
    if (token.name() == tableTag) {
        // FIXME: This is slow.
        if (!m_tree.openElements()->inTableScope(tbodyTag.localName()) && !m_tree.openElements()->inTableScope(theadTag.localName()) && !m_tree.openElements()->inTableScope(tfootTag.localName())) {
            ASSERT(isParsingFragment());
            parseError(token);
            return;
        }
        m_tree.openElements()->popUntilTableBodyScopeMarker();
        ASSERT(isTableBodyContextTag(m_tree.currentElement()->localName()));
        processFakeEndTag(m_tree.currentElement()->tagQName());
        reprocessEndTag(token);
        return;
    }
    if (token.name() == bodyTag
        || isCaptionColOrColgroupTag(token.name())
        || token.name() == htmlTag
        || isTableCellContextTag(token.name())
        || token.name() == trTag) {
        parseError(token);
        return;
    }
    processEndTagForInTable(token);
}

// End-tag handling for InRowMode (dispatched from processEndTag).
// </tr> closes the row. </table> and table-body context end tags first close
// the row and then reprocess the token, which is asserted to happen in
// InTableBodyMode.
void HTMLTreeBuilder::processEndTagForInRow(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    if (token.name() == trTag) {
        processTrEndTagForInRow();
        return;
    }
    if (token.name() == tableTag) {
        if (!processTrEndTagForInRow()) {
            ASSERT(isParsingFragment());
            return;
        }
        ASSERT(insertionMode() == InTableBodyMode);
        reprocessEndTag(token);
        return;
    }
    if (isTableBodyContextTag(token.name())) {
        if (!m_tree.openElements()->inTableScope(token.name())) {
            parseError(token);
            return;
        }
        processFakeEndTag(trTag);
        ASSERT(insertionMode() == InTableBodyMode);
        reprocessEndTag(token);
return; + } + if (token.name() == bodyTag + || isCaptionColOrColgroupTag(token.name()) + || token.name() == htmlTag + || isTableCellContextTag(token.name())) { + parseError(token); + return; + } + processEndTagForInTable(token); +} + +void HTMLTreeBuilder::processEndTagForInCell(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::EndTag); + if (isTableCellContextTag(token.name())) { + if (!m_tree.openElements()->inTableScope(token.name())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTags(); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + m_tree.activeFormattingElements()->clearToLastMarker(); + setInsertionMode(InRowMode); + return; + } + if (token.name() == bodyTag + || isCaptionColOrColgroupTag(token.name()) + || token.name() == htmlTag) { + parseError(token); + return; + } + if (token.name() == tableTag + || token.name() == trTag + || isTableBodyContextTag(token.name())) { + if (!m_tree.openElements()->inTableScope(token.name())) { + ASSERT(isTableBodyContextTag(token.name()) || isParsingFragment()); + parseError(token); + return; + } + closeTheCell(); + reprocessEndTag(token); + return; + } + processEndTagForInBody(token); +} + +void HTMLTreeBuilder::processEndTagForInBody(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::EndTag); + if (token.name() == bodyTag) { + processBodyEndTagForInBody(token); + return; + } + if (token.name() == htmlTag) { + AtomicHTMLToken endBody(HTMLToken::EndTag, bodyTag.localName()); + if (processBodyEndTagForInBody(endBody)) + reprocessEndTag(token); + return; + } + if (token.name() == addressTag + || token.name() == articleTag + || token.name() == asideTag + || token.name() == blockquoteTag + || token.name() == buttonTag + || token.name() == centerTag + || token.name() == detailsTag + || token.name() == dirTag + || token.name() == divTag + || token.name() == dlTag + || token.name() == fieldsetTag + 
|| token.name() == figcaptionTag + || token.name() == figureTag + || token.name() == footerTag + || token.name() == headerTag + || token.name() == hgroupTag + || token.name() == listingTag + || token.name() == menuTag + || token.name() == navTag + || token.name() == olTag + || token.name() == preTag + || token.name() == sectionTag + || token.name() == summaryTag + || token.name() == ulTag) { + if (!m_tree.openElements()->inScope(token.name())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTags(); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + return; + } + if (token.name() == formTag) { + RefPtr<Element> node = m_tree.takeForm(); + if (!node || !m_tree.openElements()->inScope(node.get())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTags(); + if (m_tree.currentElement() != node.get()) + parseError(token); + m_tree.openElements()->remove(node.get()); + } + if (token.name() == pTag) { + if (!m_tree.openElements()->inButtonScope(token.name())) { + parseError(token); + processFakeStartTag(pTag); + ASSERT(m_tree.openElements()->inScope(token.name())); + reprocessEndTag(token); + return; + } + m_tree.generateImpliedEndTagsWithExclusion(token.name()); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + return; + } + if (token.name() == liTag) { + if (!m_tree.openElements()->inListItemScope(token.name())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTagsWithExclusion(token.name()); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + return; + } + if (token.name() == ddTag + || token.name() == dtTag) { + if (!m_tree.openElements()->inScope(token.name())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTagsWithExclusion(token.name()); + if 
(!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + return; + } + if (isNumberedHeaderTag(token.name())) { + if (!m_tree.openElements()->hasNumberedHeaderElementInScope()) { + parseError(token); + return; + } + m_tree.generateImpliedEndTags(); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilNumberedHeaderElementPopped(); + return; + } + if (isFormattingTag(token.name())) { + callTheAdoptionAgency(token); + return; + } + if (token.name() == appletTag + || token.name() == marqueeTag + || token.name() == objectTag) { + if (!m_tree.openElements()->inScope(token.name())) { + parseError(token); + return; + } + m_tree.generateImpliedEndTags(); + if (!m_tree.currentElement()->hasLocalName(token.name())) + parseError(token); + m_tree.openElements()->popUntilPopped(token.name()); + m_tree.activeFormattingElements()->clearToLastMarker(); + return; + } + if (token.name() == brTag) { + parseError(token); + processFakeStartTag(brTag); + return; + } + processAnyOtherEndTagForInBody(token); +} + +bool HTMLTreeBuilder::processCaptionEndTagForInCaption() +{ + if (!m_tree.openElements()->inTableScope(captionTag.localName())) { + ASSERT(isParsingFragment()); + // FIXME: parse error + return false; + } + m_tree.generateImpliedEndTags(); + // FIXME: parse error if (!m_tree.currentElement()->hasTagName(captionTag)) + m_tree.openElements()->popUntilPopped(captionTag.localName()); + m_tree.activeFormattingElements()->clearToLastMarker(); + setInsertionMode(InTableMode); + return true; +} + +bool HTMLTreeBuilder::processTrEndTagForInRow() +{ + if (!m_tree.openElements()->inTableScope(trTag.localName())) { + ASSERT(isParsingFragment()); + // FIXME: parse error + return false; + } + m_tree.openElements()->popUntilTableRowScopeMarker(); + ASSERT(m_tree.currentElement()->hasTagName(trTag)); + m_tree.openElements()->pop(); + 
// Remaining statements of processTrEndTagForInRow(), which begins on the preceding line.
    setInsertionMode(InTableBodyMode);
    return true;
}

// Closes the open table. Returns false when no table is in table scope
// (asserted to happen only while parsing a fragment). Otherwise pops through
// the table element, recomputes the insertion mode from the remaining stack
// and returns true.
bool HTMLTreeBuilder::processTableEndTagForInTable()
{
    if (!m_tree.openElements()->inTableScope(tableTag)) {
        ASSERT(isParsingFragment());
        // FIXME: parse error.
        return false;
    }
    m_tree.openElements()->popUntilPopped(tableTag.localName());
    resetInsertionModeAppropriately();
    return true;
}

// End-tag handling for InTableMode (dispatched from processEndTag, and the
// fallback for the table-body and row handlers).
//  - </table> closes the table.
//  - body/caption/col/colgroup/html/table-body/cell/tr end tags: parse error,
//    ignored.
//  - Anything else is processed with the InBody rules while a
//    RedirectToFosterParentGuard is alive for the duration of the call.
void HTMLTreeBuilder::processEndTagForInTable(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    if (token.name() == tableTag) {
        processTableEndTagForInTable();
        return;
    }
    if (token.name() == bodyTag
        || isCaptionColOrColgroupTag(token.name())
        || token.name() == htmlTag
        || isTableBodyContextTag(token.name())
        || isTableCellContextTag(token.name())
        || token.name() == trTag) {
        parseError(token);
        return;
    }
    // Is this redirection necessary here?
    HTMLConstructionSite::RedirectToFosterParentGuard redirecter(m_tree);
    processEndTagForInBody(token);
}

// Entry point for end-tag tokens: dispatches on the current insertion mode.
// The early modes (Initial through AfterHead) deliberately fall through from
// one case to the next. Each case runs its default*() step, and the ASSERT at
// the top of the following case documents that the mode has advanced.
void HTMLTreeBuilder::processEndTag(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::EndTag);
    switch (insertionMode()) {
    case InitialMode:
        ASSERT(insertionMode() == InitialMode);
        defaultForInitial();
        // Fall through.
    case BeforeHTMLMode:
        ASSERT(insertionMode() == BeforeHTMLMode);
        // Only </head>, </body>, </html> and </br> are acted on here; other end tags are ignored.
        if (token.name() != headTag && token.name() != bodyTag && token.name() != htmlTag && token.name() != brTag) {
            parseError(token);
            return;
        }
        defaultForBeforeHTML();
        // Fall through.
    case BeforeHeadMode:
        ASSERT(insertionMode() == BeforeHeadMode);
        if (token.name() != headTag && token.name() != bodyTag && token.name() != htmlTag && token.name() != brTag) {
            parseError(token);
            return;
        }
        defaultForBeforeHead();
        // Fall through.
+ case InHeadMode: + ASSERT(insertionMode() == InHeadMode); + if (token.name() == headTag) { + m_tree.openElements()->popHTMLHeadElement(); + setInsertionMode(AfterHeadMode); + return; + } + if (token.name() != bodyTag && token.name() != htmlTag && token.name() != brTag) { + parseError(token); + return; + } + defaultForInHead(); + // Fall through. + case AfterHeadMode: + ASSERT(insertionMode() == AfterHeadMode); + if (token.name() != bodyTag && token.name() != htmlTag && token.name() != brTag) { + parseError(token); + return; + } + defaultForAfterHead(); + // Fall through + case InBodyMode: + ASSERT(insertionMode() == InBodyMode); + processEndTagForInBody(token); + break; + case InTableMode: + ASSERT(insertionMode() == InTableMode); + processEndTagForInTable(token); + break; + case InCaptionMode: + ASSERT(insertionMode() == InCaptionMode); + if (token.name() == captionTag) { + processCaptionEndTagForInCaption(); + return; + } + if (token.name() == tableTag) { + parseError(token); + if (!processCaptionEndTagForInCaption()) { + ASSERT(isParsingFragment()); + return; + } + reprocessEndTag(token); + return; + } + if (token.name() == bodyTag + || token.name() == colTag + || token.name() == colgroupTag + || token.name() == htmlTag + || isTableBodyContextTag(token.name()) + || isTableCellContextTag(token.name()) + || token.name() == trTag) { + parseError(token); + return; + } + processEndTagForInBody(token); + break; + case InColumnGroupMode: + ASSERT(insertionMode() == InColumnGroupMode); + if (token.name() == colgroupTag) { + processColgroupEndTagForInColumnGroup(); + return; + } + if (token.name() == colTag) { + parseError(token); + return; + } + if (!processColgroupEndTagForInColumnGroup()) { + ASSERT(isParsingFragment()); + return; + } + reprocessEndTag(token); + break; + case InRowMode: + ASSERT(insertionMode() == InRowMode); + processEndTagForInRow(token); + break; + case InCellMode: + ASSERT(insertionMode() == InCellMode); + processEndTagForInCell(token); + break; 
+ case InTableBodyMode: + ASSERT(insertionMode() == InTableBodyMode); + processEndTagForInTableBody(token); + break; + case AfterBodyMode: + ASSERT(insertionMode() == AfterBodyMode); + if (token.name() == htmlTag) { + if (isParsingFragment()) { + parseError(token); + return; + } + setInsertionMode(AfterAfterBodyMode); + return; + } + prepareToReprocessToken(); + // Fall through. + case AfterAfterBodyMode: + ASSERT(insertionMode() == AfterBodyMode || insertionMode() == AfterAfterBodyMode); + parseError(token); + setInsertionMode(InBodyMode); + reprocessEndTag(token); + break; + case InHeadNoscriptMode: + ASSERT(insertionMode() == InHeadNoscriptMode); + if (token.name() == noscriptTag) { + ASSERT(m_tree.currentElement()->hasTagName(noscriptTag)); + m_tree.openElements()->pop(); + ASSERT(m_tree.currentElement()->hasTagName(headTag)); + setInsertionMode(InHeadMode); + return; + } + if (token.name() != brTag) { + parseError(token); + return; + } + defaultForInHeadNoscript(); + processToken(token); + break; + case TextMode: + if (token.name() == scriptTag) { + // Pause ourselves so that parsing stops until the script can be processed by the caller. + m_isPaused = true; + ASSERT(m_tree.currentElement()->hasTagName(scriptTag)); + m_scriptToProcess = m_tree.currentElement(); + m_scriptToProcessStartPosition = WTF::toOneBasedTextPosition(m_lastScriptElementStartPosition); + m_tree.openElements()->pop(); + if (isParsingFragment() && m_fragmentContext.scriptingPermission() == FragmentScriptingNotAllowed) + m_scriptToProcess->removeAllChildren(); + setInsertionMode(m_originalInsertionMode); + + // This token will not have been created by the tokenizer if a + // self-closing script tag was encountered and pre-HTML5 parser + // quirks are enabled. We must set the tokenizer's state to + // DataState explicitly if the tokenizer didn't have a chance to. 
+ ASSERT(m_parser->tokenizer()->state() == HTMLTokenizer::DataState || m_usePreHTML5ParserQuirks); + m_parser->tokenizer()->setState(HTMLTokenizer::DataState); + return; + } + m_tree.openElements()->pop(); + setInsertionMode(m_originalInsertionMode); + break; + case InFramesetMode: + ASSERT(insertionMode() == InFramesetMode); + if (token.name() == framesetTag) { + if (m_tree.currentElement() == m_tree.openElements()->htmlElement()) { + parseError(token); + return; + } + m_tree.openElements()->pop(); + if (!isParsingFragment() && !m_tree.currentElement()->hasTagName(framesetTag)) + setInsertionMode(AfterFramesetMode); + return; + } + break; + case AfterFramesetMode: + ASSERT(insertionMode() == AfterFramesetMode); + if (token.name() == htmlTag) { + setInsertionMode(AfterAfterFramesetMode); + return; + } + // Fall through. + case AfterAfterFramesetMode: + ASSERT(insertionMode() == AfterFramesetMode || insertionMode() == AfterAfterFramesetMode); + parseError(token); + break; + case InSelectInTableMode: + ASSERT(insertionMode() == InSelectInTableMode); + if (token.name() == captionTag + || token.name() == tableTag + || isTableBodyContextTag(token.name()) + || token.name() == trTag + || isTableCellContextTag(token.name())) { + parseError(token); + if (m_tree.openElements()->inTableScope(token.name())) { + AtomicHTMLToken endSelect(HTMLToken::EndTag, selectTag.localName()); + processEndTag(endSelect); + reprocessEndTag(token); + } + return; + } + // Fall through. 
+ case InSelectMode: + ASSERT(insertionMode() == InSelectMode || insertionMode() == InSelectInTableMode); + if (token.name() == optgroupTag) { + if (m_tree.currentElement()->hasTagName(optionTag) && m_tree.oneBelowTop()->hasTagName(optgroupTag)) + processFakeEndTag(optionTag); + if (m_tree.currentElement()->hasTagName(optgroupTag)) { + m_tree.openElements()->pop(); + return; + } + parseError(token); + return; + } + if (token.name() == optionTag) { + if (m_tree.currentElement()->hasTagName(optionTag)) { + m_tree.openElements()->pop(); + return; + } + parseError(token); + return; + } + if (token.name() == selectTag) { + if (!m_tree.openElements()->inSelectScope(token.name())) { + ASSERT(isParsingFragment()); + parseError(token); + return; + } + m_tree.openElements()->popUntilPopped(selectTag.localName()); + resetInsertionModeAppropriately(); + return; + } + break; + case InTableTextMode: + defaultForInTableText(); + processEndTag(token); + break; + case InForeignContentMode: + if (token.name() == SVGNames::scriptTag && m_tree.currentElement()->hasTagName(SVGNames::scriptTag)) { + notImplemented(); + return; + } + if (m_tree.currentElement()->namespaceURI() != xhtmlNamespaceURI) { + // FIXME: This code just wants an Element* iterator, instead of an ElementRecord* + HTMLElementStack::ElementRecord* nodeRecord = m_tree.openElements()->topRecord(); + if (!nodeRecord->element()->hasLocalName(token.name())) + parseError(token); + while (1) { + if (nodeRecord->element()->hasLocalName(token.name())) { + m_tree.openElements()->popUntilPopped(nodeRecord->element()); + resetForeignInsertionMode(); + return; + } + nodeRecord = nodeRecord->next(); + if (nodeRecord->element()->namespaceURI() == xhtmlNamespaceURI) + break; + } + } + // Any other end tag (also the last two steps of "An end tag, if the current node is not an element in the HTML namespace." 
// Last case body (InForeignContentMode) of processEndTag()'s insertion-mode switch, begun on the preceding lines.
        processForeignContentUsingInBodyModeAndResetMode(token);
        break;
    }
}

// Called before a token is handed back to processStartTag()/processEndTag().
// If foreign-content reset steps are still pending, they are performed now
// and the pending flag is cleared.
void HTMLTreeBuilder::prepareToReprocessToken()
{
    if (m_hasPendingForeignInsertionModeSteps) {
        resetForeignInsertionMode();
        m_hasPendingForeignInsertionModeSteps = false;
    }
}

// Re-dispatches a start tag after running prepareToReprocessToken().
void HTMLTreeBuilder::reprocessStartTag(AtomicHTMLToken& token)
{
    prepareToReprocessToken();
    processStartTag(token);
}

// Re-dispatches an end tag after running prepareToReprocessToken().
void HTMLTreeBuilder::reprocessEndTag(AtomicHTMLToken& token)
{
    prepareToReprocessToken();
    processEndTag(token);
}

// Scoped helper: the constructor records the builder's current insertion
// mode and installs |mode| through setFakeInsertionMode(). The destructor
// restores the recorded mode, but only if the builder still reports
// isFakeInsertionMode().
// NOTE(review): presumably a real setInsertionMode() call made while the
// guard is alive clears the "fake" state so the new mode is kept -- confirm
// in the setInsertionMode()/setFakeInsertionMode() definitions.
class HTMLTreeBuilder::FakeInsertionMode : public Noncopyable {
public:
    FakeInsertionMode(HTMLTreeBuilder* treeBuilder, InsertionMode mode)
        : m_treeBuilder(treeBuilder)
        , m_originalMode(treeBuilder->insertionMode())
    {
        m_treeBuilder->setFakeInsertionMode(mode);
    }

    ~FakeInsertionMode()
    {
        if (m_treeBuilder->isFakeInsertionMode())
            m_treeBuilder->setInsertionMode(m_originalMode);
    }

private:
    HTMLTreeBuilder* m_treeBuilder;
    InsertionMode m_originalMode;
};

// Processes |token| as if the insertion mode were InBodyMode, then performs
// the foreign-content reset unless prepareToReprocessToken() already did so
// while the token was being processed (in which case the pending flag has
// been cleared).
void HTMLTreeBuilder::processForeignContentUsingInBodyModeAndResetMode(AtomicHTMLToken& token)
{
    m_hasPendingForeignInsertionModeSteps = true;
    {
        // Inner scope so the fake mode is undone before the check below.
        FakeInsertionMode fakeMode(this, InBodyMode);
        processToken(token);
    }
    if (m_hasPendingForeignInsertionModeSteps)
        resetForeignInsertionMode();
}

// Recomputes the insertion mode from the stack of open elements, but only
// when the builder is currently in InForeignContentMode.
void HTMLTreeBuilder::resetForeignInsertionMode()
{
    if (insertionMode() == InForeignContentMode)
        resetInsertionModeAppropriately();
}

// Inserts a comment token at the location selected by the current insertion
// mode: on the document, on the html element, or at the normal insertion
// point. In InTableTextMode, defaultForInTableText() is run first and the
// comment is then processed again.
void HTMLTreeBuilder::processComment(AtomicHTMLToken& token)
{
    ASSERT(token.type() == HTMLToken::Comment);
    if (m_insertionMode == InitialMode
        || m_insertionMode == BeforeHTMLMode
        || m_insertionMode == AfterAfterBodyMode
        || m_insertionMode == AfterAfterFramesetMode) {
        m_tree.insertCommentOnDocument(token);
        return;
    }
    if (m_insertionMode == AfterBodyMode) {
        m_tree.insertCommentOnHTMLHtmlElement(token);
        return;
    }
    if (m_insertionMode == InTableTextMode) {
        defaultForInTableText();
        processComment(token);
return; + } + m_tree.insertComment(token); +} + +// Wraps a Character token in an ExternalCharacterTokenBuffer and forwards it to processCharacterBuffer(). +void HTMLTreeBuilder::processCharacter(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::Character); + ExternalCharacterTokenBuffer buffer(token); + processCharacterBuffer(buffer); +} + +// Consumes the buffered characters according to the current insertion mode. +// Modes that cannot take the characters themselves switch mode through a +// defaultFor*() helper or setInsertionMode(), then either fall through to the +// next case or jump back to ReprocessBuffer to dispatch on the new mode. +void HTMLTreeBuilder::processCharacterBuffer(ExternalCharacterTokenBuffer& buffer) +{ +ReprocessBuffer: + switch (insertionMode()) { + case InitialMode: { + ASSERT(insertionMode() == InitialMode); + buffer.skipLeadingWhitespace(); + if (buffer.isEmpty()) + return; + defaultForInitial(); + // Fall through. + } + case BeforeHTMLMode: { + ASSERT(insertionMode() == BeforeHTMLMode); + buffer.skipLeadingWhitespace(); + if (buffer.isEmpty()) + return; + defaultForBeforeHTML(); + // Fall through. + } + case BeforeHeadMode: { + ASSERT(insertionMode() == BeforeHeadMode); + buffer.skipLeadingWhitespace(); + if (buffer.isEmpty()) + return; + defaultForBeforeHead(); + // Fall through. + } + case InHeadMode: { + ASSERT(insertionMode() == InHeadMode); + String leadingWhitespace = buffer.takeLeadingWhitespace(); + if (!leadingWhitespace.isEmpty()) + m_tree.insertTextNode(leadingWhitespace); + if (buffer.isEmpty()) + return; + defaultForInHead(); + // Fall through. + } + case AfterHeadMode: { + ASSERT(insertionMode() == AfterHeadMode); + String leadingWhitespace = buffer.takeLeadingWhitespace(); + if (!leadingWhitespace.isEmpty()) + m_tree.insertTextNode(leadingWhitespace); + if (buffer.isEmpty()) + return; + defaultForAfterHead(); + // Fall through. 
+ } + case InBodyMode: + case InCaptionMode: + case InCellMode: { + ASSERT(insertionMode() == InBodyMode || insertionMode() == InCaptionMode || insertionMode() == InCellMode); + m_tree.reconstructTheActiveFormattingElements(); + String characters = buffer.takeRemaining(); + m_tree.insertTextNode(characters); + if (m_framesetOk && !isAllWhitespaceOrReplacementCharacters(characters)) + m_framesetOk = false; + break; + } + case InTableMode: + case InTableBodyMode: + case InRowMode: { + ASSERT(insertionMode() == InTableMode || insertionMode() == InTableBodyMode || insertionMode() == InRowMode); + ASSERT(m_pendingTableCharacters.isEmpty()); + // Remember the table mode so defaultForInTableText() can restore it. + m_originalInsertionMode = m_insertionMode; + setInsertionMode(InTableTextMode); + prepareToReprocessToken(); + // Fall through. + } + case InTableTextMode: { + // Characters are only collected here; defaultForInTableText() inserts them later. + buffer.giveRemainingTo(m_pendingTableCharacters); + break; + } + case InColumnGroupMode: { + ASSERT(insertionMode() == InColumnGroupMode); + String leadingWhitespace = buffer.takeLeadingWhitespace(); + if (!leadingWhitespace.isEmpty()) + m_tree.insertTextNode(leadingWhitespace); + if (buffer.isEmpty()) + return; + if (!processColgroupEndTagForInColumnGroup()) { + ASSERT(isParsingFragment()); + // The spec tells us to drop these characters on the floor. 
+ buffer.takeLeadingNonWhitespace(); + if (buffer.isEmpty()) + return; + } + prepareToReprocessToken(); + goto ReprocessBuffer; + } + case AfterBodyMode: + case AfterAfterBodyMode: { + ASSERT(insertionMode() == AfterBodyMode || insertionMode() == AfterAfterBodyMode); + // FIXME: parse error + setInsertionMode(InBodyMode); + prepareToReprocessToken(); + // NOTE(review): the break after this goto is unreachable. + goto ReprocessBuffer; + break; + } + case TextMode: { + ASSERT(insertionMode() == TextMode); + m_tree.insertTextNode(buffer.takeRemaining()); + break; + } + case InHeadNoscriptMode: { + ASSERT(insertionMode() == InHeadNoscriptMode); + String leadingWhitespace = buffer.takeLeadingWhitespace(); + if (!leadingWhitespace.isEmpty()) + m_tree.insertTextNode(leadingWhitespace); + if (buffer.isEmpty()) + return; + defaultForInHeadNoscript(); + goto ReprocessBuffer; + break; + } + case InFramesetMode: + case AfterFramesetMode: { + // NOTE(review): AfterAfterFramesetMode has its own case below, so the third ASSERT term never applies here. + ASSERT(insertionMode() == InFramesetMode || insertionMode() == AfterFramesetMode || insertionMode() == AfterAfterFramesetMode); + String leadingWhitespace = buffer.takeRemainingWhitespace(); + if (!leadingWhitespace.isEmpty()) + m_tree.insertTextNode(leadingWhitespace); + // FIXME: We should generate a parse error if we skipped over any + // non-whitespace characters. 
+ break; + } + case InSelectInTableMode: + case InSelectMode: { + ASSERT(insertionMode() == InSelectMode || insertionMode() == InSelectInTableMode); + m_tree.insertTextNode(buffer.takeRemaining()); + break; + } + case InForeignContentMode: { + ASSERT(insertionMode() == InForeignContentMode); + String characters = buffer.takeRemaining(); + m_tree.insertTextNode(characters); + if (m_framesetOk && !isAllWhitespace(characters)) + m_framesetOk = false; + break; + } + case AfterAfterFramesetMode: { + String leadingWhitespace = buffer.takeRemainingWhitespace(); + if (!leadingWhitespace.isEmpty()) { + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertTextNode(leadingWhitespace); + } + // FIXME: We should generate a parse error if we skipped over any + // non-whitespace characters. + break; + } + } +} + +// Handles the EndOfFile token. Cases that end in break reach the popAll() at +// the bottom; cases that end in return skip it, most of them after +// re-dispatching through a recursive processEndOfFile() call. +void HTMLTreeBuilder::processEndOfFile(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::EndOfFile); + switch (insertionMode()) { + case InitialMode: + ASSERT(insertionMode() == InitialMode); + defaultForInitial(); + // Fall through. + case BeforeHTMLMode: + ASSERT(insertionMode() == BeforeHTMLMode); + defaultForBeforeHTML(); + // Fall through. + case BeforeHeadMode: + ASSERT(insertionMode() == BeforeHeadMode); + defaultForBeforeHead(); + // Fall through. + case InHeadMode: + ASSERT(insertionMode() == InHeadMode); + defaultForInHead(); + // Fall through. + case AfterHeadMode: + ASSERT(insertionMode() == AfterHeadMode); + defaultForAfterHead(); + // Fall through + case InBodyMode: + case InCellMode: + case InCaptionMode: + case InRowMode: + ASSERT(insertionMode() == InBodyMode || insertionMode() == InCellMode || insertionMode() == InCaptionMode || insertionMode() == InRowMode); + notImplemented(); // Emit parse error based on what elements are still open. 
+ break; + case AfterBodyMode: + case AfterAfterBodyMode: + ASSERT(insertionMode() == AfterBodyMode || insertionMode() == AfterAfterBodyMode); + break; + case InHeadNoscriptMode: + ASSERT(insertionMode() == InHeadNoscriptMode); + defaultForInHeadNoscript(); + processEndOfFile(token); + return; + case AfterFramesetMode: + case AfterAfterFramesetMode: + ASSERT(insertionMode() == AfterFramesetMode || insertionMode() == AfterAfterFramesetMode); + break; + case InFramesetMode: + case InTableMode: + case InTableBodyMode: + case InSelectInTableMode: + case InSelectMode: + ASSERT(insertionMode() == InSelectMode || insertionMode() == InSelectInTableMode || insertionMode() == InTableMode || insertionMode() == InFramesetMode || insertionMode() == InTableBodyMode); + if (m_tree.currentElement() != m_tree.openElements()->htmlElement()) + parseError(token); + break; + case InColumnGroupMode: + if (m_tree.currentElement() == m_tree.openElements()->htmlElement()) { + ASSERT(isParsingFragment()); + return; // FIXME: Should we break here instead of returning? + } + if (!processColgroupEndTagForInColumnGroup()) { + ASSERT(isParsingFragment()); + return; // FIXME: Should we break here instead of returning? + } + prepareToReprocessToken(); + processEndOfFile(token); + return; + case InForeignContentMode: + setInsertionMode(InBodyMode); + processEndOfFile(token); + return; + case InTableTextMode: + defaultForInTableText(); + processEndOfFile(token); + return; + case TextMode: + parseError(token); + if (m_tree.currentElement()->hasTagName(scriptTag)) + notImplemented(); // mark the script element as "already started". 
+ m_tree.openElements()->pop(); + setInsertionMode(m_originalInsertionMode); + prepareToReprocessToken(); + processEndOfFile(token); + return; + } + ASSERT(m_tree.openElements()->top()); + m_tree.openElements()->popAll(); +} + +// Puts the document into quirks mode (unless a fragment is being parsed) and moves to BeforeHTMLMode. +void HTMLTreeBuilder::defaultForInitial() +{ + notImplemented(); + if (!m_fragmentContext.fragment()) + m_document->setCompatibilityMode(Document::QuirksMode); + // FIXME: parse error + setInsertionMode(BeforeHTMLMode); + prepareToReprocessToken(); +} + +// Inserts a synthesized <html> start tag and moves to BeforeHeadMode. +void HTMLTreeBuilder::defaultForBeforeHTML() +{ + AtomicHTMLToken startHTML(HTMLToken::StartTag, htmlTag.localName()); + m_tree.insertHTMLHtmlStartTagBeforeHTML(startHTML); + setInsertionMode(BeforeHeadMode); + prepareToReprocessToken(); +} + +// Processes a synthesized <head> start tag. +void HTMLTreeBuilder::defaultForBeforeHead() +{ + AtomicHTMLToken startHead(HTMLToken::StartTag, headTag.localName()); + processStartTag(startHead); + prepareToReprocessToken(); +} + +// Processes a synthesized </head> end tag. +void HTMLTreeBuilder::defaultForInHead() +{ + AtomicHTMLToken endHead(HTMLToken::EndTag, headTag.localName()); + processEndTag(endHead); + prepareToReprocessToken(); +} + +// Processes a synthesized </noscript> end tag. +void HTMLTreeBuilder::defaultForInHeadNoscript() +{ + AtomicHTMLToken endNoscript(HTMLToken::EndTag, noscriptTag.localName()); + processEndTag(endNoscript); + prepareToReprocessToken(); +} + +// Processes a synthesized <body> start tag, then sets m_framesetOk back to true. +// NOTE(review): presumably processStartTag() clears the flag for <body> - confirm. 
+void HTMLTreeBuilder::defaultForAfterHead() +{ + AtomicHTMLToken startBody(HTMLToken::StartTag, bodyTag.localName()); + processStartTag(startBody); + m_framesetOk = true; + prepareToReprocessToken(); +} + +// Flushes m_pendingTableCharacters into the tree. Text that is not all +// whitespace is inserted with a RedirectToFosterParentGuard in scope and +// clears m_framesetOk; whitespace-only text is inserted directly. Both paths +// restore m_originalInsertionMode. +void HTMLTreeBuilder::defaultForInTableText() +{ + String characters = String::adopt(m_pendingTableCharacters); + if (!isAllWhitespace(characters)) { + // FIXME: parse error + HTMLConstructionSite::RedirectToFosterParentGuard redirecter(m_tree); + m_tree.reconstructTheActiveFormattingElements(); + m_tree.insertTextNode(characters); + m_framesetOk = false; + setInsertionMode(m_originalInsertionMode); + prepareToReprocessToken(); + return; + } + m_tree.insertTextNode(characters); + 
setInsertionMode(m_originalInsertionMode); + prepareToReprocessToken(); +} + +// Returns true if the start tag was handled here, false if none of the tag names matched. +bool HTMLTreeBuilder::processStartTagForInHead(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + if (token.name() == htmlTag) { + m_tree.insertHTMLHtmlStartTagInBody(token); + return true; + } + if (token.name() == baseTag + || token.name() == basefontTag + || token.name() == bgsoundTag + || token.name() == commandTag + || token.name() == linkTag + || token.name() == metaTag) { + m_tree.insertSelfClosingHTMLElement(token); + // Note: The custom processing for the <meta> tag is done in HTMLMetaElement::process(). + return true; + } + if (token.name() == titleTag) { + processGenericRCDATAStartTag(token); + return true; + } + if (token.name() == noscriptTag) { + if (scriptEnabled(m_document->frame())) { + processGenericRawTextStartTag(token); + return true; + } + m_tree.insertHTMLElement(token); + setInsertionMode(InHeadNoscriptMode); + return true; + } + if (token.name() == noframesTag || token.name() == styleTag) { + processGenericRawTextStartTag(token); + return true; + } + if (token.name() == scriptTag) { + processScriptStartTag(token); + if (m_usePreHTML5ParserQuirks && token.selfClosing()) + processFakeEndTag(scriptTag); + return true; + } + if (token.name() == headTag) { + parseError(token); + return true; + } + return false; +} + +// Inserts the element, switches the tokenizer to RCDATAState and enters +// TextMode, saving the current mode in m_originalInsertionMode. 
+void HTMLTreeBuilder::processGenericRCDATAStartTag(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + m_tree.insertHTMLElement(token); + m_parser->tokenizer()->setState(HTMLTokenizer::RCDATAState); + m_originalInsertionMode = m_insertionMode; + setInsertionMode(TextMode); +} + +// Inserts the element, switches the tokenizer to RAWTEXTState and enters +// TextMode, saving the current mode in m_originalInsertionMode. +void HTMLTreeBuilder::processGenericRawTextStartTag(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + m_tree.insertHTMLElement(token); + m_parser->tokenizer()->setState(HTMLTokenizer::RAWTEXTState); + m_originalInsertionMode = m_insertionMode; + setInsertionMode(TextMode); +} + +// Inserts a script element, switches the tokenizer to ScriptDataState, records +// the parser's text position in m_lastScriptElementStartPosition and enters TextMode. +void 
HTMLTreeBuilder::processScriptStartTag(AtomicHTMLToken& token) +{ + ASSERT(token.type() == HTMLToken::StartTag); + m_tree.insertScriptElement(token); + m_parser->tokenizer()->setState(HTMLTokenizer::ScriptDataState); + m_originalInsertionMode = m_insertionMode; + + TextPosition0 position = m_parser->textPosition(); + + ASSERT(position.m_line.zeroBasedInt() == m_parser->tokenizer()->lineNumber()); + + m_lastScriptElementStartPosition = position; + + setInsertionMode(TextMode); +} + +// Signals the end of parsing to the fragment context or, for whole documents, to the document. +void HTMLTreeBuilder::finished() +{ + ASSERT(m_document); + if (isParsingFragment()) { + m_fragmentContext.finished(); + return; + } + + // Warning, this may detach the parser. Do not do anything else after this. + m_document->finishedParsing(); +} + +// Returns false for a null frame; otherwise defers to frame->script()->canExecuteScripts(). +bool HTMLTreeBuilder::scriptEnabled(Frame* frame) +{ + if (!frame) + return false; + return frame->script()->canExecuteScripts(NotAboutToExecuteScript); +} + +// Returns false for a null frame; otherwise defers to the frame's subframe loader. +bool HTMLTreeBuilder::pluginsEnabled(Frame* frame) +{ + if (!frame) + return false; + return frame->loader()->subframeLoader()->allowPlugins(NotAboutToInstantiatePlugin); +} + +} diff --git a/Source/WebCore/html/parser/HTMLTreeBuilder.h b/Source/WebCore/html/parser/HTMLTreeBuilder.h new file mode 100644 index 0000000..17b77b7 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLTreeBuilder.h @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLTreeBuilder_h +#define HTMLTreeBuilder_h + +#include "Element.h" +#include "FragmentScriptingPermission.h" +#include "HTMLConstructionSite.h" +#include "HTMLElementStack.h" +#include "HTMLFormattingElementList.h" +#include "HTMLTokenizer.h" +#include <wtf/text/TextPosition.h> +#include <wtf/Noncopyable.h> +#include <wtf/OwnPtr.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/PassRefPtr.h> +#include <wtf/RefPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +class AtomicHTMLToken; +class Document; +class DocumentFragment; +class Frame; +class HTMLToken; +class HTMLDocument; +class Node; +class HTMLDocumentParser; + +// Builds the tree from the tokens passed to constructTreeFromToken() and +// constructTreeFromAtomicToken(), tracking the HTML5 insertion mode. The two +// create() overloads select whole-document or fragment parsing. 
+class HTMLTreeBuilder : public Noncopyable { +public: + static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, HTMLDocument* document, bool reportErrors, bool usePreHTML5ParserQuirks) + { + return adoptPtr(new HTMLTreeBuilder(parser, document, reportErrors, usePreHTML5ParserQuirks)); + } + static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission, bool usePreHTML5ParserQuirks) + { + return adoptPtr(new HTMLTreeBuilder(parser, fragment, contextElement, scriptingPermission, 
usePreHTML5ParserQuirks)); + } + ~HTMLTreeBuilder(); + + // True when m_fragmentContext holds a fragment. + bool isParsingFragment() const { return !!m_fragmentContext.fragment(); } + + void detach(); + + void setPaused(bool paused) { m_isPaused = paused; } + bool isPaused() const { return m_isPaused; } + + // The token really should be passed as a const& since it's never modified. + void constructTreeFromToken(HTMLToken&); + void constructTreeFromAtomicToken(AtomicHTMLToken&); + + // Must be called when parser is paused before calling the parser again. + PassRefPtr<Element> takeScriptToProcess(TextPosition1& scriptStartPosition); + + // Done, close any open tags, etc. + void finished(); + + // Both return false when given a null Frame. + static bool scriptEnabled(Frame*); + static bool pluginsEnabled(Frame*); + +private: + class FakeInsertionMode; + class ExternalCharacterTokenBuffer; + // Represents HTML5 "insertion mode" + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#insertion-mode + enum InsertionMode { + InitialMode, + BeforeHTMLMode, + BeforeHeadMode, + InHeadMode, + InHeadNoscriptMode, + AfterHeadMode, + InBodyMode, + TextMode, + InTableMode, + InTableTextMode, + InCaptionMode, + InColumnGroupMode, + InTableBodyMode, + InRowMode, + InCellMode, + InSelectMode, + InSelectInTableMode, + InForeignContentMode, + AfterBodyMode, + InFramesetMode, + AfterFramesetMode, + AfterAfterBodyMode, + AfterAfterFramesetMode, + }; + + HTMLTreeBuilder(HTMLDocumentParser* parser, HTMLDocument*, bool reportErrors, bool usePreHTML5ParserQuirks); + HTMLTreeBuilder(HTMLDocumentParser* parser, DocumentFragment*, Element* contextElement, FragmentScriptingPermission, bool usePreHTML5ParserQuirks); + + void processToken(AtomicHTMLToken&); + + void processDoctypeToken(AtomicHTMLToken&); + void processStartTag(AtomicHTMLToken&); + void processEndTag(AtomicHTMLToken&); + void processComment(AtomicHTMLToken&); + void processCharacter(AtomicHTMLToken&); + void processEndOfFile(AtomicHTMLToken&); + + // Returns true if the token was handled by the in-head rules. + bool processStartTagForInHead(AtomicHTMLToken&); + void 
processStartTagForInBody(AtomicHTMLToken&); + void processStartTagForInTable(AtomicHTMLToken&); + void processEndTagForInBody(AtomicHTMLToken&); + void processEndTagForInTable(AtomicHTMLToken&); + void processEndTagForInTableBody(AtomicHTMLToken&); + void processEndTagForInRow(AtomicHTMLToken&); + void processEndTagForInCell(AtomicHTMLToken&); + + void processIsindexStartTagForInBody(AtomicHTMLToken&); + bool processBodyEndTagForInBody(AtomicHTMLToken&); + bool processTableEndTagForInTable(); + bool processCaptionEndTagForInCaption(); + bool processColgroupEndTagForInColumnGroup(); + bool processTrEndTagForInRow(); + // FIXME: This function should be inlined into its one call site or it + // needs to assert which tokens it can be called with. + void processAnyOtherEndTagForInBody(AtomicHTMLToken&); + + void processCharacterBuffer(ExternalCharacterTokenBuffer&); + + void processFakeStartTag(const QualifiedName&, PassRefPtr<NamedNodeMap> attributes = 0); + void processFakeEndTag(const QualifiedName&); + void processFakeCharacters(const String&); + void processFakePEndTagIfPInButtonScope(); + + void processGenericRCDATAStartTag(AtomicHTMLToken&); + void processGenericRawTextStartTag(AtomicHTMLToken&); + void processScriptStartTag(AtomicHTMLToken&); + + // Default processing for the different insertion modes. 
+ void defaultForInitial(); + void defaultForBeforeHTML(); + void defaultForBeforeHead(); + void defaultForInHead(); + void defaultForInHeadNoscript(); + void defaultForAfterHead(); + void defaultForInTableText(); + + void prepareToReprocessToken(); + + void reprocessStartTag(AtomicHTMLToken&); + void reprocessEndTag(AtomicHTMLToken&); + + PassRefPtr<NamedNodeMap> attributesForIsindexInput(AtomicHTMLToken&); + + HTMLElementStack::ElementRecord* furthestBlockForFormattingElement(Element*); + void callTheAdoptionAgency(AtomicHTMLToken&); + + void closeTheCell(); + + template <bool shouldClose(const Element*)> + void processCloseWhenNestedTag(AtomicHTMLToken&); + + bool m_framesetOk; + + // FIXME: Implement error reporting. + void parseError(AtomicHTMLToken&) { } + + // setInsertionMode() clears the fake-mode flag; setFakeInsertionMode() sets it. + InsertionMode insertionMode() const { return m_insertionMode; } + void setInsertionMode(InsertionMode mode) + { + m_insertionMode = mode; + m_isFakeInsertionMode = false; + } + + bool isFakeInsertionMode() { return m_isFakeInsertionMode; } + void setFakeInsertionMode(InsertionMode mode) + { + m_insertionMode = mode; + m_isFakeInsertionMode = true; + } + + void resetInsertionModeAppropriately(); + + void processForeignContentUsingInBodyModeAndResetMode(AtomicHTMLToken& token); + void resetForeignInsertionMode(); + + // State for parsing into a DocumentFragment. contextElement() and + // scriptingPermission() assert that a fragment is set. + class FragmentParsingContext : public Noncopyable { + public: + FragmentParsingContext(); + FragmentParsingContext(DocumentFragment*, Element* contextElement, FragmentScriptingPermission); + ~FragmentParsingContext(); + + Document* document() const; + DocumentFragment* fragment() const { return m_fragment; } + Element* contextElement() const { ASSERT(m_fragment); return m_contextElement; } + FragmentScriptingPermission scriptingPermission() const { ASSERT(m_fragment); return m_scriptingPermission; } + + void finished(); + + private: + RefPtr<Document> m_dummyDocumentForFragmentParsing; + DocumentFragment* m_fragment; + Element* m_contextElement; + + // FragmentScriptingNotAllowed causes the 
Parser to remove children + // from <script> tags (so javascript doesn't show up in pastes). + FragmentScriptingPermission m_scriptingPermission; + }; + + FragmentParsingContext m_fragmentContext; + + Document* m_document; + HTMLConstructionSite m_tree; + + bool m_reportErrors; + bool m_isPaused; + bool m_isFakeInsertionMode; + + // FIXME: InsertionModes should be a separate object to prevent direct + // manipulation of these variables. For now, be careful to always use + // setInsertionMode and never set m_insertionMode directly. + InsertionMode m_insertionMode; + // Mode saved before entering TextMode or InTableTextMode and restored afterwards. + InsertionMode m_originalInsertionMode; + + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#pending-table-character-tokens + Vector<UChar> m_pendingTableCharacters; + + // We access parser because HTML5 spec requires that we be able to change the state of the tokenizer + // from within parser actions. We also need it to track the current position. + HTMLDocumentParser* m_parser; + + RefPtr<Element> m_scriptToProcess; // <script> tag which needs processing before resuming the parser. + TextPosition1 m_scriptToProcessStartPosition; // Starting line number of the script tag needing processing. + + // FIXME: We probably want to remove this member. Originally, it was + // created to service the legacy tree builder, but it seems to be used for + // some other things now. + TextPosition0 m_lastScriptElementStartPosition; + + bool m_usePreHTML5ParserQuirks; + + bool m_hasPendingForeignInsertionModeSteps; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/HTMLViewSourceParser.cpp b/Source/WebCore/html/parser/HTMLViewSourceParser.cpp new file mode 100644 index 0000000..ace8590 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLViewSourceParser.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "HTMLViewSourceParser.h" + +#include "HTMLDocumentParser.h" +#include "HTMLNames.h" +#include "HTMLViewSourceDocument.h" + +namespace WebCore { + +// Creates the tokenizer with the quirks setting HTMLDocumentParser reports for this document. +HTMLViewSourceParser::HTMLViewSourceParser(HTMLViewSourceDocument* document) + : DecodedDataDocumentParser(document) + , m_tokenizer(HTMLTokenizer::create(HTMLDocumentParser::usePreHTML5ParserQuirks(document))) +{ +} + +HTMLViewSourceParser::~HTMLViewSourceParser() +{ +} + +// insert() is not expected to be called on this parser. +void HTMLViewSourceParser::insert(const SegmentedString&) +{ + ASSERT_NOT_REACHED(); +} + +// Pulls tokens until the tokenizer has none left, passing each token and its +// source text to the document. +void HTMLViewSourceParser::pumpTokenizer() +{ + for (;;) { + if (!m_tokenizer->nextToken(m_input.current(), m_token)) + break; + m_token.end(m_input.current().numberOfCharactersConsumed()); + document()->addSource(sourceForToken(), m_token); + updateTokenizerState(); + m_token.clear(m_input.current().numberOfCharactersConsumed()); + } +} + +// Appends the new input to both the tokenizer input and the source copy, then tokenizes. +void HTMLViewSourceParser::append(const SegmentedString& input) +{ + m_input.appendToEnd(input); + m_source.append(input); + pumpTokenizer(); +} + +// Returns the text of m_token, consuming it from m_source. An EndOfFile token +// yields a default-constructed String. +String HTMLViewSourceParser::sourceForToken() +{ + const bool isEndOfFile = m_token.type() == HTMLToken::EndOfFile; + if (isEndOfFile) + return String(); + + ASSERT(m_source.numberOfCharactersConsumed() == m_token.startIndex()); + UChar* characters = 0; + int tokenLength = m_token.endIndex() - m_token.startIndex(); + String tokenText = String::createUninitialized(tokenLength, characters); + int copied = 0; + while (copied < tokenLength) { + characters[copied++] = *m_source; + m_source.advance(); + } + return tokenText; +} + +// Lets the tokenizer adjust its state after a start tag; other token types are ignored. +void HTMLViewSourceParser::updateTokenizerState() +{ + // FIXME: The tokenizer should do this work for us. 
+ if (m_token.type() == HTMLToken::StartTag) { + AtomicString startTagName(m_token.name().data(), m_token.name().size()); + m_tokenizer->updateStateFor(startTagName, document()->frame()); + } +} + +// Marks the end of input if that has not happened yet, tokenizes what is left +// and tells the document that parsing has finished. +void HTMLViewSourceParser::finish() +{ + const bool alreadyAtEnd = m_input.haveSeenEndOfFile(); + if (!alreadyAtEnd) + m_input.markEndOfFile(); + pumpTokenizer(); + document()->finishedParsing(); +} + +// finish() is considered called once the input has seen its end-of-file marker. +bool HTMLViewSourceParser::finishWasCalled() +{ + return m_input.haveSeenEndOfFile(); +} + +} diff --git a/Source/WebCore/html/parser/HTMLViewSourceParser.h b/Source/WebCore/html/parser/HTMLViewSourceParser.h new file mode 100644 index 0000000..abe55b4 --- /dev/null +++ b/Source/WebCore/html/parser/HTMLViewSourceParser.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTMLViewSourceParser_h +#define HTMLViewSourceParser_h + +#include "DecodedDataDocumentParser.h" +#include "HTMLInputStream.h" +#include "HTMLToken.h" +#include "HTMLTokenizer.h" +#include "HTMLViewSourceDocument.h" +#include <wtf/PassOwnPtr.h> + +namespace WebCore { + +// NOTE(review): of these forward declarations only HTMLTokenizer is referenced +// in this header, and HTMLTokenizer.h is already included above - the rest look +// like removal candidates; confirm no includer relies on them. +class HTMLTokenizer; +class HTMLScriptRunner; +class HTMLTreeBuilder; +class HTMLPreloadScanner; +class ScriptController; +class ScriptSourceCode; + +// Document parser for HTMLViewSourceDocument: tokenizes the input and passes +// each token, with its source text, to the document. +class HTMLViewSourceParser : public DecodedDataDocumentParser { +public: + static PassRefPtr<HTMLViewSourceParser> create(HTMLViewSourceDocument* document) + { + return adoptRef(new HTMLViewSourceParser(document)); + } + virtual ~HTMLViewSourceParser(); + +protected: + explicit HTMLViewSourceParser(HTMLViewSourceDocument*); + + HTMLTokenizer* tokenizer() const { return m_tokenizer.get(); } + +private: + // DocumentParser + virtual void insert(const SegmentedString&); + virtual void append(const SegmentedString&); + virtual void finish(); + virtual bool finishWasCalled(); + + // Downcasts the base class's document to the view-source document type. + HTMLViewSourceDocument* document() const { return static_cast<HTMLViewSourceDocument*>(DecodedDataDocumentParser::document()); } + + void pumpTokenizer(); + String sourceForToken(); + void updateTokenizerState(); + + HTMLInputStream m_input; // Input read by the tokenizer. + SegmentedString m_source; // Second copy of the appended input; sourceForToken() consumes it. + HTMLToken m_token; + OwnPtr<HTMLTokenizer> m_tokenizer; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/NestingLevelIncrementer.h b/Source/WebCore/html/parser/NestingLevelIncrementer.h new file 
mode 100644 index 0000000..c597876 --- /dev/null +++ b/Source/WebCore/html/parser/NestingLevelIncrementer.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef NestingLevelIncrementer_h +#define NestingLevelIncrementer_h + +namespace WebCore { + +class NestingLevelIncrementer : public Noncopyable { +public: + explicit NestingLevelIncrementer(unsigned& nestingLevel) + : m_nestingLevel(&nestingLevel) + { + ++(*m_nestingLevel); + } + + ~NestingLevelIncrementer() + { + --(*m_nestingLevel); + } + +private: + unsigned* m_nestingLevel; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/TextDocumentParser.cpp b/Source/WebCore/html/parser/TextDocumentParser.cpp new file mode 100644 index 0000000..d03b744 --- /dev/null +++ b/Source/WebCore/html/parser/TextDocumentParser.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2010 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextDocumentParser.h" + +#include "HTMLDocument.h" +#include "HTMLNames.h" +#include "HTMLTokenizer.h" +#include "HTMLTreeBuilder.h" + +namespace WebCore { + +using namespace HTMLNames; + +// Puts the tokenizer into PLAINTEXTState so the whole input is treated as text. +TextDocumentParser::TextDocumentParser(HTMLDocument* document) + : HTMLDocumentParser(document, false) + , m_haveInsertedFakePreElement(false) +{ + tokenizer()->setState(HTMLTokenizer::PLAINTEXTState); +} + +TextDocumentParser::~TextDocumentParser() +{ +} + +// Makes sure the fake <pre> element exists before any text reaches the base parser. +void TextDocumentParser::append(const SegmentedString& text) +{ + if (!m_haveInsertedFakePreElement) + insertFakePreElement(); + HTMLDocumentParser::append(text); +} + +// Feeds the tree builder a synthesized <pre style="..."> start tag. +void TextDocumentParser::insertFakePreElement() +{ + // A specialized tree builder for TextDocuments would be cleaner in + // principle; instead the existing HTMLTreeBuilder is re-used. The fake + // token goes straight to the tree builder, not through the front-end of + // the parser as fake bytes, to avoid disturbing the line/column number + // calculations. + + RefPtr<Attribute> preStyle = Attribute::createMapped("style", "word-wrap: break-word; white-space: pre-wrap;"); + RefPtr<NamedNodeMap> preAttributes = NamedNodeMap::create(); + preAttributes->insertAttribute(preStyle.release(), false); + AtomicHTMLToken preStartTag(HTMLToken::StartTag, preTag.localName(), preAttributes.release()); + + treeBuilder()->constructTreeFromAtomicToken(preStartTag); + m_haveInsertedFakePreElement = true; +} + +} diff --git a/Source/WebCore/html/parser/TextDocumentParser.h b/Source/WebCore/html/parser/TextDocumentParser.h new file mode 100644 index 0000000..1cccc5b --- /dev/null +++ b/Source/WebCore/html/parser/TextDocumentParser.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2010 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef TextDocumentParser_h +#define TextDocumentParser_h + +#include "HTMLDocumentParser.h" + +namespace WebCore { + +// HTMLDocumentParser variant for text documents: the input is tokenized in +// PLAINTEXTState and placed inside a synthesized <pre> element. +class TextDocumentParser : public HTMLDocumentParser { +public: + static PassRefPtr<TextDocumentParser> create(HTMLDocument* document) + { + return adoptRef(new TextDocumentParser(document)); + } + virtual ~TextDocumentParser(); + +private: + explicit TextDocumentParser(HTMLDocument*); + + // Inserts the fake <pre> element on first use, then defers to the base class. + virtual void append(const SegmentedString&); + void insertFakePreElement(); + + // Set once insertFakePreElement() has run. + bool m_haveInsertedFakePreElement; +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/TextViewSourceParser.cpp b/Source/WebCore/html/parser/TextViewSourceParser.cpp new file mode 100644 index 0000000..d7e6e3d --- /dev/null +++ b/Source/WebCore/html/parser/TextViewSourceParser.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextViewSourceParser.h" + +#include "HTMLTokenizer.h" + +namespace WebCore { + +TextViewSourceParser::TextViewSourceParser(HTMLViewSourceDocument* document) + : HTMLViewSourceParser(document) +{ + tokenizer()->setState(HTMLTokenizer::PLAINTEXTState); +} + +TextViewSourceParser::~TextViewSourceParser() +{ +} + +} diff --git a/Source/WebCore/html/parser/TextViewSourceParser.h b/Source/WebCore/html/parser/TextViewSourceParser.h new file mode 100644 index 0000000..e4170ed --- /dev/null +++ b/Source/WebCore/html/parser/TextViewSourceParser.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextViewSourceParser_h +#define TextViewSourceParser_h + +#include "HTMLViewSourceParser.h" + +namespace WebCore { + +class TextViewSourceParser : public HTMLViewSourceParser { +public: + static PassRefPtr<TextViewSourceParser> create(HTMLViewSourceDocument* document) + { + return adoptRef(new TextViewSourceParser(document)); + } + virtual ~TextViewSourceParser(); + +private: + explicit TextViewSourceParser(HTMLViewSourceDocument*); +}; + +} + +#endif diff --git a/Source/WebCore/html/parser/create-html-entity-table b/Source/WebCore/html/parser/create-html-entity-table new file mode 100755 index 0000000..e6132bc --- /dev/null +++ b/Source/WebCore/html/parser/create-html-entity-table @@ -0,0 +1,178 @@ +#!/usr/bin/env python +# Copyright (c) 2010 Google Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import csv +import os.path +import string +import sys + +ENTITY = 0 +VALUE = 1 + +def convert_entity_to_cpp_name(entity): + postfix = "EntityName" + if entity[-1] == ";": + return "%sSemicolon%s" % (entity[:-1], postfix) + return "%s%s" % (entity, postfix) + + +def convert_entity_to_uchar_array(entity): + return "{'%s'}" % "', '".join(entity) + + +def convert_value_to_int(value): + assert(value[0] == "U") + assert(value[1] == "+") + return "0x" + value[2:] + + +def offset_table_entry(offset): + return " &staticEntityTable[%s]," % offset + + +program_name = os.path.basename(__file__) +if len(sys.argv) < 4 or sys.argv[1] != "-o": + print >> sys.stderr, "Usage: %s -o OUTPUT_FILE INPUT_FILE" % program_name + exit(1) + +output_path = sys.argv[2] +input_path = sys.argv[3] + +html_entity_names_file = open(input_path) +entries = list(csv.reader(html_entity_names_file)) +html_entity_names_file.close() + +entries.sort(lambda a, b: cmp(a[ENTITY], b[ENTITY])) +entity_count = len(entries) + +output_file = open(output_path, "w") + +print >> output_file, """/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table +// DO NOT EDIT (unless you are a ninja)! 
+ +#include "config.h" +#include "HTMLEntityTable.h" + +namespace WebCore { + +namespace { +""" + +for entry in entries: + print >> output_file, "const UChar %sEntityName[] = %s;" % ( + convert_entity_to_cpp_name(entry[ENTITY]), + convert_entity_to_uchar_array(entry[ENTITY])) + +print >> output_file, """ +HTMLEntityTableEntry staticEntityTable[%s] = {""" % entity_count + +index = {} +offset = 0 +for entry in entries: + letter = entry[ENTITY][0] + if not index.get(letter): + index[letter] = offset + print >> output_file, ' { %sEntityName, %s, %s },' % ( + convert_entity_to_cpp_name(entry[ENTITY]), + len(entry[ENTITY]), + convert_value_to_int(entry[VALUE])) + offset += 1 + +print >> output_file, """}; +""" + +print >> output_file, "const HTMLEntityTableEntry* uppercaseOffset[] = {" +for letter in string.uppercase: + print >> output_file, offset_table_entry(index[letter]) +print >> output_file, offset_table_entry(index['a']) +print >> output_file, """}; + +const HTMLEntityTableEntry* lowercaseOffset[] = {""" +for letter in string.lowercase: + print >> output_file, offset_table_entry(index[letter]) +print >> output_file, offset_table_entry(entity_count) +print >> output_file, """}; + +} + +const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) +{ + if (c >= 'A' && c <= 'Z') + return uppercaseOffset[c - 'A']; + if (c >= 'a' && c <= 'z') + return lowercaseOffset[c - 'a']; + return 0; +} + +const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) +{ + if (c >= 'A' && c <= 'Z') + return uppercaseOffset[c - 'A' + 1] - 1; + if (c >= 'a' && c <= 'z') + return lowercaseOffset[c - 'a' + 1] - 1; + return 0; +} + +const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() +{ + return &staticEntityTable[0]; +} + +const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() +{ + return &staticEntityTable[%s - 1]; +} + +} +""" % entity_count |