diff options
author | Kristian Monsen <kristianm@google.com> | 2010-06-28 16:42:48 +0100 |
---|---|---|
committer | Kristian Monsen <kristianm@google.com> | 2010-07-02 10:29:56 +0100 |
commit | 06ea8e899e48f1f2f396b70e63fae369f2f23232 (patch) | |
tree | 20c1428cd05c76f32394ab354ea35ed99acd86d8 /WebCore/html/HTMLDocumentParser.h | |
parent | 72aad67af14193199e29cdd5c4ddc095a8b9a8a8 (diff) | |
download | external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.zip external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.gz external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.bz2 |
Merge WebKit at r61871: Initial merge by git.
Change-Id: I6cff43abca9cc4782e088a469ad4f03f166a65d5
Diffstat (limited to 'WebCore/html/HTMLDocumentParser.h')
-rw-r--r-- | WebCore/html/HTMLDocumentParser.h | 513 |
1 files changed, 98 insertions, 415 deletions
diff --git a/WebCore/html/HTMLDocumentParser.h b/WebCore/html/HTMLDocumentParser.h index 6072a7e..c2e752f 100644 --- a/WebCore/html/HTMLDocumentParser.h +++ b/WebCore/html/HTMLDocumentParser.h @@ -1,448 +1,131 @@ /* - Copyright (C) 1997 Martin Jones (mjones@kde.org) - (C) 1997 Torben Weis (weis@kde.org) - (C) 1998 Waldo Bastian (bastian@kde.org) - (C) 2001 Dirk Mueller (mueller@kde.org) - Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. -*/ + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ -#ifndef HTMLTokenizer_h -#define HTMLTokenizer_h +#ifndef HTMLDocumentParser_h +#define HTMLDocumentParser_h #include "CachedResourceClient.h" -#include "CachedResourceHandle.h" -#include "MappedAttributeEntry.h" -#include "NamedNodeMap.h" +#include "FragmentScriptingPermission.h" +#include "HTMLScriptRunnerHost.h" +#include "HTMLToken.h" +#include "HTMLInputStream.h" #include "SegmentedString.h" -#include "Timer.h" #include "DocumentParser.h" -#include <wtf/Deque.h> +#include "Timer.h" #include <wtf/OwnPtr.h> -#include <wtf/Vector.h> namespace WebCore { -class CachedScript; -class DocumentFragment; class Document; +class DocumentFragment; class HTMLDocument; -class HTMLScriptElement; -class HTMLViewSourceDocument; -class FrameView; -class LegacyHTMLTreeConstructor; -class Node; -class PreloadScanner; +class HTMLParserScheduler; +class HTMLTokenizer; +class HTMLScriptRunner; +class HTMLTreeBuilder; +class HTMLPreloadScanner; +class LegacyHTMLTreeBuilder; +class ScriptController; class ScriptSourceCode; -/** - * @internal - * represents one HTML tag. Consists of a numerical id, and the list - * of attributes. Can also represent text. In this case the id = 0 and - * text contains the text. - */ -struct Token { - Token() - : beginTag(true) - , selfClosingTag(false) - , brokenXMLStyle(false) - , m_sourceInfo(0) - { } - ~Token() { } - - void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); - - bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } - bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } - - void reset() - { - attrs = 0; - text = 0; - tagName = nullAtom; - beginTag = true; - selfClosingTag = false; - brokenXMLStyle = false; - if (m_sourceInfo) - m_sourceInfo->clear(); - } - - void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); } - - RefPtr<NamedNodeMap> attrs; - RefPtr<StringImpl> text; - AtomicString tagName; - bool beginTag; - bool selfClosingTag; - bool brokenXMLStyle; - OwnPtr<Vector<UChar> > m_sourceInfo; -}; - -enum DoctypeState { - DoctypeBegin, - DoctypeBeforeName, - DoctypeName, - DoctypeAfterName, - DoctypeBeforePublicID, - DoctypePublicID, - DoctypeAfterPublicID, - DoctypeBeforeSystemID, - DoctypeSystemID, - DoctypeAfterSystemID, - DoctypeBogus -}; - -class DoctypeToken { -public: - DoctypeToken() {} - - void reset() - { - m_name.clear(); - m_publicID.clear(); - m_systemID.clear(); - m_state = DoctypeBegin; - m_source.clear(); - m_forceQuirks = false; - } - - DoctypeState state() { return m_state; } - void setState(DoctypeState s) { m_state = s; } - - Vector<UChar> m_name; - Vector<UChar> m_publicID; - Vector<UChar> m_systemID; - DoctypeState m_state; - - Vector<UChar> m_source; - - bool m_forceQuirks; // Used by the HTML5 parser. -}; - -//----------------------------------------------------------------------------- - -// FIXME: This class does too much. Right now it is both an HTML lexer as well -// as handling all of the non-lexer-specific junk related to tokenizing HTML -// (like dealing with <script> tags). The HTML lexer bits should be pushed -// down into a separate HTML lexer class. - -class HTMLDocumentParser : public DocumentParser, public CachedResourceClient { +class HTMLDocumentParser : public DocumentParser, HTMLScriptRunnerHost, CachedResourceClient { public: + // FIXME: These constructors should be made private and replaced by create() methods. HTMLDocumentParser(HTMLDocument*, bool reportErrors); - HTMLDocumentParser(HTMLViewSourceDocument*); - HTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); + HTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission); virtual ~HTMLDocumentParser(); - virtual void write(const SegmentedString&, bool appendData); - virtual void finish(); - virtual bool forceSynchronous() const { return m_state.forceSynchronous(); } - virtual void setForceSynchronous(bool force); - virtual bool isWaitingForScripts() const; - virtual void stopParsing(); - virtual bool processingData() const; - virtual int executingScript() const { return m_executingScript; } - - virtual int lineNumber() const { return m_lineNumber; } - virtual int columnNumber() const { return 1; } - - bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } - - virtual void executeScriptsWaitingForStylesheets(); + // Exposed for HTMLParserScheduler + void resumeParsingAfterYield(); - virtual LegacyHTMLTreeConstructor* htmlTreeConstructor() const { return m_treeConstructor.get(); } - virtual HTMLDocumentParser* asHTMLDocumentParser() { return this; } + static void parseDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); private: - class State; - - // Where we are in parsing a tag - void begin(); - void end(); - - void reset(); - - void willWriteHTML(const SegmentedString&); - ALWAYS_INLINE void advance(State&); - void didWriteHTML(); - - PassRefPtr<Node> processToken(); - void processDoctypeToken(); - - State processListing(SegmentedString, State); - State parseComment(SegmentedString&, State); - State parseDoctype(SegmentedString&, State); - State parseServer(SegmentedString&, State); - State parseText(SegmentedString&, State); - State parseNonHTMLText(SegmentedString&, State); - State parseTag(SegmentedString&, State); - State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); - State parseProcessingInstruction(SegmentedString&, State); - State scriptHandler(State); - State scriptExecution(const ScriptSourceCode&, State); - void setSrc(const SegmentedString&); - - // check if we have enough space in the buffer. - // if not enlarge it - inline void checkBuffer(int len = 10) - { - if ((m_dest - m_buffer) > m_bufferSize - len) - enlargeBuffer(len); - } - - inline void checkScriptBuffer(int len = 10) - { - if (m_scriptCodeSize + len >= m_scriptCodeCapacity) - enlargeScriptBuffer(len); - } - - void enlargeBuffer(int len); - void enlargeScriptBuffer(int len); - - bool continueProcessing(int& processedCount, double startTime, State&); - void timerFired(Timer<HTMLDocumentParser>*); - void allDataProcessed(); - - // from CachedResourceClient - void notifyFinished(CachedResource*); - - void executeExternalScriptsIfReady(); - void executeExternalScriptsTimerFired(Timer<HTMLDocumentParser>*); - bool continueExecutingExternalScripts(double startTime); - - // Internal buffers - /////////////////// - UChar* m_buffer; - int m_bufferSize; - UChar* m_dest; - - Token m_currentToken; - - // This buffer holds the raw characters we've seen between the beginning of - // the attribute name and the first character of the attribute value. - Vector<UChar, 32> m_rawAttributeBeforeValue; - - // DocumentParser flags - ////////////////// - // are we in quotes within a html tag - enum { NoQuote, SingleQuote, DoubleQuote } tquote; - - // Are we in a &... character entity description? - enum EntityState { - NoEntity = 0, - SearchEntity = 1, - NumericSearch = 2, - Hexadecimal = 3, - Decimal = 4, - EntityName = 5, - SearchSemicolon = 6 - }; - unsigned EntityUnicodeValue; - - enum TagState { - NoTag = 0, - TagName = 1, - SearchAttribute = 2, - AttributeName = 3, - SearchEqual = 4, - SearchValue = 5, - QuotedValue = 6, - Value = 7, - SearchEnd = 8 - }; - - class State { - public: - State() : m_bits(0) { } - - TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); } - void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } - EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); } - void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } - - bool inScript() const { return testBit(InScript); } - void setInScript(bool v) { setBit(InScript, v); } - bool inStyle() const { return testBit(InStyle); } - void setInStyle(bool v) { setBit(InStyle, v); } - bool inXmp() const { return testBit(InXmp); } - void setInXmp(bool v) { setBit(InXmp, v); } - bool inTitle() const { return testBit(InTitle); } - void setInTitle(bool v) { setBit(InTitle, v); } - bool inIFrame() const { return testBit(InIFrame); } - void setInIFrame(bool v) { setBit(InIFrame, v); } - bool inPlainText() const { return testBit(InPlainText); } - void setInPlainText(bool v) { setBit(InPlainText, v); } - bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } - void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } - bool inComment() const { return testBit(InComment); } - void setInComment(bool v) { setBit(InComment, v); } - bool inDoctype() const { return testBit(InDoctype); } - void setInDoctype(bool v) { setBit(InDoctype, v); } - bool inTextArea() const { return testBit(InTextArea); } - void setInTextArea(bool v) { setBit(InTextArea, v); } - bool escaped() const { return testBit(Escaped); } - void setEscaped(bool v) { setBit(Escaped, v); } - bool inServer() const { return testBit(InServer); } - void setInServer(bool v) { setBit(InServer, v); } - bool skipLF() const { return testBit(SkipLF); } - void setSkipLF(bool v) { setBit(SkipLF, v); } - bool startTag() const { return testBit(StartTag); } - void setStartTag(bool v) { setBit(StartTag, v); } - bool discardLF() const { return testBit(DiscardLF); } - void setDiscardLF(bool v) { setBit(DiscardLF, v); } - bool allowYield() const { return testBit(AllowYield); } - void setAllowYield(bool v) { setBit(AllowYield, v); } - bool loadingExtScript() const { return testBit(LoadingExtScript); } - void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } - bool forceSynchronous() const { return testBit(ForceSynchronous); } - void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } - - bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } - bool hasTagState() const { return m_bits & TagMask; } - bool hasEntityState() const { return m_bits & EntityMask; } - - bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } - - private: - static const int EntityShift = 4; - enum StateBits { - TagMask = (1 << 4) - 1, - EntityMask = (1 << 7) - (1 << 4), - InScript = 1 << 7, - InStyle = 1 << 8, - // Bit 9 unused - InXmp = 1 << 10, - InTitle = 1 << 11, - InPlainText = 1 << 12, - InProcessingInstruction = 1 << 13, - InComment = 1 << 14, - InTextArea = 1 << 15, - Escaped = 1 << 16, - InServer = 1 << 17, - SkipLF = 1 << 18, - StartTag = 1 << 19, - DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard - AllowYield = 1 << 21, - LoadingExtScript = 1 << 22, - ForceSynchronous = 1 << 23, - InIFrame = 1 << 24, - InDoctype = 1 << 25 - }; - - void setBit(StateBits bit, bool value) - { - if (value) - m_bits |= bit; - else - m_bits &= ~bit; - } - bool testBit(StateBits bit) const { return m_bits & bit; } - - unsigned m_bits; + // DocumentParser + virtual void begin(); + virtual void write(const SegmentedString&, bool isFromNetwork); + virtual void finish(); + virtual bool finishWasCalled(); + virtual bool processingData() const; + virtual void stopParsing(); + virtual bool isWaitingForScripts() const; + virtual bool isExecutingScript() const; + virtual void executeScriptsWaitingForStylesheets(); + virtual int lineNumber() const; + virtual int columnNumber() const; + // FIXME: HTMLFormControlElement accesses the LegacyHTMLTreeBuilder via this method. + // Remove this when the LegacyHTMLTreeBuilder is no longer used. + virtual LegacyHTMLTreeBuilder* htmlTreeBuilder() const; + + // HTMLScriptRunnerHost + virtual void watchForLoad(CachedResource*); + virtual void stopWatchingForLoad(CachedResource*); + virtual bool shouldLoadExternalScriptFromSrc(const AtomicString&); + virtual HTMLInputStream& inputStream() { return m_input; } + + // CachedResourceClient + virtual void notifyFinished(CachedResource*); + + void willPumpLexer(); + void didPumpLexer(); + + enum SynchronousMode { + AllowYield, + ForceSynchronous, }; + void pumpTokenizer(SynchronousMode); + void pumpTokenizerIfPossible(SynchronousMode); - State m_state; - - DoctypeToken m_doctypeToken; - int m_doctypeSearchCount; - int m_doctypeSecondarySearchCount; - - bool m_brokenServer; - - // Name of an attribute that we just scanned. - AtomicString m_attrName; + bool runScriptsForPausedTreeBuilder(); + void resumeParsingAfterScriptExecution(); - // Used to store the code of a scripting sequence - UChar* m_scriptCode; - // Size of the script sequenze stored in @ref #scriptCode - int m_scriptCodeSize; - // Maximal size that can be stored in @ref #scriptCode - int m_scriptCodeCapacity; - // resync point of script code size - int m_scriptCodeResync; - - // Stores characters if we are scanning for a string like "</script>" - UChar searchBuffer[10]; - - // Counts where we are in the string we are scanning for - int searchCount; - // the stopper string - const char* m_searchStopper; - int m_searchStopperLength; - - // if no more data is coming, just parse what we have (including ext scripts that - // may be still downloading) and finish - bool m_noMoreData; - // URL to get source code of script from - String m_scriptTagSrcAttrValue; - String m_scriptTagCharsetAttrValue; - // the HTML code we will parse after the external script we are waiting for has loaded - SegmentedString m_pendingSrc; - - // the HTML code we will parse after this particular script has - // loaded, but before all pending HTML - SegmentedString* m_currentPrependingSrc; - - // true if we are executing a script while parsing a document. This causes the parsing of - // the output of the script to be postponed until after the script has finished executing - int m_executingScript; - Deque<CachedResourceHandle<CachedScript> > m_pendingScripts; - RefPtr<HTMLScriptElement> m_scriptNode; - - bool m_requestingScript; - bool m_hasScriptsWaitingForStylesheets; - - // if we found one broken comment, there are most likely others as well - // store a flag to get rid of the O(n^2) behaviour in such a case. - bool m_brokenComments; - // current line number - int m_lineNumber; - int m_currentScriptTagStartLineNumber; - int m_currentTagStartLineNumber; + void attemptToEnd(); + void endIfDelayed(); + void end(); - double m_tokenizerTimeDelay; - int m_tokenizerChunkSize; + bool isScheduledForResume() const; + bool inScriptExecution() const; + bool inWrite() const { return m_writeNestingLevel > 0; } - // The timer for continued processing. - Timer<HTMLDocumentParser> m_timer; + ScriptController* script() const; - // The timer for continued executing external scripts. - Timer<HTMLDocumentParser> m_externalScriptsTimer; + HTMLInputStream m_input; -// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. -// So any fixed number might be too small, but rather than rewriting all usage of this buffer -// we'll just make it large enough to handle all imaginable cases. -#define CBUFLEN 1024 - UChar m_cBuffer[CBUFLEN + 2]; - unsigned int m_cBufferPos; + // We hold m_token here because it might be partially complete. + HTMLToken m_token; - SegmentedString m_src; - Document* m_doc; - OwnPtr<LegacyHTMLTreeConstructor> m_treeConstructor; - bool m_inWrite; - bool m_fragment; - FragmentScriptingPermission m_scriptingPermission; + OwnPtr<HTMLTokenizer> m_tokenizer; + OwnPtr<HTMLScriptRunner> m_scriptRunner; + OwnPtr<HTMLTreeBuilder> m_treeBuilder; + OwnPtr<HTMLPreloadScanner> m_preloadScanner; + OwnPtr<HTMLParserScheduler> m_parserScheduler; - OwnPtr<PreloadScanner> m_preloadScanner; + bool m_endWasDelayed; + int m_writeNestingLevel; }; -void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); - -UChar decodeNamedEntity(const char*); - -} // namespace WebCore +} -#endif // HTMLTokenizer_h +#endif |