summary | refs | log | tree | commit | diff | stats
path: root/WebCore/html/HTMLDocumentParser.h
diff options
context:
space:
mode:
author: Kristian Monsen <kristianm@google.com>, 2010-06-28 16:42:48 +0100
committer: Kristian Monsen <kristianm@google.com>, 2010-07-02 10:29:56 +0100
commit: 06ea8e899e48f1f2f396b70e63fae369f2f23232 (patch)
tree: 20c1428cd05c76f32394ab354ea35ed99acd86d8 /WebCore/html/HTMLDocumentParser.h
parent: 72aad67af14193199e29cdd5c4ddc095a8b9a8a8 (diff)
download: external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.zip
download: external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.gz
download: external_webkit-06ea8e899e48f1f2f396b70e63fae369f2f23232.tar.bz2
Merge WebKit at r61871: Initial merge by git.
Change-Id: I6cff43abca9cc4782e088a469ad4f03f166a65d5
Diffstat (limited to 'WebCore/html/HTMLDocumentParser.h')
-rw-r--r-- WebCore/html/HTMLDocumentParser.h | 513
1 files changed, 98 insertions, 415 deletions
diff --git a/WebCore/html/HTMLDocumentParser.h b/WebCore/html/HTMLDocumentParser.h
index 6072a7e..c2e752f 100644
--- a/WebCore/html/HTMLDocumentParser.h
+++ b/WebCore/html/HTMLDocumentParser.h
@@ -1,448 +1,131 @@
/*
- Copyright (C) 1997 Martin Jones (mjones@kde.org)
- (C) 1997 Torben Weis (weis@kde.org)
- (C) 1998 Waldo Bastian (bastian@kde.org)
- (C) 2001 Dirk Mueller (mueller@kde.org)
- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public License
- along with this library; see the file COPYING.LIB. If not, write to
- the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- Boston, MA 02110-1301, USA.
-*/
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
-#ifndef HTMLTokenizer_h
-#define HTMLTokenizer_h
+#ifndef HTMLDocumentParser_h
+#define HTMLDocumentParser_h
#include "CachedResourceClient.h"
-#include "CachedResourceHandle.h"
-#include "MappedAttributeEntry.h"
-#include "NamedNodeMap.h"
+#include "FragmentScriptingPermission.h"
+#include "HTMLScriptRunnerHost.h"
+#include "HTMLToken.h"
+#include "HTMLInputStream.h"
#include "SegmentedString.h"
-#include "Timer.h"
#include "DocumentParser.h"
-#include <wtf/Deque.h>
+#include "Timer.h"
#include <wtf/OwnPtr.h>
-#include <wtf/Vector.h>
namespace WebCore {
-class CachedScript;
-class DocumentFragment;
class Document;
+class DocumentFragment;
class HTMLDocument;
-class HTMLScriptElement;
-class HTMLViewSourceDocument;
-class FrameView;
-class LegacyHTMLTreeConstructor;
-class Node;
-class PreloadScanner;
+class HTMLParserScheduler;
+class HTMLTokenizer;
+class HTMLScriptRunner;
+class HTMLTreeBuilder;
+class HTMLPreloadScanner;
+class LegacyHTMLTreeBuilder;
+class ScriptController;
class ScriptSourceCode;
-/**
- * @internal
- * represents one HTML tag. Consists of a numerical id, and the list
- * of attributes. Can also represent text. In this case the id = 0 and
- * text contains the text.
- */
-struct Token {
- Token()
- : beginTag(true)
- , selfClosingTag(false)
- , brokenXMLStyle(false)
- , m_sourceInfo(0)
- { }
- ~Token() { }
-
- void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
-
- bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
- bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
-
- void reset()
- {
- attrs = 0;
- text = 0;
- tagName = nullAtom;
- beginTag = true;
- selfClosingTag = false;
- brokenXMLStyle = false;
- if (m_sourceInfo)
- m_sourceInfo->clear();
- }
-
- void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
-
- RefPtr<NamedNodeMap> attrs;
- RefPtr<StringImpl> text;
- AtomicString tagName;
- bool beginTag;
- bool selfClosingTag;
- bool brokenXMLStyle;
- OwnPtr<Vector<UChar> > m_sourceInfo;
-};
-
-enum DoctypeState {
- DoctypeBegin,
- DoctypeBeforeName,
- DoctypeName,
- DoctypeAfterName,
- DoctypeBeforePublicID,
- DoctypePublicID,
- DoctypeAfterPublicID,
- DoctypeBeforeSystemID,
- DoctypeSystemID,
- DoctypeAfterSystemID,
- DoctypeBogus
-};
-
-class DoctypeToken {
-public:
- DoctypeToken() {}
-
- void reset()
- {
- m_name.clear();
- m_publicID.clear();
- m_systemID.clear();
- m_state = DoctypeBegin;
- m_source.clear();
- m_forceQuirks = false;
- }
-
- DoctypeState state() { return m_state; }
- void setState(DoctypeState s) { m_state = s; }
-
- Vector<UChar> m_name;
- Vector<UChar> m_publicID;
- Vector<UChar> m_systemID;
- DoctypeState m_state;
-
- Vector<UChar> m_source;
-
- bool m_forceQuirks; // Used by the HTML5 parser.
-};
-
-//-----------------------------------------------------------------------------
-
-// FIXME: This class does too much. Right now it is both an HTML lexer as well
-// as handling all of the non-lexer-specific junk related to tokenizing HTML
-// (like dealing with <script> tags). The HTML lexer bits should be pushed
-// down into a separate HTML lexer class.
-
-class HTMLDocumentParser : public DocumentParser, public CachedResourceClient {
+class HTMLDocumentParser : public DocumentParser, HTMLScriptRunnerHost, CachedResourceClient {
public:
+ // FIXME: These constructors should be made private and replaced by create() methods.
HTMLDocumentParser(HTMLDocument*, bool reportErrors);
- HTMLDocumentParser(HTMLViewSourceDocument*);
- HTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
+ HTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission);
virtual ~HTMLDocumentParser();
- virtual void write(const SegmentedString&, bool appendData);
- virtual void finish();
- virtual bool forceSynchronous() const { return m_state.forceSynchronous(); }
- virtual void setForceSynchronous(bool force);
- virtual bool isWaitingForScripts() const;
- virtual void stopParsing();
- virtual bool processingData() const;
- virtual int executingScript() const { return m_executingScript; }
-
- virtual int lineNumber() const { return m_lineNumber; }
- virtual int columnNumber() const { return 1; }
-
- bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
-
- virtual void executeScriptsWaitingForStylesheets();
+ // Exposed for HTMLParserScheduler
+ void resumeParsingAfterYield();
- virtual LegacyHTMLTreeConstructor* htmlTreeConstructor() const { return m_treeConstructor.get(); }
- virtual HTMLDocumentParser* asHTMLDocumentParser() { return this; }
+ static void parseDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
private:
- class State;
-
- // Where we are in parsing a tag
- void begin();
- void end();
-
- void reset();
-
- void willWriteHTML(const SegmentedString&);
- ALWAYS_INLINE void advance(State&);
- void didWriteHTML();
-
- PassRefPtr<Node> processToken();
- void processDoctypeToken();
-
- State processListing(SegmentedString, State);
- State parseComment(SegmentedString&, State);
- State parseDoctype(SegmentedString&, State);
- State parseServer(SegmentedString&, State);
- State parseText(SegmentedString&, State);
- State parseNonHTMLText(SegmentedString&, State);
- State parseTag(SegmentedString&, State);
- State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
- State parseProcessingInstruction(SegmentedString&, State);
- State scriptHandler(State);
- State scriptExecution(const ScriptSourceCode&, State);
- void setSrc(const SegmentedString&);
-
- // check if we have enough space in the buffer.
- // if not enlarge it
- inline void checkBuffer(int len = 10)
- {
- if ((m_dest - m_buffer) > m_bufferSize - len)
- enlargeBuffer(len);
- }
-
- inline void checkScriptBuffer(int len = 10)
- {
- if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
- enlargeScriptBuffer(len);
- }
-
- void enlargeBuffer(int len);
- void enlargeScriptBuffer(int len);
-
- bool continueProcessing(int& processedCount, double startTime, State&);
- void timerFired(Timer<HTMLDocumentParser>*);
- void allDataProcessed();
-
- // from CachedResourceClient
- void notifyFinished(CachedResource*);
-
- void executeExternalScriptsIfReady();
- void executeExternalScriptsTimerFired(Timer<HTMLDocumentParser>*);
- bool continueExecutingExternalScripts(double startTime);
-
- // Internal buffers
- ///////////////////
- UChar* m_buffer;
- int m_bufferSize;
- UChar* m_dest;
-
- Token m_currentToken;
-
- // This buffer holds the raw characters we've seen between the beginning of
- // the attribute name and the first character of the attribute value.
- Vector<UChar, 32> m_rawAttributeBeforeValue;
-
- // DocumentParser flags
- //////////////////
- // are we in quotes within a html tag
- enum { NoQuote, SingleQuote, DoubleQuote } tquote;
-
- // Are we in a &... character entity description?
- enum EntityState {
- NoEntity = 0,
- SearchEntity = 1,
- NumericSearch = 2,
- Hexadecimal = 3,
- Decimal = 4,
- EntityName = 5,
- SearchSemicolon = 6
- };
- unsigned EntityUnicodeValue;
-
- enum TagState {
- NoTag = 0,
- TagName = 1,
- SearchAttribute = 2,
- AttributeName = 3,
- SearchEqual = 4,
- SearchValue = 5,
- QuotedValue = 6,
- Value = 7,
- SearchEnd = 8
- };
-
- class State {
- public:
- State() : m_bits(0) { }
-
- TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
- void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
- EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
- void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
-
- bool inScript() const { return testBit(InScript); }
- void setInScript(bool v) { setBit(InScript, v); }
- bool inStyle() const { return testBit(InStyle); }
- void setInStyle(bool v) { setBit(InStyle, v); }
- bool inXmp() const { return testBit(InXmp); }
- void setInXmp(bool v) { setBit(InXmp, v); }
- bool inTitle() const { return testBit(InTitle); }
- void setInTitle(bool v) { setBit(InTitle, v); }
- bool inIFrame() const { return testBit(InIFrame); }
- void setInIFrame(bool v) { setBit(InIFrame, v); }
- bool inPlainText() const { return testBit(InPlainText); }
- void setInPlainText(bool v) { setBit(InPlainText, v); }
- bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
- void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
- bool inComment() const { return testBit(InComment); }
- void setInComment(bool v) { setBit(InComment, v); }
- bool inDoctype() const { return testBit(InDoctype); }
- void setInDoctype(bool v) { setBit(InDoctype, v); }
- bool inTextArea() const { return testBit(InTextArea); }
- void setInTextArea(bool v) { setBit(InTextArea, v); }
- bool escaped() const { return testBit(Escaped); }
- void setEscaped(bool v) { setBit(Escaped, v); }
- bool inServer() const { return testBit(InServer); }
- void setInServer(bool v) { setBit(InServer, v); }
- bool skipLF() const { return testBit(SkipLF); }
- void setSkipLF(bool v) { setBit(SkipLF, v); }
- bool startTag() const { return testBit(StartTag); }
- void setStartTag(bool v) { setBit(StartTag, v); }
- bool discardLF() const { return testBit(DiscardLF); }
- void setDiscardLF(bool v) { setBit(DiscardLF, v); }
- bool allowYield() const { return testBit(AllowYield); }
- void setAllowYield(bool v) { setBit(AllowYield, v); }
- bool loadingExtScript() const { return testBit(LoadingExtScript); }
- void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
- bool forceSynchronous() const { return testBit(ForceSynchronous); }
- void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
-
- bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
- bool hasTagState() const { return m_bits & TagMask; }
- bool hasEntityState() const { return m_bits & EntityMask; }
-
- bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
-
- private:
- static const int EntityShift = 4;
- enum StateBits {
- TagMask = (1 << 4) - 1,
- EntityMask = (1 << 7) - (1 << 4),
- InScript = 1 << 7,
- InStyle = 1 << 8,
- // Bit 9 unused
- InXmp = 1 << 10,
- InTitle = 1 << 11,
- InPlainText = 1 << 12,
- InProcessingInstruction = 1 << 13,
- InComment = 1 << 14,
- InTextArea = 1 << 15,
- Escaped = 1 << 16,
- InServer = 1 << 17,
- SkipLF = 1 << 18,
- StartTag = 1 << 19,
- DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
- AllowYield = 1 << 21,
- LoadingExtScript = 1 << 22,
- ForceSynchronous = 1 << 23,
- InIFrame = 1 << 24,
- InDoctype = 1 << 25
- };
-
- void setBit(StateBits bit, bool value)
- {
- if (value)
- m_bits |= bit;
- else
- m_bits &= ~bit;
- }
- bool testBit(StateBits bit) const { return m_bits & bit; }
-
- unsigned m_bits;
+ // DocumentParser
+ virtual void begin();
+ virtual void write(const SegmentedString&, bool isFromNetwork);
+ virtual void finish();
+ virtual bool finishWasCalled();
+ virtual bool processingData() const;
+ virtual void stopParsing();
+ virtual bool isWaitingForScripts() const;
+ virtual bool isExecutingScript() const;
+ virtual void executeScriptsWaitingForStylesheets();
+ virtual int lineNumber() const;
+ virtual int columnNumber() const;
+ // FIXME: HTMLFormControlElement accesses the LegacyHTMLTreeBuilder via this method.
+ // Remove this when the LegacyHTMLTreeBuilder is no longer used.
+ virtual LegacyHTMLTreeBuilder* htmlTreeBuilder() const;
+
+ // HTMLScriptRunnerHost
+ virtual void watchForLoad(CachedResource*);
+ virtual void stopWatchingForLoad(CachedResource*);
+ virtual bool shouldLoadExternalScriptFromSrc(const AtomicString&);
+ virtual HTMLInputStream& inputStream() { return m_input; }
+
+ // CachedResourceClient
+ virtual void notifyFinished(CachedResource*);
+
+ void willPumpLexer();
+ void didPumpLexer();
+
+ enum SynchronousMode {
+ AllowYield,
+ ForceSynchronous,
};
+ void pumpTokenizer(SynchronousMode);
+ void pumpTokenizerIfPossible(SynchronousMode);
- State m_state;
-
- DoctypeToken m_doctypeToken;
- int m_doctypeSearchCount;
- int m_doctypeSecondarySearchCount;
-
- bool m_brokenServer;
-
- // Name of an attribute that we just scanned.
- AtomicString m_attrName;
+ bool runScriptsForPausedTreeBuilder();
+ void resumeParsingAfterScriptExecution();
- // Used to store the code of a scripting sequence
- UChar* m_scriptCode;
- // Size of the script sequenze stored in @ref #scriptCode
- int m_scriptCodeSize;
- // Maximal size that can be stored in @ref #scriptCode
- int m_scriptCodeCapacity;
- // resync point of script code size
- int m_scriptCodeResync;
-
- // Stores characters if we are scanning for a string like "</script>"
- UChar searchBuffer[10];
-
- // Counts where we are in the string we are scanning for
- int searchCount;
- // the stopper string
- const char* m_searchStopper;
- int m_searchStopperLength;
-
- // if no more data is coming, just parse what we have (including ext scripts that
- // may be still downloading) and finish
- bool m_noMoreData;
- // URL to get source code of script from
- String m_scriptTagSrcAttrValue;
- String m_scriptTagCharsetAttrValue;
- // the HTML code we will parse after the external script we are waiting for has loaded
- SegmentedString m_pendingSrc;
-
- // the HTML code we will parse after this particular script has
- // loaded, but before all pending HTML
- SegmentedString* m_currentPrependingSrc;
-
- // true if we are executing a script while parsing a document. This causes the parsing of
- // the output of the script to be postponed until after the script has finished executing
- int m_executingScript;
- Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
- RefPtr<HTMLScriptElement> m_scriptNode;
-
- bool m_requestingScript;
- bool m_hasScriptsWaitingForStylesheets;
-
- // if we found one broken comment, there are most likely others as well
- // store a flag to get rid of the O(n^2) behaviour in such a case.
- bool m_brokenComments;
- // current line number
- int m_lineNumber;
- int m_currentScriptTagStartLineNumber;
- int m_currentTagStartLineNumber;
+ void attemptToEnd();
+ void endIfDelayed();
+ void end();
- double m_tokenizerTimeDelay;
- int m_tokenizerChunkSize;
+ bool isScheduledForResume() const;
+ bool inScriptExecution() const;
+ bool inWrite() const { return m_writeNestingLevel > 0; }
- // The timer for continued processing.
- Timer<HTMLDocumentParser> m_timer;
+ ScriptController* script() const;
- // The timer for continued executing external scripts.
- Timer<HTMLDocumentParser> m_externalScriptsTimer;
+ HTMLInputStream m_input;
-// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
-// So any fixed number might be too small, but rather than rewriting all usage of this buffer
-// we'll just make it large enough to handle all imaginable cases.
-#define CBUFLEN 1024
- UChar m_cBuffer[CBUFLEN + 2];
- unsigned int m_cBufferPos;
+ // We hold m_token here because it might be partially complete.
+ HTMLToken m_token;
- SegmentedString m_src;
- Document* m_doc;
- OwnPtr<LegacyHTMLTreeConstructor> m_treeConstructor;
- bool m_inWrite;
- bool m_fragment;
- FragmentScriptingPermission m_scriptingPermission;
+ OwnPtr<HTMLTokenizer> m_tokenizer;
+ OwnPtr<HTMLScriptRunner> m_scriptRunner;
+ OwnPtr<HTMLTreeBuilder> m_treeBuilder;
+ OwnPtr<HTMLPreloadScanner> m_preloadScanner;
+ OwnPtr<HTMLParserScheduler> m_parserScheduler;
- OwnPtr<PreloadScanner> m_preloadScanner;
+ bool m_endWasDelayed;
+ int m_writeNestingLevel;
};
-void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
-
-UChar decodeNamedEntity(const char*);
-
-} // namespace WebCore
+}
-#endif // HTMLTokenizer_h
+#endif