diff options
Diffstat (limited to 'WebCore/html/HTMLDocumentParser.h')
-rw-r--r-- | WebCore/html/HTMLDocumentParser.h | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/WebCore/html/HTMLDocumentParser.h b/WebCore/html/HTMLDocumentParser.h new file mode 100644 index 0000000..6072a7e --- /dev/null +++ b/WebCore/html/HTMLDocumentParser.h @@ -0,0 +1,448 @@ +/* + Copyright (C) 1997 Martin Jones (mjones@kde.org) + (C) 1997 Torben Weis (weis@kde.org) + (C) 1998 Waldo Bastian (bastian@kde.org) + (C) 2001 Dirk Mueller (mueller@kde.org) + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ + +#ifndef HTMLTokenizer_h +#define HTMLTokenizer_h + +#include "CachedResourceClient.h" +#include "CachedResourceHandle.h" +#include "MappedAttributeEntry.h" +#include "NamedNodeMap.h" +#include "SegmentedString.h" +#include "Timer.h" +#include "DocumentParser.h" +#include <wtf/Deque.h> +#include <wtf/OwnPtr.h> +#include <wtf/Vector.h> + +namespace WebCore { + +class CachedScript; +class DocumentFragment; +class Document; +class HTMLDocument; +class HTMLScriptElement; +class HTMLViewSourceDocument; +class FrameView; +class LegacyHTMLTreeConstructor; +class Node; +class PreloadScanner; +class ScriptSourceCode; + +/** + * @internal + * represents one HTML tag. Consists of a numerical id, and the list + * of attributes. Can also represent text. In this case the id = 0 and + * text contains the text. + */ +struct Token { + Token() + : beginTag(true) + , selfClosingTag(false) + , brokenXMLStyle(false) + , m_sourceInfo(0) + { } + ~Token() { } + + void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); + + bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } + bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } + + void reset() + { + attrs = 0; + text = 0; + tagName = nullAtom; + beginTag = true; + selfClosingTag = false; + brokenXMLStyle = false; + if (m_sourceInfo) + m_sourceInfo->clear(); + } + + void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); } + + RefPtr<NamedNodeMap> attrs; + RefPtr<StringImpl> text; + AtomicString tagName; + bool beginTag; + bool selfClosingTag; + bool brokenXMLStyle; + OwnPtr<Vector<UChar> > m_sourceInfo; +}; + +enum DoctypeState { + DoctypeBegin, + DoctypeBeforeName, + DoctypeName, + DoctypeAfterName, + DoctypeBeforePublicID, + DoctypePublicID, + DoctypeAfterPublicID, + DoctypeBeforeSystemID, + DoctypeSystemID, + DoctypeAfterSystemID, + DoctypeBogus +}; + +class DoctypeToken { +public: + DoctypeToken() {} + + void reset() + { + m_name.clear(); + m_publicID.clear(); + m_systemID.clear(); + m_state = DoctypeBegin; + m_source.clear(); + m_forceQuirks = false; + } + + DoctypeState state() { return m_state; } + void setState(DoctypeState s) { m_state = s; } + + Vector<UChar> m_name; + Vector<UChar> m_publicID; + Vector<UChar> m_systemID; + DoctypeState m_state; + + Vector<UChar> m_source; + + bool m_forceQuirks; // Used by the HTML5 parser. +}; + +//----------------------------------------------------------------------------- + +// FIXME: This class does too much. Right now it is both an HTML lexer as well +// as handling all of the non-lexer-specific junk related to tokenizing HTML +// (like dealing with <script> tags). The HTML lexer bits should be pushed +// down into a separate HTML lexer class. + +class HTMLDocumentParser : public DocumentParser, public CachedResourceClient { +public: + HTMLDocumentParser(HTMLDocument*, bool reportErrors); + HTMLDocumentParser(HTMLViewSourceDocument*); + HTMLDocumentParser(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); + virtual ~HTMLDocumentParser(); + + virtual void write(const SegmentedString&, bool appendData); + virtual void finish(); + virtual bool forceSynchronous() const { return m_state.forceSynchronous(); } + virtual void setForceSynchronous(bool force); + virtual bool isWaitingForScripts() const; + virtual void stopParsing(); + virtual bool processingData() const; + virtual int executingScript() const { return m_executingScript; } + + virtual int lineNumber() const { return m_lineNumber; } + virtual int columnNumber() const { return 1; } + + bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } + + virtual void executeScriptsWaitingForStylesheets(); + + virtual LegacyHTMLTreeConstructor* htmlTreeConstructor() const { return m_treeConstructor.get(); } + virtual HTMLDocumentParser* asHTMLDocumentParser() { return this; } + +private: + class State; + + // Where we are in parsing a tag + void begin(); + void end(); + + void reset(); + + void willWriteHTML(const SegmentedString&); + ALWAYS_INLINE void advance(State&); + void didWriteHTML(); + + PassRefPtr<Node> processToken(); + void processDoctypeToken(); + + State processListing(SegmentedString, State); + State parseComment(SegmentedString&, State); + State parseDoctype(SegmentedString&, State); + State parseServer(SegmentedString&, State); + State parseText(SegmentedString&, State); + State parseNonHTMLText(SegmentedString&, State); + State parseTag(SegmentedString&, State); + State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); + State parseProcessingInstruction(SegmentedString&, State); + State scriptHandler(State); + State scriptExecution(const ScriptSourceCode&, State); + void setSrc(const SegmentedString&); + + // check if we have enough space in the buffer. + // if not enlarge it + inline void checkBuffer(int len = 10) + { + if ((m_dest - m_buffer) > m_bufferSize - len) + enlargeBuffer(len); + } + + inline void checkScriptBuffer(int len = 10) + { + if (m_scriptCodeSize + len >= m_scriptCodeCapacity) + enlargeScriptBuffer(len); + } + + void enlargeBuffer(int len); + void enlargeScriptBuffer(int len); + + bool continueProcessing(int& processedCount, double startTime, State&); + void timerFired(Timer<HTMLDocumentParser>*); + void allDataProcessed(); + + // from CachedResourceClient + void notifyFinished(CachedResource*); + + void executeExternalScriptsIfReady(); + void executeExternalScriptsTimerFired(Timer<HTMLDocumentParser>*); + bool continueExecutingExternalScripts(double startTime); + + // Internal buffers + /////////////////// + UChar* m_buffer; + int m_bufferSize; + UChar* m_dest; + + Token m_currentToken; + + // This buffer holds the raw characters we've seen between the beginning of + // the attribute name and the first character of the attribute value. + Vector<UChar, 32> m_rawAttributeBeforeValue; + + // DocumentParser flags + ////////////////// + // are we in quotes within a html tag + enum { NoQuote, SingleQuote, DoubleQuote } tquote; + + // Are we in a &... character entity description? + enum EntityState { + NoEntity = 0, + SearchEntity = 1, + NumericSearch = 2, + Hexadecimal = 3, + Decimal = 4, + EntityName = 5, + SearchSemicolon = 6 + }; + unsigned EntityUnicodeValue; + + enum TagState { + NoTag = 0, + TagName = 1, + SearchAttribute = 2, + AttributeName = 3, + SearchEqual = 4, + SearchValue = 5, + QuotedValue = 6, + Value = 7, + SearchEnd = 8 + }; + + class State { + public: + State() : m_bits(0) { } + + TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); } + void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } + EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); } + void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } + + bool inScript() const { return testBit(InScript); } + void setInScript(bool v) { setBit(InScript, v); } + bool inStyle() const { return testBit(InStyle); } + void setInStyle(bool v) { setBit(InStyle, v); } + bool inXmp() const { return testBit(InXmp); } + void setInXmp(bool v) { setBit(InXmp, v); } + bool inTitle() const { return testBit(InTitle); } + void setInTitle(bool v) { setBit(InTitle, v); } + bool inIFrame() const { return testBit(InIFrame); } + void setInIFrame(bool v) { setBit(InIFrame, v); } + bool inPlainText() const { return testBit(InPlainText); } + void setInPlainText(bool v) { setBit(InPlainText, v); } + bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } + void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } + bool inComment() const { return testBit(InComment); } + void setInComment(bool v) { setBit(InComment, v); } + bool inDoctype() const { return testBit(InDoctype); } + void setInDoctype(bool v) { setBit(InDoctype, v); } + bool inTextArea() const { return testBit(InTextArea); } + void setInTextArea(bool v) { setBit(InTextArea, v); } + bool escaped() const { return testBit(Escaped); } + void setEscaped(bool v) { setBit(Escaped, v); } + bool inServer() const { return testBit(InServer); } + void setInServer(bool v) { setBit(InServer, v); } + bool skipLF() const { return testBit(SkipLF); } + void setSkipLF(bool v) { setBit(SkipLF, v); } + bool startTag() const { return testBit(StartTag); } + void setStartTag(bool v) { setBit(StartTag, v); } + bool discardLF() const { return testBit(DiscardLF); } + void setDiscardLF(bool v) { setBit(DiscardLF, v); } + bool allowYield() const { return testBit(AllowYield); } + void setAllowYield(bool v) { setBit(AllowYield, v); } + bool loadingExtScript() const { return testBit(LoadingExtScript); } + void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } + bool forceSynchronous() const { return testBit(ForceSynchronous); } + void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } + + bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } + bool hasTagState() const { return m_bits & TagMask; } + bool hasEntityState() const { return m_bits & EntityMask; } + + bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } + + private: + static const int EntityShift = 4; + enum StateBits { + TagMask = (1 << 4) - 1, + EntityMask = (1 << 7) - (1 << 4), + InScript = 1 << 7, + InStyle = 1 << 8, + // Bit 9 unused + InXmp = 1 << 10, + InTitle = 1 << 11, + InPlainText = 1 << 12, + InProcessingInstruction = 1 << 13, + InComment = 1 << 14, + InTextArea = 1 << 15, + Escaped = 1 << 16, + InServer = 1 << 17, + SkipLF = 1 << 18, + StartTag = 1 << 19, + DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard + AllowYield = 1 << 21, + LoadingExtScript = 1 << 22, + ForceSynchronous = 1 << 23, + InIFrame = 1 << 24, + InDoctype = 1 << 25 + }; + + void setBit(StateBits bit, bool value) + { + if (value) + m_bits |= bit; + else + m_bits &= ~bit; + } + bool testBit(StateBits bit) const { return m_bits & bit; } + + unsigned m_bits; + }; + + State m_state; + + DoctypeToken m_doctypeToken; + int m_doctypeSearchCount; + int m_doctypeSecondarySearchCount; + + bool m_brokenServer; + + // Name of an attribute that we just scanned. + AtomicString m_attrName; + + // Used to store the code of a scripting sequence + UChar* m_scriptCode; + // Size of the script sequenze stored in @ref #scriptCode + int m_scriptCodeSize; + // Maximal size that can be stored in @ref #scriptCode + int m_scriptCodeCapacity; + // resync point of script code size + int m_scriptCodeResync; + + // Stores characters if we are scanning for a string like "</script>" + UChar searchBuffer[10]; + + // Counts where we are in the string we are scanning for + int searchCount; + // the stopper string + const char* m_searchStopper; + int m_searchStopperLength; + + // if no more data is coming, just parse what we have (including ext scripts that + // may be still downloading) and finish + bool m_noMoreData; + // URL to get source code of script from + String m_scriptTagSrcAttrValue; + String m_scriptTagCharsetAttrValue; + // the HTML code we will parse after the external script we are waiting for has loaded + SegmentedString m_pendingSrc; + + // the HTML code we will parse after this particular script has + // loaded, but before all pending HTML + SegmentedString* m_currentPrependingSrc; + + // true if we are executing a script while parsing a document. This causes the parsing of + // the output of the script to be postponed until after the script has finished executing + int m_executingScript; + Deque<CachedResourceHandle<CachedScript> > m_pendingScripts; + RefPtr<HTMLScriptElement> m_scriptNode; + + bool m_requestingScript; + bool m_hasScriptsWaitingForStylesheets; + + // if we found one broken comment, there are most likely others as well + // store a flag to get rid of the O(n^2) behaviour in such a case. + bool m_brokenComments; + // current line number + int m_lineNumber; + int m_currentScriptTagStartLineNumber; + int m_currentTagStartLineNumber; + + double m_tokenizerTimeDelay; + int m_tokenizerChunkSize; + + // The timer for continued processing. + Timer<HTMLDocumentParser> m_timer; + + // The timer for continued executing external scripts. + Timer<HTMLDocumentParser> m_externalScriptsTimer; + +// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. +// So any fixed number might be too small, but rather than rewriting all usage of this buffer +// we'll just make it large enough to handle all imaginable cases. +#define CBUFLEN 1024 + UChar m_cBuffer[CBUFLEN + 2]; + unsigned int m_cBufferPos; + + SegmentedString m_src; + Document* m_doc; + OwnPtr<LegacyHTMLTreeConstructor> m_treeConstructor; + bool m_inWrite; + bool m_fragment; + FragmentScriptingPermission m_scriptingPermission; + + OwnPtr<PreloadScanner> m_preloadScanner; +}; + +void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); + +UChar decodeNamedEntity(const char*); + +} // namespace WebCore + +#endif // HTMLTokenizer_h |