/* Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef HTMLTokenizer_h #define HTMLTokenizer_h #include "CachedResourceClient.h" #include "CachedResourceHandle.h" #include "NamedMappedAttrMap.h" #include "SegmentedString.h" #include "Timer.h" #include "Tokenizer.h" #include #include #include namespace WebCore { class CachedScript; class DocumentFragment; class Document; class HTMLDocument; class HTMLScriptElement; class HTMLViewSourceDocument; class FrameView; class HTMLParser; class Node; class PreloadScanner; class ScriptSourceCode; /** * @internal * represents one HTML tag. Consists of a numerical id, and the list * of attributes. Can also represent text. In this case the id = 0 and * text contains the text. */ struct Token { Token() : beginTag(true) , selfClosingTag(false) , brokenXMLStyle(false) , m_sourceInfo(0) { } ~Token() { } void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } void reset() { attrs = 0; text = 0; tagName = nullAtom; beginTag = true; selfClosingTag = false; brokenXMLStyle = false; if (m_sourceInfo) m_sourceInfo->clear(); } void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector); m_sourceInfo->append(c); } RefPtr attrs; RefPtr text; AtomicString tagName; bool beginTag; bool selfClosingTag; bool brokenXMLStyle; OwnPtr > m_sourceInfo; }; enum DoctypeState { DoctypeBegin, DoctypeBeforeName, DoctypeName, DoctypeAfterName, DoctypeBeforePublicID, DoctypePublicID, DoctypeAfterPublicID, DoctypeBeforeSystemID, DoctypeSystemID, DoctypeAfterSystemID, DoctypeBogus }; class DoctypeToken { public: DoctypeToken() {} void reset() { m_name.clear(); m_publicID.clear(); m_systemID.clear(); m_state = DoctypeBegin; m_source.clear(); } DoctypeState state() { return m_state; } void setState(DoctypeState s) { m_state = s; } Vector m_name; Vector m_publicID; Vector m_systemID; DoctypeState m_state; Vector m_source; }; //----------------------------------------------------------------------------- class HTMLTokenizer : public Tokenizer, public CachedResourceClient { public: HTMLTokenizer(HTMLDocument*, bool reportErrors); HTMLTokenizer(HTMLViewSourceDocument*); HTMLTokenizer(DocumentFragment*); virtual ~HTMLTokenizer(); virtual void write(const SegmentedString&, bool appendData); virtual void finish(); virtual void setForceSynchronous(bool force); virtual bool isWaitingForScripts() const; virtual void stopParsing(); virtual bool processingData() const; virtual int executingScript() const { return m_executingScript; } virtual int lineNumber() const { return m_lineNumber; } virtual int columnNumber() const { return 1; } bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } virtual void executeScriptsWaitingForStylesheets(); virtual bool isHTMLTokenizer() const { return true; } HTMLParser* htmlParser() const { return m_parser.get(); } private: class State; // Where we are in parsing a tag void begin(); void end(); void reset(); PassRefPtr processToken(); void processDoctypeToken(); State processListing(SegmentedString, State); State parseComment(SegmentedString&, State); State parseDoctype(SegmentedString&, State); State parseServer(SegmentedString&, State); State parseText(SegmentedString&, State); State parseNonHTMLText(SegmentedString&, State); State parseTag(SegmentedString&, State); State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); State parseProcessingInstruction(SegmentedString&, State); State scriptHandler(State); State scriptExecution(const ScriptSourceCode&, State); void setSrc(const SegmentedString&); // check if we have enough space in the buffer. // if not enlarge it inline void checkBuffer(int len = 10) { if ((m_dest - m_buffer) > m_bufferSize - len) enlargeBuffer(len); } inline void checkScriptBuffer(int len = 10) { if (m_scriptCodeSize + len >= m_scriptCodeCapacity) enlargeScriptBuffer(len); } void enlargeBuffer(int len); void enlargeScriptBuffer(int len); bool continueProcessing(int& processedCount, double startTime, State&); void timerFired(Timer*); void allDataProcessed(); // from CachedResourceClient void notifyFinished(CachedResource*); // Internal buffers /////////////////// UChar* m_buffer; int m_bufferSize; UChar* m_dest; Token m_currentToken; // This buffer holds the raw characters we've seen between the beginning of // the attribute name and the first character of the attribute value. Vector m_rawAttributeBeforeValue; // Tokenizer flags ////////////////// // are we in quotes within a html tag enum { NoQuote, SingleQuote, DoubleQuote } tquote; // Are we in a &... character entity description? enum EntityState { NoEntity = 0, SearchEntity = 1, NumericSearch = 2, Hexadecimal = 3, Decimal = 4, EntityName = 5, SearchSemicolon = 6 }; unsigned EntityUnicodeValue; enum TagState { NoTag = 0, TagName = 1, SearchAttribute = 2, AttributeName = 3, SearchEqual = 4, SearchValue = 5, QuotedValue = 6, Value = 7, SearchEnd = 8 }; class State { public: State() : m_bits(0) { } TagState tagState() const { return static_cast(m_bits & TagMask); } void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } EntityState entityState() const { return static_cast((m_bits & EntityMask) >> EntityShift); } void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } bool inScript() const { return testBit(InScript); } void setInScript(bool v) { setBit(InScript, v); } bool inStyle() const { return testBit(InStyle); } void setInStyle(bool v) { setBit(InStyle, v); } bool inXmp() const { return testBit(InXmp); } void setInXmp(bool v) { setBit(InXmp, v); } bool inTitle() const { return testBit(InTitle); } void setInTitle(bool v) { setBit(InTitle, v); } bool inIFrame() const { return testBit(InIFrame); } void setInIFrame(bool v) { setBit(InIFrame, v); } bool inPlainText() const { return testBit(InPlainText); } void setInPlainText(bool v) { setBit(InPlainText, v); } bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } bool inComment() const { return testBit(InComment); } void setInComment(bool v) { setBit(InComment, v); } bool inDoctype() const { return testBit(InDoctype); } void setInDoctype(bool v) { setBit(InDoctype, v); } bool inTextArea() const { return testBit(InTextArea); } void setInTextArea(bool v) { setBit(InTextArea, v); } bool escaped() const { return testBit(Escaped); } void setEscaped(bool v) { setBit(Escaped, v); } bool inServer() const { return testBit(InServer); } void setInServer(bool v) { setBit(InServer, v); } bool skipLF() const { return testBit(SkipLF); } void setSkipLF(bool v) { setBit(SkipLF, v); } bool startTag() const { return testBit(StartTag); } void setStartTag(bool v) { setBit(StartTag, v); } bool discardLF() const { return testBit(DiscardLF); } void setDiscardLF(bool v) { setBit(DiscardLF, v); } bool allowYield() const { return testBit(AllowYield); } void setAllowYield(bool v) { setBit(AllowYield, v); } bool loadingExtScript() const { return testBit(LoadingExtScript); } void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } bool forceSynchronous() const { return testBit(ForceSynchronous); } void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } bool hasTagState() const { return m_bits & TagMask; } bool hasEntityState() const { return m_bits & EntityMask; } bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } private: static const int EntityShift = 4; enum StateBits { TagMask = (1 << 4) - 1, EntityMask = (1 << 7) - (1 << 4), InScript = 1 << 7, InStyle = 1 << 8, // Bit 9 unused InXmp = 1 << 10, InTitle = 1 << 11, InPlainText = 1 << 12, InProcessingInstruction = 1 << 13, InComment = 1 << 14, InTextArea = 1 << 15, Escaped = 1 << 16, InServer = 1 << 17, SkipLF = 1 << 18, StartTag = 1 << 19, DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard AllowYield = 1 << 21, LoadingExtScript = 1 << 22, ForceSynchronous = 1 << 23, InIFrame = 1 << 24, InDoctype = 1 << 25 }; void setBit(StateBits bit, bool value) { if (value) m_bits |= bit; else m_bits &= ~bit; } bool testBit(StateBits bit) const { return m_bits & bit; } unsigned m_bits; }; State m_state; DoctypeToken m_doctypeToken; int m_doctypeSearchCount; int m_doctypeSecondarySearchCount; bool m_brokenServer; // Name of an attribute that we just scanned. AtomicString m_attrName; // Used to store the code of a scripting sequence UChar* m_scriptCode; // Size of the script sequenze stored in @ref #scriptCode int m_scriptCodeSize; // Maximal size that can be stored in @ref #scriptCode int m_scriptCodeCapacity; // resync point of script code size int m_scriptCodeResync; // Stores characters if we are scanning for a string like "" UChar searchBuffer[10]; // Counts where we are in the string we are scanning for int searchCount; // the stopper string const char* m_searchStopper; int m_searchStopperLength; // if no more data is coming, just parse what we have (including ext scripts that // may be still downloading) and finish bool m_noMoreData; // URL to get source code of script from String m_scriptTagSrcAttrValue; String m_scriptTagCharsetAttrValue; // the HTML code we will parse after the external script we are waiting for has loaded SegmentedString m_pendingSrc; // the HTML code we will parse after this particular script has // loaded, but before all pending HTML SegmentedString* m_currentPrependingSrc; // true if we are executing a script while parsing a document. This causes the parsing of // the output of the script to be postponed until after the script has finished executing int m_executingScript; Deque > m_pendingScripts; RefPtr m_scriptNode; bool m_requestingScript; bool m_hasScriptsWaitingForStylesheets; // if we found one broken comment, there are most likely others as well // store a flag to get rid of the O(n^2) behaviour in such a case. bool m_brokenComments; // current line number int m_lineNumber; int m_currentScriptTagStartLineNumber; int m_currentTagStartLineNumber; double m_tokenizerTimeDelay; int m_tokenizerChunkSize; // The timer for continued processing. Timer m_timer; // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. // So any fixed number might be too small, but rather than rewriting all usage of this buffer // we'll just make it large enough to handle all imaginable cases. #define CBUFLEN 1024 UChar m_cBuffer[CBUFLEN + 2]; unsigned int m_cBufferPos; SegmentedString m_src; Document* m_doc; OwnPtr m_parser; bool m_inWrite; bool m_fragment; OwnPtr m_preloadScanner; }; void parseHTMLDocumentFragment(const String&, DocumentFragment*); UChar decodeNamedEntity(const char*); } // namespace WebCore #endif // HTMLTokenizer_h