diff options
Diffstat (limited to 'WebCore/html/parser/HTMLTokenizer.h')
-rw-r--r-- | WebCore/html/parser/HTMLTokenizer.h | 36 |
1 files changed, 27 insertions, 9 deletions
diff --git a/WebCore/html/parser/HTMLTokenizer.h b/WebCore/html/parser/HTMLTokenizer.h index bab77f3..f16b049 100644 --- a/WebCore/html/parser/HTMLTokenizer.h +++ b/WebCore/html/parser/HTMLTokenizer.h @@ -36,6 +36,7 @@ namespace WebCore { class Element; +class Frame; class HTMLToken; class HTMLTokenizer : public Noncopyable { @@ -96,7 +97,6 @@ public: CommentEndDashState, CommentEndState, CommentEndBangState, - CommentEndSpaceState, DOCTYPEState, BeforeDOCTYPENameState, DOCTYPENameState, @@ -119,12 +119,12 @@ public: CDATASectionDoubleRightSquareBracketState, }; - static PassOwnPtr<HTMLTokenizer> create() { return adoptPtr(new HTMLTokenizer); } + static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); } ~HTMLTokenizer(); void reset(); - // This function returns true if it emits a token. Otherwise, callers + // This function returns true if it emits a token. Otherwise, callers // must provide the same (in progress) token on the next call (unless // they call reset() first). bool nextToken(SegmentedString&, HTMLToken&); @@ -135,6 +135,22 @@ public: State state() const { return m_state; } void setState(State state) { m_state = state; } + // Updates the tokenizer's state according to the given tag name. This is + // an approximation of how the tree builder would update the tokenizer's + // state. This method is useful for approximating HTML tokenization. To + // get exactly the correct tokenization, you need the real tree builder. + // + // The main failures in the approximation are as follows: + // + // * The first set of character tokens emitted for a <pre> element might + // contain an extra leading newline. + // * The replacement of U+0000 with U+FFFD will not be sensitive to the + // tree builder's insertion mode. + // * CDATA sections in foreign content will be tokenized as bogus comments + // instead of as character tokens. + // + void updateStateFor(const AtomicString& tagName, Frame*); + // Hack to skip leading newline in <pre>/<listing> for authoring ease. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; } @@ -177,8 +193,8 @@ private: // Every branch in this function is expensive, so we have a // fast-reject branch for characters that don't require special - // handling. Please run the parser benchmark whenever you touch - // this function. It's very hot. + // handling. Please run the parser benchmark whenever you touch + // this function. It's very hot. static const UChar specialCharacterMask = '\n' | '\r' | '\0'; if (m_nextInputCharacter & ~specialCharacterMask) { m_skipNextNewLine = false; @@ -238,7 +254,7 @@ private: bool m_skipNextNewLine; }; - HTMLTokenizer(); + HTMLTokenizer(bool usePreHTML5ParserQuirks); inline bool processEntity(SegmentedString&); @@ -257,7 +273,7 @@ private: inline bool temporaryBufferIs(const String&); // Sometimes we speculatively consume input characters and we don't - // know whether they represent end tags or RCDATA, etc. These + // know whether they represent end tags or RCDATA, etc. These // functions help manage these state. inline void addToPossibleEndTag(UChar cc); inline void saveEndTagNameIfNeeded(); @@ -269,7 +285,7 @@ private: Vector<UChar, 32> m_appropriateEndTagName; - // m_token is owned by the caller. If nextToken is not on the stack, + // m_token is owned by the caller. If nextToken is not on the stack, // this member might be pointing to unallocated memory. HTMLToken* m_token; int m_lineNumber; @@ -282,7 +298,7 @@ private: Vector<UChar, 32> m_temporaryBuffer; // We occationally want to emit both a character token and an end tag - // token (e.g., when lexing script). We buffer the name of the end tag + // token (e.g., when lexing script). We buffer the name of the end tag // token here so we remember it next time we re-enter the tokenizer. Vector<UChar, 32> m_bufferedEndTagName; @@ -291,6 +307,8 @@ private: // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream InputStreamPreprocessor m_inputStreamPreprocessor; + + bool m_usePreHTML5ParserQuirks; }; } |