1 files changed, 27 insertions, 9 deletions
diff --git a/WebCore/html/parser/HTMLTokenizer.h b/WebCore/html/parser/HTMLTokenizer.h
index bab77f3..f16b049 100644
--- a/WebCore/html/parser/HTMLTokenizer.h
+++ b/WebCore/html/parser/HTMLTokenizer.h
@@ -36,6 +36,7 @@
 namespace WebCore {
 
 class Element;
+class Frame;
 class HTMLToken;
 
 class HTMLTokenizer : public Noncopyable {
@@ -96,7 +97,6 @@ public:
         CommentEndDashState,
         CommentEndState,
         CommentEndBangState,
-        CommentEndSpaceState,
         DOCTYPEState,
         BeforeDOCTYPENameState,
         DOCTYPENameState,
@@ -119,12 +119,12 @@ public:
         CDATASectionDoubleRightSquareBracketState,
     };
 
-    static PassOwnPtr<HTMLTokenizer> create() { return adoptPtr(new HTMLTokenizer); }
+    static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
     ~HTMLTokenizer();
 
     void reset();
 
-    // This function returns true if it emits a token.  Otherwise, callers
+    // This function returns true if it emits a token. Otherwise, callers
     // must provide the same (in progress) token on the next call (unless
     // they call reset() first).
     bool nextToken(SegmentedString&, HTMLToken&);
@@ -135,6 +135,22 @@ public:
     State state() const { return m_state; }
     void setState(State state) { m_state = state; }
 
+    // Updates the tokenizer's state according to the given tag name. This is
+    // an approximation of how the tree builder would update the tokenizer's
+    // state. This method is useful for approximating HTML tokenization. To
+    // get exactly the correct tokenization, you need the real tree builder.
+    //
+    // The main failures in the approximation are as follows:
+    //
+    //  * The first set of character tokens emitted for a <pre> element might
+    //    contain an extra leading newline.
+    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
+    //    tree builder's insertion mode.
+    //  * CDATA sections in foreign content will be tokenized as bogus comments
+    //    instead of as character tokens.
+    //
+    void updateStateFor(const AtomicString& tagName, Frame*);
+
     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
     void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
@@ -177,8 +193,8 @@ private:
 
             // Every branch in this function is expensive, so we have a
             // fast-reject branch for characters that don't require special
-            // handling.  Please run the parser benchmark whenever you touch
-            // this function.  It's very hot.
+            // handling. Please run the parser benchmark whenever you touch
+            // this function. It's very hot.
             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
             if (m_nextInputCharacter & ~specialCharacterMask) {
                 m_skipNextNewLine = false;
@@ -238,7 +254,7 @@ private:
         bool m_skipNextNewLine;
     };
 
-    HTMLTokenizer();
+    HTMLTokenizer(bool usePreHTML5ParserQuirks);
 
     inline bool processEntity(SegmentedString&);
 
@@ -257,7 +273,7 @@ private:
     inline bool temporaryBufferIs(const String&);
 
     // Sometimes we speculatively consume input characters and we don't
-    // know whether they represent end tags or RCDATA, etc.  These
+    // know whether they represent end tags or RCDATA, etc. These
     // functions help manage these state.
     inline void addToPossibleEndTag(UChar cc);
     inline void saveEndTagNameIfNeeded();
@@ -269,7 +285,7 @@ private:
 
     Vector<UChar, 32> m_appropriateEndTagName;
 
-    // m_token is owned by the caller.  If nextToken is not on the stack,
+    // m_token is owned by the caller. If nextToken is not on the stack,
     // this member might be pointing to unallocated memory.
     HTMLToken* m_token;
     int m_lineNumber;
@@ -282,7 +298,7 @@ private:
     Vector<UChar, 32> m_temporaryBuffer;
 
     // We occationally want to emit both a character token and an end tag
-    // token (e.g., when lexing script).  We buffer the name of the end tag
+    // token (e.g., when lexing script). We buffer the name of the end tag
     // token here so we remember it next time we re-enter the tokenizer.
     Vector<UChar, 32> m_bufferedEndTagName;
 
@@ -291,6 +307,8 @@ private:
 
     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     InputStreamPreprocessor m_inputStreamPreprocessor;
+
+    bool m_usePreHTML5ParserQuirks;
 };
 
 }