summaryrefslogtreecommitdiffstats
path: root/WebCore/html/parser/HTMLTokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/html/parser/HTMLTokenizer.h')
-rw-r--r--WebCore/html/parser/HTMLTokenizer.h36
1 files changed, 27 insertions, 9 deletions
diff --git a/WebCore/html/parser/HTMLTokenizer.h b/WebCore/html/parser/HTMLTokenizer.h
index bab77f3..f16b049 100644
--- a/WebCore/html/parser/HTMLTokenizer.h
+++ b/WebCore/html/parser/HTMLTokenizer.h
@@ -36,6 +36,7 @@
namespace WebCore {
class Element;
+class Frame;
class HTMLToken;
class HTMLTokenizer : public Noncopyable {
@@ -96,7 +97,6 @@ public:
CommentEndDashState,
CommentEndState,
CommentEndBangState,
- CommentEndSpaceState,
DOCTYPEState,
BeforeDOCTYPENameState,
DOCTYPENameState,
@@ -119,12 +119,12 @@ public:
CDATASectionDoubleRightSquareBracketState,
};
- static PassOwnPtr<HTMLTokenizer> create() { return adoptPtr(new HTMLTokenizer); }
+ static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
~HTMLTokenizer();
void reset();
- // This function returns true if it emits a token. Otherwise, callers
+ // This function returns true if it emits a token. Otherwise, callers
// must provide the same (in progress) token on the next call (unless
// they call reset() first).
bool nextToken(SegmentedString&, HTMLToken&);
@@ -135,6 +135,22 @@ public:
State state() const { return m_state; }
void setState(State state) { m_state = state; }
+ // Updates the tokenizer's state according to the given tag name. This is
+ // an approximation of how the tree builder would update the tokenizer's
+ // state. This method is useful for approximating HTML tokenization. To
+ // get exactly the correct tokenization, you need the real tree builder.
+ //
+ // The main failures in the approximation are as follows:
+ //
+ // * The first set of character tokens emitted for a <pre> element might
+ // contain an extra leading newline.
+ // * The replacement of U+0000 with U+FFFD will not be sensitive to the
+ // tree builder's insertion mode.
+ // * CDATA sections in foreign content will be tokenized as bogus comments
+ // instead of as character tokens.
+ //
+ void updateStateFor(const AtomicString& tagName, Frame*);
+
// Hack to skip leading newline in <pre>/<listing> for authoring ease.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
@@ -177,8 +193,8 @@ private:
// Every branch in this function is expensive, so we have a
// fast-reject branch for characters that don't require special
- // handling. Please run the parser benchmark whenever you touch
- // this function. It's very hot.
+ // handling. Please run the parser benchmark whenever you touch
+ // this function. It's very hot.
static const UChar specialCharacterMask = '\n' | '\r' | '\0';
if (m_nextInputCharacter & ~specialCharacterMask) {
m_skipNextNewLine = false;
@@ -238,7 +254,7 @@ private:
bool m_skipNextNewLine;
};
- HTMLTokenizer();
+ HTMLTokenizer(bool usePreHTML5ParserQuirks);
inline bool processEntity(SegmentedString&);
@@ -257,7 +273,7 @@ private:
inline bool temporaryBufferIs(const String&);
// Sometimes we speculatively consume input characters and we don't
- // know whether they represent end tags or RCDATA, etc. These
+ // know whether they represent end tags or RCDATA, etc. These
// functions help manage this state.
inline void addToPossibleEndTag(UChar cc);
inline void saveEndTagNameIfNeeded();
@@ -269,7 +285,7 @@ private:
Vector<UChar, 32> m_appropriateEndTagName;
- // m_token is owned by the caller. If nextToken is not on the stack,
+ // m_token is owned by the caller. If nextToken is not on the stack,
// this member might be pointing to unallocated memory.
HTMLToken* m_token;
int m_lineNumber;
@@ -282,7 +298,7 @@ private:
Vector<UChar, 32> m_temporaryBuffer;
// We occasionally want to emit both a character token and an end tag
- // token (e.g., when lexing script). We buffer the name of the end tag
+ // token (e.g., when lexing script). We buffer the name of the end tag
// token here so we remember it next time we re-enter the tokenizer.
Vector<UChar, 32> m_bufferedEndTagName;
@@ -291,6 +307,8 @@ private:
// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
InputStreamPreprocessor m_inputStreamPreprocessor;
+
+ bool m_usePreHTML5ParserQuirks;
};
}