summaryrefslogtreecommitdiffstats
path: root/WebCore/html/HTMLTokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/html/HTMLTokenizer.h')
-rw-r--r--WebCore/html/HTMLTokenizer.h33
1 files changed, 29 insertions, 4 deletions
diff --git a/WebCore/html/HTMLTokenizer.h b/WebCore/html/HTMLTokenizer.h
index 7ee9d41..0e9ba3a 100644
--- a/WebCore/html/HTMLTokenizer.h
+++ b/WebCore/html/HTMLTokenizer.h
@@ -34,6 +34,7 @@
namespace WebCore {
+class Element;
class HTMLToken;
class HTMLTokenizer : public Noncopyable {
@@ -132,14 +133,27 @@ public:
// Hack to skip leading newline in <pre>/<listing> for authoring ease.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
- void skipLeadingNewLineForListing() { m_skipLeadingNewLineForListing = true; }
+ void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
+
+ bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
+ void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
+
+ bool shouldSkipNullCharacters() const
+ {
+ return !m_forceNullCharacterReplacement
+ && (m_state == DataState
+ || m_state == RCDATAState
+ || m_state == RAWTEXTState
+ || m_state == PLAINTEXTState);
+ }
private:
// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
class InputStreamPreprocessor : public Noncopyable {
public:
- InputStreamPreprocessor()
- : m_nextInputCharacter('\0')
+ InputStreamPreprocessor(HTMLTokenizer* tokenizer)
+ : m_tokenizer(tokenizer)
+ , m_nextInputCharacter('\0')
, m_skipNextNewLine(false)
{
}
@@ -151,6 +165,7 @@ private:
// characters in |source| (after collapsing \r\n, etc).
ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
{
+ PeekAgain:
m_nextInputCharacter = *source;
// Every branch in this function is expensive, so we have a
@@ -179,8 +194,15 @@ private:
// a number of specific character values are parse errors and should be replaced
// by the replacement character. We suspect this is a problem with the spec as doing
// that filtering breaks surrogate pair handling and causes us not to match Minefield.
- if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source))
+ if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
+ if (m_tokenizer->shouldSkipNullCharacters()) {
+ source.advancePastNonNewline();
+ if (source.isEmpty())
+ return false;
+ goto PeekAgain;
+ }
m_nextInputCharacter = 0xFFFD;
+ }
}
return true;
}
@@ -202,6 +224,8 @@ private:
return source.isClosed() && source.length() == 1;
}
+ HTMLTokenizer* m_tokenizer;
+
// http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
UChar m_nextInputCharacter;
bool m_skipNextNewLine;
@@ -242,6 +266,7 @@ private:
int m_lineNumber;
bool m_skipLeadingNewLineForListing;
+ bool m_forceNullCharacterReplacement;
// http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
Vector<UChar, 32> m_temporaryBuffer;