diff options
Diffstat (limited to 'WebCore/html/HTML5Lexer.h')
-rw-r--r-- | WebCore/html/HTML5Lexer.h | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/WebCore/html/HTML5Lexer.h b/WebCore/html/HTML5Lexer.h new file mode 100644 index 0000000..6d61cc2 --- /dev/null +++ b/WebCore/html/HTML5Lexer.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HTML5Lexer_h +#define HTML5Lexer_h + +#include "AtomicString.h" +#include "SegmentedString.h" +#include <wtf/Noncopyable.h> +#include <wtf/Vector.h> + +namespace WebCore { + + class HTML5Token; + + class HTML5Lexer : public Noncopyable { + public: + enum State { + DataState, + CharacterReferenceInDataState, + RCDATAState, + CharacterReferenceInRCDATAState, + RAWTEXTState, + ScriptDataState, + PLAINTEXTState, + TagOpenState, + EndTagOpenState, + TagNameState, + RCDATALessThanSignState, + RCDATAEndTagOpenState, + RCDATAEndTagNameState, + RAWTEXTLessThanSignState, + RAWTEXTEndTagOpenState, + RAWTEXTEndTagNameState, + ScriptDataLessThanSignState, + ScriptDataEndTagOpenState, + ScriptDataEndTagNameState, + ScriptDataEscapeStartState, + ScriptDataEscapeStartDashState, + ScriptDataEscapedState, + ScriptDataEscapedDashState, + ScriptDataEscapedDashDashState, + ScriptDataEscapedLessThanSignState, + ScriptDataEscapedEndTagOpenState, + ScriptDataEscapedEndTagNameState, + ScriptDataDoubleEscapeStartState, + ScriptDataDoubleEscapedState, + ScriptDataDoubleEscapedDashState, + ScriptDataDoubleEscapedDashDashState, + ScriptDataDoubleEscapedLessThanSignState, + ScriptDataDoubleEscapeEndState, + BeforeAttributeNameState, + AttributeNameState, + AfterAttributeNameState, + BeforeAttributeValueState, + AttributeValueDoubleQuotedState, + AttributeValueSingleQuotedState, + AttributeValueUnquotedState, + CharacterReferenceInAttributeValueState, + AfterAttributeValueQuotedState, + SelfClosingStartTagState, + BogusCommentState, + MarkupDeclarationOpenState, + CommentStartState, + CommentStartDashState, + CommentState, + CommentEndDashState, + CommentEndState, + CommentEndBangState, + CommentEndSpaceState, + DOCTYPEState, + BeforeDOCTYPENameState, + DOCTYPENameState, + AfterDOCTYPENameState, + AfterDOCTYPEPublicKeywordState, + BeforeDOCTYPEPublicIdentifierState, + DOCTYPEPublicIdentifierDoubleQuotedState, + DOCTYPEPublicIdentifierSingleQuotedState, + AfterDOCTYPEPublicIdentifierState, + BetweenDOCTYPEPublicAndSystemIdentifiersState, + AfterDOCTYPESystemKeywordState, + BeforeDOCTYPESystemIdentifierState, + DOCTYPESystemIdentifierDoubleQuotedState, + DOCTYPESystemIdentifierSingleQuotedState, + AfterDOCTYPESystemIdentifierState, + BogusDOCTYPEState, + CDATASectionState, + TokenizingCharacterReferencesState, + }; + + HTML5Lexer(); + ~HTML5Lexer(); + + void reset(); + + // This function returns true if it emits a token. Otherwise, callers + // must provide the same (in progress) token on the next call (unless + // they call reset() first). + bool nextToken(SegmentedString&, HTML5Token&); + + void setState(State state) { m_state = state; } + + static unsigned consumeEntity(SegmentedString&, bool& notEnoughCharacters); + + private: + inline void emitCharacter(UChar); + inline void emitParseError(); + inline void emitCurrentToken(); + inline void emitCurrentDoctypeToken(); + + inline bool temporaryBufferIs(const char*); + + inline bool isAppropriateEndTag(); + + State m_state; + + AtomicString m_appropriateEndTagName; + + // m_token is owned by the caller. If nextToken is not on the stack, + // this member might be pointing to unallocated memory. + HTML5Token* m_token; + + bool m_emitPending; + + // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer + Vector<UChar, 32> m_temporaryBuffer; + + // We occationally want to emit both a character token and an end tag + // token (e.g., when lexing script). We buffer the name of the end tag + // token here so we remember it next time we re-enter the lexer. + Vector<UChar, 32> m_bufferedEndTagName; + + // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character + UChar m_additionalAllowedCharacter; + }; + +} + +#endif |