Merge WebKit at r75993: Initial merge by git.

Change-Id: I602bbdc3974787a3b0450456a30a7868286921c3
author: Ben Murdoch <benm@google.com> 2011-05-13 16:23:25 +0100
committer: Ben Murdoch <benm@google.com> 2011-05-16 11:35:02 +0100
commit: 65f03d4f644ce73618e5f4f50dd694b26f55ae12 (patch)
tree: f478babb801e720de7bfaee23443ffe029f58731 /Source/JavaScriptCore/yarr/YarrParser.h
parent: 47de4a2fb7262c7ebdb9cd133ad2c54c187454d0 (diff)
download: external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.zip
external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.tar.gz
external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.tar.bz2
1 files changed, 871 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h
new file mode 100644
index 0000000..8c5741a
--- /dev/null
+++ b/Source/JavaScriptCore/yarr/YarrParser.h
@@ -0,0 +1,871 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef YarrParser_h
+#define YarrParser_h
+
+#include "UString.h"
+#include "Yarr.h"
+#include <wtf/ASCIICType.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
+
+enum BuiltInCharacterClassID {
+    DigitClassID,
+    SpaceClassID,
+    WordClassID,
+    NewlineClassID,
+};
+
+// The Parser class should not be used directly - only via the Yarr::parse() method.
+template<class Delegate>
+class Parser {
+private:
+    template<class FriendDelegate>
+    friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
+
+    enum ErrorCode {
+        NoError,
+        PatternTooLarge,
+        QuantifierOutOfOrder,
+        QuantifierWithoutAtom,
+        MissingParentheses,
+        ParenthesesUnmatched,
+        ParenthesesTypeInvalid,
+        CharacterClassUnmatched,
+        CharacterClassOutOfOrder,
+        EscapeUnterminated,
+        NumberOfErrorCodes
+    };
+
+    /*
+     * CharacterClassParserDelegate:
+     *
+     * The class CharacterClassParserDelegate is used in the parsing of character
+     * classes.  This class handles detection of character ranges.  This class
+     * implements enough of the delegate interface such that it can be passed to
+     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
+     * to perform the parsing of escape characters in character sets.
+     */
+    class CharacterClassParserDelegate {
+    public:
+        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+            : m_delegate(delegate)
+            , m_err(err)
+            , m_state(Empty)
+        {
+        }
+
+        /*
+         * begin():
+         *
+         * Called at beginning of construction.
+         */
+        void begin(bool invert)
+        {
+            m_delegate.atomCharacterClassBegin(invert);
+        }
+
+        /*
+         * atomPatternCharacter():
+         *
+         * This method is called either from parseCharacterClass() (for an unescaped
+         * character in a character class), or from parseEscape(). In the former case
+         * the value true will be passed for the argument 'hyphenIsRange', and in this
+         * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
+         * is different to /[a\-z]/).
+         */
+        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+        {
+            switch (m_state) {
+            case AfterCharacterClass:
+                // Following a builtin character class we need look out for a hyphen.
+                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+                // If we see a hyphen following a charater class then unlike usual
+                // we'll report it to the delegate immediately, and put ourself into
+                // a poisoned state. Any following calls to add another character or
+                // character class will result in an error. (A hypen following a
+                // character-class is itself valid, but only  at the end of a regex).
+                if (hyphenIsRange && ch == '-') {
+                    m_delegate.atomCharacterClassAtom('-');
+                    m_state = AfterCharacterClassHyphen;
+                    return;
+                }
+                // Otherwise just fall through - cached character so treat this as Empty.
+
+            case Empty:
+                m_character = ch;
+                m_state = CachedCharacter;
+                return;
+
+            case CachedCharacter:
+                if (hyphenIsRange && ch == '-')
+                    m_state = CachedCharacterHyphen;
+                else {
+                    m_delegate.atomCharacterClassAtom(m_character);
+                    m_character = ch;
+                }
+                return;
+
+            case CachedCharacterHyphen:
+                if (ch < m_character) {
+                    m_err = CharacterClassOutOfOrder;
+                    return;
+                }
+                m_delegate.atomCharacterClassRange(m_character, ch);
+                m_state = Empty;
+                return;
+
+                // See coment in atomBuiltInCharacterClass below.
+                // This too is technically an error, per ECMA-262, and again we
+                // we chose to allow this.  Note a subtlely here that while we
+                // diverge from the spec's definition of CharacterRange we do
+                // remain in compliance with the grammar.  For example, consider
+                // the expression /[\d-a-z]/.  We comply with the grammar in
+                // this case by not allowing a-z to be matched as a range.
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassAtom(ch);
+                m_state = Empty;
+                return;
+            }
+        }
+
+        /*
+         * atomBuiltInCharacterClass():
+         *
+         * Adds a built-in character class, called by parseEscape().
+         */
+        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+        {
+            switch (m_state) {
+            case CachedCharacter:
+                // Flush the currently cached character, then fall through.
+                m_delegate.atomCharacterClassAtom(m_character);
+
+            case Empty:
+            case AfterCharacterClass:
+                m_state = AfterCharacterClass;
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                return;
+
+                // If we hit either of these cases, we have an invalid range that
+                // looks something like /[x-\d]/ or /[\d-\d]/.
+                // According to ECMA-262 this should be a syntax error, but
+                // empirical testing shows this to break teh webz.  Instead we
+                // comply with to the ECMA-262 grammar, and assume the grammar to
+                // have matched the range correctly, but tweak our interpretation
+                // of CharacterRange.  Effectively we implicitly handle the hyphen
+                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+            case CachedCharacterHyphen:
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+                // fall through
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                m_state = Empty;
+                return;
+            }
+        }
+
+        /*
+         * end():
+         *
+         * Called at end of construction.
+         */
+        void end()
+        {
+            if (m_state == CachedCharacter)
+                m_delegate.atomCharacterClassAtom(m_character);
+            else if (m_state == CachedCharacterHyphen) {
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+            }
+            m_delegate.atomCharacterClassEnd();
+        }
+
+        // parseEscape() should never call these delegate methods when
+        // invoked with inCharacterClass set.
+        void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
+        void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
+
+    private:
+        Delegate& m_delegate;
+        ErrorCode& m_err;
+        enum CharacterClassConstructionState {
+            Empty,
+            CachedCharacter,
+            CachedCharacterHyphen,
+            AfterCharacterClass,
+            AfterCharacterClassHyphen,
+        } m_state;
+        UChar m_character;
+    };
+
+    Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
+        : m_delegate(delegate)
+        , m_backReferenceLimit(backReferenceLimit)
+        , m_err(NoError)
+        , m_data(pattern.characters())
+        , m_size(pattern.length())
+        , m_index(0)
+        , m_parenthesesNestingDepth(0)
+    {
+    }
+    
+    /*
+     * parseEscape():
+     *
+     * Helper for parseTokens() AND parseCharacterClass().
+     * Unlike the other parser methods, this function does not report tokens
+     * directly to the member delegate (m_delegate), instead tokens are
+     * emitted to the delegate provided as an argument.  In the case of atom
+     * escapes, parseTokens() will call parseEscape() passing m_delegate as
+     * an argument, and as such the escape will be reported to the delegate.
+     *
+     * However this method may also be used by parseCharacterClass(), in which
+     * case a CharacterClassParserDelegate will be passed as the delegate that
+     * tokens should be added to.  A boolean flag is also provided to indicate
+     * whether that an escape in a CharacterClass is being parsed (some parsing
+     * rules change in this context).
+     *
+     * The boolean value returned by this method indicates whether the token
+     * parsed was an atom (outside of a characted class \b and \B will be
+     * interpreted as assertions).
+     */
+    template<bool inCharacterClass, class EscapeDelegate>
+    bool parseEscape(EscapeDelegate& delegate)
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '\\');
+        consume();
+
+        if (atEndOfPattern()) {
+            m_err = EscapeUnterminated;
+            return false;
+        }
+
+        switch (peek()) {
+        // Assertions
+        case 'b':
+            consume();
+            if (inCharacterClass)
+                delegate.atomPatternCharacter('\b');
+            else {
+                delegate.assertionWordBoundary(false);
+                return false;
+            }
+            break;
+        case 'B':
+            consume();
+            if (inCharacterClass)
+                delegate.atomPatternCharacter('B');
+            else {
+                delegate.assertionWordBoundary(true);
+                return false;
+            }
+            break;
+
+        // CharacterClassEscape
+        case 'd':
+            consume();
+            delegate.atomBuiltInCharacterClass(DigitClassID, false);
+            break;
+        case 's':
+            consume();
+            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+            break;
+        case 'w':
+            consume();
+            delegate.atomBuiltInCharacterClass(WordClassID, false);
+            break;
+        case 'D':
+            consume();
+            delegate.atomBuiltInCharacterClass(DigitClassID, true);
+            break;
+        case 'S':
+            consume();
+            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+            break;
+        case 'W':
+            consume();
+            delegate.atomBuiltInCharacterClass(WordClassID, true);
+            break;
+
+        // DecimalEscape
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9': {
+            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
+            // First, try to parse this as backreference.
+            if (!inCharacterClass) {
+                ParseState state = saveState();
+
+                unsigned backReference = consumeNumber();
+                if (backReference <= m_backReferenceLimit) {
+                    delegate.atomBackReference(backReference);
+                    break;
+                }
+
+                restoreState(state);
+            }
+            
+            // Not a backreference, and not octal.
+            if (peek() >= '8') {
+                delegate.atomPatternCharacter('\\');
+                break;
+            }
+
+            // Fall-through to handle this as an octal escape.
+        }
+
+        // Octal escape
+        case '0':
+            delegate.atomPatternCharacter(consumeOctal());
+            break;
+
+        // ControlEscape
+        case 'f':
+            consume();
+            delegate.atomPatternCharacter('\f');
+            break;
+        case 'n':
+            consume();
+            delegate.atomPatternCharacter('\n');
+            break;
+        case 'r':
+            consume();
+            delegate.atomPatternCharacter('\r');
+            break;
+        case 't':
+            consume();
+            delegate.atomPatternCharacter('\t');
+            break;
+        case 'v':
+            consume();
+            delegate.atomPatternCharacter('\v');
+            break;
+
+        // ControlLetter
+        case 'c': {
+            ParseState state = saveState();
+            consume();
+            if (!atEndOfPattern()) {
+                int control = consume();
+
+                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
+                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
+                    delegate.atomPatternCharacter(control & 0x1f);
+                    break;
+                }
+            }
+            restoreState(state);
+            delegate.atomPatternCharacter('\\');
+            break;
+        }
+
+        // HexEscape
+        case 'x': {
+            consume();
+            int x = tryConsumeHex(2);
+            if (x == -1)
+                delegate.atomPatternCharacter('x');
+            else
+                delegate.atomPatternCharacter(x);
+            break;
+        }
+
+        // UnicodeEscape
+        case 'u': {
+            consume();
+            int u = tryConsumeHex(4);
+            if (u == -1)
+                delegate.atomPatternCharacter('u');
+            else
+                delegate.atomPatternCharacter(u);
+            break;
+        }
+
+        // IdentityEscape
+        default:
+            delegate.atomPatternCharacter(consume());
+        }
+        
+        return true;
+    }
+
+    /*
+     * parseAtomEscape(), parseCharacterClassEscape():
+     *
+     * These methods alias to parseEscape().
+     */
+    bool parseAtomEscape()
+    {
+        return parseEscape<false>(m_delegate);
+    }
+    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
+    {
+        parseEscape<true>(delegate);
+    }
+
+    /*
+     * parseCharacterClass():
+     *
+     * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
+     * to an instance of CharacterClassParserDelegate, to describe the character class to the
+     * delegate.
+     */
+    void parseCharacterClass()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '[');
+        consume();
+
+        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+
+        characterClassConstructor.begin(tryConsume('^'));
+
+        while (!atEndOfPattern()) {
+            switch (peek()) {
+            case ']':
+                consume();
+                characterClassConstructor.end();
+                return;
+
+            case '\\':
+                parseCharacterClassEscape(characterClassConstructor);
+                break;
+
+            default:
+                characterClassConstructor.atomPatternCharacter(consume(), true);
+            }
+
+            if (m_err)
+                return;
+        }
+
+        m_err = CharacterClassUnmatched;
+    }
+
+    /*
+     * parseParenthesesBegin():
+     *
+     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
+     */
+    void parseParenthesesBegin()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == '(');
+        consume();
+
+        if (tryConsume('?')) {
+            if (atEndOfPattern()) {
+                m_err = ParenthesesTypeInvalid;
+                return;
+            }
+
+            switch (consume()) {
+            case ':':
+                m_delegate.atomParenthesesSubpatternBegin(false);
+                break;
+            
+            case '=':
+                m_delegate.atomParentheticalAssertionBegin();
+                break;
+
+            case '!':
+                m_delegate.atomParentheticalAssertionBegin(true);
+                break;
+            
+            default:
+                m_err = ParenthesesTypeInvalid;
+            }
+        } else
+            m_delegate.atomParenthesesSubpatternBegin();
+
+        ++m_parenthesesNestingDepth;
+    }
+
+    /*
+     * parseParenthesesEnd():
+     *
+     * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
+     */
+    void parseParenthesesEnd()
+    {
+        ASSERT(!m_err);
+        ASSERT(peek() == ')');
+        consume();
+
+        if (m_parenthesesNestingDepth > 0)
+            m_delegate.atomParenthesesEnd();
+        else
+            m_err = ParenthesesUnmatched;
+
+        --m_parenthesesNestingDepth;
+    }
+
+    /*
+     * parseQuantifier():
+     *
+     * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
+     */
+    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
+    {
+        ASSERT(!m_err);
+        ASSERT(min <= max);
+
+        if (lastTokenWasAnAtom)
+            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
+        else
+            m_err = QuantifierWithoutAtom;
+    }
+
+    /*
+     * parseTokens():
+     *
+     * This method loops over the input pattern reporting tokens to the delegate.
+     * The method returns when a parse error is detected, or the end of the pattern
+     * is reached.  One piece of state is tracked around the loop, which is whether
+     * the last token passed to the delegate was an atom (this is necessary to detect
+     * a parse error when a quantifier provided without an atom to quantify).
+     */
+    void parseTokens()
+    {
+        bool lastTokenWasAnAtom = false;
+
+        while (!atEndOfPattern()) {
+            switch (peek()) {
+            case '|':
+                consume();
+                m_delegate.disjunction();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '(':
+                parseParenthesesBegin();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case ')':
+                parseParenthesesEnd();
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '^':
+                consume();
+                m_delegate.assertionBOL();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '$':
+                consume();
+                m_delegate.assertionEOL();
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '.':
+                consume();
+                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '[':
+                parseCharacterClass();
+                lastTokenWasAnAtom = true;
+                break;
+
+            case '\\':
+                lastTokenWasAnAtom = parseAtomEscape();
+                break;
+
+            case '*':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '+':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '?':
+                consume();
+                parseQuantifier(lastTokenWasAnAtom, 0, 1);
+                lastTokenWasAnAtom = false;
+                break;
+
+            case '{': {
+                ParseState state = saveState();
+
+                consume();
+                if (peekIsDigit()) {
+                    unsigned min = consumeNumber();
+                    unsigned max = min;
+                    
+                    if (tryConsume(','))
+                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
+
+                    if (tryConsume('}')) {
+                        if (min <= max)
+                            parseQuantifier(lastTokenWasAnAtom, min, max);
+                        else
+                            m_err = QuantifierOutOfOrder;
+                        lastTokenWasAnAtom = false;
+                        break;
+                    }
+                }
+
+                restoreState(state);
+            } // if we did not find a complete quantifer, fall through to the default case.
+
+            default:
+                m_delegate.atomPatternCharacter(consume());
+                lastTokenWasAnAtom = true;
+            }
+
+            if (m_err)
+                return;
+        }
+
+        if (m_parenthesesNestingDepth > 0)
+            m_err = MissingParentheses;
+    }
+
+    /*
+     * parse():
+     *
+     * This method calls parseTokens() to parse over the input and converts any
+     * error code to a const char* for a result.
+     */
+    const char* parse()
+    {
+        if (m_size > MAX_PATTERN_SIZE)
+            m_err = PatternTooLarge;
+        else
+            parseTokens();
+        ASSERT(atEndOfPattern() || m_err);
+
+        // The order of this array must match the ErrorCode enum.
+        static const char* errorMessages[NumberOfErrorCodes] = {
+            0, // NoError
+            REGEXP_ERROR_PREFIX "regular expression too large",
+            REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
+            REGEXP_ERROR_PREFIX "nothing to repeat",
+            REGEXP_ERROR_PREFIX "missing )",
+            REGEXP_ERROR_PREFIX "unmatched parentheses",
+            REGEXP_ERROR_PREFIX "unrecognized character after (?",
+            REGEXP_ERROR_PREFIX "missing terminating ] for character class",
+            REGEXP_ERROR_PREFIX "range out of order in character class",
+            REGEXP_ERROR_PREFIX "\\ at end of pattern"
+        };
+
+        return errorMessages[m_err];
+    }
+
+
+    // Misc helper functions:
+
+    typedef unsigned ParseState;
+    
+    ParseState saveState()
+    {
+        return m_index;
+    }
+
+    void restoreState(ParseState state)
+    {
+        m_index = state;
+    }
+
+    bool atEndOfPattern()
+    {
+        ASSERT(m_index <= m_size);
+        return m_index == m_size;
+    }
+
+    int peek()
+    {
+        ASSERT(m_index < m_size);
+        return m_data[m_index];
+    }
+
+    bool peekIsDigit()
+    {
+        return !atEndOfPattern() && WTF::isASCIIDigit(peek());
+    }
+
+    unsigned peekDigit()
+    {
+        ASSERT(peekIsDigit());
+        return peek() - '0';
+    }
+
+    int consume()
+    {
+        ASSERT(m_index < m_size);
+        return m_data[m_index++];
+    }
+
+    unsigned consumeDigit()
+    {
+        ASSERT(peekIsDigit());
+        return consume() - '0';
+    }
+
+    unsigned consumeNumber()
+    {
+        unsigned n = consumeDigit();
+        // check for overflow.
+        for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
+            n = newValue;
+            consume();
+        }
+        return n;
+    }
+
+    unsigned consumeOctal()
+    {
+        ASSERT(WTF::isASCIIOctalDigit(peek()));
+
+        unsigned n = consumeDigit();
+        while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
+            n = n * 8 + consumeDigit();
+        return n;
+    }
+
+    bool tryConsume(UChar ch)
+    {
+        if (atEndOfPattern() || (m_data[m_index] != ch))
+            return false;
+        ++m_index;
+        return true;
+    }
+
+    int tryConsumeHex(int count)
+    {
+        ParseState state = saveState();
+
+        int n = 0;
+        while (count--) {
+            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
+                restoreState(state);
+                return -1;
+            }
+            n = (n << 4) | WTF::toASCIIHexValue(consume());
+        }
+        return n;
+    }
+
+    Delegate& m_delegate;
+    unsigned m_backReferenceLimit;
+    ErrorCode m_err;
+    const UChar* m_data;
+    unsigned m_size;
+    unsigned m_index;
+    unsigned m_parenthesesNestingDepth;
+
+    // Derived by empirical testing of compile time in PCRE and WREC.
+    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
+};
+
+/*
+ * Yarr::parse():
+ *
+ * The parse method is passed a pattern to be parsed and a delegate upon which
+ * callbacks will be made to record the parsed tokens forming the regex.
+ * Yarr::parse() returns null on success, or a const C string providing an error
+ * message where a parse error occurs.
+ *
+ * The Delegate must implement the following interface:
+ *
+ *    void assertionBOL();
+ *    void assertionEOL();
+ *    void assertionWordBoundary(bool invert);
+ *
+ *    void atomPatternCharacter(UChar ch);
+ *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
+ *    void atomCharacterClassBegin(bool invert)
+ *    void atomCharacterClassAtom(UChar ch)
+ *    void atomCharacterClassRange(UChar begin, UChar end)
+ *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ *    void atomCharacterClassEnd()
+ *    void atomParenthesesSubpatternBegin(bool capture = true);
+ *    void atomParentheticalAssertionBegin(bool invert = false);
+ *    void atomParenthesesEnd();
+ *    void atomBackReference(unsigned subpatternId);
+ *
+ *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
+ *
+ *    void disjunction();
+ *
+ * The regular expression is described by a sequence of assertion*() and atom*()
+ * callbacks to the delegate, describing the terms in the regular expression.
+ * Following an atom a quantifyAtom() call may occur to indicate that the previous
+ * atom should be quantified.  In the case of atoms described across multiple
+ * calls (parentheses and character classes) the call to quantifyAtom() will come
+ * after the call to the atom*End() method, never after atom*Begin().
+ *
+ * Character classes may either be described by a single call to
+ * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
+ * In the latter case, ...Begin() will be called, followed by a sequence of
+ * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
+ *
+ * Sequences of atoms and assertions are broken into alternatives via calls to
+ * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
+ * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
+ * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
+ * capturing subpattern, this will be the subpatternId associated with these
+ * parentheses, and will also by definition be the lowest subpatternId of these
+ * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
+ * is passed the subpatternId of the last capturing subexpression nested within
+ * these paretheses.  In the case of a capturing subpattern with no nested
+ * capturing subpatterns, the same subpatternId will be passed to the begin and
+ * end functions.  In the case of non-capturing subpatterns the subpatternId
+ * passed to the begin method is also the first possible subpatternId that might
+ * be nested within these paretheses.  If a set of non-capturing parentheses does
+ * not contain any capturing subpatterns, then the subpatternId passed to begin
+ * will be greater than the subpatternId passed to end.
+ */
+
+template<class Delegate>
+const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
+{
+    return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
+}
+
+} } // namespace JSC::Yarr
+
+#endif // YarrParser_h
author	Ben Murdoch <benm@google.com>	2011-05-13 16:23:25 +0100
committer	Ben Murdoch <benm@google.com>	2011-05-16 11:35:02 +0100
commit	65f03d4f644ce73618e5f4f50dd694b26f55ae12 (patch)
tree	f478babb801e720de7bfaee23443ffe029f58731 /Source/JavaScriptCore/yarr/YarrParser.h
parent	47de4a2fb7262c7ebdb9cd133ad2c54c187454d0 (diff)
download	external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.zip external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.tar.gz external_webkit-65f03d4f644ce73618e5f4f50dd694b26f55ae12.tar.bz2