diff options
Diffstat (limited to 'JavaScriptCore/yarr/RegexParser.h')
-rw-r--r-- | JavaScriptCore/yarr/RegexParser.h | 151 |
1 files changed, 95 insertions, 56 deletions
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h index a5c0ba2..ec5f589 100644 --- a/JavaScriptCore/yarr/RegexParser.h +++ b/JavaScriptCore/yarr/RegexParser.h @@ -33,6 +33,8 @@ namespace JSC { namespace Yarr { +static const unsigned quantifyInfinite = UINT_MAX; + enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, @@ -75,7 +77,7 @@ private: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) , m_err(err) - , m_state(empty) + , m_state(Empty) { } @@ -90,54 +92,67 @@ private: } /* - * atomPatternCharacterUnescaped(): + * atomPatternCharacter(): * - * This method is called directly from parseCharacterClass(), to report a new - * pattern character token. This method differs from atomPatternCharacter(), - * which will be called from parseEscape(), since a hypen provided via this - * method may be indicating a character range, but a hyphen parsed by - * parseEscape() cannot be interpreted as doing so. + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). */ - void atomPatternCharacterUnescaped(UChar ch) + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) { switch (m_state) { - case empty: + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: m_character = ch; - m_state = cachedCharacter; - break; + m_state = CachedCharacter; + return; - case cachedCharacter: - if (ch == '-') - m_state = cachedCharacterHyphen; + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; else { m_delegate.atomCharacterClassAtom(m_character); m_character = ch; } - break; + return; - case cachedCharacterHyphen: - if (ch >= m_character) - m_delegate.atomCharacterClassRange(m_character, ch); - else + case CachedCharacterHyphen: + if (ch < m_character) { m_err = CharacterClassOutOfOrder; - m_state = empty; - } - } - - /* - * atomPatternCharacter(): - * - * Adds a pattern character, called by parseEscape(), as such will not - * interpret a hyphen as indicating a character range. - */ - void atomPatternCharacter(UChar ch) - { - // Flush if a character is already pending to prevent the - // hyphen from begin interpreted as indicating a range. - if((ch == '-') && (m_state == cachedCharacter)) - flush(); + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; - atomPatternCharacterUnescaped(ch); + // See coment in atomBuiltInCharacterClass below. + // This too is technically an error, per ECMA-262, and again we + // we chose to allow this. Note a subtlely here that while we + // diverge from the spec's definition of CharacterRange we do + // remain in compliance with the grammar. For example, consider + // the expression /[\d-a-z]/. We comply with the grammar in + // this case by not allowing a-z to be matched as a range. + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassAtom(ch); + m_state = Empty; + return; + } } /* @@ -147,8 +162,34 @@ private: */ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { - flush(); - m_delegate.atomCharacterClassBuiltIn(classID, invert); + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + // If we hit either of these cases, we have an invalid range that + // looks something like /[x-\d]/ or /[\d-\d]/. + // According to ECMA-262 this should be a syntax error, but + // empirical testing shows this to break teh webz. Instead we + // comply with to the ECMA-262 grammar, and assume the grammar to + // have matched the range correctly, but tweak our interpretation + // of CharacterRange. Effectively we implicitly handle the hyphen + // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. + case CachedCharacterHyphen: + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + // fall through + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassBuiltIn(classID, invert); + m_state = Empty; + return; + } } /* @@ -158,7 +199,12 @@ private: */ void end() { - flush(); + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } m_delegate.atomCharacterClassEnd(); } @@ -168,21 +214,14 @@ private: void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } private: - void flush() - { - if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen - m_delegate.atomCharacterClassAtom(m_character); - if (m_state == cachedCharacterHyphen) - m_delegate.atomCharacterClassAtom('-'); - m_state = empty; - } - Delegate& m_delegate; ErrorCode& m_err; enum CharacterClassConstructionState { - empty, - cachedCharacter, - cachedCharacterHyphen, + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, } m_state; UChar m_character; }; @@ -428,7 +467,7 @@ private: break; default: - characterClassConstructor.atomPatternCharacterUnescaped(consume()); + characterClassConstructor.atomPatternCharacter(consume(), true); } if (m_err) @@ -572,13 +611,13 @@ private: case '*': consume(); - parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); lastTokenWasAnAtom = false; break; case '+': consume(); - parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); lastTokenWasAnAtom = false; break; @@ -597,7 +636,7 @@ private: unsigned max = min; if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : UINT_MAX; + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; if (tryConsume('}')) { if (min <= max) @@ -838,7 +877,7 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) { return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); } |