1 files changed, 95 insertions, 56 deletions
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index a5c0ba2..ec5f589 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -33,6 +33,8 @@
 
 namespace JSC { namespace Yarr {
 
+static const unsigned quantifyInfinite = UINT_MAX;
+
 enum BuiltInCharacterClassID {
     DigitClassID,
     SpaceClassID,
@@ -75,7 +77,7 @@ private:
         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
             : m_delegate(delegate)
             , m_err(err)
-            , m_state(empty)
+            , m_state(Empty)
         {
         }
 
@@ -90,54 +92,67 @@ private:
         }
 
         /*
-         * atomPatternCharacterUnescaped():
+         * atomPatternCharacter():
          *
-         * This method is called directly from parseCharacterClass(), to report a new
-         * pattern character token.  This method differs from atomPatternCharacter(),
-         * which will be called from parseEscape(), since a hypen provided via this
-         * method may be indicating a character range, but a hyphen parsed by
-         * parseEscape() cannot be interpreted as doing so.
+         * This method is called either from parseCharacterClass() (for an unescaped
+         * character in a character class), or from parseEscape(). In the former case
+         * the value true will be passed for the argument 'hyphenIsRange', and in this
+         * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
+         * is different to /[a\-z]/).
          */
-        void atomPatternCharacterUnescaped(UChar ch)
+        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
         {
             switch (m_state) {
-            case empty:
+            case AfterCharacterClass:
+                // Following a builtin character class we need look out for a hyphen.
+                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+                // If we see a hyphen following a charater class then unlike usual
+                // we'll report it to the delegate immediately, and put ourself into
+                // a poisoned state. Any following calls to add another character or
+                // character class will result in an error. (A hypen following a
+                // character-class is itself valid, but only  at the end of a regex).
+                if (hyphenIsRange && ch == '-') {
+                    m_delegate.atomCharacterClassAtom('-');
+                    m_state = AfterCharacterClassHyphen;
+                    return;
+                }
+                // Otherwise just fall through - cached character so treat this as Empty.
+
+            case Empty:
                 m_character = ch;
-                m_state = cachedCharacter;
-                break;
+                m_state = CachedCharacter;
+                return;
 
-            case cachedCharacter:
-                if (ch == '-')
-                    m_state = cachedCharacterHyphen;
+            case CachedCharacter:
+                if (hyphenIsRange && ch == '-')
+                    m_state = CachedCharacterHyphen;
                 else {
                     m_delegate.atomCharacterClassAtom(m_character);
                     m_character = ch;
                 }
-                break;
+                return;
 
-            case cachedCharacterHyphen:
-                if (ch >= m_character)
-                    m_delegate.atomCharacterClassRange(m_character, ch);
-                else
+            case CachedCharacterHyphen:
+                if (ch < m_character) {
                     m_err = CharacterClassOutOfOrder;
-                m_state = empty;
-            }
-        }
-
-        /*
-         * atomPatternCharacter():
-         *
-         * Adds a pattern character, called by parseEscape(), as such will not
-         * interpret a hyphen as indicating a character range.
-         */
-        void atomPatternCharacter(UChar ch)
-        {
-            // Flush if a character is already pending to prevent the
-            // hyphen from begin interpreted as indicating a range.
-            if((ch == '-') && (m_state == cachedCharacter))
-                flush();
+                    return;
+                }
+                m_delegate.atomCharacterClassRange(m_character, ch);
+                m_state = Empty;
+                return;
 
-            atomPatternCharacterUnescaped(ch);
+                // See coment in atomBuiltInCharacterClass below.
+                // This too is technically an error, per ECMA-262, and again we
+                // we chose to allow this.  Note a subtlely here that while we
+                // diverge from the spec's definition of CharacterRange we do
+                // remain in compliance with the grammar.  For example, consider
+                // the expression /[\d-a-z]/.  We comply with the grammar in
+                // this case by not allowing a-z to be matched as a range.
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassAtom(ch);
+                m_state = Empty;
+                return;
+            }
         }
 
         /*
@@ -147,8 +162,34 @@ private:
          */
         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
         {
-            flush();
-            m_delegate.atomCharacterClassBuiltIn(classID, invert);
+            switch (m_state) {
+            case CachedCharacter:
+                // Flush the currently cached character, then fall through.
+                m_delegate.atomCharacterClassAtom(m_character);
+
+            case Empty:
+            case AfterCharacterClass:
+                m_state = AfterCharacterClass;
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                return;
+
+                // If we hit either of these cases, we have an invalid range that
+                // looks something like /[x-\d]/ or /[\d-\d]/.
+                // According to ECMA-262 this should be a syntax error, but
+                // empirical testing shows this to break teh webz.  Instead we
+                // comply with to the ECMA-262 grammar, and assume the grammar to
+                // have matched the range correctly, but tweak our interpretation
+                // of CharacterRange.  Effectively we implicitly handle the hyphen
+                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+            case CachedCharacterHyphen:
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+                // fall through
+            case AfterCharacterClassHyphen:
+                m_delegate.atomCharacterClassBuiltIn(classID, invert);
+                m_state = Empty;
+                return;
+            }
         }
 
         /*
@@ -158,7 +199,12 @@ private:
          */
         void end()
         {
-            flush();
+            if (m_state == CachedCharacter)
+                m_delegate.atomCharacterClassAtom(m_character);
+            else if (m_state == CachedCharacterHyphen) {
+                m_delegate.atomCharacterClassAtom(m_character);
+                m_delegate.atomCharacterClassAtom('-');
+            }
             m_delegate.atomCharacterClassEnd();
         }
 
@@ -168,21 +214,14 @@ private:
         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
 
     private:
-        void flush()
-        {
-            if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
-                m_delegate.atomCharacterClassAtom(m_character);
-            if (m_state == cachedCharacterHyphen)
-                m_delegate.atomCharacterClassAtom('-');
-            m_state = empty;
-        }
-    
         Delegate& m_delegate;
         ErrorCode& m_err;
         enum CharacterClassConstructionState {
-            empty,
-            cachedCharacter,
-            cachedCharacterHyphen,
+            Empty,
+            CachedCharacter,
+            CachedCharacterHyphen,
+            AfterCharacterClass,
+            AfterCharacterClassHyphen,
         } m_state;
         UChar m_character;
     };
@@ -428,7 +467,7 @@ private:
                 break;
 
             default:
-                characterClassConstructor.atomPatternCharacterUnescaped(consume());
+                characterClassConstructor.atomPatternCharacter(consume(), true);
             }
 
             if (m_err)
@@ -572,13 +611,13 @@ private:
 
             case '*':
                 consume();
-                parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
+                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                 lastTokenWasAnAtom = false;
                 break;
 
             case '+':
                 consume();
-                parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
+                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                 lastTokenWasAnAtom = false;
                 break;
 
@@ -597,7 +636,7 @@ private:
                     unsigned max = min;
                     
                     if (tryConsume(','))
-                        max = peekIsDigit() ? consumeNumber() : UINT_MAX;
+                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
 
                     if (tryConsume('}')) {
                         if (min <= max)
@@ -838,7 +877,7 @@ private:
  */
 
 template<class Delegate>
-const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
+const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
 {
     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 }