summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr/RegexParser.h
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/yarr/RegexParser.h')
-rw-r--r--JavaScriptCore/yarr/RegexParser.h141
1 files changed, 81 insertions, 60 deletions
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index ede9417..8392cdf 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -26,8 +26,6 @@
#ifndef RegexParser_h
#define RegexParser_h
-#if ENABLE(YARR)
-
#include "UString.h"
#include <limits.h>
#include <wtf/ASCIICType.h>
@@ -35,6 +33,8 @@
namespace JSC { namespace Yarr {
+static const unsigned quantifyInfinite = UINT_MAX;
+
enum BuiltInCharacterClassID {
DigitClassID,
SpaceClassID,
@@ -58,6 +58,7 @@ private:
ParenthesesUnmatched,
ParenthesesTypeInvalid,
CharacterClassUnmatched,
+ CharacterClassInvalidRange,
CharacterClassOutOfOrder,
EscapeUnterminated,
NumberOfErrorCodes
@@ -77,7 +78,7 @@ private:
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
: m_delegate(delegate)
, m_err(err)
- , m_state(empty)
+ , m_state(Empty)
{
}
@@ -92,54 +93,60 @@ private:
}
/*
- * atomPatternCharacterUnescaped():
+ * atomPatternCharacter():
*
- * This method is called directly from parseCharacterClass(), to report a new
- * pattern character token. This method differs from atomPatternCharacter(),
- * which will be called from parseEscape(), since a hypen provided via this
- * method may be indicating a character range, but a hyphen parsed by
- * parseEscape() cannot be interpreted as doing so.
+ * This method is called either from parseCharacterClass() (for an unescaped
+ * character in a character class), or from parseEscape(). In the former case
+ * the value true will be passed for the argument 'hyphenIsRange', and in this
+ * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
+ * is different to /[a\-z]/).
*/
- void atomPatternCharacterUnescaped(UChar ch)
+ void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
{
switch (m_state) {
- case empty:
+ case AfterCharacterClass:
+ // Following a builtin character class we need to look out for a hyphen.
+ // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+ // If we see a hyphen following a character class then, unlike usual,
+ // we'll report it to the delegate immediately, and put ourselves into
+ // a poisoned state. Any following calls to add another character or
+ // character class will result in an error. (A hyphen following a
+ // character class is itself valid, but only at the end of the class).
+ if (hyphenIsRange && ch == '-') {
+ m_delegate.atomCharacterClassAtom('-');
+ m_state = AfterCharacterClassHyphen;
+ return;
+ }
+ // Otherwise just fall through - cache the character, treating this state as Empty.
+
+ case Empty:
m_character = ch;
- m_state = cachedCharacter;
- break;
+ m_state = CachedCharacter;
+ return;
- case cachedCharacter:
- if (ch == '-')
- m_state = cachedCharacterHyphen;
+ case CachedCharacter:
+ if (hyphenIsRange && ch == '-')
+ m_state = CachedCharacterHyphen;
else {
m_delegate.atomCharacterClassAtom(m_character);
m_character = ch;
}
- break;
+ return;
- case cachedCharacterHyphen:
- if (ch >= m_character)
- m_delegate.atomCharacterClassRange(m_character, ch);
- else
+ case CachedCharacterHyphen:
+ if (ch < m_character) {
m_err = CharacterClassOutOfOrder;
- m_state = empty;
- }
- }
-
- /*
- * atomPatternCharacter():
- *
- * Adds a pattern character, called by parseEscape(), as such will not
- * interpret a hyphen as indicating a character range.
- */
- void atomPatternCharacter(UChar ch)
- {
- // Flush if a character is already pending to prevent the
- // hyphen from begin interpreted as indicating a range.
- if((ch == '-') && (m_state == cachedCharacter))
- flush();
+ return;
+ }
+ m_delegate.atomCharacterClassRange(m_character, ch);
+ m_state = Empty;
+ return;
- atomPatternCharacterUnescaped(ch);
+ case AfterCharacterClassHyphen:
+ // Error! We have something like /[\d-x]/.
+ m_err = CharacterClassInvalidRange;
+ return;
+ }
}
/*
@@ -149,8 +156,25 @@ private:
*/
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
{
- flush();
- m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ switch (m_state) {
+ case CachedCharacter:
+ // Flush the currently cached character, then fall through.
+ m_delegate.atomCharacterClassAtom(m_character);
+
+ case Empty:
+ case AfterCharacterClass:
+ m_state = AfterCharacterClass;
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ return;
+
+ case CachedCharacterHyphen:
+ case AfterCharacterClassHyphen:
+ // Error! If we hit either of these cases, we have an
+ // invalid range that looks something like /[x-\d]/
+ // or /[\d-\d]/.
+ m_err = CharacterClassInvalidRange;
+ return;
+ }
}
/*
@@ -160,7 +184,12 @@ private:
*/
void end()
{
- flush();
+ if (m_state == CachedCharacter)
+ m_delegate.atomCharacterClassAtom(m_character);
+ else if (m_state == CachedCharacterHyphen) {
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_delegate.atomCharacterClassAtom('-');
+ }
m_delegate.atomCharacterClassEnd();
}
@@ -170,21 +199,14 @@ private:
void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
private:
- void flush()
- {
- if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
- m_delegate.atomCharacterClassAtom(m_character);
- if (m_state == cachedCharacterHyphen)
- m_delegate.atomCharacterClassAtom('-');
- m_state = empty;
- }
-
Delegate& m_delegate;
ErrorCode& m_err;
enum CharacterClassConstructionState {
- empty,
- cachedCharacter,
- cachedCharacterHyphen,
+ Empty,
+ CachedCharacter,
+ CachedCharacterHyphen,
+ AfterCharacterClass,
+ AfterCharacterClassHyphen,
} m_state;
UChar m_character;
};
@@ -430,7 +452,7 @@ private:
break;
default:
- characterClassConstructor.atomPatternCharacterUnescaped(consume());
+ characterClassConstructor.atomPatternCharacter(consume(), true);
}
if (m_err)
@@ -574,13 +596,13 @@ private:
case '*':
consume();
- parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
+ parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
lastTokenWasAnAtom = false;
break;
case '+':
consume();
- parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
+ parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
lastTokenWasAnAtom = false;
break;
@@ -599,7 +621,7 @@ private:
unsigned max = min;
if (tryConsume(','))
- max = peekIsDigit() ? consumeNumber() : UINT_MAX;
+ max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
if (tryConsume('}')) {
if (min <= max)
@@ -659,6 +681,7 @@ private:
"unmatched parentheses",
"unrecognized character after (?",
"missing terminating ] for character class",
+ "invalid range in character class",
"range out of order in character class",
"\\ at end of pattern"
};
@@ -840,13 +863,11 @@ private:
*/
template<class Delegate>
-const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
+const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
{
return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
}
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexParser_h