summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.cpp11
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.cpp16
-rw-r--r--JavaScriptCore/yarr/RegexJIT.cpp6
-rw-r--r--JavaScriptCore/yarr/RegexParser.h137
-rw-r--r--JavaScriptCore/yarr/RegexPattern.h4
5 files changed, 75 insertions, 99 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index e40c791..06ecbad 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -520,7 +520,7 @@ public:
PatternTerm& term = currentAlternative->lastTerm();
ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
- if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.parentheses.subpatternId)) {
+ if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) {
m_alternative->m_terms.append(PatternTerm::ForwardReference());
return;
}
@@ -595,7 +595,7 @@ public:
term.quantify(min, QuantifierFixedCount);
m_alternative->m_terms.append(copyTerm(term));
// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
- m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
m_alternative->lastTerm().parentheses.isCopy = true;
}
@@ -734,8 +734,7 @@ public:
// This optimization identifies sets of parentheses that we will never need to backtrack.
// In these cases we do not need to store state from prior iterations.
// We can presently avoid backtracking for:
- // * where the parens are at the end of the regular expression (last term in any of the
- // alternatives of the main body disjunction).
+ // * a set of parens at the end of the regular expression (last term in any of the alternatives of the main body disjunction).
// * where the parens are non-capturing, and quantified unbounded greedy (*).
// * where the parens do not contain any capturing subpatterns.
void checkForTerminalParentheses()
@@ -746,13 +745,13 @@ public:
return;
Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
- for (size_t i = 0; i < alternatives.size(); ++i) {
+ for (unsigned i =0; i < alternatives.size(); ++i) {
Vector<PatternTerm>& terms = alternatives[i]->m_terms;
if (terms.size()) {
PatternTerm& term = terms.last();
if (term.type == PatternTerm::TypeParenthesesSubpattern
&& term.quantityType == QuantifierGreedy
- && term.quantityCount == quantifyInfinite
+ && term.quantityCount == UINT_MAX
&& !term.capture())
term.parentheses.isTerminal = true;
}
diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp
index a51cd25..164158e 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.cpp
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp
@@ -515,7 +515,8 @@ public:
if (matchEnd == -1)
return true;
- ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
+ ASSERT((matchBegin == -1) == (matchEnd == -1));
+ ASSERT(matchBegin <= matchEnd);
if (matchBegin == matchEnd)
return true;
@@ -557,7 +558,8 @@ public:
int matchBegin = output[(term.atom.subpatternId << 1)];
int matchEnd = output[(term.atom.subpatternId << 1) + 1];
- ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
+ ASSERT((matchBegin == -1) == (matchEnd == -1));
+ ASSERT(matchBegin <= matchEnd);
if (matchBegin == matchEnd)
return false;
@@ -717,7 +719,7 @@ public:
if (term.capture()) {
// Technically this access to inputPosition should be accessing the begin term's
// inputPosition, but for repeats other than fixed these values should be
- // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
+ // the same anyway! (we don't pre-check for greedy or non-greedy matches.)
ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
unsigned subpatternId = term.atom.subpatternId;
@@ -737,7 +739,7 @@ public:
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
ASSERT(term.atom.quantityType == QuantifierGreedy);
- ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(term.atom.quantityCount == UINT_MAX);
ASSERT(!term.capture());
BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
@@ -763,7 +765,7 @@ public:
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
ASSERT(term.atom.quantityType == QuantifierGreedy);
- ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(term.atom.quantityCount == UINT_MAX);
ASSERT(!term.capture());
// If we backtrack to this point, we have failed to match this iteration of the parens.
@@ -775,7 +777,7 @@ public:
bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
{
// 'Terminal' parentheses are at the end of the regex, and as such a match past end
- // should always be returned as a successful match - we should never backtrack to here.
+ // should always be returned as a successful match - we should never becktrack to here.
ASSERT_NOT_REACHED();
return false;
}
@@ -1826,7 +1828,7 @@ public:
break;
case PatternTerm::TypeBackReference:
- atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeForwardReference:
diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp
index 1eac667..acbd458 100644
--- a/JavaScriptCore/yarr/RegexJIT.cpp
+++ b/JavaScriptCore/yarr/RegexJIT.cpp
@@ -624,7 +624,7 @@ class RegexGenerator : private MacroAssembler {
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- if (term.quantityCount != quantifyInfinite) {
+ if (term.quantityCount != 0xffffffff) {
branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
failures.append(jump());
} else
@@ -662,7 +662,7 @@ class RegexGenerator : private MacroAssembler {
loadFromFrame(term.frameLocation, countRegister);
atEndOfInput().linkTo(hardFail, this);
- if (term.quantityCount != quantifyInfinite)
+ if (term.quantityCount != 0xffffffff)
branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail);
if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
readCharacter(state.inputOffset(), character);
@@ -749,7 +749,7 @@ class RegexGenerator : private MacroAssembler {
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- if (term.quantityCount != quantifyInfinite) {
+ if (term.quantityCount != 0xffffffff) {
branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
failures.append(jump());
} else
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index 8392cdf..a5c0ba2 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -33,8 +33,6 @@
namespace JSC { namespace Yarr {
-static const unsigned quantifyInfinite = UINT_MAX;
-
enum BuiltInCharacterClassID {
DigitClassID,
SpaceClassID,
@@ -58,7 +56,6 @@ private:
ParenthesesUnmatched,
ParenthesesTypeInvalid,
CharacterClassUnmatched,
- CharacterClassInvalidRange,
CharacterClassOutOfOrder,
EscapeUnterminated,
NumberOfErrorCodes
@@ -78,7 +75,7 @@ private:
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
: m_delegate(delegate)
, m_err(err)
- , m_state(Empty)
+ , m_state(empty)
{
}
@@ -93,88 +90,65 @@ private:
}
/*
- * atomPatternCharacter():
+ * atomPatternCharacterUnescaped():
*
- * This method is called either from parseCharacterClass() (for an unescaped
- * character in a character class), or from parseEscape(). In the former case
- * the value true will be passed for the argument 'hyphenIsRange', and in this
- * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
- * is different to /[a\-z]/).
+ * This method is called directly from parseCharacterClass(), to report a new
+ * pattern character token. This method differs from atomPatternCharacter(),
+ * which will be called from parseEscape(), since a hypen provided via this
+ * method may be indicating a character range, but a hyphen parsed by
+ * parseEscape() cannot be interpreted as doing so.
*/
- void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+ void atomPatternCharacterUnescaped(UChar ch)
{
switch (m_state) {
- case AfterCharacterClass:
- // Following a builtin character class we need look out for a hyphen.
- // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
- // If we see a hyphen following a charater class then unlike usual
- // we'll report it to the delegate immediately, and put ourself into
- // a poisoned state. Any following calls to add another character or
- // character class will result in an error. (A hypen following a
- // character-class is itself valid, but only at the end of a regex).
- if (hyphenIsRange && ch == '-') {
- m_delegate.atomCharacterClassAtom('-');
- m_state = AfterCharacterClassHyphen;
- return;
- }
- // Otherwise just fall through - cached character so treat this as Empty.
-
- case Empty:
+ case empty:
m_character = ch;
- m_state = CachedCharacter;
- return;
+ m_state = cachedCharacter;
+ break;
- case CachedCharacter:
- if (hyphenIsRange && ch == '-')
- m_state = CachedCharacterHyphen;
+ case cachedCharacter:
+ if (ch == '-')
+ m_state = cachedCharacterHyphen;
else {
m_delegate.atomCharacterClassAtom(m_character);
m_character = ch;
}
- return;
+ break;
- case CachedCharacterHyphen:
- if (ch < m_character) {
+ case cachedCharacterHyphen:
+ if (ch >= m_character)
+ m_delegate.atomCharacterClassRange(m_character, ch);
+ else
m_err = CharacterClassOutOfOrder;
- return;
- }
- m_delegate.atomCharacterClassRange(m_character, ch);
- m_state = Empty;
- return;
-
- case AfterCharacterClassHyphen:
- // Error! We have something like /[\d-x]/.
- m_err = CharacterClassInvalidRange;
- return;
+ m_state = empty;
}
}
/*
+ * atomPatternCharacter():
+ *
+ * Adds a pattern character, called by parseEscape(), as such will not
+ * interpret a hyphen as indicating a character range.
+ */
+ void atomPatternCharacter(UChar ch)
+ {
+ // Flush if a character is already pending to prevent the
+ // hyphen from begin interpreted as indicating a range.
+ if((ch == '-') && (m_state == cachedCharacter))
+ flush();
+
+ atomPatternCharacterUnescaped(ch);
+ }
+
+ /*
* atomBuiltInCharacterClass():
*
* Adds a built-in character class, called by parseEscape().
*/
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
{
- switch (m_state) {
- case CachedCharacter:
- // Flush the currently cached character, then fall through.
- m_delegate.atomCharacterClassAtom(m_character);
-
- case Empty:
- case AfterCharacterClass:
- m_state = AfterCharacterClass;
- m_delegate.atomCharacterClassBuiltIn(classID, invert);
- return;
-
- case CachedCharacterHyphen:
- case AfterCharacterClassHyphen:
- // Error! If we hit either of these cases, we have an
- // invalid range that looks something like /[x-\d]/
- // or /[\d-\d]/.
- m_err = CharacterClassInvalidRange;
- return;
- }
+ flush();
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
}
/*
@@ -184,12 +158,7 @@ private:
*/
void end()
{
- if (m_state == CachedCharacter)
- m_delegate.atomCharacterClassAtom(m_character);
- else if (m_state == CachedCharacterHyphen) {
- m_delegate.atomCharacterClassAtom(m_character);
- m_delegate.atomCharacterClassAtom('-');
- }
+ flush();
m_delegate.atomCharacterClassEnd();
}
@@ -199,14 +168,21 @@ private:
void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
private:
+ void flush()
+ {
+ if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
+ m_delegate.atomCharacterClassAtom(m_character);
+ if (m_state == cachedCharacterHyphen)
+ m_delegate.atomCharacterClassAtom('-');
+ m_state = empty;
+ }
+
Delegate& m_delegate;
ErrorCode& m_err;
enum CharacterClassConstructionState {
- Empty,
- CachedCharacter,
- CachedCharacterHyphen,
- AfterCharacterClass,
- AfterCharacterClassHyphen,
+ empty,
+ cachedCharacter,
+ cachedCharacterHyphen,
} m_state;
UChar m_character;
};
@@ -452,7 +428,7 @@ private:
break;
default:
- characterClassConstructor.atomPatternCharacter(consume(), true);
+ characterClassConstructor.atomPatternCharacterUnescaped(consume());
}
if (m_err)
@@ -596,13 +572,13 @@ private:
case '*':
consume();
- parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+ parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
lastTokenWasAnAtom = false;
break;
case '+':
consume();
- parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+ parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
lastTokenWasAnAtom = false;
break;
@@ -621,7 +597,7 @@ private:
unsigned max = min;
if (tryConsume(','))
- max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
+ max = peekIsDigit() ? consumeNumber() : UINT_MAX;
if (tryConsume('}')) {
if (min <= max)
@@ -681,7 +657,6 @@ private:
"unmatched parentheses",
"unrecognized character after (?",
"missing terminating ] for character class",
- "invalid range in character class",
"range out of order in character class",
"\\ at end of pattern"
};
@@ -863,7 +838,7 @@ private:
*/
template<class Delegate>
-const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
+const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
{
return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
}
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index 8a7d35b..c76c641 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -107,7 +107,7 @@ struct PatternTerm {
union {
UChar patternCharacter;
CharacterClass* characterClass;
- unsigned backReferenceSubpatternId;
+ unsigned subpatternId;
struct {
PatternDisjunction* disjunction;
unsigned subpatternId;
@@ -162,7 +162,7 @@ struct PatternTerm {
: type(TypeBackReference)
, invertOrCapture(false)
{
- backReferenceSubpatternId = spatternId;
+ subpatternId = spatternId;
quantityType = QuantifierFixedCount;
quantityCount = 1;
}