summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.cpp54
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.h4
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.cpp287
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.h7
-rw-r--r--JavaScriptCore/yarr/RegexJIT.cpp103
-rw-r--r--JavaScriptCore/yarr/RegexJIT.h30
-rw-r--r--JavaScriptCore/yarr/RegexParser.h141
-rw-r--r--JavaScriptCore/yarr/RegexPattern.h13
8 files changed, 407 insertions, 232 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index ccfc94e..e40c791 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -31,8 +31,6 @@
#include "RegexPattern.h"
#include <wtf/Vector.h>
-#if ENABLE(YARR)
-
using namespace WTF;
namespace JSC { namespace Yarr {
@@ -522,7 +520,7 @@ public:
PatternTerm& term = currentAlternative->lastTerm();
ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
- if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) {
+ if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.parentheses.subpatternId)) {
m_alternative->m_terms.append(PatternTerm::ForwardReference());
return;
}
@@ -597,7 +595,7 @@ public:
term.quantify(min, QuantifierFixedCount);
m_alternative->m_terms.append(copyTerm(term));
// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
- m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
m_alternative->lastTerm().parentheses.isCopy = true;
}
@@ -669,14 +667,17 @@ public:
case PatternTerm::TypeParenthesesSubpattern:
// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
term.frameLocation = currentCallFrameSize;
- if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
- if (term.quantityType == QuantifierFixedCount) {
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
- currentInputPosition += term.parentheses.disjunction->m_minimumSize;
- } else {
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ if (term.quantityType != QuantifierFixedCount)
currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
- }
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
+ // If quantity is fixed, then pre-check its minimum size.
+ if (term.quantityType == QuantifierFixedCount)
+ currentInputPosition += term.parentheses.disjunction->m_minimumSize;
+ term.inputPosition = currentInputPosition;
+ } else if (term.parentheses.isTerminal) {
+ currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
term.inputPosition = currentInputPosition;
} else {
term.inputPosition = currentInputPosition;
@@ -730,6 +731,34 @@ public:
setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
}
+ // This optimization identifies sets of parentheses that we will never need to backtrack.
+ // In these cases we do not need to store state from prior iterations.
+ // We can presently avoid backtracking for:
+ // * where the parens are at the end of the regular expression (last term in any of the
+ // alternatives of the main body disjunction).
+ // * where the parens are non-capturing, and quantified unbounded greedy (*).
+ // * where the parens do not contain any capturing subpatterns.
+ void checkForTerminalParentheses()
+ {
+ // This check is much too crude; should be just checking whether the candidate
+ // node contains nested capturing subpatterns, not the whole expression!
+ if (m_pattern.m_numSubpatterns)
+ return;
+
+ Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ for (size_t i = 0; i < alternatives.size(); ++i) {
+ Vector<PatternTerm>& terms = alternatives[i]->m_terms;
+ if (terms.size()) {
+ PatternTerm& term = terms.last();
+ if (term.type == PatternTerm::TypeParenthesesSubpattern
+ && term.quantityType == QuantifierGreedy
+ && term.quantityCount == quantifyInfinite
+ && !term.capture())
+ term.parentheses.isTerminal = true;
+ }
+ }
+ }
+
void optimizeBOL()
{
// Look for expressions containing beginning of line (^) anchoring and unroll them.
@@ -932,6 +961,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
ASSERT(numSubpatterns == pattern.m_numSubpatterns);
}
+ constructor.checkForTerminalParentheses();
constructor.optimizeBOL();
constructor.setupOffsets();
@@ -942,5 +972,3 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
} }
-
-#endif
diff --git a/JavaScriptCore/yarr/RegexCompiler.h b/JavaScriptCore/yarr/RegexCompiler.h
index 9d2443a..399374e 100644
--- a/JavaScriptCore/yarr/RegexCompiler.h
+++ b/JavaScriptCore/yarr/RegexCompiler.h
@@ -26,8 +26,6 @@
#ifndef RegexCompiler_h
#define RegexCompiler_h
-#if ENABLE(YARR)
-
#include "RegexParser.h"
#include "RegexPattern.h"
#include <wtf/unicode/Unicode.h>
@@ -38,6 +36,4 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern);
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexCompiler_h
diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp
index dc3024a..a51cd25 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.cpp
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp
@@ -35,8 +35,6 @@
#include <stdio.h>
#endif
-#if ENABLE(YARR)
-
using namespace WTF;
namespace JSC { namespace Yarr {
@@ -62,7 +60,10 @@ public:
uintptr_t begin;
};
struct BackTrackInfoParenthesesOnce {
- uintptr_t inParentheses;
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesTerminal {
+ uintptr_t begin;
};
struct BackTrackInfoParentheses {
uintptr_t matchAmount;
@@ -514,8 +515,7 @@ public:
if (matchEnd == -1)
return true;
- ASSERT((matchBegin == -1) == (matchEnd == -1));
- ASSERT(matchBegin <= matchEnd);
+ ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
if (matchBegin == matchEnd)
return true;
@@ -557,8 +557,7 @@ public:
int matchBegin = output[(term.atom.subpatternId << 1)];
int matchEnd = output[(term.atom.subpatternId << 1) + 1];
- ASSERT((matchBegin == -1) == (matchEnd == -1));
- ASSERT(matchBegin <= matchEnd);
+ ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
if (matchBegin == matchEnd)
return false;
@@ -633,11 +632,11 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy: {
// set this speculatively; if we get to the parens end this will be true.
- backTrack->inParentheses = 1;
+ backTrack->begin = input.getPos();
break;
}
case QuantifierNonGreedy: {
- backTrack->inParentheses = 0;
+ backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
}
@@ -653,7 +652,7 @@ public:
return true;
}
- bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
+ bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
ASSERT(term.atom.quantityCount == 1);
@@ -662,7 +661,12 @@ public:
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
}
- return true;
+
+ if (term.atom.quantityType == QuantifierFixedCount)
+ return true;
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+ return backTrack->begin != input.getPos();
}
bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
@@ -681,12 +685,12 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy:
// if we backtrack to this point, there is another chance - try matching nothing.
- ASSERT(backTrack->inParentheses);
- backTrack->inParentheses = 0;
+ ASSERT(backTrack->begin != notFound);
+ backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
case QuantifierNonGreedy:
- ASSERT(backTrack->inParentheses);
+ ASSERT(backTrack->begin != notFound);
case QuantifierFixedCount:
break;
}
@@ -703,15 +707,19 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy:
- if (!backTrack->inParentheses) {
+ if (backTrack->begin == notFound) {
context->term -= term.atom.parenthesesWidth;
return false;
}
case QuantifierNonGreedy:
- if (!backTrack->inParentheses) {
- // now try to match the parens; set this speculatively.
- backTrack->inParentheses = 1;
+ if (backTrack->begin == notFound) {
+ backTrack->begin = input.getPos();
if (term.capture()) {
+ // Technically this access to inputPosition should be accessing the begin term's
+ // inputPosition, but for repeats other than fixed these values should be
+ // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
+ ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
unsigned subpatternId = term.atom.subpatternId;
output[subpatternId << 1] = input.getPos() + term.inputPosition;
}
@@ -725,6 +733,53 @@ public:
return false;
}
+ bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ // Empty match is a failed match.
+ if (backTrack->begin == input.getPos())
+ return false;
+
+ // Successful match! Okay, what's next? - loop around and try to match moar!
+ context->term -= (term.atom.parenthesesWidth + 1);
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ // If we backtrack to this point, we have failed to match this iteration of the parens.
+ // Since this is greedy / zero minimum a failed is also accepted as a match!
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
+ {
+ // 'Terminal' parentheses are at the end of the regex, and as such a match past end
+ // should always be returned as a successful match - we should never backtrack to here.
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
@@ -1173,6 +1228,14 @@ public:
if (matchParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (matchParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (matchParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (matchParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1286,6 +1349,14 @@ public:
if (backtrackParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (backtrackParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1341,9 +1412,8 @@ public:
pattern->m_allocator->stopAllocator();
- if (output[0] == -1 && result != JSRegExpNoMatch)
- return result;
-
+ // RegExp.cpp currently expects all error to be converted to -1.
+ ASSERT((result == JSRegExpMatch) == (output[0] != -1));
return output[0];
}
@@ -1448,8 +1518,38 @@ public:
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
+ void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
+ // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
+ // then fix this up at the end! - simplifying this should make it much clearer.
+ // https://bugs.webkit.org/show_bug.cgi?id=50136
+
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
@@ -1474,6 +1574,28 @@ public:
m_currentAlternativeIndex = beginTerm + 1;
}
+ void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
+
+ bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
+
unsigned popParenthesesStack()
{
ASSERT(m_parenthesesStack.size());
@@ -1545,50 +1667,79 @@ public:
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
}
- void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
+
+ bool invertOrCapture = parenthesesBegin.invertOrCapture;
+ unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+
+ unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+ ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+ for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+
+ m_bodyDisjunction->terms.shrink(beginTerm);
+
+ m_allParenthesesInfo.append(parenthesesDisjunction);
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
- bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
- m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
- if (doInline) {
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
- } else {
- ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
- ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
-
- bool invertOrCapture = parenthesesBegin.invertOrCapture;
- unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
- unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
- ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+ void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
- for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
- parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
- m_bodyDisjunction->terms.shrink(beginTerm);
+ bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
- m_allParenthesesInfo.append(parenthesesDisjunction);
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
- }
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
}
void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
@@ -1675,7 +1826,7 @@ public:
break;
case PatternTerm::TypeBackReference:
- atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeForwardReference:
@@ -1683,24 +1834,27 @@ public:
case PatternTerm::TypeParenthesesSubpattern: {
unsigned disjunctionAlreadyCheckedCount = 0;
- if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
- if (term.quantityType == QuantifierFixedCount) {
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ unsigned alternativeFrameLocation = term.frameLocation;
+ // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
+ if (term.quantityType == QuantifierFixedCount)
disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- } else {
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- }
+ else
+ alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce;
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesOnceBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else if (term.parentheses.isTerminal) {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
}
break;
}
@@ -1713,7 +1867,7 @@ public:
atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
+ atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
break;
}
}
@@ -1742,6 +1896,11 @@ PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsig
return ByteCompiler(pattern).compile(allocator);
}
+PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator)
+{
+ return ByteCompiler(pattern).compile(allocator);
+}
+
int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
{
return Interpreter(regex, output, input, start, length).interpret();
@@ -1758,5 +1917,3 @@ COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpace
} }
-
-#endif
diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h
index dae8f9d..2e23472 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.h
+++ b/JavaScriptCore/yarr/RegexInterpreter.h
@@ -26,8 +26,6 @@
#ifndef RegexInterpreter_h
#define RegexInterpreter_h
-#if ENABLE(YARR)
-
#include "RegexParser.h"
#include "RegexPattern.h"
#include <wtf/PassOwnPtr.h>
@@ -83,6 +81,8 @@ struct ByteTerm {
TypeParenthesesSubpattern,
TypeParenthesesSubpatternOnceBegin,
TypeParenthesesSubpatternOnceEnd,
+ TypeParenthesesSubpatternTerminalBegin,
+ TypeParenthesesSubpatternTerminalEnd,
TypeParentheticalAssertionBegin,
TypeParentheticalAssertionEnd,
TypeCheckInput,
@@ -365,10 +365,9 @@ private:
};
PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator*, bool ignoreCase = false, bool multiline = false);
+PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*);
int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexInterpreter_h
diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp
index 1b80464..1eac667 100644
--- a/JavaScriptCore/yarr/RegexJIT.cpp
+++ b/JavaScriptCore/yarr/RegexJIT.cpp
@@ -31,8 +31,7 @@
#include "LinkBuffer.h"
#include "MacroAssembler.h"
#include "RegexCompiler.h"
-
-#include "pcre.h" // temporary, remove when fallback is removed.
+#include "RegexInterpreter.h" // temporary, remove when fallback is removed.
#if ENABLE(YARR_JIT)
@@ -625,7 +624,7 @@ class RegexGenerator : private MacroAssembler {
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- if (term.quantityCount != 0xffffffff) {
+ if (term.quantityCount != quantifyInfinite) {
branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
failures.append(jump());
} else
@@ -663,7 +662,7 @@ class RegexGenerator : private MacroAssembler {
loadFromFrame(term.frameLocation, countRegister);
atEndOfInput().linkTo(hardFail, this);
- if (term.quantityCount != 0xffffffff)
+ if (term.quantityCount != quantifyInfinite)
branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail);
if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
readCharacter(state.inputOffset(), character);
@@ -750,7 +749,7 @@ class RegexGenerator : private MacroAssembler {
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- if (term.quantityCount != 0xffffffff) {
+ if (term.quantityCount != quantifyInfinite) {
branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
failures.append(jump());
} else
@@ -911,12 +910,7 @@ class RegexGenerator : private MacroAssembler {
PatternDisjunction* disjunction = term.parentheses.disjunction;
ASSERT(term.quantityCount == 1);
- if (term.parentheses.isCopy) {
- m_shouldFallBack = true;
- return;
- }
-
- unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0;
+ unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0;
unsigned parenthesesFrameLocation = term.frameLocation;
unsigned alternativeFrameLocation = parenthesesFrameLocation;
@@ -935,12 +929,12 @@ class RegexGenerator : private MacroAssembler {
Jump nonGreedySkipParentheses;
Label nonGreedyTryParentheses;
if (term.quantityType == QuantifierGreedy)
- storeToFrame(Imm32(1), parenthesesFrameLocation);
+ storeToFrame(index, parenthesesFrameLocation);
else if (term.quantityType == QuantifierNonGreedy) {
- storeToFrame(Imm32(0), parenthesesFrameLocation);
+ storeToFrame(Imm32(-1), parenthesesFrameLocation);
nonGreedySkipParentheses = jump();
nonGreedyTryParentheses = label();
- storeToFrame(Imm32(1), parenthesesFrameLocation);
+ storeToFrame(index, parenthesesFrameLocation);
}
// store the match start index
@@ -958,29 +952,21 @@ class RegexGenerator : private MacroAssembler {
TermGenerationState parenthesesState(disjunction, state.checkedTotal);
generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation);
- // store the match end index
- if (term.invertOrCapture) {
- int inputOffset = state.inputOffset();
- if (inputOffset) {
- move(index, indexTemporary);
- add32(Imm32(state.inputOffset()), indexTemporary);
- store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
- } else
- store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
- }
- Jump success = jump();
+ Jump success = (term.quantityType == QuantifierFixedCount) ?
+ jump() :
+ branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*))));
// A failure AFTER the parens jumps here
Label backtrackFromAfterParens(this);
if (term.quantityType == QuantifierGreedy) {
- // If this is zero we have now tested with both with and without the parens.
+ // If this is -1 we have now tested with both with and without the parens.
loadFromFrame(parenthesesFrameLocation, indexTemporary);
- state.jumpToBacktrack(branchTest32(Zero, indexTemporary), this);
+ state.jumpToBacktrack(branch32(Equal, indexTemporary, Imm32(-1)), this);
} else if (term.quantityType == QuantifierNonGreedy) {
- // If this is zero we have now tested with both with and without the parens.
+ // If this is -1 we have now tested without the parens, now test with.
loadFromFrame(parenthesesFrameLocation, indexTemporary);
- branchTest32(Zero, indexTemporary).linkTo(nonGreedyTryParentheses, this);
+ branch32(Equal, indexTemporary, Imm32(-1)).linkTo(nonGreedyTryParentheses, this);
}
parenthesesState.plantJumpToBacktrackIfExists(this);
@@ -990,7 +976,7 @@ class RegexGenerator : private MacroAssembler {
store32(Imm32(-1), Address(output, (term.parentheses.subpatternId << 1) * sizeof(int)));
if (term.quantityType == QuantifierGreedy)
- storeToFrame(Imm32(0), parenthesesFrameLocation);
+ storeToFrame(Imm32(-1), parenthesesFrameLocation);
else
state.jumpToBacktrack(jump(), this);
@@ -998,6 +984,17 @@ class RegexGenerator : private MacroAssembler {
if (term.quantityType == QuantifierNonGreedy)
nonGreedySkipParentheses.link(this);
success.link(this);
+
+ // store the match end index
+ if (term.invertOrCapture) {
+ int inputOffset = state.inputOffset();
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(state.inputOffset()), indexTemporary);
+ store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
+ } else
+ store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
+ }
}
}
@@ -1008,25 +1005,6 @@ class RegexGenerator : private MacroAssembler {
ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern);
ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle.
- // Capturing not yet implemented!
- if (parenthesesTerm.invertOrCapture) {
- m_shouldFallBack = true;
- return;
- }
-
- // Quantification limit not yet implemented!
- if (parenthesesTerm.quantityCount != 0xffffffff) {
- m_shouldFallBack = true;
- return;
- }
-
- // Need to reset nested subpatterns between iterations...
- // for the minute this crude check rejects all patterns with any subpatterns!
- if (m_pattern.m_numSubpatterns) {
- m_shouldFallBack = true;
- return;
- }
-
TermGenerationState parenthesesState(disjunction, state.checkedTotal);
Label matchAgain(this);
@@ -1048,7 +1026,11 @@ class RegexGenerator : private MacroAssembler {
generateTerm(parenthesesState);
// If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet.
- branch32(GreaterThan, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain);
+ branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain);
+
+ // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack,
+ // or fall through to try the next alternative if no backtrack is available.
+ parenthesesState.plantJumpToBacktrackIfExists(this);
parenthesesState.linkAlternativeBacktracks(this);
// We get here if the alternative fails to match - fall through to the next iteration, or out of the loop.
@@ -1181,17 +1163,12 @@ class RegexGenerator : private MacroAssembler {
break;
case PatternTerm::TypeParenthesesSubpattern:
- if (term.quantityCount == 1) {
+ if (term.quantityCount == 1 && !term.parentheses.isCopy)
generateParenthesesSingle(state);
- break;
- } else if (state.isLastTerm() && state.isMainDisjunction()) { // Is this is the last term of the main disjunction?
- // If this has a greedy quantifier, then it will never need to backtrack!
- if (term.quantityType == QuantifierGreedy) {
- generateParenthesesGreedyNoBacktrack(state);
- break;
- }
- }
- m_shouldFallBack = true;
+ else if (term.parentheses.isTerminal)
+ generateParenthesesGreedyNoBacktrack(state);
+ else
+ m_shouldFallBack = true;
break;
case PatternTerm::TypeParentheticalAssertion:
@@ -1532,7 +1509,7 @@ private:
Vector<AlternativeBacktrackRecord> m_backtrackRecords;
};
-void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
+void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
{
RegexPattern pattern(ignoreCase, multiline);
if ((error = compileRegex(patternString, pattern)))
@@ -1546,9 +1523,7 @@ void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const
return;
}
- JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase;
- JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine;
- jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.characters()), patternString.length(), ignoreCaseOption, multilineOption, &numSubpatterns, &error));
+ jitObject.setFallback(byteCompileRegex(pattern, allocator));
}
}}
diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h
index 02e905d..c4c382c 100644
--- a/JavaScriptCore/yarr/RegexJIT.h
+++ b/JavaScriptCore/yarr/RegexJIT.h
@@ -29,12 +29,10 @@
#if ENABLE(YARR_JIT)
#include "MacroAssembler.h"
+#include "RegexInterpreter.h" // temporary, remove when fallback is removed.
#include "RegexPattern.h"
#include "UString.h"
-#include "pcre.h"
-struct JSRegExp; // temporary, remove when fallback is removed.
-
#if CPU(X86) && !COMPILER(MSVC)
#define YARR_CALL __attribute__ ((regparm (3)))
#else
@@ -53,18 +51,21 @@ class RegexCodeBlock {
public:
RegexCodeBlock()
- : m_fallback(0)
+ : m_needFallback(false)
{
}
~RegexCodeBlock()
{
- if (m_fallback)
- jsRegExpFree(m_fallback);
}
- JSRegExp* getFallback() { return m_fallback; }
- void setFallback(JSRegExp* fallback) { m_fallback = fallback; }
+ BytecodePattern* getFallback() { return m_fallback.get(); }
+ bool isFallback() { return m_needFallback; }
+ void setFallback(PassOwnPtr<BytecodePattern> fallback)
+ {
+ m_fallback = fallback;
+ m_needFallback = true;
+ }
bool operator!() { return (!m_ref.m_code.executableAddress() && !m_fallback); }
void set(MacroAssembler::CodeRef ref) { m_ref = ref; }
@@ -73,22 +74,23 @@ public:
{
return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output);
}
-
+
#if ENABLE(REGEXP_TRACING)
void *getAddr() { return m_ref.m_code.executableAddress(); }
#endif
private:
MacroAssembler::CodeRef m_ref;
- JSRegExp* m_fallback;
+ OwnPtr<Yarr::BytecodePattern> m_fallback;
+ bool m_needFallback;
};
-void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
+void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase = false, bool multiline = false);
-inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize)
+inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output)
{
- if (JSRegExp* fallback = jitObject.getFallback())
- return (jsRegExpExecute(fallback, input, length, start, output, outputArraySize) < 0) ? -1 : output[0];
+ if (jitObject.isFallback())
+ return (interpretRegex(jitObject.getFallback(), input, start, length, output));
return jitObject.execute(input, start, length, output);
}
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index ede9417..8392cdf 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -26,8 +26,6 @@
#ifndef RegexParser_h
#define RegexParser_h
-#if ENABLE(YARR)
-
#include "UString.h"
#include <limits.h>
#include <wtf/ASCIICType.h>
@@ -35,6 +33,8 @@
namespace JSC { namespace Yarr {
+static const unsigned quantifyInfinite = UINT_MAX;
+
enum BuiltInCharacterClassID {
DigitClassID,
SpaceClassID,
@@ -58,6 +58,7 @@ private:
ParenthesesUnmatched,
ParenthesesTypeInvalid,
CharacterClassUnmatched,
+ CharacterClassInvalidRange,
CharacterClassOutOfOrder,
EscapeUnterminated,
NumberOfErrorCodes
@@ -77,7 +78,7 @@ private:
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
: m_delegate(delegate)
, m_err(err)
- , m_state(empty)
+ , m_state(Empty)
{
}
@@ -92,54 +93,60 @@ private:
}
/*
- * atomPatternCharacterUnescaped():
+ * atomPatternCharacter():
*
- * This method is called directly from parseCharacterClass(), to report a new
- * pattern character token. This method differs from atomPatternCharacter(),
- * which will be called from parseEscape(), since a hypen provided via this
- * method may be indicating a character range, but a hyphen parsed by
- * parseEscape() cannot be interpreted as doing so.
+ * This method is called either from parseCharacterClass() (for an unescaped
+ * character in a character class), or from parseEscape(). In the former case
+ * the value true will be passed for the argument 'hyphenIsRange', and in this
+ * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
+ * is different to /[a\-z]/).
*/
- void atomPatternCharacterUnescaped(UChar ch)
+ void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
{
switch (m_state) {
- case empty:
+ case AfterCharacterClass:
+ // Following a builtin character class we need look out for a hyphen.
+ // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+ // If we see a hyphen following a charater class then unlike usual
+ // we'll report it to the delegate immediately, and put ourself into
+ // a poisoned state. Any following calls to add another character or
+ // character class will result in an error. (A hypen following a
+ // character-class is itself valid, but only at the end of a regex).
+ if (hyphenIsRange && ch == '-') {
+ m_delegate.atomCharacterClassAtom('-');
+ m_state = AfterCharacterClassHyphen;
+ return;
+ }
+ // Otherwise just fall through - cached character so treat this as Empty.
+
+ case Empty:
m_character = ch;
- m_state = cachedCharacter;
- break;
+ m_state = CachedCharacter;
+ return;
- case cachedCharacter:
- if (ch == '-')
- m_state = cachedCharacterHyphen;
+ case CachedCharacter:
+ if (hyphenIsRange && ch == '-')
+ m_state = CachedCharacterHyphen;
else {
m_delegate.atomCharacterClassAtom(m_character);
m_character = ch;
}
- break;
+ return;
- case cachedCharacterHyphen:
- if (ch >= m_character)
- m_delegate.atomCharacterClassRange(m_character, ch);
- else
+ case CachedCharacterHyphen:
+ if (ch < m_character) {
m_err = CharacterClassOutOfOrder;
- m_state = empty;
- }
- }
-
- /*
- * atomPatternCharacter():
- *
- * Adds a pattern character, called by parseEscape(), as such will not
- * interpret a hyphen as indicating a character range.
- */
- void atomPatternCharacter(UChar ch)
- {
- // Flush if a character is already pending to prevent the
- // hyphen from begin interpreted as indicating a range.
- if((ch == '-') && (m_state == cachedCharacter))
- flush();
+ return;
+ }
+ m_delegate.atomCharacterClassRange(m_character, ch);
+ m_state = Empty;
+ return;
- atomPatternCharacterUnescaped(ch);
+ case AfterCharacterClassHyphen:
+ // Error! We have something like /[\d-x]/.
+ m_err = CharacterClassInvalidRange;
+ return;
+ }
}
/*
@@ -149,8 +156,25 @@ private:
*/
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
{
- flush();
- m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ switch (m_state) {
+ case CachedCharacter:
+ // Flush the currently cached character, then fall through.
+ m_delegate.atomCharacterClassAtom(m_character);
+
+ case Empty:
+ case AfterCharacterClass:
+ m_state = AfterCharacterClass;
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ return;
+
+ case CachedCharacterHyphen:
+ case AfterCharacterClassHyphen:
+ // Error! If we hit either of these cases, we have an
+ // invalid range that looks something like /[x-\d]/
+ // or /[\d-\d]/.
+ m_err = CharacterClassInvalidRange;
+ return;
+ }
}
/*
@@ -160,7 +184,12 @@ private:
*/
void end()
{
- flush();
+ if (m_state == CachedCharacter)
+ m_delegate.atomCharacterClassAtom(m_character);
+ else if (m_state == CachedCharacterHyphen) {
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_delegate.atomCharacterClassAtom('-');
+ }
m_delegate.atomCharacterClassEnd();
}
@@ -170,21 +199,14 @@ private:
void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
private:
- void flush()
- {
- if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
- m_delegate.atomCharacterClassAtom(m_character);
- if (m_state == cachedCharacterHyphen)
- m_delegate.atomCharacterClassAtom('-');
- m_state = empty;
- }
-
Delegate& m_delegate;
ErrorCode& m_err;
enum CharacterClassConstructionState {
- empty,
- cachedCharacter,
- cachedCharacterHyphen,
+ Empty,
+ CachedCharacter,
+ CachedCharacterHyphen,
+ AfterCharacterClass,
+ AfterCharacterClassHyphen,
} m_state;
UChar m_character;
};
@@ -430,7 +452,7 @@ private:
break;
default:
- characterClassConstructor.atomPatternCharacterUnescaped(consume());
+ characterClassConstructor.atomPatternCharacter(consume(), true);
}
if (m_err)
@@ -574,13 +596,13 @@ private:
case '*':
consume();
- parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
+ parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
lastTokenWasAnAtom = false;
break;
case '+':
consume();
- parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
+ parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
lastTokenWasAnAtom = false;
break;
@@ -599,7 +621,7 @@ private:
unsigned max = min;
if (tryConsume(','))
- max = peekIsDigit() ? consumeNumber() : UINT_MAX;
+ max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
if (tryConsume('}')) {
if (min <= max)
@@ -659,6 +681,7 @@ private:
"unmatched parentheses",
"unrecognized character after (?",
"missing terminating ] for character class",
+ "invalid range in character class",
"range out of order in character class",
"\\ at end of pattern"
};
@@ -840,13 +863,11 @@ private:
*/
template<class Delegate>
-const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
+const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
{
return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
}
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexParser_h
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index be31fcd..8a7d35b 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -27,13 +27,9 @@
#ifndef RegexPattern_h
#define RegexPattern_h
-
-#if ENABLE(YARR)
-
#include <wtf/Vector.h>
#include <wtf/unicode/Unicode.h>
-
namespace JSC { namespace Yarr {
#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
@@ -42,6 +38,7 @@ namespace JSC { namespace Yarr {
#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
+#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1
#define RegexStackSpaceForBackTrackInfoParentheses 4
struct PatternDisjunction;
@@ -110,12 +107,13 @@ struct PatternTerm {
union {
UChar patternCharacter;
CharacterClass* characterClass;
- unsigned subpatternId;
+ unsigned backReferenceSubpatternId;
struct {
PatternDisjunction* disjunction;
unsigned subpatternId;
unsigned lastSubpatternId;
bool isCopy;
+ bool isTerminal;
} parentheses;
};
QuantifierType quantityType;
@@ -147,6 +145,7 @@ struct PatternTerm {
parentheses.disjunction = disjunction;
parentheses.subpatternId = subpatternId;
parentheses.isCopy = false;
+ parentheses.isTerminal = false;
quantityType = QuantifierFixedCount;
quantityCount = 1;
}
@@ -163,7 +162,7 @@ struct PatternTerm {
: type(TypeBackReference)
, invertOrCapture(false)
{
- subpatternId = spatternId;
+ backReferenceSubpatternId = spatternId;
quantityType = QuantifierFixedCount;
quantityCount = 1;
}
@@ -429,6 +428,4 @@ private:
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexPattern_h