diff options
Diffstat (limited to 'JavaScriptCore/yarr')
| -rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.cpp | 54 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.h | 4 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.cpp | 287 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.h | 7 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexJIT.cpp | 103 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexJIT.h | 30 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexParser.h | 141 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexPattern.h | 13 |
8 files changed, 407 insertions, 232 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp index ccfc94e..e40c791 100644 --- a/JavaScriptCore/yarr/RegexCompiler.cpp +++ b/JavaScriptCore/yarr/RegexCompiler.cpp @@ -31,8 +31,6 @@ #include "RegexPattern.h" #include <wtf/Vector.h> -#if ENABLE(YARR) - using namespace WTF; namespace JSC { namespace Yarr { @@ -522,7 +520,7 @@ public: PatternTerm& term = currentAlternative->lastTerm(); ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); - if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.parentheses.subpatternId)) { m_alternative->m_terms.append(PatternTerm::ForwardReference()); return; } @@ -597,7 +595,7 @@ public: term.quantify(min, QuantifierFixedCount); m_alternative->m_terms.append(copyTerm(term)); // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... - m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) m_alternative->lastTerm().parentheses.isCopy = true; } @@ -669,14 +667,17 @@ public: case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - currentInputPosition += term.parentheses.disjunction->m_minimumSize; - } else { + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - } + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition; + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); term.inputPosition = currentInputPosition; } else { term.inputPosition = currentInputPosition; @@ -730,6 +731,34 @@ public: setupDisjunctionOffsets(m_pattern.m_body, 0, 0); } + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * where the parens are at the end of the regular expression (last term in any of the + // alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (size_t i = 0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == quantifyInfinite + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + void optimizeBOL() { // Look for expressions containing beginning of line (^) anchoring and unroll them. @@ -932,6 +961,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern) ASSERT(numSubpatterns == pattern.m_numSubpatterns); } + constructor.checkForTerminalParentheses(); constructor.optimizeBOL(); constructor.setupOffsets(); @@ -942,5 +972,3 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern) } } - -#endif diff --git a/JavaScriptCore/yarr/RegexCompiler.h b/JavaScriptCore/yarr/RegexCompiler.h index 9d2443a..399374e 100644 --- a/JavaScriptCore/yarr/RegexCompiler.h +++ b/JavaScriptCore/yarr/RegexCompiler.h @@ -26,8 +26,6 @@ #ifndef RegexCompiler_h #define RegexCompiler_h -#if ENABLE(YARR) - #include "RegexParser.h" #include "RegexPattern.h" #include <wtf/unicode/Unicode.h> @@ -38,6 +36,4 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern); } } // namespace JSC::Yarr -#endif - #endif // RegexCompiler_h diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp index dc3024a..a51cd25 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.cpp +++ b/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -35,8 +35,6 @@ #include <stdio.h> #endif -#if ENABLE(YARR) - using namespace WTF; namespace JSC { namespace Yarr { @@ -62,7 +60,10 @@ public: uintptr_t begin; }; struct BackTrackInfoParenthesesOnce { - uintptr_t inParentheses; + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; }; struct BackTrackInfoParentheses { uintptr_t matchAmount; @@ -514,8 +515,7 @@ public: if (matchEnd == -1) return true; - ASSERT((matchBegin == -1) == (matchEnd == -1)); - ASSERT(matchBegin <= matchEnd); + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); if (matchBegin == matchEnd) return true; @@ -557,8 +557,7 @@ public: int matchBegin = output[(term.atom.subpatternId << 1)]; int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) == (matchEnd == -1)); - ASSERT(matchBegin <= matchEnd); + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); if (matchBegin == matchEnd) return false; @@ -633,11 +632,11 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: { // set this speculatively; if we get to the parens end this will be true. - backTrack->inParentheses = 1; + backTrack->begin = input.getPos(); break; } case QuantifierNonGreedy: { - backTrack->inParentheses = 0; + backTrack->begin = notFound; context->term += term.atom.parenthesesWidth; return true; } @@ -653,7 +652,7 @@ public: return true; } - bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); ASSERT(term.atom.quantityCount == 1); @@ -662,7 +661,12 @@ public: unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; } - return true; + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); } bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) @@ -681,12 +685,12 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: // if we backtrack to this point, there is another chance - try matching nothing. - ASSERT(backTrack->inParentheses); - backTrack->inParentheses = 0; + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; context->term += term.atom.parenthesesWidth; return true; case QuantifierNonGreedy: - ASSERT(backTrack->inParentheses); + ASSERT(backTrack->begin != notFound); case QuantifierFixedCount: break; } @@ -703,15 +707,19 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: - if (!backTrack->inParentheses) { + if (backTrack->begin == notFound) { context->term -= term.atom.parenthesesWidth; return false; } case QuantifierNonGreedy: - if (!backTrack->inParentheses) { - // now try to match the parens; set this speculatively. - backTrack->inParentheses = 1; + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); unsigned subpatternId = term.atom.subpatternId; output[subpatternId << 1] = input.getPos() + term.inputPosition; } @@ -725,6 +733,53 @@ public: return false; } + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never backtrack to here. + ASSERT_NOT_REACHED(); + return false; + } + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); @@ -1173,6 +1228,14 @@ public: if (matchParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (matchParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1286,6 +1349,14 @@ public: if (backtrackParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (backtrackParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1341,9 +1412,8 @@ public: pattern->m_allocator->stopAllocator(); - if (output[0] == -1 && result != JSRegExpNoMatch) - return result; - + // RegExp.cpp currently expects all error to be converted to -1. + ASSERT((result == JSRegExpMatch) == (output[0] != -1)); return output[0]; } @@ -1448,8 +1518,38 @@ public: m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); @@ -1474,6 +1574,28 @@ public: m_currentAlternativeIndex = beginTerm + 1; } + void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + unsigned popParenthesesStack() { ASSERT(m_parenthesesStack.size()); @@ -1545,50 +1667,79 @@ public: m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } - void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool invertOrCapture = parenthesesBegin.invertOrCapture; + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); unsigned endTerm = m_bodyDisjunction->terms.size(); - bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - if (doInline) { - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } else { - ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; - ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - bool invertOrCapture = parenthesesBegin.invertOrCapture; - unsigned subpatternId = parenthesesBegin.atom.subpatternId; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } - unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) - parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - m_bodyDisjunction->terms.shrink(beginTerm); + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - m_allParenthesesInfo.append(parenthesesDisjunction); - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) @@ -1675,7 +1826,7 @@ public: break; case PatternTerm::TypeBackReference: - atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeForwardReference: @@ -1683,24 +1834,27 @@ public: case PatternTerm::TypeParenthesesSubpattern: { unsigned disjunctionAlreadyCheckedCount = 0; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } + else + alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); } else { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } break; } @@ -1713,7 +1867,7 @@ public: atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); break; } } @@ -1742,6 +1896,11 @@ PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsig return ByteCompiler(pattern).compile(allocator); } +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); +} + int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) { return Interpreter(regex, output, input, start, length).interpret(); @@ -1758,5 +1917,3 @@ COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpace } } - -#endif diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h index dae8f9d..2e23472 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.h +++ b/JavaScriptCore/yarr/RegexInterpreter.h @@ -26,8 +26,6 @@ #ifndef RegexInterpreter_h #define RegexInterpreter_h -#if ENABLE(YARR) - #include "RegexParser.h" #include "RegexPattern.h" #include <wtf/PassOwnPtr.h> @@ -83,6 +81,8 @@ struct ByteTerm { TypeParenthesesSubpattern, TypeParenthesesSubpatternOnceBegin, TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, TypeParentheticalAssertionBegin, TypeParentheticalAssertionEnd, TypeCheckInput, @@ -365,10 +365,9 @@ private: }; PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator*, bool ignoreCase = false, bool multiline = false); +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*); int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); } } // namespace JSC::Yarr -#endif - #endif // RegexInterpreter_h diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp index 1b80464..1eac667 100644 --- a/JavaScriptCore/yarr/RegexJIT.cpp +++ b/JavaScriptCore/yarr/RegexJIT.cpp @@ -31,8 +31,7 @@ #include "LinkBuffer.h" #include "MacroAssembler.h" #include "RegexCompiler.h" - -#include "pcre.h" // temporary, remove when fallback is removed. +#include "RegexInterpreter.h" // temporary, remove when fallback is removed. #if ENABLE(YARR_JIT) @@ -625,7 +624,7 @@ class RegexGenerator : private MacroAssembler { add32(Imm32(1), countRegister); add32(Imm32(1), index); - if (term.quantityCount != 0xffffffff) { + if (term.quantityCount != quantifyInfinite) { branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); failures.append(jump()); } else @@ -663,7 +662,7 @@ class RegexGenerator : private MacroAssembler { loadFromFrame(term.frameLocation, countRegister); atEndOfInput().linkTo(hardFail, this); - if (term.quantityCount != 0xffffffff) + if (term.quantityCount != quantifyInfinite) branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { readCharacter(state.inputOffset(), character); @@ -750,7 +749,7 @@ class RegexGenerator : private MacroAssembler { add32(Imm32(1), countRegister); add32(Imm32(1), index); - if (term.quantityCount != 0xffffffff) { + if (term.quantityCount != quantifyInfinite) { branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); failures.append(jump()); } else @@ -911,12 +910,7 @@ class RegexGenerator : private MacroAssembler { PatternDisjunction* disjunction = term.parentheses.disjunction; ASSERT(term.quantityCount == 1); - if (term.parentheses.isCopy) { - m_shouldFallBack = true; - return; - } - - unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0; + unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0; unsigned parenthesesFrameLocation = term.frameLocation; unsigned alternativeFrameLocation = parenthesesFrameLocation; @@ -935,12 +929,12 @@ class RegexGenerator : private MacroAssembler { Jump nonGreedySkipParentheses; Label nonGreedyTryParentheses; if (term.quantityType == QuantifierGreedy) - storeToFrame(Imm32(1), parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation); else if (term.quantityType == QuantifierNonGreedy) { - storeToFrame(Imm32(0), parenthesesFrameLocation); + storeToFrame(Imm32(-1), parenthesesFrameLocation); nonGreedySkipParentheses = jump(); nonGreedyTryParentheses = label(); - storeToFrame(Imm32(1), parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation); } // store the match start index @@ -958,29 +952,21 @@ class RegexGenerator : private MacroAssembler { TermGenerationState parenthesesState(disjunction, state.checkedTotal); generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - // store the match end index - if (term.invertOrCapture) { - int inputOffset = state.inputOffset(); - if (inputOffset) { - move(index, indexTemporary); - add32(Imm32(state.inputOffset()), indexTemporary); - store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } else - store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } - Jump success = jump(); + Jump success = (term.quantityType == QuantifierFixedCount) ? + jump() : + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*)))); // A failure AFTER the parens jumps here Label backtrackFromAfterParens(this); if (term.quantityType == QuantifierGreedy) { - // If this is zero we have now tested with both with and without the parens. + // If this is -1 we have now tested with both with and without the parens. loadFromFrame(parenthesesFrameLocation, indexTemporary); - state.jumpToBacktrack(branchTest32(Zero, indexTemporary), this); + state.jumpToBacktrack(branch32(Equal, indexTemporary, Imm32(-1)), this); } else if (term.quantityType == QuantifierNonGreedy) { - // If this is zero we have now tested with both with and without the parens. + // If this is -1 we have now tested without the parens, now test with. loadFromFrame(parenthesesFrameLocation, indexTemporary); - branchTest32(Zero, indexTemporary).linkTo(nonGreedyTryParentheses, this); + branch32(Equal, indexTemporary, Imm32(-1)).linkTo(nonGreedyTryParentheses, this); } parenthesesState.plantJumpToBacktrackIfExists(this); @@ -990,7 +976,7 @@ class RegexGenerator : private MacroAssembler { store32(Imm32(-1), Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); if (term.quantityType == QuantifierGreedy) - storeToFrame(Imm32(0), parenthesesFrameLocation); + storeToFrame(Imm32(-1), parenthesesFrameLocation); else state.jumpToBacktrack(jump(), this); @@ -998,6 +984,17 @@ class RegexGenerator : private MacroAssembler { if (term.quantityType == QuantifierNonGreedy) nonGreedySkipParentheses.link(this); success.link(this); + + // store the match end index + if (term.invertOrCapture) { + int inputOffset = state.inputOffset(); + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(state.inputOffset()), indexTemporary); + store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } else + store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } } } @@ -1008,25 +1005,6 @@ class RegexGenerator : private MacroAssembler { ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern); ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle. - // Capturing not yet implemented! - if (parenthesesTerm.invertOrCapture) { - m_shouldFallBack = true; - return; - } - - // Quantification limit not yet implemented! - if (parenthesesTerm.quantityCount != 0xffffffff) { - m_shouldFallBack = true; - return; - } - - // Need to reset nested subpatterns between iterations... - // for the minute this crude check rejects all patterns with any subpatterns! - if (m_pattern.m_numSubpatterns) { - m_shouldFallBack = true; - return; - } - TermGenerationState parenthesesState(disjunction, state.checkedTotal); Label matchAgain(this); @@ -1048,7 +1026,11 @@ class RegexGenerator : private MacroAssembler { generateTerm(parenthesesState); // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet. - branch32(GreaterThan, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); + + // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack, + // or fall through to try the next alternative if no backtrack is available. + parenthesesState.plantJumpToBacktrackIfExists(this); parenthesesState.linkAlternativeBacktracks(this); // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop. @@ -1181,17 +1163,12 @@ class RegexGenerator : private MacroAssembler { break; case PatternTerm::TypeParenthesesSubpattern: - if (term.quantityCount == 1) { + if (term.quantityCount == 1 && !term.parentheses.isCopy) generateParenthesesSingle(state); - break; - } else if (state.isLastTerm() && state.isMainDisjunction()) { // Is this is the last term of the main disjunction? - // If this has a greedy quantifier, then it will never need to backtrack! - if (term.quantityType == QuantifierGreedy) { - generateParenthesesGreedyNoBacktrack(state); - break; - } - } - m_shouldFallBack = true; + else if (term.parentheses.isTerminal) + generateParenthesesGreedyNoBacktrack(state); + else + m_shouldFallBack = true; break; case PatternTerm::TypeParentheticalAssertion: @@ -1532,7 +1509,7 @@ private: Vector<AlternativeBacktrackRecord> m_backtrackRecords; }; -void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline) { RegexPattern pattern(ignoreCase, multiline); if ((error = compileRegex(patternString, pattern))) @@ -1546,9 +1523,7 @@ void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const return; } - JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase; - JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine; - jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.characters()), patternString.length(), ignoreCaseOption, multilineOption, &numSubpatterns, &error)); + jitObject.setFallback(byteCompileRegex(pattern, allocator)); } }} diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h index 02e905d..c4c382c 100644 --- a/JavaScriptCore/yarr/RegexJIT.h +++ b/JavaScriptCore/yarr/RegexJIT.h @@ -29,12 +29,10 @@ #if ENABLE(YARR_JIT) #include "MacroAssembler.h" +#include "RegexInterpreter.h" // temporary, remove when fallback is removed. #include "RegexPattern.h" #include "UString.h" -#include "pcre.h" -struct JSRegExp; // temporary, remove when fallback is removed. - #if CPU(X86) && !COMPILER(MSVC) #define YARR_CALL __attribute__ ((regparm (3))) #else @@ -53,18 +51,21 @@ class RegexCodeBlock { public: RegexCodeBlock() - : m_fallback(0) + : m_needFallback(false) { } ~RegexCodeBlock() { - if (m_fallback) - jsRegExpFree(m_fallback); } - JSRegExp* getFallback() { return m_fallback; } - void setFallback(JSRegExp* fallback) { m_fallback = fallback; } + BytecodePattern* getFallback() { return m_fallback.get(); } + bool isFallback() { return m_needFallback; } + void setFallback(PassOwnPtr<BytecodePattern> fallback) + { + m_fallback = fallback; + m_needFallback = true; + } bool operator!() { return (!m_ref.m_code.executableAddress() && !m_fallback); } void set(MacroAssembler::CodeRef ref) { m_ref = ref; } @@ -73,22 +74,23 @@ public: { return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output); } - + #if ENABLE(REGEXP_TRACING) void *getAddr() { return m_ref.m_code.executableAddress(); } #endif private: MacroAssembler::CodeRef m_ref; - JSRegExp* m_fallback; + OwnPtr<Yarr::BytecodePattern> m_fallback; + bool m_needFallback; }; -void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase = false, bool multiline = false); -inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize) +inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) { - if (JSRegExp* fallback = jitObject.getFallback()) - return (jsRegExpExecute(fallback, input, length, start, output, outputArraySize) < 0) ? -1 : output[0]; + if (jitObject.isFallback()) + return (interpretRegex(jitObject.getFallback(), input, start, length, output)); return jitObject.execute(input, start, length, output); } diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h index ede9417..8392cdf 100644 --- a/JavaScriptCore/yarr/RegexParser.h +++ b/JavaScriptCore/yarr/RegexParser.h @@ -26,8 +26,6 @@ #ifndef RegexParser_h #define RegexParser_h -#if ENABLE(YARR) - #include "UString.h" #include <limits.h> #include <wtf/ASCIICType.h> @@ -35,6 +33,8 @@ namespace JSC { namespace Yarr { +static const unsigned quantifyInfinite = UINT_MAX; + enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, @@ -58,6 +58,7 @@ private: ParenthesesUnmatched, ParenthesesTypeInvalid, CharacterClassUnmatched, + CharacterClassInvalidRange, CharacterClassOutOfOrder, EscapeUnterminated, NumberOfErrorCodes @@ -77,7 +78,7 @@ private: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) , m_err(err) - , m_state(empty) + , m_state(Empty) { } @@ -92,54 +93,60 @@ private: } /* - * atomPatternCharacterUnescaped(): + * atomPatternCharacter(): * - * This method is called directly from parseCharacterClass(), to report a new - * pattern character token. This method differs from atomPatternCharacter(), - * which will be called from parseEscape(), since a hypen provided via this - * method may be indicating a character range, but a hyphen parsed by - * parseEscape() cannot be interpreted as doing so. + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). */ - void atomPatternCharacterUnescaped(UChar ch) + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) { switch (m_state) { - case empty: + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: m_character = ch; - m_state = cachedCharacter; - break; + m_state = CachedCharacter; + return; - case cachedCharacter: - if (ch == '-') - m_state = cachedCharacterHyphen; + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; else { m_delegate.atomCharacterClassAtom(m_character); m_character = ch; } - break; + return; - case cachedCharacterHyphen: - if (ch >= m_character) - m_delegate.atomCharacterClassRange(m_character, ch); - else + case CachedCharacterHyphen: + if (ch < m_character) { m_err = CharacterClassOutOfOrder; - m_state = empty; - } - } - - /* - * atomPatternCharacter(): - * - * Adds a pattern character, called by parseEscape(), as such will not - * interpret a hyphen as indicating a character range. - */ - void atomPatternCharacter(UChar ch) - { - // Flush if a character is already pending to prevent the - // hyphen from begin interpreted as indicating a range. - if((ch == '-') && (m_state == cachedCharacter)) - flush(); + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; - atomPatternCharacterUnescaped(ch); + case AfterCharacterClassHyphen: + // Error! We have something like /[\d-x]/. + m_err = CharacterClassInvalidRange; + return; + } } /* @@ -149,8 +156,25 @@ private: */ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { - flush(); - m_delegate.atomCharacterClassBuiltIn(classID, invert); + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + case CachedCharacterHyphen: + case AfterCharacterClassHyphen: + // Error! If we hit either of these cases, we have an + // invalid range that looks something like /[x-\d]/ + // or /[\d-\d]/. + m_err = CharacterClassInvalidRange; + return; + } } /* @@ -160,7 +184,12 @@ private: */ void end() { - flush(); + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } m_delegate.atomCharacterClassEnd(); } @@ -170,21 +199,14 @@ private: void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } private: - void flush() - { - if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen - m_delegate.atomCharacterClassAtom(m_character); - if (m_state == cachedCharacterHyphen) - m_delegate.atomCharacterClassAtom('-'); - m_state = empty; - } - Delegate& m_delegate; ErrorCode& m_err; enum CharacterClassConstructionState { - empty, - cachedCharacter, - cachedCharacterHyphen, + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, } m_state; UChar m_character; }; @@ -430,7 +452,7 @@ private: break; default: - characterClassConstructor.atomPatternCharacterUnescaped(consume()); + characterClassConstructor.atomPatternCharacter(consume(), true); } if (m_err) @@ -574,13 +596,13 @@ private: case '*': consume(); - parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); lastTokenWasAnAtom = false; break; case '+': consume(); - parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); lastTokenWasAnAtom = false; break; @@ -599,7 +621,7 @@ private: unsigned max = min; if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : UINT_MAX; + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; if (tryConsume('}')) { if (min <= max) @@ -659,6 +681,7 @@ private: "unmatched parentheses", "unrecognized character after (?", "missing terminating ] for character class", + "invalid range in character class", "range out of order in character class", "\\ at end of pattern" }; @@ -840,13 +863,11 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) { return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); } } } // namespace JSC::Yarr -#endif - #endif // RegexParser_h diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h index be31fcd..8a7d35b 100644 --- a/JavaScriptCore/yarr/RegexPattern.h +++ b/JavaScriptCore/yarr/RegexPattern.h @@ -27,13 +27,9 @@ #ifndef RegexPattern_h #define RegexPattern_h - -#if ENABLE(YARR) - #include <wtf/Vector.h> #include <wtf/unicode/Unicode.h> - namespace JSC { namespace Yarr { #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. @@ -42,6 +38,7 @@ namespace JSC { namespace Yarr { #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 #define RegexStackSpaceForBackTrackInfoParentheses 4 struct PatternDisjunction; @@ -110,12 +107,13 @@ struct PatternTerm { union { UChar patternCharacter; CharacterClass* characterClass; - unsigned subpatternId; + unsigned backReferenceSubpatternId; struct { PatternDisjunction* disjunction; unsigned subpatternId; unsigned lastSubpatternId; bool isCopy; + bool isTerminal; } parentheses; }; QuantifierType quantityType; @@ -147,6 +145,7 @@ struct PatternTerm { parentheses.disjunction = disjunction; parentheses.subpatternId = subpatternId; parentheses.isCopy = false; + parentheses.isTerminal = false; quantityType = QuantifierFixedCount; quantityCount = 1; } @@ -163,7 +162,7 @@ struct PatternTerm { : type(TypeBackReference) , invertOrCapture(false) { - subpatternId = spatternId; + backReferenceSubpatternId = spatternId; quantityType = QuantifierFixedCount; quantityCount = 1; } @@ -429,6 +428,4 @@ private: } } // namespace JSC::Yarr -#endif - #endif // RegexPattern_h |
