summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.cpp469
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.h8
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.cpp635
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.h62
-rw-r--r--JavaScriptCore/yarr/RegexJIT.cpp467
-rw-r--r--JavaScriptCore/yarr/RegexJIT.h38
-rw-r--r--JavaScriptCore/yarr/RegexParser.h14
-rw-r--r--JavaScriptCore/yarr/RegexPattern.h95
8 files changed, 1274 insertions, 514 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index 9cd3d12..06ecbad 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,12 +31,12 @@
#include "RegexPattern.h"
#include <wtf/Vector.h>
-#if ENABLE(YARR)
-
using namespace WTF;
namespace JSC { namespace Yarr {
+#include "RegExpJitTables.h"
+
class CharacterClassConstructor {
public:
CharacterClassConstructor(bool isCaseInsensitive = false)
@@ -141,7 +142,7 @@ public:
CharacterClass* charClass()
{
- CharacterClass* characterClass = new CharacterClass();
+ CharacterClass* characterClass = new CharacterClass(0);
characterClass->m_matches.append(m_matches);
characterClass->m_ranges.append(m_ranges);
@@ -233,110 +234,118 @@ private:
Vector<CharacterRange> m_rangesUnicode;
};
+struct BeginCharHelper {
+ BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
+ : m_beginChars(beginChars)
+ , m_isCaseInsensitive(isCaseInsensitive)
+ {}
+
+ void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
+ {
+ if (quantityType == QuantifierFixedCount && quantityCount > 1) {
+ // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
+ beginChar.value |= beginChar.value << 16;
+ beginChar.mask |= beginChar.mask << 16;
+ addCharacter(beginChar);
+ } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
+ // In case of characters with fixed quantifier we should check the next character as well.
+ linkHotTerms(beginChar, hotTerms);
+ else
+ // In case of greedy matching the next character checking is unnecessary therefore we just store
+ // the first character.
+ addCharacter(beginChar);
+ }
-CharacterClass* newlineCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
+ // Merge two following BeginChars in the vector to reduce the number of character checks.
+ void merge(unsigned size)
+ {
+ for (unsigned i = 0; i < size; i++) {
+ BeginChar* curr = &m_beginChars->at(i);
+ BeginChar* next = &m_beginChars->at(i + 1);
- characterClass->m_matches.append('\n');
- characterClass->m_matches.append('\r');
- characterClass->m_matchesUnicode.append(0x2028);
- characterClass->m_matchesUnicode.append(0x2029);
-
- return characterClass;
-}
+ // If the current and the next size of value is different we should skip the merge process
+ // because the 16bit and 32bit values are unmergable.
+ if (curr->value <= 0xFFFF && next->value > 0xFFFF)
+ continue;
-CharacterClass* digitsCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
+ unsigned diff = curr->value ^ next->value;
- characterClass->m_ranges.append(CharacterRange('0', '9'));
-
- return characterClass;
-}
+ curr->mask |= diff;
+ curr->value |= curr->mask;
-CharacterClass* spacesCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
-
- characterClass->m_matches.append(' ');
- characterClass->m_ranges.append(CharacterRange('\t', '\r'));
- characterClass->m_matchesUnicode.append(0x00a0);
- characterClass->m_matchesUnicode.append(0x1680);
- characterClass->m_matchesUnicode.append(0x180e);
- characterClass->m_matchesUnicode.append(0x2028);
- characterClass->m_matchesUnicode.append(0x2029);
- characterClass->m_matchesUnicode.append(0x202f);
- characterClass->m_matchesUnicode.append(0x205f);
- characterClass->m_matchesUnicode.append(0x3000);
- characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
-
- return characterClass;
-}
+ m_beginChars->remove(i + 1);
+ size--;
+ }
+ }
-CharacterClass* wordcharCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
+private:
+ void addCharacter(BeginChar beginChar)
+ {
+ unsigned pos = 0;
+ unsigned range = m_beginChars->size();
- characterClass->m_matches.append('_');
- characterClass->m_ranges.append(CharacterRange('0', '9'));
- characterClass->m_ranges.append(CharacterRange('A', 'Z'));
- characterClass->m_ranges.append(CharacterRange('a', 'z'));
-
- return characterClass;
-}
+ // binary chop, find position to insert char.
+ while (range) {
+ unsigned index = range >> 1;
-CharacterClass* nondigitsCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
+ int val = m_beginChars->at(pos+index).value - beginChar.value;
+ if (!val)
+ return;
+ if (val < 0)
+ range = index;
+ else {
+ pos += (index+1);
+ range -= (index+1);
+ }
+ }
- characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
- characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
- characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
-
- return characterClass;
-}
+ if (pos == m_beginChars->size())
+ m_beginChars->append(beginChar);
+ else
+ m_beginChars->insert(pos, beginChar);
+ }
-CharacterClass* nonspacesCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
-
- characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
- characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
- characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
- characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
- characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
- characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
- characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
- characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
- characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
- characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
- characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
- characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
-
- return characterClass;
-}
+ // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
+ void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
+ {
+ for (unsigned i = 0; i < hotTerms->size(); i++) {
+ PatternTerm hotTerm = hotTerms->at(i).term;
+ ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
-CharacterClass* nonwordcharCreate()
-{
- CharacterClass* characterClass = new CharacterClass();
+ UChar characterNext = hotTerm.patternCharacter;
- characterClass->m_matches.append('`');
- characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
- characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
- characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
- characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
- characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
+ // Append a character to an existing BeginChar object.
+ if (characterNext <= 0x7f) {
+ unsigned mask = 0;
- return characterClass;
-}
+ if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
+ mask = 32;
+ characterNext = toASCIILower(characterNext);
+ }
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
+ } else {
+ UChar upper, lower;
+ if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
+ addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
+ addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
+ } else
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
+ }
+ }
+ }
+
+ Vector<BeginChar>* m_beginChars;
+ bool m_isCaseInsensitive;
+};
class RegexPatternConstructor {
public:
RegexPatternConstructor(RegexPattern& pattern)
: m_pattern(pattern)
, m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
+ , m_invertParentheticalAssertion(false)
{
}
@@ -352,6 +361,11 @@ public:
void assertionBOL()
{
+ if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
+ m_alternative->m_startsWithBOL = true;
+ m_alternative->m_containsBOL = true;
+ m_pattern.m_containsBOL = true;
+ }
m_alternative->m_terms.append(PatternTerm::BOL());
}
void assertionEOL()
@@ -455,20 +469,42 @@ public:
m_pattern.m_disjunctions.append(parenthesesDisjunction);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
+ m_invertParentheticalAssertion = invert;
}
void atomParenthesesEnd()
{
ASSERT(m_alternative->m_parent);
ASSERT(m_alternative->m_parent->m_parent);
+
+ PatternDisjunction* parenthesisDisjunction = m_alternative->m_parent;
m_alternative = m_alternative->m_parent->m_parent;
+
+ PatternTerm& lastTerm = m_alternative->lastTerm();
+
+ unsigned numParenAlternatives = parenthesisDisjunction->m_alternatives.size();
+ unsigned numBOLAnchoredAlts = 0;
+ // Bubble up BOL flags
+ for (unsigned i = 0; i < numParenAlternatives; i++) {
+ if (parenthesisDisjunction->m_alternatives[i]->m_startsWithBOL)
+ numBOLAnchoredAlts++;
+ }
- m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+ if (numBOLAnchoredAlts) {
+ m_alternative->m_containsBOL = true;
+ // If all the alternatives in parens start with BOL, then so does this one
+ if (numBOLAnchoredAlts == numParenAlternatives)
+ m_alternative->m_startsWithBOL = true;
+ }
+
+ lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+ m_invertParentheticalAssertion = false;
}
void atomBackReference(unsigned subpatternId)
{
ASSERT(subpatternId);
+ m_pattern.m_containsBackreferences = true;
m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
if (subpatternId > m_pattern.m_numSubpatterns) {
@@ -493,32 +529,39 @@ public:
m_alternative->m_terms.append(PatternTerm(subpatternId));
}
- PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
+ // deep copy the argument disjunction. If filterStartsWithBOL is true,
+ // skip alternatives with m_startsWithBOL set true.
+ PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
- PatternDisjunction* newDisjunction = new PatternDisjunction();
-
- newDisjunction->m_parent = disjunction->m_parent;
+ PatternDisjunction* newDisjunction = 0;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt];
- PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
- for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
- newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
+ if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
+ if (!newDisjunction) {
+ newDisjunction = new PatternDisjunction();
+ newDisjunction->m_parent = disjunction->m_parent;
+ }
+ PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
+ newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
+ }
}
-
- m_pattern.m_disjunctions.append(newDisjunction);
+
+ if (newDisjunction)
+ m_pattern.m_disjunctions.append(newDisjunction);
return newDisjunction;
}
-
- PatternTerm copyTerm(PatternTerm& term)
+
+ PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
{
if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
return PatternTerm(term);
-
+
PatternTerm termCopy = term;
- termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
+ termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
return termCopy;
}
-
+
void quantifyAtom(unsigned min, unsigned max, bool greedy)
{
ASSERT(min <= max);
@@ -624,14 +667,17 @@ public:
case PatternTerm::TypeParenthesesSubpattern:
// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
term.frameLocation = currentCallFrameSize;
- if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
- if (term.quantityType == QuantifierFixedCount) {
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
- currentInputPosition += term.parentheses.disjunction->m_minimumSize;
- } else {
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ if (term.quantityType != QuantifierFixedCount)
currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
- }
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
+ // If quantity is fixed, then pre-check its minimum size.
+ if (term.quantityType == QuantifierFixedCount)
+ currentInputPosition += term.parentheses.disjunction->m_minimumSize;
+ term.inputPosition = currentInputPosition;
+ } else if (term.parentheses.isTerminal) {
+ currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
term.inputPosition = currentInputPosition;
} else {
term.inputPosition = currentInputPosition;
@@ -685,11 +731,208 @@ public:
setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
}
+ // This optimization identifies sets of parentheses that we will never need to backtrack.
+ // In these cases we do not need to store state from prior iterations.
+ // We can presently avoid backtracking for:
+ // * a set of parens at the end of the regular expression (last term in any of the alternatives of the main body disjunction).
+ // * where the parens are non-capturing, and quantified unbounded greedy (*).
+ // * where the parens do not contain any capturing subpatterns.
+ void checkForTerminalParentheses()
+ {
+ // This check is much too crude; should be just checking whether the candidate
+ // node contains nested capturing subpatterns, not the whole expression!
+ if (m_pattern.m_numSubpatterns)
+ return;
+
+ Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ for (unsigned i =0; i < alternatives.size(); ++i) {
+ Vector<PatternTerm>& terms = alternatives[i]->m_terms;
+ if (terms.size()) {
+ PatternTerm& term = terms.last();
+ if (term.type == PatternTerm::TypeParenthesesSubpattern
+ && term.quantityType == QuantifierGreedy
+ && term.quantityCount == UINT_MAX
+ && !term.capture())
+ term.parentheses.isTerminal = true;
+ }
+ }
+ }
+
+ void optimizeBOL()
+ {
+ // Look for expressions containing beginning of line (^) anchoring and unroll them.
+ // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
+ // This code relies on the parsing code tagging alternatives with m_containsBOL and
+ // m_startsWithBOL and rolling those up to containing alternatives.
+ // At this point, this is only valid for non-multiline expressions.
+ PatternDisjunction* disjunction = m_pattern.m_body;
+
+ if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+ return;
+
+ PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
+
+ // Set alternatives in disjunction to "onceThrough"
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives[alt]->setOnceThrough();
+
+ if (loopDisjunction) {
+ // Move alternatives from loopDisjunction to disjunction
+ for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
+
+ loopDisjunction->m_alternatives.clear();
+ }
+ }
+
+ bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
+ {
+ if (term.quantityType == QuantifierFixedCount) {
+ beginTerms->append(TermChain(term));
+ if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
+ setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
+ } else if (termIndex != numTerms - 1) {
+ beginTerms->append(TermChain(term));
+ return true;
+ }
+
+ return false;
+ }
+
+ // This function collects the terms which are potentially matching the first number of depth characters in the result.
+ // If this function returns false then it found at least one term which makes the beginning character
+ // look-up optimization inefficient.
+ bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+ if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
+ return false;
+ }
+
+ return true;
+ }
+
+ bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
+ {
+ bool checkNext = true;
+ unsigned numTerms = alternative->m_terms.size();
+
+ while (checkNext && termIndex < numTerms) {
+ PatternTerm term = alternative->m_terms[termIndex];
+ checkNext = false;
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ case PatternTerm::TypeAssertionEOL:
+ case PatternTerm::TypeAssertionWordBoundary:
+ return false;
+
+ case PatternTerm::TypeBackReference:
+ case PatternTerm::TypeForwardReference:
+ return false;
+
+ case PatternTerm::TypePatternCharacter:
+ if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
+ termIndex++;
+ checkNext = true;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ return false;
+
+ case PatternTerm::TypeParentheticalAssertion:
+ if (term.invertOrCapture)
+ return false;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ if (term.quantityType != QuantifierFixedCount) {
+ if (termIndex == numTerms - 1)
+ break;
+
+ termIndex++;
+ checkNext = true;
+
+ }
+
+ if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
+ return false;
+
+ break;
+ }
+ }
+
+ return true;
+ }
+
+ void setupBeginChars()
+ {
+ Vector<TermChain> beginTerms;
+ bool containsFixedCharacter = false;
+
+ if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
+ && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
+ unsigned size = beginTerms.size();
+
+ // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
+ if (!size)
+ return;
+
+ m_pattern.m_containsBeginChars = true;
+
+ for (unsigned i = 0; i < size; i++) {
+ PatternTerm term = beginTerms[i].term;
+
+ // We have just collected PatternCharacter terms, other terms are not allowed.
+ ASSERT(term.type == PatternTerm::TypePatternCharacter);
+
+ if (term.quantityType == QuantifierFixedCount)
+ containsFixedCharacter = true;
+
+ UChar character = term.patternCharacter;
+ unsigned mask = 0;
+
+ if (character <= 0x7f) {
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
+ mask = 32;
+ character = toASCIILower(character);
+ }
+
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else {
+ UChar upper, lower;
+ if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
+ m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ }
+ }
+
+ // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
+ if (!containsFixedCharacter) {
+ m_pattern.m_containsBeginChars = false;
+ return;
+ }
+
+ size = m_pattern.m_beginChars.size();
+
+ if (size > 2)
+ m_beginCharHelper.merge(size - 1);
+ else if (size <= 1)
+ m_pattern.m_containsBeginChars = false;
+ }
+ }
+
private:
RegexPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
+ BeginCharHelper m_beginCharHelper;
bool m_invertCharacterClass;
+ bool m_invertParentheticalAssertion;
};
@@ -717,12 +960,14 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
ASSERT(numSubpatterns == pattern.m_numSubpatterns);
}
+ constructor.checkForTerminalParentheses();
+ constructor.optimizeBOL();
+
constructor.setupOffsets();
+ constructor.setupBeginChars();
- return false;
+ return 0;
};
} }
-
-#endif
diff --git a/JavaScriptCore/yarr/RegexCompiler.h b/JavaScriptCore/yarr/RegexCompiler.h
index 3ed2be9..399374e 100644
--- a/JavaScriptCore/yarr/RegexCompiler.h
+++ b/JavaScriptCore/yarr/RegexCompiler.h
@@ -26,13 +26,9 @@
#ifndef RegexCompiler_h
#define RegexCompiler_h
-#include <wtf/Platform.h>
-
-#if ENABLE(YARR)
-
-#include <wtf/unicode/Unicode.h>
#include "RegexParser.h"
#include "RegexPattern.h"
+#include <wtf/unicode/Unicode.h>
namespace JSC { namespace Yarr {
@@ -40,6 +36,4 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern);
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexCompiler_h
diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp
index d088086..164158e 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.cpp
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,13 +29,12 @@
#include "RegexCompiler.h"
#include "RegexPattern.h"
+#include <wtf/BumpPointerAllocator.h>
#ifndef NDEBUG
#include <stdio.h>
#endif
-#if ENABLE(YARR)
-
using namespace WTF;
namespace JSC { namespace Yarr {
@@ -60,7 +60,10 @@ public:
uintptr_t begin;
};
struct BackTrackInfoParenthesesOnce {
- uintptr_t inParentheses;
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesTerminal {
+ uintptr_t begin;
};
struct BackTrackInfoParentheses {
uintptr_t matchAmount;
@@ -104,12 +107,16 @@ public:
DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
{
- return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
+ size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+ allocatorPool = allocatorPool->ensureCapacity(size);
+ if (!allocatorPool)
+ CRASH();
+ return new(allocatorPool->alloc(size)) DisjunctionContext();
}
void freeDisjunctionContext(DisjunctionContext* context)
{
- free(context);
+ allocatorPool = allocatorPool->dealloc(context);
}
struct ParenthesesDisjunctionContext
@@ -150,12 +157,16 @@ public:
ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
{
- return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
+ size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+ allocatorPool = allocatorPool->ensureCapacity(size);
+ if (!allocatorPool)
+ CRASH();
+ return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
}
void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
{
- free(context);
+ allocatorPool = allocatorPool->dealloc(context);
}
class InputStream {
@@ -186,6 +197,12 @@ public:
return -1;
}
+ int readPair()
+ {
+ ASSERT(pos + 1 < length);
+ return input[pos] | input[pos + 1] << 16;
+ }
+
int readChecked(int position)
{
ASSERT(position < 0);
@@ -253,6 +270,11 @@ public:
return (pos + position) == length;
}
+ bool isNotAvailableInput(int position)
+ {
+ return (pos + position) > length;
+ }
+
private:
const UChar* input;
unsigned pos;
@@ -280,20 +302,6 @@ public:
return false;
}
- bool tryConsumeCharacter(int testChar)
- {
- if (input.atEnd())
- return false;
-
- int ch = input.read();
-
- if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) {
- input.next();
- return true;
- }
- return false;
- }
-
bool checkCharacter(int testChar, int inputPosition)
{
return testChar == input.readChecked(inputPosition);
@@ -305,23 +313,6 @@ public:
return (loChar == ch) || (hiChar == ch);
}
- bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert)
- {
- if (input.atEnd())
- return false;
-
- bool match = testCharacterClass(characterClass, input.read());
-
- if (invert)
- match = !match;
-
- if (match) {
- input.next();
- return true;
- }
- return false;
- }
-
bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
{
bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
@@ -335,10 +326,24 @@ public:
if (!input.checkInput(matchSize))
return false;
- for (int i = 0; i < matchSize; ++i) {
- if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
- input.uncheckInput(matchSize);
- return false;
+ if (pattern->m_ignoreCase) {
+ for (int i = 0; i < matchSize; ++i) {
+ int ch = input.reread(matchBegin + i);
+
+ int lo = Unicode::toLower(ch);
+ int hi = Unicode::toUpper(ch);
+
+ if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ }
+ } else {
+ for (int i = 0; i < matchSize; ++i) {
+ if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
+ input.uncheckInput(matchSize);
+ return false;
+ }
}
}
@@ -503,6 +508,13 @@ public:
int matchBegin = output[(term.atom.subpatternId << 1)];
int matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
+ // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
+ // Eg.: /(a\1)/
+ if (matchEnd == -1)
+ return true;
+
ASSERT((matchBegin == -1) == (matchEnd == -1));
ASSERT(matchBegin <= matchEnd);
@@ -592,27 +604,24 @@ public:
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
context->restoreOutput(output, firstSubpatternId, count);
}
- void resetAssertionMatches(ByteTerm& term)
- {
- unsigned firstSubpatternId = term.atom.subpatternId;
- unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
- for (unsigned i = 0; i < (count << 1); ++i)
- output[(firstSubpatternId << 1) + i] = -1;
- }
- bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+ JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
{
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
- return true;
+ JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch)
+ return JSRegExpMatch;
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
- return false;
+ return JSRegExpNoMatch;
}
bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
@@ -625,11 +634,11 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy: {
// set this speculatively; if we get to the parens end this will be true.
- backTrack->inParentheses = 1;
+ backTrack->begin = input.getPos();
break;
}
case QuantifierNonGreedy: {
- backTrack->inParentheses = 0;
+ backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
}
@@ -645,7 +654,7 @@ public:
return true;
}
- bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
+ bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
ASSERT(term.atom.quantityCount == 1);
@@ -654,7 +663,12 @@ public:
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
}
- return true;
+
+ if (term.atom.quantityType == QuantifierFixedCount)
+ return true;
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+ return backTrack->begin != input.getPos();
}
bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
@@ -673,12 +687,12 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy:
// if we backtrack to this point, there is another chance - try matching nothing.
- ASSERT(backTrack->inParentheses);
- backTrack->inParentheses = 0;
+ ASSERT(backTrack->begin != notFound);
+ backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
case QuantifierNonGreedy:
- ASSERT(backTrack->inParentheses);
+ ASSERT(backTrack->begin != notFound);
case QuantifierFixedCount:
break;
}
@@ -695,17 +709,21 @@ public:
switch (term.atom.quantityType) {
case QuantifierGreedy:
- if (!backTrack->inParentheses) {
+ if (backTrack->begin == notFound) {
context->term -= term.atom.parenthesesWidth;
return false;
}
case QuantifierNonGreedy:
- if (!backTrack->inParentheses) {
- // now try to match the parens; set this speculatively.
- backTrack->inParentheses = 1;
+ if (backTrack->begin == notFound) {
+ backTrack->begin = input.getPos();
if (term.capture()) {
+ // Technically this access to inputPosition should be accessing the begin term's
+ // inputPosition, but for repeats other than fixed these values should be
+ // the same anyway! (we don't pre-check for greedy or non-greedy matches.)
+ ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
+ output[subpatternId << 1] = input.getPos() + term.inputPosition;
}
context->term -= term.atom.parenthesesWidth;
return true;
@@ -717,6 +735,53 @@ public:
return false;
}
+ bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == UINT_MAX);
+ ASSERT(!term.capture());
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ // Empty match is a failed match.
+ if (backTrack->begin == input.getPos())
+ return false;
+
+ // Successful match! Okay, what's next? - loop around and try to match moar!
+ context->term -= (term.atom.parenthesesWidth + 1);
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == UINT_MAX);
+ ASSERT(!term.capture());
+
+ // If we backtrack to this point, we have failed to match this iteration of the parens.
+ // Since this is greedy / zero minimum a failed is also accepted as a match!
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
+ {
+ // 'Terminal' parentheses are at the end of the regex, and as such a match past end
+ // should always be returned as a successful match - we should never becktrack to here.
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
@@ -773,7 +838,7 @@ public:
return false;
}
- bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
+ JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
@@ -794,31 +859,42 @@ public:
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+
+ if (result == JSRegExpNoMatch) {
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ } else
+ return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
- return true;
+ return JSRegExpMatch;
}
case QuantifierGreedy: {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+
break;
}
}
@@ -827,15 +903,15 @@ public:
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
case QuantifierNonGreedy:
- return true;
+ return JSRegExpMatch;
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
}
// Rules for backtracking differ depending on whether this is greedy or non-greedy.
@@ -848,7 +924,7 @@ public:
// Non-greedy, we've already done the one less case, so don't match on popping.
// We haven't done the one more case, so always try to add that.
//
- bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+ JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
@@ -867,44 +943,58 @@ public:
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = 0;
+ JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+ if (result != JSRegExpMatch)
+ return result;
// While we haven't yet reached our fixed limit,
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+
+ if (result == JSRegExpNoMatch) {
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ } else
+ return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
- return true;
+ return JSRegExpMatch;
}
case QuantifierGreedy: {
if (!backTrack->matchAmount)
- return false;
+ return JSRegExpNoMatch;
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (parenthesesResult == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
+
+ if (parenthesesResult != JSRegExpNoMatch)
+ return parenthesesResult;
+
break;
}
}
@@ -912,63 +1002,108 @@ public:
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
if (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
case QuantifierNonGreedy: {
// If we've not reached the limit, try to add one more match.
if (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch) {
appendParenthesesDisjunctionContext(backTrack, context);
recordParenthesesMatch(term, context);
- return true;
- } else {
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
+ return JSRegExpMatch;
}
+
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
// Nope - okay backtrack looking for an alternative.
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
// successful backtrack! we're back in the game!
if (backTrack->matchAmount) {
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
// pop a match off the stack
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ return result;
}
- return false;
+ return JSRegExpNoMatch;
}
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
+ }
+
+ void lookupForBeginChars()
+ {
+ int character;
+ bool firstSingleCharFound;
+
+ while (true) {
+ if (input.isNotAvailableInput(2))
+ return;
+
+ firstSingleCharFound = false;
+
+ character = input.readPair();
+
+ for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
+ BeginChar bc = pattern->m_beginChars[i];
+
+ if (!firstSingleCharFound && bc.value <= 0xFFFF) {
+ firstSingleCharFound = true;
+ character &= 0xFFFF;
+ }
+
+ if ((character | bc.mask) == bc.value)
+ return;
+ }
+
+ input.next();
+ }
}
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
#define BACKTRACK() { --context->term; goto backtrack; }
#define currentTerm() (disjunction->terms[context->term])
- bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
{
+ if (!--remainingMatchCount)
+ return JSRegExpErrorHitLimit;
+
if (btrack)
BACKTRACK();
+ if (pattern->m_containsBeginChars && isBody)
+ lookupForBeginChars();
+
context->matchBegin = input.getPos();
context->term = 0;
@@ -980,14 +1115,14 @@ public:
MATCH_NEXT();
case ByteTerm::TypeSubpatternEnd:
context->matchEnd = input.getPos();
- return true;
+ return JSRegExpMatch;
case ByteTerm::TypeBodyAlternativeBegin:
MATCH_NEXT();
case ByteTerm::TypeBodyAlternativeDisjunction:
case ByteTerm::TypeBodyAlternativeEnd:
context->matchEnd = input.getPos();
- return true;
+ return JSRegExpMatch;
case ByteTerm::TypeAlternativeBegin:
MATCH_NEXT();
@@ -1077,10 +1212,16 @@ public:
if (matchBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (matchParentheses(currentTerm(), context))
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = matchParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
BACKTRACK();
+ }
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (matchParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1089,6 +1230,14 @@ public:
if (matchParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (matchParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (matchParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (matchParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1112,7 +1261,7 @@ public:
switch (currentTerm().type) {
case ByteTerm::TypeSubpatternBegin:
- return false;
+ return JSRegExpNoMatch;
case ByteTerm::TypeSubpatternEnd:
ASSERT_NOT_REACHED();
@@ -1124,10 +1273,18 @@ public:
MATCH_NEXT();
if (input.atEnd())
- return false;
+ return JSRegExpNoMatch;
input.next();
+
+ if (pattern->m_containsBeginChars && isBody)
+ lookupForBeginChars();
+
context->matchBegin = input.getPos();
+
+ if (currentTerm().alternative.onceThrough)
+ context->term += currentTerm().alternative.next;
+
MATCH_NEXT();
}
case ByteTerm::TypeBodyAlternativeEnd:
@@ -1176,10 +1333,16 @@ public:
if (backtrackBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (backtrackParentheses(currentTerm(), context))
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
BACKTRACK();
+ }
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (backtrackParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1188,6 +1351,14 @@ public:
if (backtrackParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (backtrackParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1203,36 +1374,48 @@ public:
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
}
- bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
{
- if (matchDisjunction(disjunction, context, btrack)) {
+ JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
+
+ if (result == JSRegExpMatch) {
while (context->matchBegin == context->matchEnd) {
- if (!matchDisjunction(disjunction, context, true))
- return false;
+ result = matchDisjunction(disjunction, context, true);
+ if (result != JSRegExpMatch)
+ return result;
}
- return true;
+ return JSRegExpMatch;
}
- return false;
+ return result;
}
int interpret()
{
+ allocatorPool = pattern->m_allocator->startAllocator();
+ if (!allocatorPool)
+ CRASH();
+
for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
output[i] = -1;
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
- if (matchDisjunction(pattern->m_body.get(), context)) {
+ JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
+ if (result == JSRegExpMatch) {
output[0] = context->matchBegin;
output[1] = context->matchEnd;
}
freeDisjunctionContext(context);
+ pattern->m_allocator->stopAllocator();
+
+ // RegExp.cpp currently expects all error to be converted to -1.
+ ASSERT((result == JSRegExpMatch) == (output[0] != -1));
return output[0];
}
@@ -1240,6 +1423,8 @@ public:
: pattern(pattern)
, output(output)
, input(inputChar, start, length)
+ , allocatorPool(0)
+ , remainingMatchCount(matchLimit)
{
}
@@ -1247,6 +1432,8 @@ private:
BytecodePattern *pattern;
int* output;
InputStream input;
+ BumpPointerPool* allocatorPool;
+ unsigned remainingMatchCount;
};
@@ -1266,17 +1453,16 @@ public:
ByteCompiler(RegexPattern& pattern)
: m_pattern(pattern)
{
- m_bodyDisjunction = 0;
m_currentAlternativeIndex = 0;
}
- BytecodePattern* compile()
+ PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
{
- regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
+ regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
emitDisjunction(m_pattern.m_body);
regexEnd();
- return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern);
+ return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
}
void checkInput(unsigned count)
@@ -1334,8 +1520,38 @@ public:
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
+ void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
+ // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
+ // then fix this up at the end! - simplifying this should make it much clearer.
+ // https://bugs.webkit.org/show_bug.cgi?id=50136
+
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
@@ -1360,6 +1576,28 @@ public:
m_currentAlternativeIndex = beginTerm + 1;
}
+ void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
+
+ bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
+
unsigned popParenthesesStack()
{
ASSERT(m_parenthesesStack.size());
@@ -1431,56 +1669,85 @@ public:
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
}
- void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
+
+ bool invertOrCapture = parenthesesBegin.invertOrCapture;
+ unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+
+ unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+ ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+ for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+
+ m_bodyDisjunction->terms.shrink(beginTerm);
+
+ m_allParenthesesInfo.append(parenthesesDisjunction);
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
- bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
- m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
- if (doInline) {
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
- } else {
- ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
- ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
-
- bool invertOrCapture = parenthesesBegin.invertOrCapture;
- unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
- unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
- ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+ void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
- for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
- parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
- m_bodyDisjunction->terms.shrink(beginTerm);
+ bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
- m_allParenthesesInfo.append(parenthesesDisjunction);
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
- }
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
}
- void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
+ void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
{
- m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
- m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
+ m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
m_bodyDisjunction->terms[0].frameLocation = 0;
m_currentAlternativeIndex = 0;
}
@@ -1490,11 +1757,11 @@ public:
closeBodyAlternative();
}
- void alternativeBodyDisjunction()
+ void alternativeBodyDisjunction(bool onceThrough)
{
int newAlternativeIndex = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
- m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
m_currentAlternativeIndex = newAlternativeIndex;
}
@@ -1508,26 +1775,33 @@ public:
m_currentAlternativeIndex = newAlternativeIndex;
}
- void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
+ void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
{
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
if (alt) {
if (disjunction == m_pattern.m_body)
- alternativeBodyDisjunction();
+ alternativeBodyDisjunction(alternative->onceThrough());
else
alternativeDisjunction();
}
- PatternAlternative* alternative = disjunction->m_alternatives[alt];
unsigned minimumSize = alternative->m_minimumSize;
+ int countToCheck;
- ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
- unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
- if (countToCheck)
+ if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
+ countToCheck = 0;
+ else
+ countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
+
+ ASSERT(countToCheck >= 0);
+ if (countToCheck) {
checkInput(countToCheck);
- currentCountAlreadyChecked += countToCheck;
+ currentCountAlreadyChecked += countToCheck;
+ }
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
@@ -1562,34 +1836,40 @@ public:
case PatternTerm::TypeParenthesesSubpattern: {
unsigned disjunctionAlreadyCheckedCount = 0;
- if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
- if (term.quantityType == QuantifierFixedCount) {
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ unsigned alternativeFrameLocation = term.frameLocation;
+ // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
+ if (term.quantityType == QuantifierFixedCount)
disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- } else {
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- }
+ else
+ alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce;
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesOnceBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else if (term.parentheses.isTerminal) {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
}
break;
}
case PatternTerm::TypeParentheticalAssertion: {
- unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
+ unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
+
+ ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition);
+ int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
+ atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
break;
}
}
@@ -1599,23 +1879,28 @@ public:
private:
RegexPattern& m_pattern;
- ByteDisjunction* m_bodyDisjunction;
+ OwnPtr<ByteDisjunction> m_bodyDisjunction;
unsigned m_currentAlternativeIndex;
Vector<ParenthesesStackEntry> m_parenthesesStack;
Vector<ByteDisjunction*> m_allParenthesesInfo;
};
-BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
+PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
{
RegexPattern pattern(ignoreCase, multiline);
if ((error = compileRegex(patternString, pattern)))
- return 0;
+ return PassOwnPtr<BytecodePattern>();
numSubpatterns = pattern.m_numSubpatterns;
- return ByteCompiler(pattern).compile();
+ return ByteCompiler(pattern).compile(allocator);
+}
+
+PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator)
+{
+ return ByteCompiler(pattern).compile(allocator);
}
int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
@@ -1634,5 +1919,3 @@ COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpace
} }
-
-#endif
diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h
index 48c9a5e..2e23472 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.h
+++ b/JavaScriptCore/yarr/RegexInterpreter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,16 +26,33 @@
#ifndef RegexInterpreter_h
#define RegexInterpreter_h
-#include <wtf/Platform.h>
-
-#if ENABLE(YARR)
-
-#include <wtf/unicode/Unicode.h>
#include "RegexParser.h"
#include "RegexPattern.h"
+#include <wtf/PassOwnPtr.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+class BumpPointerAllocator;
+}
+using WTF::BumpPointerAllocator;
namespace JSC { namespace Yarr {
+// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed.
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+enum JSRegExpResult {
+ JSRegExpMatch = 1,
+ JSRegExpNoMatch = 0,
+ JSRegExpErrorNoMatch = -1,
+ JSRegExpErrorHitLimit = -2,
+ JSRegExpErrorNoMemory = -3,
+ JSRegExpErrorInternal = -4
+};
+
class ByteDisjunction;
struct ByteTerm {
@@ -64,6 +81,8 @@ struct ByteTerm {
TypeParenthesesSubpattern,
TypeParenthesesSubpatternOnceBegin,
TypeParenthesesSubpatternOnceEnd,
+ TypeParenthesesSubpatternTerminalBegin,
+ TypeParenthesesSubpatternTerminalEnd,
TypeParentheticalAssertionBegin,
TypeParentheticalAssertionEnd,
TypeCheckInput,
@@ -90,6 +109,7 @@ struct ByteTerm {
struct {
int next;
int end;
+ bool onceThrough;
} alternative;
unsigned checkInputCount;
};
@@ -211,19 +231,21 @@ struct ByteTerm {
return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
}
- static ByteTerm BodyAlternativeBegin()
+ static ByteTerm BodyAlternativeBegin(bool onceThrough)
{
ByteTerm term(TypeBodyAlternativeBegin);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
return term;
}
- static ByteTerm BodyAlternativeDisjunction()
+ static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
{
ByteTerm term(TypeBodyAlternativeDisjunction);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
return term;
}
@@ -232,6 +254,7 @@ struct ByteTerm {
ByteTerm term(TypeBodyAlternativeEnd);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = false;
return term;
}
@@ -240,6 +263,7 @@ struct ByteTerm {
ByteTerm term(TypeAlternativeBegin);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = false;
return term;
}
@@ -248,6 +272,7 @@ struct ByteTerm {
ByteTerm term(TypeAlternativeDisjunction);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = false;
return term;
}
@@ -256,6 +281,7 @@ struct ByteTerm {
ByteTerm term(TypeAlternativeEnd);
term.alternative.next = 0;
term.alternative.end = 0;
+ term.alternative.onceThrough = false;
return term;
}
@@ -294,10 +320,12 @@ public:
};
struct BytecodePattern : FastAllocBase {
- BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern)
+ BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern, BumpPointerAllocator* allocator)
: m_body(body)
, m_ignoreCase(pattern.m_ignoreCase)
, m_multiline(pattern.m_multiline)
+ , m_containsBeginChars(pattern.m_containsBeginChars)
+ , m_allocator(allocator)
{
newlineCharacterClass = pattern.newlineCharacterClass();
wordcharCharacterClass = pattern.wordcharCharacterClass();
@@ -308,6 +336,8 @@ struct BytecodePattern : FastAllocBase {
// array, so that it won't delete them on destruction. We'll
// take responsibility for that.
pattern.m_userCharacterClasses.clear();
+
+ m_beginChars.append(pattern.m_beginChars);
}
~BytecodePattern()
@@ -319,19 +349,25 @@ struct BytecodePattern : FastAllocBase {
OwnPtr<ByteDisjunction> m_body;
bool m_ignoreCase;
bool m_multiline;
-
+ bool m_containsBeginChars;
+ // Each BytecodePattern is associated with a RegExp, each RegExp is associated
+ // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator.
+ BumpPointerAllocator* m_allocator;
+
CharacterClass* newlineCharacterClass;
CharacterClass* wordcharCharacterClass;
+
+ Vector<BeginChar> m_beginChars;
+
private:
Vector<ByteDisjunction*> m_allParenthesesInfo;
Vector<CharacterClass*> m_userCharacterClasses;
};
-BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
+PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator*, bool ignoreCase = false, bool multiline = false);
+PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*);
int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexInterpreter_h
diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp
index fcb8d86..acbd458 100644
--- a/JavaScriptCore/yarr/RegexJIT.cpp
+++ b/JavaScriptCore/yarr/RegexJIT.cpp
@@ -31,8 +31,7 @@
#include "LinkBuffer.h"
#include "MacroAssembler.h"
#include "RegexCompiler.h"
-
-#include "pcre.h" // temporary, remove when fallback is removed.
+#include "RegexInterpreter.h" // temporary, remove when fallback is removed.
#if ENABLE(YARR_JIT)
@@ -40,7 +39,6 @@ using namespace WTF;
namespace JSC { namespace Yarr {
-
class RegexGenerator : private MacroAssembler {
friend void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline);
@@ -54,6 +52,16 @@ class RegexGenerator : private MacroAssembler {
static const RegisterID regT1 = ARMRegisters::r6;
static const RegisterID returnRegister = ARMRegisters::r0;
+#elif CPU(MIPS)
+ static const RegisterID input = MIPSRegisters::a0;
+ static const RegisterID index = MIPSRegisters::a1;
+ static const RegisterID length = MIPSRegisters::a2;
+ static const RegisterID output = MIPSRegisters::a3;
+
+ static const RegisterID regT0 = MIPSRegisters::t4;
+ static const RegisterID regT1 = MIPSRegisters::t5;
+
+ static const RegisterID returnRegister = MIPSRegisters::v0;
#elif CPU(X86)
static const RegisterID input = X86Registers::eax;
static const RegisterID index = X86Registers::edx;
@@ -145,6 +153,11 @@ class RegexGenerator : private MacroAssembler {
void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass)
{
+ if (charClass->m_table) {
+ ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table));
+ matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry));
+ return;
+ }
Jump unicodeFail;
if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) {
Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f));
@@ -331,6 +344,15 @@ class RegexGenerator : private MacroAssembler {
ASSERT(alternativeValid());
return alternative()->m_terms[t];
}
+ bool isLastTerm()
+ {
+ ASSERT(alternativeValid());
+ return (t + 1) == alternative()->m_terms.size();
+ }
+ bool isMainDisjunction()
+ {
+ return !disjunction->m_parent;
+ }
PatternTerm& lookaheadTerm()
{
@@ -599,10 +621,14 @@ class RegexGenerator : private MacroAssembler {
ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch)));
failures.append(jumpIfCharNotEquals(ch, state.inputOffset()));
}
+
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
- failures.append(jump());
+ if (term.quantityCount != 0xffffffff) {
+ branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
+ failures.append(jump());
+ } else
+ jump(loop);
Label backtrackBegin(this);
loadFromFrame(term.frameLocation, countRegister);
@@ -636,7 +662,8 @@ class RegexGenerator : private MacroAssembler {
loadFromFrame(term.frameLocation, countRegister);
atEndOfInput().linkTo(hardFail, this);
- branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail);
+ if (term.quantityCount != 0xffffffff)
+ branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail);
if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
readCharacter(state.inputOffset(), character);
or32(Imm32(32), character);
@@ -722,8 +749,11 @@ class RegexGenerator : private MacroAssembler {
add32(Imm32(1), countRegister);
add32(Imm32(1), index);
- branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
- failures.append(jump());
+ if (term.quantityCount != 0xffffffff) {
+ branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this);
+ failures.append(jump());
+ } else
+ jump(loop);
Label backtrackBegin(this);
loadFromFrame(term.frameLocation, countRegister);
@@ -880,7 +910,7 @@ class RegexGenerator : private MacroAssembler {
PatternDisjunction* disjunction = term.parentheses.disjunction;
ASSERT(term.quantityCount == 1);
- unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0;
+ unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0;
unsigned parenthesesFrameLocation = term.frameLocation;
unsigned alternativeFrameLocation = parenthesesFrameLocation;
@@ -899,12 +929,12 @@ class RegexGenerator : private MacroAssembler {
Jump nonGreedySkipParentheses;
Label nonGreedyTryParentheses;
if (term.quantityType == QuantifierGreedy)
- storeToFrame(Imm32(1), parenthesesFrameLocation);
+ storeToFrame(index, parenthesesFrameLocation);
else if (term.quantityType == QuantifierNonGreedy) {
- storeToFrame(Imm32(0), parenthesesFrameLocation);
+ storeToFrame(Imm32(-1), parenthesesFrameLocation);
nonGreedySkipParentheses = jump();
nonGreedyTryParentheses = label();
- storeToFrame(Imm32(1), parenthesesFrameLocation);
+ storeToFrame(index, parenthesesFrameLocation);
}
// store the match start index
@@ -922,41 +952,31 @@ class RegexGenerator : private MacroAssembler {
TermGenerationState parenthesesState(disjunction, state.checkedTotal);
generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation);
- // store the match end index
- if (term.invertOrCapture) {
- int inputOffset = state.inputOffset();
- if (inputOffset) {
- move(index, indexTemporary);
- add32(Imm32(state.inputOffset()), indexTemporary);
- store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
- } else
- store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
- }
- Jump success = jump();
+ Jump success = (term.quantityType == QuantifierFixedCount) ?
+ jump() :
+ branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*))));
// A failure AFTER the parens jumps here
Label backtrackFromAfterParens(this);
if (term.quantityType == QuantifierGreedy) {
- // If this is zero we have now tested with both with and without the parens.
+ // If this is -1 we have now tested with both with and without the parens.
loadFromFrame(parenthesesFrameLocation, indexTemporary);
- state.jumpToBacktrack(branchTest32(Zero, indexTemporary), this);
+ state.jumpToBacktrack(branch32(Equal, indexTemporary, Imm32(-1)), this);
} else if (term.quantityType == QuantifierNonGreedy) {
- // If this is zero we have now tested with both with and without the parens.
+ // If this is -1 we have now tested without the parens, now test with.
loadFromFrame(parenthesesFrameLocation, indexTemporary);
- branchTest32(Zero, indexTemporary).linkTo(nonGreedyTryParentheses, this);
+ branch32(Equal, indexTemporary, Imm32(-1)).linkTo(nonGreedyTryParentheses, this);
}
parenthesesState.plantJumpToBacktrackIfExists(this);
// A failure WITHIN the parens jumps here
parenthesesState.linkAlternativeBacktracks(this);
- if (term.invertOrCapture) {
+ if (term.invertOrCapture)
store32(Imm32(-1), Address(output, (term.parentheses.subpatternId << 1) * sizeof(int)));
- store32(Imm32(-1), Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
- }
if (term.quantityType == QuantifierGreedy)
- storeToFrame(Imm32(0), parenthesesFrameLocation);
+ storeToFrame(Imm32(-1), parenthesesFrameLocation);
else
state.jumpToBacktrack(jump(), this);
@@ -964,9 +984,67 @@ class RegexGenerator : private MacroAssembler {
if (term.quantityType == QuantifierNonGreedy)
nonGreedySkipParentheses.link(this);
success.link(this);
+
+ // store the match end index
+ if (term.invertOrCapture) {
+ int inputOffset = state.inputOffset();
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(state.inputOffset()), indexTemporary);
+ store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
+ } else
+ store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int)));
+ }
}
}
+ void generateParenthesesGreedyNoBacktrack(TermGenerationState& state)
+ {
+ PatternTerm& parenthesesTerm = state.term();
+ PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction;
+ ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern);
+ ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle.
+
+ TermGenerationState parenthesesState(disjunction, state.checkedTotal);
+
+ Label matchAgain(this);
+
+ storeToFrame(index, parenthesesTerm.frameLocation); // Save the current index to check for zero len matches later.
+
+ for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) {
+
+ PatternAlternative* alternative = parenthesesState.alternative();
+ optimizeAlternative(alternative);
+
+ int countToCheck = alternative->m_minimumSize;
+ if (countToCheck) {
+ parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck));
+ parenthesesState.checkedTotal += countToCheck;
+ }
+
+ for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm())
+ generateTerm(parenthesesState);
+
+ // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet.
+ branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain);
+
+ // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack,
+ // or fall through to try the next alternative if no backtrack is available.
+ parenthesesState.plantJumpToBacktrackIfExists(this);
+
+ parenthesesState.linkAlternativeBacktracks(this);
+ // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop.
+
+ if (countToCheck) {
+ sub32(Imm32(countToCheck), index);
+ parenthesesState.checkedTotal -= countToCheck;
+ }
+ }
+
+ // If the last alternative falls through to here, we have a failed match...
+ // Which means that we match whatever we have matched up to this point (even if nothing).
+ }
+
void generateParentheticalAssertion(TermGenerationState& state)
{
PatternTerm& term = state.term();
@@ -1078,17 +1156,19 @@ class RegexGenerator : private MacroAssembler {
break;
case PatternTerm::TypeBackReference:
- m_generationFailed = true;
+ m_shouldFallBack = true;
break;
case PatternTerm::TypeForwardReference:
break;
case PatternTerm::TypeParenthesesSubpattern:
- if ((term.quantityCount == 1) && !term.parentheses.isCopy)
+ if (term.quantityCount == 1 && !term.parentheses.isCopy)
generateParenthesesSingle(state);
+ else if (term.parentheses.isTerminal)
+ generateParenthesesGreedyNoBacktrack(state);
else
- m_generationFailed = true;
+ m_shouldFallBack = true;
break;
case PatternTerm::TypeParentheticalAssertion:
@@ -1102,21 +1182,26 @@ class RegexGenerator : private MacroAssembler {
TermGenerationState state(disjunction, 0);
state.resetAlternative();
- // Plant a check to see if there is sufficient input available to run the first alternative.
- // Jumping back to the label 'firstAlternative' will get to this check, jumping to
- // 'firstAlternativeInputChecked' will jump directly to matching the alternative having
- // skipped this check.
-
- Label firstAlternative(this);
-
// check availability for the next alternative
int countCheckedForCurrentAlternative = 0;
int countToCheckForFirstAlternative = 0;
bool hasShorterAlternatives = false;
+ bool setRepeatAlternativeLabels = false;
JumpList notEnoughInputForPreviousAlternative;
+ Label firstAlternative;
+ Label firstAlternativeInputChecked;
+ // The label 'firstAlternative' is used to plant a check to see if there is
+ // sufficient input available to run the first repeating alternative.
+ // The label 'firstAlternativeInputChecked' will jump directly to matching
+ // the first repeating alternative having skipped this check.
+
if (state.alternativeValid()) {
PatternAlternative* alternative = state.alternative();
+ if (!alternative->onceThrough()) {
+ firstAlternative = Label(this);
+ setRepeatAlternativeLabels = true;
+ }
countToCheckForFirstAlternative = alternative->m_minimumSize;
state.checkedTotal += countToCheckForFirstAlternative;
if (countToCheckForFirstAlternative)
@@ -1124,31 +1209,35 @@ class RegexGenerator : private MacroAssembler {
countCheckedForCurrentAlternative = countToCheckForFirstAlternative;
}
- Label firstAlternativeInputChecked(this);
+ if (setRepeatAlternativeLabels)
+ firstAlternativeInputChecked = Label(this);
while (state.alternativeValid()) {
- // Track whether any alternatives are shorter than the first one.
- hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative);
-
PatternAlternative* alternative = state.alternative();
optimizeAlternative(alternative);
+ // Track whether any alternatives are shorter than the first one.
+ if (!alternative->onceThrough())
+ hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative);
+
for (state.resetTerm(); state.termValid(); state.nextTerm())
generateTerm(state);
// If we get here, the alternative matched.
if (m_pattern.m_body->m_callFrameSize)
addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
-
+
ASSERT(index != returnRegister);
if (m_pattern.m_body->m_hasFixedSize) {
move(index, returnRegister);
if (alternative->m_minimumSize)
sub32(Imm32(alternative->m_minimumSize), returnRegister);
+
+ store32(returnRegister, output);
} else
- pop(returnRegister);
+ load32(Address(output), returnRegister);
+
store32(index, Address(output, 4));
- store32(returnRegister, output);
generateReturn();
@@ -1157,47 +1246,75 @@ class RegexGenerator : private MacroAssembler {
// if there are any more alternatives, plant the check for input before looping.
if (state.alternativeValid()) {
PatternAlternative* nextAlternative = state.alternative();
- int countToCheckForNextAlternative = nextAlternative->m_minimumSize;
-
- if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one.
+ if (!setRepeatAlternativeLabels && !nextAlternative->onceThrough()) {
+ // We have handled non-repeating alternatives, jump to next iteration
+ // and loop over repeating alternatives.
+ state.jumpToBacktrack(jump(), this);
+
+ countToCheckForFirstAlternative = nextAlternative->m_minimumSize;
+
// If we get here, there the last input checked failed.
notEnoughInputForPreviousAlternative.link(this);
-
- // Check if sufficent input available to run the next alternative
- notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative));
- // We are now in the correct state to enter the next alternative; this add is only required
- // to mirror and revert operation of the sub32, just below.
- add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index);
-
- // If we get here, there the last input checked passed.
+
state.linkAlternativeBacktracks(this);
- // No need to check if we can run the next alternative, since it is shorter -
- // just update index.
- sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index);
- } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one.
- // If we get here, there the last input checked failed.
- // If there is insufficient input to run the current alternative, and the next alternative is longer,
- // then there is definitely not enough input to run it - don't even check. Just adjust index, as if
- // we had checked.
- notEnoughInputForPreviousAlternative.link(this);
- add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index);
- notEnoughInputForPreviousAlternative.append(jump());
- // The next alternative is longer than the current one; check the difference.
- state.linkAlternativeBacktracks(this);
- notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative));
- } else { // CASE 3: Both alternatives are the same length.
- ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative);
+ // Back up to start the looping alternatives.
+ if (countCheckedForCurrentAlternative)
+ sub32(Imm32(countCheckedForCurrentAlternative), index);
+
+ firstAlternative = Label(this);
+
+ state.checkedTotal = countToCheckForFirstAlternative;
+ if (countToCheckForFirstAlternative)
+ notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative));
+
+ countCheckedForCurrentAlternative = countToCheckForFirstAlternative;
+
+ firstAlternativeInputChecked = Label(this);
- // If the next alterative is the same length as this one, then no need to check the input -
- // if there was sufficent input to run the current alternative then there is sufficient
- // input to run the next one; if not, there isn't.
- state.linkAlternativeBacktracks(this);
+ setRepeatAlternativeLabels = true;
+ } else {
+ int countToCheckForNextAlternative = nextAlternative->m_minimumSize;
+
+ if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one.
+ // If we get here, then the last input checked failed.
+ notEnoughInputForPreviousAlternative.link(this);
+
+ // Check if sufficent input available to run the next alternative
+ notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative));
+ // We are now in the correct state to enter the next alternative; this add is only required
+ // to mirror and revert operation of the sub32, just below.
+ add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index);
+
+ // If we get here, then the last input checked passed.
+ state.linkAlternativeBacktracks(this);
+ // No need to check if we can run the next alternative, since it is shorter -
+ // just update index.
+ sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index);
+ } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one.
+ // If we get here, then the last input checked failed.
+ // If there is insufficient input to run the current alternative, and the next alternative is longer,
+ // then there is definitely not enough input to run it - don't even check. Just adjust index, as if
+ // we had checked.
+ notEnoughInputForPreviousAlternative.link(this);
+ add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index);
+ notEnoughInputForPreviousAlternative.append(jump());
+
+ // The next alternative is longer than the current one; check the difference.
+ state.linkAlternativeBacktracks(this);
+ notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative));
+ } else { // CASE 3: Both alternatives are the same length.
+ ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative);
+
+ // If the next alterative is the same length as this one, then no need to check the input -
+ // if there was sufficent input to run the current alternative then there is sufficient
+ // input to run the next one; if not, there isn't.
+ state.linkAlternativeBacktracks(this);
+ }
+ state.checkedTotal -= countCheckedForCurrentAlternative;
+ countCheckedForCurrentAlternative = countToCheckForNextAlternative;
+ state.checkedTotal += countCheckedForCurrentAlternative;
}
-
- state.checkedTotal -= countCheckedForCurrentAlternative;
- countCheckedForCurrentAlternative = countToCheckForNextAlternative;
- state.checkedTotal += countCheckedForCurrentAlternative;
}
}
@@ -1205,81 +1322,86 @@ class RegexGenerator : private MacroAssembler {
state.checkedTotal -= countCheckedForCurrentAlternative;
- // How much more input need there be to be able to retry from the first alternative?
- // examples:
- // /yarr_jit/ or /wrec|pcre/
- // In these examples we need check for one more input before looping.
- // /yarr_jit|pcre/
- // In this case we need check for 5 more input to loop (+4 to allow for the first alterative
- // being four longer than the last alternative checked, and another +1 to effectively move
- // the start position along by one).
- // /yarr|rules/ or /wrec|notsomuch/
- // In these examples, provided that there was sufficient input to have just been matching for
- // the second alternative we can loop without checking for available input (since the second
- // alternative is longer than the first). In the latter example we need to decrement index
- // (by 4) so the start position is only progressed by 1 from the last iteration.
- int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1;
-
- // First, deal with the cases where there was sufficient input to try the last alternative.
- if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below.
- state.linkAlternativeBacktracks(this);
- else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop!
- state.linkAlternativeBacktracksTo(firstAlternativeInputChecked, this);
- else { // no need to check the input, but we do have some bookkeeping to do first.
+ if (!setRepeatAlternativeLabels) {
+ // If there are no alternatives that need repeating (all are marked 'onceThrough') then just link
+ // the match failures to this point, and fall through to the return below.
state.linkAlternativeBacktracks(this);
+ notEnoughInputForPreviousAlternative.link(this);
+ } else {
+ // How much more input need there be to be able to retry from the first alternative?
+ // examples:
+ // /yarr_jit/ or /wrec|pcre/
+ // In these examples we need check for one more input before looping.
+ // /yarr_jit|pcre/
+ // In this case we need check for 5 more input to loop (+4 to allow for the first alterative
+ // being four longer than the last alternative checked, and another +1 to effectively move
+ // the start position along by one).
+ // /yarr|rules/ or /wrec|notsomuch/
+ // In these examples, provided that there was sufficient input to have just been matching for
+ // the second alternative we can loop without checking for available input (since the second
+ // alternative is longer than the first). In the latter example we need to decrement index
+ // (by 4) so the start position is only progressed by 1 from the last iteration.
+ int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1;
+
+ // First, deal with the cases where there was sufficient input to try the last alternative.
+ if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below.
+ state.linkAlternativeBacktracks(this);
+ else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop!
+ state.linkAlternativeBacktracksTo(firstAlternativeInputChecked, this);
+ else { // no need to check the input, but we do have some bookkeeping to do first.
+ state.linkAlternativeBacktracks(this);
- // Where necessary update our preserved start position.
- if (!m_pattern.m_body->m_hasFixedSize) {
- move(index, regT0);
- sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0);
- poke(regT0, m_pattern.m_body->m_callFrameSize);
+ // Where necessary update our preserved start position.
+ if (!m_pattern.m_body->m_hasFixedSize) {
+ move(index, regT0);
+ sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0);
+ store32(regT0, Address(output));
+ }
+
+ // Update index if necessary, and loop (without checking).
+ if (incrementForNextIter)
+ add32(Imm32(incrementForNextIter), index);
+ jump().linkTo(firstAlternativeInputChecked, this);
}
- // Update index if necessary, and loop (without checking).
- if (incrementForNextIter)
- add32(Imm32(incrementForNextIter), index);
- jump().linkTo(firstAlternativeInputChecked, this);
+ notEnoughInputForPreviousAlternative.link(this);
+ // Update our idea of the start position, if we're tracking this.
+ if (!m_pattern.m_body->m_hasFixedSize) {
+ if (countCheckedForCurrentAlternative - 1) {
+ move(index, regT0);
+ sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0);
+ store32(regT0, Address(output));
+ } else
+ store32(index, Address(output));
+ }
+
+ // Check if there is sufficent input to run the first alternative again.
+ jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this);
+ // No - insufficent input to run the first alteranative, are there any other alternatives we
+ // might need to check? If so, the last check will have left the index incremented by
+ // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative
+ // LESS input is available, to have the effect of just progressing the start position by 1
+ // from the last iteration. If this check passes we can just jump up to the check associated
+ // with the first alternative in the loop. This is a bit sad, since we'll end up trying the
+ // first alternative again, and this check will fail (otherwise the check planted just above
+ // here would have passed). This is a bit sad, however it saves trying to do something more
+ // complex here in compilation, and in the common case we should end up coallescing the checks.
+ //
+ // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least
+ // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150,
+ // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there
+ // is sufficient input to run either alternative (constantly failing). If there had been only
+ // one alternative, or if the shorter alternative had come first, we would have terminated
+ // immediately. :-/
+ if (hasShorterAlternatives)
+ jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this);
+ // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true,
+ // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ...
+ // but since we're about to return a failure this doesn't really matter!)
}
- notEnoughInputForPreviousAlternative.link(this);
- // Update our idea of the start position, if we're tracking this.
- if (!m_pattern.m_body->m_hasFixedSize) {
- if (countCheckedForCurrentAlternative - 1) {
- move(index, regT0);
- sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0);
- poke(regT0, m_pattern.m_body->m_callFrameSize);
- } else
- poke(index, m_pattern.m_body->m_callFrameSize);
- }
- // Check if there is sufficent input to run the first alternative again.
- jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this);
- // No - insufficent input to run the first alteranative, are there any other alternatives we
- // might need to check? If so, the last check will have left the index incremented by
- // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative
- // LESS input is available, to have the effect of just progressing the start position by 1
- // from the last iteration. If this check passes we can just jump up to the check associated
- // with the first alternative in the loop. This is a bit sad, since we'll end up trying the
- // first alternative again, and this check will fail (otherwise the check planted just above
- // here would have passed). This is a bit sad, however it saves trying to do something more
- // complex here in compilation, and in the common case we should end up coallescing the checks.
- //
- // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least
- // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150,
- // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there
- // is sufficient input to run either alternative (constantly failing). If there had been only
- // one alternative, or if the shorter alternative had come first, we would have terminated
- // immediately. :-/
- if (hasShorterAlternatives)
- jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this);
- // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true,
- // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ...
- // but since we're about to return a failure this doesn't really matter!)
-
- unsigned frameSize = m_pattern.m_body->m_callFrameSize;
- if (!m_pattern.m_body->m_hasFixedSize)
- ++frameSize;
- if (frameSize)
- addPtr(Imm32(frameSize * sizeof(void*)), stackPointerRegister);
+ if (m_pattern.m_body->m_callFrameSize)
+ addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
move(Imm32(-1), returnRegister);
@@ -1312,7 +1434,12 @@ class RegexGenerator : private MacroAssembler {
push(ARMRegisters::r4);
push(ARMRegisters::r5);
push(ARMRegisters::r6);
+#if CPU(ARM_TRADITIONAL)
+ push(ARMRegisters::r8); // scratch register
+#endif
move(ARMRegisters::r3, output);
+#elif CPU(MIPS)
+ // Do nothing.
#endif
}
@@ -1327,9 +1454,14 @@ class RegexGenerator : private MacroAssembler {
pop(X86Registers::ebx);
pop(X86Registers::ebp);
#elif CPU(ARM)
+#if CPU(ARM_TRADITIONAL)
+ pop(ARMRegisters::r8); // scratch register
+#endif
pop(ARMRegisters::r6);
pop(ARMRegisters::r5);
pop(ARMRegisters::r4);
+#elif CPU(MIPS)
+ // Do nothing
#endif
ret();
}
@@ -1337,7 +1469,7 @@ class RegexGenerator : private MacroAssembler {
public:
RegexGenerator(RegexPattern& pattern)
: m_pattern(pattern)
- , m_generationFailed(false)
+ , m_shouldFallBack(false)
{
}
@@ -1345,9 +1477,8 @@ public:
{
generateEnter();
- // TODO: do I really want this on the stack?
if (!m_pattern.m_body->m_hasFixedSize)
- push(index);
+ store32(index, Address(output));
if (m_pattern.m_body->m_callFrameSize)
subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
@@ -1359,7 +1490,7 @@ public:
{
generate();
- LinkBuffer patchBuffer(this, globalData->executableAllocator.poolForSize(size()));
+ LinkBuffer patchBuffer(this, globalData->executableAllocator.poolForSize(size()), 0);
for (unsigned i = 0; i < m_backtrackRecords.size(); ++i)
patchBuffer.patch(m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_backtrackRecords[i].backtrackLocation));
@@ -1367,34 +1498,32 @@ public:
jitObject.set(patchBuffer.finalizeCode());
}
- bool generationFailed()
+ bool shouldFallBack()
{
- return m_generationFailed;
+ return m_shouldFallBack;
}
private:
RegexPattern& m_pattern;
+ bool m_shouldFallBack;
Vector<AlternativeBacktrackRecord> m_backtrackRecords;
- bool m_generationFailed;
};
-void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
+void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline)
{
RegexPattern pattern(ignoreCase, multiline);
-
if ((error = compileRegex(patternString, pattern)))
return;
-
numSubpatterns = pattern.m_numSubpatterns;
- RegexGenerator generator(pattern);
- generator.compile(globalData, jitObject);
-
- if (generator.generationFailed()) {
- JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase;
- JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine;
- jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.data()), patternString.size(), ignoreCaseOption, multilineOption, &numSubpatterns, &error));
+ if (!pattern.m_containsBackreferences && globalData->canUseJIT()) {
+ RegexGenerator generator(pattern);
+ generator.compile(globalData, jitObject);
+ if (!generator.shouldFallBack())
+ return;
}
+
+ jitObject.setFallback(byteCompileRegex(pattern, allocator));
}
}}
diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h
index 935b9a3..c4c382c 100644
--- a/JavaScriptCore/yarr/RegexJIT.h
+++ b/JavaScriptCore/yarr/RegexJIT.h
@@ -26,16 +26,12 @@
#ifndef RegexJIT_h
#define RegexJIT_h
-#include <wtf/Platform.h>
-
#if ENABLE(YARR_JIT)
#include "MacroAssembler.h"
+#include "RegexInterpreter.h" // temporary, remove when fallback is removed.
#include "RegexPattern.h"
-#include <UString.h>
-
-#include <pcre.h>
-struct JSRegExp; // temporary, remove when fallback is removed.
+#include "UString.h"
#if CPU(X86) && !COMPILER(MSVC)
#define YARR_CALL __attribute__ ((regparm (3)))
@@ -55,20 +51,23 @@ class RegexCodeBlock {
public:
RegexCodeBlock()
- : m_fallback(0)
+ : m_needFallback(false)
{
}
~RegexCodeBlock()
{
- if (m_fallback)
- jsRegExpFree(m_fallback);
}
- JSRegExp* getFallback() { return m_fallback; }
- void setFallback(JSRegExp* fallback) { m_fallback = fallback; }
+ BytecodePattern* getFallback() { return m_fallback.get(); }
+ bool isFallback() { return m_needFallback; }
+ void setFallback(PassOwnPtr<BytecodePattern> fallback)
+ {
+ m_fallback = fallback;
+ m_needFallback = true;
+ }
- bool operator!() { return !m_ref.m_code.executableAddress(); }
+ bool operator!() { return (!m_ref.m_code.executableAddress() && !m_fallback); }
void set(MacroAssembler::CodeRef ref) { m_ref = ref; }
int execute(const UChar* input, unsigned start, unsigned length, int* output)
@@ -76,17 +75,22 @@ public:
return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output);
}
+#if ENABLE(REGEXP_TRACING)
+ void *getAddr() { return m_ref.m_code.executableAddress(); }
+#endif
+
private:
MacroAssembler::CodeRef m_ref;
- JSRegExp* m_fallback;
+ OwnPtr<Yarr::BytecodePattern> m_fallback;
+ bool m_needFallback;
};
-void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
+void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase = false, bool multiline = false);
-inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize)
+inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output)
{
- if (JSRegExp* fallback = jitObject.getFallback())
- return (jsRegExpExecute(fallback, input, length, start, output, outputArraySize) < 0) ? -1 : output[0];
+ if (jitObject.isFallback())
+ return (interpretRegex(jitObject.getFallback(), input, start, length, output));
return jitObject.execute(input, start, length, output);
}
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index 64e8463..a5c0ba2 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -26,14 +26,10 @@
#ifndef RegexParser_h
#define RegexParser_h
-#include <wtf/Platform.h>
-
-#if ENABLE(YARR)
-
-#include <UString.h>
+#include "UString.h"
+#include <limits.h>
#include <wtf/ASCIICType.h>
#include <wtf/unicode/Unicode.h>
-#include <limits.h>
namespace JSC { namespace Yarr {
@@ -195,8 +191,8 @@ private:
: m_delegate(delegate)
, m_backReferenceLimit(backReferenceLimit)
, m_err(NoError)
- , m_data(pattern.data())
- , m_size(pattern.size())
+ , m_data(pattern.characters())
+ , m_size(pattern.length())
, m_index(0)
, m_parenthesesNestingDepth(0)
{
@@ -849,6 +845,4 @@ const char* parse(Delegate& delegate, const UString& pattern, unsigned backRefer
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexParser_h
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index dd7512d..c76c641 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,14 +27,9 @@
#ifndef RegexPattern_h
#define RegexPattern_h
-#include <wtf/Platform.h>
-
-#if ENABLE(YARR)
-
#include <wtf/Vector.h>
#include <wtf/unicode/Unicode.h>
-
namespace JSC { namespace Yarr {
#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
@@ -42,6 +38,7 @@ namespace JSC { namespace Yarr {
#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
+#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1
#define RegexStackSpaceForBackTrackInfoParentheses 4
struct PatternDisjunction;
@@ -57,11 +54,35 @@ struct CharacterRange {
}
};
+struct CharacterClassTable : RefCounted<CharacterClassTable> {
+ const char* m_table;
+ bool m_inverted;
+ static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
+ {
+ return adoptRef(new CharacterClassTable(table, inverted));
+ }
+
+private:
+ CharacterClassTable(const char* table, bool inverted)
+ : m_table(table)
+ , m_inverted(inverted)
+ {
+ }
+};
+
struct CharacterClass : FastAllocBase {
+ // All CharacterClass instances have to have the full set of matches and ranges,
+ // they may have an optional table for faster lookups (which must match the
+ // specified matches and ranges)
+ CharacterClass(PassRefPtr<CharacterClassTable> table)
+ : m_table(table)
+ {
+ }
Vector<UChar> m_matches;
Vector<CharacterRange> m_ranges;
Vector<UChar> m_matchesUnicode;
Vector<CharacterRange> m_rangesUnicode;
+ RefPtr<CharacterClassTable> m_table;
};
enum QuantifierType {
@@ -92,6 +113,7 @@ struct PatternTerm {
unsigned subpatternId;
unsigned lastSubpatternId;
bool isCopy;
+ bool isTerminal;
} parentheses;
};
QuantifierType quantityType;
@@ -123,6 +145,7 @@ struct PatternTerm {
parentheses.disjunction = disjunction;
parentheses.subpatternId = subpatternId;
parentheses.isCopy = false;
+ parentheses.isTerminal = false;
quantityType = QuantifierFixedCount;
quantityCount = 1;
}
@@ -184,6 +207,10 @@ struct PatternTerm {
struct PatternAlternative : FastAllocBase {
PatternAlternative(PatternDisjunction* disjunction)
: m_parent(disjunction)
+ , m_onceThrough(false)
+ , m_hasFixedSize(false)
+ , m_startsWithBOL(false)
+ , m_containsBOL(false)
{
}
@@ -198,16 +225,30 @@ struct PatternAlternative : FastAllocBase {
ASSERT(m_terms.size());
m_terms.shrink(m_terms.size() - 1);
}
+
+ void setOnceThrough()
+ {
+ m_onceThrough = true;
+ }
+
+ bool onceThrough()
+ {
+ return m_onceThrough;
+ }
Vector<PatternTerm> m_terms;
PatternDisjunction* m_parent;
unsigned m_minimumSize;
- bool m_hasFixedSize;
+ bool m_onceThrough : 1;
+ bool m_hasFixedSize : 1;
+ bool m_startsWithBOL : 1;
+ bool m_containsBOL : 1;
};
struct PatternDisjunction : FastAllocBase {
PatternDisjunction(PatternAlternative* parent = 0)
: m_parent(parent)
+ , m_hasFixedSize(false)
{
}
@@ -242,10 +283,37 @@ CharacterClass* nondigitsCreate();
CharacterClass* nonspacesCreate();
CharacterClass* nonwordcharCreate();
+struct TermChain {
+ TermChain(PatternTerm term)
+ : term(term)
+ {}
+
+ PatternTerm term;
+ Vector<TermChain> hotTerms;
+};
+
+struct BeginChar {
+ BeginChar()
+ : value(0)
+ , mask(0)
+ {}
+
+ BeginChar(unsigned value, unsigned mask)
+ : value(value)
+ , mask(mask)
+ {}
+
+ unsigned value;
+ unsigned mask;
+};
+
struct RegexPattern {
RegexPattern(bool ignoreCase, bool multiline)
: m_ignoreCase(ignoreCase)
, m_multiline(multiline)
+ , m_containsBackreferences(false)
+ , m_containsBeginChars(false)
+ , m_containsBOL(false)
, m_numSubpatterns(0)
, m_maxBackReference(0)
, newlineCached(0)
@@ -269,6 +337,10 @@ struct RegexPattern {
m_numSubpatterns = 0;
m_maxBackReference = 0;
+ m_containsBackreferences = false;
+ m_containsBeginChars = false;
+ m_containsBOL = false;
+
newlineCached = 0;
digitsCached = 0;
spacesCached = 0;
@@ -281,6 +353,7 @@ struct RegexPattern {
m_disjunctions.clear();
deleteAllValues(m_userCharacterClasses);
m_userCharacterClasses.clear();
+ m_beginChars.clear();
}
bool containsIllegalBackReference()
@@ -331,13 +404,17 @@ struct RegexPattern {
return nonwordcharCached;
}
- bool m_ignoreCase;
- bool m_multiline;
+ bool m_ignoreCase : 1;
+ bool m_multiline : 1;
+ bool m_containsBackreferences : 1;
+ bool m_containsBeginChars : 1;
+ bool m_containsBOL : 1;
unsigned m_numSubpatterns;
unsigned m_maxBackReference;
PatternDisjunction* m_body;
Vector<PatternDisjunction*, 4> m_disjunctions;
Vector<CharacterClass*> m_userCharacterClasses;
+ Vector<BeginChar> m_beginChars;
private:
CharacterClass* newlineCached;
@@ -351,6 +428,4 @@ private:
} } // namespace JSC::Yarr
-#endif
-
#endif // RegexPattern_h