summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr/RegexCompiler.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'JavaScriptCore/yarr/RegexCompiler.cpp')
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.cpp95
1 files changed, 80 insertions, 15 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index fa87186..9f9e028 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -240,6 +240,7 @@ public:
RegexPatternConstructor(RegexPattern& pattern)
: m_pattern(pattern)
, m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_invertParentheticalAssertion(false)
{
}
@@ -255,6 +256,11 @@ public:
void assertionBOL()
{
+ if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
+ m_alternative->m_startsWithBOL = true;
+ m_alternative->m_containsBOL = true;
+ m_pattern.m_containsBOL = true;
+ }
m_alternative->m_terms.append(PatternTerm::BOL());
}
void assertionEOL()
@@ -358,15 +364,36 @@ public:
m_pattern.m_disjunctions.append(parenthesesDisjunction);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
+ m_invertParentheticalAssertion = invert;
}
void atomParenthesesEnd()
{
ASSERT(m_alternative->m_parent);
ASSERT(m_alternative->m_parent->m_parent);
+
+ PatternDisjunction* parenthesisDisjunction = m_alternative->m_parent;
m_alternative = m_alternative->m_parent->m_parent;
+
+ PatternTerm& lastTerm = m_alternative->lastTerm();
+
+ unsigned numParenAlternatives = parenthesisDisjunction->m_alternatives.size();
+ unsigned numBOLAnchoredAlts = 0;
+ // Bubble up BOL flags
+ for (unsigned i = 0; i < numParenAlternatives; i++) {
+ if (parenthesisDisjunction->m_alternatives[i]->m_startsWithBOL)
+ numBOLAnchoredAlts++;
+ }
+
+ if (numBOLAnchoredAlts) {
+ m_alternative->m_containsBOL = true;
+ // If all the alternatives in parens start with BOL, then so does this one
+ if (numBOLAnchoredAlts == numParenAlternatives)
+ m_alternative->m_startsWithBOL = true;
+ }
- m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+ lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+ m_invertParentheticalAssertion = false;
}
void atomBackReference(unsigned subpatternId)
@@ -397,32 +424,39 @@ public:
m_alternative->m_terms.append(PatternTerm(subpatternId));
}
- PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
+ // deep copy the argument disjunction. If filterStartsWithBOL is true,
+ // skip alternatives with m_startsWithBOL set true.
+ PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
- PatternDisjunction* newDisjunction = new PatternDisjunction();
-
- newDisjunction->m_parent = disjunction->m_parent;
+ PatternDisjunction* newDisjunction = 0;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt];
- PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
- for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
- newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
+ if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
+ if (!newDisjunction) {
+ newDisjunction = new PatternDisjunction();
+ newDisjunction->m_parent = disjunction->m_parent;
+ }
+ PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
+ newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
+ }
}
-
- m_pattern.m_disjunctions.append(newDisjunction);
+
+ if (newDisjunction)
+ m_pattern.m_disjunctions.append(newDisjunction);
return newDisjunction;
}
-
- PatternTerm copyTerm(PatternTerm& term)
+
+ PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
{
if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
return PatternTerm(term);
-
+
PatternTerm termCopy = term;
- termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
+ termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
return termCopy;
}
-
+
void quantifyAtom(unsigned min, unsigned max, bool greedy)
{
ASSERT(min <= max);
@@ -589,11 +623,40 @@ public:
setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
}
+ void optimizeBOL()
+ {
+ // Look for expressions containing beginning of line (^) anchoring and unroll them.
+ // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
+ // This code relies on the parsing code tagging alternatives with m_containsBOL and
+ // m_startsWithBOL and rolling those up to containing alternatives.
+ // At this point, this is only valid for non-multiline expressions.
+ PatternDisjunction* disjunction = m_pattern.m_body;
+
+ if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+ return;
+
+ PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
+
+ // Set alternatives in disjunction to "onceThrough"
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives[alt]->setOnceThrough();
+
+ if (loopDisjunction) {
+ // Move alternatives from loopDisjunction to disjunction
+ for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
+
+ loopDisjunction->m_alternatives.clear();
+ }
+ }
+
+
private:
RegexPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
bool m_invertCharacterClass;
+ bool m_invertParentheticalAssertion;
};
@@ -621,6 +684,8 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
ASSERT(numSubpatterns == pattern.m_numSubpatterns);
}
+ constructor.optimizeBOL();
+
constructor.setupOffsets();
return 0;