summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/yarr
diff options
context:
space:
mode:
authorRussell Brenner <russellbrenner@google.com>2010-11-18 17:33:13 -0800
committerRussell Brenner <russellbrenner@google.com>2010-12-02 13:47:21 -0800
commit6b70adc33054f8aee8c54d0f460458a9df11b8a5 (patch)
tree103a13998c33944d6ab3b8318c509a037e639460 /JavaScriptCore/yarr
parentbdf4ebc8e70b2d221b6ee7a65660918ecb1d33aa (diff)
downloadexternal_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.zip
external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.gz
external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.bz2
Merge WebKit at r72274: Initial merge by git.
Change-Id: Ie51f0b4a16da82942bd516dce59cfb79ebbe25fb
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r--JavaScriptCore/yarr/RegexCompiler.cpp253
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.cpp211
-rw-r--r--JavaScriptCore/yarr/RegexInterpreter.h22
-rw-r--r--JavaScriptCore/yarr/RegexJIT.h4
-rw-r--r--JavaScriptCore/yarr/RegexParser.h2
-rw-r--r--JavaScriptCore/yarr/RegexPattern.h30
6 files changed, 467 insertions, 55 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index 9f9e028..ccfc94e 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -235,11 +236,117 @@ private:
Vector<CharacterRange> m_rangesUnicode;
};
+struct BeginCharHelper {
+ BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
+ : m_beginChars(beginChars)
+ , m_isCaseInsensitive(isCaseInsensitive)
+ {}
+
+ void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
+ {
+ if (quantityType == QuantifierFixedCount && quantityCount > 1) {
+ // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
+ beginChar.value |= beginChar.value << 16;
+ beginChar.mask |= beginChar.mask << 16;
+ addCharacter(beginChar);
+ } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
+ // In case of characters with fixed quantifier we should check the next character as well.
+ linkHotTerms(beginChar, hotTerms);
+ else
+ // In case of greedy matching the next character checking is unnecessary therefore we just store
+ // the first character.
+ addCharacter(beginChar);
+ }
+
+ // Merge two following BeginChars in the vector to reduce the number of character checks.
+ void merge(unsigned size)
+ {
+ for (unsigned i = 0; i < size; i++) {
+ BeginChar* curr = &m_beginChars->at(i);
+ BeginChar* next = &m_beginChars->at(i + 1);
+
+ // If the current and the next size of value is different we should skip the merge process
+ // because the 16bit and 32bit values are unmergable.
+ if (curr->value <= 0xFFFF && next->value > 0xFFFF)
+ continue;
+
+ unsigned diff = curr->value ^ next->value;
+
+ curr->mask |= diff;
+ curr->value |= curr->mask;
+
+ m_beginChars->remove(i + 1);
+ size--;
+ }
+ }
+
+private:
+ void addCharacter(BeginChar beginChar)
+ {
+ unsigned pos = 0;
+ unsigned range = m_beginChars->size();
+
+ // binary chop, find position to insert char.
+ while (range) {
+ unsigned index = range >> 1;
+
+ int val = m_beginChars->at(pos+index).value - beginChar.value;
+ if (!val)
+ return;
+ if (val < 0)
+ range = index;
+ else {
+ pos += (index+1);
+ range -= (index+1);
+ }
+ }
+
+ if (pos == m_beginChars->size())
+ m_beginChars->append(beginChar);
+ else
+ m_beginChars->insert(pos, beginChar);
+ }
+
+ // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
+ void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
+ {
+ for (unsigned i = 0; i < hotTerms->size(); i++) {
+ PatternTerm hotTerm = hotTerms->at(i).term;
+ ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
+
+ UChar characterNext = hotTerm.patternCharacter;
+
+ // Append a character to an existing BeginChar object.
+ if (characterNext <= 0x7f) {
+ unsigned mask = 0;
+
+ if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
+ mask = 32;
+ characterNext = toASCIILower(characterNext);
+ }
+
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
+ } else {
+ UChar upper, lower;
+ if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
+ addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
+ addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
+ } else
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
+ }
+ }
+ }
+
+ Vector<BeginChar>* m_beginChars;
+ bool m_isCaseInsensitive;
+};
+
class RegexPatternConstructor {
public:
RegexPatternConstructor(RegexPattern& pattern)
: m_pattern(pattern)
, m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
, m_invertParentheticalAssertion(false)
{
}
@@ -649,12 +756,153 @@ public:
loopDisjunction->m_alternatives.clear();
}
}
-
-
+
+ bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
+ {
+ if (term.quantityType == QuantifierFixedCount) {
+ beginTerms->append(TermChain(term));
+ if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
+ setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
+ } else if (termIndex != numTerms - 1) {
+ beginTerms->append(TermChain(term));
+ return true;
+ }
+
+ return false;
+ }
+
+ // This function collects the terms which are potentially matching the first number of depth characters in the result.
+ // If this function returns false then it found at least one term which makes the beginning character
+ // look-up optimization inefficient.
+ bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+ if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
+ return false;
+ }
+
+ return true;
+ }
+
+ bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
+ {
+ bool checkNext = true;
+ unsigned numTerms = alternative->m_terms.size();
+
+ while (checkNext && termIndex < numTerms) {
+ PatternTerm term = alternative->m_terms[termIndex];
+ checkNext = false;
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ case PatternTerm::TypeAssertionEOL:
+ case PatternTerm::TypeAssertionWordBoundary:
+ return false;
+
+ case PatternTerm::TypeBackReference:
+ case PatternTerm::TypeForwardReference:
+ return false;
+
+ case PatternTerm::TypePatternCharacter:
+ if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
+ termIndex++;
+ checkNext = true;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ return false;
+
+ case PatternTerm::TypeParentheticalAssertion:
+ if (term.invertOrCapture)
+ return false;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ if (term.quantityType != QuantifierFixedCount) {
+ if (termIndex == numTerms - 1)
+ break;
+
+ termIndex++;
+ checkNext = true;
+
+ }
+
+ if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
+ return false;
+
+ break;
+ }
+ }
+
+ return true;
+ }
+
+ void setupBeginChars()
+ {
+ Vector<TermChain> beginTerms;
+ bool containsFixedCharacter = false;
+
+ if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
+ && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
+ unsigned size = beginTerms.size();
+
+ // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
+ if (!size)
+ return;
+
+ m_pattern.m_containsBeginChars = true;
+
+ for (unsigned i = 0; i < size; i++) {
+ PatternTerm term = beginTerms[i].term;
+
+ // We have just collected PatternCharacter terms, other terms are not allowed.
+ ASSERT(term.type == PatternTerm::TypePatternCharacter);
+
+ if (term.quantityType == QuantifierFixedCount)
+ containsFixedCharacter = true;
+
+ UChar character = term.patternCharacter;
+ unsigned mask = 0;
+
+ if (character <= 0x7f) {
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
+ mask = 32;
+ character = toASCIILower(character);
+ }
+
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else {
+ UChar upper, lower;
+ if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
+ m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ }
+ }
+
+ // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
+ if (!containsFixedCharacter) {
+ m_pattern.m_containsBeginChars = false;
+ return;
+ }
+
+ size = m_pattern.m_beginChars.size();
+
+ if (size > 2)
+ m_beginCharHelper.merge(size - 1);
+ else if (size <= 1)
+ m_pattern.m_containsBeginChars = false;
+ }
+ }
+
private:
RegexPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
+ BeginCharHelper m_beginCharHelper;
bool m_invertCharacterClass;
bool m_invertParentheticalAssertion;
};
@@ -687,6 +935,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
constructor.optimizeBOL();
constructor.setupOffsets();
+ constructor.setupBeginChars();
return 0;
};
diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp
index ec96636..dc3024a 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.cpp
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -195,6 +196,12 @@ public:
return -1;
}
+ int readPair()
+ {
+ ASSERT(pos + 1 < length);
+ return input[pos] | input[pos + 1] << 16;
+ }
+
int readChecked(int position)
{
ASSERT(position < 0);
@@ -262,6 +269,11 @@ public:
return (pos + position) == length;
}
+ bool isNotAvailableInput(int position)
+ {
+ return (pos + position) > length;
+ }
+
private:
const UChar* input;
unsigned pos;
@@ -591,20 +603,24 @@ public:
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
context->restoreOutput(output, firstSubpatternId, count);
}
- bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+ JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
{
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
- return true;
+ JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch)
+ return JSRegExpMatch;
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
- return false;
+ return JSRegExpNoMatch;
}
bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
@@ -765,7 +781,7 @@ public:
return false;
}
- bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
+ JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
@@ -786,31 +802,42 @@ public:
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+
+ if (result == JSRegExpNoMatch) {
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ } else
+ return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
- return true;
+ return JSRegExpMatch;
}
case QuantifierGreedy: {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+
break;
}
}
@@ -819,15 +846,15 @@ public:
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
case QuantifierNonGreedy:
- return true;
+ return JSRegExpMatch;
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
}
// Rules for backtracking differ depending on whether this is greedy or non-greedy.
@@ -840,7 +867,7 @@ public:
// Non-greedy, we've already done the one less case, so don't match on popping.
// We haven't done the one more case, so always try to add that.
//
- bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+ JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
@@ -859,44 +886,58 @@ public:
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = 0;
+ JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+ if (result != JSRegExpMatch)
+ return result;
// While we haven't yet reached our fixed limit,
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+ if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
+
+ if (result == JSRegExpNoMatch) {
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ } else
+ return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
- return true;
+ return JSRegExpMatch;
}
case QuantifierGreedy: {
if (!backTrack->matchAmount)
- return false;
+ return JSRegExpNoMatch;
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (parenthesesResult == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
+
+ if (parenthesesResult != JSRegExpNoMatch)
+ return parenthesesResult;
+
break;
}
}
@@ -904,63 +945,108 @@ public:
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
if (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
case QuantifierNonGreedy: {
// If we've not reached the limit, try to add one more match.
if (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch) {
appendParenthesesDisjunctionContext(backTrack, context);
recordParenthesesMatch(term, context);
- return true;
- } else {
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
+ return JSRegExpMatch;
}
+
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
}
// Nope - okay backtrack looking for an alternative.
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
// successful backtrack! we're back in the game!
if (backTrack->matchAmount) {
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
- return true;
+ return JSRegExpMatch;
}
// pop a match off the stack
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
+
+ return result;
}
- return false;
+ return JSRegExpNoMatch;
}
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
+ }
+
+ void lookupForBeginChars()
+ {
+ int character;
+ bool firstSingleCharFound;
+
+ while (true) {
+ if (input.isNotAvailableInput(2))
+ return;
+
+ firstSingleCharFound = false;
+
+ character = input.readPair();
+
+ for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
+ BeginChar bc = pattern->m_beginChars[i];
+
+ if (!firstSingleCharFound && bc.value <= 0xFFFF) {
+ firstSingleCharFound = true;
+ character &= 0xFFFF;
+ }
+
+ if ((character | bc.mask) == bc.value)
+ return;
+ }
+
+ input.next();
+ }
}
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
#define BACKTRACK() { --context->term; goto backtrack; }
#define currentTerm() (disjunction->terms[context->term])
- bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
{
+ if (!--remainingMatchCount)
+ return JSRegExpErrorHitLimit;
+
if (btrack)
BACKTRACK();
+ if (pattern->m_containsBeginChars && isBody)
+ lookupForBeginChars();
+
context->matchBegin = input.getPos();
context->term = 0;
@@ -972,14 +1058,14 @@ public:
MATCH_NEXT();
case ByteTerm::TypeSubpatternEnd:
context->matchEnd = input.getPos();
- return true;
+ return JSRegExpMatch;
case ByteTerm::TypeBodyAlternativeBegin:
MATCH_NEXT();
case ByteTerm::TypeBodyAlternativeDisjunction:
case ByteTerm::TypeBodyAlternativeEnd:
context->matchEnd = input.getPos();
- return true;
+ return JSRegExpMatch;
case ByteTerm::TypeAlternativeBegin:
MATCH_NEXT();
@@ -1069,10 +1155,16 @@ public:
if (matchBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (matchParentheses(currentTerm(), context))
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = matchParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
BACKTRACK();
+ }
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (matchParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1104,7 +1196,7 @@ public:
switch (currentTerm().type) {
case ByteTerm::TypeSubpatternBegin:
- return false;
+ return JSRegExpNoMatch;
case ByteTerm::TypeSubpatternEnd:
ASSERT_NOT_REACHED();
@@ -1116,9 +1208,13 @@ public:
MATCH_NEXT();
if (input.atEnd())
- return false;
+ return JSRegExpNoMatch;
input.next();
+
+ if (pattern->m_containsBeginChars && isBody)
+ lookupForBeginChars();
+
context->matchBegin = input.getPos();
if (currentTerm().alternative.onceThrough)
@@ -1172,10 +1268,16 @@ public:
if (backtrackBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (backtrackParentheses(currentTerm(), context))
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
BACKTRACK();
+ }
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (backtrackParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
@@ -1199,20 +1301,23 @@ public:
}
ASSERT_NOT_REACHED();
- return false;
+ return JSRegExpErrorNoMatch;
}
- bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
{
- if (matchDisjunction(disjunction, context, btrack)) {
+ JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
+
+ if (result == JSRegExpMatch) {
while (context->matchBegin == context->matchEnd) {
- if (!matchDisjunction(disjunction, context, true))
- return false;
+ result = matchDisjunction(disjunction, context, true);
+ if (result != JSRegExpMatch)
+ return result;
}
- return true;
+ return JSRegExpMatch;
}
- return false;
+ return result;
}
int interpret()
@@ -1226,7 +1331,8 @@ public:
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
- if (matchDisjunction(pattern->m_body.get(), context)) {
+ JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
+ if (result == JSRegExpMatch) {
output[0] = context->matchBegin;
output[1] = context->matchEnd;
}
@@ -1235,6 +1341,9 @@ public:
pattern->m_allocator->stopAllocator();
+ if (output[0] == -1 && result != JSRegExpNoMatch)
+ return result;
+
return output[0];
}
@@ -1243,6 +1352,7 @@ public:
, output(output)
, input(inputChar, start, length)
, allocatorPool(0)
+ , remainingMatchCount(matchLimit)
{
}
@@ -1251,6 +1361,7 @@ private:
int* output;
InputStream input;
BumpPointerPool* allocatorPool;
+ unsigned remainingMatchCount;
};
diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h
index 37f17fe..dae8f9d 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.h
+++ b/JavaScriptCore/yarr/RegexInterpreter.h
@@ -40,6 +40,21 @@ using WTF::BumpPointerAllocator;
namespace JSC { namespace Yarr {
+// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed.
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+enum JSRegExpResult {
+ JSRegExpMatch = 1,
+ JSRegExpNoMatch = 0,
+ JSRegExpErrorNoMatch = -1,
+ JSRegExpErrorHitLimit = -2,
+ JSRegExpErrorNoMemory = -3,
+ JSRegExpErrorInternal = -4
+};
+
class ByteDisjunction;
struct ByteTerm {
@@ -309,6 +324,7 @@ struct BytecodePattern : FastAllocBase {
: m_body(body)
, m_ignoreCase(pattern.m_ignoreCase)
, m_multiline(pattern.m_multiline)
+ , m_containsBeginChars(pattern.m_containsBeginChars)
, m_allocator(allocator)
{
newlineCharacterClass = pattern.newlineCharacterClass();
@@ -320,6 +336,8 @@ struct BytecodePattern : FastAllocBase {
// array, so that it won't delete them on destruction. We'll
// take responsibility for that.
pattern.m_userCharacterClasses.clear();
+
+ m_beginChars.append(pattern.m_beginChars);
}
~BytecodePattern()
@@ -331,12 +349,16 @@ struct BytecodePattern : FastAllocBase {
OwnPtr<ByteDisjunction> m_body;
bool m_ignoreCase;
bool m_multiline;
+ bool m_containsBeginChars;
// Each BytecodePattern is associated with a RegExp, each RegExp is associated
// with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator.
BumpPointerAllocator* m_allocator;
CharacterClass* newlineCharacterClass;
CharacterClass* wordcharCharacterClass;
+
+ Vector<BeginChar> m_beginChars;
+
private:
Vector<ByteDisjunction*> m_allParenthesesInfo;
Vector<CharacterClass*> m_userCharacterClasses;
diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h
index 5d587b2..02e905d 100644
--- a/JavaScriptCore/yarr/RegexJIT.h
+++ b/JavaScriptCore/yarr/RegexJIT.h
@@ -30,9 +30,9 @@
#include "MacroAssembler.h"
#include "RegexPattern.h"
-#include <UString.h>
+#include "UString.h"
-#include <pcre.h>
+#include "pcre.h"
struct JSRegExp; // temporary, remove when fallback is removed.
#if CPU(X86) && !COMPILER(MSVC)
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index 8a5eeba..ede9417 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -28,7 +28,7 @@
#if ENABLE(YARR)
-#include <UString.h>
+#include "UString.h"
#include <limits.h>
#include <wtf/ASCIICType.h>
#include <wtf/unicode/Unicode.h>
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index eecbd43..be31fcd 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -283,11 +284,36 @@ CharacterClass* nondigitsCreate();
CharacterClass* nonspacesCreate();
CharacterClass* nonwordcharCreate();
+struct TermChain {
+ TermChain(PatternTerm term)
+ : term(term)
+ {}
+
+ PatternTerm term;
+ Vector<TermChain> hotTerms;
+};
+
+struct BeginChar {
+ BeginChar()
+ : value(0)
+ , mask(0)
+ {}
+
+ BeginChar(unsigned value, unsigned mask)
+ : value(value)
+ , mask(mask)
+ {}
+
+ unsigned value;
+ unsigned mask;
+};
+
struct RegexPattern {
RegexPattern(bool ignoreCase, bool multiline)
: m_ignoreCase(ignoreCase)
, m_multiline(multiline)
, m_containsBackreferences(false)
+ , m_containsBeginChars(false)
, m_containsBOL(false)
, m_numSubpatterns(0)
, m_maxBackReference(0)
@@ -313,6 +339,7 @@ struct RegexPattern {
m_maxBackReference = 0;
m_containsBackreferences = false;
+ m_containsBeginChars = false;
m_containsBOL = false;
newlineCached = 0;
@@ -327,6 +354,7 @@ struct RegexPattern {
m_disjunctions.clear();
deleteAllValues(m_userCharacterClasses);
m_userCharacterClasses.clear();
+ m_beginChars.clear();
}
bool containsIllegalBackReference()
@@ -380,12 +408,14 @@ struct RegexPattern {
bool m_ignoreCase : 1;
bool m_multiline : 1;
bool m_containsBackreferences : 1;
+ bool m_containsBeginChars : 1;
bool m_containsBOL : 1;
unsigned m_numSubpatterns;
unsigned m_maxBackReference;
PatternDisjunction* m_body;
Vector<PatternDisjunction*, 4> m_disjunctions;
Vector<CharacterClass*> m_userCharacterClasses;
+ Vector<BeginChar> m_beginChars;
private:
CharacterClass* newlineCached;