diff options
author | Russell Brenner <russellbrenner@google.com> | 2010-11-18 17:33:13 -0800 |
---|---|---|
committer | Russell Brenner <russellbrenner@google.com> | 2010-12-02 13:47:21 -0800 |
commit | 6b70adc33054f8aee8c54d0f460458a9df11b8a5 (patch) | |
tree | 103a13998c33944d6ab3b8318c509a037e639460 /JavaScriptCore/yarr | |
parent | bdf4ebc8e70b2d221b6ee7a65660918ecb1d33aa (diff) | |
download | external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.zip external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.gz external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.bz2 |
Merge WebKit at r72274: Initial merge by git.
Change-Id: Ie51f0b4a16da82942bd516dce59cfb79ebbe25fb
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.cpp | 253 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.cpp | 211 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.h | 22 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexJIT.h | 4 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexParser.h | 2 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexPattern.h | 30 |
6 files changed, 467 insertions, 55 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp index 9f9e028..ccfc94e 100644 --- a/JavaScriptCore/yarr/RegexCompiler.cpp +++ b/JavaScriptCore/yarr/RegexCompiler.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -235,11 +236,117 @@ private: Vector<CharacterRange> m_rangesUnicode; }; +struct BeginCharHelper { + BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) + : m_beginChars(beginChars) + , m_isCaseInsensitive(isCaseInsensitive) + {} + + void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) + { + if (quantityType == QuantifierFixedCount && quantityCount > 1) { + // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ + beginChar.value |= beginChar.value << 16; + beginChar.mask |= beginChar.mask << 16; + addCharacter(beginChar); + } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) + // In case of characters with fixed quantifier we should check the next character as well. + linkHotTerms(beginChar, hotTerms); + else + // In case of greedy matching the next character checking is unnecessary therefore we just store + // the first character. + addCharacter(beginChar); + } + + // Merge two following BeginChars in the vector to reduce the number of character checks. + void merge(unsigned size) + { + for (unsigned i = 0; i < size; i++) { + BeginChar* curr = &m_beginChars->at(i); + BeginChar* next = &m_beginChars->at(i + 1); + + // If the current and the next size of value is different we should skip the merge process + // because the 16bit and 32bit values are unmergable. + if (curr->value <= 0xFFFF && next->value > 0xFFFF) + continue; + + unsigned diff = curr->value ^ next->value; + + curr->mask |= diff; + curr->value |= curr->mask; + + m_beginChars->remove(i + 1); + size--; + } + } + +private: + void addCharacter(BeginChar beginChar) + { + unsigned pos = 0; + unsigned range = m_beginChars->size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = m_beginChars->at(pos+index).value - beginChar.value; + if (!val) + return; + if (val < 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == m_beginChars->size()) + m_beginChars->append(beginChar); + else + m_beginChars->insert(pos, beginChar); + } + + // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. + void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) + { + for (unsigned i = 0; i < hotTerms->size(); i++) { + PatternTerm hotTerm = hotTerms->at(i).term; + ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); + + UChar characterNext = hotTerm.patternCharacter; + + // Append a character to an existing BeginChar object. + if (characterNext <= 0x7f) { + unsigned mask = 0; + + if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { + mask = 32; + characterNext = toASCIILower(characterNext); + } + + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { + addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); + addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); + } else + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); + } + } + } + + Vector<BeginChar>* m_beginChars; + bool m_isCaseInsensitive; +}; + class RegexPatternConstructor { public: RegexPatternConstructor(RegexPattern& pattern) : m_pattern(pattern) , m_characterClassConstructor(pattern.m_ignoreCase) + , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) , m_invertParentheticalAssertion(false) { } @@ -649,12 +756,153 @@ public: loopDisjunction->m_alternatives.clear(); } } - - + + bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) + { + if (term.quantityType == QuantifierFixedCount) { + beginTerms->append(TermChain(term)); + if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) + setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); + } else if (termIndex != numTerms - 1) { + beginTerms->append(TermChain(term)); + return true; + } + + return false; + } + + // This function collects the terms which are potentially matching the first number of depth characters in the result. + // If this function returns false then it found at least one term which makes the beginning character + // look-up optimization inefficient. + bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) + return false; + } + + return true; + } + + bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) + { + bool checkNext = true; + unsigned numTerms = alternative->m_terms.size(); + + while (checkNext && termIndex < numTerms) { + PatternTerm term = alternative->m_terms[termIndex]; + checkNext = false; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + return false; + + case PatternTerm::TypeBackReference: + case PatternTerm::TypeForwardReference: + return false; + + case PatternTerm::TypePatternCharacter: + if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { + termIndex++; + checkNext = true; + } + break; + + case PatternTerm::TypeCharacterClass: + return false; + + case PatternTerm::TypeParentheticalAssertion: + if (term.invertOrCapture) + return false; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityType != QuantifierFixedCount) { + if (termIndex == numTerms - 1) + break; + + termIndex++; + checkNext = true; + + } + + if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) + return false; + + break; + } + } + + return true; + } + + void setupBeginChars() + { + Vector<TermChain> beginTerms; + bool containsFixedCharacter = false; + + if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) + && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { + unsigned size = beginTerms.size(); + + // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. + if (!size) + return; + + m_pattern.m_containsBeginChars = true; + + for (unsigned i = 0; i < size; i++) { + PatternTerm term = beginTerms[i].term; + + // We have just collected PatternCharacter terms, other terms are not allowed. + ASSERT(term.type == PatternTerm::TypePatternCharacter); + + if (term.quantityType == QuantifierFixedCount) + containsFixedCharacter = true; + + UChar character = term.patternCharacter; + unsigned mask = 0; + + if (character <= 0x7f) { + if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { + mask = 32; + character = toASCIILower(character); + } + + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else { + UChar upper, lower; + if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { + m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } + } + + // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. + if (!containsFixedCharacter) { + m_pattern.m_containsBeginChars = false; + return; + } + + size = m_pattern.m_beginChars.size(); + + if (size > 2) + m_beginCharHelper.merge(size - 1); + else if (size <= 1) + m_pattern.m_containsBeginChars = false; + } + } + private: RegexPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + BeginCharHelper m_beginCharHelper; bool m_invertCharacterClass; bool m_invertParentheticalAssertion; }; @@ -687,6 +935,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern) constructor.optimizeBOL(); constructor.setupOffsets(); + constructor.setupBeginChars(); return 0; }; diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp index ec96636..dc3024a 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.cpp +++ b/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -195,6 +196,12 @@ public: return -1; } + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + int readChecked(int position) { ASSERT(position < 0); @@ -262,6 +269,11 @@ public: return (pos + position) == length; } + bool isNotAvailableInput(int position) + { + return (pos + position) > length; + } + private: const UChar* input; unsigned pos; @@ -591,20 +603,24 @@ public: unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; context->restoreOutput(output, firstSubpatternId, count); } - bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) { while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) - return true; + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } - return false; + return JSRegExpNoMatch; } bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) @@ -765,7 +781,7 @@ public: return false; } - bool matchParentheses(ByteTerm& term, DisjunctionContext* context) + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); @@ -786,31 +802,42 @@ public: while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); - return true; + return JSRegExpMatch; } case QuantifierGreedy: { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + break; } } @@ -819,15 +846,15 @@ public: ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } case QuantifierNonGreedy: - return true; + return JSRegExpMatch; } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; } // Rules for backtracking differ depending on whether this is greedy or non-greedy. @@ -840,7 +867,7 @@ public: // Non-greedy, we've already done the one less case, so don't match on popping. // We haven't done the one more case, so always try to add that. // - bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); @@ -859,44 +886,58 @@ public: ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + if (result != JSRegExpMatch) + return result; // While we haven't yet reached our fixed limit, while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); context = backTrack->lastContext; recordParenthesesMatch(term, context); - return true; + return JSRegExpMatch; } case QuantifierGreedy: { if (!backTrack->matchAmount) - return false; + return JSRegExpNoMatch; ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + break; } } @@ -904,63 +945,108 @@ public: resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } if (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } case QuantifierNonGreedy: { // If we've not reached the limit, try to add one more match. if (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { appendParenthesesDisjunctionContext(backTrack, context); recordParenthesesMatch(term, context); - return true; - } else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); + return JSRegExpMatch; } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } // Nope - okay backtrack looking for an alternative. while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { // successful backtrack! we're back in the game! if (backTrack->matchAmount) { context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } // pop a match off the stack resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + return result; } - return false; + return JSRegExpNoMatch; } } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; + } + + void lookupForBeginChars() + { + int character; + bool firstSingleCharFound; + + while (true) { + if (input.isNotAvailableInput(2)) + return; + + firstSingleCharFound = false; + + character = input.readPair(); + + for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { + BeginChar bc = pattern->m_beginChars[i]; + + if (!firstSingleCharFound && bc.value <= 0xFFFF) { + firstSingleCharFound = true; + character &= 0xFFFF; + } + + if ((character | bc.mask) == bc.value) + return; + } + + input.next(); + } } #define MATCH_NEXT() { ++context->term; goto matchAgain; } #define BACKTRACK() { --context->term; goto backtrack; } #define currentTerm() (disjunction->terms[context->term]) - bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + if (btrack) BACKTRACK(); + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + context->matchBegin = input.getPos(); context->term = 0; @@ -972,14 +1058,14 @@ public: MATCH_NEXT(); case ByteTerm::TypeSubpatternEnd: context->matchEnd = input.getPos(); - return true; + return JSRegExpMatch; case ByteTerm::TypeBodyAlternativeBegin: MATCH_NEXT(); case ByteTerm::TypeBodyAlternativeDisjunction: case ByteTerm::TypeBodyAlternativeEnd: context->matchEnd = input.getPos(); - return true; + return JSRegExpMatch; case ByteTerm::TypeAlternativeBegin: MATCH_NEXT(); @@ -1069,10 +1155,16 @@ public: if (matchBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (matchParentheses(currentTerm(), context)) + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + BACKTRACK(); + } case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (matchParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1104,7 +1196,7 @@ public: switch (currentTerm().type) { case ByteTerm::TypeSubpatternBegin: - return false; + return JSRegExpNoMatch; case ByteTerm::TypeSubpatternEnd: ASSERT_NOT_REACHED(); @@ -1116,9 +1208,13 @@ public: MATCH_NEXT(); if (input.atEnd()) - return false; + return JSRegExpNoMatch; input.next(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + context->matchBegin = input.getPos(); if (currentTerm().alternative.onceThrough) @@ -1172,10 +1268,16 @@ public: if (backtrackBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (backtrackParentheses(currentTerm(), context)) + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + BACKTRACK(); + } case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (backtrackParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1199,20 +1301,23 @@ public: } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; } - bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) { - if (matchDisjunction(disjunction, context, btrack)) { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { while (context->matchBegin == context->matchEnd) { - if (!matchDisjunction(disjunction, context, true)) - return false; + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; } - return true; + return JSRegExpMatch; } - return false; + return result; } int interpret() @@ -1226,7 +1331,8 @@ public: DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); - if (matchDisjunction(pattern->m_body.get(), context)) { + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); + if (result == JSRegExpMatch) { output[0] = context->matchBegin; output[1] = context->matchEnd; } @@ -1235,6 +1341,9 @@ public: pattern->m_allocator->stopAllocator(); + if (output[0] == -1 && result != JSRegExpNoMatch) + return result; + return output[0]; } @@ -1243,6 +1352,7 @@ public: , output(output) , input(inputChar, start, length) , allocatorPool(0) + , remainingMatchCount(matchLimit) { } @@ -1251,6 +1361,7 @@ private: int* output; InputStream input; BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; }; diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h index 37f17fe..dae8f9d 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.h +++ b/JavaScriptCore/yarr/RegexInterpreter.h @@ -40,6 +40,21 @@ using WTF::BumpPointerAllocator; namespace JSC { namespace Yarr { +// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed. + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + class ByteDisjunction; struct ByteTerm { @@ -309,6 +324,7 @@ struct BytecodePattern : FastAllocBase { : m_body(body) , m_ignoreCase(pattern.m_ignoreCase) , m_multiline(pattern.m_multiline) + , m_containsBeginChars(pattern.m_containsBeginChars) , m_allocator(allocator) { newlineCharacterClass = pattern.newlineCharacterClass(); @@ -320,6 +336,8 @@ struct BytecodePattern : FastAllocBase { // array, so that it won't delete them on destruction. We'll // take responsibility for that. pattern.m_userCharacterClasses.clear(); + + m_beginChars.append(pattern.m_beginChars); } ~BytecodePattern() @@ -331,12 +349,16 @@ struct BytecodePattern : FastAllocBase { OwnPtr<ByteDisjunction> m_body; bool m_ignoreCase; bool m_multiline; + bool m_containsBeginChars; // Each BytecodePattern is associated with a RegExp, each RegExp is associated // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator. BumpPointerAllocator* m_allocator; CharacterClass* newlineCharacterClass; CharacterClass* wordcharCharacterClass; + + Vector<BeginChar> m_beginChars; + private: Vector<ByteDisjunction*> m_allParenthesesInfo; Vector<CharacterClass*> m_userCharacterClasses; diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h index 5d587b2..02e905d 100644 --- a/JavaScriptCore/yarr/RegexJIT.h +++ b/JavaScriptCore/yarr/RegexJIT.h @@ -30,9 +30,9 @@ #include "MacroAssembler.h" #include "RegexPattern.h" -#include <UString.h> +#include "UString.h" -#include <pcre.h> +#include "pcre.h" struct JSRegExp; // temporary, remove when fallback is removed. #if CPU(X86) && !COMPILER(MSVC) diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h index 8a5eeba..ede9417 100644 --- a/JavaScriptCore/yarr/RegexParser.h +++ b/JavaScriptCore/yarr/RegexParser.h @@ -28,7 +28,7 @@ #if ENABLE(YARR) -#include <UString.h> +#include "UString.h" #include <limits.h> #include <wtf/ASCIICType.h> #include <wtf/unicode/Unicode.h> diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h index eecbd43..be31fcd 100644 --- a/JavaScriptCore/yarr/RegexPattern.h +++ b/JavaScriptCore/yarr/RegexPattern.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -283,11 +284,36 @@ CharacterClass* nondigitsCreate(); CharacterClass* nonspacesCreate(); CharacterClass* nonwordcharCreate(); +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct BeginChar { + BeginChar() + : value(0) + , mask(0) + {} + + BeginChar(unsigned value, unsigned mask) + : value(value) + , mask(mask) + {} + + unsigned value; + unsigned mask; +}; + struct RegexPattern { RegexPattern(bool ignoreCase, bool multiline) : m_ignoreCase(ignoreCase) , m_multiline(multiline) , m_containsBackreferences(false) + , m_containsBeginChars(false) , m_containsBOL(false) , m_numSubpatterns(0) , m_maxBackReference(0) @@ -313,6 +339,7 @@ struct RegexPattern { m_maxBackReference = 0; m_containsBackreferences = false; + m_containsBeginChars = false; m_containsBOL = false; newlineCached = 0; @@ -327,6 +354,7 @@ struct RegexPattern { m_disjunctions.clear(); deleteAllValues(m_userCharacterClasses); m_userCharacterClasses.clear(); + m_beginChars.clear(); } bool containsIllegalBackReference() @@ -380,12 +408,14 @@ struct RegexPattern { bool m_ignoreCase : 1; bool m_multiline : 1; bool m_containsBackreferences : 1; + bool m_containsBeginChars : 1; bool m_containsBOL : 1; unsigned m_numSubpatterns; unsigned m_maxBackReference; PatternDisjunction* m_body; Vector<PatternDisjunction*, 4> m_disjunctions; Vector<CharacterClass*> m_userCharacterClasses; + Vector<BeginChar> m_beginChars; private: CharacterClass* newlineCached; |