diff options
Diffstat (limited to 'JavaScriptCore/yarr')
| -rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.cpp | 469 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.h | 8 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.cpp | 635 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.h | 62 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexJIT.cpp | 467 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexJIT.h | 38 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexParser.h | 14 | ||||
| -rw-r--r-- | JavaScriptCore/yarr/RegexPattern.h | 95 |
8 files changed, 1274 insertions, 514 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp index 9cd3d12..06ecbad 100644 --- a/JavaScriptCore/yarr/RegexCompiler.cpp +++ b/JavaScriptCore/yarr/RegexCompiler.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,12 +31,12 @@ #include "RegexPattern.h" #include <wtf/Vector.h> -#if ENABLE(YARR) - using namespace WTF; namespace JSC { namespace Yarr { +#include "RegExpJitTables.h" + class CharacterClassConstructor { public: CharacterClassConstructor(bool isCaseInsensitive = false) @@ -141,7 +142,7 @@ public: CharacterClass* charClass() { - CharacterClass* characterClass = new CharacterClass(); + CharacterClass* characterClass = new CharacterClass(0); characterClass->m_matches.append(m_matches); characterClass->m_ranges.append(m_ranges); @@ -233,110 +234,118 @@ private: Vector<CharacterRange> m_rangesUnicode; }; +struct BeginCharHelper { + BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) + : m_beginChars(beginChars) + , m_isCaseInsensitive(isCaseInsensitive) + {} + + void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) + { + if (quantityType == QuantifierFixedCount && quantityCount > 1) { + // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ + beginChar.value |= beginChar.value << 16; + beginChar.mask |= beginChar.mask << 16; + addCharacter(beginChar); + } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) + // In case of characters with fixed quantifier we should check the next character as well. + linkHotTerms(beginChar, hotTerms); + else + // In case of greedy matching the next character checking is unnecessary therefore we just store + // the first character. + addCharacter(beginChar); + } -CharacterClass* newlineCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + // Merge two following BeginChars in the vector to reduce the number of character checks. + void merge(unsigned size) + { + for (unsigned i = 0; i < size; i++) { + BeginChar* curr = &m_beginChars->at(i); + BeginChar* next = &m_beginChars->at(i + 1); - characterClass->m_matches.append('\n'); - characterClass->m_matches.append('\r'); - characterClass->m_matchesUnicode.append(0x2028); - characterClass->m_matchesUnicode.append(0x2029); - - return characterClass; -} + // If the current and the next size of value is different we should skip the merge process + // because the 16bit and 32bit values are unmergable. + if (curr->value <= 0xFFFF && next->value > 0xFFFF) + continue; -CharacterClass* digitsCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + unsigned diff = curr->value ^ next->value; - characterClass->m_ranges.append(CharacterRange('0', '9')); - - return characterClass; -} + curr->mask |= diff; + curr->value |= curr->mask; -CharacterClass* spacesCreate() -{ - CharacterClass* characterClass = new CharacterClass(); - - characterClass->m_matches.append(' '); - characterClass->m_ranges.append(CharacterRange('\t', '\r')); - characterClass->m_matchesUnicode.append(0x00a0); - characterClass->m_matchesUnicode.append(0x1680); - characterClass->m_matchesUnicode.append(0x180e); - characterClass->m_matchesUnicode.append(0x2028); - characterClass->m_matchesUnicode.append(0x2029); - characterClass->m_matchesUnicode.append(0x202f); - characterClass->m_matchesUnicode.append(0x205f); - characterClass->m_matchesUnicode.append(0x3000); - characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a)); - - return characterClass; -} + m_beginChars->remove(i + 1); + size--; + } + } -CharacterClass* wordcharCreate() -{ - CharacterClass* characterClass = new CharacterClass(); +private: + void addCharacter(BeginChar beginChar) + { + unsigned pos = 0; + unsigned range = m_beginChars->size(); - characterClass->m_matches.append('_'); - characterClass->m_ranges.append(CharacterRange('0', '9')); - characterClass->m_ranges.append(CharacterRange('A', 'Z')); - characterClass->m_ranges.append(CharacterRange('a', 'z')); - - return characterClass; -} + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; -CharacterClass* nondigitsCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + int val = m_beginChars->at(pos+index).value - beginChar.value; + if (!val) + return; + if (val < 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } - characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); - characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); - - return characterClass; -} + if (pos == m_beginChars->size()) + m_beginChars->append(beginChar); + else + m_beginChars->insert(pos, beginChar); + } -CharacterClass* nonspacesCreate() -{ - CharacterClass* characterClass = new CharacterClass(); - - characterClass->m_ranges.append(CharacterRange(0, '\t' - 1)); - characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1)); - characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d)); - characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff)); - characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027)); - characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e)); - characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e)); - characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff)); - characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff)); - - return characterClass; -} + // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. + void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) + { + for (unsigned i = 0; i < hotTerms->size(); i++) { + PatternTerm hotTerm = hotTerms->at(i).term; + ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); -CharacterClass* nonwordcharCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + UChar characterNext = hotTerm.patternCharacter; - characterClass->m_matches.append('`'); - characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); - characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1)); - characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1)); - characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + // Append a character to an existing BeginChar object. + if (characterNext <= 0x7f) { + unsigned mask = 0; - return characterClass; -} + if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { + mask = 32; + characterNext = toASCIILower(characterNext); + } + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { + addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); + addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); + } else + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); + } + } + } + + Vector<BeginChar>* m_beginChars; + bool m_isCaseInsensitive; +}; class RegexPatternConstructor { public: RegexPatternConstructor(RegexPattern& pattern) : m_pattern(pattern) , m_characterClassConstructor(pattern.m_ignoreCase) + , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) { } @@ -352,6 +361,11 @@ public: void assertionBOL() { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } m_alternative->m_terms.append(PatternTerm::BOL()); } void assertionEOL() @@ -455,20 +469,42 @@ public: m_pattern.m_disjunctions.append(parenthesesDisjunction); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; } void atomParenthesesEnd() { ASSERT(m_alternative->m_parent); ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesisDisjunction = m_alternative->m_parent; m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesisDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + // Bubble up BOL flags + for (unsigned i = 0; i < numParenAlternatives; i++) { + if (parenthesisDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } - m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; } void atomBackReference(unsigned subpatternId) { ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); if (subpatternId > m_pattern.m_numSubpatterns) { @@ -493,32 +529,39 @@ public: m_alternative->m_terms.append(PatternTerm(subpatternId)); } - PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - PatternDisjunction* newDisjunction = new PatternDisjunction(); - - newDisjunction->m_parent = disjunction->m_parent; + PatternDisjunction* newDisjunction = 0; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt]; - PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) - newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } } - - m_pattern.m_disjunctions.append(newDisjunction); + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); return newDisjunction; } - - PatternTerm copyTerm(PatternTerm& term) + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) { if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) return PatternTerm(term); - + PatternTerm termCopy = term; - termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); return termCopy; } - + void quantifyAtom(unsigned min, unsigned max, bool greedy) { ASSERT(min <= max); @@ -624,14 +667,17 @@ public: case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - currentInputPosition += term.parentheses.disjunction->m_minimumSize; - } else { + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - } + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition; + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); term.inputPosition = currentInputPosition; } else { term.inputPosition = currentInputPosition; @@ -685,11 +731,208 @@ public: setupDisjunctionOffsets(m_pattern.m_body, 0, 0); } + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * a set of parens at the end of the regular expression (last term in any of the alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (unsigned i =0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == UINT_MAX + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) + { + if (term.quantityType == QuantifierFixedCount) { + beginTerms->append(TermChain(term)); + if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) + setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); + } else if (termIndex != numTerms - 1) { + beginTerms->append(TermChain(term)); + return true; + } + + return false; + } + + // This function collects the terms which are potentially matching the first number of depth characters in the result. + // If this function returns false then it found at least one term which makes the beginning character + // look-up optimization inefficient. + bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) + return false; + } + + return true; + } + + bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) + { + bool checkNext = true; + unsigned numTerms = alternative->m_terms.size(); + + while (checkNext && termIndex < numTerms) { + PatternTerm term = alternative->m_terms[termIndex]; + checkNext = false; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + return false; + + case PatternTerm::TypeBackReference: + case PatternTerm::TypeForwardReference: + return false; + + case PatternTerm::TypePatternCharacter: + if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { + termIndex++; + checkNext = true; + } + break; + + case PatternTerm::TypeCharacterClass: + return false; + + case PatternTerm::TypeParentheticalAssertion: + if (term.invertOrCapture) + return false; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityType != QuantifierFixedCount) { + if (termIndex == numTerms - 1) + break; + + termIndex++; + checkNext = true; + + } + + if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) + return false; + + break; + } + } + + return true; + } + + void setupBeginChars() + { + Vector<TermChain> beginTerms; + bool containsFixedCharacter = false; + + if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) + && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { + unsigned size = beginTerms.size(); + + // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. + if (!size) + return; + + m_pattern.m_containsBeginChars = true; + + for (unsigned i = 0; i < size; i++) { + PatternTerm term = beginTerms[i].term; + + // We have just collected PatternCharacter terms, other terms are not allowed. + ASSERT(term.type == PatternTerm::TypePatternCharacter); + + if (term.quantityType == QuantifierFixedCount) + containsFixedCharacter = true; + + UChar character = term.patternCharacter; + unsigned mask = 0; + + if (character <= 0x7f) { + if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { + mask = 32; + character = toASCIILower(character); + } + + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else { + UChar upper, lower; + if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { + m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } + } + + // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. + if (!containsFixedCharacter) { + m_pattern.m_containsBeginChars = false; + return; + } + + size = m_pattern.m_beginChars.size(); + + if (size > 2) + m_beginCharHelper.merge(size - 1); + else if (size <= 1) + m_pattern.m_containsBeginChars = false; + } + } + private: RegexPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + BeginCharHelper m_beginCharHelper; bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; }; @@ -717,12 +960,14 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern) ASSERT(numSubpatterns == pattern.m_numSubpatterns); } + constructor.checkForTerminalParentheses(); + constructor.optimizeBOL(); + constructor.setupOffsets(); + constructor.setupBeginChars(); - return false; + return 0; }; } } - -#endif diff --git a/JavaScriptCore/yarr/RegexCompiler.h b/JavaScriptCore/yarr/RegexCompiler.h index 3ed2be9..399374e 100644 --- a/JavaScriptCore/yarr/RegexCompiler.h +++ b/JavaScriptCore/yarr/RegexCompiler.h @@ -26,13 +26,9 @@ #ifndef RegexCompiler_h #define RegexCompiler_h -#include <wtf/Platform.h> - -#if ENABLE(YARR) - -#include <wtf/unicode/Unicode.h> #include "RegexParser.h" #include "RegexPattern.h" +#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { @@ -40,6 +36,4 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern); } } // namespace JSC::Yarr -#endif - #endif // RegexCompiler_h diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp index d088086..164158e 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.cpp +++ b/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,13 +29,12 @@ #include "RegexCompiler.h" #include "RegexPattern.h" +#include <wtf/BumpPointerAllocator.h> #ifndef NDEBUG #include <stdio.h> #endif -#if ENABLE(YARR) - using namespace WTF; namespace JSC { namespace Yarr { @@ -60,7 +60,10 @@ public: uintptr_t begin; }; struct BackTrackInfoParenthesesOnce { - uintptr_t inParentheses; + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; }; struct BackTrackInfoParentheses { uintptr_t matchAmount; @@ -104,12 +107,16 @@ public: DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) { - return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); + size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) DisjunctionContext(); } void freeDisjunctionContext(DisjunctionContext* context) { - free(context); + allocatorPool = allocatorPool->dealloc(context); } struct ParenthesesDisjunctionContext @@ -150,12 +157,16 @@ public: ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) { - return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); } void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) { - free(context); + allocatorPool = allocatorPool->dealloc(context); } class InputStream { @@ -186,6 +197,12 @@ public: return -1; } + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + int readChecked(int position) { ASSERT(position < 0); @@ -253,6 +270,11 @@ public: return (pos + position) == length; } + bool isNotAvailableInput(int position) + { + return (pos + position) > length; + } + private: const UChar* input; unsigned pos; @@ -280,20 +302,6 @@ public: return false; } - bool tryConsumeCharacter(int testChar) - { - if (input.atEnd()) - return false; - - int ch = input.read(); - - if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { - input.next(); - return true; - } - return false; - } - bool checkCharacter(int testChar, int inputPosition) { return testChar == input.readChecked(inputPosition); @@ -305,23 +313,6 @@ public: return (loChar == ch) || (hiChar == ch); } - bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) - { - if (input.atEnd()) - return false; - - bool match = testCharacterClass(characterClass, input.read()); - - if (invert) - match = !match; - - if (match) { - input.next(); - return true; - } - return false; - } - bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) { bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); @@ -335,10 +326,24 @@ public: if (!input.checkInput(matchSize)) return false; - for (int i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { - input.uncheckInput(matchSize); - return false; + if (pattern->m_ignoreCase) { + for (int i = 0; i < matchSize; ++i) { + int ch = input.reread(matchBegin + i); + + int lo = Unicode::toLower(ch); + int hi = Unicode::toUpper(ch); + + if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { + input.uncheckInput(matchSize); + return false; + } + } + } else { + for (int i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + input.uncheckInput(matchSize); + return false; + } } } @@ -503,6 +508,13 @@ public: int matchBegin = output[(term.atom.subpatternId << 1)]; int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. + // In this case the result of match is empty string like when it references to a parentheses with zero-width match. + // Eg.: /(a\1)/ + if (matchEnd == -1) + return true; + ASSERT((matchBegin == -1) == (matchEnd == -1)); ASSERT(matchBegin <= matchEnd); @@ -592,27 +604,24 @@ public: unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; context->restoreOutput(output, firstSubpatternId, count); } - void resetAssertionMatches(ByteTerm& term) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; - for (unsigned i = 0; i < (count << 1); ++i) - output[(firstSubpatternId << 1) + i] = -1; - } - bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) { while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) - return true; + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } - return false; + return JSRegExpNoMatch; } bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) @@ -625,11 +634,11 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: { // set this speculatively; if we get to the parens end this will be true. - backTrack->inParentheses = 1; + backTrack->begin = input.getPos(); break; } case QuantifierNonGreedy: { - backTrack->inParentheses = 0; + backTrack->begin = notFound; context->term += term.atom.parenthesesWidth; return true; } @@ -645,7 +654,7 @@ public: return true; } - bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); ASSERT(term.atom.quantityCount == 1); @@ -654,7 +663,12 @@ public: unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; } - return true; + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); } bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) @@ -673,12 +687,12 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: // if we backtrack to this point, there is another chance - try matching nothing. - ASSERT(backTrack->inParentheses); - backTrack->inParentheses = 0; + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; context->term += term.atom.parenthesesWidth; return true; case QuantifierNonGreedy: - ASSERT(backTrack->inParentheses); + ASSERT(backTrack->begin != notFound); case QuantifierFixedCount: break; } @@ -695,17 +709,21 @@ public: switch (term.atom.quantityType) { case QuantifierGreedy: - if (!backTrack->inParentheses) { + if (backTrack->begin == notFound) { context->term -= term.atom.parenthesesWidth; return false; } case QuantifierNonGreedy: - if (!backTrack->inParentheses) { - // now try to match the parens; set this speculatively. - backTrack->inParentheses = 1; + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (we don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + output[subpatternId << 1] = input.getPos() + term.inputPosition; } context->term -= term.atom.parenthesesWidth; return true; @@ -717,6 +735,53 @@ public: return false; } + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == UINT_MAX); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == UINT_MAX); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never becktrack to here. + ASSERT_NOT_REACHED(); + return false; + } + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); @@ -773,7 +838,7 @@ public: return false; } - bool matchParentheses(ByteTerm& term, DisjunctionContext* context) + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); @@ -794,31 +859,42 @@ public: while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); - return true; + return JSRegExpMatch; } case QuantifierGreedy: { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + break; } } @@ -827,15 +903,15 @@ public: ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } case QuantifierNonGreedy: - return true; + return JSRegExpMatch; } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; } // Rules for backtracking differ depending on whether this is greedy or non-greedy. @@ -848,7 +924,7 @@ public: // Non-greedy, we've already done the one less case, so don't match on popping. // We haven't done the one more case, so always try to add that. // - bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); @@ -867,44 +943,58 @@ public: ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + if (result != JSRegExpMatch) + return result; // While we haven't yet reached our fixed limit, while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); context = backTrack->lastContext; recordParenthesesMatch(term, context); - return true; + return JSRegExpMatch; } case QuantifierGreedy: { if (!backTrack->matchAmount) - return false; + return JSRegExpNoMatch; ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + break; } } @@ -912,63 +1002,108 @@ public: resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } if (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } case QuantifierNonGreedy: { // If we've not reached the limit, try to add one more match. if (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { appendParenthesesDisjunctionContext(backTrack, context); recordParenthesesMatch(term, context); - return true; - } else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); + return JSRegExpMatch; } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; } // Nope - okay backtrack looking for an alternative. while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { // successful backtrack! we're back in the game! if (backTrack->matchAmount) { context = backTrack->lastContext; recordParenthesesMatch(term, context); } - return true; + return JSRegExpMatch; } // pop a match off the stack resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); + + return result; } - return false; + return JSRegExpNoMatch; } } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; + } + + void lookupForBeginChars() + { + int character; + bool firstSingleCharFound; + + while (true) { + if (input.isNotAvailableInput(2)) + return; + + firstSingleCharFound = false; + + character = input.readPair(); + + for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { + BeginChar bc = pattern->m_beginChars[i]; + + if (!firstSingleCharFound && bc.value <= 0xFFFF) { + firstSingleCharFound = true; + character &= 0xFFFF; + } + + if ((character | bc.mask) == bc.value) + return; + } + + input.next(); + } } #define MATCH_NEXT() { ++context->term; goto matchAgain; } #define BACKTRACK() { --context->term; goto backtrack; } #define currentTerm() (disjunction->terms[context->term]) - bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + if (btrack) BACKTRACK(); + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + context->matchBegin = input.getPos(); context->term = 0; @@ -980,14 +1115,14 @@ public: MATCH_NEXT(); case ByteTerm::TypeSubpatternEnd: context->matchEnd = input.getPos(); - return true; + return JSRegExpMatch; case ByteTerm::TypeBodyAlternativeBegin: MATCH_NEXT(); case ByteTerm::TypeBodyAlternativeDisjunction: case ByteTerm::TypeBodyAlternativeEnd: context->matchEnd = input.getPos(); - return true; + return JSRegExpMatch; case ByteTerm::TypeAlternativeBegin: MATCH_NEXT(); @@ -1077,10 +1212,16 @@ public: if (matchBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (matchParentheses(currentTerm(), context)) + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + BACKTRACK(); + } case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (matchParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1089,6 +1230,14 @@ public: if (matchParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (matchParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1112,7 +1261,7 @@ public: switch (currentTerm().type) { case ByteTerm::TypeSubpatternBegin: - return false; + return JSRegExpNoMatch; case ByteTerm::TypeSubpatternEnd: ASSERT_NOT_REACHED(); @@ -1124,10 +1273,18 @@ public: MATCH_NEXT(); if (input.atEnd()) - return false; + return JSRegExpNoMatch; input.next(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + context->matchBegin = input.getPos(); + + if (currentTerm().alternative.onceThrough) + context->term += currentTerm().alternative.next; + MATCH_NEXT(); } case ByteTerm::TypeBodyAlternativeEnd: @@ -1176,10 +1333,16 @@ public: if (backtrackBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (backtrackParentheses(currentTerm(), context)) + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + BACKTRACK(); + } case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (backtrackParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1188,6 +1351,14 @@ public: if (backtrackParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (backtrackParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); @@ -1203,36 +1374,48 @@ public: } ASSERT_NOT_REACHED(); - return false; + return JSRegExpErrorNoMatch; } - bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) { - if (matchDisjunction(disjunction, context, btrack)) { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { while (context->matchBegin == context->matchEnd) { - if (!matchDisjunction(disjunction, context, true)) - return false; + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; } - return true; + return JSRegExpMatch; } - return false; + return result; } int interpret() { + allocatorPool = pattern->m_allocator->startAllocator(); + if (!allocatorPool) + CRASH(); + for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) output[i] = -1; DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); - if (matchDisjunction(pattern->m_body.get(), context)) { + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); + if (result == JSRegExpMatch) { output[0] = context->matchBegin; output[1] = context->matchEnd; } freeDisjunctionContext(context); + pattern->m_allocator->stopAllocator(); + + // RegExp.cpp currently expects all error to be converted to -1. + ASSERT((result == JSRegExpMatch) == (output[0] != -1)); return output[0]; } @@ -1240,6 +1423,8 @@ public: : pattern(pattern) , output(output) , input(inputChar, start, length) + , allocatorPool(0) + , remainingMatchCount(matchLimit) { } @@ -1247,6 +1432,8 @@ private: BytecodePattern *pattern; int* output; InputStream input; + BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; }; @@ -1266,17 +1453,16 @@ public: ByteCompiler(RegexPattern& pattern) : m_pattern(pattern) { - m_bodyDisjunction = 0; m_currentAlternativeIndex = 0; } - BytecodePattern* compile() + PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) { - regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); emitDisjunction(m_pattern.m_body); regexEnd(); - return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); + return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); } void checkInput(unsigned count) @@ -1334,8 +1520,38 @@ public: m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); @@ -1360,6 +1576,28 @@ public: m_currentAlternativeIndex = beginTerm + 1; } + void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + unsigned popParenthesesStack() { ASSERT(m_parenthesesStack.size()); @@ -1431,56 +1669,85 @@ public: m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } - void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool invertOrCapture = parenthesesBegin.invertOrCapture; + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); unsigned endTerm = m_bodyDisjunction->terms.size(); - bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - if (doInline) { - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } else { - ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; - ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - bool invertOrCapture = parenthesesBegin.invertOrCapture; - unsigned subpatternId = parenthesesBegin.atom.subpatternId; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } - unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) - parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - m_bodyDisjunction->terms.shrink(beginTerm); + bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - m_allParenthesesInfo.append(parenthesesDisjunction); - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, invertOrCapture, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } - void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) { - m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); + m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); m_bodyDisjunction->terms[0].frameLocation = 0; m_currentAlternativeIndex = 0; } @@ -1490,11 +1757,11 @@ public: closeBodyAlternative(); } - void alternativeBodyDisjunction() + void alternativeBodyDisjunction(bool onceThrough) { int newAlternativeIndex = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); m_currentAlternativeIndex = newAlternativeIndex; } @@ -1508,26 +1775,33 @@ public: m_currentAlternativeIndex = newAlternativeIndex; } - void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false) { for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + if (alt) { if (disjunction == m_pattern.m_body) - alternativeBodyDisjunction(); + alternativeBodyDisjunction(alternative->onceThrough()); else alternativeDisjunction(); } - PatternAlternative* alternative = disjunction->m_alternatives[alt]; unsigned minimumSize = alternative->m_minimumSize; + int countToCheck; - ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); - unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; - if (countToCheck) + if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize) + countToCheck = 0; + else + countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + + ASSERT(countToCheck >= 0); + if (countToCheck) { checkInput(countToCheck); - currentCountAlreadyChecked += countToCheck; + currentCountAlreadyChecked += countToCheck; + } for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { PatternTerm& term = alternative->m_terms[i]; @@ -1562,34 +1836,40 @@ public: case PatternTerm::TypeParenthesesSubpattern: { unsigned disjunctionAlreadyCheckedCount = 0; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } + else + alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); } else { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } break; } case PatternTerm::TypeParentheticalAssertion: { - unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + + ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition); + int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); break; } } @@ -1599,23 +1879,28 @@ public: private: RegexPattern& m_pattern; - ByteDisjunction* m_bodyDisjunction; + OwnPtr<ByteDisjunction> m_bodyDisjunction; unsigned m_currentAlternativeIndex; Vector<ParenthesesStackEntry> m_parenthesesStack; Vector<ByteDisjunction*> m_allParenthesesInfo; }; -BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) +PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline) { RegexPattern pattern(ignoreCase, multiline); if ((error = compileRegex(patternString, pattern))) - return 0; + return PassOwnPtr<BytecodePattern>(); numSubpatterns = pattern.m_numSubpatterns; - return ByteCompiler(pattern).compile(); + return ByteCompiler(pattern).compile(allocator); +} + +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); } int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) @@ -1634,5 +1919,3 @@ COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpace } } - -#endif diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h index 48c9a5e..2e23472 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.h +++ b/JavaScriptCore/yarr/RegexInterpreter.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,16 +26,33 @@ #ifndef RegexInterpreter_h #define RegexInterpreter_h -#include <wtf/Platform.h> - -#if ENABLE(YARR) - -#include <wtf/unicode/Unicode.h> #include "RegexParser.h" #include "RegexPattern.h" +#include <wtf/PassOwnPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WTF { +class BumpPointerAllocator; +} +using WTF::BumpPointerAllocator; namespace JSC { namespace Yarr { +// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed. + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + class ByteDisjunction; struct ByteTerm { @@ -64,6 +81,8 @@ struct ByteTerm { TypeParenthesesSubpattern, TypeParenthesesSubpatternOnceBegin, TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, TypeParentheticalAssertionBegin, TypeParentheticalAssertionEnd, TypeCheckInput, @@ -90,6 +109,7 @@ struct ByteTerm { struct { int next; int end; + bool onceThrough; } alternative; unsigned checkInputCount; }; @@ -211,19 +231,21 @@ struct ByteTerm { return ByteTerm(TypeBackReference, subpatternId, false, inputPos); } - static ByteTerm BodyAlternativeBegin() + static ByteTerm BodyAlternativeBegin(bool onceThrough) { ByteTerm term(TypeBodyAlternativeBegin); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; return term; } - static ByteTerm BodyAlternativeDisjunction() + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) { ByteTerm term(TypeBodyAlternativeDisjunction); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; return term; } @@ -232,6 +254,7 @@ struct ByteTerm { ByteTerm term(TypeBodyAlternativeEnd); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = false; return term; } @@ -240,6 +263,7 @@ struct ByteTerm { ByteTerm term(TypeAlternativeBegin); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = false; return term; } @@ -248,6 +272,7 @@ struct ByteTerm { ByteTerm term(TypeAlternativeDisjunction); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = false; return term; } @@ -256,6 +281,7 @@ struct ByteTerm { ByteTerm term(TypeAlternativeEnd); term.alternative.next = 0; term.alternative.end = 0; + term.alternative.onceThrough = false; return term; } @@ -294,10 +320,12 @@ public: }; struct BytecodePattern : FastAllocBase { - BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) + BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern, BumpPointerAllocator* allocator) : m_body(body) , m_ignoreCase(pattern.m_ignoreCase) , m_multiline(pattern.m_multiline) + , m_containsBeginChars(pattern.m_containsBeginChars) + , m_allocator(allocator) { newlineCharacterClass = pattern.newlineCharacterClass(); wordcharCharacterClass = pattern.wordcharCharacterClass(); @@ -308,6 +336,8 @@ struct BytecodePattern : FastAllocBase { // array, so that it won't delete them on destruction. We'll // take responsibility for that. pattern.m_userCharacterClasses.clear(); + + m_beginChars.append(pattern.m_beginChars); } ~BytecodePattern() @@ -319,19 +349,25 @@ struct BytecodePattern : FastAllocBase { OwnPtr<ByteDisjunction> m_body; bool m_ignoreCase; bool m_multiline; - + bool m_containsBeginChars; + // Each BytecodePattern is associated with a RegExp, each RegExp is associated + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator. + BumpPointerAllocator* m_allocator; + CharacterClass* newlineCharacterClass; CharacterClass* wordcharCharacterClass; + + Vector<BeginChar> m_beginChars; + private: Vector<ByteDisjunction*> m_allParenthesesInfo; Vector<CharacterClass*> m_userCharacterClasses; }; -BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); +PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator*, bool ignoreCase = false, bool multiline = false); +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*); int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); } } // namespace JSC::Yarr -#endif - #endif // RegexInterpreter_h diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp index fcb8d86..acbd458 100644 --- a/JavaScriptCore/yarr/RegexJIT.cpp +++ b/JavaScriptCore/yarr/RegexJIT.cpp @@ -31,8 +31,7 @@ #include "LinkBuffer.h" #include "MacroAssembler.h" #include "RegexCompiler.h" - -#include "pcre.h" // temporary, remove when fallback is removed. +#include "RegexInterpreter.h" // temporary, remove when fallback is removed. #if ENABLE(YARR_JIT) @@ -40,7 +39,6 @@ using namespace WTF; namespace JSC { namespace Yarr { - class RegexGenerator : private MacroAssembler { friend void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); @@ -54,6 +52,16 @@ class RegexGenerator : private MacroAssembler { static const RegisterID regT1 = ARMRegisters::r6; static const RegisterID returnRegister = ARMRegisters::r0; +#elif CPU(MIPS) + static const RegisterID input = MIPSRegisters::a0; + static const RegisterID index = MIPSRegisters::a1; + static const RegisterID length = MIPSRegisters::a2; + static const RegisterID output = MIPSRegisters::a3; + + static const RegisterID regT0 = MIPSRegisters::t4; + static const RegisterID regT1 = MIPSRegisters::t5; + + static const RegisterID returnRegister = MIPSRegisters::v0; #elif CPU(X86) static const RegisterID input = X86Registers::eax; static const RegisterID index = X86Registers::edx; @@ -145,6 +153,11 @@ class RegexGenerator : private MacroAssembler { void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) { + if (charClass->m_table) { + ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table)); + matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); + return; + } Jump unicodeFail; if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f)); @@ -331,6 +344,15 @@ class RegexGenerator : private MacroAssembler { ASSERT(alternativeValid()); return alternative()->m_terms[t]; } + bool isLastTerm() + { + ASSERT(alternativeValid()); + return (t + 1) == alternative()->m_terms.size(); + } + bool isMainDisjunction() + { + return !disjunction->m_parent; + } PatternTerm& lookaheadTerm() { @@ -599,10 +621,14 @@ class RegexGenerator : private MacroAssembler { ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); failures.append(jumpIfCharNotEquals(ch, state.inputOffset())); } + add32(Imm32(1), countRegister); add32(Imm32(1), index); - branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); - failures.append(jump()); + if (term.quantityCount != 0xffffffff) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); Label backtrackBegin(this); loadFromFrame(term.frameLocation, countRegister); @@ -636,7 +662,8 @@ class RegexGenerator : private MacroAssembler { loadFromFrame(term.frameLocation, countRegister); atEndOfInput().linkTo(hardFail, this); - branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + if (term.quantityCount != 0xffffffff) + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { readCharacter(state.inputOffset(), character); or32(Imm32(32), character); @@ -722,8 +749,11 @@ class RegexGenerator : private MacroAssembler { add32(Imm32(1), countRegister); add32(Imm32(1), index); - branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); - failures.append(jump()); + if (term.quantityCount != 0xffffffff) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); Label backtrackBegin(this); loadFromFrame(term.frameLocation, countRegister); @@ -880,7 +910,7 @@ class RegexGenerator : private MacroAssembler { PatternDisjunction* disjunction = term.parentheses.disjunction; ASSERT(term.quantityCount == 1); - unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0; + unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0; unsigned parenthesesFrameLocation = term.frameLocation; unsigned alternativeFrameLocation = parenthesesFrameLocation; @@ -899,12 +929,12 @@ class RegexGenerator : private MacroAssembler { Jump nonGreedySkipParentheses; Label nonGreedyTryParentheses; if (term.quantityType == QuantifierGreedy) - storeToFrame(Imm32(1), parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation); else if (term.quantityType == QuantifierNonGreedy) { - storeToFrame(Imm32(0), parenthesesFrameLocation); + storeToFrame(Imm32(-1), parenthesesFrameLocation); nonGreedySkipParentheses = jump(); nonGreedyTryParentheses = label(); - storeToFrame(Imm32(1), parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation); } // store the match start index @@ -922,41 +952,31 @@ class RegexGenerator : private MacroAssembler { TermGenerationState parenthesesState(disjunction, state.checkedTotal); generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - // store the match end index - if (term.invertOrCapture) { - int inputOffset = state.inputOffset(); - if (inputOffset) { - move(index, indexTemporary); - add32(Imm32(state.inputOffset()), indexTemporary); - store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } else - store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } - Jump success = jump(); + Jump success = (term.quantityType == QuantifierFixedCount) ? + jump() : + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*)))); // A failure AFTER the parens jumps here Label backtrackFromAfterParens(this); if (term.quantityType == QuantifierGreedy) { - // If this is zero we have now tested with both with and without the parens. + // If this is -1 we have now tested with both with and without the parens. loadFromFrame(parenthesesFrameLocation, indexTemporary); - state.jumpToBacktrack(branchTest32(Zero, indexTemporary), this); + state.jumpToBacktrack(branch32(Equal, indexTemporary, Imm32(-1)), this); } else if (term.quantityType == QuantifierNonGreedy) { - // If this is zero we have now tested with both with and without the parens. + // If this is -1 we have now tested without the parens, now test with. loadFromFrame(parenthesesFrameLocation, indexTemporary); - branchTest32(Zero, indexTemporary).linkTo(nonGreedyTryParentheses, this); + branch32(Equal, indexTemporary, Imm32(-1)).linkTo(nonGreedyTryParentheses, this); } parenthesesState.plantJumpToBacktrackIfExists(this); // A failure WITHIN the parens jumps here parenthesesState.linkAlternativeBacktracks(this); - if (term.invertOrCapture) { + if (term.invertOrCapture) store32(Imm32(-1), Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); - store32(Imm32(-1), Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } if (term.quantityType == QuantifierGreedy) - storeToFrame(Imm32(0), parenthesesFrameLocation); + storeToFrame(Imm32(-1), parenthesesFrameLocation); else state.jumpToBacktrack(jump(), this); @@ -964,9 +984,67 @@ class RegexGenerator : private MacroAssembler { if (term.quantityType == QuantifierNonGreedy) nonGreedySkipParentheses.link(this); success.link(this); + + // store the match end index + if (term.invertOrCapture) { + int inputOffset = state.inputOffset(); + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(state.inputOffset()), indexTemporary); + store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } else + store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } } } + void generateParenthesesGreedyNoBacktrack(TermGenerationState& state) + { + PatternTerm& parenthesesTerm = state.term(); + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern); + ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle. + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + + Label matchAgain(this); + + storeToFrame(index, parenthesesTerm.frameLocation); // Save the current index to check for zero len matches later. + + for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) { + + PatternAlternative* alternative = parenthesesState.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize; + if (countToCheck) { + parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + parenthesesState.checkedTotal += countToCheck; + } + + for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm()) + generateTerm(parenthesesState); + + // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet. + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); + + // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack, + // or fall through to try the next alternative if no backtrack is available. + parenthesesState.plantJumpToBacktrackIfExists(this); + + parenthesesState.linkAlternativeBacktracks(this); + // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop. + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + parenthesesState.checkedTotal -= countToCheck; + } + } + + // If the last alternative falls through to here, we have a failed match... + // Which means that we match whatever we have matched up to this point (even if nothing). + } + void generateParentheticalAssertion(TermGenerationState& state) { PatternTerm& term = state.term(); @@ -1078,17 +1156,19 @@ class RegexGenerator : private MacroAssembler { break; case PatternTerm::TypeBackReference: - m_generationFailed = true; + m_shouldFallBack = true; break; case PatternTerm::TypeForwardReference: break; case PatternTerm::TypeParenthesesSubpattern: - if ((term.quantityCount == 1) && !term.parentheses.isCopy) + if (term.quantityCount == 1 && !term.parentheses.isCopy) generateParenthesesSingle(state); + else if (term.parentheses.isTerminal) + generateParenthesesGreedyNoBacktrack(state); else - m_generationFailed = true; + m_shouldFallBack = true; break; case PatternTerm::TypeParentheticalAssertion: @@ -1102,21 +1182,26 @@ class RegexGenerator : private MacroAssembler { TermGenerationState state(disjunction, 0); state.resetAlternative(); - // Plant a check to see if there is sufficient input available to run the first alternative. - // Jumping back to the label 'firstAlternative' will get to this check, jumping to - // 'firstAlternativeInputChecked' will jump directly to matching the alternative having - // skipped this check. - - Label firstAlternative(this); - // check availability for the next alternative int countCheckedForCurrentAlternative = 0; int countToCheckForFirstAlternative = 0; bool hasShorterAlternatives = false; + bool setRepeatAlternativeLabels = false; JumpList notEnoughInputForPreviousAlternative; + Label firstAlternative; + Label firstAlternativeInputChecked; + // The label 'firstAlternative' is used to plant a check to see if there is + // sufficient input available to run the first repeating alternative. + // The label 'firstAlternativeInputChecked' will jump directly to matching + // the first repeating alternative having skipped this check. + if (state.alternativeValid()) { PatternAlternative* alternative = state.alternative(); + if (!alternative->onceThrough()) { + firstAlternative = Label(this); + setRepeatAlternativeLabels = true; + } countToCheckForFirstAlternative = alternative->m_minimumSize; state.checkedTotal += countToCheckForFirstAlternative; if (countToCheckForFirstAlternative) @@ -1124,31 +1209,35 @@ class RegexGenerator : private MacroAssembler { countCheckedForCurrentAlternative = countToCheckForFirstAlternative; } - Label firstAlternativeInputChecked(this); + if (setRepeatAlternativeLabels) + firstAlternativeInputChecked = Label(this); while (state.alternativeValid()) { - // Track whether any alternatives are shorter than the first one. - hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); - PatternAlternative* alternative = state.alternative(); optimizeAlternative(alternative); + // Track whether any alternatives are shorter than the first one. + if (!alternative->onceThrough()) + hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); + for (state.resetTerm(); state.termValid(); state.nextTerm()) generateTerm(state); // If we get here, the alternative matched. if (m_pattern.m_body->m_callFrameSize) addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - + ASSERT(index != returnRegister); if (m_pattern.m_body->m_hasFixedSize) { move(index, returnRegister); if (alternative->m_minimumSize) sub32(Imm32(alternative->m_minimumSize), returnRegister); + + store32(returnRegister, output); } else - pop(returnRegister); + load32(Address(output), returnRegister); + store32(index, Address(output, 4)); - store32(returnRegister, output); generateReturn(); @@ -1157,47 +1246,75 @@ class RegexGenerator : private MacroAssembler { // if there are any more alternatives, plant the check for input before looping. if (state.alternativeValid()) { PatternAlternative* nextAlternative = state.alternative(); - int countToCheckForNextAlternative = nextAlternative->m_minimumSize; - - if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. + if (!setRepeatAlternativeLabels && !nextAlternative->onceThrough()) { + // We have handled non-repeating alternatives, jump to next iteration + // and loop over repeating alternatives. + state.jumpToBacktrack(jump(), this); + + countToCheckForFirstAlternative = nextAlternative->m_minimumSize; + // If we get here, there the last input checked failed. notEnoughInputForPreviousAlternative.link(this); - - // Check if sufficent input available to run the next alternative - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); - // We are now in the correct state to enter the next alternative; this add is only required - // to mirror and revert operation of the sub32, just below. - add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); - - // If we get here, there the last input checked passed. + state.linkAlternativeBacktracks(this); - // No need to check if we can run the next alternative, since it is shorter - - // just update index. - sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); - } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. - // If we get here, there the last input checked failed. - // If there is insufficient input to run the current alternative, and the next alternative is longer, - // then there is definitely not enough input to run it - don't even check. Just adjust index, as if - // we had checked. - notEnoughInputForPreviousAlternative.link(this); - add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); - notEnoughInputForPreviousAlternative.append(jump()); - // The next alternative is longer than the current one; check the difference. - state.linkAlternativeBacktracks(this); - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); - } else { // CASE 3: Both alternatives are the same length. - ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); + // Back up to start the looping alternatives. + if (countCheckedForCurrentAlternative) + sub32(Imm32(countCheckedForCurrentAlternative), index); + + firstAlternative = Label(this); + + state.checkedTotal = countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + + firstAlternativeInputChecked = Label(this); - // If the next alterative is the same length as this one, then no need to check the input - - // if there was sufficent input to run the current alternative then there is sufficient - // input to run the next one; if not, there isn't. - state.linkAlternativeBacktracks(this); + setRepeatAlternativeLabels = true; + } else { + int countToCheckForNextAlternative = nextAlternative->m_minimumSize; + + if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. + // If we get here, then the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + // Check if sufficent input available to run the next alternative + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + // We are now in the correct state to enter the next alternative; this add is only required + // to mirror and revert operation of the sub32, just below. + add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + + // If we get here, then the last input checked passed. + state.linkAlternativeBacktracks(this); + // No need to check if we can run the next alternative, since it is shorter - + // just update index. + sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. + // If we get here, then the last input checked failed. + // If there is insufficient input to run the current alternative, and the next alternative is longer, + // then there is definitely not enough input to run it - don't even check. Just adjust index, as if + // we had checked. + notEnoughInputForPreviousAlternative.link(this); + add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); + notEnoughInputForPreviousAlternative.append(jump()); + + // The next alternative is longer than the current one; check the difference. + state.linkAlternativeBacktracks(this); + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + } else { // CASE 3: Both alternatives are the same length. + ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); + + // If the next alterative is the same length as this one, then no need to check the input - + // if there was sufficent input to run the current alternative then there is sufficient + // input to run the next one; if not, there isn't. + state.linkAlternativeBacktracks(this); + } + state.checkedTotal -= countCheckedForCurrentAlternative; + countCheckedForCurrentAlternative = countToCheckForNextAlternative; + state.checkedTotal += countCheckedForCurrentAlternative; } - - state.checkedTotal -= countCheckedForCurrentAlternative; - countCheckedForCurrentAlternative = countToCheckForNextAlternative; - state.checkedTotal += countCheckedForCurrentAlternative; } } @@ -1205,81 +1322,86 @@ class RegexGenerator : private MacroAssembler { state.checkedTotal -= countCheckedForCurrentAlternative; - // How much more input need there be to be able to retry from the first alternative? - // examples: - // /yarr_jit/ or /wrec|pcre/ - // In these examples we need check for one more input before looping. - // /yarr_jit|pcre/ - // In this case we need check for 5 more input to loop (+4 to allow for the first alterative - // being four longer than the last alternative checked, and another +1 to effectively move - // the start position along by one). - // /yarr|rules/ or /wrec|notsomuch/ - // In these examples, provided that there was sufficient input to have just been matching for - // the second alternative we can loop without checking for available input (since the second - // alternative is longer than the first). In the latter example we need to decrement index - // (by 4) so the start position is only progressed by 1 from the last iteration. - int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; - - // First, deal with the cases where there was sufficient input to try the last alternative. - if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. - state.linkAlternativeBacktracks(this); - else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! - state.linkAlternativeBacktracksTo(firstAlternativeInputChecked, this); - else { // no need to check the input, but we do have some bookkeeping to do first. + if (!setRepeatAlternativeLabels) { + // If there are no alternatives that need repeating (all are marked 'onceThrough') then just link + // the match failures to this point, and fall through to the return below. state.linkAlternativeBacktracks(this); + notEnoughInputForPreviousAlternative.link(this); + } else { + // How much more input need there be to be able to retry from the first alternative? + // examples: + // /yarr_jit/ or /wrec|pcre/ + // In these examples we need check for one more input before looping. + // /yarr_jit|pcre/ + // In this case we need check for 5 more input to loop (+4 to allow for the first alterative + // being four longer than the last alternative checked, and another +1 to effectively move + // the start position along by one). + // /yarr|rules/ or /wrec|notsomuch/ + // In these examples, provided that there was sufficient input to have just been matching for + // the second alternative we can loop without checking for available input (since the second + // alternative is longer than the first). In the latter example we need to decrement index + // (by 4) so the start position is only progressed by 1 from the last iteration. + int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; + + // First, deal with the cases where there was sufficient input to try the last alternative. + if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. + state.linkAlternativeBacktracks(this); + else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! + state.linkAlternativeBacktracksTo(firstAlternativeInputChecked, this); + else { // no need to check the input, but we do have some bookkeeping to do first. + state.linkAlternativeBacktracks(this); - // Where necessary update our preserved start position. - if (!m_pattern.m_body->m_hasFixedSize) { - move(index, regT0); - sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); - poke(regT0, m_pattern.m_body->m_callFrameSize); + // Where necessary update our preserved start position. + if (!m_pattern.m_body->m_hasFixedSize) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } + + // Update index if necessary, and loop (without checking). + if (incrementForNextIter) + add32(Imm32(incrementForNextIter), index); + jump().linkTo(firstAlternativeInputChecked, this); } - // Update index if necessary, and loop (without checking). - if (incrementForNextIter) - add32(Imm32(incrementForNextIter), index); - jump().linkTo(firstAlternativeInputChecked, this); + notEnoughInputForPreviousAlternative.link(this); + // Update our idea of the start position, if we're tracking this. + if (!m_pattern.m_body->m_hasFixedSize) { + if (countCheckedForCurrentAlternative - 1) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } else + store32(index, Address(output)); + } + + // Check if there is sufficent input to run the first alternative again. + jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); + // No - insufficent input to run the first alteranative, are there any other alternatives we + // might need to check? If so, the last check will have left the index incremented by + // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative + // LESS input is available, to have the effect of just progressing the start position by 1 + // from the last iteration. If this check passes we can just jump up to the check associated + // with the first alternative in the loop. This is a bit sad, since we'll end up trying the + // first alternative again, and this check will fail (otherwise the check planted just above + // here would have passed). This is a bit sad, however it saves trying to do something more + // complex here in compilation, and in the common case we should end up coallescing the checks. + // + // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least + // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150, + // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there + // is sufficient input to run either alternative (constantly failing). If there had been only + // one alternative, or if the shorter alternative had come first, we would have terminated + // immediately. :-/ + if (hasShorterAlternatives) + jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); + // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, + // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... + // but since we're about to return a failure this doesn't really matter!) } - notEnoughInputForPreviousAlternative.link(this); - // Update our idea of the start position, if we're tracking this. - if (!m_pattern.m_body->m_hasFixedSize) { - if (countCheckedForCurrentAlternative - 1) { - move(index, regT0); - sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); - poke(regT0, m_pattern.m_body->m_callFrameSize); - } else - poke(index, m_pattern.m_body->m_callFrameSize); - } - // Check if there is sufficent input to run the first alternative again. - jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); - // No - insufficent input to run the first alteranative, are there any other alternatives we - // might need to check? If so, the last check will have left the index incremented by - // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative - // LESS input is available, to have the effect of just progressing the start position by 1 - // from the last iteration. If this check passes we can just jump up to the check associated - // with the first alternative in the loop. This is a bit sad, since we'll end up trying the - // first alternative again, and this check will fail (otherwise the check planted just above - // here would have passed). This is a bit sad, however it saves trying to do something more - // complex here in compilation, and in the common case we should end up coallescing the checks. - // - // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least - // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150, - // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there - // is sufficient input to run either alternative (constantly failing). If there had been only - // one alternative, or if the shorter alternative had come first, we would have terminated - // immediately. :-/ - if (hasShorterAlternatives) - jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); - // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, - // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... - // but since we're about to return a failure this doesn't really matter!) - - unsigned frameSize = m_pattern.m_body->m_callFrameSize; - if (!m_pattern.m_body->m_hasFixedSize) - ++frameSize; - if (frameSize) - addPtr(Imm32(frameSize * sizeof(void*)), stackPointerRegister); + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); move(Imm32(-1), returnRegister); @@ -1312,7 +1434,12 @@ class RegexGenerator : private MacroAssembler { push(ARMRegisters::r4); push(ARMRegisters::r5); push(ARMRegisters::r6); +#if CPU(ARM_TRADITIONAL) + push(ARMRegisters::r8); // scratch register +#endif move(ARMRegisters::r3, output); +#elif CPU(MIPS) + // Do nothing. #endif } @@ -1327,9 +1454,14 @@ class RegexGenerator : private MacroAssembler { pop(X86Registers::ebx); pop(X86Registers::ebp); #elif CPU(ARM) +#if CPU(ARM_TRADITIONAL) + pop(ARMRegisters::r8); // scratch register +#endif pop(ARMRegisters::r6); pop(ARMRegisters::r5); pop(ARMRegisters::r4); +#elif CPU(MIPS) + // Do nothing #endif ret(); } @@ -1337,7 +1469,7 @@ class RegexGenerator : private MacroAssembler { public: RegexGenerator(RegexPattern& pattern) : m_pattern(pattern) - , m_generationFailed(false) + , m_shouldFallBack(false) { } @@ -1345,9 +1477,8 @@ public: { generateEnter(); - // TODO: do I really want this on the stack? if (!m_pattern.m_body->m_hasFixedSize) - push(index); + store32(index, Address(output)); if (m_pattern.m_body->m_callFrameSize) subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); @@ -1359,7 +1490,7 @@ public: { generate(); - LinkBuffer patchBuffer(this, globalData->executableAllocator.poolForSize(size())); + LinkBuffer patchBuffer(this, globalData->executableAllocator.poolForSize(size()), 0); for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) patchBuffer.patch(m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_backtrackRecords[i].backtrackLocation)); @@ -1367,34 +1498,32 @@ public: jitObject.set(patchBuffer.finalizeCode()); } - bool generationFailed() + bool shouldFallBack() { - return m_generationFailed; + return m_shouldFallBack; } private: RegexPattern& m_pattern; + bool m_shouldFallBack; Vector<AlternativeBacktrackRecord> m_backtrackRecords; - bool m_generationFailed; }; -void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase, bool multiline) { RegexPattern pattern(ignoreCase, multiline); - if ((error = compileRegex(patternString, pattern))) return; - numSubpatterns = pattern.m_numSubpatterns; - RegexGenerator generator(pattern); - generator.compile(globalData, jitObject); - - if (generator.generationFailed()) { - JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase; - JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine; - jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.data()), patternString.size(), ignoreCaseOption, multilineOption, &numSubpatterns, &error)); + if (!pattern.m_containsBackreferences && globalData->canUseJIT()) { + RegexGenerator generator(pattern); + generator.compile(globalData, jitObject); + if (!generator.shouldFallBack()) + return; } + + jitObject.setFallback(byteCompileRegex(pattern, allocator)); } }} diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h index 935b9a3..c4c382c 100644 --- a/JavaScriptCore/yarr/RegexJIT.h +++ b/JavaScriptCore/yarr/RegexJIT.h @@ -26,16 +26,12 @@ #ifndef RegexJIT_h #define RegexJIT_h -#include <wtf/Platform.h> - #if ENABLE(YARR_JIT) #include "MacroAssembler.h" +#include "RegexInterpreter.h" // temporary, remove when fallback is removed. #include "RegexPattern.h" -#include <UString.h> - -#include <pcre.h> -struct JSRegExp; // temporary, remove when fallback is removed. +#include "UString.h" #if CPU(X86) && !COMPILER(MSVC) #define YARR_CALL __attribute__ ((regparm (3))) @@ -55,20 +51,23 @@ class RegexCodeBlock { public: RegexCodeBlock() - : m_fallback(0) + : m_needFallback(false) { } ~RegexCodeBlock() { - if (m_fallback) - jsRegExpFree(m_fallback); } - JSRegExp* getFallback() { return m_fallback; } - void setFallback(JSRegExp* fallback) { m_fallback = fallback; } + BytecodePattern* getFallback() { return m_fallback.get(); } + bool isFallback() { return m_needFallback; } + void setFallback(PassOwnPtr<BytecodePattern> fallback) + { + m_fallback = fallback; + m_needFallback = true; + } - bool operator!() { return !m_ref.m_code.executableAddress(); } + bool operator!() { return (!m_ref.m_code.executableAddress() && !m_fallback); } void set(MacroAssembler::CodeRef ref) { m_ref = ref; } int execute(const UChar* input, unsigned start, unsigned length, int* output) @@ -76,17 +75,22 @@ public: return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output); } +#if ENABLE(REGEXP_TRACING) + void *getAddr() { return m_ref.m_code.executableAddress(); } +#endif + private: MacroAssembler::CodeRef m_ref; - JSRegExp* m_fallback; + OwnPtr<Yarr::BytecodePattern> m_fallback; + bool m_needFallback; }; -void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, BumpPointerAllocator* allocator, bool ignoreCase = false, bool multiline = false); -inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize) +inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) { - if (JSRegExp* fallback = jitObject.getFallback()) - return (jsRegExpExecute(fallback, input, length, start, output, outputArraySize) < 0) ? -1 : output[0]; + if (jitObject.isFallback()) + return (interpretRegex(jitObject.getFallback(), input, start, length, output)); return jitObject.execute(input, start, length, output); } diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h index 64e8463..a5c0ba2 100644 --- a/JavaScriptCore/yarr/RegexParser.h +++ b/JavaScriptCore/yarr/RegexParser.h @@ -26,14 +26,10 @@ #ifndef RegexParser_h #define RegexParser_h -#include <wtf/Platform.h> - -#if ENABLE(YARR) - -#include <UString.h> +#include "UString.h" +#include <limits.h> #include <wtf/ASCIICType.h> #include <wtf/unicode/Unicode.h> -#include <limits.h> namespace JSC { namespace Yarr { @@ -195,8 +191,8 @@ private: : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) , m_err(NoError) - , m_data(pattern.data()) - , m_size(pattern.size()) + , m_data(pattern.characters()) + , m_size(pattern.length()) , m_index(0) , m_parenthesesNestingDepth(0) { @@ -849,6 +845,4 @@ const char* parse(Delegate& delegate, const UString& pattern, unsigned backRefer } } // namespace JSC::Yarr -#endif - #endif // RegexParser_h diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h index dd7512d..c76c641 100644 --- a/JavaScriptCore/yarr/RegexPattern.h +++ b/JavaScriptCore/yarr/RegexPattern.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,14 +27,9 @@ #ifndef RegexPattern_h #define RegexPattern_h -#include <wtf/Platform.h> - -#if ENABLE(YARR) - #include <wtf/Vector.h> #include <wtf/unicode/Unicode.h> - namespace JSC { namespace Yarr { #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. @@ -42,6 +38,7 @@ namespace JSC { namespace Yarr { #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 #define RegexStackSpaceForBackTrackInfoParentheses 4 struct PatternDisjunction; @@ -57,11 +54,35 @@ struct CharacterRange { } }; +struct CharacterClassTable : RefCounted<CharacterClassTable> { + const char* m_table; + bool m_inverted; + static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + struct CharacterClass : FastAllocBase { + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr<CharacterClassTable> table) + : m_table(table) + { + } Vector<UChar> m_matches; Vector<CharacterRange> m_ranges; Vector<UChar> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; + RefPtr<CharacterClassTable> m_table; }; enum QuantifierType { @@ -92,6 +113,7 @@ struct PatternTerm { unsigned subpatternId; unsigned lastSubpatternId; bool isCopy; + bool isTerminal; } parentheses; }; QuantifierType quantityType; @@ -123,6 +145,7 @@ struct PatternTerm { parentheses.disjunction = disjunction; parentheses.subpatternId = subpatternId; parentheses.isCopy = false; + parentheses.isTerminal = false; quantityType = QuantifierFixedCount; quantityCount = 1; } @@ -184,6 +207,10 @@ struct PatternTerm { struct PatternAlternative : FastAllocBase { PatternAlternative(PatternDisjunction* disjunction) : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) { } @@ -198,16 +225,30 @@ struct PatternAlternative : FastAllocBase { ASSERT(m_terms.size()); m_terms.shrink(m_terms.size() - 1); } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } Vector<PatternTerm> m_terms; PatternDisjunction* m_parent; unsigned m_minimumSize; - bool m_hasFixedSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; }; struct PatternDisjunction : FastAllocBase { PatternDisjunction(PatternAlternative* parent = 0) : m_parent(parent) + , m_hasFixedSize(false) { } @@ -242,10 +283,37 @@ CharacterClass* nondigitsCreate(); CharacterClass* nonspacesCreate(); CharacterClass* nonwordcharCreate(); +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct BeginChar { + BeginChar() + : value(0) + , mask(0) + {} + + BeginChar(unsigned value, unsigned mask) + : value(value) + , mask(mask) + {} + + unsigned value; + unsigned mask; +}; + struct RegexPattern { RegexPattern(bool ignoreCase, bool multiline) : m_ignoreCase(ignoreCase) , m_multiline(multiline) + , m_containsBackreferences(false) + , m_containsBeginChars(false) + , m_containsBOL(false) , m_numSubpatterns(0) , m_maxBackReference(0) , newlineCached(0) @@ -269,6 +337,10 @@ struct RegexPattern { m_numSubpatterns = 0; m_maxBackReference = 0; + m_containsBackreferences = false; + m_containsBeginChars = false; + m_containsBOL = false; + newlineCached = 0; digitsCached = 0; spacesCached = 0; @@ -281,6 +353,7 @@ struct RegexPattern { m_disjunctions.clear(); deleteAllValues(m_userCharacterClasses); m_userCharacterClasses.clear(); + m_beginChars.clear(); } bool containsIllegalBackReference() @@ -331,13 +404,17 @@ struct RegexPattern { return nonwordcharCached; } - bool m_ignoreCase; - bool m_multiline; + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBeginChars : 1; + bool m_containsBOL : 1; unsigned m_numSubpatterns; unsigned m_maxBackReference; PatternDisjunction* m_body; Vector<PatternDisjunction*, 4> m_disjunctions; Vector<CharacterClass*> m_userCharacterClasses; + Vector<BeginChar> m_beginChars; private: CharacterClass* newlineCached; @@ -351,6 +428,4 @@ private: } } // namespace JSC::Yarr -#endif - #endif // RegexPattern_h |
