diff options
Diffstat (limited to 'JavaScriptCore/yarr/RegexCompiler.cpp')
| -rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.cpp | 469 |
1 files changed, 357 insertions, 112 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp index 9cd3d12..06ecbad 100644 --- a/JavaScriptCore/yarr/RegexCompiler.cpp +++ b/JavaScriptCore/yarr/RegexCompiler.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,12 +31,12 @@ #include "RegexPattern.h" #include <wtf/Vector.h> -#if ENABLE(YARR) - using namespace WTF; namespace JSC { namespace Yarr { +#include "RegExpJitTables.h" + class CharacterClassConstructor { public: CharacterClassConstructor(bool isCaseInsensitive = false) @@ -141,7 +142,7 @@ public: CharacterClass* charClass() { - CharacterClass* characterClass = new CharacterClass(); + CharacterClass* characterClass = new CharacterClass(0); characterClass->m_matches.append(m_matches); characterClass->m_ranges.append(m_ranges); @@ -233,110 +234,118 @@ private: Vector<CharacterRange> m_rangesUnicode; }; +struct BeginCharHelper { + BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) + : m_beginChars(beginChars) + , m_isCaseInsensitive(isCaseInsensitive) + {} + + void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) + { + if (quantityType == QuantifierFixedCount && quantityCount > 1) { + // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ + beginChar.value |= beginChar.value << 16; + beginChar.mask |= beginChar.mask << 16; + addCharacter(beginChar); + } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) + // In case of characters with fixed quantifier we should check the next character as well. + linkHotTerms(beginChar, hotTerms); + else + // In case of greedy matching the next character checking is unnecessary therefore we just store + // the first character. + addCharacter(beginChar); + } -CharacterClass* newlineCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + // Merge two following BeginChars in the vector to reduce the number of character checks. + void merge(unsigned size) + { + for (unsigned i = 0; i < size; i++) { + BeginChar* curr = &m_beginChars->at(i); + BeginChar* next = &m_beginChars->at(i + 1); - characterClass->m_matches.append('\n'); - characterClass->m_matches.append('\r'); - characterClass->m_matchesUnicode.append(0x2028); - characterClass->m_matchesUnicode.append(0x2029); - - return characterClass; -} + // If the current and the next size of value is different we should skip the merge process + // because the 16bit and 32bit values are unmergable. + if (curr->value <= 0xFFFF && next->value > 0xFFFF) + continue; -CharacterClass* digitsCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + unsigned diff = curr->value ^ next->value; - characterClass->m_ranges.append(CharacterRange('0', '9')); - - return characterClass; -} + curr->mask |= diff; + curr->value |= curr->mask; -CharacterClass* spacesCreate() -{ - CharacterClass* characterClass = new CharacterClass(); - - characterClass->m_matches.append(' '); - characterClass->m_ranges.append(CharacterRange('\t', '\r')); - characterClass->m_matchesUnicode.append(0x00a0); - characterClass->m_matchesUnicode.append(0x1680); - characterClass->m_matchesUnicode.append(0x180e); - characterClass->m_matchesUnicode.append(0x2028); - characterClass->m_matchesUnicode.append(0x2029); - characterClass->m_matchesUnicode.append(0x202f); - characterClass->m_matchesUnicode.append(0x205f); - characterClass->m_matchesUnicode.append(0x3000); - characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a)); - - return characterClass; -} + m_beginChars->remove(i + 1); + size--; + } + } -CharacterClass* wordcharCreate() -{ - CharacterClass* characterClass = new CharacterClass(); +private: + void addCharacter(BeginChar beginChar) + { + unsigned pos = 0; + unsigned range = m_beginChars->size(); - characterClass->m_matches.append('_'); - characterClass->m_ranges.append(CharacterRange('0', '9')); - characterClass->m_ranges.append(CharacterRange('A', 'Z')); - characterClass->m_ranges.append(CharacterRange('a', 'z')); - - return characterClass; -} + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; -CharacterClass* nondigitsCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + int val = m_beginChars->at(pos+index).value - beginChar.value; + if (!val) + return; + if (val < 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } - characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); - characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); - - return characterClass; -} + if (pos == m_beginChars->size()) + m_beginChars->append(beginChar); + else + m_beginChars->insert(pos, beginChar); + } -CharacterClass* nonspacesCreate() -{ - CharacterClass* characterClass = new CharacterClass(); - - characterClass->m_ranges.append(CharacterRange(0, '\t' - 1)); - characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1)); - characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d)); - characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff)); - characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027)); - characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e)); - characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e)); - characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff)); - characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff)); - - return characterClass; -} + // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. + void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) + { + for (unsigned i = 0; i < hotTerms->size(); i++) { + PatternTerm hotTerm = hotTerms->at(i).term; + ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); -CharacterClass* nonwordcharCreate() -{ - CharacterClass* characterClass = new CharacterClass(); + UChar characterNext = hotTerm.patternCharacter; - characterClass->m_matches.append('`'); - characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); - characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1)); - characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1)); - characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f)); - characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + // Append a character to an existing BeginChar object. + if (characterNext <= 0x7f) { + unsigned mask = 0; - return characterClass; -} + if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { + mask = 32; + characterNext = toASCIILower(characterNext); + } + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { + addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); + addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); + } else + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); + } + } + } + + Vector<BeginChar>* m_beginChars; + bool m_isCaseInsensitive; +}; class RegexPatternConstructor { public: RegexPatternConstructor(RegexPattern& pattern) : m_pattern(pattern) , m_characterClassConstructor(pattern.m_ignoreCase) + , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) { } @@ -352,6 +361,11 @@ public: void assertionBOL() { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } m_alternative->m_terms.append(PatternTerm::BOL()); } void assertionEOL() @@ -455,20 +469,42 @@ public: m_pattern.m_disjunctions.append(parenthesesDisjunction); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; } void atomParenthesesEnd() { ASSERT(m_alternative->m_parent); ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesisDisjunction = m_alternative->m_parent; m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesisDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + // Bubble up BOL flags + for (unsigned i = 0; i < numParenAlternatives; i++) { + if (parenthesisDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } - m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; } void atomBackReference(unsigned subpatternId) { ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); if (subpatternId > m_pattern.m_numSubpatterns) { @@ -493,32 +529,39 @@ public: m_alternative->m_terms.append(PatternTerm(subpatternId)); } - PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - PatternDisjunction* newDisjunction = new PatternDisjunction(); - - newDisjunction->m_parent = disjunction->m_parent; + PatternDisjunction* newDisjunction = 0; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt]; - PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) - newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } } - - m_pattern.m_disjunctions.append(newDisjunction); + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); return newDisjunction; } - - PatternTerm copyTerm(PatternTerm& term) + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) { if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) return PatternTerm(term); - + PatternTerm termCopy = term; - termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); return termCopy; } - + void quantifyAtom(unsigned min, unsigned max, bool greedy) { ASSERT(min <= max); @@ -624,14 +667,17 @@ public: case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - currentInputPosition += term.parentheses.disjunction->m_minimumSize; - } else { + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - } + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition; + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); term.inputPosition = currentInputPosition; } else { term.inputPosition = currentInputPosition; @@ -685,11 +731,208 @@ public: setupDisjunctionOffsets(m_pattern.m_body, 0, 0); } + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * a set of parens at the end of the regular expression (last term in any of the alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (unsigned i =0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == UINT_MAX + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) + { + if (term.quantityType == QuantifierFixedCount) { + beginTerms->append(TermChain(term)); + if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) + setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); + } else if (termIndex != numTerms - 1) { + beginTerms->append(TermChain(term)); + return true; + } + + return false; + } + + // This function collects the terms which are potentially matching the first number of depth characters in the result. + // If this function returns false then it found at least one term which makes the beginning character + // look-up optimization inefficient. + bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) + return false; + } + + return true; + } + + bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) + { + bool checkNext = true; + unsigned numTerms = alternative->m_terms.size(); + + while (checkNext && termIndex < numTerms) { + PatternTerm term = alternative->m_terms[termIndex]; + checkNext = false; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + return false; + + case PatternTerm::TypeBackReference: + case PatternTerm::TypeForwardReference: + return false; + + case PatternTerm::TypePatternCharacter: + if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { + termIndex++; + checkNext = true; + } + break; + + case PatternTerm::TypeCharacterClass: + return false; + + case PatternTerm::TypeParentheticalAssertion: + if (term.invertOrCapture) + return false; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityType != QuantifierFixedCount) { + if (termIndex == numTerms - 1) + break; + + termIndex++; + checkNext = true; + + } + + if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) + return false; + + break; + } + } + + return true; + } + + void setupBeginChars() + { + Vector<TermChain> beginTerms; + bool containsFixedCharacter = false; + + if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) + && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { + unsigned size = beginTerms.size(); + + // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. + if (!size) + return; + + m_pattern.m_containsBeginChars = true; + + for (unsigned i = 0; i < size; i++) { + PatternTerm term = beginTerms[i].term; + + // We have just collected PatternCharacter terms, other terms are not allowed. + ASSERT(term.type == PatternTerm::TypePatternCharacter); + + if (term.quantityType == QuantifierFixedCount) + containsFixedCharacter = true; + + UChar character = term.patternCharacter; + unsigned mask = 0; + + if (character <= 0x7f) { + if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { + mask = 32; + character = toASCIILower(character); + } + + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else { + UChar upper, lower; + if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { + m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } + } + + // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. + if (!containsFixedCharacter) { + m_pattern.m_containsBeginChars = false; + return; + } + + size = m_pattern.m_beginChars.size(); + + if (size > 2) + m_beginCharHelper.merge(size - 1); + else if (size <= 1) + m_pattern.m_containsBeginChars = false; + } + } + private: RegexPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + BeginCharHelper m_beginCharHelper; bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; }; @@ -717,12 +960,14 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern) ASSERT(numSubpatterns == pattern.m_numSubpatterns); } + constructor.checkForTerminalParentheses(); + constructor.optimizeBOL(); + constructor.setupOffsets(); + constructor.setupBeginChars(); - return false; + return 0; }; } } - -#endif |
