diff options
Diffstat (limited to 'JavaScriptCore/yarr')
-rw-r--r-- | JavaScriptCore/yarr/RegexCompiler.cpp | 11 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexInterpreter.cpp | 16 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexJIT.cpp | 6 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexParser.h | 137 | ||||
-rw-r--r-- | JavaScriptCore/yarr/RegexPattern.h | 4 |
5 files changed, 75 insertions, 99 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp index e40c791..06ecbad 100644 --- a/JavaScriptCore/yarr/RegexCompiler.cpp +++ b/JavaScriptCore/yarr/RegexCompiler.cpp @@ -520,7 +520,7 @@ public: PatternTerm& term = currentAlternative->lastTerm(); ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); - if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.parentheses.subpatternId)) { + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { m_alternative->m_terms.append(PatternTerm::ForwardReference()); return; } @@ -595,7 +595,7 @@ public: term.quantify(min, QuantifierFixedCount); m_alternative->m_terms.append(copyTerm(term)); // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... - m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) m_alternative->lastTerm().parentheses.isCopy = true; } @@ -734,8 +734,7 @@ public: // This optimization identifies sets of parentheses that we will never need to backtrack. // In these cases we do not need to store state from prior iterations. // We can presently avoid backtracking for: - // * where the parens are at the end of the regular expression (last term in any of the - // alternatives of the main body disjunction). + // * a set of parens at the end of the regular expression (last term in any of the alternatives of the main body disjunction). // * where the parens are non-capturing, and quantified unbounded greedy (*). // * where the parens do not contain any capturing subpatterns. void checkForTerminalParentheses() @@ -746,13 +745,13 @@ public: return; Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; - for (size_t i = 0; i < alternatives.size(); ++i) { + for (unsigned i =0; i < alternatives.size(); ++i) { Vector<PatternTerm>& terms = alternatives[i]->m_terms; if (terms.size()) { PatternTerm& term = terms.last(); if (term.type == PatternTerm::TypeParenthesesSubpattern && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite + && term.quantityCount == UINT_MAX && !term.capture()) term.parentheses.isTerminal = true; } diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp index a51cd25..164158e 100644 --- a/JavaScriptCore/yarr/RegexInterpreter.cpp +++ b/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -515,7 +515,8 @@ public: if (matchEnd == -1) return true; - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + ASSERT((matchBegin == -1) == (matchEnd == -1)); + ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return true; @@ -557,7 +558,8 @@ public: int matchBegin = output[(term.atom.subpatternId << 1)]; int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + ASSERT((matchBegin == -1) == (matchEnd == -1)); + ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return false; @@ -717,7 +719,7 @@ public: if (term.capture()) { // Technically this access to inputPosition should be accessing the begin term's // inputPosition, but for repeats other than fixed these values should be - // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + // the same anyway! (we don't pre-check for greedy or non-greedy matches.) ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); unsigned subpatternId = term.atom.subpatternId; @@ -737,7 +739,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityCount == UINT_MAX); ASSERT(!term.capture()); BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); @@ -763,7 +765,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityCount == UINT_MAX); ASSERT(!term.capture()); // If we backtrack to this point, we have failed to match this iteration of the parens. @@ -775,7 +777,7 @@ public: bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) { // 'Terminal' parentheses are at the end of the regex, and as such a match past end - // should always be returned as a successful match - we should never backtrack to here. + // should always be returned as a successful match - we should never becktrack to here. ASSERT_NOT_REACHED(); return false; } @@ -1826,7 +1828,7 @@ public: break; case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeForwardReference: diff --git a/JavaScriptCore/yarr/RegexJIT.cpp b/JavaScriptCore/yarr/RegexJIT.cpp index 1eac667..acbd458 100644 --- a/JavaScriptCore/yarr/RegexJIT.cpp +++ b/JavaScriptCore/yarr/RegexJIT.cpp @@ -624,7 +624,7 @@ class RegexGenerator : private MacroAssembler { add32(Imm32(1), countRegister); add32(Imm32(1), index); - if (term.quantityCount != quantifyInfinite) { + if (term.quantityCount != 0xffffffff) { branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); failures.append(jump()); } else @@ -662,7 +662,7 @@ class RegexGenerator : private MacroAssembler { loadFromFrame(term.frameLocation, countRegister); atEndOfInput().linkTo(hardFail, this); - if (term.quantityCount != quantifyInfinite) + if (term.quantityCount != 0xffffffff) branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { readCharacter(state.inputOffset(), character); @@ -749,7 +749,7 @@ class RegexGenerator : private MacroAssembler { add32(Imm32(1), countRegister); add32(Imm32(1), index); - if (term.quantityCount != quantifyInfinite) { + if (term.quantityCount != 0xffffffff) { branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); failures.append(jump()); } else diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h index 8392cdf..a5c0ba2 100644 --- a/JavaScriptCore/yarr/RegexParser.h +++ b/JavaScriptCore/yarr/RegexParser.h @@ -33,8 +33,6 @@ namespace JSC { namespace Yarr { -static const unsigned quantifyInfinite = UINT_MAX; - enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, @@ -58,7 +56,6 @@ private: ParenthesesUnmatched, ParenthesesTypeInvalid, CharacterClassUnmatched, - CharacterClassInvalidRange, CharacterClassOutOfOrder, EscapeUnterminated, NumberOfErrorCodes @@ -78,7 +75,7 @@ private: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) , m_err(err) - , m_state(Empty) + , m_state(empty) { } @@ -93,88 +90,65 @@ private: } /* - * atomPatternCharacter(): + * atomPatternCharacterUnescaped(): * - * This method is called either from parseCharacterClass() (for an unescaped - * character in a character class), or from parseEscape(). In the former case - * the value true will be passed for the argument 'hyphenIsRange', and in this - * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ - * is different to /[a\-z]/). + * This method is called directly from parseCharacterClass(), to report a new + * pattern character token. This method differs from atomPatternCharacter(), + * which will be called from parseEscape(), since a hypen provided via this + * method may be indicating a character range, but a hyphen parsed by + * parseEscape() cannot be interpreted as doing so. */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + void atomPatternCharacterUnescaped(UChar ch) { switch (m_state) { - case AfterCharacterClass: - // Following a builtin character class we need look out for a hyphen. - // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. - // If we see a hyphen following a charater class then unlike usual - // we'll report it to the delegate immediately, and put ourself into - // a poisoned state. Any following calls to add another character or - // character class will result in an error. (A hypen following a - // character-class is itself valid, but only at the end of a regex). - if (hyphenIsRange && ch == '-') { - m_delegate.atomCharacterClassAtom('-'); - m_state = AfterCharacterClassHyphen; - return; - } - // Otherwise just fall through - cached character so treat this as Empty. - - case Empty: + case empty: m_character = ch; - m_state = CachedCharacter; - return; + m_state = cachedCharacter; + break; - case CachedCharacter: - if (hyphenIsRange && ch == '-') - m_state = CachedCharacterHyphen; + case cachedCharacter: + if (ch == '-') + m_state = cachedCharacterHyphen; else { m_delegate.atomCharacterClassAtom(m_character); m_character = ch; } - return; + break; - case CachedCharacterHyphen: - if (ch < m_character) { + case cachedCharacterHyphen: + if (ch >= m_character) + m_delegate.atomCharacterClassRange(m_character, ch); + else m_err = CharacterClassOutOfOrder; - return; - } - m_delegate.atomCharacterClassRange(m_character, ch); - m_state = Empty; - return; - - case AfterCharacterClassHyphen: - // Error! We have something like /[\d-x]/. - m_err = CharacterClassInvalidRange; - return; + m_state = empty; } } /* + * atomPatternCharacter(): + * + * Adds a pattern character, called by parseEscape(), as such will not + * interpret a hyphen as indicating a character range. + */ + void atomPatternCharacter(UChar ch) + { + // Flush if a character is already pending to prevent the + // hyphen from begin interpreted as indicating a range. + if((ch == '-') && (m_state == cachedCharacter)) + flush(); + + atomPatternCharacterUnescaped(ch); + } + + /* * atomBuiltInCharacterClass(): * * Adds a built-in character class, called by parseEscape(). */ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { - switch (m_state) { - case CachedCharacter: - // Flush the currently cached character, then fall through. - m_delegate.atomCharacterClassAtom(m_character); - - case Empty: - case AfterCharacterClass: - m_state = AfterCharacterClass; - m_delegate.atomCharacterClassBuiltIn(classID, invert); - return; - - case CachedCharacterHyphen: - case AfterCharacterClassHyphen: - // Error! If we hit either of these cases, we have an - // invalid range that looks something like /[x-\d]/ - // or /[\d-\d]/. - m_err = CharacterClassInvalidRange; - return; - } + flush(); + m_delegate.atomCharacterClassBuiltIn(classID, invert); } /* @@ -184,12 +158,7 @@ private: */ void end() { - if (m_state == CachedCharacter) - m_delegate.atomCharacterClassAtom(m_character); - else if (m_state == CachedCharacterHyphen) { - m_delegate.atomCharacterClassAtom(m_character); - m_delegate.atomCharacterClassAtom('-'); - } + flush(); m_delegate.atomCharacterClassEnd(); } @@ -199,14 +168,21 @@ private: void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } private: + void flush() + { + if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen + m_delegate.atomCharacterClassAtom(m_character); + if (m_state == cachedCharacterHyphen) + m_delegate.atomCharacterClassAtom('-'); + m_state = empty; + } + Delegate& m_delegate; ErrorCode& m_err; enum CharacterClassConstructionState { - Empty, - CachedCharacter, - CachedCharacterHyphen, - AfterCharacterClass, - AfterCharacterClassHyphen, + empty, + cachedCharacter, + cachedCharacterHyphen, } m_state; UChar m_character; }; @@ -452,7 +428,7 @@ private: break; default: - characterClassConstructor.atomPatternCharacter(consume(), true); + characterClassConstructor.atomPatternCharacterUnescaped(consume()); } if (m_err) @@ -596,13 +572,13 @@ private: case '*': consume(); - parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); + parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); lastTokenWasAnAtom = false; break; case '+': consume(); - parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); + parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); lastTokenWasAnAtom = false; break; @@ -621,7 +597,7 @@ private: unsigned max = min; if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : quantifyInfinite; + max = peekIsDigit() ? consumeNumber() : UINT_MAX; if (tryConsume('}')) { if (min <= max) @@ -681,7 +657,6 @@ private: "unmatched parentheses", "unrecognized character after (?", "missing terminating ] for character class", - "invalid range in character class", "range out of order in character class", "\\ at end of pattern" }; @@ -863,7 +838,7 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) { return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); } diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h index 8a7d35b..c76c641 100644 --- a/JavaScriptCore/yarr/RegexPattern.h +++ b/JavaScriptCore/yarr/RegexPattern.h @@ -107,7 +107,7 @@ struct PatternTerm { union { UChar patternCharacter; CharacterClass* characterClass; - unsigned backReferenceSubpatternId; + unsigned subpatternId; struct { PatternDisjunction* disjunction; unsigned subpatternId; @@ -162,7 +162,7 @@ struct PatternTerm { : type(TypeBackReference) , invertOrCapture(false) { - backReferenceSubpatternId = spatternId; + subpatternId = spatternId; quantityType = QuantifierFixedCount; quantityCount = 1; } |