diff options
Diffstat (limited to 'Source/JavaScriptCore/yarr')
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexInterpreter.cpp | 1891 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexInterpreter.h | 381 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexJIT.cpp | 2213 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexJIT.h | 90 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexParser.h | 887 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexPattern.cpp | 991 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexPattern.h | 424 |
7 files changed, 6877 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/yarr/RegexInterpreter.cpp b/Source/JavaScriptCore/yarr/RegexInterpreter.cpp new file mode 100644 index 0000000..7769922 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -0,0 +1,1891 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexInterpreter.h" + +#include "RegexPattern.h" +#include <wtf/BumpPointerAllocator.h> + +#ifndef NDEBUG +#include <stdio.h> +#endif + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class Interpreter { +public: + struct ParenthesesDisjunctionContext; + + struct BackTrackInfoPatternCharacter { + uintptr_t matchAmount; + }; + struct BackTrackInfoCharacterClass { + uintptr_t matchAmount; + }; + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. + }; + struct BackTrackInfoAlternative { + uintptr_t offset; + }; + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesOnce { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; + }; + struct BackTrackInfoParentheses { + uintptr_t matchAmount; + ParenthesesDisjunctionContext* lastContext; + }; + + static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) + { + context->next = backTrack->lastContext; + backTrack->lastContext = context; + ++backTrack->matchAmount; + } + + static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) + { + ASSERT(backTrack->matchAmount); + ASSERT(backTrack->lastContext); + backTrack->lastContext = backTrack->lastContext->next; + --backTrack->matchAmount; + } + + struct DisjunctionContext + { + DisjunctionContext() + : term(0) + { + } + + void* operator new(size_t, void* where) + { + return where; + } + + int term; + unsigned matchBegin; + unsigned matchEnd; + uintptr_t frame[1]; + }; + + DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) + { + size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) DisjunctionContext(); + } + + void freeDisjunctionContext(DisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + struct ParenthesesDisjunctionContext + { + ParenthesesDisjunctionContext(int* output, ByteTerm& term) + : next(0) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; + + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { + subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; + output[(firstSubpatternId << 1) + i] = -1; + } + + new(getDisjunctionContext(term)) DisjunctionContext(); + } + + void* operator new(size_t, void* where) + { + return where; + } + + void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + { + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) + output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; + } + + DisjunctionContext* getDisjunctionContext(ByteTerm& term) + { + return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + } + + ParenthesesDisjunctionContext* next; + int subpatternBackup[1]; + }; + + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) + { + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); + } + + void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + class InputStream { + public: + InputStream(const UChar* input, unsigned start, unsigned length) + : input(input) + , pos(start) + , length(length) + { + } + + void next() + { + ++pos; + } + + void rewind(unsigned amount) + { + ASSERT(pos >= amount); + pos -= amount; + } + + int read() + { + ASSERT(pos < length); + if (pos < length) + return input[pos]; + return -1; + } + + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + + int readChecked(int position) + { + ASSERT(position < 0); + ASSERT((unsigned)-position <= pos); + unsigned p = pos + position; + ASSERT(p < length); + return input[p]; + } + + int reread(unsigned from) + { + ASSERT(from < length); + return input[from]; + } + + int prev() + { + ASSERT(!(pos > length)); + if (pos && length) + return input[pos - 1]; + return -1; + } + + unsigned getPos() + { + return pos; + } + + void setPos(unsigned p) + { + pos = p; + } + + bool atStart() + { + return pos == 0; + } + + bool atEnd() + { + return pos == length; + } + + bool checkInput(int count) + { + if ((pos + count) <= length) { + pos += count; + return true; + } else + return false; + } + + void uncheckInput(int count) + { + pos -= count; + } + + bool atStart(int position) + { + return (pos + position) == 0; + } + + bool atEnd(int position) + { + return (pos + position) == length; + } + + bool isNotAvailableInput(int position) + { + return (pos + position) > length; + } + + private: + const UChar* input; + unsigned pos; + unsigned length; + }; + + bool testCharacterClass(CharacterClass* characterClass, int ch) + { + if (ch & 0xFF80) { + for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) + if (ch == characterClass->m_matchesUnicode[i]) + return true; + for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) + if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + return true; + } else { + for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) + if (ch == characterClass->m_matches[i]) + return true; + for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) + if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + return true; + } + + return false; + } + + bool checkCharacter(int testChar, int inputPosition) + { + return testChar == input.readChecked(inputPosition); + } + + bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) + { + int ch = input.readChecked(inputPosition); + return (loChar == ch) || (hiChar == ch); + } + + bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) + { + bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); + return invert ? !match : match; + } + + bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) + { + int matchSize = matchEnd - matchBegin; + + if (!input.checkInput(matchSize)) + return false; + + if (pattern->m_ignoreCase) { + for (int i = 0; i < matchSize; ++i) { + int ch = input.reread(matchBegin + i); + + int lo = Unicode::toLower(ch); + int hi = Unicode::toUpper(ch); + + if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { + input.uncheckInput(matchSize); + return false; + } + } + } else { + for (int i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + input.uncheckInput(matchSize); + return false; + } + } + } + + return true; + } + + bool matchAssertionBOL(ByteTerm& term) + { + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); + } + + bool matchAssertionEOL(ByteTerm& term) + { + if (term.inputPosition) + return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + else + return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + } + + bool matchAssertionWordBoundary(ByteTerm& term) + { + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); + bool readIsWordchar; + if (term.inputPosition) + readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); + else + readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); + + bool wordBoundary = prevIsWordchar != readIsWordchar; + return term.invert() ? !wordBoundary : wordBoundary; + } + + bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) + return false; + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + return true; + } + + case QuantifierNonGreedy: + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. + // In this case the result of match is empty string like when it references to a parentheses with zero-width match. + // Eg.: /(a\1)/ + if (matchEnd == -1) + return true; + + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return true; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + backTrack->begin = input.getPos(); + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + input.setPos(backTrack->begin); + return false; + } + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + ++matchAmount; + backTrack->matchAmount = matchAmount; + return true; + } + + case QuantifierNonGreedy: + backTrack->begin = input.getPos(); + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return false; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + // for quantityCount == 1, could rewind. + input.setPos(backTrack->begin); + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.rewind(matchEnd - matchBegin); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + ++backTrack->matchAmount; + return true; + } else + input.setPos(backTrack->begin); + break; + } + + return false; + } + + void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + } + } + void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + context->restoreOutput(output, firstSubpatternId, count); + } + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + { + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; + + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + + bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: { + // set this speculatively; if we get to the parens end this will be true. + backTrack->begin = input.getPos(); + break; + } + case QuantifierNonGreedy: { + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = input.getPos() + term.inputPosition; + } + + return true; + } + + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); + } + + bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = -1; + output[(subpatternId << 1) + 1] = -1; + } + + switch (term.atom.quantityType) { + case QuantifierGreedy: + // if we backtrack to this point, there is another chance - try matching nothing. + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + case QuantifierNonGreedy: + ASSERT(backTrack->begin != notFound); + case QuantifierFixedCount: + break; + } + + return false; + } + + bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: + if (backTrack->begin == notFound) { + context->term -= term.atom.parenthesesWidth; + return false; + } + case QuantifierNonGreedy: + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); + if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); + unsigned subpatternId = term.atom.subpatternId; + output[subpatternId << 1] = input.getPos() + term.inputPosition; + } + context->term -= term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + return false; + } + + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never backtrack to here. + ASSERT_NOT_REACHED(); + return false; + } + + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + backTrack->begin = input.getPos(); + return true; + } + + bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + // We've reached the end of the parens; if they are inverted, this is failure. + if (term.invert()) { + context->term -= term.atom.parenthesesWidth; + return false; + } + + return true; + } + + bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + // We've failed to match parens; if they are inverted, this is win! + if (term.invert()) { + context->term += term.atom.parenthesesWidth; + return true; + } + + return false; + } + + bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + context->term -= term.atom.parenthesesWidth; + return false; + } + + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + backTrack->matchAmount = 0; + backTrack->lastContext = 0; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + + break; + } + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: + return JSRegExpMatch; + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + // Rules for backtracking differ depending on whether this is greedy or non-greedy. + // + // Greedy matches never should try just adding more - you should already have done + // the 'more' cases. Always backtrack, at least a leetle bit. However cases where + // you backtrack an item off the list needs checking, since we'll never have matched + // the one less case. Tracking forwards, still add as much as possible. + // + // Non-greedy, we've already done the one less case, so don't match on popping. + // We haven't done the one more case, so always try to add that. + // + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + + ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); + + if (result != JSRegExpMatch) + return result; + + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + if (!backTrack->matchAmount) + return JSRegExpNoMatch; + + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + + break; + } + } + } else { + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: { + // If we've not reached the limit, try to add one more match. + if (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { + appendParenthesesDisjunctionContext(backTrack, context); + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + // Nope - okay backtrack looking for an alternative. + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + // successful backtrack! we're back in the game! + if (backTrack->matchAmount) { + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + // pop a match off the stack + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + return result; + } + + return JSRegExpNoMatch; + } + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + void lookupForBeginChars() + { + int character; + bool firstSingleCharFound; + + while (true) { + if (input.isNotAvailableInput(2)) + return; + + firstSingleCharFound = false; + + character = input.readPair(); + + for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { + BeginChar bc = pattern->m_beginChars[i]; + + if (!firstSingleCharFound && bc.value <= 0xFFFF) { + firstSingleCharFound = true; + character &= 0xFFFF; + } + + if ((character | bc.mask) == bc.value) + return; + } + + input.next(); + } + } + +#define MATCH_NEXT() { ++context->term; goto matchAgain; } +#define BACKTRACK() { --context->term; goto backtrack; } +#define currentTerm() (disjunction->terms[context->term]) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) + { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + + if (btrack) + BACKTRACK(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + + context->matchBegin = input.getPos(); + context->term = 0; + + matchAgain: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + MATCH_NEXT(); + case ByteTerm::TypeSubpatternEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeBodyAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeBodyAlternativeDisjunction: + case ByteTerm::TypeBodyAlternativeEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeAlternativeDisjunction: + case ByteTerm::TypeAlternativeEnd: { + int offset = currentTerm().alternative.end; + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + backTrack->offset = offset; + context->term += offset; + MATCH_NEXT(); + } + + case ByteTerm::TypeAssertionBOL: + if (matchAssertionBOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionEOL: + if (matchAssertionEOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionWordBoundary: + if (matchAssertionWordBoundary(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypeCharacterClass: + if (matchCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (matchBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (matchParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (matchParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (matchParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (matchParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + if (input.checkInput(currentTerm().checkInputCount)) + MATCH_NEXT(); + BACKTRACK(); + } + + // We should never fall-through to here. + ASSERT_NOT_REACHED(); + + backtrack: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + return JSRegExpNoMatch; + case ByteTerm::TypeSubpatternEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeBodyAlternativeBegin: + case ByteTerm::TypeBodyAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + + if (input.atEnd()) + return JSRegExpNoMatch; + + input.next(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + + context->matchBegin = input.getPos(); + + if (currentTerm().alternative.onceThrough) + context->term += currentTerm().alternative.next; + + MATCH_NEXT(); + } + case ByteTerm::TypeBodyAlternativeEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeAlternativeBegin: + case ByteTerm::TypeAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + BACKTRACK(); + } + case ByteTerm::TypeAlternativeEnd: { + // We should never backtrack back into an alternative of the main body of the regex. + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + unsigned offset = backTrack->offset; + context->term -= offset; + BACKTRACK(); + } + + case ByteTerm::TypeAssertionBOL: + case ByteTerm::TypeAssertionEOL: + case ByteTerm::TypeAssertionWordBoundary: + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: + case ByteTerm::TypePatternCharacterGreedy: + case ByteTerm::TypePatternCharacterNonGreedy: + if (backtrackPatternCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: + case ByteTerm::TypePatternCasedCharacterGreedy: + case ByteTerm::TypePatternCasedCharacterNonGreedy: + if (backtrackPatternCasedCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeCharacterClass: + if (backtrackCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (backtrackBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (backtrackParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (backtrackParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (backtrackParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (backtrackParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + input.uncheckInput(currentTerm().checkInputCount); + BACKTRACK(); + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { + while (context->matchBegin == context->matchEnd) { + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; + } + return JSRegExpMatch; + } + + return result; + } + + int interpret() + { + allocatorPool = pattern->m_allocator->startAllocator(); + if (!allocatorPool) + CRASH(); + + for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) + output[i] = -1; + + DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); + + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); + if (result == JSRegExpMatch) { + output[0] = context->matchBegin; + output[1] = context->matchEnd; + } + + freeDisjunctionContext(context); + + pattern->m_allocator->stopAllocator(); + + // RegExp.cpp currently expects all error to be converted to -1. + ASSERT((result == JSRegExpMatch) == (output[0] != -1)); + return output[0]; + } + + Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) + : pattern(pattern) + , output(output) + , input(inputChar, start, length) + , allocatorPool(0) + , remainingMatchCount(matchLimit) + { + } + +private: + BytecodePattern *pattern; + int* output; + InputStream input; + BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; +}; + + + +class ByteCompiler { + struct ParenthesesStackEntry { + unsigned beginTerm; + unsigned savedAlternativeIndex; + ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) + : beginTerm(beginTerm) + , savedAlternativeIndex(savedAlternativeIndex) + { + } + }; + +public: + ByteCompiler(RegexPattern& pattern) + : m_pattern(pattern) + { + m_currentAlternativeIndex = 0; + } + + PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) + { + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); + emitDisjunction(m_pattern.m_body); + regexEnd(); + + return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + } + + void checkInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); + } + + void assertionBOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); + } + + void assertionEOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); + } + + void assertionWordBoundary(bool invert, int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); + } + + void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + if (m_pattern.m_ignoreCase) { + UChar lo = Unicode::toLower(ch); + UChar hi = Unicode::toUpper(ch); + + if (lo != hi) { + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + return; + } + } + + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + } + + void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + ASSERT(subpatternId); + + m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invert = m_bodyDisjunction->terms[beginTerm].invert(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + unsigned popParenthesesStack() + { + ASSERT(m_parenthesesStack.size()); + int stackEnd = m_parenthesesStack.size() - 1; + unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; + m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; + m_parenthesesStack.shrink(stackEnd); + + ASSERT(beginTerm < m_bodyDisjunction->terms.size()); + ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); + + return beginTerm; + } + +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction) + { + printf("ByteDisjunction(%p):\n\t", disjunction); + for (unsigned i = 0; i < disjunction->terms.size(); ++i) + printf("{ %d } ", disjunction->terms[i].type); + printf("\n"); + } +#endif + + void closeAlternative(int beginTerm) + { + int origBeginTerm = beginTerm; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + if (!m_bodyDisjunction->terms[beginTerm].alternative.next) + m_bodyDisjunction->terms.remove(beginTerm); + else { + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + } + + void closeBodyAlternative() + { + int beginTerm = 0; + int origBeginTerm = 0; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool capture = parenthesesBegin.capture(); + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) + { + m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); + m_bodyDisjunction->terms[0].frameLocation = 0; + m_currentAlternativeIndex = 0; + } + + void regexEnd() + { + closeBodyAlternative(); + } + + void alternativeBodyDisjunction(bool onceThrough) + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void alternativeDisjunction() + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (alt) { + if (disjunction == m_pattern.m_body) + alternativeBodyDisjunction(alternative->onceThrough()); + else + alternativeDisjunction(); + } + + unsigned minimumSize = alternative->m_minimumSize; + int countToCheck; + + if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize) + countToCheck = 0; + else + countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + + ASSERT(countToCheck >= 0); + if (countToCheck) { + checkInput(countToCheck); + currentCountAlreadyChecked += countToCheck; + } + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + assertionBOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionEOL: + assertionEOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionWordBoundary: + assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypePatternCharacter: + atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeCharacterClass: + atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeBackReference: + atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: { + unsigned disjunctionAlreadyCheckedCount = 0; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) + disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; + else + alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + break; + } + + case PatternTerm::TypeParentheticalAssertion: { + unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + + ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition); + int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; + + atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + break; + } + } + } + } + } + +private: + RegexPattern& m_pattern; + OwnPtr<ByteDisjunction> m_bodyDisjunction; + unsigned m_currentAlternativeIndex; + Vector<ParenthesesStackEntry> m_parenthesesStack; + Vector<ByteDisjunction*> m_allParenthesesInfo; +}; + +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); +} + +int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) +{ + return Interpreter(regex, output, input, start, length).interpret(); +} + + +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); + + +} } diff --git a/Source/JavaScriptCore/yarr/RegexInterpreter.h b/Source/JavaScriptCore/yarr/RegexInterpreter.h new file mode 100644 index 0000000..0fd8a57 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexInterpreter.h @@ -0,0 +1,381 @@ +/* + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexInterpreter_h +#define RegexInterpreter_h + +#include "RegexParser.h" +#include "RegexPattern.h" +#include <wtf/PassOwnPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WTF { +class BumpPointerAllocator; +} +using WTF::BumpPointerAllocator; + +namespace JSC { namespace Yarr { + +// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed. + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + +class ByteDisjunction; + +struct ByteTerm { + enum Type { + TypeBodyAlternativeBegin, + TypeBodyAlternativeDisjunction, + TypeBodyAlternativeEnd, + TypeAlternativeBegin, + TypeAlternativeDisjunction, + TypeAlternativeEnd, + TypeSubpatternBegin, + TypeSubpatternEnd, + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacterOnce, + TypePatternCharacterFixed, + TypePatternCharacterGreedy, + TypePatternCharacterNonGreedy, + TypePatternCasedCharacterOnce, + TypePatternCasedCharacterFixed, + TypePatternCasedCharacterGreedy, + TypePatternCasedCharacterNonGreedy, + TypeCharacterClass, + TypeBackReference, + TypeParenthesesSubpattern, + TypeParenthesesSubpatternOnceBegin, + TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, + TypeParentheticalAssertionBegin, + TypeParentheticalAssertionEnd, + TypeCheckInput, + } type; + union { + struct { + union { + UChar patternCharacter; + struct { + UChar lo; + UChar hi; + } casedCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + }; + union { + ByteDisjunction* parenthesesDisjunction; + unsigned parenthesesWidth; + }; + QuantifierType quantityType; + unsigned quantityCount; + } atom; + struct { + int next; + int end; + bool onceThrough; + } alternative; + unsigned checkInputCount; + }; + unsigned frameLocation; + bool m_capture : 1; + bool m_invert : 1; + int inputPosition; + + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCharacterNonGreedy; + break; + } + + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCasedCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCasedCharacterNonGreedy; + break; + } + + atom.casedCharacter.lo = lo; + atom.casedCharacter.hi = hi; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + : type(ByteTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + atom.characterClass = characterClass; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(false) + { + atom.subpatternId = subpatternId; + atom.parenthesesDisjunction = parenthesesInfo; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + } + + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + atom.subpatternId = subpatternId; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + static ByteTerm BOL(int inputPos) + { + ByteTerm term(TypeAssertionBOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm CheckInput(unsigned count) + { + ByteTerm term(TypeCheckInput); + term.checkInputCount = count; + return term; + } + + static ByteTerm EOL(int inputPos) + { + ByteTerm term(TypeAssertionEOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm WordBoundary(bool invert, int inputPos) + { + ByteTerm term(TypeAssertionWordBoundary, invert); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm BackReference(unsigned subpatternId, int inputPos) + { + return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); + } + + static ByteTerm BodyAlternativeBegin(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeEnd() + { + ByteTerm term(TypeBodyAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeBegin() + { + ByteTerm term(TypeAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeDisjunction() + { + ByteTerm term(TypeAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeEnd() + { + ByteTerm term(TypeAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm SubpatternBegin() + { + return ByteTerm(TypeSubpatternBegin); + } + + static ByteTerm SubpatternEnd() + { + return ByteTerm(TypeSubpatternEnd); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } +}; + +class ByteDisjunction : public FastAllocBase { +public: + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) + : m_numSubpatterns(numSubpatterns) + , m_frameSize(frameSize) + { + } + + Vector<ByteTerm> terms; + unsigned m_numSubpatterns; + unsigned m_frameSize; +}; + +struct BytecodePattern : FastAllocBase { + BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern, BumpPointerAllocator* allocator) + : m_body(body) + , m_ignoreCase(pattern.m_ignoreCase) + , m_multiline(pattern.m_multiline) + , m_containsBeginChars(pattern.m_containsBeginChars) + , m_allocator(allocator) + { + newlineCharacterClass = pattern.newlineCharacterClass(); + wordcharCharacterClass = pattern.wordcharCharacterClass(); + + m_allParenthesesInfo.append(allParenthesesInfo); + m_userCharacterClasses.append(pattern.m_userCharacterClasses); + // 'Steal' the RegexPattern's CharacterClasses! We clear its + // array, so that it won't delete them on destruction. We'll + // take responsibility for that. + pattern.m_userCharacterClasses.clear(); + + m_beginChars.append(pattern.m_beginChars); + } + + ~BytecodePattern() + { + deleteAllValues(m_allParenthesesInfo); + deleteAllValues(m_userCharacterClasses); + } + + OwnPtr<ByteDisjunction> m_body; + bool m_ignoreCase; + bool m_multiline; + bool m_containsBeginChars; + // Each BytecodePattern is associated with a RegExp, each RegExp is associated + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator. + BumpPointerAllocator* m_allocator; + + CharacterClass* newlineCharacterClass; + CharacterClass* wordcharCharacterClass; + + Vector<BeginChar> m_beginChars; + +private: + Vector<ByteDisjunction*> m_allParenthesesInfo; + Vector<CharacterClass*> m_userCharacterClasses; +}; + +PassOwnPtr<BytecodePattern> byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*); +int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); + +} } // namespace JSC::Yarr + +#endif // RegexInterpreter_h diff --git a/Source/JavaScriptCore/yarr/RegexJIT.cpp b/Source/JavaScriptCore/yarr/RegexJIT.cpp new file mode 100644 index 0000000..50fe6db --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexJIT.cpp @@ -0,0 +1,2213 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexJIT.h" + +#include "ASCIICType.h" +#include "JSGlobalData.h" +#include "LinkBuffer.h" +#include "MacroAssembler.h" +#include "RegexParser.h" + +#if ENABLE(YARR_JIT) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class RegexGenerator : private MacroAssembler { + friend void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + +#if CPU(ARM) + static const RegisterID input = ARMRegisters::r0; + static const RegisterID index = ARMRegisters::r1; + static const RegisterID length = ARMRegisters::r2; + static const RegisterID output = ARMRegisters::r4; + + static const RegisterID regT0 = ARMRegisters::r5; + static const RegisterID regT1 = ARMRegisters::r6; + + static const RegisterID returnRegister = ARMRegisters::r0; +#elif CPU(MIPS) + static const RegisterID input = MIPSRegisters::a0; + static const RegisterID index = MIPSRegisters::a1; + static const RegisterID length = MIPSRegisters::a2; + static const RegisterID output = MIPSRegisters::a3; + + static const RegisterID regT0 = MIPSRegisters::t4; + static const RegisterID regT1 = MIPSRegisters::t5; + + static const RegisterID returnRegister = MIPSRegisters::v0; +#elif CPU(X86) + static const RegisterID input = X86Registers::eax; + static const RegisterID index = X86Registers::edx; + static const RegisterID length = X86Registers::ecx; + static const RegisterID output = X86Registers::edi; + + static const RegisterID regT0 = X86Registers::ebx; + static const RegisterID regT1 = X86Registers::esi; + + static const RegisterID returnRegister = X86Registers::eax; +#elif CPU(X86_64) + static const RegisterID input = X86Registers::edi; + static const RegisterID index = X86Registers::esi; + static const RegisterID length = X86Registers::edx; + static const RegisterID output = X86Registers::ecx; + + static const RegisterID regT0 = X86Registers::eax; + static const RegisterID regT1 = X86Registers::ebx; + + static const RegisterID returnRegister = X86Registers::eax; +#endif + + void optimizeAlternative(PatternAlternative* alternative) + { + if (!alternative->m_terms.size()) + return; + + for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { + PatternTerm& term = alternative->m_terms[i]; + PatternTerm& nextTerm = alternative->m_terms[i + 1]; + + if ((term.type == PatternTerm::TypeCharacterClass) + && (term.quantityType == QuantifierFixedCount) + && (nextTerm.type == PatternTerm::TypePatternCharacter) + && (nextTerm.quantityType == QuantifierFixedCount)) { + PatternTerm termCopy = term; + alternative->m_terms[i] = nextTerm; + alternative->m_terms[i + 1] = termCopy; + } + } + } + + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + { + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + // generate code for all ranges before this one + if (which) + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); + ++*matchIndex; + } + failures.append(jump()); + + loOrAbove.link(this); + } else if (which) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(jump()); + + loOrAbove.link(this); + } else + failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); + } + + void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) + { + if (charClass->m_table) { + ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table)); + matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); + return; + } + Jump unicodeFail; + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { + Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f)); + + if (charClass->m_matchesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { + UChar ch = charClass->m_matchesUnicode[i]; + matchDest.append(branch32(Equal, character, Imm32(ch))); + } + } + + if (charClass->m_rangesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { + UChar lo = charClass->m_rangesUnicode[i].begin; + UChar hi = charClass->m_rangesUnicode[i].end; + + Jump below = branch32(LessThan, character, Imm32(lo)); + matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); + below.link(this); + } + } + + unicodeFail = jump(); + isAscii.link(this); + } + + if (charClass->m_ranges.size()) { + unsigned matchIndex = 0; + JumpList failures; + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); + while (matchIndex < charClass->m_matches.size()) + matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); + + failures.link(this); + } else if (charClass->m_matches.size()) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector<char> matchesAZaz; + + for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { + char ch = charClass->m_matches[i]; + if (m_pattern.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + or32(Imm32(32), character); + for (unsigned i = 0; i < countAZaz; ++i) + matchDest.append(branch32(Equal, character, Imm32(matchesAZaz[i]))); + } + } + + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) + unicodeFail.link(this); + } + + // Jumps if input not available; will have (incorrectly) incremented already! + Jump jumpIfNoAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(Above, index, length); + } + + Jump jumpIfAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(BelowOrEqual, index, length); + } + + Jump checkInput() + { + return branch32(BelowOrEqual, index, length); + } + + Jump atEndOfInput() + { + return branch32(Equal, index, length); + } + + Jump notAtEndOfInput() + { + return branch32(NotEqual, index, length); + } + + Jump jumpIfCharEquals(UChar ch, int inputPosition) + { + return branch16(Equal, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + Jump jumpIfCharNotEquals(UChar ch, int inputPosition) + { + return branch16(NotEqual, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + void readCharacter(int inputPosition, RegisterID reg) + { + load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + } + + void storeToFrame(RegisterID reg, unsigned frameLocation) + { + poke(reg, frameLocation); + } + + void storeToFrame(Imm32 imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } + + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) + { + return storePtrWithPatch(ImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void loadFromFrame(unsigned frameLocation, RegisterID reg) + { + peek(reg, frameLocation); + } + + void loadFromFrameAndJump(unsigned frameLocation) + { + jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + struct IndirectJumpEntry { + IndirectJumpEntry(int32_t stackOffset) + : m_stackOffset(stackOffset) + { + } + + IndirectJumpEntry(int32_t stackOffset, Jump jump) + : m_stackOffset(stackOffset) + { + addJump(jump); + } + + void addJump(Jump jump) + { + m_relJumps.append(jump); + } + + int32_t m_stackOffset; + JumpList m_relJumps; + }; + + struct AlternativeBacktrackRecord { + DataLabelPtr dataLabel; + Label backtrackLocation; + + AlternativeBacktrackRecord(DataLabelPtr dataLabel, Label backtrackLocation) + : dataLabel(dataLabel) + , backtrackLocation(backtrackLocation) + { + } + }; + + struct ParenthesesTail; + struct TermGenerationState; + + struct GenerationState { + typedef HashMap<int, IndirectJumpEntry*, WTF::IntHash<uint32_t>, UnsignedWithZeroKeyHashTraits<uint32_t> > IndirectJumpHashMap; + + GenerationState() + : m_parenNestingLevel(0) + { + } + + void addIndirectJumpEntry(int32_t stackOffset, Jump jump) + { + IndirectJumpHashMap::iterator result = m_indirectJumpMap.find(stackOffset); + + ASSERT(stackOffset >= 0); + + uint32_t offset = static_cast<uint32_t>(stackOffset); + + if (result == m_indirectJumpMap.end()) + m_indirectJumpMap.add(offset, new IndirectJumpEntry(stackOffset, jump)); + else + result->second->addJump(jump); + } + + void addIndirectJumpEntry(int32_t stackOffset, JumpList jumps) + { + JumpList::JumpVector jumpVector = jumps.jumps(); + size_t size = jumpVector.size(); + for (size_t i = 0; i < size; ++i) + addIndirectJumpEntry(stackOffset, jumpVector[i]); + + jumps.empty(); + } + + void emitIndirectJumpTable(MacroAssembler* masm) + { + for (IndirectJumpHashMap::iterator iter = m_indirectJumpMap.begin(); iter != m_indirectJumpMap.end(); ++iter) { + IndirectJumpEntry* indJumpEntry = iter->second; + indJumpEntry->m_relJumps.link(masm); + masm->jump(Address(stackPointerRegister, indJumpEntry->m_stackOffset)); + delete indJumpEntry; + } + } + + void incrementParenNestingLevel() + { + ++m_parenNestingLevel; + } + + void decrementParenNestingLevel() + { + --m_parenNestingLevel; + } + + ParenthesesTail* addParenthesesTail(PatternTerm& term, ParenthesesTail* nextOuterParenTail) + { + ParenthesesTail* parenthesesTail = new ParenthesesTail(term, m_parenNestingLevel, nextOuterParenTail); + m_parenTails.append(parenthesesTail); + m_parenTailsForIteration.append(parenthesesTail); + + return parenthesesTail; + } + + void emitParenthesesTail(RegexGenerator* generator) + { + unsigned vectorSize = m_parenTails.size(); + bool priorBacktrackFallThrough = false; + + // Emit in reverse order so parentTail N can fall through to N-1 + for (unsigned index = vectorSize; index > 0; --index) { + JumpList jumpsToNext; + priorBacktrackFallThrough = m_parenTails[index-1].get()->generateCode(generator, jumpsToNext, priorBacktrackFallThrough, index > 1); + if (index > 1) + jumpsToNext.linkTo(generator->label(), generator); + else + addJumpsToNextInteration(jumpsToNext); + } + m_parenTails.clear(); + } + + void addJumpToNextInteration(Jump jump) + { + m_jumpsToNextInteration.append(jump); + } + + void addJumpsToNextInteration(JumpList jumps) + { + m_jumpsToNextInteration.append(jumps); + } + + void addDataLabelToNextIteration(DataLabelPtr dataLabel) + { + m_dataPtrsToNextIteration.append(dataLabel); + } + + void linkToNextIteration(Label label) + { + m_nextIteration = label; + + for (unsigned i = 0; i < m_dataPtrsToNextIteration.size(); ++i) + m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataPtrsToNextIteration[i], m_nextIteration)); + + m_dataPtrsToNextIteration.clear(); + + for (unsigned i = 0; i < m_parenTailsForIteration.size(); ++i) + m_parenTailsForIteration[i]->setNextIteration(m_nextIteration); + + m_parenTailsForIteration.clear(); + } + + void linkToNextIteration(RegexGenerator* generator) + { + m_jumpsToNextInteration.linkTo(m_nextIteration, generator); + } + + int m_parenNestingLevel; + Vector<AlternativeBacktrackRecord> m_backtrackRecords; + IndirectJumpHashMap m_indirectJumpMap; + Label m_nextIteration; + Vector<OwnPtr<ParenthesesTail> > m_parenTails; + JumpList m_jumpsToNextInteration; + Vector<DataLabelPtr> m_dataPtrsToNextIteration; + Vector<ParenthesesTail*> m_parenTailsForIteration; + }; + + struct BacktrackDestination { + typedef enum { + NoBacktrack, + BacktrackLabel, + BacktrackStackOffset, + BacktrackJumpList, + BacktrackLinked + } BacktrackType; + + BacktrackDestination() + : m_backtrackType(NoBacktrack) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + BacktrackDestination(int32_t stackOffset) + : m_backtrackType(BacktrackStackOffset) + , m_backtrackStackOffset(stackOffset) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + BacktrackDestination(Label label) + : m_backtrackType(BacktrackLabel) + , m_backtrackLabel(label) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + void clear(bool doDataLabelClear = true) + { + m_backtrackType = NoBacktrack; + if (doDataLabelClear) + clearDataLabel(); + m_nextBacktrack = 0; + } + + void clearDataLabel() + { + m_dataLabelPtr = DataLabelPtr(); + } + + bool hasDestination() + { + return (m_backtrackType != NoBacktrack); + } + + bool isStackOffset() + { + return (m_backtrackType == BacktrackStackOffset); + } + + bool isLabel() + { + return (m_backtrackType == BacktrackLabel); + } + + bool isJumpList() + { + return (m_backtrackType == BacktrackJumpList); + } + + bool hasDataLabel() + { + return m_dataLabelPtr.isSet(); + } + + void copyTarget(BacktrackDestination& rhs, bool copyDataLabel = true) + { + m_backtrackType = rhs.m_backtrackType; + if (m_backtrackType == BacktrackStackOffset) + m_backtrackStackOffset = rhs.m_backtrackStackOffset; + else if (m_backtrackType == BacktrackLabel) + m_backtrackLabel = rhs.m_backtrackLabel; + if (copyDataLabel) + m_dataLabelPtr = rhs.m_dataLabelPtr; + m_backtrackSourceJumps = rhs.m_backtrackSourceJumps; + m_backtrackSourceLabel = rhs.m_backtrackSourceLabel; + } + + void copyTo(BacktrackDestination& lhs) + { + lhs.m_backtrackType = m_backtrackType; + if (m_backtrackType == BacktrackStackOffset) + lhs.m_backtrackStackOffset = m_backtrackStackOffset; + else if (m_backtrackType == BacktrackLabel) + lhs.m_backtrackLabel = m_backtrackLabel; + lhs.m_backtrackSourceJumps = m_backtrackSourceJumps; + lhs.m_backtrackSourceLabel = m_backtrackSourceLabel; + lhs.m_dataLabelPtr = m_dataLabelPtr; + lhs.m_backTrackJumps = m_backTrackJumps; + } + + void addBacktrackJump(Jump jump) + { + m_backTrackJumps.append(jump); + } + + void setStackOffset(int32_t stackOffset) + { + m_backtrackType = BacktrackStackOffset; + m_backtrackStackOffset = stackOffset; + } + + void setLabel(Label label) + { + m_backtrackType = BacktrackLabel; + m_backtrackLabel = label; + } + + void setNextBacktrackLabel(Label label) + { + if (m_nextBacktrack) + m_nextBacktrack->setLabel(label); + } + + void copyBacktrackToLabel(BacktrackDestination& rhs) + { + if (rhs.m_backtrackToLabel) + m_backtrackToLabel = rhs.m_backtrackToLabel; + } + + void setBacktrackToLabel(Label* backtrackToLabel) + { + if (!m_backtrackToLabel) + m_backtrackToLabel = backtrackToLabel; + } + + void setBacktrackJumpList(JumpList* jumpList) + { + m_backtrackType = BacktrackJumpList; + m_backtrackSourceJumps = jumpList; + } + + void setBacktrackSourceLabel(Label* backtrackSourceLabel) + { + m_backtrackSourceLabel = backtrackSourceLabel; + } + + void setDataLabel(DataLabelPtr dp) + { + if (m_subDataLabelPtr) { + *m_subDataLabelPtr = dp; + m_subDataLabelPtr = 0; + } else + m_dataLabelPtr = dp; + } + + void setSubDataLabelPtr(DataLabelPtr* subDataLabelPtr) + { + m_subDataLabelPtr = subDataLabelPtr; + } + + void linkToNextBacktrack(BacktrackDestination* nextBacktrack) + { + m_nextBacktrack = nextBacktrack; + } + + int32_t getStackOffset() + { + ASSERT(m_backtrackType == BacktrackStackOffset); + return m_backtrackStackOffset; + } + + Label getLabel() + { + ASSERT(m_backtrackType == BacktrackLabel); + return m_backtrackLabel; + } + + JumpList& getBacktrackJumps() + { + return m_backTrackJumps; + } + + DataLabelPtr& getDataLabel() + { + return m_dataLabelPtr; + } + + void jumpToBacktrack(MacroAssembler* masm) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + masm->jump().linkTo(*m_backtrackSourceLabel, masm); + else + m_backtrackSourceJumps->append(masm->jump()); + } else if (isStackOffset()) + masm->jump(Address(stackPointerRegister, m_backtrackStackOffset)); + else if (isLabel()) + masm->jump().linkTo(m_backtrackLabel, masm); + else + m_backTrackJumps.append(masm->jump()); + } + + void jumpToBacktrack(RegexGenerator* generator, Jump jump) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + jump.linkTo(*m_backtrackSourceLabel, generator); + else + m_backtrackSourceJumps->append(jump); + } else if (isStackOffset()) + generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jump); + else if (isLabel()) + jump.linkTo(getLabel(), generator); + else + m_backTrackJumps.append(jump); + } + + void jumpToBacktrack(RegexGenerator* generator, JumpList& jumps) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + jumps.linkTo(*m_backtrackSourceLabel, generator); + else + m_backtrackSourceJumps->append(jumps); + } else if (isStackOffset()) + generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jumps); + else if (isLabel()) + jumps.linkTo(getLabel(), generator); + else + m_backTrackJumps.append(jumps); + } + + bool linkDataLabelToHereIfExists(RegexGenerator* generator) + { + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), generator->label())); + clearDataLabel(); + return true; + } + + return false; + } + + bool plantJumpToBacktrackIfExists(RegexGenerator* generator) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + generator->jump(*m_backtrackSourceLabel); + else + m_backtrackSourceJumps->append(generator->jump()); + + return true; + } + + if (isStackOffset()) { + generator->jump(Address(stackPointerRegister, getStackOffset())); + return true; + } + + if (isLabel()) { + generator->jump(getLabel()); + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), getLabel())); + clearDataLabel(); + } + return true; + } + + return false; + } + + void linkAlternativeBacktracks(RegexGenerator* generator, bool nextIteration = false) + { + Label hereLabel = generator->label(); + + if (m_backtrackToLabel) { + *m_backtrackToLabel = hereLabel; + m_backtrackToLabel = 0; + } + + m_backTrackJumps.link(generator); + + if (nextIteration) + generator->m_expressionState.linkToNextIteration(hereLabel); + + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), hereLabel)); + // data label cleared as a result of the clear() below + } + + clear(); + } + + void linkAlternativeBacktracksTo(RegexGenerator* generator, Label label, bool nextIteration = false) + { + m_backTrackJumps.linkTo(label, generator); + + if (nextIteration) + generator->m_expressionState.linkToNextIteration(label); + + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), label)); + clearDataLabel(); + } + } + + private: + BacktrackType m_backtrackType; + int32_t m_backtrackStackOffset; + Label m_backtrackLabel; + DataLabelPtr m_dataLabelPtr; + Label* m_backtrackToLabel; + DataLabelPtr* m_subDataLabelPtr; + BacktrackDestination* m_nextBacktrack; + Label* m_backtrackSourceLabel; + JumpList* m_backtrackSourceJumps; + JumpList m_backTrackJumps; + }; + + struct TermGenerationState { + TermGenerationState(PatternDisjunction* disjunction, unsigned checkedTotal) + : disjunction(disjunction) + , checkedTotal(checkedTotal) + , m_subParenNum(0) + , m_linkedBacktrack(0) + , m_parenthesesTail(0) + { + } + + void resetAlternative() + { + m_backtrack.clear(); + alt = 0; + } + bool alternativeValid() + { + return alt < disjunction->m_alternatives.size(); + } + void nextAlternative() + { + ++alt; + } + PatternAlternative* alternative() + { + return disjunction->m_alternatives[alt]; + } + bool isLastAlternative() + { + return (alt + 1) == disjunction->m_alternatives.size(); + } + + void resetTerm() + { + ASSERT(alternativeValid()); + t = 0; + m_subParenNum = 0; + } + bool termValid() + { + ASSERT(alternativeValid()); + return t < alternative()->m_terms.size(); + } + void nextTerm() + { + ASSERT(alternativeValid()); + ++t; + } + PatternTerm& term() + { + ASSERT(alternativeValid()); + return alternative()->m_terms[t]; + } + bool isLastTerm() + { + ASSERT(alternativeValid()); + return (t + 1) == alternative()->m_terms.size(); + } + unsigned getSubParenNum() + { + return m_subParenNum++; + } + bool isMainDisjunction() + { + return !disjunction->m_parent; + } + + void setParenthesesTail(ParenthesesTail* parenthesesTail) + { + m_parenthesesTail = parenthesesTail; + } + + ParenthesesTail* getParenthesesTail() + { + return m_parenthesesTail; + } + + PatternTerm& lookaheadTerm() + { + ASSERT(alternativeValid()); + ASSERT((t + 1) < alternative()->m_terms.size()); + return alternative()->m_terms[t + 1]; + } + bool isSinglePatternCharacterLookaheadTerm() + { + ASSERT(alternativeValid()); + return ((t + 1) < alternative()->m_terms.size()) + && (lookaheadTerm().type == PatternTerm::TypePatternCharacter) + && (lookaheadTerm().quantityType == QuantifierFixedCount) + && (lookaheadTerm().quantityCount == 1); + } + + int inputOffset() + { + return term().inputPosition - checkedTotal; + } + + void clearBacktrack() + { + m_backtrack.clear(false); + m_linkedBacktrack = 0; + } + + void jumpToBacktrack(MacroAssembler* masm) + { + m_backtrack.jumpToBacktrack(masm); + } + + void jumpToBacktrack(RegexGenerator* generator, Jump jump) + { + m_backtrack.jumpToBacktrack(generator, jump); + } + + void jumpToBacktrack(RegexGenerator* generator, JumpList& jumps) + { + m_backtrack.jumpToBacktrack(generator, jumps); + } + + bool plantJumpToBacktrackIfExists(RegexGenerator* generator) + { + return m_backtrack.plantJumpToBacktrackIfExists(generator); + } + + bool linkDataLabelToBacktrackIfExists(RegexGenerator* generator) + { + if ((m_backtrack.isLabel()) && (m_backtrack.hasDataLabel())) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_backtrack.getDataLabel(), m_backtrack.getLabel())); + m_backtrack.clearDataLabel(); + return true; + } + + return false; + } + + void addBacktrackJump(Jump jump) + { + m_backtrack.addBacktrackJump(jump); + } + + void setBacktrackDataLabel(DataLabelPtr dp) + { + m_backtrack.setDataLabel(dp); + } + + void setBackTrackStackOffset(int32_t stackOffset) + { + m_backtrack.setStackOffset(stackOffset); + } + + void setBacktrackLabel(Label label) + { + m_backtrack.setLabel(label); + } + + void linkAlternativeBacktracks(RegexGenerator* generator, bool nextIteration = false) + { + m_backtrack.linkAlternativeBacktracks(generator, nextIteration); + m_linkedBacktrack = 0; + } + + void linkAlternativeBacktracksTo(RegexGenerator* generator, Label label, bool nextIteration = false) + { + m_backtrack.linkAlternativeBacktracksTo(generator, label, nextIteration); + } + + void setBacktrackLink(BacktrackDestination* linkedBacktrack) + { + m_linkedBacktrack = linkedBacktrack; + } + + void chainBacktracks(BacktrackDestination* followonBacktrack) + { + if (m_linkedBacktrack) + m_linkedBacktrack->linkToNextBacktrack(followonBacktrack); + } + + void chainBacktrackJumps(JumpList* jumpList) + { + if (m_linkedBacktrack && !(m_linkedBacktrack->hasDestination())) + m_linkedBacktrack->setBacktrackJumpList(jumpList); + } + + BacktrackDestination& getBacktrackDestination() + { + return m_backtrack; + } + + void propagateBacktrackingFrom(RegexGenerator* generator, BacktrackDestination& backtrack, bool doJump = true) + { + if (doJump) + m_backtrack.jumpToBacktrack(generator, backtrack.getBacktrackJumps()); + if (backtrack.hasDestination()) { + if (m_backtrack.hasDataLabel()) + generator->m_expressionState.addDataLabelToNextIteration(m_backtrack.getDataLabel()); + + m_backtrack.copyTarget(backtrack, doJump); + } + } + + PatternDisjunction* disjunction; + int checkedTotal; + private: + unsigned alt; + unsigned t; + unsigned m_subParenNum; + BacktrackDestination m_backtrack; + BacktrackDestination* m_linkedBacktrack; + ParenthesesTail* m_parenthesesTail; + + }; + + struct ParenthesesTail { + ParenthesesTail(PatternTerm& term, int nestingLevel, ParenthesesTail* nextOuterParenTail) + : m_term(term) + , m_nestingLevel(nestingLevel) + , m_subParenIndex(0) + , m_nextOuterParenTail(nextOuterParenTail) + { + } + + void processBacktracks(RegexGenerator* generator, TermGenerationState& state, TermGenerationState& parenthesesState, Label nonGreedyTryParentheses, Label fallThrough) + { + m_nonGreedyTryParentheses = nonGreedyTryParentheses; + m_fallThrough = fallThrough; + + m_subParenIndex = state.getSubParenNum(); + parenthesesState.getBacktrackDestination().copyTo(m_parenBacktrack); + state.chainBacktracks(&m_backtrack); + BacktrackDestination& stateBacktrack = state.getBacktrackDestination(); + stateBacktrack.copyTo(m_backtrack); + stateBacktrack.setBacktrackToLabel(&m_backtrackToLabel); + state.setBacktrackLink(&m_backtrack); + stateBacktrack.setSubDataLabelPtr(&m_dataAfterLabelPtr); + + m_doDirectBacktrack = m_parenBacktrack.hasDestination(); + + if ((m_term.quantityType == QuantifierGreedy) || (m_term.quantityType == QuantifierNonGreedy)) + m_doDirectBacktrack = false; + + if (m_doDirectBacktrack) + state.propagateBacktrackingFrom(generator, m_parenBacktrack, false); + else { + stateBacktrack.setBacktrackJumpList(&m_pattBacktrackJumps); + stateBacktrack.setBacktrackSourceLabel(&m_backtrackFromAfterParens); + } + + parenthesesState.chainBacktrackJumps(&m_pattBacktrackJumps); + } + + void setNextIteration(Label nextIteration) + { + if (!m_nestingLevel && !m_backtrackToLabel.isSet()) + m_backtrackToLabel = nextIteration; + } + + void addAfterParenJump(Jump jump) + { + m_pattBacktrackJumps.append(jump); + } + + bool generateCode(RegexGenerator* generator, JumpList& jumpsToNext, bool priorBackTrackFallThrough, bool nextBacktrackFallThrough) + { + const RegisterID indexTemporary = regT0; + unsigned parenthesesFrameLocation = m_term.frameLocation; + Jump fromPriorBacktrack; + bool needJumpForPriorParenTail = false; + + if (priorBackTrackFallThrough + && ((m_term.quantityType == QuantifierGreedy) + || (m_term.quantityType == QuantifierNonGreedy) + || (!m_doDirectBacktrack && m_parenBacktrack.hasDestination()))) { + // If the prior paren tail code assumed that it could fall through, + // but we need to generate after paren backtrack code, then provide + // a jump around that code for the prior paren tail code. + // A regular expressing like ((xxx)...)? needs this. + fromPriorBacktrack = generator->jump(); + needJumpForPriorParenTail = true; + } + + if (!m_backtrack.hasDestination()) { + if (m_backtrackToLabel.isSet()) { + m_backtrack.setLabel(m_backtrackToLabel); + nextBacktrackFallThrough = false; + } else if (!m_subParenIndex && m_nextOuterParenTail) { + // If we don't have a destination and we are the first term of a nested paren, go + // back to the outer paren. + // There is an optimization if the next outer paren is the next paren to be emitted. + // In that case we really want the else clause. + m_backtrack.setBacktrackJumpList(&m_nextOuterParenTail->m_withinBacktrackJumps); + nextBacktrackFallThrough = false; + } else + m_backtrack.setBacktrackJumpList(&jumpsToNext); + } else + nextBacktrackFallThrough = false; + + // A failure AFTER the parens jumps here - Backtrack to this paren + m_backtrackFromAfterParens = generator->label(); + + if (m_dataAfterLabelPtr.isSet()) + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataAfterLabelPtr, m_backtrackFromAfterParens)); + + m_pattBacktrackJumps.link(generator); + + if (m_term.quantityType == QuantifierGreedy) { + // If this is -1 we have now tested with both with and without the parens. + generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); + m_backtrack.jumpToBacktrack(generator, generator->branch32(Equal, indexTemporary, Imm32(-1))); + } else if (m_term.quantityType == QuantifierNonGreedy) { + // If this is -1 we have now tested with both with and without the parens. + generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); + generator->branch32(Equal, indexTemporary, Imm32(-1)).linkTo(m_nonGreedyTryParentheses, generator); + } + + if (!m_doDirectBacktrack) + m_parenBacktrack.plantJumpToBacktrackIfExists(generator); + + // A failure WITHIN the parens jumps here + if (needJumpForPriorParenTail) + fromPriorBacktrack.link(generator); + m_parenBacktrack.linkAlternativeBacktracks(generator); + m_withinBacktrackJumps.link(generator); + + if (m_term.capture()) + generator->store32(Imm32(-1), Address(output, (m_term.parentheses.subpatternId << 1) * sizeof(int))); + + if (m_term.quantityType == QuantifierGreedy) { + generator->storeToFrame(Imm32(-1), parenthesesFrameLocation); + generator->jump().linkTo(m_fallThrough, generator); + nextBacktrackFallThrough = false; + } else if (!nextBacktrackFallThrough) + m_backtrack.jumpToBacktrack(generator); + + if (!m_doDirectBacktrack) + m_backtrack.setNextBacktrackLabel(m_backtrackFromAfterParens); + + return nextBacktrackFallThrough; + } + + PatternTerm& m_term; + int m_nestingLevel; + unsigned m_subParenIndex; + ParenthesesTail* m_nextOuterParenTail; + Label m_nonGreedyTryParentheses; + Label m_fallThrough; + Label m_backtrackToLabel; + Label m_backtrackFromAfterParens; + DataLabelPtr m_dataAfterLabelPtr; + JumpList m_pattBacktrackJumps; + JumpList m_withinBacktrackJumps; + BacktrackDestination m_parenBacktrack; + BacktrackDestination m_backtrack; + bool m_doDirectBacktrack; + }; + + void generateAssertionBOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (!term.inputPosition) + matchDest.append(branch32(Equal, index, Imm32(state.checkedTotal))); + + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(this); + + matchDest.link(this); + } else { + // Erk, really should poison out these alternatives early. :-/ + if (term.inputPosition) + state.jumpToBacktrack(this); + else + state.jumpToBacktrack(this, branch32(NotEqual, index, Imm32(state.checkedTotal))); + } + } + + void generateAssertionEOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (term.inputPosition == state.checkedTotal) + matchDest.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(this); + + matchDest.link(this); + } else { + if (term.inputPosition == state.checkedTotal) + state.jumpToBacktrack(this, notAtEndOfInput()); + // Erk, really should poison out these alternatives early. :-/ + else + state.jumpToBacktrack(this); + } + } + + // Also falls though on nextIsNotWordChar. + void matchAssertionWordchar(TermGenerationState& state, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + if (term.inputPosition == state.checkedTotal) + nextIsNotWordChar.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + } + + void generateAssertionWordBoundary(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + Jump atBegin; + JumpList matchDest; + if (!term.inputPosition) + atBegin = branch32(Equal, index, Imm32(state.checkedTotal)); + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + if (!term.inputPosition) + atBegin.link(this); + + // We fall through to here if the last character was not a wordchar. + JumpList nonWordCharThenWordChar; + JumpList nonWordCharThenNonWordChar; + if (term.invert()) { + matchAssertionWordchar(state, nonWordCharThenNonWordChar, nonWordCharThenWordChar); + nonWordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, nonWordCharThenWordChar, nonWordCharThenNonWordChar); + nonWordCharThenNonWordChar.append(jump()); + } + state.jumpToBacktrack(this, nonWordCharThenNonWordChar); + + // We jump here if the last character was a wordchar. + matchDest.link(this); + JumpList wordCharThenWordChar; + JumpList wordCharThenNonWordChar; + if (term.invert()) { + matchAssertionWordchar(state, wordCharThenNonWordChar, wordCharThenWordChar); + wordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, wordCharThenWordChar, wordCharThenNonWordChar); + // This can fall-though! + } + + state.jumpToBacktrack(this, wordCharThenWordChar); + + nonWordCharThenWordChar.link(this); + wordCharThenNonWordChar.link(this); + } + + void generatePatternCharacterSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch = state.term().patternCharacter; + + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(this, jumpIfCharNotEquals(ch, state.inputOffset())); + } + } + + void generatePatternCharacterPair(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch1 = state.term().patternCharacter; + UChar ch2 = state.lookaheadTerm().patternCharacter; + + int mask = 0; + int chPair = ch1 | (ch2 << 16); + + if (m_pattern.m_ignoreCase) { + if (isASCIIAlpha(ch1)) + mask |= 32; + if (isASCIIAlpha(ch2)) + mask |= 32 << 16; + } + + if (mask) { + load32WithUnalignedHalfWords(BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), character); + or32(Imm32(mask), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(chPair | mask))); + } else + state.jumpToBacktrack(this, branch32WithUnalignedHalfWords(NotEqual, BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), Imm32(chPair))); + } + + void generatePatternCharacterFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + or32(Imm32(32), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(this, branch16(NotEqual, BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), Imm32(ch))); + } + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generatePatternCharacterGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + failures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + failures.append(jumpIfCharNotEquals(ch, state.inputOffset())); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + if (term.quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generatePatternCharacterNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + if (term.quantityCount != quantifyInfinite) + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + branch32(NotEqual, character, Imm32(Unicode::toLower(ch))).linkTo(hardFail, this); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + jumpIfCharNotEquals(ch, state.inputOffset()).linkTo(hardFail, this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateCharacterClassSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + state.jumpToBacktrack(this, matchDest); + else { + state.jumpToBacktrack(this); + matchDest.link(this); + } + } + + void generateCharacterClassFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + JumpList matchDest; + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + state.jumpToBacktrack(this, matchDest); + else { + state.jumpToBacktrack(this); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generateCharacterClassGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + + if (term.invert()) { + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, failures, term.characterClass); + } else { + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + failures.append(jump()); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + if (term.quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateCharacterClassNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + matchDest.linkTo(hardFail, this); + else { + jump(hardFail); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateParenthesesDisjunction(PatternTerm& parenthesesTerm, TermGenerationState& state, unsigned alternativeFrameLocation) + { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern) || (parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion)); + ASSERT(parenthesesTerm.quantityCount == 1); + + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + unsigned preCheckedCount = ((parenthesesTerm.quantityType == QuantifierFixedCount) && (parenthesesTerm.type != PatternTerm::TypeParentheticalAssertion)) ? disjunction->m_minimumSize : 0; + + if (disjunction->m_alternatives.size() == 1) { + state.resetAlternative(); + ASSERT(state.alternativeValid()); + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion) || (parenthesesTerm.quantityType != QuantifierFixedCount)); + + // FIXME: This is quite horrible. The call to 'plantJumpToBacktrackIfExists' + // will be forced to always trampoline into here, just to decrement the index. + // Ick. + Jump skip = jump(); + + Label backtrackBegin(this); + sub32(Imm32(countToCheck), index); + state.addBacktrackJump(jump()); + + skip.link(this); + + state.setBacktrackLabel(backtrackBegin); + + state.jumpToBacktrack(this, jumpIfNoAvailableInput(countToCheck)); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + state.checkedTotal -= countToCheck; + } else { + JumpList successes; + bool propogateBacktrack = false; + + for (state.resetAlternative(); state.alternativeValid(); state.nextAlternative()) { + + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + ASSERT(alternative->m_minimumSize >= preCheckedCount); + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + state.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // Matched an alternative. + DataLabelPtr dataLabel = storeToFrameWithPatch(alternativeFrameLocation); + + if (!state.isLastAlternative() || countToCheck) + successes.append(jump()); + + // Alternative did not match. + + state.setBacktrackDataLabel(dataLabel); + + // Do we have a backtrack destination? + // if so, link the data label to it. + state.linkDataLabelToBacktrackIfExists(this); + + if (!state.isLastAlternative() || countToCheck) + state.linkAlternativeBacktracks(this); + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + state.checkedTotal -= countToCheck; + } else if (state.isLastAlternative()) + propogateBacktrack = true; + } + // We fall through to here when the last alternative fails. + // Add a backtrack out of here for the parenthese handling code to link up. + if (!propogateBacktrack) + state.addBacktrackJump(jump()); + + // Save address on stack for the parens code to backtrack to, to retry the + // next alternative. + state.setBackTrackStackOffset(alternativeFrameLocation * sizeof(void*)); + + successes.link(this); + } + } + + void generateParenthesesSingle(TermGenerationState& state) + { + const RegisterID indexTemporary = regT0; + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + + unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0; + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term.quantityType != QuantifierFixedCount) + alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; + + // optimized case - no capture & no quantifier can be handled in a light-weight manner. + if (!term.capture() && (term.quantityType == QuantifierFixedCount)) { + m_expressionState.incrementParenNestingLevel(); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // this expects that any backtracks back out of the parentheses will be in the + // parenthesesState's m_backTrackJumps vector, and that if they need backtracking + // they will have set an entry point on the parenthesesState's m_backtrackLabel. + BacktrackDestination& parenthesesBacktrack = parenthesesState.getBacktrackDestination(); + state.propagateBacktrackingFrom(this, parenthesesBacktrack); + state.getBacktrackDestination().copyBacktrackToLabel(parenthesesBacktrack); + + m_expressionState.decrementParenNestingLevel(); + } else { + Jump nonGreedySkipParentheses; + Label nonGreedyTryParentheses; + if (term.quantityType == QuantifierGreedy) + storeToFrame(index, parenthesesFrameLocation); + else if (term.quantityType == QuantifierNonGreedy) { + storeToFrame(Imm32(-1), parenthesesFrameLocation); + nonGreedySkipParentheses = jump(); + nonGreedyTryParentheses = label(); + storeToFrame(index, parenthesesFrameLocation); + } + + // store the match start index + if (term.capture()) { + int inputOffset = state.inputOffset() - preCheckedCount; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + store32(indexTemporary, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } else + store32(index, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } + + ParenthesesTail* parenthesesTail = m_expressionState.addParenthesesTail(term, state.getParenthesesTail()); + + m_expressionState.incrementParenNestingLevel(); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + + // Save the parenthesesTail for backtracking from nested parens to this one. + parenthesesState.setParenthesesTail(parenthesesTail); + + // generate the body of the parentheses + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + + // For non-fixed counts, backtrack if we didn't match anything. + if (term.quantityType != QuantifierFixedCount) + parenthesesTail->addAfterParenJump(branch32(Equal, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*))))); + + // store the match end index + if (term.capture()) { + int inputOffset = state.inputOffset(); + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(state.inputOffset()), indexTemporary); + store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } else + store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } + + m_expressionState.decrementParenNestingLevel(); + + parenthesesTail->processBacktracks(this, state, parenthesesState, nonGreedyTryParentheses, label()); + + parenthesesState.getBacktrackDestination().clear(); + + if (term.quantityType == QuantifierNonGreedy) + nonGreedySkipParentheses.link(this); + } + } + + void generateParenthesesGreedyNoBacktrack(TermGenerationState& state) + { + PatternTerm& parenthesesTerm = state.term(); + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern); + ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle. + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + + Label matchAgain(this); + + storeToFrame(index, parenthesesTerm.frameLocation); // Save the current index to check for zero len matches later. + + for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) { + + PatternAlternative* alternative = parenthesesState.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize; + if (countToCheck) { + parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + parenthesesState.checkedTotal += countToCheck; + } + + for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm()) + generateTerm(parenthesesState); + + // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet. + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); + + // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack, + // or fall through to try the next alternative if no backtrack is available. + parenthesesState.plantJumpToBacktrackIfExists(this); + + parenthesesState.linkAlternativeBacktracks(this); + + // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop. + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + parenthesesState.checkedTotal -= countToCheck; + } + } + + // If the last alternative falls through to here, we have a failed match... + // Which means that we match whatever we have matched up to this point (even if nothing). + } + + void generateParentheticalAssertion(TermGenerationState& state) + { + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + ASSERT(term.quantityType == QuantifierFixedCount); + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + + int countCheckedAfterAssertion = state.checkedTotal - term.inputPosition; + + if (term.invert()) { + // Inverted case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Fail! + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(this); + + // And fail means success. + parenthesesState.linkAlternativeBacktracks(this); + + loadFromFrame(parenthesesFrameLocation, index); + + state.checkedTotal += countCheckedAfterAssertion; + } else { + // Normal case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Success! + loadFromFrame(parenthesesFrameLocation, index); + Jump success = jump(); + + parenthesesState.linkAlternativeBacktracks(this); + + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(this); + + success.link(this); + + state.checkedTotal += countCheckedAfterAssertion; + } + } + + void generateTerm(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + generateAssertionBOL(state); + break; + + case PatternTerm::TypeAssertionEOL: + generateAssertionEOL(state); + break; + + case PatternTerm::TypeAssertionWordBoundary: + generateAssertionWordBoundary(state); + break; + + case PatternTerm::TypePatternCharacter: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) { + if (state.isSinglePatternCharacterLookaheadTerm() && (state.lookaheadTerm().inputPosition == (term.inputPosition + 1))) { + generatePatternCharacterPair(state); + state.nextTerm(); + } else + generatePatternCharacterSingle(state); + } else + generatePatternCharacterFixed(state); + break; + case QuantifierGreedy: + generatePatternCharacterGreedy(state); + break; + case QuantifierNonGreedy: + generatePatternCharacterNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) + generateCharacterClassSingle(state); + else + generateCharacterClassFixed(state); + break; + case QuantifierGreedy: + generateCharacterClassGreedy(state); + break; + case QuantifierNonGreedy: + generateCharacterClassNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityCount == 1 && !term.parentheses.isCopy) + generateParenthesesSingle(state); + else if (term.parentheses.isTerminal) + generateParenthesesGreedyNoBacktrack(state); + else + m_shouldFallBack = true; + break; + + case PatternTerm::TypeParentheticalAssertion: + generateParentheticalAssertion(state); + break; + } + } + + void generateDisjunction(PatternDisjunction* disjunction) + { + TermGenerationState state(disjunction, 0); + state.resetAlternative(); + + // check availability for the next alternative + int countCheckedForCurrentAlternative = 0; + int countToCheckForFirstAlternative = 0; + bool hasShorterAlternatives = false; + bool setRepeatAlternativeLabels = false; + JumpList notEnoughInputForPreviousAlternative; + Label firstAlternative; + Label firstAlternativeInputChecked; + + // The label 'firstAlternative' is used to plant a check to see if there is + // sufficient input available to run the first repeating alternative. + // The label 'firstAlternativeInputChecked' will jump directly to matching + // the first repeating alternative having skipped this check. + + if (state.alternativeValid()) { + PatternAlternative* alternative = state.alternative(); + if (!alternative->onceThrough()) { + firstAlternative = Label(this); + setRepeatAlternativeLabels = true; + } + countToCheckForFirstAlternative = alternative->m_minimumSize; + state.checkedTotal += countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + } + + if (setRepeatAlternativeLabels) + firstAlternativeInputChecked = Label(this); + + while (state.alternativeValid()) { + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + // Track whether any alternatives are shorter than the first one. + if (!alternative->onceThrough()) + hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // If we get here, the alternative matched. + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + ASSERT(index != returnRegister); + if (m_pattern.m_body->m_hasFixedSize) { + move(index, returnRegister); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize), returnRegister); + + store32(returnRegister, output); + } else + load32(Address(output), returnRegister); + + store32(index, Address(output, 4)); + + generateReturn(); + + state.nextAlternative(); + if (alternative->onceThrough() && state.alternativeValid()) + state.clearBacktrack(); + + // if there are any more alternatives, plant the check for input before looping. + if (state.alternativeValid()) { + PatternAlternative* nextAlternative = state.alternative(); + if (!setRepeatAlternativeLabels && !nextAlternative->onceThrough()) { + // We have handled non-repeating alternatives, jump to next iteration + // and loop over repeating alternatives. + state.jumpToBacktrack(this); + + countToCheckForFirstAlternative = nextAlternative->m_minimumSize; + + // If we get here, there the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + state.linkAlternativeBacktracks(this); + + // Back up to start the looping alternatives. + if (countCheckedForCurrentAlternative) + sub32(Imm32(countCheckedForCurrentAlternative), index); + + firstAlternative = Label(this); + + state.checkedTotal = countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + + firstAlternativeInputChecked = Label(this); + + setRepeatAlternativeLabels = true; + } else { + int countToCheckForNextAlternative = nextAlternative->m_minimumSize; + + if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. + // If we get here, then the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + // Check if sufficent input available to run the next alternative + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + // We are now in the correct state to enter the next alternative; this add is only required + // to mirror and revert operation of the sub32, just below. + add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + + // If we get here, then the last input checked passed. + state.linkAlternativeBacktracks(this); + + // No need to check if we can run the next alternative, since it is shorter - + // just update index. + sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. + // If we get here, then the last input checked failed. + // If there is insufficient input to run the current alternative, and the next alternative is longer, + // then there is definitely not enough input to run it - don't even check. Just adjust index, as if + // we had checked. + notEnoughInputForPreviousAlternative.link(this); + add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); + notEnoughInputForPreviousAlternative.append(jump()); + + // The next alternative is longer than the current one; check the difference. + state.linkAlternativeBacktracks(this); + + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + } else { // CASE 3: Both alternatives are the same length. + ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); + + // If the next alterative is the same length as this one, then no need to check the input - + // if there was sufficent input to run the current alternative then there is sufficient + // input to run the next one; if not, there isn't. + state.linkAlternativeBacktracks(this); + } + state.checkedTotal -= countCheckedForCurrentAlternative; + countCheckedForCurrentAlternative = countToCheckForNextAlternative; + state.checkedTotal += countCheckedForCurrentAlternative; + } + } + } + + // If we get here, all Alternatives failed... + + state.checkedTotal -= countCheckedForCurrentAlternative; + + if (!setRepeatAlternativeLabels) { + // If there are no alternatives that need repeating (all are marked 'onceThrough') then just link + // the match failures to this point, and fall through to the return below. + state.linkAlternativeBacktracks(this, true); + + notEnoughInputForPreviousAlternative.link(this); + } else { + // How much more input need there be to be able to retry from the first alternative? + // examples: + // /yarr_jit/ or /wrec|pcre/ + // In these examples we need check for one more input before looping. + // /yarr_jit|pcre/ + // In this case we need check for 5 more input to loop (+4 to allow for the first alterative + // being four longer than the last alternative checked, and another +1 to effectively move + // the start position along by one). + // /yarr|rules/ or /wrec|notsomuch/ + // In these examples, provided that there was sufficient input to have just been matching for + // the second alternative we can loop without checking for available input (since the second + // alternative is longer than the first). In the latter example we need to decrement index + // (by 4) so the start position is only progressed by 1 from the last iteration. + int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; + + // First, deal with the cases where there was sufficient input to try the last alternative. + if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. + state.linkAlternativeBacktracks(this, true); + else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! + state.linkAlternativeBacktracksTo(this, firstAlternativeInputChecked, true); + else { // no need to check the input, but we do have some bookkeeping to do first. + state.linkAlternativeBacktracks(this, true); + + // Where necessary update our preserved start position. + if (!m_pattern.m_body->m_hasFixedSize) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } + + // Update index if necessary, and loop (without checking). + if (incrementForNextIter) + add32(Imm32(incrementForNextIter), index); + jump().linkTo(firstAlternativeInputChecked, this); + } + + notEnoughInputForPreviousAlternative.link(this); + // Update our idea of the start position, if we're tracking this. + if (!m_pattern.m_body->m_hasFixedSize) { + if (countCheckedForCurrentAlternative - 1) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } else + store32(index, Address(output)); + } + + // Check if there is sufficent input to run the first alternative again. + jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); + // No - insufficent input to run the first alteranative, are there any other alternatives we + // might need to check? If so, the last check will have left the index incremented by + // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative + // LESS input is available, to have the effect of just progressing the start position by 1 + // from the last iteration. If this check passes we can just jump up to the check associated + // with the first alternative in the loop. This is a bit sad, since we'll end up trying the + // first alternative again, and this check will fail (otherwise the check planted just above + // here would have passed). This is a bit sad, however it saves trying to do something more + // complex here in compilation, and in the common case we should end up coallescing the checks. + // + // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least + // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150, + // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there + // is sufficient input to run either alternative (constantly failing). If there had been only + // one alternative, or if the shorter alternative had come first, we would have terminated + // immediately. :-/ + if (hasShorterAlternatives) + jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); + // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, + // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... + // but since we're about to return a failure this doesn't really matter!) + } + + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + move(Imm32(-1), returnRegister); + + generateReturn(); + + m_expressionState.emitParenthesesTail(this); + m_expressionState.emitIndirectJumpTable(this); + m_expressionState.linkToNextIteration(this); + } + + void generateEnter() + { +#if CPU(X86_64) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + push(X86Registers::ebx); +#elif CPU(X86) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + // TODO: do we need spill registers to fill the output pointer if there are no sub captures? + push(X86Registers::ebx); + push(X86Registers::edi); + push(X86Registers::esi); + // load output into edi (2 = saved ebp + return address). + #if COMPILER(MSVC) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); + loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); + loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); + loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); + #else + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); + #endif +#elif CPU(ARM) + push(ARMRegisters::r4); + push(ARMRegisters::r5); + push(ARMRegisters::r6); +#if CPU(ARM_TRADITIONAL) + push(ARMRegisters::r8); // scratch register +#endif + move(ARMRegisters::r3, output); +#elif CPU(MIPS) + // Do nothing. +#endif + } + + void generateReturn() + { +#if CPU(X86_64) + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(X86) + pop(X86Registers::esi); + pop(X86Registers::edi); + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(ARM) +#if CPU(ARM_TRADITIONAL) + pop(ARMRegisters::r8); // scratch register +#endif + pop(ARMRegisters::r6); + pop(ARMRegisters::r5); + pop(ARMRegisters::r4); +#elif CPU(MIPS) + // Do nothing +#endif + ret(); + } + +public: + RegexGenerator(RegexPattern& pattern) + : m_pattern(pattern) + , m_shouldFallBack(false) + { + } + + void generate() + { + generateEnter(); + + if (!m_pattern.m_body->m_hasFixedSize) + store32(index, Address(output)); + + if (m_pattern.m_body->m_callFrameSize) + subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + generateDisjunction(m_pattern.m_body); + } + + void compile(JSGlobalData* globalData, RegexCodeBlock& jitObject) + { + generate(); + + LinkBuffer patchBuffer(this, globalData->regexAllocator.poolForSize(size()), 0); + + for (unsigned i = 0; i < m_expressionState.m_backtrackRecords.size(); ++i) + patchBuffer.patch(m_expressionState.m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_expressionState.m_backtrackRecords[i].backtrackLocation)); + + jitObject.set(patchBuffer.finalizeCode()); + jitObject.setFallBack(m_shouldFallBack); + } + +private: + RegexPattern& m_pattern; + bool m_shouldFallBack; + GenerationState m_expressionState; +}; + +void jitCompileRegex(RegexPattern& pattern, JSGlobalData* globalData, RegexCodeBlock& jitObject) +{ + RegexGenerator generator(pattern); + generator.compile(globalData, jitObject); +} + + +}} + +#endif diff --git a/Source/JavaScriptCore/yarr/RegexJIT.h b/Source/JavaScriptCore/yarr/RegexJIT.h new file mode 100644 index 0000000..5e3dca1 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexJIT.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexJIT_h +#define RegexJIT_h + +#if ENABLE(YARR_JIT) + +#include "MacroAssembler.h" +#include "RegexPattern.h" +#include "UString.h" + +#if CPU(X86) && !COMPILER(MSVC) +#define YARR_CALL __attribute__ ((regparm (3))) +#else +#define YARR_CALL +#endif + +namespace JSC { + +class JSGlobalData; +class ExecutablePool; + +namespace Yarr { + +class RegexCodeBlock { + typedef int (*RegexJITCode)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + +public: + RegexCodeBlock() + : m_needFallBack(false) + { + } + + ~RegexCodeBlock() + { + } + + void setFallBack(bool fallback) { m_needFallBack = fallback; } + bool isFallBack() { return m_needFallBack; } + void set(MacroAssembler::CodeRef ref) { m_ref = ref; } + + int execute(const UChar* input, unsigned start, unsigned length, int* output) + { + return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output); + } + +#if ENABLE(REGEXP_TRACING) + void *getAddr() { return m_ref.m_code.executableAddress(); } +#endif + +private: + MacroAssembler::CodeRef m_ref; + bool m_needFallBack; +}; + +void jitCompileRegex(RegexPattern& pattern, JSGlobalData* globalData, RegexCodeBlock& jitObject); + +inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) +{ + return jitObject.execute(input, start, length, output); +} + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexJIT_h diff --git a/Source/JavaScriptCore/yarr/RegexParser.h b/Source/JavaScriptCore/yarr/RegexParser.h new file mode 100644 index 0000000..ec5f589 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexParser.h @@ -0,0 +1,887 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexParser_h +#define RegexParser_h + +#include "UString.h" +#include <limits.h> +#include <wtf/ASCIICType.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +static const unsigned quantifyInfinite = UINT_MAX; + +enum BuiltInCharacterClassID { + DigitClassID, + SpaceClassID, + WordClassID, + NewlineClassID, +}; + +// The Parser class should not be used directly - only via the Yarr::parse() method. +template<class Delegate> +class Parser { +private: + template<class FriendDelegate> + friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + NumberOfErrorCodes + }; + + /* + * CharacterClassParserDelegate: + * + * The class CharacterClassParserDelegate is used in the parsing of character + * classes. This class handles detection of character ranges. This class + * implements enough of the delegate interface such that it can be passed to + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused + * to perform the parsing of escape characters in character sets. + */ + class CharacterClassParserDelegate { + public: + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + : m_delegate(delegate) + , m_err(err) + , m_state(Empty) + { + } + + /* + * begin(): + * + * Called at beginning of construction. + */ + void begin(bool invert) + { + m_delegate.atomCharacterClassBegin(invert); + } + + /* + * atomPatternCharacter(): + * + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). + */ + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + { + switch (m_state) { + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: + m_character = ch; + m_state = CachedCharacter; + return; + + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; + else { + m_delegate.atomCharacterClassAtom(m_character); + m_character = ch; + } + return; + + case CachedCharacterHyphen: + if (ch < m_character) { + m_err = CharacterClassOutOfOrder; + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; + + // See coment in atomBuiltInCharacterClass below. + // This too is technically an error, per ECMA-262, and again we + // we chose to allow this. Note a subtlely here that while we + // diverge from the spec's definition of CharacterRange we do + // remain in compliance with the grammar. For example, consider + // the expression /[\d-a-z]/. We comply with the grammar in + // this case by not allowing a-z to be matched as a range. + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassAtom(ch); + m_state = Empty; + return; + } + } + + /* + * atomBuiltInCharacterClass(): + * + * Adds a built-in character class, called by parseEscape(). + */ + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + // If we hit either of these cases, we have an invalid range that + // looks something like /[x-\d]/ or /[\d-\d]/. + // According to ECMA-262 this should be a syntax error, but + // empirical testing shows this to break teh webz. Instead we + // comply with to the ECMA-262 grammar, and assume the grammar to + // have matched the range correctly, but tweak our interpretation + // of CharacterRange. Effectively we implicitly handle the hyphen + // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. + case CachedCharacterHyphen: + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + // fall through + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassBuiltIn(classID, invert); + m_state = Empty; + return; + } + } + + /* + * end(): + * + * Called at end of construction. + */ + void end() + { + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } + m_delegate.atomCharacterClassEnd(); + } + + // parseEscape() should never call these delegate methods when + // invoked with inCharacterClass set. + void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } + void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } + + private: + Delegate& m_delegate; + ErrorCode& m_err; + enum CharacterClassConstructionState { + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, + } m_state; + UChar m_character; + }; + + Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) + : m_delegate(delegate) + , m_backReferenceLimit(backReferenceLimit) + , m_err(NoError) + , m_data(pattern.characters()) + , m_size(pattern.length()) + , m_index(0) + , m_parenthesesNestingDepth(0) + { + } + + /* + * parseEscape(): + * + * Helper for parseTokens() AND parseCharacterClass(). + * Unlike the other parser methods, this function does not report tokens + * directly to the member delegate (m_delegate), instead tokens are + * emitted to the delegate provided as an argument. In the case of atom + * escapes, parseTokens() will call parseEscape() passing m_delegate as + * an argument, and as such the escape will be reported to the delegate. + * + * However this method may also be used by parseCharacterClass(), in which + * case a CharacterClassParserDelegate will be passed as the delegate that + * tokens should be added to. A boolean flag is also provided to indicate + * whether that an escape in a CharacterClass is being parsed (some parsing + * rules change in this context). + * + * The boolean value returned by this method indicates whether the token + * parsed was an atom (outside of a characted class \b and \B will be + * interpreted as assertions). + */ + template<bool inCharacterClass, class EscapeDelegate> + bool parseEscape(EscapeDelegate& delegate) + { + ASSERT(!m_err); + ASSERT(peek() == '\\'); + consume(); + + if (atEndOfPattern()) { + m_err = EscapeUnterminated; + return false; + } + + switch (peek()) { + // Assertions + case 'b': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('\b'); + else { + delegate.assertionWordBoundary(false); + return false; + } + break; + case 'B': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('B'); + else { + delegate.assertionWordBoundary(true); + return false; + } + break; + + // CharacterClassEscape + case 'd': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, false); + break; + case 's': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, false); + break; + case 'w': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, false); + break; + case 'D': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, true); + break; + case 'S': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, true); + break; + case 'W': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, true); + break; + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. + // First, try to parse this as backreference. + if (!inCharacterClass) { + ParseState state = saveState(); + + unsigned backReference = consumeNumber(); + if (backReference <= m_backReferenceLimit) { + delegate.atomBackReference(backReference); + break; + } + + restoreState(state); + } + + // Not a backreference, and not octal. + if (peek() >= '8') { + delegate.atomPatternCharacter('\\'); + break; + } + + // Fall-through to handle this as an octal escape. + } + + // Octal escape + case '0': + delegate.atomPatternCharacter(consumeOctal()); + break; + + // ControlEscape + case 'f': + consume(); + delegate.atomPatternCharacter('\f'); + break; + case 'n': + consume(); + delegate.atomPatternCharacter('\n'); + break; + case 'r': + consume(); + delegate.atomPatternCharacter('\r'); + break; + case 't': + consume(); + delegate.atomPatternCharacter('\t'); + break; + case 'v': + consume(); + delegate.atomPatternCharacter('\v'); + break; + + // ControlLetter + case 'c': { + ParseState state = saveState(); + consume(); + if (!atEndOfPattern()) { + int control = consume(); + + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { + delegate.atomPatternCharacter(control & 0x1f); + break; + } + } + restoreState(state); + delegate.atomPatternCharacter('\\'); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = tryConsumeHex(2); + if (x == -1) + delegate.atomPatternCharacter('x'); + else + delegate.atomPatternCharacter(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int u = tryConsumeHex(4); + if (u == -1) + delegate.atomPatternCharacter('u'); + else + delegate.atomPatternCharacter(u); + break; + } + + // IdentityEscape + default: + delegate.atomPatternCharacter(consume()); + } + + return true; + } + + /* + * parseAtomEscape(), parseCharacterClassEscape(): + * + * These methods alias to parseEscape(). + */ + bool parseAtomEscape() + { + return parseEscape<false>(m_delegate); + } + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) + { + parseEscape<true>(delegate); + } + + /* + * parseCharacterClass(): + * + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) + * to an instance of CharacterClassParserDelegate, to describe the character class to the + * delegate. + */ + void parseCharacterClass() + { + ASSERT(!m_err); + ASSERT(peek() == '['); + consume(); + + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + + characterClassConstructor.begin(tryConsume('^')); + + while (!atEndOfPattern()) { + switch (peek()) { + case ']': + consume(); + characterClassConstructor.end(); + return; + + case '\\': + parseCharacterClassEscape(characterClassConstructor); + break; + + default: + characterClassConstructor.atomPatternCharacter(consume(), true); + } + + if (m_err) + return; + } + + m_err = CharacterClassUnmatched; + } + + /* + * parseParenthesesBegin(): + * + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. + */ + void parseParenthesesBegin() + { + ASSERT(!m_err); + ASSERT(peek() == '('); + consume(); + + if (tryConsume('?')) { + if (atEndOfPattern()) { + m_err = ParenthesesTypeInvalid; + return; + } + + switch (consume()) { + case ':': + m_delegate.atomParenthesesSubpatternBegin(false); + break; + + case '=': + m_delegate.atomParentheticalAssertionBegin(); + break; + + case '!': + m_delegate.atomParentheticalAssertionBegin(true); + break; + + default: + m_err = ParenthesesTypeInvalid; + } + } else + m_delegate.atomParenthesesSubpatternBegin(); + + ++m_parenthesesNestingDepth; + } + + /* + * parseParenthesesEnd(): + * + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). + */ + void parseParenthesesEnd() + { + ASSERT(!m_err); + ASSERT(peek() == ')'); + consume(); + + if (m_parenthesesNestingDepth > 0) + m_delegate.atomParenthesesEnd(); + else + m_err = ParenthesesUnmatched; + + --m_parenthesesNestingDepth; + } + + /* + * parseQuantifier(): + * + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. + */ + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) + { + ASSERT(!m_err); + ASSERT(min <= max); + + if (lastTokenWasAnAtom) + m_delegate.quantifyAtom(min, max, !tryConsume('?')); + else + m_err = QuantifierWithoutAtom; + } + + /* + * parseTokens(): + * + * This method loops over the input pattern reporting tokens to the delegate. + * The method returns when a parse error is detected, or the end of the pattern + * is reached. One piece of state is tracked around the loop, which is whether + * the last token passed to the delegate was an atom (this is necessary to detect + * a parse error when a quantifier provided without an atom to quantify). + */ + void parseTokens() + { + bool lastTokenWasAnAtom = false; + + while (!atEndOfPattern()) { + switch (peek()) { + case '|': + consume(); + m_delegate.disjunction(); + lastTokenWasAnAtom = false; + break; + + case '(': + parseParenthesesBegin(); + lastTokenWasAnAtom = false; + break; + + case ')': + parseParenthesesEnd(); + lastTokenWasAnAtom = true; + break; + + case '^': + consume(); + m_delegate.assertionBOL(); + lastTokenWasAnAtom = false; + break; + + case '$': + consume(); + m_delegate.assertionEOL(); + lastTokenWasAnAtom = false; + break; + + case '.': + consume(); + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + lastTokenWasAnAtom = true; + break; + + case '[': + parseCharacterClass(); + lastTokenWasAnAtom = true; + break; + + case '\\': + lastTokenWasAnAtom = parseAtomEscape(); + break; + + case '*': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '+': + consume(); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '?': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, 1); + lastTokenWasAnAtom = false; + break; + + case '{': { + ParseState state = saveState(); + + consume(); + if (peekIsDigit()) { + unsigned min = consumeNumber(); + unsigned max = min; + + if (tryConsume(',')) + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; + + if (tryConsume('}')) { + if (min <= max) + parseQuantifier(lastTokenWasAnAtom, min, max); + else + m_err = QuantifierOutOfOrder; + lastTokenWasAnAtom = false; + break; + } + } + + restoreState(state); + } // if we did not find a complete quantifer, fall through to the default case. + + default: + m_delegate.atomPatternCharacter(consume()); + lastTokenWasAnAtom = true; + } + + if (m_err) + return; + } + + if (m_parenthesesNestingDepth > 0) + m_err = MissingParentheses; + } + + /* + * parse(): + * + * This method calls regexBegin(), calls parseTokens() to parse over the input + * patterns, calls regexEnd() or regexError() as appropriate, and converts any + * error code to a const char* for a result. + */ + const char* parse() + { + m_delegate.regexBegin(); + + if (m_size > MAX_PATTERN_SIZE) + m_err = PatternTooLarge; + else + parseTokens(); + ASSERT(atEndOfPattern() || m_err); + + if (m_err) + m_delegate.regexError(); + else + m_delegate.regexEnd(); + + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + 0, // NoError + "regular expression too large", + "numbers out of order in {} quantifier", + "nothing to repeat", + "missing )", + "unmatched parentheses", + "unrecognized character after (?", + "missing terminating ] for character class", + "range out of order in character class", + "\\ at end of pattern" + }; + + return errorMessages[m_err]; + } + + + // Misc helper functions: + + typedef unsigned ParseState; + + ParseState saveState() + { + return m_index; + } + + void restoreState(ParseState state) + { + m_index = state; + } + + bool atEndOfPattern() + { + ASSERT(m_index <= m_size); + return m_index == m_size; + } + + int peek() + { + ASSERT(m_index < m_size); + return m_data[m_index]; + } + + bool peekIsDigit() + { + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + int consume() + { + ASSERT(m_index < m_size); + return m_data[m_index++]; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + unsigned n = consumeDigit(); + // check for overflow. + for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { + n = newValue; + consume(); + } + return n; + } + + unsigned consumeOctal() + { + ASSERT(WTF::isASCIIOctalDigit(peek())); + + unsigned n = consumeDigit(); + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + consumeDigit(); + return n; + } + + bool tryConsume(UChar ch) + { + if (atEndOfPattern() || (m_data[m_index] != ch)) + return false; + ++m_index; + return true; + } + + int tryConsumeHex(int count) + { + ParseState state = saveState(); + + int n = 0; + while (count--) { + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { + restoreState(state); + return -1; + } + n = (n << 4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + Delegate& m_delegate; + unsigned m_backReferenceLimit; + ErrorCode m_err; + const UChar* m_data; + unsigned m_size; + unsigned m_index; + unsigned m_parenthesesNestingDepth; + + // Derived by empirical testing of compile time in PCRE and WREC. + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; +}; + +/* + * Yarr::parse(): + * + * The parse method is passed a pattern to be parsed and a delegate upon which + * callbacks will be made to record the parsed tokens forming the regex. + * Yarr::parse() returns null on success, or a const C string providing an error + * message where a parse error occurs. + * + * The Delegate must implement the following interface: + * + * void assertionBOL(); + * void assertionEOL(); + * void assertionWordBoundary(bool invert); + * + * void atomPatternCharacter(UChar ch); + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); + * void atomCharacterClassBegin(bool invert) + * void atomCharacterClassAtom(UChar ch) + * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + * void atomCharacterClassEnd() + * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParentheticalAssertionBegin(bool invert = false); + * void atomParenthesesEnd(); + * void atomBackReference(unsigned subpatternId); + * + * void quantifyAtom(unsigned min, unsigned max, bool greedy); + * + * void disjunction(); + * + * void regexBegin(); + * void regexEnd(); + * void regexError(); + * + * Before any call recording tokens are made, regexBegin() will be called on the + * delegate once. Once parsing is complete either regexEnd() or regexError() will + * be called, as appropriate. + * + * The regular expression is described by a sequence of assertion*() and atom*() + * callbacks to the delegate, describing the terms in the regular expression. + * Following an atom a quantifyAtom() call may occur to indicate that the previous + * atom should be quantified. In the case of atoms described across multiple + * calls (parentheses and character classes) the call to quantifyAtom() will come + * after the call to the atom*End() method, never after atom*Begin(). + * + * Character classes may either be described by a single call to + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. + * In the latter case, ...Begin() will be called, followed by a sequence of + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). + * + * Sequences of atoms and assertions are broken into alternatives via calls to + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular + * capturing subpattern, this will be the subpatternId associated with these + * parentheses, and will also by definition be the lowest subpatternId of these + * parentheses and of any nested paretheses. The atomParenthesesEnd() method + * is passed the subpatternId of the last capturing subexpression nested within + * these paretheses. In the case of a capturing subpattern with no nested + * capturing subpatterns, the same subpatternId will be passed to the begin and + * end functions. In the case of non-capturing subpatterns the subpatternId + * passed to the begin method is also the first possible subpatternId that might + * be nested within these paretheses. If a set of non-capturing parentheses does + * not contain any capturing subpatterns, then the subpatternId passed to begin + * will be greater than the subpatternId passed to end. + */ + +template<class Delegate> +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) +{ + return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); +} + +} } // namespace JSC::Yarr + +#endif // RegexParser_h diff --git a/Source/JavaScriptCore/yarr/RegexPattern.cpp b/Source/JavaScriptCore/yarr/RegexPattern.cpp new file mode 100644 index 0000000..e737d0e --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexPattern.cpp @@ -0,0 +1,991 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "RegexInterpreter.h" +#include "RegexPattern.h" +#include <wtf/Vector.h> + +using namespace WTF; + +namespace JSC { namespace Yarr { + +#include "RegExpJitTables.h" + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { + addSorted(m_matchesUnicode, upper); + addSorted(m_matchesUnicode, lower); + } else + addSorted(m_matchesUnicode, ch); + } + } + + // returns true if this character has another case, and 'ch' is the upper case form. + static inline bool isUnicodeUpper(UChar ch) + { + return ch != Unicode::toLower(ch); + } + + // returns true if this character has another case, and 'ch' is the lower case form. + static inline bool isUnicodeLower(UChar ch) + { + return ch != Unicode::toUpper(ch); + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + uint32_t unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + while (unicodeCurr <= hi) { + // If the upper bound of the range (hi) is 0xffff, the increments to + // unicodeCurr in this loop may take it to 0x10000. This is fine + // (if so we won't re-enter the loop, since the loop condition above + // will definitely fail) - but this does mean we cannot use a UChar + // to represent unicodeCurr, we must use a 32-bit value instead. + ASSERT(unicodeCurr <= 0xffff); + + if (isUnicodeUpper(unicodeCurr)) { + UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); + UChar lowerCaseRangeEnd = lowerCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) + lowerCaseRangeEnd++; + addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); + } else if (isUnicodeLower(unicodeCurr)) { + UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); + UChar upperCaseRangeEnd = upperCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) + upperCaseRangeEnd++; + addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); + } else + ++unicodeCurr; + } + } + } + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(0); + + characterClass->m_matches.append(m_matches); + characterClass->m_ranges.append(m_ranges); + characterClass->m_matchesUnicode.append(m_matchesUnicode); + characterClass->m_rangesUnicode.append(m_rangesUnicode); + + reset(); + + return characterClass; + } + +private: + void addSorted(Vector<UChar>& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + +struct BeginCharHelper { + BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) + : m_beginChars(beginChars) + , m_isCaseInsensitive(isCaseInsensitive) + {} + + void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) + { + if (quantityType == QuantifierFixedCount && quantityCount > 1) { + // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ + beginChar.value |= beginChar.value << 16; + beginChar.mask |= beginChar.mask << 16; + addCharacter(beginChar); + } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) + // In case of characters with fixed quantifier we should check the next character as well. + linkHotTerms(beginChar, hotTerms); + else + // In case of greedy matching the next character checking is unnecessary therefore we just store + // the first character. + addCharacter(beginChar); + } + + // Merge two following BeginChars in the vector to reduce the number of character checks. + void merge(unsigned size) + { + for (unsigned i = 0; i < size; i++) { + BeginChar* curr = &m_beginChars->at(i); + BeginChar* next = &m_beginChars->at(i + 1); + + // If the current and the next size of value is different we should skip the merge process + // because the 16bit and 32bit values are unmergable. + if (curr->value <= 0xFFFF && next->value > 0xFFFF) + continue; + + unsigned diff = curr->value ^ next->value; + + curr->mask |= diff; + curr->value |= curr->mask; + + m_beginChars->remove(i + 1); + size--; + } + } + +private: + void addCharacter(BeginChar beginChar) + { + unsigned pos = 0; + unsigned range = m_beginChars->size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = m_beginChars->at(pos+index).value - beginChar.value; + if (!val) + return; + if (val < 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == m_beginChars->size()) + m_beginChars->append(beginChar); + else + m_beginChars->insert(pos, beginChar); + } + + // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. + void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) + { + for (unsigned i = 0; i < hotTerms->size(); i++) { + PatternTerm hotTerm = hotTerms->at(i).term; + ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); + + UChar characterNext = hotTerm.patternCharacter; + + // Append a character to an existing BeginChar object. + if (characterNext <= 0x7f) { + unsigned mask = 0; + + if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { + mask = 32; + characterNext = toASCIILower(characterNext); + } + + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { + addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); + addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); + } else + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); + } + } + } + + Vector<BeginChar>* m_beginChars; + bool m_isCaseInsensitive; +}; + +class RegexPatternConstructor { +public: + RegexPatternConstructor(RegexPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) + { + } + + ~RegexPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + } + + void assertionBOL() + { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { + atomCharacterClassBegin(); + atomCharacterClassAtom(ch); + atomCharacterClassEnd(); + } else + m_alternative->m_terms.append(PatternTerm(ch)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; + m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + // Bubble up BOL flags + for (unsigned i = 0; i < numParenAlternatives; i++) { + if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } + + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) + { + PatternDisjunction* newDisjunction = 0; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } + } + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + // For any assertion with a zero minimum, not matching is valid and has no effect, + // remove it. Otherwise, we need to match as least once, but there is no point + // matching more than once, so remove the quantifier. It is not entirely clear + // from the spec whether or not this behavior is correct, but I believe this + // matches Firefox. :-/ + if (term.type == PatternTerm::TypeParentheticalAssertion) { + if (!min) + m_alternative->removeLastTerm(); + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + void regexBegin() + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + void regexEnd() + { + } + void regexError() + { + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + unsigned currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition; + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition; + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + term.inputPosition = currentInputPosition; + } else { + term.inputPosition = currentInputPosition; + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); + break; + } + } + + alternative->m_minimumSize = currentInputPosition - initialInputPosition; + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * where the parens are at the end of the regular expression (last term in any of the + // alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (size_t i = 0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == quantifyInfinite + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) + { + if (term.quantityType == QuantifierFixedCount) { + beginTerms->append(TermChain(term)); + if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) + setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); + } else if (termIndex != numTerms - 1) { + beginTerms->append(TermChain(term)); + return true; + } + + return false; + } + + // This function collects the terms which are potentially matching the first number of depth characters in the result. + // If this function returns false then it found at least one term which makes the beginning character + // look-up optimization inefficient. + bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) + return false; + } + + return true; + } + + bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) + { + bool checkNext = true; + unsigned numTerms = alternative->m_terms.size(); + + while (checkNext && termIndex < numTerms) { + PatternTerm term = alternative->m_terms[termIndex]; + checkNext = false; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + return false; + + case PatternTerm::TypeBackReference: + case PatternTerm::TypeForwardReference: + return false; + + case PatternTerm::TypePatternCharacter: + if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { + termIndex++; + checkNext = true; + } + break; + + case PatternTerm::TypeCharacterClass: + return false; + + case PatternTerm::TypeParentheticalAssertion: + if (term.invert()) + return false; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityType != QuantifierFixedCount) { + if (termIndex == numTerms - 1) + break; + + termIndex++; + checkNext = true; + + } + + if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) + return false; + + break; + } + } + + return true; + } + + void setupBeginChars() + { + Vector<TermChain> beginTerms; + bool containsFixedCharacter = false; + + if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) + && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { + unsigned size = beginTerms.size(); + + // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. + if (!size) + return; + + m_pattern.m_containsBeginChars = true; + + for (unsigned i = 0; i < size; i++) { + PatternTerm term = beginTerms[i].term; + + // We have just collected PatternCharacter terms, other terms are not allowed. + ASSERT(term.type == PatternTerm::TypePatternCharacter); + + if (term.quantityType == QuantifierFixedCount) + containsFixedCharacter = true; + + UChar character = term.patternCharacter; + unsigned mask = 0; + + if (character <= 0x7f) { + if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { + mask = 32; + character = toASCIILower(character); + } + + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else { + UChar upper, lower; + if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { + m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } + } + + // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. + if (!containsFixedCharacter) { + m_pattern.m_containsBeginChars = false; + return; + } + + size = m_pattern.m_beginChars.size(); + + if (size > 2) + m_beginCharHelper.merge(size - 1); + else if (size <= 1) + m_pattern.m_containsBeginChars = false; + } + } + +private: + RegexPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + BeginCharHelper m_beginCharHelper; + bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; +}; + + +static const char* compileRegex(const UString& patternString, RegexPattern& pattern) +{ + RegexPatternConstructor constructor(pattern); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (pattern.containsIllegalBackReference()) { + unsigned numSubpatterns = pattern.m_numSubpatterns; + + constructor.reset(); +#if !ASSERT_DISABLED + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == pattern.m_numSubpatterns); + } + + constructor.checkForTerminalParentheses(); + constructor.optimizeBOL(); + + constructor.setupOffsets(); + constructor.setupBeginChars(); + + return 0; +}; + +RegexPattern::RegexPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_containsBackreferences(false) + , m_containsBeginChars(false) + , m_containsBOL(false) + , m_numSubpatterns(0) + , m_maxBackReference(0) + , newlineCached(0) + , digitsCached(0) + , spacesCached(0) + , wordcharCached(0) + , nondigitsCached(0) + , nonspacesCached(0) + , nonwordcharCached(0) +{ + *error = compileRegex(pattern, *this); +} + +} } diff --git a/Source/JavaScriptCore/yarr/RegexPattern.h b/Source/JavaScriptCore/yarr/RegexPattern.h new file mode 100644 index 0000000..6833dd6 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexPattern.h @@ -0,0 +1,424 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexPattern_h +#define RegexPattern_h + +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + +#include <UString.h> + +namespace JSC { namespace Yarr { + +#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoBackReference 2 +#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 +#define RegexStackSpaceForBackTrackInfoParentheses 2 + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClassTable : RefCounted<CharacterClassTable> { + const char* m_table; + bool m_inverted; + static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + +struct CharacterClass : FastAllocBase { + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr<CharacterClassTable> table) + : m_table(table) + { + } + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; + RefPtr<CharacterClassTable> m_table; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + } type; + bool m_capture :1; + bool m_invert :1; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned backReferenceSubpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + bool isTerminal; + } parentheses; + }; + QuantifierType quantityType; + unsigned quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + , m_capture(false) + , m_invert(false) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + parentheses.isTerminal = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , m_capture(false) + , m_invert(false) + { + backReferenceSubpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative : FastAllocBase { + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } + + Vector<PatternTerm> m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; +}; + +struct PatternDisjunction : FastAllocBase { + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + , m_hasFixedSize(false) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector<PatternAlternative*> m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood RegexPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct BeginChar { + BeginChar() + : value(0) + , mask(0) + {} + + BeginChar(unsigned value, unsigned mask) + : value(value) + , mask(mask) + {} + + unsigned value; + unsigned mask; +}; + +struct RegexPattern { + RegexPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); + + ~RegexPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + m_containsBackreferences = false; + m_containsBeginChars = false; + m_containsBOL = false; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + m_beginChars.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBeginChars : 1; + bool m_containsBOL : 1; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector<PatternDisjunction*, 4> m_disjunctions; + Vector<CharacterClass*> m_userCharacterClasses; + Vector<BeginChar> m_beginChars; + +private: + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif // RegexPattern_h |