From 65f03d4f644ce73618e5f4f50dd694b26f55ae12 Mon Sep 17 00:00:00 2001 From: Ben Murdoch Date: Fri, 13 May 2011 16:23:25 +0100 Subject: Merge WebKit at r75993: Initial merge by git. Change-Id: I602bbdc3974787a3b0450456a30a7868286921c3 --- Source/JavaScriptCore/yarr/RegexInterpreter.cpp | 1891 ------------------ Source/JavaScriptCore/yarr/RegexInterpreter.h | 381 ---- Source/JavaScriptCore/yarr/RegexJIT.cpp | 2213 --------------------- Source/JavaScriptCore/yarr/RegexJIT.h | 90 - Source/JavaScriptCore/yarr/RegexParser.h | 887 --------- Source/JavaScriptCore/yarr/RegexPattern.cpp | 991 ---------- Source/JavaScriptCore/yarr/RegexPattern.h | 424 ---- Source/JavaScriptCore/yarr/Yarr.h | 72 + Source/JavaScriptCore/yarr/YarrInterpreter.cpp | 1890 ++++++++++++++++++ Source/JavaScriptCore/yarr/YarrInterpreter.h | 362 ++++ Source/JavaScriptCore/yarr/YarrJIT.cpp | 2242 ++++++++++++++++++++++ Source/JavaScriptCore/yarr/YarrJIT.h | 82 + Source/JavaScriptCore/yarr/YarrParser.h | 871 +++++++++ Source/JavaScriptCore/yarr/YarrPattern.cpp | 985 ++++++++++ Source/JavaScriptCore/yarr/YarrPattern.h | 417 ++++ Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp | 59 + Source/JavaScriptCore/yarr/YarrSyntaxChecker.h | 38 + 17 files changed, 7018 insertions(+), 6877 deletions(-) delete mode 100644 Source/JavaScriptCore/yarr/RegexInterpreter.cpp delete mode 100644 Source/JavaScriptCore/yarr/RegexInterpreter.h delete mode 100644 Source/JavaScriptCore/yarr/RegexJIT.cpp delete mode 100644 Source/JavaScriptCore/yarr/RegexJIT.h delete mode 100644 Source/JavaScriptCore/yarr/RegexParser.h delete mode 100644 Source/JavaScriptCore/yarr/RegexPattern.cpp delete mode 100644 Source/JavaScriptCore/yarr/RegexPattern.h create mode 100644 Source/JavaScriptCore/yarr/Yarr.h create mode 100644 Source/JavaScriptCore/yarr/YarrInterpreter.cpp create mode 100644 Source/JavaScriptCore/yarr/YarrInterpreter.h create mode 100644 Source/JavaScriptCore/yarr/YarrJIT.cpp create mode 100644 Source/JavaScriptCore/yarr/YarrJIT.h create mode 100644 Source/JavaScriptCore/yarr/YarrParser.h create mode 100644 Source/JavaScriptCore/yarr/YarrPattern.cpp create mode 100644 Source/JavaScriptCore/yarr/YarrPattern.h create mode 100644 Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp create mode 100644 Source/JavaScriptCore/yarr/YarrSyntaxChecker.h (limited to 'Source/JavaScriptCore/yarr') diff --git a/Source/JavaScriptCore/yarr/RegexInterpreter.cpp b/Source/JavaScriptCore/yarr/RegexInterpreter.cpp deleted file mode 100644 index 7769922..0000000 --- a/Source/JavaScriptCore/yarr/RegexInterpreter.cpp +++ /dev/null @@ -1,1891 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "RegexInterpreter.h" - -#include "RegexPattern.h" -#include - -#ifndef NDEBUG -#include -#endif - -using namespace WTF; - -namespace JSC { namespace Yarr { - -class Interpreter { -public: - struct ParenthesesDisjunctionContext; - - struct BackTrackInfoPatternCharacter { - uintptr_t matchAmount; - }; - struct BackTrackInfoCharacterClass { - uintptr_t matchAmount; - }; - struct BackTrackInfoBackReference { - uintptr_t begin; // Not really needed for greedy quantifiers. - uintptr_t matchAmount; // Not really needed for fixed quantifiers. - }; - struct BackTrackInfoAlternative { - uintptr_t offset; - }; - struct BackTrackInfoParentheticalAssertion { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesOnce { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesTerminal { - uintptr_t begin; - }; - struct BackTrackInfoParentheses { - uintptr_t matchAmount; - ParenthesesDisjunctionContext* lastContext; - }; - - static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) - { - context->next = backTrack->lastContext; - backTrack->lastContext = context; - ++backTrack->matchAmount; - } - - static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) - { - ASSERT(backTrack->matchAmount); - ASSERT(backTrack->lastContext); - backTrack->lastContext = backTrack->lastContext->next; - --backTrack->matchAmount; - } - - struct DisjunctionContext - { - DisjunctionContext() - : term(0) - { - } - - void* operator new(size_t, void* where) - { - return where; - } - - int term; - unsigned matchBegin; - unsigned matchEnd; - uintptr_t frame[1]; - }; - - DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) - { - size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); - allocatorPool = allocatorPool->ensureCapacity(size); - if (!allocatorPool) - CRASH(); - return new(allocatorPool->alloc(size)) DisjunctionContext(); - } - - void freeDisjunctionContext(DisjunctionContext* context) - { - allocatorPool = allocatorPool->dealloc(context); - } - - struct ParenthesesDisjunctionContext - { - ParenthesesDisjunctionContext(int* output, ByteTerm& term) - : next(0) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; - - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { - subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; - output[(firstSubpatternId << 1) + i] = -1; - } - - new(getDisjunctionContext(term)) DisjunctionContext(); - } - - void* operator new(size_t, void* where) - { - return where; - } - - void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) - { - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) - output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; - } - - DisjunctionContext* getDisjunctionContext(ByteTerm& term) - { - return reinterpret_cast(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); - } - - ParenthesesDisjunctionContext* next; - int subpatternBackup[1]; - }; - - ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) - { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); - allocatorPool = allocatorPool->ensureCapacity(size); - if (!allocatorPool) - CRASH(); - return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); - } - - void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) - { - allocatorPool = allocatorPool->dealloc(context); - } - - class InputStream { - public: - InputStream(const UChar* input, unsigned start, unsigned length) - : input(input) - , pos(start) - , length(length) - { - } - - void next() - { - ++pos; - } - - void rewind(unsigned amount) - { - ASSERT(pos >= amount); - pos -= amount; - } - - int read() - { - ASSERT(pos < length); - if (pos < length) - return input[pos]; - return -1; - } - - int readPair() - { - ASSERT(pos + 1 < length); - return input[pos] | input[pos + 1] << 16; - } - - int readChecked(int position) - { - ASSERT(position < 0); - ASSERT((unsigned)-position <= pos); - unsigned p = pos + position; - ASSERT(p < length); - return input[p]; - } - - int reread(unsigned from) - { - ASSERT(from < length); - return input[from]; - } - - int prev() - { - ASSERT(!(pos > length)); - if (pos && length) - return input[pos - 1]; - return -1; - } - - unsigned getPos() - { - return pos; - } - - void setPos(unsigned p) - { - pos = p; - } - - bool atStart() - { - return pos == 0; - } - - bool atEnd() - { - return pos == length; - } - - bool checkInput(int count) - { - if ((pos + count) <= length) { - pos += count; - return true; - } else - return false; - } - - void uncheckInput(int count) - { - pos -= count; - } - - bool atStart(int position) - { - return (pos + position) == 0; - } - - bool atEnd(int position) - { - return (pos + position) == length; - } - - bool isNotAvailableInput(int position) - { - return (pos + position) > length; - } - - private: - const UChar* input; - unsigned pos; - unsigned length; - }; - - bool testCharacterClass(CharacterClass* characterClass, int ch) - { - if (ch & 0xFF80) { - for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) - if (ch == characterClass->m_matchesUnicode[i]) - return true; - for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) - if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) - return true; - } else { - for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) - if (ch == characterClass->m_matches[i]) - return true; - for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) - if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) - return true; - } - - return false; - } - - bool checkCharacter(int testChar, int inputPosition) - { - return testChar == input.readChecked(inputPosition); - } - - bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) - { - int ch = input.readChecked(inputPosition); - return (loChar == ch) || (hiChar == ch); - } - - bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) - { - bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); - return invert ? !match : match; - } - - bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) - { - int matchSize = matchEnd - matchBegin; - - if (!input.checkInput(matchSize)) - return false; - - if (pattern->m_ignoreCase) { - for (int i = 0; i < matchSize; ++i) { - int ch = input.reread(matchBegin + i); - - int lo = Unicode::toLower(ch); - int hi = Unicode::toUpper(ch); - - if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { - input.uncheckInput(matchSize); - return false; - } - } - } else { - for (int i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { - input.uncheckInput(matchSize); - return false; - } - } - } - - return true; - } - - bool matchAssertionBOL(ByteTerm& term) - { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); - } - - bool matchAssertionEOL(ByteTerm& term) - { - if (term.inputPosition) - return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); - else - return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); - } - - bool matchAssertionWordBoundary(ByteTerm& term) - { - bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); - bool readIsWordchar; - if (term.inputPosition) - readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); - else - readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); - - bool wordBoundary = prevIsWordchar != readIsWordchar; - return term.invert() ? !wordBoundary : wordBoundary; - } - - bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) - return false; - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - return true; - } - - case QuantifierNonGreedy: - backTrack->matchAmount = 0; - return true; - } - - ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - - // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. - // In this case the result of match is empty string like when it references to a parentheses with zero-width match. - // Eg.: /(a\1)/ - if (matchEnd == -1) - return true; - - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); - - if (matchBegin == matchEnd) - return true; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - backTrack->begin = input.getPos(); - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - input.setPos(backTrack->begin); - return false; - } - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) - ++matchAmount; - backTrack->matchAmount = matchAmount; - return true; - } - - case QuantifierNonGreedy: - backTrack->begin = input.getPos(); - backTrack->matchAmount = 0; - return true; - } - - ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); - - if (matchBegin == matchEnd) - return false; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - // for quantityCount == 1, could rewind. - input.setPos(backTrack->begin); - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.rewind(matchEnd - matchBegin); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - ++backTrack->matchAmount; - return true; - } else - input.setPos(backTrack->begin); - break; - } - - return false; - } - - void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; - output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; - } - } - void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; - context->restoreOutput(output, firstSubpatternId, count); - } - JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) - { - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - - JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) - return JSRegExpMatch; - - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - return JSRegExpNoMatch; - } - - bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: { - // set this speculatively; if we get to the parens end this will be true. - backTrack->begin = input.getPos(); - break; - } - case QuantifierNonGreedy: { - backTrack->begin = notFound; - context->term += term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = input.getPos() + term.inputPosition; - } - - return true; - } - - bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; - } - - if (term.atom.quantityType == QuantifierFixedCount) - return true; - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - return backTrack->begin != input.getPos(); - } - - bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = -1; - output[(subpatternId << 1) + 1] = -1; - } - - switch (term.atom.quantityType) { - case QuantifierGreedy: - // if we backtrack to this point, there is another chance - try matching nothing. - ASSERT(backTrack->begin != notFound); - backTrack->begin = notFound; - context->term += term.atom.parenthesesWidth; - return true; - case QuantifierNonGreedy: - ASSERT(backTrack->begin != notFound); - case QuantifierFixedCount: - break; - } - - return false; - } - - bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: - if (backTrack->begin == notFound) { - context->term -= term.atom.parenthesesWidth; - return false; - } - case QuantifierNonGreedy: - if (backTrack->begin == notFound) { - backTrack->begin = input.getPos(); - if (term.capture()) { - // Technically this access to inputPosition should be accessing the begin term's - // inputPosition, but for repeats other than fixed these values should be - // the same anyway! (We don't pre-check for greedy or non-greedy matches.) - ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); - unsigned subpatternId = term.atom.subpatternId; - output[subpatternId << 1] = input.getPos() + term.inputPosition; - } - context->term -= term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - return false; - } - - bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); - ASSERT(!term.capture()); - - BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast(context->frame + term.frameLocation); - backTrack->begin = input.getPos(); - return true; - } - - bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); - - BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast(context->frame + term.frameLocation); - // Empty match is a failed match. - if (backTrack->begin == input.getPos()) - return false; - - // Successful match! Okay, what's next? - loop around and try to match moar! - context->term -= (term.atom.parenthesesWidth + 1); - return true; - } - - bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); - ASSERT(!term.capture()); - - // If we backtrack to this point, we have failed to match this iteration of the parens. - // Since this is greedy / zero minimum a failed is also accepted as a match! - context->term += term.atom.parenthesesWidth; - return true; - } - - bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) - { - // 'Terminal' parentheses are at the end of the regex, and as such a match past end - // should always be returned as a successful match - we should never backtrack to here. - ASSERT_NOT_REACHED(); - return false; - } - - bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - backTrack->begin = input.getPos(); - return true; - } - - bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - // We've reached the end of the parens; if they are inverted, this is failure. - if (term.invert()) { - context->term -= term.atom.parenthesesWidth; - return false; - } - - return true; - } - - bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - // We've failed to match parens; if they are inverted, this is win! - if (term.invert()) { - context->term += term.atom.parenthesesWidth; - return true; - } - - return false; - } - - bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - context->term -= term.atom.parenthesesWidth; - return false; - } - - JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - backTrack->matchAmount = 0; - backTrack->lastContext = 0; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result == JSRegExpNoMatch) { - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } else - return result; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - case QuantifierGreedy: { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - - break; - } - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - case QuantifierNonGreedy: - return JSRegExpMatch; - } - - ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - // Rules for backtracking differ depending on whether this is greedy or non-greedy. - // - // Greedy matches never should try just adding more - you should already have done - // the 'more' cases. Always backtrack, at least a leetle bit. However cases where - // you backtrack an item off the list needs checking, since we'll never have matched - // the one less case. Tracking forwards, still add as much as possible. - // - // Non-greedy, we've already done the one less case, so don't match on popping. - // We haven't done the one more case, so always try to add that. - // - JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - - ParenthesesDisjunctionContext* context = 0; - JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); - - if (result != JSRegExpMatch) - return result; - - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result == JSRegExpNoMatch) { - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } else - return result; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - case QuantifierGreedy: { - if (!backTrack->matchAmount) - return JSRegExpNoMatch; - - ParenthesesDisjunctionContext* context = backTrack->lastContext; - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (parenthesesResult == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (parenthesesResult != JSRegExpNoMatch) - return parenthesesResult; - - break; - } - } - } else { - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - case QuantifierNonGreedy: { - // If we've not reached the limit, try to add one more match. - if (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) { - appendParenthesesDisjunctionContext(backTrack, context); - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - // Nope - okay backtrack looking for an alternative. - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) { - // successful backtrack! we're back in the game! - if (backTrack->matchAmount) { - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - // pop a match off the stack - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - return result; - } - - return JSRegExpNoMatch; - } - } - - ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - void lookupForBeginChars() - { - int character; - bool firstSingleCharFound; - - while (true) { - if (input.isNotAvailableInput(2)) - return; - - firstSingleCharFound = false; - - character = input.readPair(); - - for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { - BeginChar bc = pattern->m_beginChars[i]; - - if (!firstSingleCharFound && bc.value <= 0xFFFF) { - firstSingleCharFound = true; - character &= 0xFFFF; - } - - if ((character | bc.mask) == bc.value) - return; - } - - input.next(); - } - } - -#define MATCH_NEXT() { ++context->term; goto matchAgain; } -#define BACKTRACK() { --context->term; goto backtrack; } -#define currentTerm() (disjunction->terms[context->term]) - JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) - { - if (!--remainingMatchCount) - return JSRegExpErrorHitLimit; - - if (btrack) - BACKTRACK(); - - if (pattern->m_containsBeginChars && isBody) - lookupForBeginChars(); - - context->matchBegin = input.getPos(); - context->term = 0; - - matchAgain: - ASSERT(context->term < static_cast(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - MATCH_NEXT(); - case ByteTerm::TypeSubpatternEnd: - context->matchEnd = input.getPos(); - return JSRegExpMatch; - - case ByteTerm::TypeBodyAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeBodyAlternativeDisjunction: - case ByteTerm::TypeBodyAlternativeEnd: - context->matchEnd = input.getPos(); - return JSRegExpMatch; - - case ByteTerm::TypeAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeAlternativeDisjunction: - case ByteTerm::TypeAlternativeEnd: { - int offset = currentTerm().alternative.end; - BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->offset = offset; - context->term += offset; - MATCH_NEXT(); - } - - case ByteTerm::TypeAssertionBOL: - if (matchAssertionBOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionEOL: - if (matchAssertionEOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionWordBoundary: - if (matchAssertionWordBoundary(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypeCharacterClass: - if (matchCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (matchBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: { - JSRegExpResult result = matchParentheses(currentTerm(), context); - - if (result == JSRegExpMatch) { - MATCH_NEXT(); - } else if (result != JSRegExpNoMatch) - return result; - - BACKTRACK(); - } - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (matchParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (matchParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalBegin: - if (matchParenthesesTerminalBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalEnd: - if (matchParenthesesTerminalEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (matchParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (matchParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - if (input.checkInput(currentTerm().checkInputCount)) - MATCH_NEXT(); - BACKTRACK(); - } - - // We should never fall-through to here. - ASSERT_NOT_REACHED(); - - backtrack: - ASSERT(context->term < static_cast(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - return JSRegExpNoMatch; - case ByteTerm::TypeSubpatternEnd: - ASSERT_NOT_REACHED(); - - case ByteTerm::TypeBodyAlternativeBegin: - case ByteTerm::TypeBodyAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - - if (input.atEnd()) - return JSRegExpNoMatch; - - input.next(); - - if (pattern->m_containsBeginChars && isBody) - lookupForBeginChars(); - - context->matchBegin = input.getPos(); - - if (currentTerm().alternative.onceThrough) - context->term += currentTerm().alternative.next; - - MATCH_NEXT(); - } - case ByteTerm::TypeBodyAlternativeEnd: - ASSERT_NOT_REACHED(); - - case ByteTerm::TypeAlternativeBegin: - case ByteTerm::TypeAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - BACKTRACK(); - } - case ByteTerm::TypeAlternativeEnd: { - // We should never backtrack back into an alternative of the main body of the regex. - BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned offset = backTrack->offset; - context->term -= offset; - BACKTRACK(); - } - - case ByteTerm::TypeAssertionBOL: - case ByteTerm::TypeAssertionEOL: - case ByteTerm::TypeAssertionWordBoundary: - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: - case ByteTerm::TypePatternCharacterGreedy: - case ByteTerm::TypePatternCharacterNonGreedy: - if (backtrackPatternCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: - case ByteTerm::TypePatternCasedCharacterGreedy: - case ByteTerm::TypePatternCasedCharacterNonGreedy: - if (backtrackPatternCasedCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeCharacterClass: - if (backtrackCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (backtrackBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: { - JSRegExpResult result = backtrackParentheses(currentTerm(), context); - - if (result == JSRegExpMatch) { - MATCH_NEXT(); - } else if (result != JSRegExpNoMatch) - return result; - - BACKTRACK(); - } - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (backtrackParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (backtrackParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalBegin: - if (backtrackParenthesesTerminalBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalEnd: - if (backtrackParenthesesTerminalEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (backtrackParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (backtrackParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - input.uncheckInput(currentTerm().checkInputCount); - BACKTRACK(); - } - - ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) - { - JSRegExpResult result = matchDisjunction(disjunction, context, btrack); - - if (result == JSRegExpMatch) { - while (context->matchBegin == context->matchEnd) { - result = matchDisjunction(disjunction, context, true); - if (result != JSRegExpMatch) - return result; - } - return JSRegExpMatch; - } - - return result; - } - - int interpret() - { - allocatorPool = pattern->m_allocator->startAllocator(); - if (!allocatorPool) - CRASH(); - - for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) - output[i] = -1; - - DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); - - JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); - if (result == JSRegExpMatch) { - output[0] = context->matchBegin; - output[1] = context->matchEnd; - } - - freeDisjunctionContext(context); - - pattern->m_allocator->stopAllocator(); - - // RegExp.cpp currently expects all error to be converted to -1. - ASSERT((result == JSRegExpMatch) == (output[0] != -1)); - return output[0]; - } - - Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) - : pattern(pattern) - , output(output) - , input(inputChar, start, length) - , allocatorPool(0) - , remainingMatchCount(matchLimit) - { - } - -private: - BytecodePattern *pattern; - int* output; - InputStream input; - BumpPointerPool* allocatorPool; - unsigned remainingMatchCount; -}; - - - -class ByteCompiler { - struct ParenthesesStackEntry { - unsigned beginTerm; - unsigned savedAlternativeIndex; - ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) - : beginTerm(beginTerm) - , savedAlternativeIndex(savedAlternativeIndex) - { - } - }; - -public: - ByteCompiler(RegexPattern& pattern) - : m_pattern(pattern) - { - m_currentAlternativeIndex = 0; - } - - PassOwnPtr compile(BumpPointerAllocator* allocator) - { - regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); - emitDisjunction(m_pattern.m_body); - regexEnd(); - - return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); - } - - void checkInput(unsigned count) - { - m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); - } - - void assertionBOL(int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); - } - - void assertionEOL(int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); - } - - void assertionWordBoundary(bool invert, int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); - } - - void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - if (m_pattern.m_ignoreCase) { - UChar lo = Unicode::toLower(ch); - UChar hi = Unicode::toUpper(ch); - - if (lo != hi) { - m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); - return; - } - } - - m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); - } - - void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - ASSERT(subpatternId); - - m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, - // then fix this up at the end! - simplifying this should make it much clearer. - // https://bugs.webkit.org/show_bug.cgi?id=50136 - - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); - - bool invert = m_bodyDisjunction->terms[beginTerm].invert(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - unsigned popParenthesesStack() - { - ASSERT(m_parenthesesStack.size()); - int stackEnd = m_parenthesesStack.size() - 1; - unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; - m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; - m_parenthesesStack.shrink(stackEnd); - - ASSERT(beginTerm < m_bodyDisjunction->terms.size()); - ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); - - return beginTerm; - } - -#ifndef NDEBUG - void dumpDisjunction(ByteDisjunction* disjunction) - { - printf("ByteDisjunction(%p):\n\t", disjunction); - for (unsigned i = 0; i < disjunction->terms.size(); ++i) - printf("{ %d } ", disjunction->terms[i].type); - printf("\n"); - } -#endif - - void closeAlternative(int beginTerm) - { - int origBeginTerm = beginTerm; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - if (!m_bodyDisjunction->terms[beginTerm].alternative.next) - m_bodyDisjunction->terms.remove(beginTerm); - else { - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - } - - void closeBodyAlternative() - { - int beginTerm = 0; - int origBeginTerm = 0; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - - void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; - - bool capture = parenthesesBegin.capture(); - unsigned subpatternId = parenthesesBegin.atom.subpatternId; - - unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); - - parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) - parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); - - m_bodyDisjunction->terms.shrink(beginTerm); - - m_allParenthesesInfo.append(parenthesesDisjunction); - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - bool capture = m_bodyDisjunction->terms[beginTerm].capture(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - - bool capture = m_bodyDisjunction->terms[beginTerm].capture(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) - { - m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); - m_bodyDisjunction->terms[0].frameLocation = 0; - m_currentAlternativeIndex = 0; - } - - void regexEnd() - { - closeBodyAlternative(); - } - - void alternativeBodyDisjunction(bool onceThrough) - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void alternativeDisjunction() - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false) - { - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; - - PatternAlternative* alternative = disjunction->m_alternatives[alt]; - - if (alt) { - if (disjunction == m_pattern.m_body) - alternativeBodyDisjunction(alternative->onceThrough()); - else - alternativeDisjunction(); - } - - unsigned minimumSize = alternative->m_minimumSize; - int countToCheck; - - if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize) - countToCheck = 0; - else - countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; - - ASSERT(countToCheck >= 0); - if (countToCheck) { - checkInput(countToCheck); - currentCountAlreadyChecked += countToCheck; - } - - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - assertionBOL(term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypeAssertionEOL: - assertionEOL(term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypeAssertionWordBoundary: - assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypeParenthesesSubpattern: { - unsigned disjunctionAlreadyCheckedCount = 0; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - unsigned alternativeFrameLocation = term.frameLocation; - // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. - if (term.quantityType == QuantifierFixedCount) - disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; - else - alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); - } else if (term.parentheses.isTerminal) { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } - break; - } - - case PatternTerm::TypeParentheticalAssertion: { - unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; - - ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition); - int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; - - atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); - atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); - break; - } - } - } - } - } - -private: - RegexPattern& m_pattern; - OwnPtr m_bodyDisjunction; - unsigned m_currentAlternativeIndex; - Vector m_parenthesesStack; - Vector m_allParenthesesInfo; -}; - -PassOwnPtr byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator* allocator) -{ - return ByteCompiler(pattern).compile(allocator); -} - -int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) -{ - return Interpreter(regex, output, input, start, length).interpret(); -} - - -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); - - -} } diff --git a/Source/JavaScriptCore/yarr/RegexInterpreter.h b/Source/JavaScriptCore/yarr/RegexInterpreter.h deleted file mode 100644 index 0fd8a57..0000000 --- a/Source/JavaScriptCore/yarr/RegexInterpreter.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef RegexInterpreter_h -#define RegexInterpreter_h - -#include "RegexParser.h" -#include "RegexPattern.h" -#include -#include - -namespace WTF { -class BumpPointerAllocator; -} -using WTF::BumpPointerAllocator; - -namespace JSC { namespace Yarr { - -// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed. - -// The below limit restricts the number of "recursive" match calls in order to -// avoid spending exponential time on complex regular expressions. -static const unsigned matchLimit = 1000000; - -enum JSRegExpResult { - JSRegExpMatch = 1, - JSRegExpNoMatch = 0, - JSRegExpErrorNoMatch = -1, - JSRegExpErrorHitLimit = -2, - JSRegExpErrorNoMemory = -3, - JSRegExpErrorInternal = -4 -}; - -class ByteDisjunction; - -struct ByteTerm { - enum Type { - TypeBodyAlternativeBegin, - TypeBodyAlternativeDisjunction, - TypeBodyAlternativeEnd, - TypeAlternativeBegin, - TypeAlternativeDisjunction, - TypeAlternativeEnd, - TypeSubpatternBegin, - TypeSubpatternEnd, - TypeAssertionBOL, - TypeAssertionEOL, - TypeAssertionWordBoundary, - TypePatternCharacterOnce, - TypePatternCharacterFixed, - TypePatternCharacterGreedy, - TypePatternCharacterNonGreedy, - TypePatternCasedCharacterOnce, - TypePatternCasedCharacterFixed, - TypePatternCasedCharacterGreedy, - TypePatternCasedCharacterNonGreedy, - TypeCharacterClass, - TypeBackReference, - TypeParenthesesSubpattern, - TypeParenthesesSubpatternOnceBegin, - TypeParenthesesSubpatternOnceEnd, - TypeParenthesesSubpatternTerminalBegin, - TypeParenthesesSubpatternTerminalEnd, - TypeParentheticalAssertionBegin, - TypeParentheticalAssertionEnd, - TypeCheckInput, - } type; - union { - struct { - union { - UChar patternCharacter; - struct { - UChar lo; - UChar hi; - } casedCharacter; - CharacterClass* characterClass; - unsigned subpatternId; - }; - union { - ByteDisjunction* parenthesesDisjunction; - unsigned parenthesesWidth; - }; - QuantifierType quantityType; - unsigned quantityCount; - } atom; - struct { - int next; - int end; - bool onceThrough; - } alternative; - unsigned checkInputCount; - }; - unsigned frameLocation; - bool m_capture : 1; - bool m_invert : 1; - int inputPosition; - - ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - : frameLocation(frameLocation) - , m_capture(false) - , m_invert(false) - { - switch (quantityType) { - case QuantifierFixedCount: - type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; - break; - case QuantifierGreedy: - type = ByteTerm::TypePatternCharacterGreedy; - break; - case QuantifierNonGreedy: - type = ByteTerm::TypePatternCharacterNonGreedy; - break; - } - - atom.patternCharacter = ch; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount; - inputPosition = inputPos; - } - - ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - : frameLocation(frameLocation) - , m_capture(false) - , m_invert(false) - { - switch (quantityType) { - case QuantifierFixedCount: - type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; - break; - case QuantifierGreedy: - type = ByteTerm::TypePatternCasedCharacterGreedy; - break; - case QuantifierNonGreedy: - type = ByteTerm::TypePatternCasedCharacterNonGreedy; - break; - } - - atom.casedCharacter.lo = lo; - atom.casedCharacter.hi = hi; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount; - inputPosition = inputPos; - } - - ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) - : type(ByteTerm::TypeCharacterClass) - , m_capture(false) - , m_invert(invert) - { - atom.characterClass = characterClass; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) - : type(type) - , m_capture(capture) - , m_invert(false) - { - atom.subpatternId = subpatternId; - atom.parenthesesDisjunction = parenthesesInfo; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - ByteTerm(Type type, bool invert = false) - : type(type) - , m_capture(false) - , m_invert(invert) - { - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - } - - ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) - : type(type) - , m_capture(capture) - , m_invert(invert) - { - atom.subpatternId = subpatternId; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - static ByteTerm BOL(int inputPos) - { - ByteTerm term(TypeAssertionBOL); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm CheckInput(unsigned count) - { - ByteTerm term(TypeCheckInput); - term.checkInputCount = count; - return term; - } - - static ByteTerm EOL(int inputPos) - { - ByteTerm term(TypeAssertionEOL); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm WordBoundary(bool invert, int inputPos) - { - ByteTerm term(TypeAssertionWordBoundary, invert); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm BackReference(unsigned subpatternId, int inputPos) - { - return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); - } - - static ByteTerm BodyAlternativeBegin(bool onceThrough) - { - ByteTerm term(TypeBodyAlternativeBegin); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = onceThrough; - return term; - } - - static ByteTerm BodyAlternativeDisjunction(bool onceThrough) - { - ByteTerm term(TypeBodyAlternativeDisjunction); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = onceThrough; - return term; - } - - static ByteTerm BodyAlternativeEnd() - { - ByteTerm term(TypeBodyAlternativeEnd); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeBegin() - { - ByteTerm term(TypeAlternativeBegin); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeDisjunction() - { - ByteTerm term(TypeAlternativeDisjunction); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeEnd() - { - ByteTerm term(TypeAlternativeEnd); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm SubpatternBegin() - { - return ByteTerm(TypeSubpatternBegin); - } - - static ByteTerm SubpatternEnd() - { - return ByteTerm(TypeSubpatternEnd); - } - - bool invert() - { - return m_invert; - } - - bool capture() - { - return m_capture; - } -}; - -class ByteDisjunction : public FastAllocBase { -public: - ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) - : m_numSubpatterns(numSubpatterns) - , m_frameSize(frameSize) - { - } - - Vector terms; - unsigned m_numSubpatterns; - unsigned m_frameSize; -}; - -struct BytecodePattern : FastAllocBase { - BytecodePattern(PassOwnPtr body, Vector allParenthesesInfo, RegexPattern& pattern, BumpPointerAllocator* allocator) - : m_body(body) - , m_ignoreCase(pattern.m_ignoreCase) - , m_multiline(pattern.m_multiline) - , m_containsBeginChars(pattern.m_containsBeginChars) - , m_allocator(allocator) - { - newlineCharacterClass = pattern.newlineCharacterClass(); - wordcharCharacterClass = pattern.wordcharCharacterClass(); - - m_allParenthesesInfo.append(allParenthesesInfo); - m_userCharacterClasses.append(pattern.m_userCharacterClasses); - // 'Steal' the RegexPattern's CharacterClasses! We clear its - // array, so that it won't delete them on destruction. We'll - // take responsibility for that. - pattern.m_userCharacterClasses.clear(); - - m_beginChars.append(pattern.m_beginChars); - } - - ~BytecodePattern() - { - deleteAllValues(m_allParenthesesInfo); - deleteAllValues(m_userCharacterClasses); - } - - OwnPtr m_body; - bool m_ignoreCase; - bool m_multiline; - bool m_containsBeginChars; - // Each BytecodePattern is associated with a RegExp, each RegExp is associated - // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regexAllocator. - BumpPointerAllocator* m_allocator; - - CharacterClass* newlineCharacterClass; - CharacterClass* wordcharCharacterClass; - - Vector m_beginChars; - -private: - Vector m_allParenthesesInfo; - Vector m_userCharacterClasses; -}; - -PassOwnPtr byteCompileRegex(RegexPattern& pattern, BumpPointerAllocator*); -int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); - -} } // namespace JSC::Yarr - -#endif // RegexInterpreter_h diff --git a/Source/JavaScriptCore/yarr/RegexJIT.cpp b/Source/JavaScriptCore/yarr/RegexJIT.cpp deleted file mode 100644 index 50fe6db..0000000 --- a/Source/JavaScriptCore/yarr/RegexJIT.cpp +++ /dev/null @@ -1,2213 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "RegexJIT.h" - -#include "ASCIICType.h" -#include "JSGlobalData.h" -#include "LinkBuffer.h" -#include "MacroAssembler.h" -#include "RegexParser.h" - -#if ENABLE(YARR_JIT) - -using namespace WTF; - -namespace JSC { namespace Yarr { - -class RegexGenerator : private MacroAssembler { - friend void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); - -#if CPU(ARM) - static const RegisterID input = ARMRegisters::r0; - static const RegisterID index = ARMRegisters::r1; - static const RegisterID length = ARMRegisters::r2; - static const RegisterID output = ARMRegisters::r4; - - static const RegisterID regT0 = ARMRegisters::r5; - static const RegisterID regT1 = ARMRegisters::r6; - - static const RegisterID returnRegister = ARMRegisters::r0; -#elif CPU(MIPS) - static const RegisterID input = MIPSRegisters::a0; - static const RegisterID index = MIPSRegisters::a1; - static const RegisterID length = MIPSRegisters::a2; - static const RegisterID output = MIPSRegisters::a3; - - static const RegisterID regT0 = MIPSRegisters::t4; - static const RegisterID regT1 = MIPSRegisters::t5; - - static const RegisterID returnRegister = MIPSRegisters::v0; -#elif CPU(X86) - static const RegisterID input = X86Registers::eax; - static const RegisterID index = X86Registers::edx; - static const RegisterID length = X86Registers::ecx; - static const RegisterID output = X86Registers::edi; - - static const RegisterID regT0 = X86Registers::ebx; - static const RegisterID regT1 = X86Registers::esi; - - static const RegisterID returnRegister = X86Registers::eax; -#elif CPU(X86_64) - static const RegisterID input = X86Registers::edi; - static const RegisterID index = X86Registers::esi; - static const RegisterID length = X86Registers::edx; - static const RegisterID output = X86Registers::ecx; - - static const RegisterID regT0 = X86Registers::eax; - static const RegisterID regT1 = X86Registers::ebx; - - static const RegisterID returnRegister = X86Registers::eax; -#endif - - void optimizeAlternative(PatternAlternative* alternative) - { - if (!alternative->m_terms.size()) - return; - - for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { - PatternTerm& term = alternative->m_terms[i]; - PatternTerm& nextTerm = alternative->m_terms[i + 1]; - - if ((term.type == PatternTerm::TypeCharacterClass) - && (term.quantityType == QuantifierFixedCount) - && (nextTerm.type == PatternTerm::TypePatternCharacter) - && (nextTerm.quantityType == QuantifierFixedCount)) { - PatternTerm termCopy = term; - alternative->m_terms[i] = nextTerm; - alternative->m_terms[i + 1] = termCopy; - } - } - } - - void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) - { - do { - // pick which range we're going to generate - int which = count >> 1; - char lo = ranges[which].begin; - char hi = ranges[which].end; - - // check if there are any ranges or matches below lo. If not, just jl to failure - - // if there is anything else to check, check that first, if it falls through jmp to failure. - if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { - Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); - - // generate code for all ranges before this one - if (which) - matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); - - while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { - matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); - ++*matchIndex; - } - failures.append(jump()); - - loOrAbove.link(this); - } else if (which) { - Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); - - matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); - failures.append(jump()); - - loOrAbove.link(this); - } else - failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); - - while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) - ++*matchIndex; - - matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); - // fall through to here, the value is above hi. - - // shuffle along & loop around if there are any more matches to handle. - unsigned next = which + 1; - ranges += next; - count -= next; - } while (count); - } - - void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) - { - if (charClass->m_table) { - ExtendedAddress tableEntry(character, reinterpret_cast(charClass->m_table->m_table)); - matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); - return; - } - Jump unicodeFail; - if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { - Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f)); - - if (charClass->m_matchesUnicode.size()) { - for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { - UChar ch = charClass->m_matchesUnicode[i]; - matchDest.append(branch32(Equal, character, Imm32(ch))); - } - } - - if (charClass->m_rangesUnicode.size()) { - for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { - UChar lo = charClass->m_rangesUnicode[i].begin; - UChar hi = charClass->m_rangesUnicode[i].end; - - Jump below = branch32(LessThan, character, Imm32(lo)); - matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); - below.link(this); - } - } - - unicodeFail = jump(); - isAscii.link(this); - } - - if (charClass->m_ranges.size()) { - unsigned matchIndex = 0; - JumpList failures; - matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); - while (matchIndex < charClass->m_matches.size()) - matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); - - failures.link(this); - } else if (charClass->m_matches.size()) { - // optimization: gather 'a','A' etc back together, can mask & test once. - Vector matchesAZaz; - - for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { - char ch = charClass->m_matches[i]; - if (m_pattern.m_ignoreCase) { - if (isASCIILower(ch)) { - matchesAZaz.append(ch); - continue; - } - if (isASCIIUpper(ch)) - continue; - } - matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); - } - - if (unsigned countAZaz = matchesAZaz.size()) { - or32(Imm32(32), character); - for (unsigned i = 0; i < countAZaz; ++i) - matchDest.append(branch32(Equal, character, Imm32(matchesAZaz[i]))); - } - } - - if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) - unicodeFail.link(this); - } - - // Jumps if input not available; will have (incorrectly) incremented already! - Jump jumpIfNoAvailableInput(unsigned countToCheck) - { - add32(Imm32(countToCheck), index); - return branch32(Above, index, length); - } - - Jump jumpIfAvailableInput(unsigned countToCheck) - { - add32(Imm32(countToCheck), index); - return branch32(BelowOrEqual, index, length); - } - - Jump checkInput() - { - return branch32(BelowOrEqual, index, length); - } - - Jump atEndOfInput() - { - return branch32(Equal, index, length); - } - - Jump notAtEndOfInput() - { - return branch32(NotEqual, index, length); - } - - Jump jumpIfCharEquals(UChar ch, int inputPosition) - { - return branch16(Equal, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); - } - - Jump jumpIfCharNotEquals(UChar ch, int inputPosition) - { - return branch16(NotEqual, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); - } - - void readCharacter(int inputPosition, RegisterID reg) - { - load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); - } - - void storeToFrame(RegisterID reg, unsigned frameLocation) - { - poke(reg, frameLocation); - } - - void storeToFrame(Imm32 imm, unsigned frameLocation) - { - poke(imm, frameLocation); - } - - DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) - { - return storePtrWithPatch(ImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); - } - - void loadFromFrame(unsigned frameLocation, RegisterID reg) - { - peek(reg, frameLocation); - } - - void loadFromFrameAndJump(unsigned frameLocation) - { - jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); - } - - struct IndirectJumpEntry { - IndirectJumpEntry(int32_t stackOffset) - : m_stackOffset(stackOffset) - { - } - - IndirectJumpEntry(int32_t stackOffset, Jump jump) - : m_stackOffset(stackOffset) - { - addJump(jump); - } - - void addJump(Jump jump) - { - m_relJumps.append(jump); - } - - int32_t m_stackOffset; - JumpList m_relJumps; - }; - - struct AlternativeBacktrackRecord { - DataLabelPtr dataLabel; - Label backtrackLocation; - - AlternativeBacktrackRecord(DataLabelPtr dataLabel, Label backtrackLocation) - : dataLabel(dataLabel) - , backtrackLocation(backtrackLocation) - { - } - }; - - struct ParenthesesTail; - struct TermGenerationState; - - struct GenerationState { - typedef HashMap, UnsignedWithZeroKeyHashTraits > IndirectJumpHashMap; - - GenerationState() - : m_parenNestingLevel(0) - { - } - - void addIndirectJumpEntry(int32_t stackOffset, Jump jump) - { - IndirectJumpHashMap::iterator result = m_indirectJumpMap.find(stackOffset); - - ASSERT(stackOffset >= 0); - - uint32_t offset = static_cast(stackOffset); - - if (result == m_indirectJumpMap.end()) - m_indirectJumpMap.add(offset, new IndirectJumpEntry(stackOffset, jump)); - else - result->second->addJump(jump); - } - - void addIndirectJumpEntry(int32_t stackOffset, JumpList jumps) - { - JumpList::JumpVector jumpVector = jumps.jumps(); - size_t size = jumpVector.size(); - for (size_t i = 0; i < size; ++i) - addIndirectJumpEntry(stackOffset, jumpVector[i]); - - jumps.empty(); - } - - void emitIndirectJumpTable(MacroAssembler* masm) - { - for (IndirectJumpHashMap::iterator iter = m_indirectJumpMap.begin(); iter != m_indirectJumpMap.end(); ++iter) { - IndirectJumpEntry* indJumpEntry = iter->second; - indJumpEntry->m_relJumps.link(masm); - masm->jump(Address(stackPointerRegister, indJumpEntry->m_stackOffset)); - delete indJumpEntry; - } - } - - void incrementParenNestingLevel() - { - ++m_parenNestingLevel; - } - - void decrementParenNestingLevel() - { - --m_parenNestingLevel; - } - - ParenthesesTail* addParenthesesTail(PatternTerm& term, ParenthesesTail* nextOuterParenTail) - { - ParenthesesTail* parenthesesTail = new ParenthesesTail(term, m_parenNestingLevel, nextOuterParenTail); - m_parenTails.append(parenthesesTail); - m_parenTailsForIteration.append(parenthesesTail); - - return parenthesesTail; - } - - void emitParenthesesTail(RegexGenerator* generator) - { - unsigned vectorSize = m_parenTails.size(); - bool priorBacktrackFallThrough = false; - - // Emit in reverse order so parentTail N can fall through to N-1 - for (unsigned index = vectorSize; index > 0; --index) { - JumpList jumpsToNext; - priorBacktrackFallThrough = m_parenTails[index-1].get()->generateCode(generator, jumpsToNext, priorBacktrackFallThrough, index > 1); - if (index > 1) - jumpsToNext.linkTo(generator->label(), generator); - else - addJumpsToNextInteration(jumpsToNext); - } - m_parenTails.clear(); - } - - void addJumpToNextInteration(Jump jump) - { - m_jumpsToNextInteration.append(jump); - } - - void addJumpsToNextInteration(JumpList jumps) - { - m_jumpsToNextInteration.append(jumps); - } - - void addDataLabelToNextIteration(DataLabelPtr dataLabel) - { - m_dataPtrsToNextIteration.append(dataLabel); - } - - void linkToNextIteration(Label label) - { - m_nextIteration = label; - - for (unsigned i = 0; i < m_dataPtrsToNextIteration.size(); ++i) - m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataPtrsToNextIteration[i], m_nextIteration)); - - m_dataPtrsToNextIteration.clear(); - - for (unsigned i = 0; i < m_parenTailsForIteration.size(); ++i) - m_parenTailsForIteration[i]->setNextIteration(m_nextIteration); - - m_parenTailsForIteration.clear(); - } - - void linkToNextIteration(RegexGenerator* generator) - { - m_jumpsToNextInteration.linkTo(m_nextIteration, generator); - } - - int m_parenNestingLevel; - Vector m_backtrackRecords; - IndirectJumpHashMap m_indirectJumpMap; - Label m_nextIteration; - Vector > m_parenTails; - JumpList m_jumpsToNextInteration; - Vector m_dataPtrsToNextIteration; - Vector m_parenTailsForIteration; - }; - - struct BacktrackDestination { - typedef enum { - NoBacktrack, - BacktrackLabel, - BacktrackStackOffset, - BacktrackJumpList, - BacktrackLinked - } BacktrackType; - - BacktrackDestination() - : m_backtrackType(NoBacktrack) - , m_backtrackToLabel(0) - , m_subDataLabelPtr(0) - , m_nextBacktrack(0) - , m_backtrackSourceLabel(0) - , m_backtrackSourceJumps(0) - { - } - - BacktrackDestination(int32_t stackOffset) - : m_backtrackType(BacktrackStackOffset) - , m_backtrackStackOffset(stackOffset) - , m_backtrackToLabel(0) - , m_subDataLabelPtr(0) - , m_nextBacktrack(0) - , m_backtrackSourceLabel(0) - , m_backtrackSourceJumps(0) - { - } - - BacktrackDestination(Label label) - : m_backtrackType(BacktrackLabel) - , m_backtrackLabel(label) - , m_backtrackToLabel(0) - , m_subDataLabelPtr(0) - , m_nextBacktrack(0) - , m_backtrackSourceLabel(0) - , m_backtrackSourceJumps(0) - { - } - - void clear(bool doDataLabelClear = true) - { - m_backtrackType = NoBacktrack; - if (doDataLabelClear) - clearDataLabel(); - m_nextBacktrack = 0; - } - - void clearDataLabel() - { - m_dataLabelPtr = DataLabelPtr(); - } - - bool hasDestination() - { - return (m_backtrackType != NoBacktrack); - } - - bool isStackOffset() - { - return (m_backtrackType == BacktrackStackOffset); - } - - bool isLabel() - { - return (m_backtrackType == BacktrackLabel); - } - - bool isJumpList() - { - return (m_backtrackType == BacktrackJumpList); - } - - bool hasDataLabel() - { - return m_dataLabelPtr.isSet(); - } - - void copyTarget(BacktrackDestination& rhs, bool copyDataLabel = true) - { - m_backtrackType = rhs.m_backtrackType; - if (m_backtrackType == BacktrackStackOffset) - m_backtrackStackOffset = rhs.m_backtrackStackOffset; - else if (m_backtrackType == BacktrackLabel) - m_backtrackLabel = rhs.m_backtrackLabel; - if (copyDataLabel) - m_dataLabelPtr = rhs.m_dataLabelPtr; - m_backtrackSourceJumps = rhs.m_backtrackSourceJumps; - m_backtrackSourceLabel = rhs.m_backtrackSourceLabel; - } - - void copyTo(BacktrackDestination& lhs) - { - lhs.m_backtrackType = m_backtrackType; - if (m_backtrackType == BacktrackStackOffset) - lhs.m_backtrackStackOffset = m_backtrackStackOffset; - else if (m_backtrackType == BacktrackLabel) - lhs.m_backtrackLabel = m_backtrackLabel; - lhs.m_backtrackSourceJumps = m_backtrackSourceJumps; - lhs.m_backtrackSourceLabel = m_backtrackSourceLabel; - lhs.m_dataLabelPtr = m_dataLabelPtr; - lhs.m_backTrackJumps = m_backTrackJumps; - } - - void addBacktrackJump(Jump jump) - { - m_backTrackJumps.append(jump); - } - - void setStackOffset(int32_t stackOffset) - { - m_backtrackType = BacktrackStackOffset; - m_backtrackStackOffset = stackOffset; - } - - void setLabel(Label label) - { - m_backtrackType = BacktrackLabel; - m_backtrackLabel = label; - } - - void setNextBacktrackLabel(Label label) - { - if (m_nextBacktrack) - m_nextBacktrack->setLabel(label); - } - - void copyBacktrackToLabel(BacktrackDestination& rhs) - { - if (rhs.m_backtrackToLabel) - m_backtrackToLabel = rhs.m_backtrackToLabel; - } - - void setBacktrackToLabel(Label* backtrackToLabel) - { - if (!m_backtrackToLabel) - m_backtrackToLabel = backtrackToLabel; - } - - void setBacktrackJumpList(JumpList* jumpList) - { - m_backtrackType = BacktrackJumpList; - m_backtrackSourceJumps = jumpList; - } - - void setBacktrackSourceLabel(Label* backtrackSourceLabel) - { - m_backtrackSourceLabel = backtrackSourceLabel; - } - - void setDataLabel(DataLabelPtr dp) - { - if (m_subDataLabelPtr) { - *m_subDataLabelPtr = dp; - m_subDataLabelPtr = 0; - } else - m_dataLabelPtr = dp; - } - - void setSubDataLabelPtr(DataLabelPtr* subDataLabelPtr) - { - m_subDataLabelPtr = subDataLabelPtr; - } - - void linkToNextBacktrack(BacktrackDestination* nextBacktrack) - { - m_nextBacktrack = nextBacktrack; - } - - int32_t getStackOffset() - { - ASSERT(m_backtrackType == BacktrackStackOffset); - return m_backtrackStackOffset; - } - - Label getLabel() - { - ASSERT(m_backtrackType == BacktrackLabel); - return m_backtrackLabel; - } - - JumpList& getBacktrackJumps() - { - return m_backTrackJumps; - } - - DataLabelPtr& getDataLabel() - { - return m_dataLabelPtr; - } - - void jumpToBacktrack(MacroAssembler* masm) - { - if (isJumpList()) { - if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) - masm->jump().linkTo(*m_backtrackSourceLabel, masm); - else - m_backtrackSourceJumps->append(masm->jump()); - } else if (isStackOffset()) - masm->jump(Address(stackPointerRegister, m_backtrackStackOffset)); - else if (isLabel()) - masm->jump().linkTo(m_backtrackLabel, masm); - else - m_backTrackJumps.append(masm->jump()); - } - - void jumpToBacktrack(RegexGenerator* generator, Jump jump) - { - if (isJumpList()) { - if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) - jump.linkTo(*m_backtrackSourceLabel, generator); - else - m_backtrackSourceJumps->append(jump); - } else if (isStackOffset()) - generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jump); - else if (isLabel()) - jump.linkTo(getLabel(), generator); - else - m_backTrackJumps.append(jump); - } - - void jumpToBacktrack(RegexGenerator* generator, JumpList& jumps) - { - if (isJumpList()) { - if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) - jumps.linkTo(*m_backtrackSourceLabel, generator); - else - m_backtrackSourceJumps->append(jumps); - } else if (isStackOffset()) - generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jumps); - else if (isLabel()) - jumps.linkTo(getLabel(), generator); - else - m_backTrackJumps.append(jumps); - } - - bool linkDataLabelToHereIfExists(RegexGenerator* generator) - { - if (hasDataLabel()) { - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), generator->label())); - clearDataLabel(); - return true; - } - - return false; - } - - bool plantJumpToBacktrackIfExists(RegexGenerator* generator) - { - if (isJumpList()) { - if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) - generator->jump(*m_backtrackSourceLabel); - else - m_backtrackSourceJumps->append(generator->jump()); - - return true; - } - - if (isStackOffset()) { - generator->jump(Address(stackPointerRegister, getStackOffset())); - return true; - } - - if (isLabel()) { - generator->jump(getLabel()); - if (hasDataLabel()) { - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), getLabel())); - clearDataLabel(); - } - return true; - } - - return false; - } - - void linkAlternativeBacktracks(RegexGenerator* generator, bool nextIteration = false) - { - Label hereLabel = generator->label(); - - if (m_backtrackToLabel) { - *m_backtrackToLabel = hereLabel; - m_backtrackToLabel = 0; - } - - m_backTrackJumps.link(generator); - - if (nextIteration) - generator->m_expressionState.linkToNextIteration(hereLabel); - - if (hasDataLabel()) { - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), hereLabel)); - // data label cleared as a result of the clear() below - } - - clear(); - } - - void linkAlternativeBacktracksTo(RegexGenerator* generator, Label label, bool nextIteration = false) - { - m_backTrackJumps.linkTo(label, generator); - - if (nextIteration) - generator->m_expressionState.linkToNextIteration(label); - - if (hasDataLabel()) { - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), label)); - clearDataLabel(); - } - } - - private: - BacktrackType m_backtrackType; - int32_t m_backtrackStackOffset; - Label m_backtrackLabel; - DataLabelPtr m_dataLabelPtr; - Label* m_backtrackToLabel; - DataLabelPtr* m_subDataLabelPtr; - BacktrackDestination* m_nextBacktrack; - Label* m_backtrackSourceLabel; - JumpList* m_backtrackSourceJumps; - JumpList m_backTrackJumps; - }; - - struct TermGenerationState { - TermGenerationState(PatternDisjunction* disjunction, unsigned checkedTotal) - : disjunction(disjunction) - , checkedTotal(checkedTotal) - , m_subParenNum(0) - , m_linkedBacktrack(0) - , m_parenthesesTail(0) - { - } - - void resetAlternative() - { - m_backtrack.clear(); - alt = 0; - } - bool alternativeValid() - { - return alt < disjunction->m_alternatives.size(); - } - void nextAlternative() - { - ++alt; - } - PatternAlternative* alternative() - { - return disjunction->m_alternatives[alt]; - } - bool isLastAlternative() - { - return (alt + 1) == disjunction->m_alternatives.size(); - } - - void resetTerm() - { - ASSERT(alternativeValid()); - t = 0; - m_subParenNum = 0; - } - bool termValid() - { - ASSERT(alternativeValid()); - return t < alternative()->m_terms.size(); - } - void nextTerm() - { - ASSERT(alternativeValid()); - ++t; - } - PatternTerm& term() - { - ASSERT(alternativeValid()); - return alternative()->m_terms[t]; - } - bool isLastTerm() - { - ASSERT(alternativeValid()); - return (t + 1) == alternative()->m_terms.size(); - } - unsigned getSubParenNum() - { - return m_subParenNum++; - } - bool isMainDisjunction() - { - return !disjunction->m_parent; - } - - void setParenthesesTail(ParenthesesTail* parenthesesTail) - { - m_parenthesesTail = parenthesesTail; - } - - ParenthesesTail* getParenthesesTail() - { - return m_parenthesesTail; - } - - PatternTerm& lookaheadTerm() - { - ASSERT(alternativeValid()); - ASSERT((t + 1) < alternative()->m_terms.size()); - return alternative()->m_terms[t + 1]; - } - bool isSinglePatternCharacterLookaheadTerm() - { - ASSERT(alternativeValid()); - return ((t + 1) < alternative()->m_terms.size()) - && (lookaheadTerm().type == PatternTerm::TypePatternCharacter) - && (lookaheadTerm().quantityType == QuantifierFixedCount) - && (lookaheadTerm().quantityCount == 1); - } - - int inputOffset() - { - return term().inputPosition - checkedTotal; - } - - void clearBacktrack() - { - m_backtrack.clear(false); - m_linkedBacktrack = 0; - } - - void jumpToBacktrack(MacroAssembler* masm) - { - m_backtrack.jumpToBacktrack(masm); - } - - void jumpToBacktrack(RegexGenerator* generator, Jump jump) - { - m_backtrack.jumpToBacktrack(generator, jump); - } - - void jumpToBacktrack(RegexGenerator* generator, JumpList& jumps) - { - m_backtrack.jumpToBacktrack(generator, jumps); - } - - bool plantJumpToBacktrackIfExists(RegexGenerator* generator) - { - return m_backtrack.plantJumpToBacktrackIfExists(generator); - } - - bool linkDataLabelToBacktrackIfExists(RegexGenerator* generator) - { - if ((m_backtrack.isLabel()) && (m_backtrack.hasDataLabel())) { - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_backtrack.getDataLabel(), m_backtrack.getLabel())); - m_backtrack.clearDataLabel(); - return true; - } - - return false; - } - - void addBacktrackJump(Jump jump) - { - m_backtrack.addBacktrackJump(jump); - } - - void setBacktrackDataLabel(DataLabelPtr dp) - { - m_backtrack.setDataLabel(dp); - } - - void setBackTrackStackOffset(int32_t stackOffset) - { - m_backtrack.setStackOffset(stackOffset); - } - - void setBacktrackLabel(Label label) - { - m_backtrack.setLabel(label); - } - - void linkAlternativeBacktracks(RegexGenerator* generator, bool nextIteration = false) - { - m_backtrack.linkAlternativeBacktracks(generator, nextIteration); - m_linkedBacktrack = 0; - } - - void linkAlternativeBacktracksTo(RegexGenerator* generator, Label label, bool nextIteration = false) - { - m_backtrack.linkAlternativeBacktracksTo(generator, label, nextIteration); - } - - void setBacktrackLink(BacktrackDestination* linkedBacktrack) - { - m_linkedBacktrack = linkedBacktrack; - } - - void chainBacktracks(BacktrackDestination* followonBacktrack) - { - if (m_linkedBacktrack) - m_linkedBacktrack->linkToNextBacktrack(followonBacktrack); - } - - void chainBacktrackJumps(JumpList* jumpList) - { - if (m_linkedBacktrack && !(m_linkedBacktrack->hasDestination())) - m_linkedBacktrack->setBacktrackJumpList(jumpList); - } - - BacktrackDestination& getBacktrackDestination() - { - return m_backtrack; - } - - void propagateBacktrackingFrom(RegexGenerator* generator, BacktrackDestination& backtrack, bool doJump = true) - { - if (doJump) - m_backtrack.jumpToBacktrack(generator, backtrack.getBacktrackJumps()); - if (backtrack.hasDestination()) { - if (m_backtrack.hasDataLabel()) - generator->m_expressionState.addDataLabelToNextIteration(m_backtrack.getDataLabel()); - - m_backtrack.copyTarget(backtrack, doJump); - } - } - - PatternDisjunction* disjunction; - int checkedTotal; - private: - unsigned alt; - unsigned t; - unsigned m_subParenNum; - BacktrackDestination m_backtrack; - BacktrackDestination* m_linkedBacktrack; - ParenthesesTail* m_parenthesesTail; - - }; - - struct ParenthesesTail { - ParenthesesTail(PatternTerm& term, int nestingLevel, ParenthesesTail* nextOuterParenTail) - : m_term(term) - , m_nestingLevel(nestingLevel) - , m_subParenIndex(0) - , m_nextOuterParenTail(nextOuterParenTail) - { - } - - void processBacktracks(RegexGenerator* generator, TermGenerationState& state, TermGenerationState& parenthesesState, Label nonGreedyTryParentheses, Label fallThrough) - { - m_nonGreedyTryParentheses = nonGreedyTryParentheses; - m_fallThrough = fallThrough; - - m_subParenIndex = state.getSubParenNum(); - parenthesesState.getBacktrackDestination().copyTo(m_parenBacktrack); - state.chainBacktracks(&m_backtrack); - BacktrackDestination& stateBacktrack = state.getBacktrackDestination(); - stateBacktrack.copyTo(m_backtrack); - stateBacktrack.setBacktrackToLabel(&m_backtrackToLabel); - state.setBacktrackLink(&m_backtrack); - stateBacktrack.setSubDataLabelPtr(&m_dataAfterLabelPtr); - - m_doDirectBacktrack = m_parenBacktrack.hasDestination(); - - if ((m_term.quantityType == QuantifierGreedy) || (m_term.quantityType == QuantifierNonGreedy)) - m_doDirectBacktrack = false; - - if (m_doDirectBacktrack) - state.propagateBacktrackingFrom(generator, m_parenBacktrack, false); - else { - stateBacktrack.setBacktrackJumpList(&m_pattBacktrackJumps); - stateBacktrack.setBacktrackSourceLabel(&m_backtrackFromAfterParens); - } - - parenthesesState.chainBacktrackJumps(&m_pattBacktrackJumps); - } - - void setNextIteration(Label nextIteration) - { - if (!m_nestingLevel && !m_backtrackToLabel.isSet()) - m_backtrackToLabel = nextIteration; - } - - void addAfterParenJump(Jump jump) - { - m_pattBacktrackJumps.append(jump); - } - - bool generateCode(RegexGenerator* generator, JumpList& jumpsToNext, bool priorBackTrackFallThrough, bool nextBacktrackFallThrough) - { - const RegisterID indexTemporary = regT0; - unsigned parenthesesFrameLocation = m_term.frameLocation; - Jump fromPriorBacktrack; - bool needJumpForPriorParenTail = false; - - if (priorBackTrackFallThrough - && ((m_term.quantityType == QuantifierGreedy) - || (m_term.quantityType == QuantifierNonGreedy) - || (!m_doDirectBacktrack && m_parenBacktrack.hasDestination()))) { - // If the prior paren tail code assumed that it could fall through, - // but we need to generate after paren backtrack code, then provide - // a jump around that code for the prior paren tail code. - // A regular expressing like ((xxx)...)? needs this. - fromPriorBacktrack = generator->jump(); - needJumpForPriorParenTail = true; - } - - if (!m_backtrack.hasDestination()) { - if (m_backtrackToLabel.isSet()) { - m_backtrack.setLabel(m_backtrackToLabel); - nextBacktrackFallThrough = false; - } else if (!m_subParenIndex && m_nextOuterParenTail) { - // If we don't have a destination and we are the first term of a nested paren, go - // back to the outer paren. - // There is an optimization if the next outer paren is the next paren to be emitted. - // In that case we really want the else clause. - m_backtrack.setBacktrackJumpList(&m_nextOuterParenTail->m_withinBacktrackJumps); - nextBacktrackFallThrough = false; - } else - m_backtrack.setBacktrackJumpList(&jumpsToNext); - } else - nextBacktrackFallThrough = false; - - // A failure AFTER the parens jumps here - Backtrack to this paren - m_backtrackFromAfterParens = generator->label(); - - if (m_dataAfterLabelPtr.isSet()) - generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataAfterLabelPtr, m_backtrackFromAfterParens)); - - m_pattBacktrackJumps.link(generator); - - if (m_term.quantityType == QuantifierGreedy) { - // If this is -1 we have now tested with both with and without the parens. - generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); - m_backtrack.jumpToBacktrack(generator, generator->branch32(Equal, indexTemporary, Imm32(-1))); - } else if (m_term.quantityType == QuantifierNonGreedy) { - // If this is -1 we have now tested with both with and without the parens. - generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); - generator->branch32(Equal, indexTemporary, Imm32(-1)).linkTo(m_nonGreedyTryParentheses, generator); - } - - if (!m_doDirectBacktrack) - m_parenBacktrack.plantJumpToBacktrackIfExists(generator); - - // A failure WITHIN the parens jumps here - if (needJumpForPriorParenTail) - fromPriorBacktrack.link(generator); - m_parenBacktrack.linkAlternativeBacktracks(generator); - m_withinBacktrackJumps.link(generator); - - if (m_term.capture()) - generator->store32(Imm32(-1), Address(output, (m_term.parentheses.subpatternId << 1) * sizeof(int))); - - if (m_term.quantityType == QuantifierGreedy) { - generator->storeToFrame(Imm32(-1), parenthesesFrameLocation); - generator->jump().linkTo(m_fallThrough, generator); - nextBacktrackFallThrough = false; - } else if (!nextBacktrackFallThrough) - m_backtrack.jumpToBacktrack(generator); - - if (!m_doDirectBacktrack) - m_backtrack.setNextBacktrackLabel(m_backtrackFromAfterParens); - - return nextBacktrackFallThrough; - } - - PatternTerm& m_term; - int m_nestingLevel; - unsigned m_subParenIndex; - ParenthesesTail* m_nextOuterParenTail; - Label m_nonGreedyTryParentheses; - Label m_fallThrough; - Label m_backtrackToLabel; - Label m_backtrackFromAfterParens; - DataLabelPtr m_dataAfterLabelPtr; - JumpList m_pattBacktrackJumps; - JumpList m_withinBacktrackJumps; - BacktrackDestination m_parenBacktrack; - BacktrackDestination m_backtrack; - bool m_doDirectBacktrack; - }; - - void generateAssertionBOL(TermGenerationState& state) - { - PatternTerm& term = state.term(); - - if (m_pattern.m_multiline) { - const RegisterID character = regT0; - - JumpList matchDest; - if (!term.inputPosition) - matchDest.append(branch32(Equal, index, Imm32(state.checkedTotal))); - - readCharacter(state.inputOffset() - 1, character); - matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); - state.jumpToBacktrack(this); - - matchDest.link(this); - } else { - // Erk, really should poison out these alternatives early. :-/ - if (term.inputPosition) - state.jumpToBacktrack(this); - else - state.jumpToBacktrack(this, branch32(NotEqual, index, Imm32(state.checkedTotal))); - } - } - - void generateAssertionEOL(TermGenerationState& state) - { - PatternTerm& term = state.term(); - - if (m_pattern.m_multiline) { - const RegisterID character = regT0; - - JumpList matchDest; - if (term.inputPosition == state.checkedTotal) - matchDest.append(atEndOfInput()); - - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); - state.jumpToBacktrack(this); - - matchDest.link(this); - } else { - if (term.inputPosition == state.checkedTotal) - state.jumpToBacktrack(this, notAtEndOfInput()); - // Erk, really should poison out these alternatives early. :-/ - else - state.jumpToBacktrack(this); - } - } - - // Also falls though on nextIsNotWordChar. - void matchAssertionWordchar(TermGenerationState& state, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) - { - const RegisterID character = regT0; - PatternTerm& term = state.term(); - - if (term.inputPosition == state.checkedTotal) - nextIsNotWordChar.append(atEndOfInput()); - - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); - } - - void generateAssertionWordBoundary(TermGenerationState& state) - { - const RegisterID character = regT0; - PatternTerm& term = state.term(); - - Jump atBegin; - JumpList matchDest; - if (!term.inputPosition) - atBegin = branch32(Equal, index, Imm32(state.checkedTotal)); - readCharacter(state.inputOffset() - 1, character); - matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); - if (!term.inputPosition) - atBegin.link(this); - - // We fall through to here if the last character was not a wordchar. - JumpList nonWordCharThenWordChar; - JumpList nonWordCharThenNonWordChar; - if (term.invert()) { - matchAssertionWordchar(state, nonWordCharThenNonWordChar, nonWordCharThenWordChar); - nonWordCharThenWordChar.append(jump()); - } else { - matchAssertionWordchar(state, nonWordCharThenWordChar, nonWordCharThenNonWordChar); - nonWordCharThenNonWordChar.append(jump()); - } - state.jumpToBacktrack(this, nonWordCharThenNonWordChar); - - // We jump here if the last character was a wordchar. - matchDest.link(this); - JumpList wordCharThenWordChar; - JumpList wordCharThenNonWordChar; - if (term.invert()) { - matchAssertionWordchar(state, wordCharThenNonWordChar, wordCharThenWordChar); - wordCharThenWordChar.append(jump()); - } else { - matchAssertionWordchar(state, wordCharThenWordChar, wordCharThenNonWordChar); - // This can fall-though! - } - - state.jumpToBacktrack(this, wordCharThenWordChar); - - nonWordCharThenWordChar.link(this); - wordCharThenNonWordChar.link(this); - } - - void generatePatternCharacterSingle(TermGenerationState& state) - { - const RegisterID character = regT0; - UChar ch = state.term().patternCharacter; - - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - readCharacter(state.inputOffset(), character); - or32(Imm32(32), character); - state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); - } else { - ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); - state.jumpToBacktrack(this, jumpIfCharNotEquals(ch, state.inputOffset())); - } - } - - void generatePatternCharacterPair(TermGenerationState& state) - { - const RegisterID character = regT0; - UChar ch1 = state.term().patternCharacter; - UChar ch2 = state.lookaheadTerm().patternCharacter; - - int mask = 0; - int chPair = ch1 | (ch2 << 16); - - if (m_pattern.m_ignoreCase) { - if (isASCIIAlpha(ch1)) - mask |= 32; - if (isASCIIAlpha(ch2)) - mask |= 32 << 16; - } - - if (mask) { - load32WithUnalignedHalfWords(BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), character); - or32(Imm32(mask), character); - state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(chPair | mask))); - } else - state.jumpToBacktrack(this, branch32WithUnalignedHalfWords(NotEqual, BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), Imm32(chPair))); - } - - void generatePatternCharacterFixed(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - UChar ch = term.patternCharacter; - - move(index, countRegister); - sub32(Imm32(term.quantityCount), countRegister); - - Label loop(this); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); - or32(Imm32(32), character); - state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); - } else { - ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); - state.jumpToBacktrack(this, branch16(NotEqual, BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), Imm32(ch))); - } - add32(Imm32(1), countRegister); - branch32(NotEqual, countRegister, index).linkTo(loop, this); - } - - void generatePatternCharacterGreedy(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - UChar ch = term.patternCharacter; - - move(Imm32(0), countRegister); - - JumpList failures; - Label loop(this); - failures.append(atEndOfInput()); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - readCharacter(state.inputOffset(), character); - or32(Imm32(32), character); - failures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); - } else { - ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); - failures.append(jumpIfCharNotEquals(ch, state.inputOffset())); - } - - add32(Imm32(1), countRegister); - add32(Imm32(1), index); - if (term.quantityCount != quantifyInfinite) { - branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); - failures.append(jump()); - } else - jump(loop); - - Label backtrackBegin(this); - loadFromFrame(term.frameLocation, countRegister); - state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); - sub32(Imm32(1), countRegister); - sub32(Imm32(1), index); - - failures.link(this); - - storeToFrame(countRegister, term.frameLocation); - - state.setBacktrackLabel(backtrackBegin); - } - - void generatePatternCharacterNonGreedy(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - UChar ch = term.patternCharacter; - - move(Imm32(0), countRegister); - - Jump firstTimeDoNothing = jump(); - - Label hardFail(this); - sub32(countRegister, index); - state.jumpToBacktrack(this); - - Label backtrackBegin(this); - loadFromFrame(term.frameLocation, countRegister); - - atEndOfInput().linkTo(hardFail, this); - if (term.quantityCount != quantifyInfinite) - branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - readCharacter(state.inputOffset(), character); - or32(Imm32(32), character); - branch32(NotEqual, character, Imm32(Unicode::toLower(ch))).linkTo(hardFail, this); - } else { - ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); - jumpIfCharNotEquals(ch, state.inputOffset()).linkTo(hardFail, this); - } - - add32(Imm32(1), countRegister); - add32(Imm32(1), index); - - firstTimeDoNothing.link(this); - storeToFrame(countRegister, term.frameLocation); - - state.setBacktrackLabel(backtrackBegin); - } - - void generateCharacterClassSingle(TermGenerationState& state) - { - const RegisterID character = regT0; - PatternTerm& term = state.term(); - - JumpList matchDest; - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, matchDest, term.characterClass); - - if (term.invert()) - state.jumpToBacktrack(this, matchDest); - else { - state.jumpToBacktrack(this); - matchDest.link(this); - } - } - - void generateCharacterClassFixed(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - - move(index, countRegister); - sub32(Imm32(term.quantityCount), countRegister); - - Label loop(this); - JumpList matchDest; - load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); - matchCharacterClass(character, matchDest, term.characterClass); - - if (term.invert()) - state.jumpToBacktrack(this, matchDest); - else { - state.jumpToBacktrack(this); - matchDest.link(this); - } - - add32(Imm32(1), countRegister); - branch32(NotEqual, countRegister, index).linkTo(loop, this); - } - - void generateCharacterClassGreedy(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - - move(Imm32(0), countRegister); - - JumpList failures; - Label loop(this); - failures.append(atEndOfInput()); - - if (term.invert()) { - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, failures, term.characterClass); - } else { - JumpList matchDest; - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, matchDest, term.characterClass); - failures.append(jump()); - matchDest.link(this); - } - - add32(Imm32(1), countRegister); - add32(Imm32(1), index); - if (term.quantityCount != quantifyInfinite) { - branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); - failures.append(jump()); - } else - jump(loop); - - Label backtrackBegin(this); - loadFromFrame(term.frameLocation, countRegister); - state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); - sub32(Imm32(1), countRegister); - sub32(Imm32(1), index); - - failures.link(this); - - storeToFrame(countRegister, term.frameLocation); - - state.setBacktrackLabel(backtrackBegin); - } - - void generateCharacterClassNonGreedy(TermGenerationState& state) - { - const RegisterID character = regT0; - const RegisterID countRegister = regT1; - PatternTerm& term = state.term(); - - move(Imm32(0), countRegister); - - Jump firstTimeDoNothing = jump(); - - Label hardFail(this); - sub32(countRegister, index); - state.jumpToBacktrack(this); - - Label backtrackBegin(this); - loadFromFrame(term.frameLocation, countRegister); - - atEndOfInput().linkTo(hardFail, this); - branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); - - JumpList matchDest; - readCharacter(state.inputOffset(), character); - matchCharacterClass(character, matchDest, term.characterClass); - - if (term.invert()) - matchDest.linkTo(hardFail, this); - else { - jump(hardFail); - matchDest.link(this); - } - - add32(Imm32(1), countRegister); - add32(Imm32(1), index); - - firstTimeDoNothing.link(this); - storeToFrame(countRegister, term.frameLocation); - - state.setBacktrackLabel(backtrackBegin); - } - - void generateParenthesesDisjunction(PatternTerm& parenthesesTerm, TermGenerationState& state, unsigned alternativeFrameLocation) - { - ASSERT((parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern) || (parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion)); - ASSERT(parenthesesTerm.quantityCount == 1); - - PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; - unsigned preCheckedCount = ((parenthesesTerm.quantityType == QuantifierFixedCount) && (parenthesesTerm.type != PatternTerm::TypeParentheticalAssertion)) ? disjunction->m_minimumSize : 0; - - if (disjunction->m_alternatives.size() == 1) { - state.resetAlternative(); - ASSERT(state.alternativeValid()); - PatternAlternative* alternative = state.alternative(); - optimizeAlternative(alternative); - - int countToCheck = alternative->m_minimumSize - preCheckedCount; - if (countToCheck) { - ASSERT((parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion) || (parenthesesTerm.quantityType != QuantifierFixedCount)); - - // FIXME: This is quite horrible. The call to 'plantJumpToBacktrackIfExists' - // will be forced to always trampoline into here, just to decrement the index. - // Ick. - Jump skip = jump(); - - Label backtrackBegin(this); - sub32(Imm32(countToCheck), index); - state.addBacktrackJump(jump()); - - skip.link(this); - - state.setBacktrackLabel(backtrackBegin); - - state.jumpToBacktrack(this, jumpIfNoAvailableInput(countToCheck)); - state.checkedTotal += countToCheck; - } - - for (state.resetTerm(); state.termValid(); state.nextTerm()) - generateTerm(state); - - state.checkedTotal -= countToCheck; - } else { - JumpList successes; - bool propogateBacktrack = false; - - for (state.resetAlternative(); state.alternativeValid(); state.nextAlternative()) { - - PatternAlternative* alternative = state.alternative(); - optimizeAlternative(alternative); - - ASSERT(alternative->m_minimumSize >= preCheckedCount); - int countToCheck = alternative->m_minimumSize - preCheckedCount; - if (countToCheck) { - state.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); - state.checkedTotal += countToCheck; - } - - for (state.resetTerm(); state.termValid(); state.nextTerm()) - generateTerm(state); - - // Matched an alternative. - DataLabelPtr dataLabel = storeToFrameWithPatch(alternativeFrameLocation); - - if (!state.isLastAlternative() || countToCheck) - successes.append(jump()); - - // Alternative did not match. - - state.setBacktrackDataLabel(dataLabel); - - // Do we have a backtrack destination? - // if so, link the data label to it. - state.linkDataLabelToBacktrackIfExists(this); - - if (!state.isLastAlternative() || countToCheck) - state.linkAlternativeBacktracks(this); - - if (countToCheck) { - sub32(Imm32(countToCheck), index); - state.checkedTotal -= countToCheck; - } else if (state.isLastAlternative()) - propogateBacktrack = true; - } - // We fall through to here when the last alternative fails. - // Add a backtrack out of here for the parenthese handling code to link up. - if (!propogateBacktrack) - state.addBacktrackJump(jump()); - - // Save address on stack for the parens code to backtrack to, to retry the - // next alternative. - state.setBackTrackStackOffset(alternativeFrameLocation * sizeof(void*)); - - successes.link(this); - } - } - - void generateParenthesesSingle(TermGenerationState& state) - { - const RegisterID indexTemporary = regT0; - PatternTerm& term = state.term(); - PatternDisjunction* disjunction = term.parentheses.disjunction; - ASSERT(term.quantityCount == 1); - - unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0; - - unsigned parenthesesFrameLocation = term.frameLocation; - unsigned alternativeFrameLocation = parenthesesFrameLocation; - if (term.quantityType != QuantifierFixedCount) - alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; - - // optimized case - no capture & no quantifier can be handled in a light-weight manner. - if (!term.capture() && (term.quantityType == QuantifierFixedCount)) { - m_expressionState.incrementParenNestingLevel(); - - TermGenerationState parenthesesState(disjunction, state.checkedTotal); - generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - // this expects that any backtracks back out of the parentheses will be in the - // parenthesesState's m_backTrackJumps vector, and that if they need backtracking - // they will have set an entry point on the parenthesesState's m_backtrackLabel. - BacktrackDestination& parenthesesBacktrack = parenthesesState.getBacktrackDestination(); - state.propagateBacktrackingFrom(this, parenthesesBacktrack); - state.getBacktrackDestination().copyBacktrackToLabel(parenthesesBacktrack); - - m_expressionState.decrementParenNestingLevel(); - } else { - Jump nonGreedySkipParentheses; - Label nonGreedyTryParentheses; - if (term.quantityType == QuantifierGreedy) - storeToFrame(index, parenthesesFrameLocation); - else if (term.quantityType == QuantifierNonGreedy) { - storeToFrame(Imm32(-1), parenthesesFrameLocation); - nonGreedySkipParentheses = jump(); - nonGreedyTryParentheses = label(); - storeToFrame(index, parenthesesFrameLocation); - } - - // store the match start index - if (term.capture()) { - int inputOffset = state.inputOffset() - preCheckedCount; - if (inputOffset) { - move(index, indexTemporary); - add32(Imm32(inputOffset), indexTemporary); - store32(indexTemporary, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); - } else - store32(index, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); - } - - ParenthesesTail* parenthesesTail = m_expressionState.addParenthesesTail(term, state.getParenthesesTail()); - - m_expressionState.incrementParenNestingLevel(); - - TermGenerationState parenthesesState(disjunction, state.checkedTotal); - - // Save the parenthesesTail for backtracking from nested parens to this one. - parenthesesState.setParenthesesTail(parenthesesTail); - - // generate the body of the parentheses - generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - - // For non-fixed counts, backtrack if we didn't match anything. - if (term.quantityType != QuantifierFixedCount) - parenthesesTail->addAfterParenJump(branch32(Equal, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*))))); - - // store the match end index - if (term.capture()) { - int inputOffset = state.inputOffset(); - if (inputOffset) { - move(index, indexTemporary); - add32(Imm32(state.inputOffset()), indexTemporary); - store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } else - store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); - } - - m_expressionState.decrementParenNestingLevel(); - - parenthesesTail->processBacktracks(this, state, parenthesesState, nonGreedyTryParentheses, label()); - - parenthesesState.getBacktrackDestination().clear(); - - if (term.quantityType == QuantifierNonGreedy) - nonGreedySkipParentheses.link(this); - } - } - - void generateParenthesesGreedyNoBacktrack(TermGenerationState& state) - { - PatternTerm& parenthesesTerm = state.term(); - PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; - ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern); - ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle. - - TermGenerationState parenthesesState(disjunction, state.checkedTotal); - - Label matchAgain(this); - - storeToFrame(index, parenthesesTerm.frameLocation); // Save the current index to check for zero len matches later. - - for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) { - - PatternAlternative* alternative = parenthesesState.alternative(); - optimizeAlternative(alternative); - - int countToCheck = alternative->m_minimumSize; - if (countToCheck) { - parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); - parenthesesState.checkedTotal += countToCheck; - } - - for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm()) - generateTerm(parenthesesState); - - // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet. - branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); - - // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack, - // or fall through to try the next alternative if no backtrack is available. - parenthesesState.plantJumpToBacktrackIfExists(this); - - parenthesesState.linkAlternativeBacktracks(this); - - // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop. - - if (countToCheck) { - sub32(Imm32(countToCheck), index); - parenthesesState.checkedTotal -= countToCheck; - } - } - - // If the last alternative falls through to here, we have a failed match... - // Which means that we match whatever we have matched up to this point (even if nothing). - } - - void generateParentheticalAssertion(TermGenerationState& state) - { - PatternTerm& term = state.term(); - PatternDisjunction* disjunction = term.parentheses.disjunction; - ASSERT(term.quantityCount == 1); - ASSERT(term.quantityType == QuantifierFixedCount); - - unsigned parenthesesFrameLocation = term.frameLocation; - unsigned alternativeFrameLocation = parenthesesFrameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; - - int countCheckedAfterAssertion = state.checkedTotal - term.inputPosition; - - if (term.invert()) { - // Inverted case - storeToFrame(index, parenthesesFrameLocation); - - state.checkedTotal -= countCheckedAfterAssertion; - if (countCheckedAfterAssertion) - sub32(Imm32(countCheckedAfterAssertion), index); - - TermGenerationState parenthesesState(disjunction, state.checkedTotal); - generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - // Success! - which means - Fail! - loadFromFrame(parenthesesFrameLocation, index); - state.jumpToBacktrack(this); - - // And fail means success. - parenthesesState.linkAlternativeBacktracks(this); - - loadFromFrame(parenthesesFrameLocation, index); - - state.checkedTotal += countCheckedAfterAssertion; - } else { - // Normal case - storeToFrame(index, parenthesesFrameLocation); - - state.checkedTotal -= countCheckedAfterAssertion; - if (countCheckedAfterAssertion) - sub32(Imm32(countCheckedAfterAssertion), index); - - TermGenerationState parenthesesState(disjunction, state.checkedTotal); - generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); - // Success! - which means - Success! - loadFromFrame(parenthesesFrameLocation, index); - Jump success = jump(); - - parenthesesState.linkAlternativeBacktracks(this); - - loadFromFrame(parenthesesFrameLocation, index); - state.jumpToBacktrack(this); - - success.link(this); - - state.checkedTotal += countCheckedAfterAssertion; - } - } - - void generateTerm(TermGenerationState& state) - { - PatternTerm& term = state.term(); - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - generateAssertionBOL(state); - break; - - case PatternTerm::TypeAssertionEOL: - generateAssertionEOL(state); - break; - - case PatternTerm::TypeAssertionWordBoundary: - generateAssertionWordBoundary(state); - break; - - case PatternTerm::TypePatternCharacter: - switch (term.quantityType) { - case QuantifierFixedCount: - if (term.quantityCount == 1) { - if (state.isSinglePatternCharacterLookaheadTerm() && (state.lookaheadTerm().inputPosition == (term.inputPosition + 1))) { - generatePatternCharacterPair(state); - state.nextTerm(); - } else - generatePatternCharacterSingle(state); - } else - generatePatternCharacterFixed(state); - break; - case QuantifierGreedy: - generatePatternCharacterGreedy(state); - break; - case QuantifierNonGreedy: - generatePatternCharacterNonGreedy(state); - break; - } - break; - - case PatternTerm::TypeCharacterClass: - switch (term.quantityType) { - case QuantifierFixedCount: - if (term.quantityCount == 1) - generateCharacterClassSingle(state); - else - generateCharacterClassFixed(state); - break; - case QuantifierGreedy: - generateCharacterClassGreedy(state); - break; - case QuantifierNonGreedy: - generateCharacterClassNonGreedy(state); - break; - } - break; - - case PatternTerm::TypeBackReference: - m_shouldFallBack = true; - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypeParenthesesSubpattern: - if (term.quantityCount == 1 && !term.parentheses.isCopy) - generateParenthesesSingle(state); - else if (term.parentheses.isTerminal) - generateParenthesesGreedyNoBacktrack(state); - else - m_shouldFallBack = true; - break; - - case PatternTerm::TypeParentheticalAssertion: - generateParentheticalAssertion(state); - break; - } - } - - void generateDisjunction(PatternDisjunction* disjunction) - { - TermGenerationState state(disjunction, 0); - state.resetAlternative(); - - // check availability for the next alternative - int countCheckedForCurrentAlternative = 0; - int countToCheckForFirstAlternative = 0; - bool hasShorterAlternatives = false; - bool setRepeatAlternativeLabels = false; - JumpList notEnoughInputForPreviousAlternative; - Label firstAlternative; - Label firstAlternativeInputChecked; - - // The label 'firstAlternative' is used to plant a check to see if there is - // sufficient input available to run the first repeating alternative. - // The label 'firstAlternativeInputChecked' will jump directly to matching - // the first repeating alternative having skipped this check. - - if (state.alternativeValid()) { - PatternAlternative* alternative = state.alternative(); - if (!alternative->onceThrough()) { - firstAlternative = Label(this); - setRepeatAlternativeLabels = true; - } - countToCheckForFirstAlternative = alternative->m_minimumSize; - state.checkedTotal += countToCheckForFirstAlternative; - if (countToCheckForFirstAlternative) - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); - countCheckedForCurrentAlternative = countToCheckForFirstAlternative; - } - - if (setRepeatAlternativeLabels) - firstAlternativeInputChecked = Label(this); - - while (state.alternativeValid()) { - PatternAlternative* alternative = state.alternative(); - optimizeAlternative(alternative); - - // Track whether any alternatives are shorter than the first one. - if (!alternative->onceThrough()) - hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); - - for (state.resetTerm(); state.termValid(); state.nextTerm()) - generateTerm(state); - - // If we get here, the alternative matched. - if (m_pattern.m_body->m_callFrameSize) - addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - - ASSERT(index != returnRegister); - if (m_pattern.m_body->m_hasFixedSize) { - move(index, returnRegister); - if (alternative->m_minimumSize) - sub32(Imm32(alternative->m_minimumSize), returnRegister); - - store32(returnRegister, output); - } else - load32(Address(output), returnRegister); - - store32(index, Address(output, 4)); - - generateReturn(); - - state.nextAlternative(); - if (alternative->onceThrough() && state.alternativeValid()) - state.clearBacktrack(); - - // if there are any more alternatives, plant the check for input before looping. - if (state.alternativeValid()) { - PatternAlternative* nextAlternative = state.alternative(); - if (!setRepeatAlternativeLabels && !nextAlternative->onceThrough()) { - // We have handled non-repeating alternatives, jump to next iteration - // and loop over repeating alternatives. - state.jumpToBacktrack(this); - - countToCheckForFirstAlternative = nextAlternative->m_minimumSize; - - // If we get here, there the last input checked failed. - notEnoughInputForPreviousAlternative.link(this); - - state.linkAlternativeBacktracks(this); - - // Back up to start the looping alternatives. - if (countCheckedForCurrentAlternative) - sub32(Imm32(countCheckedForCurrentAlternative), index); - - firstAlternative = Label(this); - - state.checkedTotal = countToCheckForFirstAlternative; - if (countToCheckForFirstAlternative) - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); - - countCheckedForCurrentAlternative = countToCheckForFirstAlternative; - - firstAlternativeInputChecked = Label(this); - - setRepeatAlternativeLabels = true; - } else { - int countToCheckForNextAlternative = nextAlternative->m_minimumSize; - - if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. - // If we get here, then the last input checked failed. - notEnoughInputForPreviousAlternative.link(this); - - // Check if sufficent input available to run the next alternative - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); - // We are now in the correct state to enter the next alternative; this add is only required - // to mirror and revert operation of the sub32, just below. - add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); - - // If we get here, then the last input checked passed. - state.linkAlternativeBacktracks(this); - - // No need to check if we can run the next alternative, since it is shorter - - // just update index. - sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); - } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. - // If we get here, then the last input checked failed. - // If there is insufficient input to run the current alternative, and the next alternative is longer, - // then there is definitely not enough input to run it - don't even check. Just adjust index, as if - // we had checked. - notEnoughInputForPreviousAlternative.link(this); - add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); - notEnoughInputForPreviousAlternative.append(jump()); - - // The next alternative is longer than the current one; check the difference. - state.linkAlternativeBacktracks(this); - - notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); - } else { // CASE 3: Both alternatives are the same length. - ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); - - // If the next alterative is the same length as this one, then no need to check the input - - // if there was sufficent input to run the current alternative then there is sufficient - // input to run the next one; if not, there isn't. - state.linkAlternativeBacktracks(this); - } - state.checkedTotal -= countCheckedForCurrentAlternative; - countCheckedForCurrentAlternative = countToCheckForNextAlternative; - state.checkedTotal += countCheckedForCurrentAlternative; - } - } - } - - // If we get here, all Alternatives failed... - - state.checkedTotal -= countCheckedForCurrentAlternative; - - if (!setRepeatAlternativeLabels) { - // If there are no alternatives that need repeating (all are marked 'onceThrough') then just link - // the match failures to this point, and fall through to the return below. - state.linkAlternativeBacktracks(this, true); - - notEnoughInputForPreviousAlternative.link(this); - } else { - // How much more input need there be to be able to retry from the first alternative? - // examples: - // /yarr_jit/ or /wrec|pcre/ - // In these examples we need check for one more input before looping. - // /yarr_jit|pcre/ - // In this case we need check for 5 more input to loop (+4 to allow for the first alterative - // being four longer than the last alternative checked, and another +1 to effectively move - // the start position along by one). - // /yarr|rules/ or /wrec|notsomuch/ - // In these examples, provided that there was sufficient input to have just been matching for - // the second alternative we can loop without checking for available input (since the second - // alternative is longer than the first). In the latter example we need to decrement index - // (by 4) so the start position is only progressed by 1 from the last iteration. - int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; - - // First, deal with the cases where there was sufficient input to try the last alternative. - if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. - state.linkAlternativeBacktracks(this, true); - else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! - state.linkAlternativeBacktracksTo(this, firstAlternativeInputChecked, true); - else { // no need to check the input, but we do have some bookkeeping to do first. - state.linkAlternativeBacktracks(this, true); - - // Where necessary update our preserved start position. - if (!m_pattern.m_body->m_hasFixedSize) { - move(index, regT0); - sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); - store32(regT0, Address(output)); - } - - // Update index if necessary, and loop (without checking). - if (incrementForNextIter) - add32(Imm32(incrementForNextIter), index); - jump().linkTo(firstAlternativeInputChecked, this); - } - - notEnoughInputForPreviousAlternative.link(this); - // Update our idea of the start position, if we're tracking this. - if (!m_pattern.m_body->m_hasFixedSize) { - if (countCheckedForCurrentAlternative - 1) { - move(index, regT0); - sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); - store32(regT0, Address(output)); - } else - store32(index, Address(output)); - } - - // Check if there is sufficent input to run the first alternative again. - jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); - // No - insufficent input to run the first alteranative, are there any other alternatives we - // might need to check? If so, the last check will have left the index incremented by - // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative - // LESS input is available, to have the effect of just progressing the start position by 1 - // from the last iteration. If this check passes we can just jump up to the check associated - // with the first alternative in the loop. This is a bit sad, since we'll end up trying the - // first alternative again, and this check will fail (otherwise the check planted just above - // here would have passed). This is a bit sad, however it saves trying to do something more - // complex here in compilation, and in the common case we should end up coallescing the checks. - // - // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least - // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150, - // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there - // is sufficient input to run either alternative (constantly failing). If there had been only - // one alternative, or if the shorter alternative had come first, we would have terminated - // immediately. :-/ - if (hasShorterAlternatives) - jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); - // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, - // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... - // but since we're about to return a failure this doesn't really matter!) - } - - if (m_pattern.m_body->m_callFrameSize) - addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - - move(Imm32(-1), returnRegister); - - generateReturn(); - - m_expressionState.emitParenthesesTail(this); - m_expressionState.emitIndirectJumpTable(this); - m_expressionState.linkToNextIteration(this); - } - - void generateEnter() - { -#if CPU(X86_64) - push(X86Registers::ebp); - move(stackPointerRegister, X86Registers::ebp); - push(X86Registers::ebx); -#elif CPU(X86) - push(X86Registers::ebp); - move(stackPointerRegister, X86Registers::ebp); - // TODO: do we need spill registers to fill the output pointer if there are no sub captures? - push(X86Registers::ebx); - push(X86Registers::edi); - push(X86Registers::esi); - // load output into edi (2 = saved ebp + return address). - #if COMPILER(MSVC) - loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); - loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); - loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); - loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); - #else - loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); - #endif -#elif CPU(ARM) - push(ARMRegisters::r4); - push(ARMRegisters::r5); - push(ARMRegisters::r6); -#if CPU(ARM_TRADITIONAL) - push(ARMRegisters::r8); // scratch register -#endif - move(ARMRegisters::r3, output); -#elif CPU(MIPS) - // Do nothing. -#endif - } - - void generateReturn() - { -#if CPU(X86_64) - pop(X86Registers::ebx); - pop(X86Registers::ebp); -#elif CPU(X86) - pop(X86Registers::esi); - pop(X86Registers::edi); - pop(X86Registers::ebx); - pop(X86Registers::ebp); -#elif CPU(ARM) -#if CPU(ARM_TRADITIONAL) - pop(ARMRegisters::r8); // scratch register -#endif - pop(ARMRegisters::r6); - pop(ARMRegisters::r5); - pop(ARMRegisters::r4); -#elif CPU(MIPS) - // Do nothing -#endif - ret(); - } - -public: - RegexGenerator(RegexPattern& pattern) - : m_pattern(pattern) - , m_shouldFallBack(false) - { - } - - void generate() - { - generateEnter(); - - if (!m_pattern.m_body->m_hasFixedSize) - store32(index, Address(output)); - - if (m_pattern.m_body->m_callFrameSize) - subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - - generateDisjunction(m_pattern.m_body); - } - - void compile(JSGlobalData* globalData, RegexCodeBlock& jitObject) - { - generate(); - - LinkBuffer patchBuffer(this, globalData->regexAllocator.poolForSize(size()), 0); - - for (unsigned i = 0; i < m_expressionState.m_backtrackRecords.size(); ++i) - patchBuffer.patch(m_expressionState.m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_expressionState.m_backtrackRecords[i].backtrackLocation)); - - jitObject.set(patchBuffer.finalizeCode()); - jitObject.setFallBack(m_shouldFallBack); - } - -private: - RegexPattern& m_pattern; - bool m_shouldFallBack; - GenerationState m_expressionState; -}; - -void jitCompileRegex(RegexPattern& pattern, JSGlobalData* globalData, RegexCodeBlock& jitObject) -{ - RegexGenerator generator(pattern); - generator.compile(globalData, jitObject); -} - - -}} - -#endif diff --git a/Source/JavaScriptCore/yarr/RegexJIT.h b/Source/JavaScriptCore/yarr/RegexJIT.h deleted file mode 100644 index 5e3dca1..0000000 --- a/Source/JavaScriptCore/yarr/RegexJIT.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef RegexJIT_h -#define RegexJIT_h - -#if ENABLE(YARR_JIT) - -#include "MacroAssembler.h" -#include "RegexPattern.h" -#include "UString.h" - -#if CPU(X86) && !COMPILER(MSVC) -#define YARR_CALL __attribute__ ((regparm (3))) -#else -#define YARR_CALL -#endif - -namespace JSC { - -class JSGlobalData; -class ExecutablePool; - -namespace Yarr { - -class RegexCodeBlock { - typedef int (*RegexJITCode)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; - -public: - RegexCodeBlock() - : m_needFallBack(false) - { - } - - ~RegexCodeBlock() - { - } - - void setFallBack(bool fallback) { m_needFallBack = fallback; } - bool isFallBack() { return m_needFallBack; } - void set(MacroAssembler::CodeRef ref) { m_ref = ref; } - - int execute(const UChar* input, unsigned start, unsigned length, int* output) - { - return reinterpret_cast(m_ref.m_code.executableAddress())(input, start, length, output); - } - -#if ENABLE(REGEXP_TRACING) - void *getAddr() { return m_ref.m_code.executableAddress(); } -#endif - -private: - MacroAssembler::CodeRef m_ref; - bool m_needFallBack; -}; - -void jitCompileRegex(RegexPattern& pattern, JSGlobalData* globalData, RegexCodeBlock& jitObject); - -inline int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) -{ - return jitObject.execute(input, start, length, output); -} - -} } // namespace JSC::Yarr - -#endif - -#endif // RegexJIT_h diff --git a/Source/JavaScriptCore/yarr/RegexParser.h b/Source/JavaScriptCore/yarr/RegexParser.h deleted file mode 100644 index ec5f589..0000000 --- a/Source/JavaScriptCore/yarr/RegexParser.h +++ /dev/null @@ -1,887 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef RegexParser_h -#define RegexParser_h - -#include "UString.h" -#include -#include -#include - -namespace JSC { namespace Yarr { - -static const unsigned quantifyInfinite = UINT_MAX; - -enum BuiltInCharacterClassID { - DigitClassID, - SpaceClassID, - WordClassID, - NewlineClassID, -}; - -// The Parser class should not be used directly - only via the Yarr::parse() method. -template -class Parser { -private: - template - friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; - - /* - * CharacterClassParserDelegate: - * - * The class CharacterClassParserDelegate is used in the parsing of character - * classes. This class handles detection of character ranges. This class - * implements enough of the delegate interface such that it can be passed to - * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused - * to perform the parsing of escape characters in character sets. - */ - class CharacterClassParserDelegate { - public: - CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) - : m_delegate(delegate) - , m_err(err) - , m_state(Empty) - { - } - - /* - * begin(): - * - * Called at beginning of construction. - */ - void begin(bool invert) - { - m_delegate.atomCharacterClassBegin(invert); - } - - /* - * atomPatternCharacter(): - * - * This method is called either from parseCharacterClass() (for an unescaped - * character in a character class), or from parseEscape(). In the former case - * the value true will be passed for the argument 'hyphenIsRange', and in this - * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ - * is different to /[a\-z]/). - */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) - { - switch (m_state) { - case AfterCharacterClass: - // Following a builtin character class we need look out for a hyphen. - // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. - // If we see a hyphen following a charater class then unlike usual - // we'll report it to the delegate immediately, and put ourself into - // a poisoned state. Any following calls to add another character or - // character class will result in an error. (A hypen following a - // character-class is itself valid, but only at the end of a regex). - if (hyphenIsRange && ch == '-') { - m_delegate.atomCharacterClassAtom('-'); - m_state = AfterCharacterClassHyphen; - return; - } - // Otherwise just fall through - cached character so treat this as Empty. - - case Empty: - m_character = ch; - m_state = CachedCharacter; - return; - - case CachedCharacter: - if (hyphenIsRange && ch == '-') - m_state = CachedCharacterHyphen; - else { - m_delegate.atomCharacterClassAtom(m_character); - m_character = ch; - } - return; - - case CachedCharacterHyphen: - if (ch < m_character) { - m_err = CharacterClassOutOfOrder; - return; - } - m_delegate.atomCharacterClassRange(m_character, ch); - m_state = Empty; - return; - - // See coment in atomBuiltInCharacterClass below. - // This too is technically an error, per ECMA-262, and again we - // we chose to allow this. Note a subtlely here that while we - // diverge from the spec's definition of CharacterRange we do - // remain in compliance with the grammar. For example, consider - // the expression /[\d-a-z]/. We comply with the grammar in - // this case by not allowing a-z to be matched as a range. - case AfterCharacterClassHyphen: - m_delegate.atomCharacterClassAtom(ch); - m_state = Empty; - return; - } - } - - /* - * atomBuiltInCharacterClass(): - * - * Adds a built-in character class, called by parseEscape(). - */ - void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) - { - switch (m_state) { - case CachedCharacter: - // Flush the currently cached character, then fall through. - m_delegate.atomCharacterClassAtom(m_character); - - case Empty: - case AfterCharacterClass: - m_state = AfterCharacterClass; - m_delegate.atomCharacterClassBuiltIn(classID, invert); - return; - - // If we hit either of these cases, we have an invalid range that - // looks something like /[x-\d]/ or /[\d-\d]/. - // According to ECMA-262 this should be a syntax error, but - // empirical testing shows this to break teh webz. Instead we - // comply with to the ECMA-262 grammar, and assume the grammar to - // have matched the range correctly, but tweak our interpretation - // of CharacterRange. Effectively we implicitly handle the hyphen - // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. - case CachedCharacterHyphen: - m_delegate.atomCharacterClassAtom(m_character); - m_delegate.atomCharacterClassAtom('-'); - // fall through - case AfterCharacterClassHyphen: - m_delegate.atomCharacterClassBuiltIn(classID, invert); - m_state = Empty; - return; - } - } - - /* - * end(): - * - * Called at end of construction. - */ - void end() - { - if (m_state == CachedCharacter) - m_delegate.atomCharacterClassAtom(m_character); - else if (m_state == CachedCharacterHyphen) { - m_delegate.atomCharacterClassAtom(m_character); - m_delegate.atomCharacterClassAtom('-'); - } - m_delegate.atomCharacterClassEnd(); - } - - // parseEscape() should never call these delegate methods when - // invoked with inCharacterClass set. - void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } - void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } - - private: - Delegate& m_delegate; - ErrorCode& m_err; - enum CharacterClassConstructionState { - Empty, - CachedCharacter, - CachedCharacterHyphen, - AfterCharacterClass, - AfterCharacterClassHyphen, - } m_state; - UChar m_character; - }; - - Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) - : m_delegate(delegate) - , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.characters()) - , m_size(pattern.length()) - , m_index(0) - , m_parenthesesNestingDepth(0) - { - } - - /* - * parseEscape(): - * - * Helper for parseTokens() AND parseCharacterClass(). - * Unlike the other parser methods, this function does not report tokens - * directly to the member delegate (m_delegate), instead tokens are - * emitted to the delegate provided as an argument. In the case of atom - * escapes, parseTokens() will call parseEscape() passing m_delegate as - * an argument, and as such the escape will be reported to the delegate. - * - * However this method may also be used by parseCharacterClass(), in which - * case a CharacterClassParserDelegate will be passed as the delegate that - * tokens should be added to. A boolean flag is also provided to indicate - * whether that an escape in a CharacterClass is being parsed (some parsing - * rules change in this context). - * - * The boolean value returned by this method indicates whether the token - * parsed was an atom (outside of a characted class \b and \B will be - * interpreted as assertions). - */ - template - bool parseEscape(EscapeDelegate& delegate) - { - ASSERT(!m_err); - ASSERT(peek() == '\\'); - consume(); - - if (atEndOfPattern()) { - m_err = EscapeUnterminated; - return false; - } - - switch (peek()) { - // Assertions - case 'b': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('\b'); - else { - delegate.assertionWordBoundary(false); - return false; - } - break; - case 'B': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('B'); - else { - delegate.assertionWordBoundary(true); - return false; - } - break; - - // CharacterClassEscape - case 'd': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, false); - break; - case 's': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, false); - break; - case 'w': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, false); - break; - case 'D': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, true); - break; - case 'S': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, true); - break; - case 'W': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, true); - break; - - // DecimalEscape - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. - // First, try to parse this as backreference. - if (!inCharacterClass) { - ParseState state = saveState(); - - unsigned backReference = consumeNumber(); - if (backReference <= m_backReferenceLimit) { - delegate.atomBackReference(backReference); - break; - } - - restoreState(state); - } - - // Not a backreference, and not octal. - if (peek() >= '8') { - delegate.atomPatternCharacter('\\'); - break; - } - - // Fall-through to handle this as an octal escape. - } - - // Octal escape - case '0': - delegate.atomPatternCharacter(consumeOctal()); - break; - - // ControlEscape - case 'f': - consume(); - delegate.atomPatternCharacter('\f'); - break; - case 'n': - consume(); - delegate.atomPatternCharacter('\n'); - break; - case 'r': - consume(); - delegate.atomPatternCharacter('\r'); - break; - case 't': - consume(); - delegate.atomPatternCharacter('\t'); - break; - case 'v': - consume(); - delegate.atomPatternCharacter('\v'); - break; - - // ControlLetter - case 'c': { - ParseState state = saveState(); - consume(); - if (!atEndOfPattern()) { - int control = consume(); - - // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. - if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { - delegate.atomPatternCharacter(control & 0x1f); - break; - } - } - restoreState(state); - delegate.atomPatternCharacter('\\'); - break; - } - - // HexEscape - case 'x': { - consume(); - int x = tryConsumeHex(2); - if (x == -1) - delegate.atomPatternCharacter('x'); - else - delegate.atomPatternCharacter(x); - break; - } - - // UnicodeEscape - case 'u': { - consume(); - int u = tryConsumeHex(4); - if (u == -1) - delegate.atomPatternCharacter('u'); - else - delegate.atomPatternCharacter(u); - break; - } - - // IdentityEscape - default: - delegate.atomPatternCharacter(consume()); - } - - return true; - } - - /* - * parseAtomEscape(), parseCharacterClassEscape(): - * - * These methods alias to parseEscape(). - */ - bool parseAtomEscape() - { - return parseEscape(m_delegate); - } - void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) - { - parseEscape(delegate); - } - - /* - * parseCharacterClass(): - * - * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) - * to an instance of CharacterClassParserDelegate, to describe the character class to the - * delegate. - */ - void parseCharacterClass() - { - ASSERT(!m_err); - ASSERT(peek() == '['); - consume(); - - CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); - - characterClassConstructor.begin(tryConsume('^')); - - while (!atEndOfPattern()) { - switch (peek()) { - case ']': - consume(); - characterClassConstructor.end(); - return; - - case '\\': - parseCharacterClassEscape(characterClassConstructor); - break; - - default: - characterClassConstructor.atomPatternCharacter(consume(), true); - } - - if (m_err) - return; - } - - m_err = CharacterClassUnmatched; - } - - /* - * parseParenthesesBegin(): - * - * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. - */ - void parseParenthesesBegin() - { - ASSERT(!m_err); - ASSERT(peek() == '('); - consume(); - - if (tryConsume('?')) { - if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; - return; - } - - switch (consume()) { - case ':': - m_delegate.atomParenthesesSubpatternBegin(false); - break; - - case '=': - m_delegate.atomParentheticalAssertionBegin(); - break; - - case '!': - m_delegate.atomParentheticalAssertionBegin(true); - break; - - default: - m_err = ParenthesesTypeInvalid; - } - } else - m_delegate.atomParenthesesSubpatternBegin(); - - ++m_parenthesesNestingDepth; - } - - /* - * parseParenthesesEnd(): - * - * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). - */ - void parseParenthesesEnd() - { - ASSERT(!m_err); - ASSERT(peek() == ')'); - consume(); - - if (m_parenthesesNestingDepth > 0) - m_delegate.atomParenthesesEnd(); - else - m_err = ParenthesesUnmatched; - - --m_parenthesesNestingDepth; - } - - /* - * parseQuantifier(): - * - * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. - */ - void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) - { - ASSERT(!m_err); - ASSERT(min <= max); - - if (lastTokenWasAnAtom) - m_delegate.quantifyAtom(min, max, !tryConsume('?')); - else - m_err = QuantifierWithoutAtom; - } - - /* - * parseTokens(): - * - * This method loops over the input pattern reporting tokens to the delegate. - * The method returns when a parse error is detected, or the end of the pattern - * is reached. One piece of state is tracked around the loop, which is whether - * the last token passed to the delegate was an atom (this is necessary to detect - * a parse error when a quantifier provided without an atom to quantify). - */ - void parseTokens() - { - bool lastTokenWasAnAtom = false; - - while (!atEndOfPattern()) { - switch (peek()) { - case '|': - consume(); - m_delegate.disjunction(); - lastTokenWasAnAtom = false; - break; - - case '(': - parseParenthesesBegin(); - lastTokenWasAnAtom = false; - break; - - case ')': - parseParenthesesEnd(); - lastTokenWasAnAtom = true; - break; - - case '^': - consume(); - m_delegate.assertionBOL(); - lastTokenWasAnAtom = false; - break; - - case '$': - consume(); - m_delegate.assertionEOL(); - lastTokenWasAnAtom = false; - break; - - case '.': - consume(); - m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); - lastTokenWasAnAtom = true; - break; - - case '[': - parseCharacterClass(); - lastTokenWasAnAtom = true; - break; - - case '\\': - lastTokenWasAnAtom = parseAtomEscape(); - break; - - case '*': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); - lastTokenWasAnAtom = false; - break; - - case '+': - consume(); - parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); - lastTokenWasAnAtom = false; - break; - - case '?': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, 1); - lastTokenWasAnAtom = false; - break; - - case '{': { - ParseState state = saveState(); - - consume(); - if (peekIsDigit()) { - unsigned min = consumeNumber(); - unsigned max = min; - - if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : quantifyInfinite; - - if (tryConsume('}')) { - if (min <= max) - parseQuantifier(lastTokenWasAnAtom, min, max); - else - m_err = QuantifierOutOfOrder; - lastTokenWasAnAtom = false; - break; - } - } - - restoreState(state); - } // if we did not find a complete quantifer, fall through to the default case. - - default: - m_delegate.atomPatternCharacter(consume()); - lastTokenWasAnAtom = true; - } - - if (m_err) - return; - } - - if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; - } - - /* - * parse(): - * - * This method calls regexBegin(), calls parseTokens() to parse over the input - * patterns, calls regexEnd() or regexError() as appropriate, and converts any - * error code to a const char* for a result. - */ - const char* parse() - { - m_delegate.regexBegin(); - - if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; - else - parseTokens(); - ASSERT(atEndOfPattern() || m_err); - - if (m_err) - m_delegate.regexError(); - else - m_delegate.regexEnd(); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - "regular expression too large", - "numbers out of order in {} quantifier", - "nothing to repeat", - "missing )", - "unmatched parentheses", - "unrecognized character after (?", - "missing terminating ] for character class", - "range out of order in character class", - "\\ at end of pattern" - }; - - return errorMessages[m_err]; - } - - - // Misc helper functions: - - typedef unsigned ParseState; - - ParseState saveState() - { - return m_index; - } - - void restoreState(ParseState state) - { - m_index = state; - } - - bool atEndOfPattern() - { - ASSERT(m_index <= m_size); - return m_index == m_size; - } - - int peek() - { - ASSERT(m_index < m_size); - return m_data[m_index]; - } - - bool peekIsDigit() - { - return !atEndOfPattern() && WTF::isASCIIDigit(peek()); - } - - unsigned peekDigit() - { - ASSERT(peekIsDigit()); - return peek() - '0'; - } - - int consume() - { - ASSERT(m_index < m_size); - return m_data[m_index++]; - } - - unsigned consumeDigit() - { - ASSERT(peekIsDigit()); - return consume() - '0'; - } - - unsigned consumeNumber() - { - unsigned n = consumeDigit(); - // check for overflow. - for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { - n = newValue; - consume(); - } - return n; - } - - unsigned consumeOctal() - { - ASSERT(WTF::isASCIIOctalDigit(peek())); - - unsigned n = consumeDigit(); - while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) - n = n * 8 + consumeDigit(); - return n; - } - - bool tryConsume(UChar ch) - { - if (atEndOfPattern() || (m_data[m_index] != ch)) - return false; - ++m_index; - return true; - } - - int tryConsumeHex(int count) - { - ParseState state = saveState(); - - int n = 0; - while (count--) { - if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { - restoreState(state); - return -1; - } - n = (n << 4) | WTF::toASCIIHexValue(consume()); - } - return n; - } - - Delegate& m_delegate; - unsigned m_backReferenceLimit; - ErrorCode m_err; - const UChar* m_data; - unsigned m_size; - unsigned m_index; - unsigned m_parenthesesNestingDepth; - - // Derived by empirical testing of compile time in PCRE and WREC. - static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; -}; - -/* - * Yarr::parse(): - * - * The parse method is passed a pattern to be parsed and a delegate upon which - * callbacks will be made to record the parsed tokens forming the regex. - * Yarr::parse() returns null on success, or a const C string providing an error - * message where a parse error occurs. - * - * The Delegate must implement the following interface: - * - * void assertionBOL(); - * void assertionEOL(); - * void assertionWordBoundary(bool invert); - * - * void atomPatternCharacter(UChar ch); - * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); - * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) - * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) - * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true); - * void atomParentheticalAssertionBegin(bool invert = false); - * void atomParenthesesEnd(); - * void atomBackReference(unsigned subpatternId); - * - * void quantifyAtom(unsigned min, unsigned max, bool greedy); - * - * void disjunction(); - * - * void regexBegin(); - * void regexEnd(); - * void regexError(); - * - * Before any call recording tokens are made, regexBegin() will be called on the - * delegate once. Once parsing is complete either regexEnd() or regexError() will - * be called, as appropriate. - * - * The regular expression is described by a sequence of assertion*() and atom*() - * callbacks to the delegate, describing the terms in the regular expression. - * Following an atom a quantifyAtom() call may occur to indicate that the previous - * atom should be quantified. In the case of atoms described across multiple - * calls (parentheses and character classes) the call to quantifyAtom() will come - * after the call to the atom*End() method, never after atom*Begin(). - * - * Character classes may either be described by a single call to - * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. - * In the latter case, ...Begin() will be called, followed by a sequence of - * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). - * - * Sequences of atoms and assertions are broken into alternatives via calls to - * disjunction(). Assertions, atoms, and disjunctions emitted between calls to - * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. - * atomParenthesesBegin() is passed a subpatternId. In the case of a regular - * capturing subpattern, this will be the subpatternId associated with these - * parentheses, and will also by definition be the lowest subpatternId of these - * parentheses and of any nested paretheses. The atomParenthesesEnd() method - * is passed the subpatternId of the last capturing subexpression nested within - * these paretheses. In the case of a capturing subpattern with no nested - * capturing subpatterns, the same subpatternId will be passed to the begin and - * end functions. In the case of non-capturing subpatterns the subpatternId - * passed to the begin method is also the first possible subpatternId that might - * be nested within these paretheses. If a set of non-capturing parentheses does - * not contain any capturing subpatterns, then the subpatternId passed to begin - * will be greater than the subpatternId passed to end. - */ - -template -const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) -{ - return Parser(delegate, pattern, backReferenceLimit).parse(); -} - -} } // namespace JSC::Yarr - -#endif // RegexParser_h diff --git a/Source/JavaScriptCore/yarr/RegexPattern.cpp b/Source/JavaScriptCore/yarr/RegexPattern.cpp deleted file mode 100644 index e737d0e..0000000 --- a/Source/JavaScriptCore/yarr/RegexPattern.cpp +++ /dev/null @@ -1,991 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" - -#include "RegexInterpreter.h" -#include "RegexPattern.h" -#include - -using namespace WTF; - -namespace JSC { namespace Yarr { - -#include "RegExpJitTables.h" - -class CharacterClassConstructor { -public: - CharacterClassConstructor(bool isCaseInsensitive = false) - : m_isCaseInsensitive(isCaseInsensitive) - { - } - - void reset() - { - m_matches.clear(); - m_ranges.clear(); - m_matchesUnicode.clear(); - m_rangesUnicode.clear(); - } - - void append(const CharacterClass* other) - { - for (size_t i = 0; i < other->m_matches.size(); ++i) - addSorted(m_matches, other->m_matches[i]); - for (size_t i = 0; i < other->m_ranges.size(); ++i) - addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); - for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) - addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); - for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) - addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); - } - - void putChar(UChar ch) - { - if (ch <= 0x7f) { - if (m_isCaseInsensitive && isASCIIAlpha(ch)) { - addSorted(m_matches, toASCIIUpper(ch)); - addSorted(m_matches, toASCIILower(ch)); - } else - addSorted(m_matches, ch); - } else { - UChar upper, lower; - if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { - addSorted(m_matchesUnicode, upper); - addSorted(m_matchesUnicode, lower); - } else - addSorted(m_matchesUnicode, ch); - } - } - - // returns true if this character has another case, and 'ch' is the upper case form. - static inline bool isUnicodeUpper(UChar ch) - { - return ch != Unicode::toLower(ch); - } - - // returns true if this character has another case, and 'ch' is the lower case form. - static inline bool isUnicodeLower(UChar ch) - { - return ch != Unicode::toUpper(ch); - } - - void putRange(UChar lo, UChar hi) - { - if (lo <= 0x7f) { - char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); - addSortedRange(m_ranges, lo, asciiHi); - - if (m_isCaseInsensitive) { - if ((asciiLo <= 'Z') && (asciiHi >= 'A')) - addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); - if ((asciiLo <= 'z') && (asciiHi >= 'a')) - addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); - } - } - if (hi >= 0x80) { - uint32_t unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - while (unicodeCurr <= hi) { - // If the upper bound of the range (hi) is 0xffff, the increments to - // unicodeCurr in this loop may take it to 0x10000. This is fine - // (if so we won't re-enter the loop, since the loop condition above - // will definitely fail) - but this does mean we cannot use a UChar - // to represent unicodeCurr, we must use a 32-bit value instead. - ASSERT(unicodeCurr <= 0xffff); - - if (isUnicodeUpper(unicodeCurr)) { - UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); - UChar lowerCaseRangeEnd = lowerCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) - lowerCaseRangeEnd++; - addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); - } else if (isUnicodeLower(unicodeCurr)) { - UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); - UChar upperCaseRangeEnd = upperCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) - upperCaseRangeEnd++; - addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); - } else - ++unicodeCurr; - } - } - } - } - - CharacterClass* charClass() - { - CharacterClass* characterClass = new CharacterClass(0); - - characterClass->m_matches.append(m_matches); - characterClass->m_ranges.append(m_ranges); - characterClass->m_matchesUnicode.append(m_matchesUnicode); - characterClass->m_rangesUnicode.append(m_rangesUnicode); - - reset(); - - return characterClass; - } - -private: - void addSorted(Vector& matches, UChar ch) - { - unsigned pos = 0; - unsigned range = matches.size(); - - // binary chop, find position to insert char. - while (range) { - unsigned index = range >> 1; - - int val = matches[pos+index] - ch; - if (!val) - return; - else if (val > 0) - range = index; - else { - pos += (index+1); - range -= (index+1); - } - } - - if (pos == matches.size()) - matches.append(ch); - else - matches.insert(pos, ch); - } - - void addSortedRange(Vector& ranges, UChar lo, UChar hi) - { - unsigned end = ranges.size(); - - // Simple linear scan - I doubt there are that many ranges anyway... - // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { - // does the new range fall before the current position in the array - if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. - if (hi == (ranges[i].begin - 1)) { - ranges[i].begin = lo; - return; - } - ranges.insert(i, CharacterRange(lo, hi)); - return; - } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining - // If the new range start at or before the end of the last range, then the overlap (if it starts one after the - // end of the last range they concatenate, which is just as good. - if (lo <= (ranges[i].end + 1)) { - // found an intersect! we'll replace this entry in the array. - ranges[i].begin = std::min(ranges[i].begin, lo); - ranges[i].end = std::max(ranges[i].end, hi); - - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - - return; - } - } - - // CharacterRange comes after all existing ranges. - ranges.append(CharacterRange(lo, hi)); - } - - bool m_isCaseInsensitive; - - Vector m_matches; - Vector m_ranges; - Vector m_matchesUnicode; - Vector m_rangesUnicode; -}; - -struct BeginCharHelper { - BeginCharHelper(Vector* beginChars, bool isCaseInsensitive = false) - : m_beginChars(beginChars) - , m_isCaseInsensitive(isCaseInsensitive) - {} - - void addBeginChar(BeginChar beginChar, Vector* hotTerms, QuantifierType quantityType, unsigned quantityCount) - { - if (quantityType == QuantifierFixedCount && quantityCount > 1) { - // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ - beginChar.value |= beginChar.value << 16; - beginChar.mask |= beginChar.mask << 16; - addCharacter(beginChar); - } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) - // In case of characters with fixed quantifier we should check the next character as well. - linkHotTerms(beginChar, hotTerms); - else - // In case of greedy matching the next character checking is unnecessary therefore we just store - // the first character. - addCharacter(beginChar); - } - - // Merge two following BeginChars in the vector to reduce the number of character checks. - void merge(unsigned size) - { - for (unsigned i = 0; i < size; i++) { - BeginChar* curr = &m_beginChars->at(i); - BeginChar* next = &m_beginChars->at(i + 1); - - // If the current and the next size of value is different we should skip the merge process - // because the 16bit and 32bit values are unmergable. - if (curr->value <= 0xFFFF && next->value > 0xFFFF) - continue; - - unsigned diff = curr->value ^ next->value; - - curr->mask |= diff; - curr->value |= curr->mask; - - m_beginChars->remove(i + 1); - size--; - } - } - -private: - void addCharacter(BeginChar beginChar) - { - unsigned pos = 0; - unsigned range = m_beginChars->size(); - - // binary chop, find position to insert char. - while (range) { - unsigned index = range >> 1; - - int val = m_beginChars->at(pos+index).value - beginChar.value; - if (!val) - return; - if (val < 0) - range = index; - else { - pos += (index+1); - range -= (index+1); - } - } - - if (pos == m_beginChars->size()) - m_beginChars->append(beginChar); - else - m_beginChars->insert(pos, beginChar); - } - - // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. - void linkHotTerms(BeginChar beginChar, Vector* hotTerms) - { - for (unsigned i = 0; i < hotTerms->size(); i++) { - PatternTerm hotTerm = hotTerms->at(i).term; - ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); - - UChar characterNext = hotTerm.patternCharacter; - - // Append a character to an existing BeginChar object. - if (characterNext <= 0x7f) { - unsigned mask = 0; - - if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { - mask = 32; - characterNext = toASCIILower(characterNext); - } - - addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); - } else { - UChar upper, lower; - if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { - addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); - addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); - } else - addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); - } - } - } - - Vector* m_beginChars; - bool m_isCaseInsensitive; -}; - -class RegexPatternConstructor { -public: - RegexPatternConstructor(RegexPattern& pattern) - : m_pattern(pattern) - , m_characterClassConstructor(pattern.m_ignoreCase) - , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) - , m_invertParentheticalAssertion(false) - { - } - - ~RegexPatternConstructor() - { - } - - void reset() - { - m_pattern.reset(); - m_characterClassConstructor.reset(); - } - - void assertionBOL() - { - if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { - m_alternative->m_startsWithBOL = true; - m_alternative->m_containsBOL = true; - m_pattern.m_containsBOL = true; - } - m_alternative->m_terms.append(PatternTerm::BOL()); - } - void assertionEOL() - { - m_alternative->m_terms.append(PatternTerm::EOL()); - } - void assertionWordBoundary(bool invert) - { - m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); - } - - void atomPatternCharacter(UChar ch) - { - // We handle case-insensitive checking of unicode characters which do have both - // cases by handling them as if they were defined using a CharacterClass. - if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { - atomCharacterClassBegin(); - atomCharacterClassAtom(ch); - atomCharacterClassEnd(); - } else - m_alternative->m_terms.append(PatternTerm(ch)); - } - - void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) - { - switch (classID) { - case DigitClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); - break; - case SpaceClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); - break; - case WordClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); - break; - case NewlineClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); - break; - } - } - - void atomCharacterClassBegin(bool invert = false) - { - m_invertCharacterClass = invert; - } - - void atomCharacterClassAtom(UChar ch) - { - m_characterClassConstructor.putChar(ch); - } - - void atomCharacterClassRange(UChar begin, UChar end) - { - m_characterClassConstructor.putRange(begin, end); - } - - void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) - { - ASSERT(classID != NewlineClassID); - - switch (classID) { - case DigitClassID: - m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); - break; - - case SpaceClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); - break; - - case WordClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); - break; - - default: - ASSERT_NOT_REACHED(); - } - } - - void atomCharacterClassEnd() - { - CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); - m_pattern.m_userCharacterClasses.append(newCharacterClass); - m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); - } - - void atomParenthesesSubpatternBegin(bool capture = true) - { - unsigned subpatternId = m_pattern.m_numSubpatterns + 1; - if (capture) - m_pattern.m_numSubpatterns++; - - PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); - m_pattern.m_disjunctions.append(parenthesesDisjunction); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); - m_alternative = parenthesesDisjunction->addNewAlternative(); - } - - void atomParentheticalAssertionBegin(bool invert = false) - { - PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); - m_pattern.m_disjunctions.append(parenthesesDisjunction); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); - m_alternative = parenthesesDisjunction->addNewAlternative(); - m_invertParentheticalAssertion = invert; - } - - void atomParenthesesEnd() - { - ASSERT(m_alternative->m_parent); - ASSERT(m_alternative->m_parent->m_parent); - - PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; - m_alternative = m_alternative->m_parent->m_parent; - - PatternTerm& lastTerm = m_alternative->lastTerm(); - - unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); - unsigned numBOLAnchoredAlts = 0; - // Bubble up BOL flags - for (unsigned i = 0; i < numParenAlternatives; i++) { - if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) - numBOLAnchoredAlts++; - } - - if (numBOLAnchoredAlts) { - m_alternative->m_containsBOL = true; - // If all the alternatives in parens start with BOL, then so does this one - if (numBOLAnchoredAlts == numParenAlternatives) - m_alternative->m_startsWithBOL = true; - } - - lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; - m_invertParentheticalAssertion = false; - } - - void atomBackReference(unsigned subpatternId) - { - ASSERT(subpatternId); - m_pattern.m_containsBackreferences = true; - m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); - - if (subpatternId > m_pattern.m_numSubpatterns) { - m_alternative->m_terms.append(PatternTerm::ForwardReference()); - return; - } - - PatternAlternative* currentAlternative = m_alternative; - ASSERT(currentAlternative); - - // Note to self: if we waited until the AST was baked, we could also remove forwards refs - while ((currentAlternative = currentAlternative->m_parent->m_parent)) { - PatternTerm& term = currentAlternative->lastTerm(); - ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); - - if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { - m_alternative->m_terms.append(PatternTerm::ForwardReference()); - return; - } - } - - m_alternative->m_terms.append(PatternTerm(subpatternId)); - } - - // deep copy the argument disjunction. If filterStartsWithBOL is true, - // skip alternatives with m_startsWithBOL set true. - PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) - { - PatternDisjunction* newDisjunction = 0; - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt]; - if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { - if (!newDisjunction) { - newDisjunction = new PatternDisjunction(); - newDisjunction->m_parent = disjunction->m_parent; - } - PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) - newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); - } - } - - if (newDisjunction) - m_pattern.m_disjunctions.append(newDisjunction); - return newDisjunction; - } - - PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) - { - if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) - return PatternTerm(term); - - PatternTerm termCopy = term; - termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); - return termCopy; - } - - void quantifyAtom(unsigned min, unsigned max, bool greedy) - { - ASSERT(min <= max); - ASSERT(m_alternative->m_terms.size()); - - if (!max) { - m_alternative->removeLastTerm(); - return; - } - - PatternTerm& term = m_alternative->lastTerm(); - ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); - ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); - - // For any assertion with a zero minimum, not matching is valid and has no effect, - // remove it. Otherwise, we need to match as least once, but there is no point - // matching more than once, so remove the quantifier. It is not entirely clear - // from the spec whether or not this behavior is correct, but I believe this - // matches Firefox. :-/ - if (term.type == PatternTerm::TypeParentheticalAssertion) { - if (!min) - m_alternative->removeLastTerm(); - return; - } - - if (min == 0) - term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); - else if (min == max) - term.quantify(min, QuantifierFixedCount); - else { - term.quantify(min, QuantifierFixedCount); - m_alternative->m_terms.append(copyTerm(term)); - // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... - m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); - if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) - m_alternative->lastTerm().parentheses.isCopy = true; - } - } - - void disjunction() - { - m_alternative = m_alternative->m_parent->addNewAlternative(); - } - - void regexBegin() - { - m_pattern.m_body = new PatternDisjunction(); - m_alternative = m_pattern.m_body->addNewAlternative(); - m_pattern.m_disjunctions.append(m_pattern.m_body); - } - void regexEnd() - { - } - void regexError() - { - } - - unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) - { - alternative->m_hasFixedSize = true; - unsigned currentInputPosition = initialInputPosition; - - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - case PatternTerm::TypeAssertionEOL: - case PatternTerm::TypeAssertionWordBoundary: - term.inputPosition = currentInputPosition; - break; - - case PatternTerm::TypeBackReference: - term.inputPosition = currentInputPosition; - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; - alternative->m_hasFixedSize = false; - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypePatternCharacter: - term.inputPosition = currentInputPosition; - if (term.quantityType != QuantifierFixedCount) { - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; - alternative->m_hasFixedSize = false; - } else - currentInputPosition += term.quantityCount; - break; - - case PatternTerm::TypeCharacterClass: - term.inputPosition = currentInputPosition; - if (term.quantityType != QuantifierFixedCount) { - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; - alternative->m_hasFixedSize = false; - } else - currentInputPosition += term.quantityCount; - break; - - case PatternTerm::TypeParenthesesSubpattern: - // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. - term.frameLocation = currentCallFrameSize; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - if (term.quantityType != QuantifierFixedCount) - currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - // If quantity is fixed, then pre-check its minimum size. - if (term.quantityType == QuantifierFixedCount) - currentInputPosition += term.parentheses.disjunction->m_minimumSize; - term.inputPosition = currentInputPosition; - } else if (term.parentheses.isTerminal) { - currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesTerminal; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); - term.inputPosition = currentInputPosition; - } else { - term.inputPosition = currentInputPosition; - setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); - currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; - } - // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. - alternative->m_hasFixedSize = false; - break; - - case PatternTerm::TypeParentheticalAssertion: - term.inputPosition = currentInputPosition; - term.frameLocation = currentCallFrameSize; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); - break; - } - } - - alternative->m_minimumSize = currentInputPosition - initialInputPosition; - return currentCallFrameSize; - } - - unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) - { - if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) - initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; - - unsigned minimumInputSize = UINT_MAX; - unsigned maximumCallFrameSize = 0; - bool hasFixedSize = true; - - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt]; - unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); - minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); - maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); - hasFixedSize &= alternative->m_hasFixedSize; - } - - ASSERT(minimumInputSize != UINT_MAX); - ASSERT(maximumCallFrameSize >= initialCallFrameSize); - - disjunction->m_hasFixedSize = hasFixedSize; - disjunction->m_minimumSize = minimumInputSize; - disjunction->m_callFrameSize = maximumCallFrameSize; - return maximumCallFrameSize; - } - - void setupOffsets() - { - setupDisjunctionOffsets(m_pattern.m_body, 0, 0); - } - - // This optimization identifies sets of parentheses that we will never need to backtrack. - // In these cases we do not need to store state from prior iterations. - // We can presently avoid backtracking for: - // * where the parens are at the end of the regular expression (last term in any of the - // alternatives of the main body disjunction). - // * where the parens are non-capturing, and quantified unbounded greedy (*). - // * where the parens do not contain any capturing subpatterns. - void checkForTerminalParentheses() - { - // This check is much too crude; should be just checking whether the candidate - // node contains nested capturing subpatterns, not the whole expression! - if (m_pattern.m_numSubpatterns) - return; - - Vector& alternatives = m_pattern.m_body->m_alternatives; - for (size_t i = 0; i < alternatives.size(); ++i) { - Vector& terms = alternatives[i]->m_terms; - if (terms.size()) { - PatternTerm& term = terms.last(); - if (term.type == PatternTerm::TypeParenthesesSubpattern - && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite - && !term.capture()) - term.parentheses.isTerminal = true; - } - } - } - - void optimizeBOL() - { - // Look for expressions containing beginning of line (^) anchoring and unroll them. - // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops - // This code relies on the parsing code tagging alternatives with m_containsBOL and - // m_startsWithBOL and rolling those up to containing alternatives. - // At this point, this is only valid for non-multiline expressions. - PatternDisjunction* disjunction = m_pattern.m_body; - - if (!m_pattern.m_containsBOL || m_pattern.m_multiline) - return; - - PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); - - // Set alternatives in disjunction to "onceThrough" - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) - disjunction->m_alternatives[alt]->setOnceThrough(); - - if (loopDisjunction) { - // Move alternatives from loopDisjunction to disjunction - for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) - disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); - - loopDisjunction->m_alternatives.clear(); - } - } - - bool addBeginTerm(PatternTerm term, Vector* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) - { - if (term.quantityType == QuantifierFixedCount) { - beginTerms->append(TermChain(term)); - if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) - setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); - } else if (termIndex != numTerms - 1) { - beginTerms->append(TermChain(term)); - return true; - } - - return false; - } - - // This function collects the terms which are potentially matching the first number of depth characters in the result. - // If this function returns false then it found at least one term which makes the beginning character - // look-up optimization inefficient. - bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector* beginTerms, unsigned depth) - { - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt]; - - if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) - return false; - } - - return true; - } - - bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector* beginTerms, unsigned termIndex, unsigned depth) - { - bool checkNext = true; - unsigned numTerms = alternative->m_terms.size(); - - while (checkNext && termIndex < numTerms) { - PatternTerm term = alternative->m_terms[termIndex]; - checkNext = false; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - case PatternTerm::TypeAssertionEOL: - case PatternTerm::TypeAssertionWordBoundary: - return false; - - case PatternTerm::TypeBackReference: - case PatternTerm::TypeForwardReference: - return false; - - case PatternTerm::TypePatternCharacter: - if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { - termIndex++; - checkNext = true; - } - break; - - case PatternTerm::TypeCharacterClass: - return false; - - case PatternTerm::TypeParentheticalAssertion: - if (term.invert()) - return false; - - case PatternTerm::TypeParenthesesSubpattern: - if (term.quantityType != QuantifierFixedCount) { - if (termIndex == numTerms - 1) - break; - - termIndex++; - checkNext = true; - - } - - if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) - return false; - - break; - } - } - - return true; - } - - void setupBeginChars() - { - Vector beginTerms; - bool containsFixedCharacter = false; - - if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) - && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { - unsigned size = beginTerms.size(); - - // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. - if (!size) - return; - - m_pattern.m_containsBeginChars = true; - - for (unsigned i = 0; i < size; i++) { - PatternTerm term = beginTerms[i].term; - - // We have just collected PatternCharacter terms, other terms are not allowed. - ASSERT(term.type == PatternTerm::TypePatternCharacter); - - if (term.quantityType == QuantifierFixedCount) - containsFixedCharacter = true; - - UChar character = term.patternCharacter; - unsigned mask = 0; - - if (character <= 0x7f) { - if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { - mask = 32; - character = toASCIILower(character); - } - - m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); - } else { - UChar upper, lower; - if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { - m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); - m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); - } else - m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); - } - } - - // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. - if (!containsFixedCharacter) { - m_pattern.m_containsBeginChars = false; - return; - } - - size = m_pattern.m_beginChars.size(); - - if (size > 2) - m_beginCharHelper.merge(size - 1); - else if (size <= 1) - m_pattern.m_containsBeginChars = false; - } - } - -private: - RegexPattern& m_pattern; - PatternAlternative* m_alternative; - CharacterClassConstructor m_characterClassConstructor; - BeginCharHelper m_beginCharHelper; - bool m_invertCharacterClass; - bool m_invertParentheticalAssertion; -}; - - -static const char* compileRegex(const UString& patternString, RegexPattern& pattern) -{ - RegexPatternConstructor constructor(pattern); - - if (const char* error = parse(constructor, patternString)) - return error; - - // If the pattern contains illegal backreferences reset & reparse. - // Quoting Netscape's "What's new in JavaScript 1.2", - // "Note: if the number of left parentheses is less than the number specified - // in \#, the \# is taken as an octal escape as described in the next row." - if (pattern.containsIllegalBackReference()) { - unsigned numSubpatterns = pattern.m_numSubpatterns; - - constructor.reset(); -#if !ASSERT_DISABLED - const char* error = -#endif - parse(constructor, patternString, numSubpatterns); - - ASSERT(!error); - ASSERT(numSubpatterns == pattern.m_numSubpatterns); - } - - constructor.checkForTerminalParentheses(); - constructor.optimizeBOL(); - - constructor.setupOffsets(); - constructor.setupBeginChars(); - - return 0; -}; - -RegexPattern::RegexPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_containsBackreferences(false) - , m_containsBeginChars(false) - , m_containsBOL(false) - , m_numSubpatterns(0) - , m_maxBackReference(0) - , newlineCached(0) - , digitsCached(0) - , spacesCached(0) - , wordcharCached(0) - , nondigitsCached(0) - , nonspacesCached(0) - , nonwordcharCached(0) -{ - *error = compileRegex(pattern, *this); -} - -} } diff --git a/Source/JavaScriptCore/yarr/RegexPattern.h b/Source/JavaScriptCore/yarr/RegexPattern.h deleted file mode 100644 index 6833dd6..0000000 --- a/Source/JavaScriptCore/yarr/RegexPattern.h +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef RegexPattern_h -#define RegexPattern_h - -#include -#include - -#include - -namespace JSC { namespace Yarr { - -#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. -#define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. -#define RegexStackSpaceForBackTrackInfoBackReference 2 -#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. -#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 -#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. -#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 -#define RegexStackSpaceForBackTrackInfoParentheses 2 - -struct PatternDisjunction; - -struct CharacterRange { - UChar begin; - UChar end; - - CharacterRange(UChar begin, UChar end) - : begin(begin) - , end(end) - { - } -}; - -struct CharacterClassTable : RefCounted { - const char* m_table; - bool m_inverted; - static PassRefPtr create(const char* table, bool inverted) - { - return adoptRef(new CharacterClassTable(table, inverted)); - } - -private: - CharacterClassTable(const char* table, bool inverted) - : m_table(table) - , m_inverted(inverted) - { - } -}; - -struct CharacterClass : FastAllocBase { - // All CharacterClass instances have to have the full set of matches and ranges, - // they may have an optional table for faster lookups (which must match the - // specified matches and ranges) - CharacterClass(PassRefPtr table) - : m_table(table) - { - } - Vector m_matches; - Vector m_ranges; - Vector m_matchesUnicode; - Vector m_rangesUnicode; - RefPtr m_table; -}; - -enum QuantifierType { - QuantifierFixedCount, - QuantifierGreedy, - QuantifierNonGreedy, -}; - -struct PatternTerm { - enum Type { - TypeAssertionBOL, - TypeAssertionEOL, - TypeAssertionWordBoundary, - TypePatternCharacter, - TypeCharacterClass, - TypeBackReference, - TypeForwardReference, - TypeParenthesesSubpattern, - TypeParentheticalAssertion, - } type; - bool m_capture :1; - bool m_invert :1; - union { - UChar patternCharacter; - CharacterClass* characterClass; - unsigned backReferenceSubpatternId; - struct { - PatternDisjunction* disjunction; - unsigned subpatternId; - unsigned lastSubpatternId; - bool isCopy; - bool isTerminal; - } parentheses; - }; - QuantifierType quantityType; - unsigned quantityCount; - int inputPosition; - unsigned frameLocation; - - PatternTerm(UChar ch) - : type(PatternTerm::TypePatternCharacter) - , m_capture(false) - , m_invert(false) - { - patternCharacter = ch; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(CharacterClass* charClass, bool invert) - : type(PatternTerm::TypeCharacterClass) - , m_capture(false) - , m_invert(invert) - { - characterClass = charClass; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) - : type(type) - , m_capture(capture) - , m_invert(invert) - { - parentheses.disjunction = disjunction; - parentheses.subpatternId = subpatternId; - parentheses.isCopy = false; - parentheses.isTerminal = false; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(Type type, bool invert = false) - : type(type) - , m_capture(false) - , m_invert(invert) - { - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(unsigned spatternId) - : type(TypeBackReference) - , m_capture(false) - , m_invert(false) - { - backReferenceSubpatternId = spatternId; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - static PatternTerm ForwardReference() - { - return PatternTerm(TypeForwardReference); - } - - static PatternTerm BOL() - { - return PatternTerm(TypeAssertionBOL); - } - - static PatternTerm EOL() - { - return PatternTerm(TypeAssertionEOL); - } - - static PatternTerm WordBoundary(bool invert) - { - return PatternTerm(TypeAssertionWordBoundary, invert); - } - - bool invert() - { - return m_invert; - } - - bool capture() - { - return m_capture; - } - - void quantify(unsigned count, QuantifierType type) - { - quantityCount = count; - quantityType = type; - } -}; - -struct PatternAlternative : FastAllocBase { - PatternAlternative(PatternDisjunction* disjunction) - : m_parent(disjunction) - , m_onceThrough(false) - , m_hasFixedSize(false) - , m_startsWithBOL(false) - , m_containsBOL(false) - { - } - - PatternTerm& lastTerm() - { - ASSERT(m_terms.size()); - return m_terms[m_terms.size() - 1]; - } - - void removeLastTerm() - { - ASSERT(m_terms.size()); - m_terms.shrink(m_terms.size() - 1); - } - - void setOnceThrough() - { - m_onceThrough = true; - } - - bool onceThrough() - { - return m_onceThrough; - } - - Vector m_terms; - PatternDisjunction* m_parent; - unsigned m_minimumSize; - bool m_onceThrough : 1; - bool m_hasFixedSize : 1; - bool m_startsWithBOL : 1; - bool m_containsBOL : 1; -}; - -struct PatternDisjunction : FastAllocBase { - PatternDisjunction(PatternAlternative* parent = 0) - : m_parent(parent) - , m_hasFixedSize(false) - { - } - - ~PatternDisjunction() - { - deleteAllValues(m_alternatives); - } - - PatternAlternative* addNewAlternative() - { - PatternAlternative* alternative = new PatternAlternative(this); - m_alternatives.append(alternative); - return alternative; - } - - Vector m_alternatives; - PatternAlternative* m_parent; - unsigned m_minimumSize; - unsigned m_callFrameSize; - bool m_hasFixedSize; -}; - -// You probably don't want to be calling these functions directly -// (please to be calling newlineCharacterClass() et al on your -// friendly neighborhood RegexPattern instance to get nicely -// cached copies). -CharacterClass* newlineCreate(); -CharacterClass* digitsCreate(); -CharacterClass* spacesCreate(); -CharacterClass* wordcharCreate(); -CharacterClass* nondigitsCreate(); -CharacterClass* nonspacesCreate(); -CharacterClass* nonwordcharCreate(); - -struct TermChain { - TermChain(PatternTerm term) - : term(term) - {} - - PatternTerm term; - Vector hotTerms; -}; - -struct BeginChar { - BeginChar() - : value(0) - , mask(0) - {} - - BeginChar(unsigned value, unsigned mask) - : value(value) - , mask(mask) - {} - - unsigned value; - unsigned mask; -}; - -struct RegexPattern { - RegexPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); - - ~RegexPattern() - { - deleteAllValues(m_disjunctions); - deleteAllValues(m_userCharacterClasses); - } - - void reset() - { - m_numSubpatterns = 0; - m_maxBackReference = 0; - - m_containsBackreferences = false; - m_containsBeginChars = false; - m_containsBOL = false; - - newlineCached = 0; - digitsCached = 0; - spacesCached = 0; - wordcharCached = 0; - nondigitsCached = 0; - nonspacesCached = 0; - nonwordcharCached = 0; - - deleteAllValues(m_disjunctions); - m_disjunctions.clear(); - deleteAllValues(m_userCharacterClasses); - m_userCharacterClasses.clear(); - m_beginChars.clear(); - } - - bool containsIllegalBackReference() - { - return m_maxBackReference > m_numSubpatterns; - } - - CharacterClass* newlineCharacterClass() - { - if (!newlineCached) - m_userCharacterClasses.append(newlineCached = newlineCreate()); - return newlineCached; - } - CharacterClass* digitsCharacterClass() - { - if (!digitsCached) - m_userCharacterClasses.append(digitsCached = digitsCreate()); - return digitsCached; - } - CharacterClass* spacesCharacterClass() - { - if (!spacesCached) - m_userCharacterClasses.append(spacesCached = spacesCreate()); - return spacesCached; - } - CharacterClass* wordcharCharacterClass() - { - if (!wordcharCached) - m_userCharacterClasses.append(wordcharCached = wordcharCreate()); - return wordcharCached; - } - CharacterClass* nondigitsCharacterClass() - { - if (!nondigitsCached) - m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); - return nondigitsCached; - } - CharacterClass* nonspacesCharacterClass() - { - if (!nonspacesCached) - m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); - return nonspacesCached; - } - CharacterClass* nonwordcharCharacterClass() - { - if (!nonwordcharCached) - m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); - return nonwordcharCached; - } - - bool m_ignoreCase : 1; - bool m_multiline : 1; - bool m_containsBackreferences : 1; - bool m_containsBeginChars : 1; - bool m_containsBOL : 1; - unsigned m_numSubpatterns; - unsigned m_maxBackReference; - PatternDisjunction* m_body; - Vector m_disjunctions; - Vector m_userCharacterClasses; - Vector m_beginChars; - -private: - CharacterClass* newlineCached; - CharacterClass* digitsCached; - CharacterClass* spacesCached; - CharacterClass* wordcharCached; - CharacterClass* nondigitsCached; - CharacterClass* nonspacesCached; - CharacterClass* nonwordcharCached; -}; - -} } // namespace JSC::Yarr - -#endif // RegexPattern_h diff --git a/Source/JavaScriptCore/yarr/Yarr.h b/Source/JavaScriptCore/yarr/Yarr.h new file mode 100644 index 0000000..57176bc --- /dev/null +++ b/Source/JavaScriptCore/yarr/Yarr.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef Yarr_h +#define Yarr_h + +#include "YarrInterpreter.h" +#include "YarrJIT.h" +#include "YarrPattern.h" + +namespace JSC { namespace Yarr { + +#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoBackReference 2 +#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1 +#define YarrStackSpaceForBackTrackInfoParentheses 2 + +static const unsigned quantifyInfinite = UINT_MAX; + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + +PassOwnPtr byteCompile(YarrPattern&, BumpPointerAllocator*); +int interpret(BytecodePattern*, const UChar* input, unsigned start, unsigned length, int* output); + +#if ENABLE(YARR_JIT) +void jitCompile(YarrPattern&, JSGlobalData*, YarrCodeBlock& jitObject); +int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output); +#endif + +} } // namespace JSC::Yarr + +#endif // Yarr_h + diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp new file mode 100644 index 0000000..d2dbf23 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp @@ -0,0 +1,1890 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrInterpreter.h" + +#include "Yarr.h" +#include + +#ifndef NDEBUG +#include +#endif + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class Interpreter { +public: + struct ParenthesesDisjunctionContext; + + struct BackTrackInfoPatternCharacter { + uintptr_t matchAmount; + }; + struct BackTrackInfoCharacterClass { + uintptr_t matchAmount; + }; + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. + }; + struct BackTrackInfoAlternative { + uintptr_t offset; + }; + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesOnce { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; + }; + struct BackTrackInfoParentheses { + uintptr_t matchAmount; + ParenthesesDisjunctionContext* lastContext; + }; + + static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) + { + context->next = backTrack->lastContext; + backTrack->lastContext = context; + ++backTrack->matchAmount; + } + + static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) + { + ASSERT(backTrack->matchAmount); + ASSERT(backTrack->lastContext); + backTrack->lastContext = backTrack->lastContext->next; + --backTrack->matchAmount; + } + + struct DisjunctionContext + { + DisjunctionContext() + : term(0) + { + } + + void* operator new(size_t, void* where) + { + return where; + } + + int term; + unsigned matchBegin; + unsigned matchEnd; + uintptr_t frame[1]; + }; + + DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) + { + size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) DisjunctionContext(); + } + + void freeDisjunctionContext(DisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + struct ParenthesesDisjunctionContext + { + ParenthesesDisjunctionContext(int* output, ByteTerm& term) + : next(0) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; + + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { + subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; + output[(firstSubpatternId << 1) + i] = -1; + } + + new(getDisjunctionContext(term)) DisjunctionContext(); + } + + void* operator new(size_t, void* where) + { + return where; + } + + void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + { + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) + output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; + } + + DisjunctionContext* getDisjunctionContext(ByteTerm& term) + { + return reinterpret_cast(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + } + + ParenthesesDisjunctionContext* next; + int subpatternBackup[1]; + }; + + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) + { + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); + } + + void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + class InputStream { + public: + InputStream(const UChar* input, unsigned start, unsigned length) + : input(input) + , pos(start) + , length(length) + { + } + + void next() + { + ++pos; + } + + void rewind(unsigned amount) + { + ASSERT(pos >= amount); + pos -= amount; + } + + int read() + { + ASSERT(pos < length); + if (pos < length) + return input[pos]; + return -1; + } + + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + + int readChecked(int position) + { + ASSERT(position < 0); + ASSERT((unsigned)-position <= pos); + unsigned p = pos + position; + ASSERT(p < length); + return input[p]; + } + + int reread(unsigned from) + { + ASSERT(from < length); + return input[from]; + } + + int prev() + { + ASSERT(!(pos > length)); + if (pos && length) + return input[pos - 1]; + return -1; + } + + unsigned getPos() + { + return pos; + } + + void setPos(unsigned p) + { + pos = p; + } + + bool atStart() + { + return pos == 0; + } + + bool atEnd() + { + return pos == length; + } + + bool checkInput(int count) + { + if ((pos + count) <= length) { + pos += count; + return true; + } else + return false; + } + + void uncheckInput(int count) + { + pos -= count; + } + + bool atStart(int position) + { + return (pos + position) == 0; + } + + bool atEnd(int position) + { + return (pos + position) == length; + } + + bool isNotAvailableInput(int position) + { + return (pos + position) > length; + } + + private: + const UChar* input; + unsigned pos; + unsigned length; + }; + + bool testCharacterClass(CharacterClass* characterClass, int ch) + { + if (ch & 0xFF80) { + for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) + if (ch == characterClass->m_matchesUnicode[i]) + return true; + for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) + if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + return true; + } else { + for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) + if (ch == characterClass->m_matches[i]) + return true; + for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) + if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + return true; + } + + return false; + } + + bool checkCharacter(int testChar, int inputPosition) + { + return testChar == input.readChecked(inputPosition); + } + + bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) + { + int ch = input.readChecked(inputPosition); + return (loChar == ch) || (hiChar == ch); + } + + bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) + { + bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); + return invert ? !match : match; + } + + bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) + { + int matchSize = matchEnd - matchBegin; + + if (!input.checkInput(matchSize)) + return false; + + if (pattern->m_ignoreCase) { + for (int i = 0; i < matchSize; ++i) { + int ch = input.reread(matchBegin + i); + + int lo = Unicode::toLower(ch); + int hi = Unicode::toUpper(ch); + + if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { + input.uncheckInput(matchSize); + return false; + } + } + } else { + for (int i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + input.uncheckInput(matchSize); + return false; + } + } + } + + return true; + } + + bool matchAssertionBOL(ByteTerm& term) + { + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); + } + + bool matchAssertionEOL(ByteTerm& term) + { + if (term.inputPosition) + return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + else + return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + } + + bool matchAssertionWordBoundary(ByteTerm& term) + { + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); + bool readIsWordchar; + if (term.inputPosition) + readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); + else + readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); + + bool wordBoundary = prevIsWordchar != readIsWordchar; + return term.invert() ? !wordBoundary : wordBoundary; + } + + bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) + return false; + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + return true; + } + + case QuantifierNonGreedy: + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. + // In this case the result of match is empty string like when it references to a parentheses with zero-width match. + // Eg.: /(a\1)/ + if (matchEnd == -1) + return true; + + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return true; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + backTrack->begin = input.getPos(); + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + input.setPos(backTrack->begin); + return false; + } + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + ++matchAmount; + backTrack->matchAmount = matchAmount; + return true; + } + + case QuantifierNonGreedy: + backTrack->begin = input.getPos(); + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return false; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + // for quantityCount == 1, could rewind. + input.setPos(backTrack->begin); + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.rewind(matchEnd - matchBegin); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + ++backTrack->matchAmount; + return true; + } else + input.setPos(backTrack->begin); + break; + } + + return false; + } + + void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + } + } + void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + context->restoreOutput(output, firstSubpatternId, count); + } + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + { + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; + + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + + bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: { + // set this speculatively; if we get to the parens end this will be true. + backTrack->begin = input.getPos(); + break; + } + case QuantifierNonGreedy: { + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = input.getPos() + term.inputPosition; + } + + return true; + } + + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); + } + + bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = -1; + output[(subpatternId << 1) + 1] = -1; + } + + switch (term.atom.quantityType) { + case QuantifierGreedy: + // if we backtrack to this point, there is another chance - try matching nothing. + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + case QuantifierNonGreedy: + ASSERT(backTrack->begin != notFound); + case QuantifierFixedCount: + break; + } + + return false; + } + + bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: + if (backTrack->begin == notFound) { + context->term -= term.atom.parenthesesWidth; + return false; + } + case QuantifierNonGreedy: + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); + if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); + unsigned subpatternId = term.atom.subpatternId; + output[subpatternId << 1] = input.getPos() + term.inputPosition; + } + context->term -= term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + return false; + } + + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never backtrack to here. + ASSERT_NOT_REACHED(); + return false; + } + + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + backTrack->begin = input.getPos(); + return true; + } + + bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + // We've reached the end of the parens; if they are inverted, this is failure. + if (term.invert()) { + context->term -= term.atom.parenthesesWidth; + return false; + } + + return true; + } + + bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + // We've failed to match parens; if they are inverted, this is win! + if (term.invert()) { + context->term += term.atom.parenthesesWidth; + return true; + } + + return false; + } + + bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + context->term -= term.atom.parenthesesWidth; + return false; + } + + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + backTrack->matchAmount = 0; + backTrack->lastContext = 0; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + + break; + } + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: + return JSRegExpMatch; + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + // Rules for backtracking differ depending on whether this is greedy or non-greedy. + // + // Greedy matches never should try just adding more - you should already have done + // the 'more' cases. Always backtrack, at least a leetle bit. However cases where + // you backtrack an item off the list needs checking, since we'll never have matched + // the one less case. Tracking forwards, still add as much as possible. + // + // Non-greedy, we've already done the one less case, so don't match on popping. + // We haven't done the one more case, so always try to add that. + // + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + + ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); + + if (result != JSRegExpMatch) + return result; + + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result == JSRegExpNoMatch) { + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } else + return result; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + if (!backTrack->matchAmount) + return JSRegExpNoMatch; + + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + + break; + } + } + } else { + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: { + // If we've not reached the limit, try to add one more match. + if (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { + appendParenthesesDisjunctionContext(backTrack, context); + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + // Nope - okay backtrack looking for an alternative. + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + // successful backtrack! we're back in the game! + if (backTrack->matchAmount) { + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + // pop a match off the stack + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + return result; + } + + return JSRegExpNoMatch; + } + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + void lookupForBeginChars() + { + int character; + bool firstSingleCharFound; + + while (true) { + if (input.isNotAvailableInput(2)) + return; + + firstSingleCharFound = false; + + character = input.readPair(); + + for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { + BeginChar bc = pattern->m_beginChars[i]; + + if (!firstSingleCharFound && bc.value <= 0xFFFF) { + firstSingleCharFound = true; + character &= 0xFFFF; + } + + if ((character | bc.mask) == bc.value) + return; + } + + input.next(); + } + } + +#define MATCH_NEXT() { ++context->term; goto matchAgain; } +#define BACKTRACK() { --context->term; goto backtrack; } +#define currentTerm() (disjunction->terms[context->term]) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) + { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + + if (btrack) + BACKTRACK(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + + context->matchBegin = input.getPos(); + context->term = 0; + + matchAgain: + ASSERT(context->term < static_cast(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + MATCH_NEXT(); + case ByteTerm::TypeSubpatternEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeBodyAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeBodyAlternativeDisjunction: + case ByteTerm::TypeBodyAlternativeEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeAlternativeDisjunction: + case ByteTerm::TypeAlternativeEnd: { + int offset = currentTerm().alternative.end; + BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + backTrack->offset = offset; + context->term += offset; + MATCH_NEXT(); + } + + case ByteTerm::TypeAssertionBOL: + if (matchAssertionBOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionEOL: + if (matchAssertionEOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionWordBoundary: + if (matchAssertionWordBoundary(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypeCharacterClass: + if (matchCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (matchBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (matchParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (matchParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (matchParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (matchParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + if (input.checkInput(currentTerm().checkInputCount)) + MATCH_NEXT(); + BACKTRACK(); + } + + // We should never fall-through to here. + ASSERT_NOT_REACHED(); + + backtrack: + ASSERT(context->term < static_cast(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + return JSRegExpNoMatch; + case ByteTerm::TypeSubpatternEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeBodyAlternativeBegin: + case ByteTerm::TypeBodyAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + + if (input.atEnd()) + return JSRegExpNoMatch; + + input.next(); + + if (pattern->m_containsBeginChars && isBody) + lookupForBeginChars(); + + context->matchBegin = input.getPos(); + + if (currentTerm().alternative.onceThrough) + context->term += currentTerm().alternative.next; + + MATCH_NEXT(); + } + case ByteTerm::TypeBodyAlternativeEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeAlternativeBegin: + case ByteTerm::TypeAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + BACKTRACK(); + } + case ByteTerm::TypeAlternativeEnd: { + // We should never backtrack back into an alternative of the main body of the regex. + BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); + unsigned offset = backTrack->offset; + context->term -= offset; + BACKTRACK(); + } + + case ByteTerm::TypeAssertionBOL: + case ByteTerm::TypeAssertionEOL: + case ByteTerm::TypeAssertionWordBoundary: + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: + case ByteTerm::TypePatternCharacterGreedy: + case ByteTerm::TypePatternCharacterNonGreedy: + if (backtrackPatternCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: + case ByteTerm::TypePatternCasedCharacterGreedy: + case ByteTerm::TypePatternCasedCharacterNonGreedy: + if (backtrackPatternCasedCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeCharacterClass: + if (backtrackCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (backtrackBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (backtrackParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (backtrackParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (backtrackParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (backtrackParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + input.uncheckInput(currentTerm().checkInputCount); + BACKTRACK(); + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { + while (context->matchBegin == context->matchEnd) { + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; + } + return JSRegExpMatch; + } + + return result; + } + + int interpret() + { + allocatorPool = pattern->m_allocator->startAllocator(); + if (!allocatorPool) + CRASH(); + + for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) + output[i] = -1; + + DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); + + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); + if (result == JSRegExpMatch) { + output[0] = context->matchBegin; + output[1] = context->matchEnd; + } + + freeDisjunctionContext(context); + + pattern->m_allocator->stopAllocator(); + + // RegExp.cpp currently expects all error to be converted to -1. + ASSERT((result == JSRegExpMatch) == (output[0] != -1)); + return output[0]; + } + + Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) + : pattern(pattern) + , output(output) + , input(inputChar, start, length) + , allocatorPool(0) + , remainingMatchCount(matchLimit) + { + } + +private: + BytecodePattern *pattern; + int* output; + InputStream input; + BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; +}; + + + +class ByteCompiler { + struct ParenthesesStackEntry { + unsigned beginTerm; + unsigned savedAlternativeIndex; + ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) + : beginTerm(beginTerm) + , savedAlternativeIndex(savedAlternativeIndex) + { + } + }; + +public: + ByteCompiler(YarrPattern& pattern) + : m_pattern(pattern) + { + m_currentAlternativeIndex = 0; + } + + PassOwnPtr compile(BumpPointerAllocator* allocator) + { + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); + emitDisjunction(m_pattern.m_body); + regexEnd(); + + return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + } + + void checkInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); + } + + void assertionBOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); + } + + void assertionEOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); + } + + void assertionWordBoundary(bool invert, int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); + } + + void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + if (m_pattern.m_ignoreCase) { + UChar lo = Unicode::toLower(ch); + UChar hi = Unicode::toUpper(ch); + + if (lo != hi) { + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + return; + } + } + + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + } + + void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + ASSERT(subpatternId); + + m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invert = m_bodyDisjunction->terms[beginTerm].invert(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + unsigned popParenthesesStack() + { + ASSERT(m_parenthesesStack.size()); + int stackEnd = m_parenthesesStack.size() - 1; + unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; + m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; + m_parenthesesStack.shrink(stackEnd); + + ASSERT(beginTerm < m_bodyDisjunction->terms.size()); + ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); + + return beginTerm; + } + +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction) + { + printf("ByteDisjunction(%p):\n\t", disjunction); + for (unsigned i = 0; i < disjunction->terms.size(); ++i) + printf("{ %d } ", disjunction->terms[i].type); + printf("\n"); + } +#endif + + void closeAlternative(int beginTerm) + { + int origBeginTerm = beginTerm; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + if (!m_bodyDisjunction->terms[beginTerm].alternative.next) + m_bodyDisjunction->terms.remove(beginTerm); + else { + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + } + + void closeBodyAlternative() + { + int beginTerm = 0; + int origBeginTerm = 0; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool capture = parenthesesBegin.capture(); + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) + { + m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); + m_bodyDisjunction->terms[0].frameLocation = 0; + m_currentAlternativeIndex = 0; + } + + void regexEnd() + { + closeBodyAlternative(); + } + + void alternativeBodyDisjunction(bool onceThrough) + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void alternativeDisjunction() + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (alt) { + if (disjunction == m_pattern.m_body) + alternativeBodyDisjunction(alternative->onceThrough()); + else + alternativeDisjunction(); + } + + unsigned minimumSize = alternative->m_minimumSize; + int countToCheck; + + if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize) + countToCheck = 0; + else + countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + + ASSERT(countToCheck >= 0); + if (countToCheck) { + checkInput(countToCheck); + currentCountAlreadyChecked += countToCheck; + } + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + assertionBOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionEOL: + assertionEOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionWordBoundary: + assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypePatternCharacter: + atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeCharacterClass: + atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeBackReference: + atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: { + unsigned disjunctionAlreadyCheckedCount = 0; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) + disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; + else + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + break; + } + + case PatternTerm::TypeParentheticalAssertion: { + unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; + + ASSERT(currentCountAlreadyChecked >= (unsigned)term.inputPosition); + int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; + + atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + break; + } + } + } + } + } + +private: + YarrPattern& m_pattern; + OwnPtr m_bodyDisjunction; + unsigned m_currentAlternativeIndex; + Vector m_parenthesesStack; + Vector m_allParenthesesInfo; +}; + +PassOwnPtr byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); +} + +int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output) +{ + return Interpreter(bytecode, output, input, start, length).interpret(); +} + +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); + + +} } diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.h b/Source/JavaScriptCore/yarr/YarrInterpreter.h new file mode 100644 index 0000000..be703a9 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.h @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrInterpreter_h +#define YarrInterpreter_h + +#include "YarrPattern.h" +#include +#include + +namespace WTF { +class BumpPointerAllocator; +} +using WTF::BumpPointerAllocator; + +namespace JSC { namespace Yarr { + +class ByteDisjunction; + +struct ByteTerm { + enum Type { + TypeBodyAlternativeBegin, + TypeBodyAlternativeDisjunction, + TypeBodyAlternativeEnd, + TypeAlternativeBegin, + TypeAlternativeDisjunction, + TypeAlternativeEnd, + TypeSubpatternBegin, + TypeSubpatternEnd, + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacterOnce, + TypePatternCharacterFixed, + TypePatternCharacterGreedy, + TypePatternCharacterNonGreedy, + TypePatternCasedCharacterOnce, + TypePatternCasedCharacterFixed, + TypePatternCasedCharacterGreedy, + TypePatternCasedCharacterNonGreedy, + TypeCharacterClass, + TypeBackReference, + TypeParenthesesSubpattern, + TypeParenthesesSubpatternOnceBegin, + TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, + TypeParentheticalAssertionBegin, + TypeParentheticalAssertionEnd, + TypeCheckInput, + } type; + union { + struct { + union { + UChar patternCharacter; + struct { + UChar lo; + UChar hi; + } casedCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + }; + union { + ByteDisjunction* parenthesesDisjunction; + unsigned parenthesesWidth; + }; + QuantifierType quantityType; + unsigned quantityCount; + } atom; + struct { + int next; + int end; + bool onceThrough; + } alternative; + unsigned checkInputCount; + }; + unsigned frameLocation; + bool m_capture : 1; + bool m_invert : 1; + int inputPosition; + + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCharacterNonGreedy; + break; + } + + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCasedCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCasedCharacterNonGreedy; + break; + } + + atom.casedCharacter.lo = lo; + atom.casedCharacter.hi = hi; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + : type(ByteTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + atom.characterClass = characterClass; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(false) + { + atom.subpatternId = subpatternId; + atom.parenthesesDisjunction = parenthesesInfo; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + } + + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + atom.subpatternId = subpatternId; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + static ByteTerm BOL(int inputPos) + { + ByteTerm term(TypeAssertionBOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm CheckInput(unsigned count) + { + ByteTerm term(TypeCheckInput); + term.checkInputCount = count; + return term; + } + + static ByteTerm EOL(int inputPos) + { + ByteTerm term(TypeAssertionEOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm WordBoundary(bool invert, int inputPos) + { + ByteTerm term(TypeAssertionWordBoundary, invert); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm BackReference(unsigned subpatternId, int inputPos) + { + return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); + } + + static ByteTerm BodyAlternativeBegin(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeEnd() + { + ByteTerm term(TypeBodyAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeBegin() + { + ByteTerm term(TypeAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeDisjunction() + { + ByteTerm term(TypeAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeEnd() + { + ByteTerm term(TypeAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm SubpatternBegin() + { + return ByteTerm(TypeSubpatternBegin); + } + + static ByteTerm SubpatternEnd() + { + return ByteTerm(TypeSubpatternEnd); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } +}; + +class ByteDisjunction : public FastAllocBase { +public: + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) + : m_numSubpatterns(numSubpatterns) + , m_frameSize(frameSize) + { + } + + Vector terms; + unsigned m_numSubpatterns; + unsigned m_frameSize; +}; + +struct BytecodePattern : FastAllocBase { + BytecodePattern(PassOwnPtr body, Vector allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator) + : m_body(body) + , m_ignoreCase(pattern.m_ignoreCase) + , m_multiline(pattern.m_multiline) + , m_containsBeginChars(pattern.m_containsBeginChars) + , m_allocator(allocator) + { + newlineCharacterClass = pattern.newlineCharacterClass(); + wordcharCharacterClass = pattern.wordcharCharacterClass(); + + m_allParenthesesInfo.append(allParenthesesInfo); + m_userCharacterClasses.append(pattern.m_userCharacterClasses); + // 'Steal' the YarrPattern's CharacterClasses! We clear its + // array, so that it won't delete them on destruction. We'll + // take responsibility for that. + pattern.m_userCharacterClasses.clear(); + + m_beginChars.append(pattern.m_beginChars); + } + + ~BytecodePattern() + { + deleteAllValues(m_allParenthesesInfo); + deleteAllValues(m_userCharacterClasses); + } + + OwnPtr m_body; + bool m_ignoreCase; + bool m_multiline; + bool m_containsBeginChars; + // Each BytecodePattern is associated with a RegExp, each RegExp is associated + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. + BumpPointerAllocator* m_allocator; + + CharacterClass* newlineCharacterClass; + CharacterClass* wordcharCharacterClass; + + Vector m_beginChars; + +private: + Vector m_allParenthesesInfo; + Vector m_userCharacterClasses; +}; + +} } // namespace JSC::Yarr + +#endif // YarrInterpreter_h diff --git a/Source/JavaScriptCore/yarr/YarrJIT.cpp b/Source/JavaScriptCore/yarr/YarrJIT.cpp new file mode 100644 index 0000000..ae59cba --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrJIT.cpp @@ -0,0 +1,2242 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrJIT.h" + +#include "ASCIICType.h" +#include "JSGlobalData.h" +#include "LinkBuffer.h" +#include "MacroAssembler.h" +#include "Yarr.h" + +#if ENABLE(YARR_JIT) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class YarrGenerator : private MacroAssembler { + friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + +#if CPU(ARM) + static const RegisterID input = ARMRegisters::r0; + static const RegisterID index = ARMRegisters::r1; + static const RegisterID length = ARMRegisters::r2; + static const RegisterID output = ARMRegisters::r4; + + static const RegisterID regT0 = ARMRegisters::r5; + static const RegisterID regT1 = ARMRegisters::r6; + + static const RegisterID returnRegister = ARMRegisters::r0; +#elif CPU(MIPS) + static const RegisterID input = MIPSRegisters::a0; + static const RegisterID index = MIPSRegisters::a1; + static const RegisterID length = MIPSRegisters::a2; + static const RegisterID output = MIPSRegisters::a3; + + static const RegisterID regT0 = MIPSRegisters::t4; + static const RegisterID regT1 = MIPSRegisters::t5; + + static const RegisterID returnRegister = MIPSRegisters::v0; +#elif CPU(X86) + static const RegisterID input = X86Registers::eax; + static const RegisterID index = X86Registers::edx; + static const RegisterID length = X86Registers::ecx; + static const RegisterID output = X86Registers::edi; + + static const RegisterID regT0 = X86Registers::ebx; + static const RegisterID regT1 = X86Registers::esi; + + static const RegisterID returnRegister = X86Registers::eax; +#elif CPU(X86_64) + static const RegisterID input = X86Registers::edi; + static const RegisterID index = X86Registers::esi; + static const RegisterID length = X86Registers::edx; + static const RegisterID output = X86Registers::ecx; + + static const RegisterID regT0 = X86Registers::eax; + static const RegisterID regT1 = X86Registers::ebx; + + static const RegisterID returnRegister = X86Registers::eax; +#endif + + void optimizeAlternative(PatternAlternative* alternative) + { + if (!alternative->m_terms.size()) + return; + + for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { + PatternTerm& term = alternative->m_terms[i]; + PatternTerm& nextTerm = alternative->m_terms[i + 1]; + + if ((term.type == PatternTerm::TypeCharacterClass) + && (term.quantityType == QuantifierFixedCount) + && (nextTerm.type == PatternTerm::TypePatternCharacter) + && (nextTerm.quantityType == QuantifierFixedCount)) { + PatternTerm termCopy = term; + alternative->m_terms[i] = nextTerm; + alternative->m_terms[i + 1] = termCopy; + } + } + } + + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + { + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + // generate code for all ranges before this one + if (which) + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); + ++*matchIndex; + } + failures.append(jump()); + + loOrAbove.link(this); + } else if (which) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(jump()); + + loOrAbove.link(this); + } else + failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); + } + + void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) + { + if (charClass->m_table) { + ExtendedAddress tableEntry(character, reinterpret_cast(charClass->m_table->m_table)); + matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); + return; + } + Jump unicodeFail; + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { + Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f)); + + if (charClass->m_matchesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { + UChar ch = charClass->m_matchesUnicode[i]; + matchDest.append(branch32(Equal, character, Imm32(ch))); + } + } + + if (charClass->m_rangesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { + UChar lo = charClass->m_rangesUnicode[i].begin; + UChar hi = charClass->m_rangesUnicode[i].end; + + Jump below = branch32(LessThan, character, Imm32(lo)); + matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); + below.link(this); + } + } + + unicodeFail = jump(); + isAscii.link(this); + } + + if (charClass->m_ranges.size()) { + unsigned matchIndex = 0; + JumpList failures; + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); + while (matchIndex < charClass->m_matches.size()) + matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); + + failures.link(this); + } else if (charClass->m_matches.size()) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector matchesAZaz; + + for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { + char ch = charClass->m_matches[i]; + if (m_pattern.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + or32(Imm32(32), character); + for (unsigned i = 0; i < countAZaz; ++i) + matchDest.append(branch32(Equal, character, Imm32(matchesAZaz[i]))); + } + } + + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) + unicodeFail.link(this); + } + + // Jumps if input not available; will have (incorrectly) incremented already! + Jump jumpIfNoAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(Above, index, length); + } + + Jump jumpIfAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(BelowOrEqual, index, length); + } + + Jump checkInput() + { + return branch32(BelowOrEqual, index, length); + } + + Jump atEndOfInput() + { + return branch32(Equal, index, length); + } + + Jump notAtEndOfInput() + { + return branch32(NotEqual, index, length); + } + + Jump jumpIfCharEquals(UChar ch, int inputPosition) + { + return branch16(Equal, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + Jump jumpIfCharNotEquals(UChar ch, int inputPosition) + { + return branch16(NotEqual, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + void readCharacter(int inputPosition, RegisterID reg) + { + load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + } + + void storeToFrame(RegisterID reg, unsigned frameLocation) + { + poke(reg, frameLocation); + } + + void storeToFrame(Imm32 imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } + + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) + { + return storePtrWithPatch(ImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void loadFromFrame(unsigned frameLocation, RegisterID reg) + { + peek(reg, frameLocation); + } + + void loadFromFrameAndJump(unsigned frameLocation) + { + jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + struct IndirectJumpEntry { + IndirectJumpEntry(int32_t stackOffset) + : m_stackOffset(stackOffset) + { + } + + IndirectJumpEntry(int32_t stackOffset, Jump jump) + : m_stackOffset(stackOffset) + { + addJump(jump); + } + + IndirectJumpEntry(int32_t stackOffset, DataLabelPtr dataLabel) + : m_stackOffset(stackOffset) + { + addDataLabel(dataLabel); + } + + void addJump(Jump jump) + { + m_relJumps.append(jump); + } + + void addDataLabel(DataLabelPtr dataLabel) + { + m_dataLabelPtrVector.append(dataLabel); + } + + int32_t m_stackOffset; + JumpList m_relJumps; + Vector m_dataLabelPtrVector; + }; + + struct AlternativeBacktrackRecord { + DataLabelPtr dataLabel; + Label backtrackLocation; + + AlternativeBacktrackRecord(DataLabelPtr dataLabel, Label backtrackLocation) + : dataLabel(dataLabel) + , backtrackLocation(backtrackLocation) + { + } + }; + + struct ParenthesesTail; + struct TermGenerationState; + + struct GenerationState { + typedef HashMap, UnsignedWithZeroKeyHashTraits > IndirectJumpHashMap; + + GenerationState() + : m_parenNestingLevel(0) + { + } + + void addIndirectJumpEntry(int32_t stackOffset, Jump jump) + { + IndirectJumpHashMap::iterator result = m_indirectJumpMap.find(stackOffset); + + ASSERT(stackOffset >= 0); + + uint32_t offset = static_cast(stackOffset); + + if (result == m_indirectJumpMap.end()) + m_indirectJumpMap.add(offset, new IndirectJumpEntry(stackOffset, jump)); + else + result->second->addJump(jump); + } + + void addIndirectJumpEntry(int32_t stackOffset, JumpList jumps) + { + JumpList::JumpVector jumpVector = jumps.jumps(); + size_t size = jumpVector.size(); + for (size_t i = 0; i < size; ++i) + addIndirectJumpEntry(stackOffset, jumpVector[i]); + + jumps.empty(); + } + + void addIndirectJumpEntry(int32_t stackOffset, DataLabelPtr dataLabel) + { + IndirectJumpHashMap::iterator result = m_indirectJumpMap.find(stackOffset); + + ASSERT(stackOffset >= 0); + + uint32_t offset = static_cast(stackOffset); + + if (result == m_indirectJumpMap.end()) + m_indirectJumpMap.add(offset, new IndirectJumpEntry(stackOffset, dataLabel)); + else + result->second->addDataLabel(dataLabel); + } + + void emitIndirectJumpTable(MacroAssembler* masm) + { + for (IndirectJumpHashMap::iterator iter = m_indirectJumpMap.begin(); iter != m_indirectJumpMap.end(); ++iter) { + IndirectJumpEntry* indJumpEntry = iter->second; + size_t size = indJumpEntry->m_dataLabelPtrVector.size(); + if (size) { + // Link any associated DataLabelPtr's with indirect jump via label + Label hereLabel = masm->label(); + for (size_t i = 0; i < size; ++i) + m_backtrackRecords.append(AlternativeBacktrackRecord(indJumpEntry->m_dataLabelPtrVector[i], hereLabel)); + } + indJumpEntry->m_relJumps.link(masm); + masm->jump(Address(stackPointerRegister, indJumpEntry->m_stackOffset)); + delete indJumpEntry; + } + } + + void incrementParenNestingLevel() + { + ++m_parenNestingLevel; + } + + void decrementParenNestingLevel() + { + --m_parenNestingLevel; + } + + ParenthesesTail* addParenthesesTail(PatternTerm& term, ParenthesesTail* nextOuterParenTail) + { + ParenthesesTail* parenthesesTail = new ParenthesesTail(term, m_parenNestingLevel, nextOuterParenTail); + m_parenTails.append(parenthesesTail); + m_parenTailsForIteration.append(parenthesesTail); + + return parenthesesTail; + } + + void emitParenthesesTail(YarrGenerator* generator) + { + unsigned vectorSize = m_parenTails.size(); + bool priorBacktrackFallThrough = false; + + // Emit in reverse order so parentTail N can fall through to N-1 + for (unsigned index = vectorSize; index > 0; --index) { + JumpList jumpsToNext; + priorBacktrackFallThrough = m_parenTails[index-1].get()->generateCode(generator, jumpsToNext, priorBacktrackFallThrough, index > 1); + if (index > 1) + jumpsToNext.linkTo(generator->label(), generator); + else + addJumpsToNextInteration(jumpsToNext); + } + m_parenTails.clear(); + } + + void addJumpToNextInteration(Jump jump) + { + m_jumpsToNextInteration.append(jump); + } + + void addJumpsToNextInteration(JumpList jumps) + { + m_jumpsToNextInteration.append(jumps); + } + + void addDataLabelToNextIteration(DataLabelPtr dataLabel) + { + m_dataPtrsToNextIteration.append(dataLabel); + } + + void linkToNextIteration(Label label) + { + m_nextIteration = label; + + for (unsigned i = 0; i < m_dataPtrsToNextIteration.size(); ++i) + m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataPtrsToNextIteration[i], m_nextIteration)); + + m_dataPtrsToNextIteration.clear(); + + for (unsigned i = 0; i < m_parenTailsForIteration.size(); ++i) + m_parenTailsForIteration[i]->setNextIteration(m_nextIteration); + + m_parenTailsForIteration.clear(); + } + + void linkToNextIteration(YarrGenerator* generator) + { + m_jumpsToNextInteration.linkTo(m_nextIteration, generator); + } + + int m_parenNestingLevel; + Vector m_backtrackRecords; + IndirectJumpHashMap m_indirectJumpMap; + Label m_nextIteration; + Vector > m_parenTails; + JumpList m_jumpsToNextInteration; + Vector m_dataPtrsToNextIteration; + Vector m_parenTailsForIteration; + }; + + struct BacktrackDestination { + typedef enum { + NoBacktrack, + BacktrackLabel, + BacktrackStackOffset, + BacktrackJumpList, + BacktrackLinked + } BacktrackType; + + BacktrackDestination() + : m_backtrackType(NoBacktrack) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + BacktrackDestination(int32_t stackOffset) + : m_backtrackType(BacktrackStackOffset) + , m_backtrackStackOffset(stackOffset) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + BacktrackDestination(Label label) + : m_backtrackType(BacktrackLabel) + , m_backtrackLabel(label) + , m_backtrackToLabel(0) + , m_subDataLabelPtr(0) + , m_nextBacktrack(0) + , m_backtrackSourceLabel(0) + , m_backtrackSourceJumps(0) + { + } + + void clear(bool doDataLabelClear = true) + { + m_backtrackType = NoBacktrack; + if (doDataLabelClear) + clearDataLabel(); + m_nextBacktrack = 0; + } + + void clearDataLabel() + { + m_dataLabelPtr = DataLabelPtr(); + } + + bool hasDestination() + { + return (m_backtrackType != NoBacktrack); + } + + bool isStackOffset() + { + return (m_backtrackType == BacktrackStackOffset); + } + + bool isLabel() + { + return (m_backtrackType == BacktrackLabel); + } + + bool isJumpList() + { + return (m_backtrackType == BacktrackJumpList); + } + + bool hasDataLabel() + { + return m_dataLabelPtr.isSet(); + } + + void copyTarget(BacktrackDestination& rhs, bool copyDataLabel = true) + { + m_backtrackType = rhs.m_backtrackType; + if (m_backtrackType == BacktrackStackOffset) + m_backtrackStackOffset = rhs.m_backtrackStackOffset; + else if (m_backtrackType == BacktrackLabel) + m_backtrackLabel = rhs.m_backtrackLabel; + if (copyDataLabel) + m_dataLabelPtr = rhs.m_dataLabelPtr; + m_backtrackSourceJumps = rhs.m_backtrackSourceJumps; + m_backtrackSourceLabel = rhs.m_backtrackSourceLabel; + } + + void copyTo(BacktrackDestination& lhs) + { + lhs.m_backtrackType = m_backtrackType; + if (m_backtrackType == BacktrackStackOffset) + lhs.m_backtrackStackOffset = m_backtrackStackOffset; + else if (m_backtrackType == BacktrackLabel) + lhs.m_backtrackLabel = m_backtrackLabel; + lhs.m_backtrackSourceJumps = m_backtrackSourceJumps; + lhs.m_backtrackSourceLabel = m_backtrackSourceLabel; + lhs.m_dataLabelPtr = m_dataLabelPtr; + lhs.m_backTrackJumps = m_backTrackJumps; + } + + void addBacktrackJump(Jump jump) + { + m_backTrackJumps.append(jump); + } + + void setStackOffset(int32_t stackOffset) + { + m_backtrackType = BacktrackStackOffset; + m_backtrackStackOffset = stackOffset; + } + + void setLabel(Label label) + { + m_backtrackType = BacktrackLabel; + m_backtrackLabel = label; + } + + void setNextBacktrackLabel(Label label) + { + if (m_nextBacktrack) + m_nextBacktrack->setLabel(label); + } + + void copyBacktrackToLabel(BacktrackDestination& rhs) + { + if (rhs.m_backtrackToLabel) + m_backtrackToLabel = rhs.m_backtrackToLabel; + } + + void setBacktrackToLabel(Label* backtrackToLabel) + { + if (!m_backtrackToLabel) + m_backtrackToLabel = backtrackToLabel; + } + + void setBacktrackJumpList(JumpList* jumpList) + { + m_backtrackType = BacktrackJumpList; + m_backtrackSourceJumps = jumpList; + } + + void setBacktrackSourceLabel(Label* backtrackSourceLabel) + { + m_backtrackSourceLabel = backtrackSourceLabel; + } + + void setDataLabel(DataLabelPtr dp) + { + if (m_subDataLabelPtr) { + *m_subDataLabelPtr = dp; + m_subDataLabelPtr = 0; + } else + m_dataLabelPtr = dp; + } + + void clearSubDataLabelPtr() + { + m_subDataLabelPtr = 0; + } + + void setSubDataLabelPtr(DataLabelPtr* subDataLabelPtr) + { + m_subDataLabelPtr = subDataLabelPtr; + } + + void linkToNextBacktrack(BacktrackDestination* nextBacktrack) + { + m_nextBacktrack = nextBacktrack; + } + + int32_t getStackOffset() + { + ASSERT(m_backtrackType == BacktrackStackOffset); + return m_backtrackStackOffset; + } + + Label getLabel() + { + ASSERT(m_backtrackType == BacktrackLabel); + return m_backtrackLabel; + } + + JumpList& getBacktrackJumps() + { + return m_backTrackJumps; + } + + DataLabelPtr& getDataLabel() + { + return m_dataLabelPtr; + } + + void jumpToBacktrack(MacroAssembler* masm) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + masm->jump().linkTo(*m_backtrackSourceLabel, masm); + else + m_backtrackSourceJumps->append(masm->jump()); + } else if (isStackOffset()) + masm->jump(Address(stackPointerRegister, m_backtrackStackOffset)); + else if (isLabel()) + masm->jump().linkTo(m_backtrackLabel, masm); + else + m_backTrackJumps.append(masm->jump()); + } + + void jumpToBacktrack(YarrGenerator* generator, Jump jump) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + jump.linkTo(*m_backtrackSourceLabel, generator); + else + m_backtrackSourceJumps->append(jump); + } else if (isStackOffset()) + generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jump); + else if (isLabel()) + jump.linkTo(getLabel(), generator); + else + m_backTrackJumps.append(jump); + } + + void jumpToBacktrack(YarrGenerator* generator, JumpList& jumps) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + jumps.linkTo(*m_backtrackSourceLabel, generator); + else + m_backtrackSourceJumps->append(jumps); + } else if (isStackOffset()) + generator->m_expressionState.addIndirectJumpEntry(getStackOffset(), jumps); + else if (isLabel()) + jumps.linkTo(getLabel(), generator); + else + m_backTrackJumps.append(jumps); + } + + bool plantJumpToBacktrackIfExists(YarrGenerator* generator) + { + if (isJumpList()) { + if (m_backtrackSourceLabel && (m_backtrackSourceLabel->isSet())) + generator->jump(*m_backtrackSourceLabel); + else + m_backtrackSourceJumps->append(generator->jump()); + + return true; + } + + if (isStackOffset()) { + generator->jump(Address(stackPointerRegister, getStackOffset())); + return true; + } + + if (isLabel()) { + generator->jump(getLabel()); + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), getLabel())); + clearDataLabel(); + } + return true; + } + + return false; + } + + void linkAlternativeBacktracks(YarrGenerator* generator, bool nextIteration = false) + { + Label hereLabel = generator->label(); + + if (m_backtrackToLabel) { + *m_backtrackToLabel = hereLabel; + m_backtrackToLabel = 0; + } + + m_backTrackJumps.link(generator); + + if (nextIteration) + generator->m_expressionState.linkToNextIteration(hereLabel); + + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), hereLabel)); + // data label cleared as a result of the clear() below + } + + clear(); + } + + void linkAlternativeBacktracksTo(YarrGenerator* generator, Label label, bool nextIteration = false) + { + m_backTrackJumps.linkTo(label, generator); + + if (nextIteration) + generator->m_expressionState.linkToNextIteration(label); + + if (hasDataLabel()) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(getDataLabel(), label)); + clearDataLabel(); + } + } + + private: + BacktrackType m_backtrackType; + int32_t m_backtrackStackOffset; + Label m_backtrackLabel; + DataLabelPtr m_dataLabelPtr; + Label* m_backtrackToLabel; + DataLabelPtr* m_subDataLabelPtr; + BacktrackDestination* m_nextBacktrack; + Label* m_backtrackSourceLabel; + JumpList* m_backtrackSourceJumps; + JumpList m_backTrackJumps; + }; + + struct TermGenerationState { + TermGenerationState(PatternDisjunction* disjunction, unsigned checkedTotal) + : disjunction(disjunction) + , checkedTotal(checkedTotal) + , m_subParenNum(0) + , m_linkedBacktrack(0) + , m_parenthesesTail(0) + { + } + + void resetAlternative() + { + m_backtrack.clear(); + alt = 0; + } + bool alternativeValid() + { + return alt < disjunction->m_alternatives.size(); + } + void nextAlternative() + { + ++alt; + } + PatternAlternative* alternative() + { + return disjunction->m_alternatives[alt]; + } + bool isLastAlternative() + { + return (alt + 1) == disjunction->m_alternatives.size(); + } + + void resetTerm() + { + ASSERT(alternativeValid()); + t = 0; + m_subParenNum = 0; + } + bool termValid() + { + ASSERT(alternativeValid()); + return t < alternative()->m_terms.size(); + } + void nextTerm() + { + ASSERT(alternativeValid()); + ++t; + } + PatternTerm& term() + { + ASSERT(alternativeValid()); + return alternative()->m_terms[t]; + } + bool isLastTerm() + { + ASSERT(alternativeValid()); + return (t + 1) == alternative()->m_terms.size(); + } + unsigned getSubParenNum() + { + return m_subParenNum++; + } + bool isMainDisjunction() + { + return !disjunction->m_parent; + } + + void setParenthesesTail(ParenthesesTail* parenthesesTail) + { + m_parenthesesTail = parenthesesTail; + } + + ParenthesesTail* getParenthesesTail() + { + return m_parenthesesTail; + } + + PatternTerm& lookaheadTerm() + { + ASSERT(alternativeValid()); + ASSERT((t + 1) < alternative()->m_terms.size()); + return alternative()->m_terms[t + 1]; + } + bool isSinglePatternCharacterLookaheadTerm() + { + ASSERT(alternativeValid()); + return ((t + 1) < alternative()->m_terms.size()) + && (lookaheadTerm().type == PatternTerm::TypePatternCharacter) + && (lookaheadTerm().quantityType == QuantifierFixedCount) + && (lookaheadTerm().quantityCount == 1); + } + + int inputOffset() + { + return term().inputPosition - checkedTotal; + } + + void clearBacktrack() + { + m_backtrack.clear(false); + m_linkedBacktrack = 0; + } + + void jumpToBacktrack(MacroAssembler* masm) + { + m_backtrack.jumpToBacktrack(masm); + } + + void jumpToBacktrack(YarrGenerator* generator, Jump jump) + { + m_backtrack.jumpToBacktrack(generator, jump); + } + + void jumpToBacktrack(YarrGenerator* generator, JumpList& jumps) + { + m_backtrack.jumpToBacktrack(generator, jumps); + } + + bool plantJumpToBacktrackIfExists(YarrGenerator* generator) + { + return m_backtrack.plantJumpToBacktrackIfExists(generator); + } + + bool linkDataLabelToBacktrackIfExists(YarrGenerator* generator, DataLabelPtr dataLabel) + { + // If we have a stack offset backtrack destination, use it directly + if (m_backtrack.isStackOffset()) { + generator->m_expressionState.addIndirectJumpEntry(m_backtrack.getStackOffset(), dataLabel); + m_backtrack.clearSubDataLabelPtr(); + } else { + // Otherwise set the data label (which may be linked) + setBacktrackDataLabel(dataLabel); + + if ((m_backtrack.isLabel()) && (m_backtrack.hasDataLabel())) { + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_backtrack.getDataLabel(), m_backtrack.getLabel())); + m_backtrack.clearDataLabel(); + return true; + } + } + + return false; + } + + void addBacktrackJump(Jump jump) + { + m_backtrack.addBacktrackJump(jump); + } + + void setBacktrackDataLabel(DataLabelPtr dp) + { + m_backtrack.setDataLabel(dp); + } + + void setBackTrackStackOffset(int32_t stackOffset) + { + m_backtrack.setStackOffset(stackOffset); + } + + void setBacktrackLabel(Label label) + { + m_backtrack.setLabel(label); + } + + void linkAlternativeBacktracks(YarrGenerator* generator, bool nextIteration = false) + { + m_backtrack.linkAlternativeBacktracks(generator, nextIteration); + m_linkedBacktrack = 0; + } + + void linkAlternativeBacktracksTo(YarrGenerator* generator, Label label, bool nextIteration = false) + { + m_backtrack.linkAlternativeBacktracksTo(generator, label, nextIteration); + } + + void setBacktrackLink(BacktrackDestination* linkedBacktrack) + { + m_linkedBacktrack = linkedBacktrack; + } + + void chainBacktracks(BacktrackDestination* followonBacktrack) + { + if (m_linkedBacktrack) + m_linkedBacktrack->linkToNextBacktrack(followonBacktrack); + } + + BacktrackDestination& getBacktrackDestination() + { + return m_backtrack; + } + + void propagateBacktrackingFrom(YarrGenerator* generator, BacktrackDestination& backtrack, bool doJump = true) + { + if (doJump) + m_backtrack.jumpToBacktrack(generator, backtrack.getBacktrackJumps()); + if (backtrack.hasDestination()) { + if (m_backtrack.hasDataLabel()) + generator->m_expressionState.addDataLabelToNextIteration(m_backtrack.getDataLabel()); + + m_backtrack.copyTarget(backtrack, doJump); + } + } + + PatternDisjunction* disjunction; + int checkedTotal; + private: + unsigned alt; + unsigned t; + unsigned m_subParenNum; + BacktrackDestination m_backtrack; + BacktrackDestination* m_linkedBacktrack; + ParenthesesTail* m_parenthesesTail; + + }; + + struct ParenthesesTail { + ParenthesesTail(PatternTerm& term, int nestingLevel, ParenthesesTail* nextOuterParenTail) + : m_term(term) + , m_nestingLevel(nestingLevel) + , m_subParenIndex(0) + , m_nextOuterParenTail(nextOuterParenTail) + { + } + + void processBacktracks(YarrGenerator* generator, TermGenerationState& state, TermGenerationState& parenthesesState, Label nonGreedyTryParentheses, Label fallThrough) + { + m_nonGreedyTryParentheses = nonGreedyTryParentheses; + m_fallThrough = fallThrough; + + m_subParenIndex = state.getSubParenNum(); + parenthesesState.getBacktrackDestination().copyTo(m_parenBacktrack); + state.chainBacktracks(&m_backtrack); + BacktrackDestination& stateBacktrack = state.getBacktrackDestination(); + stateBacktrack.copyTo(m_backtrack); + stateBacktrack.setBacktrackToLabel(&m_backtrackToLabel); + state.setBacktrackLink(&m_backtrack); + stateBacktrack.setSubDataLabelPtr(&m_dataAfterLabelPtr); + + m_doDirectBacktrack = m_parenBacktrack.hasDestination(); + + if ((m_term.quantityType == QuantifierGreedy) || (m_term.quantityType == QuantifierNonGreedy)) + m_doDirectBacktrack = false; + + if (m_doDirectBacktrack) + state.propagateBacktrackingFrom(generator, m_parenBacktrack, false); + else { + stateBacktrack.setBacktrackJumpList(&m_pattBacktrackJumps); + stateBacktrack.setBacktrackSourceLabel(&m_backtrackFromAfterParens); + } + } + + void setNextIteration(Label nextIteration) + { + if (!m_nestingLevel && !m_backtrackToLabel.isSet()) + m_backtrackToLabel = nextIteration; + } + + void addAfterParenJump(Jump jump) + { + m_pattBacktrackJumps.append(jump); + } + + bool generateCode(YarrGenerator* generator, JumpList& jumpsToNext, bool priorBackTrackFallThrough, bool nextBacktrackFallThrough) + { + const RegisterID indexTemporary = regT0; + unsigned parenthesesFrameLocation = m_term.frameLocation; + Jump fromPriorBacktrack; + bool needJumpForPriorParenTail = false; + + if (priorBackTrackFallThrough + && ((m_term.quantityType == QuantifierGreedy) + || (m_term.quantityType == QuantifierNonGreedy) + || (!m_doDirectBacktrack && m_parenBacktrack.hasDestination()))) { + // If the prior paren tail code assumed that it could fall through, + // but we need to generate after paren backtrack code, then provide + // a jump around that code for the prior paren tail code. + // A regular expressing like ((xxx)...)? needs this. + fromPriorBacktrack = generator->jump(); + needJumpForPriorParenTail = true; + } + + if (!m_backtrack.hasDestination()) { + if (m_backtrackToLabel.isSet()) { + m_backtrack.setLabel(m_backtrackToLabel); + nextBacktrackFallThrough = false; + } else if (!m_subParenIndex && m_nextOuterParenTail) { + // If we don't have a destination and we are the first term of a nested paren, go + // back to the outer paren. + // There is an optimization if the next outer paren is the next paren to be emitted. + // In that case we really want the else clause. + m_backtrack.setBacktrackJumpList(&m_nextOuterParenTail->m_withinBacktrackJumps); + nextBacktrackFallThrough = false; + } else + m_backtrack.setBacktrackJumpList(&jumpsToNext); + } else + nextBacktrackFallThrough = false; + + // A failure AFTER the parens jumps here - Backtrack to this paren + m_backtrackFromAfterParens = generator->label(); + + if (m_dataAfterLabelPtr.isSet()) + generator->m_expressionState.m_backtrackRecords.append(AlternativeBacktrackRecord(m_dataAfterLabelPtr, m_backtrackFromAfterParens)); + + m_pattBacktrackJumps.link(generator); + + if (m_term.quantityType == QuantifierGreedy) { + // If this is -1 we have now tested with both with and without the parens. + generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); + m_backtrack.jumpToBacktrack(generator, generator->branch32(Equal, indexTemporary, Imm32(-1))); + } else if (m_term.quantityType == QuantifierNonGreedy) { + // If this is -1 we have now tested with both with and without the parens. + generator->loadFromFrame(parenthesesFrameLocation, indexTemporary); + generator->branch32(Equal, indexTemporary, Imm32(-1)).linkTo(m_nonGreedyTryParentheses, generator); + } + + if (!m_doDirectBacktrack) + m_parenBacktrack.plantJumpToBacktrackIfExists(generator); + + // A failure WITHIN the parens jumps here + if (needJumpForPriorParenTail) + fromPriorBacktrack.link(generator); + m_parenBacktrack.linkAlternativeBacktracks(generator); + m_withinBacktrackJumps.link(generator); + + if (m_term.capture()) + generator->store32(Imm32(-1), Address(output, (m_term.parentheses.subpatternId << 1) * sizeof(int))); + + if (m_term.quantityType == QuantifierGreedy) { + generator->storeToFrame(Imm32(-1), parenthesesFrameLocation); + generator->jump().linkTo(m_fallThrough, generator); + nextBacktrackFallThrough = false; + } else if (!nextBacktrackFallThrough) + m_backtrack.jumpToBacktrack(generator); + + if (!m_doDirectBacktrack) + m_backtrack.setNextBacktrackLabel(m_backtrackFromAfterParens); + + return nextBacktrackFallThrough; + } + + PatternTerm& m_term; + int m_nestingLevel; + unsigned m_subParenIndex; + ParenthesesTail* m_nextOuterParenTail; + Label m_nonGreedyTryParentheses; + Label m_fallThrough; + Label m_backtrackToLabel; + Label m_backtrackFromAfterParens; + DataLabelPtr m_dataAfterLabelPtr; + JumpList m_pattBacktrackJumps; + JumpList m_withinBacktrackJumps; + BacktrackDestination m_parenBacktrack; + BacktrackDestination m_backtrack; + bool m_doDirectBacktrack; + }; + + void generateAssertionBOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (!term.inputPosition) + matchDest.append(branch32(Equal, index, Imm32(state.checkedTotal))); + + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(this); + + matchDest.link(this); + } else { + // Erk, really should poison out these alternatives early. :-/ + if (term.inputPosition) + state.jumpToBacktrack(this); + else + state.jumpToBacktrack(this, branch32(NotEqual, index, Imm32(state.checkedTotal))); + } + } + + void generateAssertionEOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (term.inputPosition == state.checkedTotal) + matchDest.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(this); + + matchDest.link(this); + } else { + if (term.inputPosition == state.checkedTotal) + state.jumpToBacktrack(this, notAtEndOfInput()); + // Erk, really should poison out these alternatives early. :-/ + else + state.jumpToBacktrack(this); + } + } + + // Also falls though on nextIsNotWordChar. + void matchAssertionWordchar(TermGenerationState& state, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + if (term.inputPosition == state.checkedTotal) + nextIsNotWordChar.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + } + + void generateAssertionWordBoundary(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + Jump atBegin; + JumpList matchDest; + if (!term.inputPosition) + atBegin = branch32(Equal, index, Imm32(state.checkedTotal)); + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + if (!term.inputPosition) + atBegin.link(this); + + // We fall through to here if the last character was not a wordchar. + JumpList nonWordCharThenWordChar; + JumpList nonWordCharThenNonWordChar; + if (term.invert()) { + matchAssertionWordchar(state, nonWordCharThenNonWordChar, nonWordCharThenWordChar); + nonWordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, nonWordCharThenWordChar, nonWordCharThenNonWordChar); + nonWordCharThenNonWordChar.append(jump()); + } + state.jumpToBacktrack(this, nonWordCharThenNonWordChar); + + // We jump here if the last character was a wordchar. + matchDest.link(this); + JumpList wordCharThenWordChar; + JumpList wordCharThenNonWordChar; + if (term.invert()) { + matchAssertionWordchar(state, wordCharThenNonWordChar, wordCharThenWordChar); + wordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, wordCharThenWordChar, wordCharThenNonWordChar); + // This can fall-though! + } + + state.jumpToBacktrack(this, wordCharThenWordChar); + + nonWordCharThenWordChar.link(this); + wordCharThenNonWordChar.link(this); + } + + void generatePatternCharacterSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch = state.term().patternCharacter; + + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(this, jumpIfCharNotEquals(ch, state.inputOffset())); + } + } + + void generatePatternCharacterPair(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch1 = state.term().patternCharacter; + UChar ch2 = state.lookaheadTerm().patternCharacter; + + int mask = 0; + int chPair = ch1 | (ch2 << 16); + + if (m_pattern.m_ignoreCase) { + if (isASCIIAlpha(ch1)) + mask |= 32; + if (isASCIIAlpha(ch2)) + mask |= 32 << 16; + } + + if (mask) { + load32WithUnalignedHalfWords(BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), character); + or32(Imm32(mask), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(chPair | mask))); + } else + state.jumpToBacktrack(this, branch32WithUnalignedHalfWords(NotEqual, BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), Imm32(chPair))); + } + + void generatePatternCharacterFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + or32(Imm32(32), character); + state.jumpToBacktrack(this, branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(this, branch16(NotEqual, BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), Imm32(ch))); + } + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generatePatternCharacterGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + failures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + failures.append(jumpIfCharNotEquals(ch, state.inputOffset())); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + if (term.quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generatePatternCharacterNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + if (term.quantityCount != quantifyInfinite) + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + branch32(NotEqual, character, Imm32(Unicode::toLower(ch))).linkTo(hardFail, this); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + jumpIfCharNotEquals(ch, state.inputOffset()).linkTo(hardFail, this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateCharacterClassSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + state.jumpToBacktrack(this, matchDest); + else { + state.jumpToBacktrack(this); + matchDest.link(this); + } + } + + void generateCharacterClassFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + JumpList matchDest; + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + state.jumpToBacktrack(this, matchDest); + else { + state.jumpToBacktrack(this); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generateCharacterClassGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + + if (term.invert()) { + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, failures, term.characterClass); + } else { + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + failures.append(jump()); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + if (term.quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(this, branchTest32(Zero, countRegister)); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateCharacterClassNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invert()) + matchDest.linkTo(hardFail, this); + else { + jump(hardFail); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackLabel(backtrackBegin); + } + + void generateParenthesesDisjunction(PatternTerm& parenthesesTerm, TermGenerationState& state, unsigned alternativeFrameLocation) + { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern) || (parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion)); + ASSERT(parenthesesTerm.quantityCount == 1); + + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + unsigned preCheckedCount = ((parenthesesTerm.quantityType == QuantifierFixedCount) && (parenthesesTerm.type != PatternTerm::TypeParentheticalAssertion)) ? disjunction->m_minimumSize : 0; + + if (disjunction->m_alternatives.size() == 1) { + state.resetAlternative(); + ASSERT(state.alternativeValid()); + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion) || (parenthesesTerm.quantityType != QuantifierFixedCount)); + + // FIXME: This is quite horrible. The call to 'plantJumpToBacktrackIfExists' + // will be forced to always trampoline into here, just to decrement the index. + // Ick. + Jump skip = jump(); + + Label backtrackBegin(this); + sub32(Imm32(countToCheck), index); + state.addBacktrackJump(jump()); + + skip.link(this); + + state.setBacktrackLabel(backtrackBegin); + + state.jumpToBacktrack(this, jumpIfNoAvailableInput(countToCheck)); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + state.checkedTotal -= countToCheck; + } else { + JumpList successes; + bool propogateBacktrack = false; + + for (state.resetAlternative(); state.alternativeValid(); state.nextAlternative()) { + + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + ASSERT(alternative->m_minimumSize >= preCheckedCount); + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + state.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // Matched an alternative. + DataLabelPtr dataLabel = storeToFrameWithPatch(alternativeFrameLocation); + + if (!state.isLastAlternative() || countToCheck) + successes.append(jump()); + + // Alternative did not match. + + // Do we have a backtrack destination? + // if so, link the data label to it. + state.linkDataLabelToBacktrackIfExists(this, dataLabel); + + if (!state.isLastAlternative() || countToCheck) + state.linkAlternativeBacktracks(this); + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + state.checkedTotal -= countToCheck; + } else if (state.isLastAlternative()) + propogateBacktrack = true; + } + // We fall through to here when the last alternative fails. + // Add a backtrack out of here for the parenthese handling code to link up. + if (!propogateBacktrack) + state.addBacktrackJump(jump()); + + // Save address on stack for the parens code to backtrack to, to retry the + // next alternative. + state.setBackTrackStackOffset(alternativeFrameLocation * sizeof(void*)); + + successes.link(this); + } + } + + void generateParenthesesSingle(TermGenerationState& state) + { + const RegisterID indexTemporary = regT0; + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + + unsigned preCheckedCount = (term.quantityType == QuantifierFixedCount) ? disjunction->m_minimumSize : 0; + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term.quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + + // optimized case - no capture & no quantifier can be handled in a light-weight manner. + if (!term.capture() && (term.quantityType == QuantifierFixedCount)) { + m_expressionState.incrementParenNestingLevel(); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // this expects that any backtracks back out of the parentheses will be in the + // parenthesesState's m_backTrackJumps vector, and that if they need backtracking + // they will have set an entry point on the parenthesesState's m_backtrackLabel. + BacktrackDestination& parenthesesBacktrack = parenthesesState.getBacktrackDestination(); + state.propagateBacktrackingFrom(this, parenthesesBacktrack); + state.getBacktrackDestination().copyBacktrackToLabel(parenthesesBacktrack); + + m_expressionState.decrementParenNestingLevel(); + } else { + Jump nonGreedySkipParentheses; + Label nonGreedyTryParentheses; + if (term.quantityType == QuantifierGreedy) + storeToFrame(index, parenthesesFrameLocation); + else if (term.quantityType == QuantifierNonGreedy) { + storeToFrame(Imm32(-1), parenthesesFrameLocation); + nonGreedySkipParentheses = jump(); + nonGreedyTryParentheses = label(); + storeToFrame(index, parenthesesFrameLocation); + } + + // store the match start index + if (term.capture()) { + int inputOffset = state.inputOffset() - preCheckedCount; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + store32(indexTemporary, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } else + store32(index, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } + + ParenthesesTail* parenthesesTail = m_expressionState.addParenthesesTail(term, state.getParenthesesTail()); + + m_expressionState.incrementParenNestingLevel(); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + + // Save the parenthesesTail for backtracking from nested parens to this one. + parenthesesState.setParenthesesTail(parenthesesTail); + + // generate the body of the parentheses + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + + // For non-fixed counts, backtrack if we didn't match anything. + if (term.quantityType != QuantifierFixedCount) + parenthesesTail->addAfterParenJump(branch32(Equal, index, Address(stackPointerRegister, (parenthesesFrameLocation * sizeof(void*))))); + + // store the match end index + if (term.capture()) { + int inputOffset = state.inputOffset(); + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(state.inputOffset()), indexTemporary); + store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } else + store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } + + m_expressionState.decrementParenNestingLevel(); + + parenthesesTail->processBacktracks(this, state, parenthesesState, nonGreedyTryParentheses, label()); + + parenthesesState.getBacktrackDestination().clear(); + + if (term.quantityType == QuantifierNonGreedy) + nonGreedySkipParentheses.link(this); + } + } + + void generateParenthesesGreedyNoBacktrack(TermGenerationState& state) + { + PatternTerm& parenthesesTerm = state.term(); + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern); + ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle. + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + + Label matchAgain(this); + + storeToFrame(index, parenthesesTerm.frameLocation); // Save the current index to check for zero len matches later. + + for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) { + + PatternAlternative* alternative = parenthesesState.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize; + if (countToCheck) { + parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + parenthesesState.checkedTotal += countToCheck; + } + + for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm()) + generateTerm(parenthesesState); + + // If we get here, we matched! If the index advanced then try to match more since limit isn't supported yet. + branch32(NotEqual, index, Address(stackPointerRegister, (parenthesesTerm.frameLocation * sizeof(void*))), matchAgain); + + // If we get here we matched, but we matched "" - cannot accept this alternative as is, so either backtrack, + // or fall through to try the next alternative if no backtrack is available. + parenthesesState.plantJumpToBacktrackIfExists(this); + + parenthesesState.linkAlternativeBacktracks(this); + + // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop. + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + parenthesesState.checkedTotal -= countToCheck; + } + } + + // If the last alternative falls through to here, we have a failed match... + // Which means that we match whatever we have matched up to this point (even if nothing). + } + + void generateParentheticalAssertion(TermGenerationState& state) + { + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + ASSERT(term.quantityType == QuantifierFixedCount); + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; + + int countCheckedAfterAssertion = state.checkedTotal - term.inputPosition; + + if (term.invert()) { + // Inverted case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Fail! + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(this); + + // And fail means success. + parenthesesState.linkAlternativeBacktracks(this); + + loadFromFrame(parenthesesFrameLocation, index); + + state.checkedTotal += countCheckedAfterAssertion; + } else { + // Normal case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Success! + loadFromFrame(parenthesesFrameLocation, index); + Jump success = jump(); + + parenthesesState.linkAlternativeBacktracks(this); + + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(this); + + success.link(this); + + state.checkedTotal += countCheckedAfterAssertion; + } + } + + void generateTerm(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + generateAssertionBOL(state); + break; + + case PatternTerm::TypeAssertionEOL: + generateAssertionEOL(state); + break; + + case PatternTerm::TypeAssertionWordBoundary: + generateAssertionWordBoundary(state); + break; + + case PatternTerm::TypePatternCharacter: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) { + if (state.isSinglePatternCharacterLookaheadTerm() && (state.lookaheadTerm().inputPosition == (term.inputPosition + 1))) { + generatePatternCharacterPair(state); + state.nextTerm(); + } else + generatePatternCharacterSingle(state); + } else + generatePatternCharacterFixed(state); + break; + case QuantifierGreedy: + generatePatternCharacterGreedy(state); + break; + case QuantifierNonGreedy: + generatePatternCharacterNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) + generateCharacterClassSingle(state); + else + generateCharacterClassFixed(state); + break; + case QuantifierGreedy: + generateCharacterClassGreedy(state); + break; + case QuantifierNonGreedy: + generateCharacterClassNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityCount == 1 && !term.parentheses.isCopy) + generateParenthesesSingle(state); + else if (term.parentheses.isTerminal) + generateParenthesesGreedyNoBacktrack(state); + else + m_shouldFallBack = true; + break; + + case PatternTerm::TypeParentheticalAssertion: + generateParentheticalAssertion(state); + break; + } + } + + void generateDisjunction(PatternDisjunction* disjunction) + { + TermGenerationState state(disjunction, 0); + state.resetAlternative(); + + // check availability for the next alternative + int countCheckedForCurrentAlternative = 0; + int countToCheckForFirstAlternative = 0; + bool hasShorterAlternatives = false; + bool setRepeatAlternativeLabels = false; + JumpList notEnoughInputForPreviousAlternative; + Label firstAlternative; + Label firstAlternativeInputChecked; + + // The label 'firstAlternative' is used to plant a check to see if there is + // sufficient input available to run the first repeating alternative. + // The label 'firstAlternativeInputChecked' will jump directly to matching + // the first repeating alternative having skipped this check. + + if (state.alternativeValid()) { + PatternAlternative* alternative = state.alternative(); + if (!alternative->onceThrough()) { + firstAlternative = Label(this); + setRepeatAlternativeLabels = true; + } + countToCheckForFirstAlternative = alternative->m_minimumSize; + state.checkedTotal += countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + } + + if (setRepeatAlternativeLabels) + firstAlternativeInputChecked = Label(this); + + while (state.alternativeValid()) { + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + // Track whether any alternatives are shorter than the first one. + if (!alternative->onceThrough()) + hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // If we get here, the alternative matched. + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + ASSERT(index != returnRegister); + if (m_pattern.m_body->m_hasFixedSize) { + move(index, returnRegister); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize), returnRegister); + + store32(returnRegister, output); + } else + load32(Address(output), returnRegister); + + store32(index, Address(output, 4)); + + generateReturn(); + + state.nextAlternative(); + if (alternative->onceThrough() && state.alternativeValid()) + state.clearBacktrack(); + + // if there are any more alternatives, plant the check for input before looping. + if (state.alternativeValid()) { + PatternAlternative* nextAlternative = state.alternative(); + if (!setRepeatAlternativeLabels && !nextAlternative->onceThrough()) { + // We have handled non-repeating alternatives, jump to next iteration + // and loop over repeating alternatives. + state.jumpToBacktrack(this); + + countToCheckForFirstAlternative = nextAlternative->m_minimumSize; + + // If we get here, there the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + state.linkAlternativeBacktracks(this); + + // Back up to start the looping alternatives. + if (countCheckedForCurrentAlternative) + sub32(Imm32(countCheckedForCurrentAlternative), index); + + firstAlternative = Label(this); + + state.checkedTotal = countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + + firstAlternativeInputChecked = Label(this); + + setRepeatAlternativeLabels = true; + } else { + int countToCheckForNextAlternative = nextAlternative->m_minimumSize; + + if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. + // If we get here, then the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + // Check if sufficent input available to run the next alternative + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + // We are now in the correct state to enter the next alternative; this add is only required + // to mirror and revert operation of the sub32, just below. + add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + + // If we get here, then the last input checked passed. + state.linkAlternativeBacktracks(this); + + // No need to check if we can run the next alternative, since it is shorter - + // just update index. + sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. + // If we get here, then the last input checked failed. + // If there is insufficient input to run the current alternative, and the next alternative is longer, + // then there is definitely not enough input to run it - don't even check. Just adjust index, as if + // we had checked. + notEnoughInputForPreviousAlternative.link(this); + add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); + notEnoughInputForPreviousAlternative.append(jump()); + + // The next alternative is longer than the current one; check the difference. + state.linkAlternativeBacktracks(this); + + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + } else { // CASE 3: Both alternatives are the same length. + ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); + + // If the next alterative is the same length as this one, then no need to check the input - + // if there was sufficent input to run the current alternative then there is sufficient + // input to run the next one; if not, there isn't. + state.linkAlternativeBacktracks(this); + } + state.checkedTotal -= countCheckedForCurrentAlternative; + countCheckedForCurrentAlternative = countToCheckForNextAlternative; + state.checkedTotal += countCheckedForCurrentAlternative; + } + } + } + + // If we get here, all Alternatives failed... + + state.checkedTotal -= countCheckedForCurrentAlternative; + + if (!setRepeatAlternativeLabels) { + // If there are no alternatives that need repeating (all are marked 'onceThrough') then just link + // the match failures to this point, and fall through to the return below. + state.linkAlternativeBacktracks(this, true); + + notEnoughInputForPreviousAlternative.link(this); + } else { + // How much more input need there be to be able to retry from the first alternative? + // examples: + // /yarr_jit/ or /wrec|pcre/ + // In these examples we need check for one more input before looping. + // /yarr_jit|pcre/ + // In this case we need check for 5 more input to loop (+4 to allow for the first alterative + // being four longer than the last alternative checked, and another +1 to effectively move + // the start position along by one). + // /yarr|rules/ or /wrec|notsomuch/ + // In these examples, provided that there was sufficient input to have just been matching for + // the second alternative we can loop without checking for available input (since the second + // alternative is longer than the first). In the latter example we need to decrement index + // (by 4) so the start position is only progressed by 1 from the last iteration. + int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; + + // First, deal with the cases where there was sufficient input to try the last alternative. + if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. + state.linkAlternativeBacktracks(this, true); + else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! + state.linkAlternativeBacktracksTo(this, firstAlternativeInputChecked, true); + else { // no need to check the input, but we do have some bookkeeping to do first. + state.linkAlternativeBacktracks(this, true); + + // Where necessary update our preserved start position. + if (!m_pattern.m_body->m_hasFixedSize) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } + + // Update index if necessary, and loop (without checking). + if (incrementForNextIter) + add32(Imm32(incrementForNextIter), index); + jump().linkTo(firstAlternativeInputChecked, this); + } + + notEnoughInputForPreviousAlternative.link(this); + // Update our idea of the start position, if we're tracking this. + if (!m_pattern.m_body->m_hasFixedSize) { + if (countCheckedForCurrentAlternative - 1) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + store32(regT0, Address(output)); + } else + store32(index, Address(output)); + } + + // Check if there is sufficent input to run the first alternative again. + jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); + // No - insufficent input to run the first alteranative, are there any other alternatives we + // might need to check? If so, the last check will have left the index incremented by + // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative + // LESS input is available, to have the effect of just progressing the start position by 1 + // from the last iteration. If this check passes we can just jump up to the check associated + // with the first alternative in the loop. This is a bit sad, since we'll end up trying the + // first alternative again, and this check will fail (otherwise the check planted just above + // here would have passed). This is a bit sad, however it saves trying to do something more + // complex here in compilation, and in the common case we should end up coallescing the checks. + // + // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least + // of the minimum-alternative-lengths. E.g. if I have two alternatives of length 200 and 150, + // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there + // is sufficient input to run either alternative (constantly failing). If there had been only + // one alternative, or if the shorter alternative had come first, we would have terminated + // immediately. :-/ + if (hasShorterAlternatives) + jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); + // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, + // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... + // but since we're about to return a failure this doesn't really matter!) + } + + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + move(Imm32(-1), returnRegister); + + generateReturn(); + + m_expressionState.emitParenthesesTail(this); + m_expressionState.emitIndirectJumpTable(this); + m_expressionState.linkToNextIteration(this); + } + + void generateEnter() + { +#if CPU(X86_64) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + push(X86Registers::ebx); +#elif CPU(X86) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + // TODO: do we need spill registers to fill the output pointer if there are no sub captures? + push(X86Registers::ebx); + push(X86Registers::edi); + push(X86Registers::esi); + // load output into edi (2 = saved ebp + return address). + #if COMPILER(MSVC) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); + loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); + loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); + loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); + #else + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); + #endif +#elif CPU(ARM) + push(ARMRegisters::r4); + push(ARMRegisters::r5); + push(ARMRegisters::r6); +#if CPU(ARM_TRADITIONAL) + push(ARMRegisters::r8); // scratch register +#endif + move(ARMRegisters::r3, output); +#elif CPU(MIPS) + // Do nothing. +#endif + } + + void generateReturn() + { +#if CPU(X86_64) + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(X86) + pop(X86Registers::esi); + pop(X86Registers::edi); + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(ARM) +#if CPU(ARM_TRADITIONAL) + pop(ARMRegisters::r8); // scratch register +#endif + pop(ARMRegisters::r6); + pop(ARMRegisters::r5); + pop(ARMRegisters::r4); +#elif CPU(MIPS) + // Do nothing +#endif + ret(); + } + +public: + YarrGenerator(YarrPattern& pattern) + : m_pattern(pattern) + , m_shouldFallBack(false) + { + } + + void generate() + { + generateEnter(); + + if (!m_pattern.m_body->m_hasFixedSize) + store32(index, Address(output)); + + if (m_pattern.m_body->m_callFrameSize) + subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + generateDisjunction(m_pattern.m_body); + } + + void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) + { + generate(); + + LinkBuffer patchBuffer(this, globalData->regexAllocator.poolForSize(size()), 0); + + for (unsigned i = 0; i < m_expressionState.m_backtrackRecords.size(); ++i) + patchBuffer.patch(m_expressionState.m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_expressionState.m_backtrackRecords[i].backtrackLocation)); + + jitObject.set(patchBuffer.finalizeCode()); + jitObject.setFallBack(m_shouldFallBack); + } + +private: + YarrPattern& m_pattern; + bool m_shouldFallBack; + GenerationState m_expressionState; +}; + +void jitCompile(YarrPattern& pattern, JSGlobalData* globalData, YarrCodeBlock& jitObject) +{ + YarrGenerator(pattern).compile(globalData, jitObject); +} + +int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) +{ + return jitObject.execute(input, start, length, output); +} + +}} + +#endif diff --git a/Source/JavaScriptCore/yarr/YarrJIT.h b/Source/JavaScriptCore/yarr/YarrJIT.h new file mode 100644 index 0000000..414b575 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrJIT.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrJIT_h +#define YarrJIT_h + +#if ENABLE(YARR_JIT) + +#include "MacroAssembler.h" +#include "UString.h" + +#if CPU(X86) && !COMPILER(MSVC) +#define YARR_CALL __attribute__ ((regparm (3))) +#else +#define YARR_CALL +#endif + +namespace JSC { + +class JSGlobalData; +class ExecutablePool; + +namespace Yarr { + +class YarrCodeBlock { + typedef int (*YarrJITCode)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + +public: + YarrCodeBlock() + : m_needFallBack(false) + { + } + + ~YarrCodeBlock() + { + } + + void setFallBack(bool fallback) { m_needFallBack = fallback; } + bool isFallBack() { return m_needFallBack; } + void set(MacroAssembler::CodeRef ref) { m_ref = ref; } + + int execute(const UChar* input, unsigned start, unsigned length, int* output) + { + return reinterpret_cast(m_ref.m_code.executableAddress())(input, start, length, output); + } + +#if ENABLE(REGEXP_TRACING) + void *getAddr() { return m_ref.m_code.executableAddress(); } +#endif + +private: + MacroAssembler::CodeRef m_ref; + bool m_needFallBack; +}; + +} } // namespace JSC::Yarr + +#endif + +#endif // YarrJIT_h diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h new file mode 100644 index 0000000..8c5741a --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrParser.h @@ -0,0 +1,871 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrParser_h +#define YarrParser_h + +#include "UString.h" +#include "Yarr.h" +#include +#include + +namespace JSC { namespace Yarr { + +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " + +enum BuiltInCharacterClassID { + DigitClassID, + SpaceClassID, + WordClassID, + NewlineClassID, +}; + +// The Parser class should not be used directly - only via the Yarr::parse() method. +template +class Parser { +private: + template + friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + NumberOfErrorCodes + }; + + /* + * CharacterClassParserDelegate: + * + * The class CharacterClassParserDelegate is used in the parsing of character + * classes. This class handles detection of character ranges. This class + * implements enough of the delegate interface such that it can be passed to + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused + * to perform the parsing of escape characters in character sets. + */ + class CharacterClassParserDelegate { + public: + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + : m_delegate(delegate) + , m_err(err) + , m_state(Empty) + { + } + + /* + * begin(): + * + * Called at beginning of construction. + */ + void begin(bool invert) + { + m_delegate.atomCharacterClassBegin(invert); + } + + /* + * atomPatternCharacter(): + * + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). + */ + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + { + switch (m_state) { + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: + m_character = ch; + m_state = CachedCharacter; + return; + + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; + else { + m_delegate.atomCharacterClassAtom(m_character); + m_character = ch; + } + return; + + case CachedCharacterHyphen: + if (ch < m_character) { + m_err = CharacterClassOutOfOrder; + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; + + // See coment in atomBuiltInCharacterClass below. + // This too is technically an error, per ECMA-262, and again we + // we chose to allow this. Note a subtlely here that while we + // diverge from the spec's definition of CharacterRange we do + // remain in compliance with the grammar. For example, consider + // the expression /[\d-a-z]/. We comply with the grammar in + // this case by not allowing a-z to be matched as a range. + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassAtom(ch); + m_state = Empty; + return; + } + } + + /* + * atomBuiltInCharacterClass(): + * + * Adds a built-in character class, called by parseEscape(). + */ + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + // If we hit either of these cases, we have an invalid range that + // looks something like /[x-\d]/ or /[\d-\d]/. + // According to ECMA-262 this should be a syntax error, but + // empirical testing shows this to break teh webz. Instead we + // comply with to the ECMA-262 grammar, and assume the grammar to + // have matched the range correctly, but tweak our interpretation + // of CharacterRange. Effectively we implicitly handle the hyphen + // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. + case CachedCharacterHyphen: + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + // fall through + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassBuiltIn(classID, invert); + m_state = Empty; + return; + } + } + + /* + * end(): + * + * Called at end of construction. + */ + void end() + { + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } + m_delegate.atomCharacterClassEnd(); + } + + // parseEscape() should never call these delegate methods when + // invoked with inCharacterClass set. + void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } + void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } + + private: + Delegate& m_delegate; + ErrorCode& m_err; + enum CharacterClassConstructionState { + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, + } m_state; + UChar m_character; + }; + + Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) + : m_delegate(delegate) + , m_backReferenceLimit(backReferenceLimit) + , m_err(NoError) + , m_data(pattern.characters()) + , m_size(pattern.length()) + , m_index(0) + , m_parenthesesNestingDepth(0) + { + } + + /* + * parseEscape(): + * + * Helper for parseTokens() AND parseCharacterClass(). + * Unlike the other parser methods, this function does not report tokens + * directly to the member delegate (m_delegate), instead tokens are + * emitted to the delegate provided as an argument. In the case of atom + * escapes, parseTokens() will call parseEscape() passing m_delegate as + * an argument, and as such the escape will be reported to the delegate. + * + * However this method may also be used by parseCharacterClass(), in which + * case a CharacterClassParserDelegate will be passed as the delegate that + * tokens should be added to. A boolean flag is also provided to indicate + * whether that an escape in a CharacterClass is being parsed (some parsing + * rules change in this context). + * + * The boolean value returned by this method indicates whether the token + * parsed was an atom (outside of a characted class \b and \B will be + * interpreted as assertions). + */ + template + bool parseEscape(EscapeDelegate& delegate) + { + ASSERT(!m_err); + ASSERT(peek() == '\\'); + consume(); + + if (atEndOfPattern()) { + m_err = EscapeUnterminated; + return false; + } + + switch (peek()) { + // Assertions + case 'b': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('\b'); + else { + delegate.assertionWordBoundary(false); + return false; + } + break; + case 'B': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('B'); + else { + delegate.assertionWordBoundary(true); + return false; + } + break; + + // CharacterClassEscape + case 'd': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, false); + break; + case 's': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, false); + break; + case 'w': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, false); + break; + case 'D': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, true); + break; + case 'S': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, true); + break; + case 'W': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, true); + break; + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. + // First, try to parse this as backreference. + if (!inCharacterClass) { + ParseState state = saveState(); + + unsigned backReference = consumeNumber(); + if (backReference <= m_backReferenceLimit) { + delegate.atomBackReference(backReference); + break; + } + + restoreState(state); + } + + // Not a backreference, and not octal. + if (peek() >= '8') { + delegate.atomPatternCharacter('\\'); + break; + } + + // Fall-through to handle this as an octal escape. + } + + // Octal escape + case '0': + delegate.atomPatternCharacter(consumeOctal()); + break; + + // ControlEscape + case 'f': + consume(); + delegate.atomPatternCharacter('\f'); + break; + case 'n': + consume(); + delegate.atomPatternCharacter('\n'); + break; + case 'r': + consume(); + delegate.atomPatternCharacter('\r'); + break; + case 't': + consume(); + delegate.atomPatternCharacter('\t'); + break; + case 'v': + consume(); + delegate.atomPatternCharacter('\v'); + break; + + // ControlLetter + case 'c': { + ParseState state = saveState(); + consume(); + if (!atEndOfPattern()) { + int control = consume(); + + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { + delegate.atomPatternCharacter(control & 0x1f); + break; + } + } + restoreState(state); + delegate.atomPatternCharacter('\\'); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = tryConsumeHex(2); + if (x == -1) + delegate.atomPatternCharacter('x'); + else + delegate.atomPatternCharacter(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int u = tryConsumeHex(4); + if (u == -1) + delegate.atomPatternCharacter('u'); + else + delegate.atomPatternCharacter(u); + break; + } + + // IdentityEscape + default: + delegate.atomPatternCharacter(consume()); + } + + return true; + } + + /* + * parseAtomEscape(), parseCharacterClassEscape(): + * + * These methods alias to parseEscape(). + */ + bool parseAtomEscape() + { + return parseEscape(m_delegate); + } + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) + { + parseEscape(delegate); + } + + /* + * parseCharacterClass(): + * + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) + * to an instance of CharacterClassParserDelegate, to describe the character class to the + * delegate. + */ + void parseCharacterClass() + { + ASSERT(!m_err); + ASSERT(peek() == '['); + consume(); + + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + + characterClassConstructor.begin(tryConsume('^')); + + while (!atEndOfPattern()) { + switch (peek()) { + case ']': + consume(); + characterClassConstructor.end(); + return; + + case '\\': + parseCharacterClassEscape(characterClassConstructor); + break; + + default: + characterClassConstructor.atomPatternCharacter(consume(), true); + } + + if (m_err) + return; + } + + m_err = CharacterClassUnmatched; + } + + /* + * parseParenthesesBegin(): + * + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. + */ + void parseParenthesesBegin() + { + ASSERT(!m_err); + ASSERT(peek() == '('); + consume(); + + if (tryConsume('?')) { + if (atEndOfPattern()) { + m_err = ParenthesesTypeInvalid; + return; + } + + switch (consume()) { + case ':': + m_delegate.atomParenthesesSubpatternBegin(false); + break; + + case '=': + m_delegate.atomParentheticalAssertionBegin(); + break; + + case '!': + m_delegate.atomParentheticalAssertionBegin(true); + break; + + default: + m_err = ParenthesesTypeInvalid; + } + } else + m_delegate.atomParenthesesSubpatternBegin(); + + ++m_parenthesesNestingDepth; + } + + /* + * parseParenthesesEnd(): + * + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). + */ + void parseParenthesesEnd() + { + ASSERT(!m_err); + ASSERT(peek() == ')'); + consume(); + + if (m_parenthesesNestingDepth > 0) + m_delegate.atomParenthesesEnd(); + else + m_err = ParenthesesUnmatched; + + --m_parenthesesNestingDepth; + } + + /* + * parseQuantifier(): + * + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. + */ + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) + { + ASSERT(!m_err); + ASSERT(min <= max); + + if (lastTokenWasAnAtom) + m_delegate.quantifyAtom(min, max, !tryConsume('?')); + else + m_err = QuantifierWithoutAtom; + } + + /* + * parseTokens(): + * + * This method loops over the input pattern reporting tokens to the delegate. + * The method returns when a parse error is detected, or the end of the pattern + * is reached. One piece of state is tracked around the loop, which is whether + * the last token passed to the delegate was an atom (this is necessary to detect + * a parse error when a quantifier provided without an atom to quantify). + */ + void parseTokens() + { + bool lastTokenWasAnAtom = false; + + while (!atEndOfPattern()) { + switch (peek()) { + case '|': + consume(); + m_delegate.disjunction(); + lastTokenWasAnAtom = false; + break; + + case '(': + parseParenthesesBegin(); + lastTokenWasAnAtom = false; + break; + + case ')': + parseParenthesesEnd(); + lastTokenWasAnAtom = true; + break; + + case '^': + consume(); + m_delegate.assertionBOL(); + lastTokenWasAnAtom = false; + break; + + case '$': + consume(); + m_delegate.assertionEOL(); + lastTokenWasAnAtom = false; + break; + + case '.': + consume(); + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + lastTokenWasAnAtom = true; + break; + + case '[': + parseCharacterClass(); + lastTokenWasAnAtom = true; + break; + + case '\\': + lastTokenWasAnAtom = parseAtomEscape(); + break; + + case '*': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '+': + consume(); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '?': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, 1); + lastTokenWasAnAtom = false; + break; + + case '{': { + ParseState state = saveState(); + + consume(); + if (peekIsDigit()) { + unsigned min = consumeNumber(); + unsigned max = min; + + if (tryConsume(',')) + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; + + if (tryConsume('}')) { + if (min <= max) + parseQuantifier(lastTokenWasAnAtom, min, max); + else + m_err = QuantifierOutOfOrder; + lastTokenWasAnAtom = false; + break; + } + } + + restoreState(state); + } // if we did not find a complete quantifer, fall through to the default case. + + default: + m_delegate.atomPatternCharacter(consume()); + lastTokenWasAnAtom = true; + } + + if (m_err) + return; + } + + if (m_parenthesesNestingDepth > 0) + m_err = MissingParentheses; + } + + /* + * parse(): + * + * This method calls parseTokens() to parse over the input and converts any + * error code to a const char* for a result. + */ + const char* parse() + { + if (m_size > MAX_PATTERN_SIZE) + m_err = PatternTooLarge; + else + parseTokens(); + ASSERT(atEndOfPattern() || m_err); + + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + 0, // NoError + REGEXP_ERROR_PREFIX "regular expression too large", + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", + REGEXP_ERROR_PREFIX "nothing to repeat", + REGEXP_ERROR_PREFIX "missing )", + REGEXP_ERROR_PREFIX "unmatched parentheses", + REGEXP_ERROR_PREFIX "unrecognized character after (?", + REGEXP_ERROR_PREFIX "missing terminating ] for character class", + REGEXP_ERROR_PREFIX "range out of order in character class", + REGEXP_ERROR_PREFIX "\\ at end of pattern" + }; + + return errorMessages[m_err]; + } + + + // Misc helper functions: + + typedef unsigned ParseState; + + ParseState saveState() + { + return m_index; + } + + void restoreState(ParseState state) + { + m_index = state; + } + + bool atEndOfPattern() + { + ASSERT(m_index <= m_size); + return m_index == m_size; + } + + int peek() + { + ASSERT(m_index < m_size); + return m_data[m_index]; + } + + bool peekIsDigit() + { + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + int consume() + { + ASSERT(m_index < m_size); + return m_data[m_index++]; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + unsigned n = consumeDigit(); + // check for overflow. + for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { + n = newValue; + consume(); + } + return n; + } + + unsigned consumeOctal() + { + ASSERT(WTF::isASCIIOctalDigit(peek())); + + unsigned n = consumeDigit(); + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + consumeDigit(); + return n; + } + + bool tryConsume(UChar ch) + { + if (atEndOfPattern() || (m_data[m_index] != ch)) + return false; + ++m_index; + return true; + } + + int tryConsumeHex(int count) + { + ParseState state = saveState(); + + int n = 0; + while (count--) { + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { + restoreState(state); + return -1; + } + n = (n << 4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + Delegate& m_delegate; + unsigned m_backReferenceLimit; + ErrorCode m_err; + const UChar* m_data; + unsigned m_size; + unsigned m_index; + unsigned m_parenthesesNestingDepth; + + // Derived by empirical testing of compile time in PCRE and WREC. + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; +}; + +/* + * Yarr::parse(): + * + * The parse method is passed a pattern to be parsed and a delegate upon which + * callbacks will be made to record the parsed tokens forming the regex. + * Yarr::parse() returns null on success, or a const C string providing an error + * message where a parse error occurs. + * + * The Delegate must implement the following interface: + * + * void assertionBOL(); + * void assertionEOL(); + * void assertionWordBoundary(bool invert); + * + * void atomPatternCharacter(UChar ch); + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); + * void atomCharacterClassBegin(bool invert) + * void atomCharacterClassAtom(UChar ch) + * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + * void atomCharacterClassEnd() + * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParentheticalAssertionBegin(bool invert = false); + * void atomParenthesesEnd(); + * void atomBackReference(unsigned subpatternId); + * + * void quantifyAtom(unsigned min, unsigned max, bool greedy); + * + * void disjunction(); + * + * The regular expression is described by a sequence of assertion*() and atom*() + * callbacks to the delegate, describing the terms in the regular expression. + * Following an atom a quantifyAtom() call may occur to indicate that the previous + * atom should be quantified. In the case of atoms described across multiple + * calls (parentheses and character classes) the call to quantifyAtom() will come + * after the call to the atom*End() method, never after atom*Begin(). + * + * Character classes may either be described by a single call to + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. + * In the latter case, ...Begin() will be called, followed by a sequence of + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). + * + * Sequences of atoms and assertions are broken into alternatives via calls to + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular + * capturing subpattern, this will be the subpatternId associated with these + * parentheses, and will also by definition be the lowest subpatternId of these + * parentheses and of any nested paretheses. The atomParenthesesEnd() method + * is passed the subpatternId of the last capturing subexpression nested within + * these paretheses. In the case of a capturing subpattern with no nested + * capturing subpatterns, the same subpatternId will be passed to the begin and + * end functions. In the case of non-capturing subpatterns the subpatternId + * passed to the begin method is also the first possible subpatternId that might + * be nested within these paretheses. If a set of non-capturing parentheses does + * not contain any capturing subpatterns, then the subpatternId passed to begin + * will be greater than the subpatternId passed to end. + */ + +template +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) +{ + return Parser(delegate, pattern, backReferenceLimit).parse(); +} + +} } // namespace JSC::Yarr + +#endif // YarrParser_h diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp new file mode 100644 index 0000000..112b65d --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp @@ -0,0 +1,985 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrPattern.h" + +#include "Yarr.h" +#include "YarrParser.h" +#include + +using namespace WTF; + +namespace JSC { namespace Yarr { + +#include "RegExpJitTables.h" + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { + addSorted(m_matchesUnicode, upper); + addSorted(m_matchesUnicode, lower); + } else + addSorted(m_matchesUnicode, ch); + } + } + + // returns true if this character has another case, and 'ch' is the upper case form. + static inline bool isUnicodeUpper(UChar ch) + { + return ch != Unicode::toLower(ch); + } + + // returns true if this character has another case, and 'ch' is the lower case form. + static inline bool isUnicodeLower(UChar ch) + { + return ch != Unicode::toUpper(ch); + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + uint32_t unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + while (unicodeCurr <= hi) { + // If the upper bound of the range (hi) is 0xffff, the increments to + // unicodeCurr in this loop may take it to 0x10000. This is fine + // (if so we won't re-enter the loop, since the loop condition above + // will definitely fail) - but this does mean we cannot use a UChar + // to represent unicodeCurr, we must use a 32-bit value instead. + ASSERT(unicodeCurr <= 0xffff); + + if (isUnicodeUpper(unicodeCurr)) { + UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); + UChar lowerCaseRangeEnd = lowerCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) + lowerCaseRangeEnd++; + addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); + } else if (isUnicodeLower(unicodeCurr)) { + UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); + UChar upperCaseRangeEnd = upperCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) + upperCaseRangeEnd++; + addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); + } else + ++unicodeCurr; + } + } + } + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(0); + + characterClass->m_matches.append(m_matches); + characterClass->m_ranges.append(m_ranges); + characterClass->m_matchesUnicode.append(m_matchesUnicode); + characterClass->m_rangesUnicode.append(m_rangesUnicode); + + reset(); + + return characterClass; + } + +private: + void addSorted(Vector& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector m_matches; + Vector m_ranges; + Vector m_matchesUnicode; + Vector m_rangesUnicode; +}; + +struct BeginCharHelper { + BeginCharHelper(Vector* beginChars, bool isCaseInsensitive = false) + : m_beginChars(beginChars) + , m_isCaseInsensitive(isCaseInsensitive) + {} + + void addBeginChar(BeginChar beginChar, Vector* hotTerms, QuantifierType quantityType, unsigned quantityCount) + { + if (quantityType == QuantifierFixedCount && quantityCount > 1) { + // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ + beginChar.value |= beginChar.value << 16; + beginChar.mask |= beginChar.mask << 16; + addCharacter(beginChar); + } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) + // In case of characters with fixed quantifier we should check the next character as well. + linkHotTerms(beginChar, hotTerms); + else + // In case of greedy matching the next character checking is unnecessary therefore we just store + // the first character. + addCharacter(beginChar); + } + + // Merge two following BeginChars in the vector to reduce the number of character checks. + void merge(unsigned size) + { + for (unsigned i = 0; i < size; i++) { + BeginChar* curr = &m_beginChars->at(i); + BeginChar* next = &m_beginChars->at(i + 1); + + // If the current and the next size of value is different we should skip the merge process + // because the 16bit and 32bit values are unmergable. + if (curr->value <= 0xFFFF && next->value > 0xFFFF) + continue; + + unsigned diff = curr->value ^ next->value; + + curr->mask |= diff; + curr->value |= curr->mask; + + m_beginChars->remove(i + 1); + size--; + } + } + +private: + void addCharacter(BeginChar beginChar) + { + unsigned pos = 0; + unsigned range = m_beginChars->size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = m_beginChars->at(pos+index).value - beginChar.value; + if (!val) + return; + if (val < 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == m_beginChars->size()) + m_beginChars->append(beginChar); + else + m_beginChars->insert(pos, beginChar); + } + + // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. + void linkHotTerms(BeginChar beginChar, Vector* hotTerms) + { + for (unsigned i = 0; i < hotTerms->size(); i++) { + PatternTerm hotTerm = hotTerms->at(i).term; + ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); + + UChar characterNext = hotTerm.patternCharacter; + + // Append a character to an existing BeginChar object. + if (characterNext <= 0x7f) { + unsigned mask = 0; + + if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { + mask = 32; + characterNext = toASCIILower(characterNext); + } + + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { + addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); + addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); + } else + addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); + } + } + } + + Vector* m_beginChars; + bool m_isCaseInsensitive; +}; + +class YarrPatternConstructor { +public: + YarrPatternConstructor(YarrPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + ~YarrPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + void assertionBOL() + { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { + atomCharacterClassBegin(); + atomCharacterClassAtom(ch); + atomCharacterClassEnd(); + } else + m_alternative->m_terms.append(PatternTerm(ch)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; + m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + // Bubble up BOL flags + for (unsigned i = 0; i < numParenAlternatives; i++) { + if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } + + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) + { + PatternDisjunction* newDisjunction = 0; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } + } + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + // For any assertion with a zero minimum, not matching is valid and has no effect, + // remove it. Otherwise, we need to match as least once, but there is no point + // matching more than once, so remove the quantifier. It is not entirely clear + // from the spec whether or not this behavior is correct, but I believe this + // matches Firefox. :-/ + if (term.type == PatternTerm::TypeParentheticalAssertion) { + if (!min) + m_alternative->removeLastTerm(); + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + unsigned currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition; + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition; + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + term.inputPosition = currentInputPosition; + } else { + term.inputPosition = currentInputPosition; + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); + break; + } + } + + alternative->m_minimumSize = currentInputPosition - initialInputPosition; + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * where the parens are at the end of the regular expression (last term in any of the + // alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector& alternatives = m_pattern.m_body->m_alternatives; + for (size_t i = 0; i < alternatives.size(); ++i) { + Vector& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == quantifyInfinite + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool addBeginTerm(PatternTerm term, Vector* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth) + { + if (term.quantityType == QuantifierFixedCount) { + beginTerms->append(TermChain(term)); + if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) + setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1); + } else if (termIndex != numTerms - 1) { + beginTerms->append(TermChain(term)); + return true; + } + + return false; + } + + // This function collects the terms which are potentially matching the first number of depth characters in the result. + // If this function returns false then it found at least one term which makes the beginning character + // look-up optimization inefficient. + bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector* beginTerms, unsigned depth) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) + return false; + } + + return true; + } + + bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector* beginTerms, unsigned termIndex, unsigned depth) + { + bool checkNext = true; + unsigned numTerms = alternative->m_terms.size(); + + while (checkNext && termIndex < numTerms) { + PatternTerm term = alternative->m_terms[termIndex]; + checkNext = false; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + return false; + + case PatternTerm::TypeBackReference: + case PatternTerm::TypeForwardReference: + return false; + + case PatternTerm::TypePatternCharacter: + if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) { + termIndex++; + checkNext = true; + } + break; + + case PatternTerm::TypeCharacterClass: + return false; + + case PatternTerm::TypeParentheticalAssertion: + if (term.invert()) + return false; + + case PatternTerm::TypeParenthesesSubpattern: + if (term.quantityType != QuantifierFixedCount) { + if (termIndex == numTerms - 1) + break; + + termIndex++; + checkNext = true; + + } + + if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) + return false; + + break; + } + } + + return true; + } + + void setupBeginChars() + { + Vector beginTerms; + bool containsFixedCharacter = false; + + if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) + && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { + unsigned size = beginTerms.size(); + + // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. + if (!size) + return; + + m_pattern.m_containsBeginChars = true; + + for (unsigned i = 0; i < size; i++) { + PatternTerm term = beginTerms[i].term; + + // We have just collected PatternCharacter terms, other terms are not allowed. + ASSERT(term.type == PatternTerm::TypePatternCharacter); + + if (term.quantityType == QuantifierFixedCount) + containsFixedCharacter = true; + + UChar character = term.patternCharacter; + unsigned mask = 0; + + if (character <= 0x7f) { + if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { + mask = 32; + character = toASCIILower(character); + } + + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else { + UChar upper, lower; + if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { + m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } else + m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); + } + } + + // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. + if (!containsFixedCharacter) { + m_pattern.m_containsBeginChars = false; + return; + } + + size = m_pattern.m_beginChars.size(); + + if (size > 2) + m_beginCharHelper.merge(size - 1); + else if (size <= 1) + m_pattern.m_containsBeginChars = false; + } + } + +private: + YarrPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + BeginCharHelper m_beginCharHelper; + bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; +}; + +const char* YarrPattern::compile(const UString& patternString) +{ + YarrPatternConstructor constructor(*this); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (containsIllegalBackReference()) { + unsigned numSubpatterns = m_numSubpatterns; + + constructor.reset(); +#if !ASSERT_DISABLED + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == m_numSubpatterns); + } + + constructor.checkForTerminalParentheses(); + constructor.optimizeBOL(); + + constructor.setupOffsets(); + constructor.setupBeginChars(); + + return 0; +} + +YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_containsBackreferences(false) + , m_containsBeginChars(false) + , m_containsBOL(false) + , m_numSubpatterns(0) + , m_maxBackReference(0) + , newlineCached(0) + , digitsCached(0) + , spacesCached(0) + , wordcharCached(0) + , nondigitsCached(0) + , nonspacesCached(0) + , nonwordcharCached(0) +{ + *error = compile(pattern); +} + +} } diff --git a/Source/JavaScriptCore/yarr/YarrPattern.h b/Source/JavaScriptCore/yarr/YarrPattern.h new file mode 100644 index 0000000..2172dda --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrPattern.h @@ -0,0 +1,417 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrPattern_h +#define YarrPattern_h + +#include +#include + +#include + +namespace JSC { namespace Yarr { + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClassTable : RefCounted { + const char* m_table; + bool m_inverted; + static PassRefPtr create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + +struct CharacterClass : FastAllocBase { + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr table) + : m_table(table) + { + } + Vector m_matches; + Vector m_ranges; + Vector m_matchesUnicode; + Vector m_rangesUnicode; + RefPtr m_table; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + } type; + bool m_capture :1; + bool m_invert :1; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned backReferenceSubpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + bool isTerminal; + } parentheses; + }; + QuantifierType quantityType; + unsigned quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + , m_capture(false) + , m_invert(false) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + parentheses.isTerminal = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , m_capture(false) + , m_invert(false) + { + backReferenceSubpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative : FastAllocBase { + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } + + Vector m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; +}; + +struct PatternDisjunction : FastAllocBase { + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + , m_hasFixedSize(false) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood YarrPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector hotTerms; +}; + +struct BeginChar { + BeginChar() + : value(0) + , mask(0) + {} + + BeginChar(unsigned value, unsigned mask) + : value(value) + , mask(mask) + {} + + unsigned value; + unsigned mask; +}; + +struct YarrPattern { + YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); + + ~YarrPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + m_containsBackreferences = false; + m_containsBeginChars = false; + m_containsBOL = false; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + m_beginChars.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBeginChars : 1; + bool m_containsBOL : 1; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector m_disjunctions; + Vector m_userCharacterClasses; + Vector m_beginChars; + +private: + const char* compile(const UString& patternString); + + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif // YarrPattern_h diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp new file mode 100644 index 0000000..51fda94 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrSyntaxChecker.h" + +#include "YarrParser.h" + +namespace JSC { namespace Yarr { + +class SyntaxChecker { +public: + void assertionBOL() {} + void assertionEOL() {} + void assertionWordBoundary(bool) {} + void atomPatternCharacter(UChar) {} + void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} + void atomCharacterClassBegin(bool = false) {} + void atomCharacterClassAtom(UChar) {} + void atomCharacterClassRange(UChar, UChar) {} + void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {} + void atomCharacterClassEnd() {} + void atomParenthesesSubpatternBegin(bool = true) {} + void atomParentheticalAssertionBegin(bool = false) {} + void atomParenthesesEnd() {} + void atomBackReference(unsigned) {} + void quantifyAtom(unsigned, unsigned, bool) {} + void disjunction() {} +}; + +const char* checkSyntax(const UString& pattern) +{ + SyntaxChecker syntaxChecker; + return parse(syntaxChecker, pattern); +} + +}} // JSC::YARR diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h new file mode 100644 index 0000000..e48cb9e --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrSyntaxChecker_h +#define YarrSyntaxChecker_h + +#include + +namespace JSC { namespace Yarr { + +const char* checkSyntax(const UString& pattern); + +}} // JSC::YARR + +#endif // YarrSyntaxChecker_h + -- cgit v1.1