diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 18:28:41 -0800 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 18:28:41 -0800 |
commit | 648161bb0edfc3d43db63caed5cc5213bc6cb78f (patch) | |
tree | 4b825dc642cb6eb9a060e54bf8d69288fbee4904 /JavaScriptCore/wrec | |
parent | a65af38181ac7d34544586bdb5cd004de93897ad (diff) | |
download | external_webkit-648161bb0edfc3d43db63caed5cc5213bc6cb78f.zip external_webkit-648161bb0edfc3d43db63caed5cc5213bc6cb78f.tar.gz external_webkit-648161bb0edfc3d43db63caed5cc5213bc6cb78f.tar.bz2 |
auto import from //depot/cupcake/@135843
Diffstat (limited to 'JavaScriptCore/wrec')
-rw-r--r-- | JavaScriptCore/wrec/CharacterClassConstructor.cpp | 360 | ||||
-rw-r--r-- | JavaScriptCore/wrec/CharacterClassConstructor.h | 121 | ||||
-rw-r--r-- | JavaScriptCore/wrec/WREC.cpp | 1324 | ||||
-rw-r--r-- | JavaScriptCore/wrec/WREC.h | 258 |
4 files changed, 0 insertions, 2063 deletions
diff --git a/JavaScriptCore/wrec/CharacterClassConstructor.cpp b/JavaScriptCore/wrec/CharacterClassConstructor.cpp deleted file mode 100644 index 85eccaa..0000000 --- a/JavaScriptCore/wrec/CharacterClassConstructor.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (C) 2008 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "CharacterClassConstructor.h" - -#if ENABLE(WREC) - -#include "pcre_internal.h" -#include <wtf/ASCIICType.h> - -using namespace WTF; - -namespace JSC { - -static const UChar asciiNewlines[2] = { '\n', '\r' }; -static const UChar unicodeNewlines[2] = { 0x2028, 0x2029 }; -CharacterClass& getCharacterClassNewline() { - static CharacterClass charClass = { - asciiNewlines, 2, - 0, 0, - unicodeNewlines, 2, - 0, 0, - }; - - return charClass; -} - -static const CharacterClassRange asciiDigitsRange[1] = { { '0', '9' } }; -CharacterClass& getCharacterClassDigits() { - static CharacterClass charClass = { - 0, 0, - asciiDigitsRange, 1, - 0, 0, - 0, 0, - }; - - return charClass; -} - -static const UChar asciiSpaces[1] = { ' ' }; -static const CharacterClassRange asciiSpacesRange[1] = { { '\t', '\r' } }; -static const UChar unicodeSpaces[8] = { 0x00a0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 }; -static const CharacterClassRange unicodeSpacesRange[1] = { { 0x2000, 0x200a } }; -CharacterClass& getCharacterClassSpaces() { - static CharacterClass charClass = { - asciiSpaces, 1, - asciiSpacesRange, 1, - unicodeSpaces, 8, - unicodeSpacesRange, 1, - }; - - return charClass; -} - -static const UChar asciiWordchar[1] = { '_' }; -static const CharacterClassRange asciiWordcharRange[3] = { { '0', '9' }, { 'A', 'Z' }, { 'a', 'z' } }; -CharacterClass& getCharacterClassWordchar() { - static CharacterClass charClass = { - asciiWordchar, 1, - asciiWordcharRange, 3, - 0, 0, - 0, 0, - }; - - return charClass; -} - -static const CharacterClassRange asciiNondigitsRange[2] = { { 0, '0' - 1 }, { '9' + 1, 0x7f } }; -static const CharacterClassRange unicodeNondigitsRange[1] = { { 0x0080, 0xffff } }; -CharacterClass& getCharacterClassNondigits() { - static CharacterClass charClass = { - 0, 0, - asciiNondigitsRange, 2, - 0, 0, - unicodeNondigitsRange, 1, - }; - - return charClass; -} - -static const CharacterClassRange asciiNonspacesRange[3] = { { 0, '\t' - 1 }, { '\r' + 1, ' ' - 1 }, { ' ' + 1, 0x7f } }; -static const CharacterClassRange unicodeNonspacesRange[9] = { - { 0x0080, 0x009f }, - { 0x00a1, 0x167f }, - { 0x1681, 0x180d }, - { 0x180f, 0x1fff }, - { 0x200b, 0x2027 }, - { 0x202a, 0x202e }, - { 0x2030, 0x205e }, - { 0x2060, 0x2fff }, - { 0x3001, 0xffff } -}; -CharacterClass& getCharacterClassNonspaces() { - static CharacterClass charClass = { - 0, 0, - asciiNonspacesRange, 3, - 0, 0, - unicodeNonspacesRange, 9, - }; - - return charClass; -} - -static const UChar asciiNonwordchar[1] = { '`' }; -static const CharacterClassRange asciiNonwordcharRange[4] = { { 0, '0' - 1 }, { '9' + 1, 'A' - 1 }, { 'Z' + 1, '_' - 1 }, { 'z' + 1, 0x7f } }; -static const CharacterClassRange unicodeNonwordcharRange[1] = { { 0x0080, 0xffff } }; -CharacterClass& getCharacterClassNonwordchar() { - static CharacterClass charClass = { - asciiNonwordchar, 1, - asciiNonwordcharRange, 4, - 0, 0, - unicodeNonwordcharRange, 1, - }; - - return charClass; -} - -void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch) -{ - unsigned pos = 0; - unsigned range = matches.size(); - - // binary chop, find position to insert char. - while (range) { - unsigned index = range >> 1; - - int val = matches[pos+index] - ch; - if (!val) - return; - else if (val > 0) - range = index; - else { - pos += (index+1); - range -= (index+1); - } - } - - if (pos == matches.size()) - matches.append(ch); - else - matches.insert(pos, ch); -} - -void CharacterClassConstructor::addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi) -{ - unsigned end = ranges.size(); - - // Simple linear scan - I doubt there are that many ranges anyway... - // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { - // does the new range fall before the current position in the array - if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. - if (hi == (ranges[i].begin - 1)) { - ranges[i].begin = lo; - return; - } - CharacterClassRange r = {lo, hi}; - ranges.insert(i, r); - return; - } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining - // If the new range start at or before the end of the last range, then the overlap (if it starts one after the - // end of the last range they concatenate, which is just as good. - if (lo <= (ranges[i].end + 1)) { - // found an intersect! we'll replace this entry in the array. - ranges[i].begin = std::min(ranges[i].begin, lo); - ranges[i].end = std::max(ranges[i].end, hi); - - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - - return; - } - } - - // Range comes after all existing ranges. - CharacterClassRange r = {lo, hi}; - ranges.append(r); -} - -void CharacterClassConstructor::put(UChar ch) -{ - // Parsing a regular expression like [a-z], we start in an initial empty state: - // ((m_charBuffer == -1) && !m_isPendingDash) - // When buffer the 'a' sice it may be (and is in this case) part of a range: - // ((m_charBuffer != -1) && !m_isPendingDash) - // Having parsed the hyphen we then record that the dash is also pending: - // ((m_charBuffer != -1) && m_isPendingDash) - // The next change will always take us back to the initial state - either because - // a complete range has been parsed (such as [a-z]), or because a flush is forced, - // due to an early end in the regexp ([a-]), or a character class escape being added - // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. - ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); - - if (m_charBuffer != -1) { - if (m_isPendingDash) { - // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. - UChar lo = m_charBuffer; - UChar hi = ch; - // Reset back to the inital state. - m_charBuffer = -1; - m_isPendingDash = false; - - // This is an error, detected lazily. Do not proceed. - if (lo > hi) { - m_isUpsideDown = true; - return; - } - - if (lo <= 0x7f) { - char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); - addSortedRange(m_ranges, lo, asciiHi); - - if (m_isCaseInsensitive) { - if ((asciiLo <= 'Z') && (asciiHi >= 'A')) - addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); - if ((asciiLo <= 'z') && (asciiHi >= 'a')) - addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); - } - } - if (hi >= 0x80) { - UChar unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - // we're going to scan along, updating the start of the range - while (unicodeCurr <= hi) { - // Spin forwards over any characters that don't have two cases. - for (; kjs_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { - // if this was the last character in the range, we're done. - if (unicodeCurr == hi) - return; - } - // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. - UChar rangeStart = unicodeCurr; - UChar otherCurr = kjs_pcre_ucp_othercase(unicodeCurr); - - // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, - // and if it's other case value is one greater then the othercase value for the current last - // character included in the range, we can include next into the range. - while ((unicodeCurr < hi) && (kjs_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { - // increment unicodeCurr; it points to the end of the range. - // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. - ++unicodeCurr; - ++otherCurr; - } - - // otherChar is the last in the range of other case chars, calculate offset to get back to the start. - addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); - - // unicodeCurr has been added, move on to the next char. - ++unicodeCurr; - } - } - } - } else if (ch == '-') - // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. - m_isPendingDash = true; - else { - // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. - flush(); - m_charBuffer = ch; - } - } else - // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character - // (the hyphen not treated as a special character in this case, same handling for any char). - m_charBuffer = ch; -} - -// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. -// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate -// array (either ascii or unicode). -// If the pattern is case insensitive we add entries for both cases. -void CharacterClassConstructor::flush() -{ - if (m_charBuffer != -1) { - if (m_charBuffer <= 0x7f) { - if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) - addSorted(m_matches, toASCIIUpper(m_charBuffer)); - addSorted(m_matches, m_charBuffer); - if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) - addSorted(m_matches, toASCIILower(m_charBuffer)); - } else { - addSorted(m_matchesUnicode, m_charBuffer); - if (m_isCaseInsensitive) { - int other = kjs_pcre_ucp_othercase(m_charBuffer); - if (other != -1) - addSorted(m_matchesUnicode, other); - } - } - m_charBuffer = -1; - } - - if (m_isPendingDash) { - addSorted(m_matches, '-'); - m_isPendingDash = false; - } -} - -void CharacterClassConstructor::append(CharacterClass& other) -{ - // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). - // Need to check the spec, really, but think this matches PCRE behaviour. - flush(); - - if (other.numMatches) { - for (size_t i = 0; i < other.numMatches; ++i) - addSorted(m_matches, other.matches[i]); - } - if (other.numRanges) { - for (size_t i = 0; i < other.numRanges; ++i) - addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); - } - if (other.numMatchesUnicode) { - for (size_t i = 0; i < other.numMatchesUnicode; ++i) - addSorted(m_matchesUnicode, other.matchesUnicode[i]); - } - if (other.numRangesUnicode) { - for (size_t i = 0; i < other.numRangesUnicode; ++i) - addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); - } -} - -} - -#endif // ENABLE(WREC) diff --git a/JavaScriptCore/wrec/CharacterClassConstructor.h b/JavaScriptCore/wrec/CharacterClassConstructor.h deleted file mode 100644 index 5accdae..0000000 --- a/JavaScriptCore/wrec/CharacterClassConstructor.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2008 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef CharacterClassConstructor_h -#define CharacterClassConstructor_h - -#if ENABLE(WREC) - -#include "ustring.h" - -namespace JSC { - - struct CharacterClassRange { - UChar begin; - UChar end; - }; - - struct CharacterClass { - const UChar* matches; - unsigned numMatches; - - const CharacterClassRange* ranges; - unsigned numRanges; - - const UChar* matchesUnicode; - unsigned numMatchesUnicode; - - const CharacterClassRange* rangesUnicode; - unsigned numRangesUnicode; - }; - - CharacterClass& getCharacterClassNewline(); - CharacterClass& getCharacterClassDigits(); - CharacterClass& getCharacterClassSpaces(); - CharacterClass& getCharacterClassWordchar(); - CharacterClass& getCharacterClassNondigits(); - CharacterClass& getCharacterClassNonspaces(); - CharacterClass& getCharacterClassNonwordchar(); - - class CharacterClassConstructor { - public: - CharacterClassConstructor(bool isCaseInsensitive) - : m_charBuffer(-1) - , m_isPendingDash(false) - , m_isCaseInsensitive(isCaseInsensitive) - , m_isUpsideDown(false) - { - } - - void flush(); - - // We need to flush prior to an escaped hyphen to prevent it as being treated as indicating - // a range, e.g. [a\-c] we flush prior to adding the hyphen so that this is not treated as - // [a-c]. However, we do not want to flush if we have already seen a non escaped hyphen - - // e.g. [+-\-] should be treated the same as [+--], producing a range that will also match - // a comma. - void flushBeforeEscapedHyphen() - { - if (!m_isPendingDash) - flush(); - } - - void put(UChar ch); - void append(CharacterClass& other); - - bool isUpsideDown() { return m_isUpsideDown; } - - ALWAYS_INLINE CharacterClass charClass() - { - CharacterClass newCharClass = { - m_matches.begin(), m_matches.size(), - m_ranges.begin(), m_ranges.size(), - m_matchesUnicode.begin(), m_matchesUnicode.size(), - m_rangesUnicode.begin(), m_rangesUnicode.size(), - }; - - return newCharClass; - } - - private: - void addSorted(Vector<UChar>& matches, UChar ch); - void addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi); - - int m_charBuffer; - bool m_isPendingDash; - bool m_isCaseInsensitive; - bool m_isUpsideDown; - - Vector<UChar> m_matches; - Vector<CharacterClassRange> m_ranges; - Vector<UChar> m_matchesUnicode; - Vector<CharacterClassRange> m_rangesUnicode; - }; - -} - -#endif // ENABLE(WREC) - -#endif // CharacterClassConstructor_h diff --git a/JavaScriptCore/wrec/WREC.cpp b/JavaScriptCore/wrec/WREC.cpp deleted file mode 100644 index 6d64a2c..0000000 --- a/JavaScriptCore/wrec/WREC.cpp +++ /dev/null @@ -1,1324 +0,0 @@ -/* - * Copyright (C) 2008 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "WREC.h" - -#if ENABLE(WREC) - -#include "CharacterClassConstructor.h" -#include "Machine.h" -#include "pcre_internal.h" - -using namespace WTF; - -namespace JSC { - -class GenerateAtomFunctor { -public: - virtual ~GenerateAtomFunctor() {} - - virtual void generateAtom(WRECGenerator*, JmpSrcVector&) = 0; - virtual void backtrack(WRECGenerator*) = 0; -}; - -class GeneratePatternCharacterFunctor : public GenerateAtomFunctor { -public: - GeneratePatternCharacterFunctor(const UChar ch) - : m_ch(ch) - { - } - - virtual void generateAtom(WRECGenerator*, JmpSrcVector&); - virtual void backtrack(WRECGenerator*); - -private: - const UChar m_ch; -}; - -class GenerateCharacterClassFunctor : public GenerateAtomFunctor { -public: - GenerateCharacterClassFunctor(CharacterClass* charClass, bool invert) - : m_charClass(charClass) - , m_invert(invert) - { - } - - virtual void generateAtom(WRECGenerator*, JmpSrcVector&); - virtual void backtrack(WRECGenerator*); - -private: - CharacterClass* m_charClass; - bool m_invert; -}; - -class GenerateBackreferenceFunctor : public GenerateAtomFunctor { -public: - GenerateBackreferenceFunctor(unsigned subpatternId) - : m_subpatternId(subpatternId) - { - } - - virtual void generateAtom(WRECGenerator*, JmpSrcVector&); - virtual void backtrack(WRECGenerator*); - -private: - unsigned m_subpatternId; -}; - -class GenerateParenthesesNonGreedyFunctor : public GenerateAtomFunctor { -public: - GenerateParenthesesNonGreedyFunctor(WRECGenerator::JmpDst start, WRECGenerator::JmpSrc success, WRECGenerator::JmpSrc fail) - : m_start(start) - , m_success(success) - , m_fail(fail) - { - } - - virtual void generateAtom(WRECGenerator*, JmpSrcVector&); - virtual void backtrack(WRECGenerator*); - -private: - WRECGenerator::JmpDst m_start; - WRECGenerator::JmpSrc m_success; - WRECGenerator::JmpSrc m_fail; -}; - -void GeneratePatternCharacterFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) -{ - wrec->generatePatternCharacter(failures, m_ch); -} -void GeneratePatternCharacterFunctor::backtrack(WRECGenerator* wrec) -{ - wrec->generateBacktrack1(); -} - -void GenerateCharacterClassFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) -{ - wrec->generateCharacterClass(failures, *m_charClass, m_invert); -} -void GenerateCharacterClassFunctor::backtrack(WRECGenerator* wrec) -{ - wrec->generateBacktrack1(); -} - -void GenerateBackreferenceFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) -{ - wrec->generateBackreference(failures, m_subpatternId); -} -void GenerateBackreferenceFunctor::backtrack(WRECGenerator* wrec) -{ - wrec->generateBacktrackBackreference(m_subpatternId); -} - -void GenerateParenthesesNonGreedyFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) -{ - wrec->generateParenthesesNonGreedy(failures, m_start, m_success, m_fail); -} - -void GenerateParenthesesNonGreedyFunctor::backtrack(WRECGenerator*) -{ - // FIXME: do something about this. - CRASH(); -} - -void WRECGenerator::generateBacktrack1() -{ - m_jit.subl_i8r(1, currentPositionRegister); -} - -void WRECGenerator::generateBacktrackBackreference(unsigned subpatternId) -{ - m_jit.subl_mr((2 * subpatternId + 1) * sizeof(int), outputRegister, currentPositionRegister); - m_jit.addl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentPositionRegister); -} - -void WRECGenerator::generateBackreferenceQuantifier(JmpSrcVector& failures, Quantifier::Type quantifierType, unsigned subpatternId, unsigned min, unsigned max) -{ - GenerateBackreferenceFunctor functor(subpatternId); - - m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentValueRegister); - m_jit.cmpl_rm(currentValueRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); - JmpSrc skipIfEmpty = m_jit.emitUnlinkedJe(); - - ASSERT(quantifierType == Quantifier::Greedy || quantifierType == Quantifier::NonGreedy); - if (quantifierType == Quantifier::Greedy) - generateGreedyQuantifier(failures, functor, min, max); - else - generateNonGreedyQuantifier(failures, functor, min, max); - - m_jit.link(skipIfEmpty, m_jit.label()); -} - -void WRECGenerator::generateNonGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) -{ - // comment me better! - JmpSrcVector newFailures; - - // (0) Setup: - // init quantifierCountRegister - m_jit.pushl_r(quantifierCountRegister); - m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); - JmpSrc gotoStart = m_jit.emitUnlinkedJmp(); - - // (4) Failure case - - JmpDst quantifierFailed = m_jit.label(); - // (4.1) Restore original value of quantifierCountRegister from the stack - m_jit.popl_r(quantifierCountRegister); - failures.append(m_jit.emitUnlinkedJmp()); - - // (3) We just tried an alternative, and it failed - check we can try more. - - JmpDst alternativeFailed = m_jit.label(); - // (3.1) remove the value pushed prior to testing the alternative - m_jit.popl_r(currentPositionRegister); - // (3.2) if there is a limit, and we have reached it, game over. - if (max != Quantifier::noMaxSpecified) { - m_jit.cmpl_i32r(max, quantifierCountRegister); - m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); - } - - // (1) Do a check for the atom - - // (1.0) This is where we start, if there is a minimum (then we must read at least one of the atom). - JmpDst testQuantifiedAtom = m_jit.label(); - if (min) - m_jit.link(gotoStart, testQuantifiedAtom); - // (1.1) Do a check for the atom check. - functor.generateAtom(this, newFailures); - // (1.2) If we get here, successful match! - m_jit.addl_i8r(1, quantifierCountRegister); - // (1.3) We needed to read the atom, and we failed - that's terminally bad news. - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], quantifierFailed); - newFailures.clear(); - // (1.4) If there is a minimum, check we have read enough ... - if (min) { - // ... except if min was 1 no need to keep checking! - if (min != 1) { - m_jit.cmpl_i32r(min, quantifierCountRegister); - m_jit.link(m_jit.emitUnlinkedJl(), testQuantifiedAtom); - } - } else - m_jit.link(gotoStart, m_jit.label()); - // if there was no minimum, this is where we start. - - // (2) Plant an alternative check for the remainder of the expression - - // (2.1) recursively call to parseAlternative, if it falls through, success! - m_jit.pushl_r(currentPositionRegister); - m_parser.parseAlternative(newFailures); - m_jit.addl_i8r(4, X86::esp); - m_jit.popl_r(quantifierCountRegister); - // (2.2) link failure cases to jump back up to alternativeFailed. - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], alternativeFailed); - newFailures.clear(); -} - -void WRECGenerator::generateGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) -{ - if (!max) - return; - - // comment me better! - JmpSrcVector newFailures; - - // (0) Setup: - // init quantifierCountRegister - m_jit.pushl_r(quantifierCountRegister); - m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); - - // (1) Greedily read as many of the atom as possible - - JmpDst readMore = m_jit.label(); - - // (1.1) Do a character class check. - functor.generateAtom(this, newFailures); - // (1.2) If we get here, successful match! - m_jit.addl_i8r(1, quantifierCountRegister); - // (1.3) loop, unless we've read max limit. - if (max != Quantifier::noMaxSpecified) { - if (max != 1) { - // if there is a limit, only loop while less than limit, otherwise fall throught to... - m_jit.cmpl_i32r(max, quantifierCountRegister); - m_jit.link(m_jit.emitUnlinkedJne(), readMore); - } - // ...if there is no min we need jump to the alternative test, if there is we can just fall through to it. - if (!min) - newFailures.append(m_jit.emitUnlinkedJmp()); - } else - m_jit.link(m_jit.emitUnlinkedJmp(), readMore); - // (1.4) check enough matches to bother trying an alternative... - if (min) { - // We will fall through to here if (min && max), after the max check. - // First, also link a - JmpDst minimumTest = m_jit.label(); - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], minimumTest); - newFailures.clear(); - // - m_jit.cmpl_i32r(min, quantifierCountRegister); - newFailures.append(m_jit.emitUnlinkedJae()); - } - - // (4) Failure case - - JmpDst quantifierFailed = m_jit.label(); - // (4.1) Restore original value of quantifierCountRegister from the stack - m_jit.popl_r(quantifierCountRegister); - failures.append(m_jit.emitUnlinkedJmp()); - - // (3) Backtrack - - JmpDst backtrack = m_jit.label(); - // (3.1) this was preserved prior to executing the alternative - m_jit.popl_r(currentPositionRegister); - // (3.2) check we can retry with fewer matches - backtracking fails if already at the minimum - m_jit.cmpl_i32r(min, quantifierCountRegister); - m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); - // (3.3) roll off one match, and retry. - functor.backtrack(this); - m_jit.subl_i8r(1, quantifierCountRegister); - - // (2) Try an alternative. - - // (2.1) point to retry - JmpDst tryAlternative = m_jit.label(); - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], tryAlternative); - newFailures.clear(); - // (2.2) recursively call to parseAlternative, if it falls through, success! - m_jit.pushl_r(currentPositionRegister); - m_parser.parseAlternative(newFailures); - m_jit.addl_i8r(4, X86::esp); - m_jit.popl_r(quantifierCountRegister); - // (2.3) link failure cases to here. - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], backtrack); - newFailures.clear(); -} - -void WRECGenerator::generatePatternCharacter(JmpSrcVector& failures, int ch) -{ - // check there is more input, read a char. - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - failures.append(m_jit.emitUnlinkedJe()); - m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); - - // used for unicode case insensitive - bool hasUpper = false; - JmpSrc isUpper; - - // if case insensitive match - if (m_parser.m_ignoreCase) { - UChar lower, upper; - - // check for ascii case sensitive characters - if (isASCIIAlpha(ch)) { - m_jit.orl_i32r(32, currentValueRegister); - ch |= 32; - } else if ((ch > 0x7f) && ((lower = Unicode::toLower(ch)) != (upper = Unicode::toUpper(ch)))) { - // handle unicode case sentitive characters - branch to success on upper - m_jit.cmpl_i32r(upper, currentValueRegister); - isUpper = m_jit.emitUnlinkedJe(); - hasUpper = true; - ch = lower; - } - } - - // checks for ch, or lower case version of ch, if insensitive - m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); - failures.append(m_jit.emitUnlinkedJne()); - - if (m_parser.m_ignoreCase && hasUpper) { - // for unicode case insensitive matches, branch here if upper matches. - m_jit.link(isUpper, m_jit.label()); - } - - // on success consume the char - m_jit.addl_i8r(1, currentPositionRegister); -} - -void WRECGenerator::generateCharacterClassInvertedRange(JmpSrcVector& failures, JmpSrcVector& matchDest, const CharacterClassRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) -{ - do { - // pick which range we're going to generate - int which = count >> 1; - char lo = ranges[which].begin; - char hi = ranges[which].end; - - m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); - - // check if there are any ranges or matches below lo. If not, just jl to failure - - // if there is anything else to check, check that first, if it falls through jmp to failure. - if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { - JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); - - // generate code for all ranges before this one - if (which) - generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); - - do { - m_jit.cmpl_i32r((unsigned short)matches[*matchIndex], currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJe()); - ++*matchIndex; - } while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)); - failures.append(m_jit.emitUnlinkedJmp()); - - m_jit.link(loOrAbove, m_jit.label()); - } else if (which) { - JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); - - generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); - failures.append(m_jit.emitUnlinkedJmp()); - - m_jit.link(loOrAbove, m_jit.label()); - } else - failures.append(m_jit.emitUnlinkedJl()); - - while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) - ++*matchIndex; - - m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJle()); - // fall through to here, the value is above hi. - - // shuffle along & loop around if there are any more matches to handle. - unsigned next = which + 1; - ranges += next; - count -= next; - } while (count); -} - -void WRECGenerator::generateCharacterClassInverted(JmpSrcVector& matchDest, CharacterClass& charClass) -{ - JmpSrc unicodeFail; - if (charClass.numMatchesUnicode || charClass.numRangesUnicode) { - m_jit.cmpl_i8r(0x7f, currentValueRegister); - JmpSrc isAscii = m_jit.emitUnlinkedJle(); - - if (charClass.numMatchesUnicode) { - for (unsigned i = 0; i < charClass.numMatchesUnicode; ++i) { - UChar ch = charClass.matchesUnicode[i]; - m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJe()); - } - } - - if (charClass.numRangesUnicode) { - for (unsigned i = 0; i < charClass.numRangesUnicode; ++i) { - UChar lo = charClass.rangesUnicode[i].begin; - UChar hi = charClass.rangesUnicode[i].end; - - m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); - JmpSrc below = m_jit.emitUnlinkedJl(); - m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJle()); - m_jit.link(below, m_jit.label()); - } - } - - unicodeFail = m_jit.emitUnlinkedJmp(); - m_jit.link(isAscii, m_jit.label()); - } - - if (charClass.numRanges) { - unsigned matchIndex = 0; - JmpSrcVector failures; - generateCharacterClassInvertedRange(failures, matchDest, charClass.ranges, charClass.numRanges, &matchIndex, charClass.matches, charClass.numMatches); - while (matchIndex < charClass.numMatches) { - m_jit.cmpl_i32r((unsigned short)charClass.matches[matchIndex], currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJe()); - ++matchIndex; - } - JmpDst noMatch = m_jit.label(); - for (unsigned i = 0; i < failures.size(); ++i) - m_jit.link(failures[i], noMatch); - failures.clear(); - } else if (charClass.numMatches) { - // optimization: gather 'a','A' etc back together, can mask & test once. - Vector<char> matchesAZaz; - - for (unsigned i = 0; i < charClass.numMatches; ++i) { - char ch = charClass.matches[i]; - if (m_parser.m_ignoreCase) { - if (isASCIILower(ch)) { - matchesAZaz.append(ch); - continue; - } - if (isASCIIUpper(ch)) - continue; - } - m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJe()); - } - - if (unsigned countAZaz = matchesAZaz.size()) { - m_jit.orl_i32r(32, currentValueRegister); - - for (unsigned i = 0; i < countAZaz; ++i) { - char ch = matchesAZaz[i]; - m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); - matchDest.append(m_jit.emitUnlinkedJe()); - } - } - } - - if (charClass.numMatchesUnicode || charClass.numRangesUnicode) - m_jit.link(unicodeFail, m_jit.label()); -} - -void WRECGenerator::generateCharacterClass(JmpSrcVector& failures, CharacterClass& charClass, bool invert) -{ - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - failures.append(m_jit.emitUnlinkedJe()); - m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); - - if (invert) - generateCharacterClassInverted(failures, charClass); - else { - JmpSrcVector successes; - generateCharacterClassInverted(successes, charClass); - failures.append(m_jit.emitUnlinkedJmp()); - JmpDst here = m_jit.label(); - for (unsigned i = 0; i < successes.size(); ++i) - m_jit.link(successes[i], here); - } - - m_jit.addl_i8r(1, currentPositionRegister); -} - -WRECGenerator::JmpSrc WRECGenerator::generateParentheses(ParenthesesType type) -{ - unsigned subpatternId = (type == capturing) ? ++m_parser.m_numSubpatterns : m_parser.m_numSubpatterns; - - // push pos, both to preserve for fail + reloaded in parseDisjunction - m_jit.pushl_r(currentPositionRegister); - - // Plant a disjunction, wrapped to invert behaviour - - JmpSrcVector newFailures; - m_parser.parseDisjunction(newFailures); - - if (type == capturing) { - m_jit.popl_r(currentValueRegister); - m_jit.movl_rm(currentValueRegister, (2 * subpatternId) * sizeof(int), outputRegister); - m_jit.movl_rm(currentPositionRegister, (2 * subpatternId + 1) * sizeof(int), outputRegister); - } else if (type == non_capturing) - m_jit.addl_i8r(4, X86::esp); - else - m_jit.popl_r(currentPositionRegister); - - // This is a little lame - jump to jump if there is a nested disjunction. - // (suggestion to fix: make parseDisjunction populate a JmpSrcVector of - // disjunct successes... this is probably not worth the compile cost in - // the common case to fix). - JmpSrc successfulMatch = m_jit.emitUnlinkedJmp(); - - JmpDst originalFailure = m_jit.label(); - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], originalFailure); - newFailures.clear(); - // fail: restore currentPositionRegister - m_jit.popl_r(currentPositionRegister); - - JmpSrc jumpToFail; - // If this was an inverted assert, fail = success! - just let the failure case drop through, - // success case goes to failures. Both paths restore curr pos. - if (type == inverted_assertion) - jumpToFail = successfulMatch; - else { - // plant a jump so any fail will link off to 'failures', - jumpToFail = m_jit.emitUnlinkedJmp(); - // link successes to jump here - m_jit.link(successfulMatch, m_jit.label()); - } - return jumpToFail; -} - -void WRECGenerator::generateParenthesesNonGreedy(JmpSrcVector& failures, JmpDst start, JmpSrc success, JmpSrc fail) -{ - m_jit.link(m_jit.emitUnlinkedJmp(), start); - m_jit.link(success, m_jit.label()); - - failures.append(fail); -} - -WRECGenerator::JmpSrc WRECGenerator::generateParenthesesResetTrampoline(JmpSrcVector& newFailures, unsigned subpatternIdBefore, unsigned subpatternIdAfter) -{ - JmpSrc skip = m_jit.emitUnlinkedJmp(); - - JmpDst subPatternResetTrampoline = m_jit.label(); - for (unsigned i = 0; i < newFailures.size(); ++i) - m_jit.link(newFailures[i], subPatternResetTrampoline); - newFailures.clear(); - for (unsigned i = subpatternIdBefore + 1; i <= subpatternIdAfter; ++i) { - m_jit.movl_i32m(-1, (2 * i) * sizeof(int), outputRegister); - m_jit.movl_i32m(-1, (2 * i + 1) * sizeof(int), outputRegister); - } - - JmpSrc newFailJump = m_jit.emitUnlinkedJmp(); - m_jit.link(skip, m_jit.label()); - - return newFailJump; -} - -void WRECGenerator::generateAssertionBOL(JmpSrcVector& failures) -{ - if (m_parser.m_multiline) { - JmpSrcVector previousIsNewline; - - // begin of input == success - m_jit.cmpl_i8r(0, currentPositionRegister); - previousIsNewline.append(m_jit.emitUnlinkedJe()); - - // now check prev char against newline characters. - m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); - generateCharacterClassInverted(previousIsNewline, getCharacterClassNewline()); - - failures.append(m_jit.emitUnlinkedJmp()); - - JmpDst success = m_jit.label(); - for (unsigned i = 0; i < previousIsNewline.size(); ++i) - m_jit.link(previousIsNewline[i], success); - previousIsNewline.clear(); - } else { - m_jit.cmpl_i8r(0, currentPositionRegister); - failures.append(m_jit.emitUnlinkedJne()); - } -} - -void WRECGenerator::generateAssertionEOL(JmpSrcVector& failures) -{ - if (m_parser.m_multiline) { - JmpSrcVector nextIsNewline; - - // end of input == success - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - nextIsNewline.append(m_jit.emitUnlinkedJe()); - - // now check next char against newline characters. - m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); - generateCharacterClassInverted(nextIsNewline, getCharacterClassNewline()); - - failures.append(m_jit.emitUnlinkedJmp()); - - JmpDst success = m_jit.label(); - for (unsigned i = 0; i < nextIsNewline.size(); ++i) - m_jit.link(nextIsNewline[i], success); - nextIsNewline.clear(); - } else { - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - failures.append(m_jit.emitUnlinkedJne()); - } -} - -void WRECGenerator::generateAssertionWordBoundary(JmpSrcVector& failures, bool invert) -{ - JmpSrcVector wordBoundary; - JmpSrcVector notWordBoundary; - - // (1) Check if the previous value was a word char - - // (1.1) check for begin of input - m_jit.cmpl_i8r(0, currentPositionRegister); - JmpSrc atBegin = m_jit.emitUnlinkedJe(); - // (1.2) load the last char, and chck if is word character - m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); - JmpSrcVector previousIsWord; - generateCharacterClassInverted(previousIsWord, getCharacterClassWordchar()); - // (1.3) if we get here, previous is not a word char - m_jit.link(atBegin, m_jit.label()); - - // (2) Handle situation where previous was NOT a \w - - // (2.1) check for end of input - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - notWordBoundary.append(m_jit.emitUnlinkedJe()); - // (2.2) load the next char, and chck if is word character - m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); - generateCharacterClassInverted(wordBoundary, getCharacterClassWordchar()); - // (2.3) If we get here, neither chars are word chars - notWordBoundary.append(m_jit.emitUnlinkedJmp()); - - // (3) Handle situation where previous was a \w - - // (3.0) link success in first match to here - JmpDst section3 = m_jit.label(); - for (unsigned i = 0; i < previousIsWord.size(); ++i) - m_jit.link(previousIsWord[i], section3); - previousIsWord.clear(); - // (3.1) check for end of input - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - wordBoundary.append(m_jit.emitUnlinkedJe()); - // (3.2) load the next char, and chck if is word character - m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); - generateCharacterClassInverted(notWordBoundary, getCharacterClassWordchar()); - // (3.3) If we get here, this is an end of a word, within the input. - - // (4) Link everything up - - if (invert) { - // handle the fall through case - wordBoundary.append(m_jit.emitUnlinkedJmp()); - - // looking for non word boundaries, so link boundary fails to here. - JmpDst success = m_jit.label(); - for (unsigned i = 0; i < notWordBoundary.size(); ++i) - m_jit.link(notWordBoundary[i], success); - notWordBoundary.clear(); - - failures.append(wordBoundary.begin(), wordBoundary.size()); - } else { - // looking for word boundaries, so link successes here. - JmpDst success = m_jit.label(); - for (unsigned i = 0; i < wordBoundary.size(); ++i) - m_jit.link(wordBoundary[i], success); - wordBoundary.clear(); - - failures.append(notWordBoundary.begin(), notWordBoundary.size()); - } -} - -void WRECGenerator::generateBackreference(JmpSrcVector& failures, unsigned subpatternId) -{ - m_jit.pushl_r(currentPositionRegister); - m_jit.pushl_r(quantifierCountRegister); - - // get the start pos of the backref into quantifierCountRegister (multipurpose!) - m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, quantifierCountRegister); - - JmpSrc skipIncrement = m_jit.emitUnlinkedJmp(); - JmpDst topOfLoop = m_jit.label(); - m_jit.addl_i8r(1, currentPositionRegister); - m_jit.addl_i8r(1, quantifierCountRegister); - m_jit.link(skipIncrement, m_jit.label()); - - // check if we're at the end of backref (if we are, success!) - m_jit.cmpl_rm(quantifierCountRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); - JmpSrc endOfBackRef = m_jit.emitUnlinkedJe(); - - m_jit.movzwl_mr(inputRegister, quantifierCountRegister, 2, currentValueRegister); - - // check if we've run out of input (this would be a can o'fail) - m_jit.cmpl_rr(lengthRegister, currentPositionRegister); - JmpSrc endOfInput = m_jit.emitUnlinkedJe(); - - m_jit.cmpw_rm(currentValueRegister, inputRegister, currentPositionRegister, 2); - m_jit.link(m_jit.emitUnlinkedJe(), topOfLoop); - - m_jit.link(endOfInput, m_jit.label()); - // Failure - m_jit.popl_r(quantifierCountRegister); - m_jit.popl_r(currentPositionRegister); - failures.append(m_jit.emitUnlinkedJmp()); - - // Success - m_jit.link(endOfBackRef, m_jit.label()); - m_jit.popl_r(quantifierCountRegister); - m_jit.addl_i8r(4, X86::esp); -} - -void WRECGenerator::generateDisjunction(JmpSrcVector& successes, JmpSrcVector& failures) -{ - successes.append(m_jit.emitUnlinkedJmp()); - - JmpDst here = m_jit.label(); - - for (unsigned i = 0; i < failures.size(); ++i) - m_jit.link(failures[i], here); - failures.clear(); - - m_jit.movl_mr(X86::esp, currentPositionRegister); -} - -void WRECGenerator::terminateDisjunction(JmpSrcVector& successes) -{ - JmpDst here = m_jit.label(); - for (unsigned i = 0; i < successes.size(); ++i) - m_jit.link(successes[i], here); - successes.clear(); -} - -ALWAYS_INLINE Quantifier WRECParser::parseGreedyQuantifier() -{ - switch (peek()) { - case '?': - consume(); - return Quantifier(Quantifier::Greedy, 0, 1); - - case '*': - consume(); - return Quantifier(Quantifier::Greedy, 0); - - case '+': - consume(); - return Quantifier(Quantifier::Greedy, 1); - - case '{': { - consume(); - // a numeric quantifier should always have a lower bound - if (!peekIsDigit()) { - m_err = Error_malformedQuantifier; - return Quantifier(Quantifier::Error); - } - int min = consumeNumber(); - - // this should either be a , or a } - switch (peek()) { - case '}': - // {n} - exactly n times. (technically I think a '?' is valid in the bnf - bit meaningless). - consume(); - return Quantifier(Quantifier::Greedy, min, min); - - case ',': - consume(); - switch (peek()) { - case '}': - // {n,} - n to inf times. - consume(); - return Quantifier(Quantifier::Greedy, min); - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // {n,m} - n to m times. - int max = consumeNumber(); - - if (peek() != '}') { - m_err = Error_malformedQuantifier; - return Quantifier(Quantifier::Error); - } - consume(); - - return Quantifier(Quantifier::Greedy, min, max); - } - - default: - m_err = Error_malformedQuantifier; - return Quantifier(Quantifier::Error); - } - - default: - m_err = Error_malformedQuantifier; - return Quantifier(Quantifier::Error); - } - } - // None of the above - no quantifier - default: - return Quantifier(); - } -} - -Quantifier WRECParser::parseQuantifier() -{ - Quantifier q = parseGreedyQuantifier(); - - if ((q.type == Quantifier::Greedy) && (peek() == '?')) { - consume(); - q.type = Quantifier::NonGreedy; - } - - return q; -} - -bool WRECParser::parsePatternCharacterQualifier(JmpSrcVector& failures, int ch) -{ - Quantifier q = parseQuantifier(); - - switch (q.type) { - case Quantifier::None: { - m_generator.generatePatternCharacter(failures, ch); - break; - } - - case Quantifier::Greedy: { - GeneratePatternCharacterFunctor functor(ch); - m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); - break; - } - - case Quantifier::NonGreedy: { - GeneratePatternCharacterFunctor functor(ch); - m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); - break; - } - - case Quantifier::Error: - return false; - } - - return true; -} - -bool WRECParser::parseCharacterClassQuantifier(JmpSrcVector& failures, CharacterClass& charClass, bool invert) -{ - Quantifier q = parseQuantifier(); - - switch (q.type) { - case Quantifier::None: { - m_generator.generateCharacterClass(failures, charClass, invert); - break; - } - - case Quantifier::Greedy: { - GenerateCharacterClassFunctor functor(&charClass, invert); - m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); - break; - } - - case Quantifier::NonGreedy: { - GenerateCharacterClassFunctor functor(&charClass, invert); - m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); - break; - } - - case Quantifier::Error: - return false; - } - - return true; -} - -bool WRECParser::parseBackreferenceQuantifier(JmpSrcVector& failures, unsigned subpatternId) -{ - Quantifier q = parseQuantifier(); - - switch (q.type) { - case Quantifier::None: { - m_generator.generateBackreference(failures, subpatternId); - break; - } - - case Quantifier::Greedy: - case Quantifier::NonGreedy: - m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); - return true; - - case Quantifier::Error: - return false; - } - - return true; -} - -bool WRECParser::parseParentheses(JmpSrcVector&) -{ - // FIXME: We don't currently backtrack correctly within parentheses in cases such as - // "c".match(/(.*)c/) so we fall back to PCRE for any regexp containing parentheses. - - m_err = TempError_unsupportedParentheses; - return false; -} - -bool WRECParser::parseCharacterClass(JmpSrcVector& failures) -{ - bool invert = false; - if (peek() == '^') { - consume(); - invert = true; - } - - CharacterClassConstructor charClassConstructor(m_ignoreCase); - - UChar ch; - while ((ch = peek()) != ']') { - switch (ch) { - case EndOfPattern: - m_err = Error_malformedCharacterClass; - return false; - - case '\\': - consume(); - switch (ch = peek()) { - case EndOfPattern: - m_err = Error_malformedEscape; - return false; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - charClassConstructor.put(consumeOctal()); - break; - - // ControlEscape - case 'b': - consume(); - charClassConstructor.put('\b'); - break; - case 'f': - consume(); - charClassConstructor.put('\f'); - break; - case 'n': - consume(); - charClassConstructor.put('\n'); - break; - case 'r': - consume(); - charClassConstructor.put('\r'); - break; - case 't': - consume(); - charClassConstructor.put('\t'); - break; - case 'v': - consume(); - charClassConstructor.put('\v'); - break; - - // ControlLetter - case 'c': { - consume(); - int control = consume(); - if (!isASCIIAlpha(control)) { - m_err = Error_malformedEscape; - return false; - } - charClassConstructor.put(control&31); - break; - } - - // HexEscape - case 'x': { - consume(); - int x = consumeHex(2); - if (x == -1) { - m_err = Error_malformedEscape; - return false; - } - charClassConstructor.put(x); - break; - } - - // UnicodeEscape - case 'u': { - consume(); - int x = consumeHex(4); - if (x == -1) { - m_err = Error_malformedEscape; - return false; - } - charClassConstructor.put(x); - break; - } - - // CharacterClassEscape - case 'd': - consume(); - charClassConstructor.append(getCharacterClassDigits()); - break; - case 's': - consume(); - charClassConstructor.append(getCharacterClassSpaces()); - break; - case 'w': - consume(); - charClassConstructor.append(getCharacterClassWordchar()); - break; - case 'D': - consume(); - charClassConstructor.append(getCharacterClassNondigits()); - break; - case 'S': - consume(); - charClassConstructor.append(getCharacterClassNonspaces()); - break; - case 'W': - consume(); - charClassConstructor.append(getCharacterClassNonwordchar()); - break; - - case '-': - consume(); - charClassConstructor.flushBeforeEscapedHyphen(); - charClassConstructor.put(ch); - break; - - // IdentityEscape - default: { - // TODO: check this test for IdentifierPart. - int ch = consume(); - if (isASCIIAlphanumeric(ch) || (ch == '_')) { - m_err = Error_malformedEscape; - return false; - } - charClassConstructor.put(ch); - break; - } - } - break; - - default: - consume(); - charClassConstructor.put(ch); - } - } - consume(); - - // lazily catch reversed ranges ([z-a])in character classes - if (charClassConstructor.isUpsideDown()) { - m_err = Error_malformedCharacterClass; - return false; - } - - charClassConstructor.flush(); - CharacterClass charClass = charClassConstructor.charClass(); - return parseCharacterClassQuantifier(failures, charClass, invert); -} - -bool WRECParser::parseOctalEscape(JmpSrcVector& failures) -{ - return parsePatternCharacterQualifier(failures, consumeOctal()); -} - -bool WRECParser::parseEscape(JmpSrcVector& failures) -{ - switch (peek()) { - case EndOfPattern: - m_err = Error_malformedEscape; - return false; - - // Assertions - case 'b': - consume(); - m_generator.generateAssertionWordBoundary(failures, false); - return true; - - case 'B': - consume(); - m_generator.generateAssertionWordBoundary(failures, true); - return true; - - // Octal escape - case '0': - consume(); - return parseOctalEscape(failures); - - // DecimalEscape - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': { - // FIXME: This does not support forward references. It's not clear whether they - // should be valid. - unsigned value = peekDigit(); - if (value > m_numSubpatterns) - return parseOctalEscape(failures); - } - case '8': - case '9': { - unsigned value = peekDigit(); - if (value > m_numSubpatterns) { - consume(); - m_err = Error_malformedEscape; - return false; - } - consume(); - - while (peekIsDigit()) { - unsigned newValue = value * 10 + peekDigit(); - if (newValue > m_numSubpatterns) - break; - value = newValue; - consume(); - } - - parseBackreferenceQuantifier(failures, value); - return true; - } - - // ControlEscape - case 'f': - consume(); - return parsePatternCharacterQualifier(failures, '\f'); - case 'n': - consume(); - return parsePatternCharacterQualifier(failures, '\n'); - case 'r': - consume(); - return parsePatternCharacterQualifier(failures, '\r'); - case 't': - consume(); - return parsePatternCharacterQualifier(failures, '\t'); - case 'v': - consume(); - return parsePatternCharacterQualifier(failures, '\v'); - - // ControlLetter - case 'c': { - consume(); - int control = consume(); - if (!isASCIIAlpha(control)) { - m_err = Error_malformedEscape; - return false; - } - return parsePatternCharacterQualifier(failures, control&31); - } - - // HexEscape - case 'x': { - consume(); - int x = consumeHex(2); - if (x == -1) { - m_err = Error_malformedEscape; - return false; - } - return parsePatternCharacterQualifier(failures, x); - } - - // UnicodeEscape - case 'u': { - consume(); - int x = consumeHex(4); - if (x == -1) { - m_err = Error_malformedEscape; - return false; - } - return parsePatternCharacterQualifier(failures, x); - } - - // CharacterClassEscape - case 'd': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), false); - case 's': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), false); - case 'w': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), false); - case 'D': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), true); - case 'S': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), true); - case 'W': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), true); - - // IdentityEscape - default: { - // TODO: check this test for IdentifierPart. - int ch = consume(); - if (isASCIIAlphanumeric(ch) || (ch == '_')) { - m_err = Error_malformedEscape; - return false; - } - return parsePatternCharacterQualifier(failures, ch); - } - } -} - -bool WRECParser::parseTerm(JmpSrcVector& failures) -{ - switch (peek()) { - case EndOfPattern: - case '*': - case '+': - case '?': - case ')': - case ']': - case '{': - case '}': - case '|': - // Not allowed in a Term! - return false; - - case '^': - consume(); - m_generator.generateAssertionBOL(failures); - return true; - - case '$': - consume(); - m_generator.generateAssertionEOL(failures); - return true; - - case '\\': - // b & B are also assertions... - consume(); - return parseEscape(failures); - - case '.': - consume(); - return parseCharacterClassQuantifier(failures, getCharacterClassNewline(), true); - - case '[': - // CharacterClass - consume(); - return parseCharacterClass(failures); - - case '(': - consume(); - return parseParentheses(failures); - - default: - // Anything else is a PatternCharacter - return parsePatternCharacterQualifier(failures, consume()); - } -} - -/* - interface req: CURR_POS is on stack (can be reloaded). -*/ -void WRECParser::parseDisjunction(JmpSrcVector& failures) -{ - parseAlternative(failures); - - if (peek() == '|') { - JmpSrcVector successes; - - do { - consume(); - - m_generator.generateDisjunction(successes, failures); - - parseAlternative(failures); - } while (peek() == '|'); - - m_generator.terminateDisjunction(successes); - } -} - -} // namespace JSC - -#endif // ENABLE(WREC) diff --git a/JavaScriptCore/wrec/WREC.h b/JavaScriptCore/wrec/WREC.h deleted file mode 100644 index 301bd3b..0000000 --- a/JavaScriptCore/wrec/WREC.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (C) 2008 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef WREC_h -#define WREC_h - -#if ENABLE(WREC) - -#include "ustring.h" -#include <masm/X86Assembler.h> -#include <wtf/ASCIICType.h> -#include <wtf/Vector.h> - -#if COMPILER(GCC) -#define WREC_CALL __attribute__ ((regparm (3))) -#else -#define WREC_CALL -#endif - -namespace JSC { - - typedef int (*WRECFunction)(const UChar* input, unsigned start, unsigned length, int* output) WREC_CALL; - - class GenerateAtomFunctor; - struct CharacterClassRange; - struct CharacterClass; - - struct Quantifier { - enum Type { - None, - Greedy, - NonGreedy, - Error, - }; - - Quantifier() - : type(None) - { - } - - Quantifier(Type type, unsigned min = 0, unsigned max = noMaxSpecified) - : type(type) - , min(min) - , max(max) - { - } - - Type type; - - unsigned min; - unsigned max; - - static const unsigned noMaxSpecified = UINT_MAX; - }; - - class WRECParser; - - typedef Vector<X86Assembler::JmpSrc> JmpSrcVector; - - class WRECGenerator { - public: - WRECGenerator(WRECParser& parser, X86Assembler& jit) - : m_parser(parser) - , m_jit(jit) - { - } - - typedef X86Assembler::JmpSrc JmpSrc; - typedef X86Assembler::JmpDst JmpDst; - - // these regs setup by the params - static const X86Assembler::RegisterID inputRegister = X86::eax; - static const X86Assembler::RegisterID currentPositionRegister = X86::edx; - static const X86Assembler::RegisterID lengthRegister = X86::ecx; - static const X86Assembler::RegisterID currentValueRegister = X86::esi; - static const X86Assembler::RegisterID outputRegister = X86::edi; - static const X86Assembler::RegisterID quantifierCountRegister = X86::ebx; - - friend class GenerateAtomFunctor; - friend class GeneratePatternCharacterFunctor; - friend class GenerateCharacterClassFunctor; - friend class GenerateBackreferenceFunctor; - friend class GenerateParenthesesNonGreedyFunctor; - - void generateGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max); - void generateNonGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max); - void generateBacktrack1(); - void generateBacktrackBackreference(unsigned subpatternId); - void generateCharacterClass(JmpSrcVector& failures, CharacterClass& charClass, bool invert); - void generateCharacterClassInverted(JmpSrcVector& failures, CharacterClass& charClass); - void generateCharacterClassInvertedRange(JmpSrcVector& failures, JmpSrcVector& matchDest, const CharacterClassRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount); - void generatePatternCharacter(JmpSrcVector& failures, int ch); - void generateAssertionWordBoundary(JmpSrcVector& failures, bool invert); - void generateAssertionBOL(JmpSrcVector& failures); - void generateAssertionEOL(JmpSrcVector& failures); - void generateBackreference(JmpSrcVector& failures, unsigned subpatternID); - void generateBackreferenceQuantifier(JmpSrcVector& failures, Quantifier::Type quantifierType, unsigned subpatternId, unsigned min, unsigned max); - enum ParenthesesType { capturing, non_capturing, assertion, inverted_assertion }; // order is relied on in generateParentheses() - JmpSrc generateParentheses(ParenthesesType type); - JmpSrc generateParenthesesResetTrampoline(JmpSrcVector& newFailures, unsigned subpatternIdBefore, unsigned subpatternIdAfter); - void generateParenthesesNonGreedy(JmpSrcVector& failures, JmpDst start, JmpSrc success, JmpSrc fail); - - void generateDisjunction(JmpSrcVector& successes, JmpSrcVector& failures); - void terminateDisjunction(JmpSrcVector& successes); - - private: - WRECParser& m_parser; - X86Assembler& m_jit; - }; - - class WRECParser { - public: - bool m_ignoreCase; - bool m_multiline; - unsigned m_numSubpatterns; - enum WRECError { - NoError, - Error_malformedCharacterClass, - Error_malformedParentheses, - Error_malformedPattern, - Error_malformedQuantifier, - Error_malformedEscape, - TempError_unsupportedQuantifier, - TempError_unsupportedParentheses, - } m_err; - - WRECParser(const UString& pattern, bool ignoreCase, bool multiline, X86Assembler& jit) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_numSubpatterns(0) - , m_err(NoError) - , m_generator(*this, jit) - , m_data(pattern.data()) - , m_size(pattern.size()) - , m_index(0) - { - } - - void parseAlternative(JmpSrcVector& failures) - { - while (parseTerm(failures)) { } - } - - void parseDisjunction(JmpSrcVector& failures); - - bool parseTerm(JmpSrcVector& failures); - bool parseEscape(JmpSrcVector& failures); - bool parseOctalEscape(JmpSrcVector& failures); - bool parseParentheses(JmpSrcVector& failures); - bool parseCharacterClass(JmpSrcVector& failures); - bool parseCharacterClassQuantifier(JmpSrcVector& failures, CharacterClass& charClass, bool invert); - bool parsePatternCharacterQualifier(JmpSrcVector& failures, int ch); - bool parseBackreferenceQuantifier(JmpSrcVector& failures, unsigned subpatternId); - - ALWAYS_INLINE Quantifier parseGreedyQuantifier(); - Quantifier parseQuantifier(); - - static const int EndOfPattern = -1; - - int peek() - { - if (m_index >= m_size) - return EndOfPattern; - return m_data[m_index]; - } - - int consume() - { - if (m_index >= m_size) - return EndOfPattern; - return m_data[m_index++]; - } - - bool peekIsDigit() - { - return WTF::isASCIIDigit(peek()); - } - - unsigned peekDigit() - { - ASSERT(peekIsDigit()); - return peek() - '0'; - } - - unsigned consumeDigit() - { - ASSERT(peekIsDigit()); - return consume() - '0'; - } - - unsigned consumeNumber() - { - int n = consumeDigit(); - while (peekIsDigit()) { - n *= 10; - n += consumeDigit(); - } - return n; - } - - int consumeHex(int count) - { - int n = 0; - while (count--) { - if (!WTF::isASCIIHexDigit(peek())) - return -1; - n = (n<<4) | WTF::toASCIIHexValue(consume()); - } - return n; - } - - unsigned consumeOctal() - { - unsigned n = 0; - while (n < 32 && WTF::isASCIIOctalDigit(peek())) - n = n * 8 + (consume() - '0'); - return n; - } - - bool isEndOfPattern() - { - return peek() != EndOfPattern; - } - - private: - WRECGenerator m_generator; - const UChar* m_data; - unsigned m_size; - unsigned m_index; - }; - -} // namespace JSC - -#endif // ENABLE(WREC) - -#endif // WREC_h |