diff options
Diffstat (limited to 'JavaScriptCore/wrec')
-rw-r--r-- | JavaScriptCore/wrec/CharacterClassConstructor.cpp | 360 | ||||
-rw-r--r-- | JavaScriptCore/wrec/CharacterClassConstructor.h | 121 | ||||
-rw-r--r-- | JavaScriptCore/wrec/WREC.cpp | 1324 | ||||
-rw-r--r-- | JavaScriptCore/wrec/WREC.h | 258 |
4 files changed, 2063 insertions, 0 deletions
diff --git a/JavaScriptCore/wrec/CharacterClassConstructor.cpp b/JavaScriptCore/wrec/CharacterClassConstructor.cpp new file mode 100644 index 0000000..85eccaa --- /dev/null +++ b/JavaScriptCore/wrec/CharacterClassConstructor.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (C) 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "CharacterClassConstructor.h" + +#if ENABLE(WREC) + +#include "pcre_internal.h" +#include <wtf/ASCIICType.h> + +using namespace WTF; + +namespace JSC { + +static const UChar asciiNewlines[2] = { '\n', '\r' }; +static const UChar unicodeNewlines[2] = { 0x2028, 0x2029 }; +CharacterClass& getCharacterClassNewline() { + static CharacterClass charClass = { + asciiNewlines, 2, + 0, 0, + unicodeNewlines, 2, + 0, 0, + }; + + return charClass; +} + +static const CharacterClassRange asciiDigitsRange[1] = { { '0', '9' } }; +CharacterClass& getCharacterClassDigits() { + static CharacterClass charClass = { + 0, 0, + asciiDigitsRange, 1, + 0, 0, + 0, 0, + }; + + return charClass; +} + +static const UChar asciiSpaces[1] = { ' ' }; +static const CharacterClassRange asciiSpacesRange[1] = { { '\t', '\r' } }; +static const UChar unicodeSpaces[8] = { 0x00a0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 }; +static const CharacterClassRange unicodeSpacesRange[1] = { { 0x2000, 0x200a } }; +CharacterClass& getCharacterClassSpaces() { + static CharacterClass charClass = { + asciiSpaces, 1, + asciiSpacesRange, 1, + unicodeSpaces, 8, + unicodeSpacesRange, 1, + }; + + return charClass; +} + +static const UChar asciiWordchar[1] = { '_' }; +static const CharacterClassRange asciiWordcharRange[3] = { { '0', '9' }, { 'A', 'Z' }, { 'a', 'z' } }; +CharacterClass& getCharacterClassWordchar() { + static CharacterClass charClass = { + asciiWordchar, 1, + asciiWordcharRange, 3, + 0, 0, + 0, 0, + }; + + return charClass; +} + +static const CharacterClassRange asciiNondigitsRange[2] = { { 0, '0' - 1 }, { '9' + 1, 0x7f } }; +static const CharacterClassRange unicodeNondigitsRange[1] = { { 0x0080, 0xffff } }; +CharacterClass& getCharacterClassNondigits() { + static CharacterClass charClass = { + 0, 0, + asciiNondigitsRange, 2, + 0, 0, + unicodeNondigitsRange, 1, + }; + + return charClass; +} + +static const CharacterClassRange asciiNonspacesRange[3] = { { 0, '\t' - 1 }, { '\r' + 1, ' ' - 1 }, { ' ' + 1, 0x7f } }; +static const CharacterClassRange unicodeNonspacesRange[9] = { + { 0x0080, 0x009f }, + { 0x00a1, 0x167f }, + { 0x1681, 0x180d }, + { 0x180f, 0x1fff }, + { 0x200b, 0x2027 }, + { 0x202a, 0x202e }, + { 0x2030, 0x205e }, + { 0x2060, 0x2fff }, + { 0x3001, 0xffff } +}; +CharacterClass& getCharacterClassNonspaces() { + static CharacterClass charClass = { + 0, 0, + asciiNonspacesRange, 3, + 0, 0, + unicodeNonspacesRange, 9, + }; + + return charClass; +} + +static const UChar asciiNonwordchar[1] = { '`' }; +static const CharacterClassRange asciiNonwordcharRange[4] = { { 0, '0' - 1 }, { '9' + 1, 'A' - 1 }, { 'Z' + 1, '_' - 1 }, { 'z' + 1, 0x7f } }; +static const CharacterClassRange unicodeNonwordcharRange[1] = { { 0x0080, 0xffff } }; +CharacterClass& getCharacterClassNonwordchar() { + static CharacterClass charClass = { + asciiNonwordchar, 1, + asciiNonwordcharRange, 4, + 0, 0, + unicodeNonwordcharRange, 1, + }; + + return charClass; +} + +void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch) +{ + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); +} + +void CharacterClassConstructor::addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi) +{ + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + CharacterClassRange r = {lo, hi}; + ranges.insert(i, r); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // Range comes after all existing ranges. + CharacterClassRange r = {lo, hi}; + ranges.append(r); +} + +void CharacterClassConstructor::put(UChar ch) +{ + // Parsing a regular expression like [a-z], we start in an initial empty state: + // ((m_charBuffer == -1) && !m_isPendingDash) + // When buffer the 'a' sice it may be (and is in this case) part of a range: + // ((m_charBuffer != -1) && !m_isPendingDash) + // Having parsed the hyphen we then record that the dash is also pending: + // ((m_charBuffer != -1) && m_isPendingDash) + // The next change will always take us back to the initial state - either because + // a complete range has been parsed (such as [a-z]), or because a flush is forced, + // due to an early end in the regexp ([a-]), or a character class escape being added + // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. + ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); + + if (m_charBuffer != -1) { + if (m_isPendingDash) { + // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. + UChar lo = m_charBuffer; + UChar hi = ch; + // Reset back to the inital state. + m_charBuffer = -1; + m_isPendingDash = false; + + // This is an error, detected lazily. Do not proceed. + if (lo > hi) { + m_isUpsideDown = true; + return; + } + + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + UChar unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + // we're going to scan along, updating the start of the range + while (unicodeCurr <= hi) { + // Spin forwards over any characters that don't have two cases. + for (; kjs_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { + // if this was the last character in the range, we're done. + if (unicodeCurr == hi) + return; + } + // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. + UChar rangeStart = unicodeCurr; + UChar otherCurr = kjs_pcre_ucp_othercase(unicodeCurr); + + // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, + // and if it's other case value is one greater then the othercase value for the current last + // character included in the range, we can include next into the range. + while ((unicodeCurr < hi) && (kjs_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { + // increment unicodeCurr; it points to the end of the range. + // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. + ++unicodeCurr; + ++otherCurr; + } + + // otherChar is the last in the range of other case chars, calculate offset to get back to the start. + addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); + + // unicodeCurr has been added, move on to the next char. + ++unicodeCurr; + } + } + } + } else if (ch == '-') + // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. + m_isPendingDash = true; + else { + // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. + flush(); + m_charBuffer = ch; + } + } else + // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character + // (the hyphen not treated as a special character in this case, same handling for any char). + m_charBuffer = ch; +} + +// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. +// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate +// array (either ascii or unicode). +// If the pattern is case insensitive we add entries for both cases. +void CharacterClassConstructor::flush() +{ + if (m_charBuffer != -1) { + if (m_charBuffer <= 0x7f) { + if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) + addSorted(m_matches, toASCIIUpper(m_charBuffer)); + addSorted(m_matches, m_charBuffer); + if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) + addSorted(m_matches, toASCIILower(m_charBuffer)); + } else { + addSorted(m_matchesUnicode, m_charBuffer); + if (m_isCaseInsensitive) { + int other = kjs_pcre_ucp_othercase(m_charBuffer); + if (other != -1) + addSorted(m_matchesUnicode, other); + } + } + m_charBuffer = -1; + } + + if (m_isPendingDash) { + addSorted(m_matches, '-'); + m_isPendingDash = false; + } +} + +void CharacterClassConstructor::append(CharacterClass& other) +{ + // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). + // Need to check the spec, really, but think this matches PCRE behaviour. + flush(); + + if (other.numMatches) { + for (size_t i = 0; i < other.numMatches; ++i) + addSorted(m_matches, other.matches[i]); + } + if (other.numRanges) { + for (size_t i = 0; i < other.numRanges; ++i) + addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); + } + if (other.numMatchesUnicode) { + for (size_t i = 0; i < other.numMatchesUnicode; ++i) + addSorted(m_matchesUnicode, other.matchesUnicode[i]); + } + if (other.numRangesUnicode) { + for (size_t i = 0; i < other.numRangesUnicode; ++i) + addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); + } +} + +} + +#endif // ENABLE(WREC) diff --git a/JavaScriptCore/wrec/CharacterClassConstructor.h b/JavaScriptCore/wrec/CharacterClassConstructor.h new file mode 100644 index 0000000..5accdae --- /dev/null +++ b/JavaScriptCore/wrec/CharacterClassConstructor.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CharacterClassConstructor_h +#define CharacterClassConstructor_h + +#if ENABLE(WREC) + +#include "ustring.h" + +namespace JSC { + + struct CharacterClassRange { + UChar begin; + UChar end; + }; + + struct CharacterClass { + const UChar* matches; + unsigned numMatches; + + const CharacterClassRange* ranges; + unsigned numRanges; + + const UChar* matchesUnicode; + unsigned numMatchesUnicode; + + const CharacterClassRange* rangesUnicode; + unsigned numRangesUnicode; + }; + + CharacterClass& getCharacterClassNewline(); + CharacterClass& getCharacterClassDigits(); + CharacterClass& getCharacterClassSpaces(); + CharacterClass& getCharacterClassWordchar(); + CharacterClass& getCharacterClassNondigits(); + CharacterClass& getCharacterClassNonspaces(); + CharacterClass& getCharacterClassNonwordchar(); + + class CharacterClassConstructor { + public: + CharacterClassConstructor(bool isCaseInsensitive) + : m_charBuffer(-1) + , m_isPendingDash(false) + , m_isCaseInsensitive(isCaseInsensitive) + , m_isUpsideDown(false) + { + } + + void flush(); + + // We need to flush prior to an escaped hyphen to prevent it as being treated as indicating + // a range, e.g. [a\-c] we flush prior to adding the hyphen so that this is not treated as + // [a-c]. However, we do not want to flush if we have already seen a non escaped hyphen - + // e.g. [+-\-] should be treated the same as [+--], producing a range that will also match + // a comma. + void flushBeforeEscapedHyphen() + { + if (!m_isPendingDash) + flush(); + } + + void put(UChar ch); + void append(CharacterClass& other); + + bool isUpsideDown() { return m_isUpsideDown; } + + ALWAYS_INLINE CharacterClass charClass() + { + CharacterClass newCharClass = { + m_matches.begin(), m_matches.size(), + m_ranges.begin(), m_ranges.size(), + m_matchesUnicode.begin(), m_matchesUnicode.size(), + m_rangesUnicode.begin(), m_rangesUnicode.size(), + }; + + return newCharClass; + } + + private: + void addSorted(Vector<UChar>& matches, UChar ch); + void addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi); + + int m_charBuffer; + bool m_isPendingDash; + bool m_isCaseInsensitive; + bool m_isUpsideDown; + + Vector<UChar> m_matches; + Vector<CharacterClassRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterClassRange> m_rangesUnicode; + }; + +} + +#endif // ENABLE(WREC) + +#endif // CharacterClassConstructor_h diff --git a/JavaScriptCore/wrec/WREC.cpp b/JavaScriptCore/wrec/WREC.cpp new file mode 100644 index 0000000..6d64a2c --- /dev/null +++ b/JavaScriptCore/wrec/WREC.cpp @@ -0,0 +1,1324 @@ +/* + * Copyright (C) 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "WREC.h" + +#if ENABLE(WREC) + +#include "CharacterClassConstructor.h" +#include "Machine.h" +#include "pcre_internal.h" + +using namespace WTF; + +namespace JSC { + +class GenerateAtomFunctor { +public: + virtual ~GenerateAtomFunctor() {} + + virtual void generateAtom(WRECGenerator*, JmpSrcVector&) = 0; + virtual void backtrack(WRECGenerator*) = 0; +}; + +class GeneratePatternCharacterFunctor : public GenerateAtomFunctor { +public: + GeneratePatternCharacterFunctor(const UChar ch) + : m_ch(ch) + { + } + + virtual void generateAtom(WRECGenerator*, JmpSrcVector&); + virtual void backtrack(WRECGenerator*); + +private: + const UChar m_ch; +}; + +class GenerateCharacterClassFunctor : public GenerateAtomFunctor { +public: + GenerateCharacterClassFunctor(CharacterClass* charClass, bool invert) + : m_charClass(charClass) + , m_invert(invert) + { + } + + virtual void generateAtom(WRECGenerator*, JmpSrcVector&); + virtual void backtrack(WRECGenerator*); + +private: + CharacterClass* m_charClass; + bool m_invert; +}; + +class GenerateBackreferenceFunctor : public GenerateAtomFunctor { +public: + GenerateBackreferenceFunctor(unsigned subpatternId) + : m_subpatternId(subpatternId) + { + } + + virtual void generateAtom(WRECGenerator*, JmpSrcVector&); + virtual void backtrack(WRECGenerator*); + +private: + unsigned m_subpatternId; +}; + +class GenerateParenthesesNonGreedyFunctor : public GenerateAtomFunctor { +public: + GenerateParenthesesNonGreedyFunctor(WRECGenerator::JmpDst start, WRECGenerator::JmpSrc success, WRECGenerator::JmpSrc fail) + : m_start(start) + , m_success(success) + , m_fail(fail) + { + } + + virtual void generateAtom(WRECGenerator*, JmpSrcVector&); + virtual void backtrack(WRECGenerator*); + +private: + WRECGenerator::JmpDst m_start; + WRECGenerator::JmpSrc m_success; + WRECGenerator::JmpSrc m_fail; +}; + +void GeneratePatternCharacterFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) +{ + wrec->generatePatternCharacter(failures, m_ch); +} +void GeneratePatternCharacterFunctor::backtrack(WRECGenerator* wrec) +{ + wrec->generateBacktrack1(); +} + +void GenerateCharacterClassFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) +{ + wrec->generateCharacterClass(failures, *m_charClass, m_invert); +} +void GenerateCharacterClassFunctor::backtrack(WRECGenerator* wrec) +{ + wrec->generateBacktrack1(); +} + +void GenerateBackreferenceFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) +{ + wrec->generateBackreference(failures, m_subpatternId); +} +void GenerateBackreferenceFunctor::backtrack(WRECGenerator* wrec) +{ + wrec->generateBacktrackBackreference(m_subpatternId); +} + +void GenerateParenthesesNonGreedyFunctor::generateAtom(WRECGenerator* wrec, JmpSrcVector& failures) +{ + wrec->generateParenthesesNonGreedy(failures, m_start, m_success, m_fail); +} + +void GenerateParenthesesNonGreedyFunctor::backtrack(WRECGenerator*) +{ + // FIXME: do something about this. + CRASH(); +} + +void WRECGenerator::generateBacktrack1() +{ + m_jit.subl_i8r(1, currentPositionRegister); +} + +void WRECGenerator::generateBacktrackBackreference(unsigned subpatternId) +{ + m_jit.subl_mr((2 * subpatternId + 1) * sizeof(int), outputRegister, currentPositionRegister); + m_jit.addl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentPositionRegister); +} + +void WRECGenerator::generateBackreferenceQuantifier(JmpSrcVector& failures, Quantifier::Type quantifierType, unsigned subpatternId, unsigned min, unsigned max) +{ + GenerateBackreferenceFunctor functor(subpatternId); + + m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, currentValueRegister); + m_jit.cmpl_rm(currentValueRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); + JmpSrc skipIfEmpty = m_jit.emitUnlinkedJe(); + + ASSERT(quantifierType == Quantifier::Greedy || quantifierType == Quantifier::NonGreedy); + if (quantifierType == Quantifier::Greedy) + generateGreedyQuantifier(failures, functor, min, max); + else + generateNonGreedyQuantifier(failures, functor, min, max); + + m_jit.link(skipIfEmpty, m_jit.label()); +} + +void WRECGenerator::generateNonGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) +{ + // comment me better! + JmpSrcVector newFailures; + + // (0) Setup: + // init quantifierCountRegister + m_jit.pushl_r(quantifierCountRegister); + m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); + JmpSrc gotoStart = m_jit.emitUnlinkedJmp(); + + // (4) Failure case + + JmpDst quantifierFailed = m_jit.label(); + // (4.1) Restore original value of quantifierCountRegister from the stack + m_jit.popl_r(quantifierCountRegister); + failures.append(m_jit.emitUnlinkedJmp()); + + // (3) We just tried an alternative, and it failed - check we can try more. + + JmpDst alternativeFailed = m_jit.label(); + // (3.1) remove the value pushed prior to testing the alternative + m_jit.popl_r(currentPositionRegister); + // (3.2) if there is a limit, and we have reached it, game over. + if (max != Quantifier::noMaxSpecified) { + m_jit.cmpl_i32r(max, quantifierCountRegister); + m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); + } + + // (1) Do a check for the atom + + // (1.0) This is where we start, if there is a minimum (then we must read at least one of the atom). + JmpDst testQuantifiedAtom = m_jit.label(); + if (min) + m_jit.link(gotoStart, testQuantifiedAtom); + // (1.1) Do a check for the atom check. + functor.generateAtom(this, newFailures); + // (1.2) If we get here, successful match! + m_jit.addl_i8r(1, quantifierCountRegister); + // (1.3) We needed to read the atom, and we failed - that's terminally bad news. + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], quantifierFailed); + newFailures.clear(); + // (1.4) If there is a minimum, check we have read enough ... + if (min) { + // ... except if min was 1 no need to keep checking! + if (min != 1) { + m_jit.cmpl_i32r(min, quantifierCountRegister); + m_jit.link(m_jit.emitUnlinkedJl(), testQuantifiedAtom); + } + } else + m_jit.link(gotoStart, m_jit.label()); + // if there was no minimum, this is where we start. + + // (2) Plant an alternative check for the remainder of the expression + + // (2.1) recursively call to parseAlternative, if it falls through, success! + m_jit.pushl_r(currentPositionRegister); + m_parser.parseAlternative(newFailures); + m_jit.addl_i8r(4, X86::esp); + m_jit.popl_r(quantifierCountRegister); + // (2.2) link failure cases to jump back up to alternativeFailed. + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], alternativeFailed); + newFailures.clear(); +} + +void WRECGenerator::generateGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max) +{ + if (!max) + return; + + // comment me better! + JmpSrcVector newFailures; + + // (0) Setup: + // init quantifierCountRegister + m_jit.pushl_r(quantifierCountRegister); + m_jit.xorl_rr(quantifierCountRegister, quantifierCountRegister); + + // (1) Greedily read as many of the atom as possible + + JmpDst readMore = m_jit.label(); + + // (1.1) Do a character class check. + functor.generateAtom(this, newFailures); + // (1.2) If we get here, successful match! + m_jit.addl_i8r(1, quantifierCountRegister); + // (1.3) loop, unless we've read max limit. + if (max != Quantifier::noMaxSpecified) { + if (max != 1) { + // if there is a limit, only loop while less than limit, otherwise fall throught to... + m_jit.cmpl_i32r(max, quantifierCountRegister); + m_jit.link(m_jit.emitUnlinkedJne(), readMore); + } + // ...if there is no min we need jump to the alternative test, if there is we can just fall through to it. + if (!min) + newFailures.append(m_jit.emitUnlinkedJmp()); + } else + m_jit.link(m_jit.emitUnlinkedJmp(), readMore); + // (1.4) check enough matches to bother trying an alternative... + if (min) { + // We will fall through to here if (min && max), after the max check. + // First, also link a + JmpDst minimumTest = m_jit.label(); + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], minimumTest); + newFailures.clear(); + // + m_jit.cmpl_i32r(min, quantifierCountRegister); + newFailures.append(m_jit.emitUnlinkedJae()); + } + + // (4) Failure case + + JmpDst quantifierFailed = m_jit.label(); + // (4.1) Restore original value of quantifierCountRegister from the stack + m_jit.popl_r(quantifierCountRegister); + failures.append(m_jit.emitUnlinkedJmp()); + + // (3) Backtrack + + JmpDst backtrack = m_jit.label(); + // (3.1) this was preserved prior to executing the alternative + m_jit.popl_r(currentPositionRegister); + // (3.2) check we can retry with fewer matches - backtracking fails if already at the minimum + m_jit.cmpl_i32r(min, quantifierCountRegister); + m_jit.link(m_jit.emitUnlinkedJe(), quantifierFailed); + // (3.3) roll off one match, and retry. + functor.backtrack(this); + m_jit.subl_i8r(1, quantifierCountRegister); + + // (2) Try an alternative. + + // (2.1) point to retry + JmpDst tryAlternative = m_jit.label(); + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], tryAlternative); + newFailures.clear(); + // (2.2) recursively call to parseAlternative, if it falls through, success! + m_jit.pushl_r(currentPositionRegister); + m_parser.parseAlternative(newFailures); + m_jit.addl_i8r(4, X86::esp); + m_jit.popl_r(quantifierCountRegister); + // (2.3) link failure cases to here. + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], backtrack); + newFailures.clear(); +} + +void WRECGenerator::generatePatternCharacter(JmpSrcVector& failures, int ch) +{ + // check there is more input, read a char. + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + failures.append(m_jit.emitUnlinkedJe()); + m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); + + // used for unicode case insensitive + bool hasUpper = false; + JmpSrc isUpper; + + // if case insensitive match + if (m_parser.m_ignoreCase) { + UChar lower, upper; + + // check for ascii case sensitive characters + if (isASCIIAlpha(ch)) { + m_jit.orl_i32r(32, currentValueRegister); + ch |= 32; + } else if ((ch > 0x7f) && ((lower = Unicode::toLower(ch)) != (upper = Unicode::toUpper(ch)))) { + // handle unicode case sentitive characters - branch to success on upper + m_jit.cmpl_i32r(upper, currentValueRegister); + isUpper = m_jit.emitUnlinkedJe(); + hasUpper = true; + ch = lower; + } + } + + // checks for ch, or lower case version of ch, if insensitive + m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); + failures.append(m_jit.emitUnlinkedJne()); + + if (m_parser.m_ignoreCase && hasUpper) { + // for unicode case insensitive matches, branch here if upper matches. + m_jit.link(isUpper, m_jit.label()); + } + + // on success consume the char + m_jit.addl_i8r(1, currentPositionRegister); +} + +void WRECGenerator::generateCharacterClassInvertedRange(JmpSrcVector& failures, JmpSrcVector& matchDest, const CharacterClassRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) +{ + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); + + // generate code for all ranges before this one + if (which) + generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + do { + m_jit.cmpl_i32r((unsigned short)matches[*matchIndex], currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJe()); + ++*matchIndex; + } while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)); + failures.append(m_jit.emitUnlinkedJmp()); + + m_jit.link(loOrAbove, m_jit.label()); + } else if (which) { + JmpSrc loOrAbove = m_jit.emitUnlinkedJge(); + + generateCharacterClassInvertedRange(failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(m_jit.emitUnlinkedJmp()); + + m_jit.link(loOrAbove, m_jit.label()); + } else + failures.append(m_jit.emitUnlinkedJl()); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJle()); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); +} + +void WRECGenerator::generateCharacterClassInverted(JmpSrcVector& matchDest, CharacterClass& charClass) +{ + JmpSrc unicodeFail; + if (charClass.numMatchesUnicode || charClass.numRangesUnicode) { + m_jit.cmpl_i8r(0x7f, currentValueRegister); + JmpSrc isAscii = m_jit.emitUnlinkedJle(); + + if (charClass.numMatchesUnicode) { + for (unsigned i = 0; i < charClass.numMatchesUnicode; ++i) { + UChar ch = charClass.matchesUnicode[i]; + m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJe()); + } + } + + if (charClass.numRangesUnicode) { + for (unsigned i = 0; i < charClass.numRangesUnicode; ++i) { + UChar lo = charClass.rangesUnicode[i].begin; + UChar hi = charClass.rangesUnicode[i].end; + + m_jit.cmpl_i32r((unsigned short)lo, currentValueRegister); + JmpSrc below = m_jit.emitUnlinkedJl(); + m_jit.cmpl_i32r((unsigned short)hi, currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJle()); + m_jit.link(below, m_jit.label()); + } + } + + unicodeFail = m_jit.emitUnlinkedJmp(); + m_jit.link(isAscii, m_jit.label()); + } + + if (charClass.numRanges) { + unsigned matchIndex = 0; + JmpSrcVector failures; + generateCharacterClassInvertedRange(failures, matchDest, charClass.ranges, charClass.numRanges, &matchIndex, charClass.matches, charClass.numMatches); + while (matchIndex < charClass.numMatches) { + m_jit.cmpl_i32r((unsigned short)charClass.matches[matchIndex], currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJe()); + ++matchIndex; + } + JmpDst noMatch = m_jit.label(); + for (unsigned i = 0; i < failures.size(); ++i) + m_jit.link(failures[i], noMatch); + failures.clear(); + } else if (charClass.numMatches) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector<char> matchesAZaz; + + for (unsigned i = 0; i < charClass.numMatches; ++i) { + char ch = charClass.matches[i]; + if (m_parser.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJe()); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + m_jit.orl_i32r(32, currentValueRegister); + + for (unsigned i = 0; i < countAZaz; ++i) { + char ch = matchesAZaz[i]; + m_jit.cmpl_i32r((unsigned short)ch, currentValueRegister); + matchDest.append(m_jit.emitUnlinkedJe()); + } + } + } + + if (charClass.numMatchesUnicode || charClass.numRangesUnicode) + m_jit.link(unicodeFail, m_jit.label()); +} + +void WRECGenerator::generateCharacterClass(JmpSrcVector& failures, CharacterClass& charClass, bool invert) +{ + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + failures.append(m_jit.emitUnlinkedJe()); + m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); + + if (invert) + generateCharacterClassInverted(failures, charClass); + else { + JmpSrcVector successes; + generateCharacterClassInverted(successes, charClass); + failures.append(m_jit.emitUnlinkedJmp()); + JmpDst here = m_jit.label(); + for (unsigned i = 0; i < successes.size(); ++i) + m_jit.link(successes[i], here); + } + + m_jit.addl_i8r(1, currentPositionRegister); +} + +WRECGenerator::JmpSrc WRECGenerator::generateParentheses(ParenthesesType type) +{ + unsigned subpatternId = (type == capturing) ? ++m_parser.m_numSubpatterns : m_parser.m_numSubpatterns; + + // push pos, both to preserve for fail + reloaded in parseDisjunction + m_jit.pushl_r(currentPositionRegister); + + // Plant a disjunction, wrapped to invert behaviour - + JmpSrcVector newFailures; + m_parser.parseDisjunction(newFailures); + + if (type == capturing) { + m_jit.popl_r(currentValueRegister); + m_jit.movl_rm(currentValueRegister, (2 * subpatternId) * sizeof(int), outputRegister); + m_jit.movl_rm(currentPositionRegister, (2 * subpatternId + 1) * sizeof(int), outputRegister); + } else if (type == non_capturing) + m_jit.addl_i8r(4, X86::esp); + else + m_jit.popl_r(currentPositionRegister); + + // This is a little lame - jump to jump if there is a nested disjunction. + // (suggestion to fix: make parseDisjunction populate a JmpSrcVector of + // disjunct successes... this is probably not worth the compile cost in + // the common case to fix). + JmpSrc successfulMatch = m_jit.emitUnlinkedJmp(); + + JmpDst originalFailure = m_jit.label(); + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], originalFailure); + newFailures.clear(); + // fail: restore currentPositionRegister + m_jit.popl_r(currentPositionRegister); + + JmpSrc jumpToFail; + // If this was an inverted assert, fail = success! - just let the failure case drop through, + // success case goes to failures. Both paths restore curr pos. + if (type == inverted_assertion) + jumpToFail = successfulMatch; + else { + // plant a jump so any fail will link off to 'failures', + jumpToFail = m_jit.emitUnlinkedJmp(); + // link successes to jump here + m_jit.link(successfulMatch, m_jit.label()); + } + return jumpToFail; +} + +void WRECGenerator::generateParenthesesNonGreedy(JmpSrcVector& failures, JmpDst start, JmpSrc success, JmpSrc fail) +{ + m_jit.link(m_jit.emitUnlinkedJmp(), start); + m_jit.link(success, m_jit.label()); + + failures.append(fail); +} + +WRECGenerator::JmpSrc WRECGenerator::generateParenthesesResetTrampoline(JmpSrcVector& newFailures, unsigned subpatternIdBefore, unsigned subpatternIdAfter) +{ + JmpSrc skip = m_jit.emitUnlinkedJmp(); + + JmpDst subPatternResetTrampoline = m_jit.label(); + for (unsigned i = 0; i < newFailures.size(); ++i) + m_jit.link(newFailures[i], subPatternResetTrampoline); + newFailures.clear(); + for (unsigned i = subpatternIdBefore + 1; i <= subpatternIdAfter; ++i) { + m_jit.movl_i32m(-1, (2 * i) * sizeof(int), outputRegister); + m_jit.movl_i32m(-1, (2 * i + 1) * sizeof(int), outputRegister); + } + + JmpSrc newFailJump = m_jit.emitUnlinkedJmp(); + m_jit.link(skip, m_jit.label()); + + return newFailJump; +} + +void WRECGenerator::generateAssertionBOL(JmpSrcVector& failures) +{ + if (m_parser.m_multiline) { + JmpSrcVector previousIsNewline; + + // begin of input == success + m_jit.cmpl_i8r(0, currentPositionRegister); + previousIsNewline.append(m_jit.emitUnlinkedJe()); + + // now check prev char against newline characters. + m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); + generateCharacterClassInverted(previousIsNewline, getCharacterClassNewline()); + + failures.append(m_jit.emitUnlinkedJmp()); + + JmpDst success = m_jit.label(); + for (unsigned i = 0; i < previousIsNewline.size(); ++i) + m_jit.link(previousIsNewline[i], success); + previousIsNewline.clear(); + } else { + m_jit.cmpl_i8r(0, currentPositionRegister); + failures.append(m_jit.emitUnlinkedJne()); + } +} + +void WRECGenerator::generateAssertionEOL(JmpSrcVector& failures) +{ + if (m_parser.m_multiline) { + JmpSrcVector nextIsNewline; + + // end of input == success + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + nextIsNewline.append(m_jit.emitUnlinkedJe()); + + // now check next char against newline characters. + m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); + generateCharacterClassInverted(nextIsNewline, getCharacterClassNewline()); + + failures.append(m_jit.emitUnlinkedJmp()); + + JmpDst success = m_jit.label(); + for (unsigned i = 0; i < nextIsNewline.size(); ++i) + m_jit.link(nextIsNewline[i], success); + nextIsNewline.clear(); + } else { + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + failures.append(m_jit.emitUnlinkedJne()); + } +} + +void WRECGenerator::generateAssertionWordBoundary(JmpSrcVector& failures, bool invert) +{ + JmpSrcVector wordBoundary; + JmpSrcVector notWordBoundary; + + // (1) Check if the previous value was a word char + + // (1.1) check for begin of input + m_jit.cmpl_i8r(0, currentPositionRegister); + JmpSrc atBegin = m_jit.emitUnlinkedJe(); + // (1.2) load the last char, and chck if is word character + m_jit.movzwl_mr(-2, inputRegister, currentPositionRegister, 2, currentValueRegister); + JmpSrcVector previousIsWord; + generateCharacterClassInverted(previousIsWord, getCharacterClassWordchar()); + // (1.3) if we get here, previous is not a word char + m_jit.link(atBegin, m_jit.label()); + + // (2) Handle situation where previous was NOT a \w + + // (2.1) check for end of input + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + notWordBoundary.append(m_jit.emitUnlinkedJe()); + // (2.2) load the next char, and chck if is word character + m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); + generateCharacterClassInverted(wordBoundary, getCharacterClassWordchar()); + // (2.3) If we get here, neither chars are word chars + notWordBoundary.append(m_jit.emitUnlinkedJmp()); + + // (3) Handle situation where previous was a \w + + // (3.0) link success in first match to here + JmpDst section3 = m_jit.label(); + for (unsigned i = 0; i < previousIsWord.size(); ++i) + m_jit.link(previousIsWord[i], section3); + previousIsWord.clear(); + // (3.1) check for end of input + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + wordBoundary.append(m_jit.emitUnlinkedJe()); + // (3.2) load the next char, and chck if is word character + m_jit.movzwl_mr(inputRegister, currentPositionRegister, 2, currentValueRegister); + generateCharacterClassInverted(notWordBoundary, getCharacterClassWordchar()); + // (3.3) If we get here, this is an end of a word, within the input. + + // (4) Link everything up + + if (invert) { + // handle the fall through case + wordBoundary.append(m_jit.emitUnlinkedJmp()); + + // looking for non word boundaries, so link boundary fails to here. + JmpDst success = m_jit.label(); + for (unsigned i = 0; i < notWordBoundary.size(); ++i) + m_jit.link(notWordBoundary[i], success); + notWordBoundary.clear(); + + failures.append(wordBoundary.begin(), wordBoundary.size()); + } else { + // looking for word boundaries, so link successes here. + JmpDst success = m_jit.label(); + for (unsigned i = 0; i < wordBoundary.size(); ++i) + m_jit.link(wordBoundary[i], success); + wordBoundary.clear(); + + failures.append(notWordBoundary.begin(), notWordBoundary.size()); + } +} + +void WRECGenerator::generateBackreference(JmpSrcVector& failures, unsigned subpatternId) +{ + m_jit.pushl_r(currentPositionRegister); + m_jit.pushl_r(quantifierCountRegister); + + // get the start pos of the backref into quantifierCountRegister (multipurpose!) + m_jit.movl_mr((2 * subpatternId) * sizeof(int), outputRegister, quantifierCountRegister); + + JmpSrc skipIncrement = m_jit.emitUnlinkedJmp(); + JmpDst topOfLoop = m_jit.label(); + m_jit.addl_i8r(1, currentPositionRegister); + m_jit.addl_i8r(1, quantifierCountRegister); + m_jit.link(skipIncrement, m_jit.label()); + + // check if we're at the end of backref (if we are, success!) + m_jit.cmpl_rm(quantifierCountRegister, ((2 * subpatternId) + 1) * sizeof(int), outputRegister); + JmpSrc endOfBackRef = m_jit.emitUnlinkedJe(); + + m_jit.movzwl_mr(inputRegister, quantifierCountRegister, 2, currentValueRegister); + + // check if we've run out of input (this would be a can o'fail) + m_jit.cmpl_rr(lengthRegister, currentPositionRegister); + JmpSrc endOfInput = m_jit.emitUnlinkedJe(); + + m_jit.cmpw_rm(currentValueRegister, inputRegister, currentPositionRegister, 2); + m_jit.link(m_jit.emitUnlinkedJe(), topOfLoop); + + m_jit.link(endOfInput, m_jit.label()); + // Failure + m_jit.popl_r(quantifierCountRegister); + m_jit.popl_r(currentPositionRegister); + failures.append(m_jit.emitUnlinkedJmp()); + + // Success + m_jit.link(endOfBackRef, m_jit.label()); + m_jit.popl_r(quantifierCountRegister); + m_jit.addl_i8r(4, X86::esp); +} + +void WRECGenerator::generateDisjunction(JmpSrcVector& successes, JmpSrcVector& failures) +{ + successes.append(m_jit.emitUnlinkedJmp()); + + JmpDst here = m_jit.label(); + + for (unsigned i = 0; i < failures.size(); ++i) + m_jit.link(failures[i], here); + failures.clear(); + + m_jit.movl_mr(X86::esp, currentPositionRegister); +} + +void WRECGenerator::terminateDisjunction(JmpSrcVector& successes) +{ + JmpDst here = m_jit.label(); + for (unsigned i = 0; i < successes.size(); ++i) + m_jit.link(successes[i], here); + successes.clear(); +} + +ALWAYS_INLINE Quantifier WRECParser::parseGreedyQuantifier() +{ + switch (peek()) { + case '?': + consume(); + return Quantifier(Quantifier::Greedy, 0, 1); + + case '*': + consume(); + return Quantifier(Quantifier::Greedy, 0); + + case '+': + consume(); + return Quantifier(Quantifier::Greedy, 1); + + case '{': { + consume(); + // a numeric quantifier should always have a lower bound + if (!peekIsDigit()) { + m_err = Error_malformedQuantifier; + return Quantifier(Quantifier::Error); + } + int min = consumeNumber(); + + // this should either be a , or a } + switch (peek()) { + case '}': + // {n} - exactly n times. (technically I think a '?' is valid in the bnf - bit meaningless). + consume(); + return Quantifier(Quantifier::Greedy, min, min); + + case ',': + consume(); + switch (peek()) { + case '}': + // {n,} - n to inf times. + consume(); + return Quantifier(Quantifier::Greedy, min); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // {n,m} - n to m times. + int max = consumeNumber(); + + if (peek() != '}') { + m_err = Error_malformedQuantifier; + return Quantifier(Quantifier::Error); + } + consume(); + + return Quantifier(Quantifier::Greedy, min, max); + } + + default: + m_err = Error_malformedQuantifier; + return Quantifier(Quantifier::Error); + } + + default: + m_err = Error_malformedQuantifier; + return Quantifier(Quantifier::Error); + } + } + // None of the above - no quantifier + default: + return Quantifier(); + } +} + +Quantifier WRECParser::parseQuantifier() +{ + Quantifier q = parseGreedyQuantifier(); + + if ((q.type == Quantifier::Greedy) && (peek() == '?')) { + consume(); + q.type = Quantifier::NonGreedy; + } + + return q; +} + +bool WRECParser::parsePatternCharacterQualifier(JmpSrcVector& failures, int ch) +{ + Quantifier q = parseQuantifier(); + + switch (q.type) { + case Quantifier::None: { + m_generator.generatePatternCharacter(failures, ch); + break; + } + + case Quantifier::Greedy: { + GeneratePatternCharacterFunctor functor(ch); + m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); + break; + } + + case Quantifier::NonGreedy: { + GeneratePatternCharacterFunctor functor(ch); + m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); + break; + } + + case Quantifier::Error: + return false; + } + + return true; +} + +bool WRECParser::parseCharacterClassQuantifier(JmpSrcVector& failures, CharacterClass& charClass, bool invert) +{ + Quantifier q = parseQuantifier(); + + switch (q.type) { + case Quantifier::None: { + m_generator.generateCharacterClass(failures, charClass, invert); + break; + } + + case Quantifier::Greedy: { + GenerateCharacterClassFunctor functor(&charClass, invert); + m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); + break; + } + + case Quantifier::NonGreedy: { + GenerateCharacterClassFunctor functor(&charClass, invert); + m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); + break; + } + + case Quantifier::Error: + return false; + } + + return true; +} + +bool WRECParser::parseBackreferenceQuantifier(JmpSrcVector& failures, unsigned subpatternId) +{ + Quantifier q = parseQuantifier(); + + switch (q.type) { + case Quantifier::None: { + m_generator.generateBackreference(failures, subpatternId); + break; + } + + case Quantifier::Greedy: + case Quantifier::NonGreedy: + m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); + return true; + + case Quantifier::Error: + return false; + } + + return true; +} + +bool WRECParser::parseParentheses(JmpSrcVector&) +{ + // FIXME: We don't currently backtrack correctly within parentheses in cases such as + // "c".match(/(.*)c/) so we fall back to PCRE for any regexp containing parentheses. + + m_err = TempError_unsupportedParentheses; + return false; +} + +bool WRECParser::parseCharacterClass(JmpSrcVector& failures) +{ + bool invert = false; + if (peek() == '^') { + consume(); + invert = true; + } + + CharacterClassConstructor charClassConstructor(m_ignoreCase); + + UChar ch; + while ((ch = peek()) != ']') { + switch (ch) { + case EndOfPattern: + m_err = Error_malformedCharacterClass; + return false; + + case '\\': + consume(); + switch (ch = peek()) { + case EndOfPattern: + m_err = Error_malformedEscape; + return false; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + charClassConstructor.put(consumeOctal()); + break; + + // ControlEscape + case 'b': + consume(); + charClassConstructor.put('\b'); + break; + case 'f': + consume(); + charClassConstructor.put('\f'); + break; + case 'n': + consume(); + charClassConstructor.put('\n'); + break; + case 'r': + consume(); + charClassConstructor.put('\r'); + break; + case 't': + consume(); + charClassConstructor.put('\t'); + break; + case 'v': + consume(); + charClassConstructor.put('\v'); + break; + + // ControlLetter + case 'c': { + consume(); + int control = consume(); + if (!isASCIIAlpha(control)) { + m_err = Error_malformedEscape; + return false; + } + charClassConstructor.put(control&31); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = consumeHex(2); + if (x == -1) { + m_err = Error_malformedEscape; + return false; + } + charClassConstructor.put(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int x = consumeHex(4); + if (x == -1) { + m_err = Error_malformedEscape; + return false; + } + charClassConstructor.put(x); + break; + } + + // CharacterClassEscape + case 'd': + consume(); + charClassConstructor.append(getCharacterClassDigits()); + break; + case 's': + consume(); + charClassConstructor.append(getCharacterClassSpaces()); + break; + case 'w': + consume(); + charClassConstructor.append(getCharacterClassWordchar()); + break; + case 'D': + consume(); + charClassConstructor.append(getCharacterClassNondigits()); + break; + case 'S': + consume(); + charClassConstructor.append(getCharacterClassNonspaces()); + break; + case 'W': + consume(); + charClassConstructor.append(getCharacterClassNonwordchar()); + break; + + case '-': + consume(); + charClassConstructor.flushBeforeEscapedHyphen(); + charClassConstructor.put(ch); + break; + + // IdentityEscape + default: { + // TODO: check this test for IdentifierPart. + int ch = consume(); + if (isASCIIAlphanumeric(ch) || (ch == '_')) { + m_err = Error_malformedEscape; + return false; + } + charClassConstructor.put(ch); + break; + } + } + break; + + default: + consume(); + charClassConstructor.put(ch); + } + } + consume(); + + // lazily catch reversed ranges ([z-a])in character classes + if (charClassConstructor.isUpsideDown()) { + m_err = Error_malformedCharacterClass; + return false; + } + + charClassConstructor.flush(); + CharacterClass charClass = charClassConstructor.charClass(); + return parseCharacterClassQuantifier(failures, charClass, invert); +} + +bool WRECParser::parseOctalEscape(JmpSrcVector& failures) +{ + return parsePatternCharacterQualifier(failures, consumeOctal()); +} + +bool WRECParser::parseEscape(JmpSrcVector& failures) +{ + switch (peek()) { + case EndOfPattern: + m_err = Error_malformedEscape; + return false; + + // Assertions + case 'b': + consume(); + m_generator.generateAssertionWordBoundary(failures, false); + return true; + + case 'B': + consume(); + m_generator.generateAssertionWordBoundary(failures, true); + return true; + + // Octal escape + case '0': + consume(); + return parseOctalEscape(failures); + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + // FIXME: This does not support forward references. It's not clear whether they + // should be valid. + unsigned value = peekDigit(); + if (value > m_numSubpatterns) + return parseOctalEscape(failures); + } + case '8': + case '9': { + unsigned value = peekDigit(); + if (value > m_numSubpatterns) { + consume(); + m_err = Error_malformedEscape; + return false; + } + consume(); + + while (peekIsDigit()) { + unsigned newValue = value * 10 + peekDigit(); + if (newValue > m_numSubpatterns) + break; + value = newValue; + consume(); + } + + parseBackreferenceQuantifier(failures, value); + return true; + } + + // ControlEscape + case 'f': + consume(); + return parsePatternCharacterQualifier(failures, '\f'); + case 'n': + consume(); + return parsePatternCharacterQualifier(failures, '\n'); + case 'r': + consume(); + return parsePatternCharacterQualifier(failures, '\r'); + case 't': + consume(); + return parsePatternCharacterQualifier(failures, '\t'); + case 'v': + consume(); + return parsePatternCharacterQualifier(failures, '\v'); + + // ControlLetter + case 'c': { + consume(); + int control = consume(); + if (!isASCIIAlpha(control)) { + m_err = Error_malformedEscape; + return false; + } + return parsePatternCharacterQualifier(failures, control&31); + } + + // HexEscape + case 'x': { + consume(); + int x = consumeHex(2); + if (x == -1) { + m_err = Error_malformedEscape; + return false; + } + return parsePatternCharacterQualifier(failures, x); + } + + // UnicodeEscape + case 'u': { + consume(); + int x = consumeHex(4); + if (x == -1) { + m_err = Error_malformedEscape; + return false; + } + return parsePatternCharacterQualifier(failures, x); + } + + // CharacterClassEscape + case 'd': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), false); + case 's': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), false); + case 'w': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), false); + case 'D': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassDigits(), true); + case 'S': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassSpaces(), true); + case 'W': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassWordchar(), true); + + // IdentityEscape + default: { + // TODO: check this test for IdentifierPart. + int ch = consume(); + if (isASCIIAlphanumeric(ch) || (ch == '_')) { + m_err = Error_malformedEscape; + return false; + } + return parsePatternCharacterQualifier(failures, ch); + } + } +} + +bool WRECParser::parseTerm(JmpSrcVector& failures) +{ + switch (peek()) { + case EndOfPattern: + case '*': + case '+': + case '?': + case ')': + case ']': + case '{': + case '}': + case '|': + // Not allowed in a Term! + return false; + + case '^': + consume(); + m_generator.generateAssertionBOL(failures); + return true; + + case '$': + consume(); + m_generator.generateAssertionEOL(failures); + return true; + + case '\\': + // b & B are also assertions... + consume(); + return parseEscape(failures); + + case '.': + consume(); + return parseCharacterClassQuantifier(failures, getCharacterClassNewline(), true); + + case '[': + // CharacterClass + consume(); + return parseCharacterClass(failures); + + case '(': + consume(); + return parseParentheses(failures); + + default: + // Anything else is a PatternCharacter + return parsePatternCharacterQualifier(failures, consume()); + } +} + +/* + interface req: CURR_POS is on stack (can be reloaded). +*/ +void WRECParser::parseDisjunction(JmpSrcVector& failures) +{ + parseAlternative(failures); + + if (peek() == '|') { + JmpSrcVector successes; + + do { + consume(); + + m_generator.generateDisjunction(successes, failures); + + parseAlternative(failures); + } while (peek() == '|'); + + m_generator.terminateDisjunction(successes); + } +} + +} // namespace JSC + +#endif // ENABLE(WREC) diff --git a/JavaScriptCore/wrec/WREC.h b/JavaScriptCore/wrec/WREC.h new file mode 100644 index 0000000..301bd3b --- /dev/null +++ b/JavaScriptCore/wrec/WREC.h @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef WREC_h +#define WREC_h + +#if ENABLE(WREC) + +#include "ustring.h" +#include <masm/X86Assembler.h> +#include <wtf/ASCIICType.h> +#include <wtf/Vector.h> + +#if COMPILER(GCC) +#define WREC_CALL __attribute__ ((regparm (3))) +#else +#define WREC_CALL +#endif + +namespace JSC { + + typedef int (*WRECFunction)(const UChar* input, unsigned start, unsigned length, int* output) WREC_CALL; + + class GenerateAtomFunctor; + struct CharacterClassRange; + struct CharacterClass; + + struct Quantifier { + enum Type { + None, + Greedy, + NonGreedy, + Error, + }; + + Quantifier() + : type(None) + { + } + + Quantifier(Type type, unsigned min = 0, unsigned max = noMaxSpecified) + : type(type) + , min(min) + , max(max) + { + } + + Type type; + + unsigned min; + unsigned max; + + static const unsigned noMaxSpecified = UINT_MAX; + }; + + class WRECParser; + + typedef Vector<X86Assembler::JmpSrc> JmpSrcVector; + + class WRECGenerator { + public: + WRECGenerator(WRECParser& parser, X86Assembler& jit) + : m_parser(parser) + , m_jit(jit) + { + } + + typedef X86Assembler::JmpSrc JmpSrc; + typedef X86Assembler::JmpDst JmpDst; + + // these regs setup by the params + static const X86Assembler::RegisterID inputRegister = X86::eax; + static const X86Assembler::RegisterID currentPositionRegister = X86::edx; + static const X86Assembler::RegisterID lengthRegister = X86::ecx; + static const X86Assembler::RegisterID currentValueRegister = X86::esi; + static const X86Assembler::RegisterID outputRegister = X86::edi; + static const X86Assembler::RegisterID quantifierCountRegister = X86::ebx; + + friend class GenerateAtomFunctor; + friend class GeneratePatternCharacterFunctor; + friend class GenerateCharacterClassFunctor; + friend class GenerateBackreferenceFunctor; + friend class GenerateParenthesesNonGreedyFunctor; + + void generateGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max); + void generateNonGreedyQuantifier(JmpSrcVector& failures, GenerateAtomFunctor& functor, unsigned min, unsigned max); + void generateBacktrack1(); + void generateBacktrackBackreference(unsigned subpatternId); + void generateCharacterClass(JmpSrcVector& failures, CharacterClass& charClass, bool invert); + void generateCharacterClassInverted(JmpSrcVector& failures, CharacterClass& charClass); + void generateCharacterClassInvertedRange(JmpSrcVector& failures, JmpSrcVector& matchDest, const CharacterClassRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount); + void generatePatternCharacter(JmpSrcVector& failures, int ch); + void generateAssertionWordBoundary(JmpSrcVector& failures, bool invert); + void generateAssertionBOL(JmpSrcVector& failures); + void generateAssertionEOL(JmpSrcVector& failures); + void generateBackreference(JmpSrcVector& failures, unsigned subpatternID); + void generateBackreferenceQuantifier(JmpSrcVector& failures, Quantifier::Type quantifierType, unsigned subpatternId, unsigned min, unsigned max); + enum ParenthesesType { capturing, non_capturing, assertion, inverted_assertion }; // order is relied on in generateParentheses() + JmpSrc generateParentheses(ParenthesesType type); + JmpSrc generateParenthesesResetTrampoline(JmpSrcVector& newFailures, unsigned subpatternIdBefore, unsigned subpatternIdAfter); + void generateParenthesesNonGreedy(JmpSrcVector& failures, JmpDst start, JmpSrc success, JmpSrc fail); + + void generateDisjunction(JmpSrcVector& successes, JmpSrcVector& failures); + void terminateDisjunction(JmpSrcVector& successes); + + private: + WRECParser& m_parser; + X86Assembler& m_jit; + }; + + class WRECParser { + public: + bool m_ignoreCase; + bool m_multiline; + unsigned m_numSubpatterns; + enum WRECError { + NoError, + Error_malformedCharacterClass, + Error_malformedParentheses, + Error_malformedPattern, + Error_malformedQuantifier, + Error_malformedEscape, + TempError_unsupportedQuantifier, + TempError_unsupportedParentheses, + } m_err; + + WRECParser(const UString& pattern, bool ignoreCase, bool multiline, X86Assembler& jit) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_numSubpatterns(0) + , m_err(NoError) + , m_generator(*this, jit) + , m_data(pattern.data()) + , m_size(pattern.size()) + , m_index(0) + { + } + + void parseAlternative(JmpSrcVector& failures) + { + while (parseTerm(failures)) { } + } + + void parseDisjunction(JmpSrcVector& failures); + + bool parseTerm(JmpSrcVector& failures); + bool parseEscape(JmpSrcVector& failures); + bool parseOctalEscape(JmpSrcVector& failures); + bool parseParentheses(JmpSrcVector& failures); + bool parseCharacterClass(JmpSrcVector& failures); + bool parseCharacterClassQuantifier(JmpSrcVector& failures, CharacterClass& charClass, bool invert); + bool parsePatternCharacterQualifier(JmpSrcVector& failures, int ch); + bool parseBackreferenceQuantifier(JmpSrcVector& failures, unsigned subpatternId); + + ALWAYS_INLINE Quantifier parseGreedyQuantifier(); + Quantifier parseQuantifier(); + + static const int EndOfPattern = -1; + + int peek() + { + if (m_index >= m_size) + return EndOfPattern; + return m_data[m_index]; + } + + int consume() + { + if (m_index >= m_size) + return EndOfPattern; + return m_data[m_index++]; + } + + bool peekIsDigit() + { + return WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + int n = consumeDigit(); + while (peekIsDigit()) { + n *= 10; + n += consumeDigit(); + } + return n; + } + + int consumeHex(int count) + { + int n = 0; + while (count--) { + if (!WTF::isASCIIHexDigit(peek())) + return -1; + n = (n<<4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + unsigned consumeOctal() + { + unsigned n = 0; + while (n < 32 && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + (consume() - '0'); + return n; + } + + bool isEndOfPattern() + { + return peek() != EndOfPattern; + } + + private: + WRECGenerator m_generator; + const UChar* m_data; + unsigned m_size; + unsigned m_index; + }; + +} // namespace JSC + +#endif // ENABLE(WREC) + +#endif // WREC_h |