diff options
Diffstat (limited to 'JavaScriptCore/wrec/CharacterClassConstructor.cpp')
-rw-r--r-- | JavaScriptCore/wrec/CharacterClassConstructor.cpp | 360 |
1 files changed, 0 insertions, 360 deletions
diff --git a/JavaScriptCore/wrec/CharacterClassConstructor.cpp b/JavaScriptCore/wrec/CharacterClassConstructor.cpp deleted file mode 100644 index 85eccaa..0000000 --- a/JavaScriptCore/wrec/CharacterClassConstructor.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (C) 2008 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "CharacterClassConstructor.h" - -#if ENABLE(WREC) - -#include "pcre_internal.h" -#include <wtf/ASCIICType.h> - -using namespace WTF; - -namespace JSC { - -static const UChar asciiNewlines[2] = { '\n', '\r' }; -static const UChar unicodeNewlines[2] = { 0x2028, 0x2029 }; -CharacterClass& getCharacterClassNewline() { - static CharacterClass charClass = { - asciiNewlines, 2, - 0, 0, - unicodeNewlines, 2, - 0, 0, - }; - - return charClass; -} - -static const CharacterClassRange asciiDigitsRange[1] = { { '0', '9' } }; -CharacterClass& getCharacterClassDigits() { - static CharacterClass charClass = { - 0, 0, - asciiDigitsRange, 1, - 0, 0, - 0, 0, - }; - - return charClass; -} - -static const UChar asciiSpaces[1] = { ' ' }; -static const CharacterClassRange asciiSpacesRange[1] = { { '\t', '\r' } }; -static const UChar unicodeSpaces[8] = { 0x00a0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 }; -static const CharacterClassRange unicodeSpacesRange[1] = { { 0x2000, 0x200a } }; -CharacterClass& getCharacterClassSpaces() { - static CharacterClass charClass = { - asciiSpaces, 1, - asciiSpacesRange, 1, - unicodeSpaces, 8, - unicodeSpacesRange, 1, - }; - - return charClass; -} - -static const UChar asciiWordchar[1] = { '_' }; -static const CharacterClassRange asciiWordcharRange[3] = { { '0', '9' }, { 'A', 'Z' }, { 'a', 'z' } }; -CharacterClass& getCharacterClassWordchar() { - static CharacterClass charClass = { - asciiWordchar, 1, - asciiWordcharRange, 3, - 0, 0, - 0, 0, - }; - - return charClass; -} - -static const CharacterClassRange asciiNondigitsRange[2] = { { 0, '0' - 1 }, { '9' + 1, 0x7f } }; -static const CharacterClassRange unicodeNondigitsRange[1] = { { 0x0080, 0xffff } }; -CharacterClass& getCharacterClassNondigits() { - static CharacterClass charClass = { - 0, 0, - asciiNondigitsRange, 2, - 0, 0, - unicodeNondigitsRange, 1, - }; - - return charClass; -} - -static const CharacterClassRange asciiNonspacesRange[3] = { { 0, '\t' - 1 }, { '\r' + 1, ' ' - 1 }, { ' ' + 1, 0x7f } }; -static const CharacterClassRange unicodeNonspacesRange[9] = { - { 0x0080, 0x009f }, - { 0x00a1, 0x167f }, - { 0x1681, 0x180d }, - { 0x180f, 0x1fff }, - { 0x200b, 0x2027 }, - { 0x202a, 0x202e }, - { 0x2030, 0x205e }, - { 0x2060, 0x2fff }, - { 0x3001, 0xffff } -}; -CharacterClass& getCharacterClassNonspaces() { - static CharacterClass charClass = { - 0, 0, - asciiNonspacesRange, 3, - 0, 0, - unicodeNonspacesRange, 9, - }; - - return charClass; -} - -static const UChar asciiNonwordchar[1] = { '`' }; -static const CharacterClassRange asciiNonwordcharRange[4] = { { 0, '0' - 1 }, { '9' + 1, 'A' - 1 }, { 'Z' + 1, '_' - 1 }, { 'z' + 1, 0x7f } }; -static const CharacterClassRange unicodeNonwordcharRange[1] = { { 0x0080, 0xffff } }; -CharacterClass& getCharacterClassNonwordchar() { - static CharacterClass charClass = { - asciiNonwordchar, 1, - asciiNonwordcharRange, 4, - 0, 0, - unicodeNonwordcharRange, 1, - }; - - return charClass; -} - -void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch) -{ - unsigned pos = 0; - unsigned range = matches.size(); - - // binary chop, find position to insert char. - while (range) { - unsigned index = range >> 1; - - int val = matches[pos+index] - ch; - if (!val) - return; - else if (val > 0) - range = index; - else { - pos += (index+1); - range -= (index+1); - } - } - - if (pos == matches.size()) - matches.append(ch); - else - matches.insert(pos, ch); -} - -void CharacterClassConstructor::addSortedRange(Vector<CharacterClassRange>& ranges, UChar lo, UChar hi) -{ - unsigned end = ranges.size(); - - // Simple linear scan - I doubt there are that many ranges anyway... - // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { - // does the new range fall before the current position in the array - if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. - if (hi == (ranges[i].begin - 1)) { - ranges[i].begin = lo; - return; - } - CharacterClassRange r = {lo, hi}; - ranges.insert(i, r); - return; - } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining - // If the new range start at or before the end of the last range, then the overlap (if it starts one after the - // end of the last range they concatenate, which is just as good. - if (lo <= (ranges[i].end + 1)) { - // found an intersect! we'll replace this entry in the array. - ranges[i].begin = std::min(ranges[i].begin, lo); - ranges[i].end = std::max(ranges[i].end, hi); - - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - - return; - } - } - - // Range comes after all existing ranges. - CharacterClassRange r = {lo, hi}; - ranges.append(r); -} - -void CharacterClassConstructor::put(UChar ch) -{ - // Parsing a regular expression like [a-z], we start in an initial empty state: - // ((m_charBuffer == -1) && !m_isPendingDash) - // When buffer the 'a' sice it may be (and is in this case) part of a range: - // ((m_charBuffer != -1) && !m_isPendingDash) - // Having parsed the hyphen we then record that the dash is also pending: - // ((m_charBuffer != -1) && m_isPendingDash) - // The next change will always take us back to the initial state - either because - // a complete range has been parsed (such as [a-z]), or because a flush is forced, - // due to an early end in the regexp ([a-]), or a character class escape being added - // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. - ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); - - if (m_charBuffer != -1) { - if (m_isPendingDash) { - // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. - UChar lo = m_charBuffer; - UChar hi = ch; - // Reset back to the inital state. - m_charBuffer = -1; - m_isPendingDash = false; - - // This is an error, detected lazily. Do not proceed. - if (lo > hi) { - m_isUpsideDown = true; - return; - } - - if (lo <= 0x7f) { - char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); - addSortedRange(m_ranges, lo, asciiHi); - - if (m_isCaseInsensitive) { - if ((asciiLo <= 'Z') && (asciiHi >= 'A')) - addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); - if ((asciiLo <= 'z') && (asciiHi >= 'a')) - addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); - } - } - if (hi >= 0x80) { - UChar unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - // we're going to scan along, updating the start of the range - while (unicodeCurr <= hi) { - // Spin forwards over any characters that don't have two cases. - for (; kjs_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { - // if this was the last character in the range, we're done. - if (unicodeCurr == hi) - return; - } - // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. - UChar rangeStart = unicodeCurr; - UChar otherCurr = kjs_pcre_ucp_othercase(unicodeCurr); - - // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, - // and if it's other case value is one greater then the othercase value for the current last - // character included in the range, we can include next into the range. - while ((unicodeCurr < hi) && (kjs_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { - // increment unicodeCurr; it points to the end of the range. - // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. - ++unicodeCurr; - ++otherCurr; - } - - // otherChar is the last in the range of other case chars, calculate offset to get back to the start. - addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); - - // unicodeCurr has been added, move on to the next char. - ++unicodeCurr; - } - } - } - } else if (ch == '-') - // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. - m_isPendingDash = true; - else { - // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. - flush(); - m_charBuffer = ch; - } - } else - // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character - // (the hyphen not treated as a special character in this case, same handling for any char). - m_charBuffer = ch; -} - -// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. -// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate -// array (either ascii or unicode). -// If the pattern is case insensitive we add entries for both cases. -void CharacterClassConstructor::flush() -{ - if (m_charBuffer != -1) { - if (m_charBuffer <= 0x7f) { - if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) - addSorted(m_matches, toASCIIUpper(m_charBuffer)); - addSorted(m_matches, m_charBuffer); - if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) - addSorted(m_matches, toASCIILower(m_charBuffer)); - } else { - addSorted(m_matchesUnicode, m_charBuffer); - if (m_isCaseInsensitive) { - int other = kjs_pcre_ucp_othercase(m_charBuffer); - if (other != -1) - addSorted(m_matchesUnicode, other); - } - } - m_charBuffer = -1; - } - - if (m_isPendingDash) { - addSorted(m_matches, '-'); - m_isPendingDash = false; - } -} - -void CharacterClassConstructor::append(CharacterClass& other) -{ - // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). - // Need to check the spec, really, but think this matches PCRE behaviour. - flush(); - - if (other.numMatches) { - for (size_t i = 0; i < other.numMatches; ++i) - addSorted(m_matches, other.matches[i]); - } - if (other.numRanges) { - for (size_t i = 0; i < other.numRanges; ++i) - addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); - } - if (other.numMatchesUnicode) { - for (size_t i = 0; i < other.numMatchesUnicode; ++i) - addSorted(m_matchesUnicode, other.matchesUnicode[i]); - } - if (other.numRangesUnicode) { - for (size_t i = 0; i < other.numRangesUnicode; ++i) - addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); - } -} - -} - -#endif // ENABLE(WREC) |