Merge WebKit at r72274: Initial merge by git.

Change-Id: Ie51f0b4a16da82942bd516dce59cfb79ebbe25fb
author: Russell Brenner <russellbrenner@google.com> 2010-11-18 17:33:13 -0800
committer: Russell Brenner <russellbrenner@google.com> 2010-12-02 13:47:21 -0800
commit: 6b70adc33054f8aee8c54d0f460458a9df11b8a5 (patch)
tree: 103a13998c33944d6ab3b8318c509a037e639460 /JavaScriptCore/yarr
parent: bdf4ebc8e70b2d221b6ee7a65660918ecb1d33aa (diff)
download: external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.zip
external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.gz
external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.bz2
6 files changed, 467 insertions, 55 deletions
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index 9f9e028..ccfc94e 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -235,11 +236,117 @@ private:
     Vector<CharacterRange> m_rangesUnicode;
 };
 
+struct BeginCharHelper {
+    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
+        : m_beginChars(beginChars)
+        , m_isCaseInsensitive(isCaseInsensitive)
+    {}
+
+    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
+    {
+        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
+            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
+            beginChar.value |= beginChar.value << 16;
+            beginChar.mask |= beginChar.mask << 16;
+            addCharacter(beginChar);
+        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
+            // In case of characters with fixed quantifier we should check the next character as well.
+            linkHotTerms(beginChar, hotTerms);
+        else
+            // In case of greedy matching the next character checking is unnecessary therefore we just store
+            // the first character.
+            addCharacter(beginChar);
+    }
+
+    // Merge two following BeginChars in the vector to reduce the number of character checks.
+    void merge(unsigned size)
+    {
+        for (unsigned i = 0; i < size; i++) {
+            BeginChar* curr = &m_beginChars->at(i);
+            BeginChar* next = &m_beginChars->at(i + 1);
+
+            // If the current and the next size of value is different we should skip the merge process
+            // because the 16bit and 32bit values are unmergable.
+            if (curr->value <= 0xFFFF && next->value > 0xFFFF)
+                continue;
+
+            unsigned diff = curr->value ^ next->value;
+
+            curr->mask |= diff;
+            curr->value |= curr->mask;
+
+            m_beginChars->remove(i + 1);
+            size--;
+        }
+    }
+
+private:
+    void addCharacter(BeginChar beginChar)
+    {
+        unsigned pos = 0;
+        unsigned range = m_beginChars->size();
+
+        // binary chop, find position to insert char.
+        while (range) {
+            unsigned index = range >> 1;
+
+            int val = m_beginChars->at(pos+index).value - beginChar.value;
+            if (!val)
+                return;
+            if (val < 0)
+                range = index;
+            else {
+                pos += (index+1);
+                range -= (index+1);
+            }
+        }
+
+        if (pos == m_beginChars->size())
+            m_beginChars->append(beginChar);
+        else
+            m_beginChars->insert(pos, beginChar);
+    }
+
+    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
+    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
+    {
+        for (unsigned i = 0; i < hotTerms->size(); i++) {
+            PatternTerm hotTerm = hotTerms->at(i).term;
+            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
+
+            UChar characterNext = hotTerm.patternCharacter;
+
+            // Append a character to an existing BeginChar object.
+            if (characterNext <= 0x7f) {
+                unsigned mask = 0;
+
+                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
+                    mask = 32;
+                    characterNext = toASCIILower(characterNext);
+                }
+
+                addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
+            } else {
+                UChar upper, lower;
+                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
+                    addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
+                    addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
+                } else
+                    addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
+            }
+        }
+    }
+
+    Vector<BeginChar>* m_beginChars;
+    bool m_isCaseInsensitive;
+};
+
 class RegexPatternConstructor {
 public:
     RegexPatternConstructor(RegexPattern& pattern)
         : m_pattern(pattern)
         , m_characterClassConstructor(pattern.m_ignoreCase)
+        , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
         , m_invertParentheticalAssertion(false)
     {
     }
@@ -649,12 +756,153 @@ public:
             loopDisjunction->m_alternatives.clear();
         }
     }
-    
-    
+
+    bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
+    {
+        if (term.quantityType == QuantifierFixedCount) {
+            beginTerms->append(TermChain(term));
+            if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
+                setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
+        } else if (termIndex != numTerms - 1) {
+            beginTerms->append(TermChain(term));
+            return true;
+        }
+
+        return false;
+    }
+
+    // This function collects the terms which are potentially matching the first number of depth characters in the result.
+    // If this function returns false then it found at least one term which makes the beginning character
+    // look-up optimization inefficient.
+    bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
+    {
+        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+            PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+            if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
+                return false;
+        }
+
+        return true;
+    }
+
+    bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
+    {
+        bool checkNext = true;
+        unsigned numTerms = alternative->m_terms.size();
+
+        while (checkNext && termIndex < numTerms) {
+            PatternTerm term = alternative->m_terms[termIndex];
+            checkNext = false;
+
+            switch (term.type) {
+            case PatternTerm::TypeAssertionBOL:
+            case PatternTerm::TypeAssertionEOL:
+            case PatternTerm::TypeAssertionWordBoundary:
+                return false;
+
+            case PatternTerm::TypeBackReference:
+            case PatternTerm::TypeForwardReference:
+                return false;
+
+            case PatternTerm::TypePatternCharacter:
+                if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
+                    termIndex++;
+                    checkNext = true;
+                }
+                break;
+
+            case PatternTerm::TypeCharacterClass:
+                return false;
+
+            case PatternTerm::TypeParentheticalAssertion:
+                if (term.invertOrCapture)
+                    return false;
+
+            case PatternTerm::TypeParenthesesSubpattern:
+                if (term.quantityType != QuantifierFixedCount) {
+                    if (termIndex == numTerms - 1)
+                        break;
+
+                    termIndex++;
+                    checkNext = true;
+
+                }
+
+                if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
+                    return false;
+
+                break;
+            }
+        }
+
+        return true;
+    }
+
+    void setupBeginChars()
+    {
+        Vector<TermChain> beginTerms;
+        bool containsFixedCharacter = false;
+
+        if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
+                && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
+            unsigned size = beginTerms.size();
+
+            // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
+            if (!size)
+                return;
+
+            m_pattern.m_containsBeginChars = true;
+
+            for (unsigned i = 0; i < size; i++) {
+                PatternTerm term = beginTerms[i].term;
+
+                // We have just collected PatternCharacter terms, other terms are not allowed.
+                ASSERT(term.type == PatternTerm::TypePatternCharacter);
+
+                if (term.quantityType == QuantifierFixedCount)
+                    containsFixedCharacter = true;
+
+                UChar character = term.patternCharacter;
+                unsigned mask = 0;
+
+                if (character <= 0x7f) {
+                    if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
+                        mask = 32;
+                        character = toASCIILower(character);
+                    }
+
+                    m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                } else {
+                    UChar upper, lower;
+                    if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
+                        m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                        m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                    } else
+                        m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                }
+            }
+
+            // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
+            if (!containsFixedCharacter) {
+                m_pattern.m_containsBeginChars = false;
+                return;
+            }
+
+            size = m_pattern.m_beginChars.size();
+
+            if (size > 2)
+                m_beginCharHelper.merge(size - 1);
+            else if (size <= 1)
+                m_pattern.m_containsBeginChars = false;
+        }
+    }
+
 private:
     RegexPattern& m_pattern;
     PatternAlternative* m_alternative;
     CharacterClassConstructor m_characterClassConstructor;
+    BeginCharHelper m_beginCharHelper;
     bool m_invertCharacterClass;
     bool m_invertParentheticalAssertion;
 };
@@ -687,6 +935,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
     constructor.optimizeBOL();
         
     constructor.setupOffsets();
+    constructor.setupBeginChars();
 
     return 0;
 };
diff --git a/JavaScriptCore/yarr/RegexInterpreter.cpp b/JavaScriptCore/yarr/RegexInterpreter.cpp
index ec96636..dc3024a 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.cpp
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -195,6 +196,12 @@ public:
             return -1;
         }
 
+        int readPair()
+        {
+            ASSERT(pos + 1 < length);
+            return input[pos] | input[pos + 1] << 16;
+        }
+
         int readChecked(int position)
         {
             ASSERT(position < 0);
@@ -262,6 +269,11 @@ public:
             return (pos + position) == length;
         }
 
+        bool isNotAvailableInput(int position)
+        {
+            return (pos + position) > length;
+        }
+
     private:
         const UChar* input;
         unsigned pos;
@@ -591,20 +603,24 @@ public:
         unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
         context->restoreOutput(output, firstSubpatternId, count);
     }
-    bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+    JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
     {
         while (backTrack->matchAmount) {
             ParenthesesDisjunctionContext* context = backTrack->lastContext;
 
-            if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
-                return true;
+            JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+            if (result == JSRegExpMatch)
+                return JSRegExpMatch;
 
             resetMatches(term, context);
             popParenthesesDisjunctionContext(backTrack);
             freeParenthesesDisjunctionContext(context);
+
+            if (result != JSRegExpNoMatch)
+                return result;
         }
 
-        return false;
+        return JSRegExpNoMatch;
     }
 
     bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
@@ -765,7 +781,7 @@ public:
         return false;
     }
 
-    bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
+    JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
     {
         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
 
@@ -786,31 +802,42 @@ public:
             while (backTrack->matchAmount < term.atom.quantityCount) {
                 // Try to do a match, and it it succeeds, add it to the list.
                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
-                if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+                JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch)
                     appendParenthesesDisjunctionContext(backTrack, context);
                 else {
                     // The match failed; try to find an alternate point to carry on from.
                     resetMatches(term, context);
                     freeParenthesesDisjunctionContext(context);
-                    if (!parenthesesDoBacktrack(term, backTrack))
-                        return false;
+
+                    if (result == JSRegExpNoMatch) {
+                        JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+                        if (backtrackResult != JSRegExpMatch)
+                            return backtrackResult;
+                    } else
+                        return result;
                 }
             }
 
             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
             ParenthesesDisjunctionContext* context = backTrack->lastContext;
             recordParenthesesMatch(term, context);
-            return true;
+            return JSRegExpMatch;
         }
 
         case QuantifierGreedy: {
             while (backTrack->matchAmount < term.atom.quantityCount) {
                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
-                if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch)
                     appendParenthesesDisjunctionContext(backTrack, context);
                 else {
                     resetMatches(term, context);
                     freeParenthesesDisjunctionContext(context);
+
+                    if (result != JSRegExpNoMatch)
+                        return result;
+
                     break;
                 }
             }
@@ -819,15 +846,15 @@ public:
                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
                 recordParenthesesMatch(term, context);
             }
-            return true;
+            return JSRegExpMatch;
         }
 
         case QuantifierNonGreedy:
-            return true;
+            return JSRegExpMatch;
         }
 
         ASSERT_NOT_REACHED();
-        return false;
+        return JSRegExpErrorNoMatch;
     }
 
     // Rules for backtracking differ depending on whether this is greedy or non-greedy.
@@ -840,7 +867,7 @@ public:
     // Non-greedy, we've already done the one less case, so don't match on popping.
     // We haven't done the one more case, so always try to add that.
     //
-    bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+    JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
     {
         ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
 
@@ -859,44 +886,58 @@ public:
             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
 
             ParenthesesDisjunctionContext* context = 0;
+            JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
 
-            if (!parenthesesDoBacktrack(term, backTrack))
-                return false;
+            if (result != JSRegExpMatch)
+                return result;
 
             // While we haven't yet reached our fixed limit,
             while (backTrack->matchAmount < term.atom.quantityCount) {
                 // Try to do a match, and it it succeeds, add it to the list.
                 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
-                if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+                result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+                if (result == JSRegExpMatch)
                     appendParenthesesDisjunctionContext(backTrack, context);
                 else {
                     // The match failed; try to find an alternate point to carry on from.
                     resetMatches(term, context);
                     freeParenthesesDisjunctionContext(context);
-                    if (!parenthesesDoBacktrack(term, backTrack))
-                        return false;
+
+                    if (result == JSRegExpNoMatch) {
+                        JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+                        if (backtrackResult != JSRegExpMatch)
+                            return backtrackResult;
+                    } else
+                        return result;
                 }
             }
 
             ASSERT(backTrack->matchAmount == term.atom.quantityCount);
             context = backTrack->lastContext;
             recordParenthesesMatch(term, context);
-            return true;
+            return JSRegExpMatch;
         }
 
         case QuantifierGreedy: {
             if (!backTrack->matchAmount)
-                return false;
+                return JSRegExpNoMatch;
 
             ParenthesesDisjunctionContext* context = backTrack->lastContext;
-            if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+            JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+            if (result == JSRegExpMatch) {
                 while (backTrack->matchAmount < term.atom.quantityCount) {
                     ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
-                    if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+                    JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                    if (parenthesesResult == JSRegExpMatch)
                         appendParenthesesDisjunctionContext(backTrack, context);
                     else {
                         resetMatches(term, context);
                         freeParenthesesDisjunctionContext(context);
+
+                        if (parenthesesResult != JSRegExpNoMatch)
+                            return parenthesesResult;
+
                         break;
                     }
                 }
@@ -904,63 +945,108 @@ public:
                 resetMatches(term, context);
                 popParenthesesDisjunctionContext(backTrack);
                 freeParenthesesDisjunctionContext(context);
+
+                if (result != JSRegExpNoMatch)
+                    return result;
             }
 
             if (backTrack->matchAmount) {
                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
                 recordParenthesesMatch(term, context);
             }
-            return true;
+            return JSRegExpMatch;
         }
 
         case QuantifierNonGreedy: {
             // If we've not reached the limit, try to add one more match.
             if (backTrack->matchAmount < term.atom.quantityCount) {
                 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
-                if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+                if (result == JSRegExpMatch) {
                     appendParenthesesDisjunctionContext(backTrack, context);
                     recordParenthesesMatch(term, context);
-                    return true;
-                } else {
-                    resetMatches(term, context);
-                    freeParenthesesDisjunctionContext(context);
+                    return JSRegExpMatch;
                 }
+
+                resetMatches(term, context);
+                freeParenthesesDisjunctionContext(context);
+
+                if (result != JSRegExpNoMatch)
+                    return result;
             }
 
             // Nope - okay backtrack looking for an alternative.
             while (backTrack->matchAmount) {
                 ParenthesesDisjunctionContext* context = backTrack->lastContext;
-                if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+                JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+                if (result == JSRegExpMatch) {
                     // successful backtrack! we're back in the game!
                     if (backTrack->matchAmount) {
                         context = backTrack->lastContext;
                         recordParenthesesMatch(term, context);
                     }
-                    return true;
+                    return JSRegExpMatch;
                 }
 
                 // pop a match off the stack
                 resetMatches(term, context);
                 popParenthesesDisjunctionContext(backTrack);
                 freeParenthesesDisjunctionContext(context);
+
+                return result;
             }
 
-            return false;
+            return JSRegExpNoMatch;
         }
         }
 
         ASSERT_NOT_REACHED();
-        return false;
+        return JSRegExpErrorNoMatch;
+    }
+
+    void lookupForBeginChars()
+    {
+        int character;
+        bool firstSingleCharFound;
+
+        while (true) {
+            if (input.isNotAvailableInput(2))
+                return;
+
+            firstSingleCharFound = false;
+
+            character = input.readPair();
+
+            for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
+                BeginChar bc = pattern->m_beginChars[i];
+
+                if (!firstSingleCharFound && bc.value <= 0xFFFF) {
+                    firstSingleCharFound = true;
+                    character &= 0xFFFF;
+                }
+
+                if ((character | bc.mask) == bc.value)
+                    return;
+            }
+
+            input.next();
+        }
     }
 
 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
 #define BACKTRACK() { --context->term; goto backtrack; }
 #define currentTerm() (disjunction->terms[context->term])
-    bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
     {
+        if (!--remainingMatchCount)
+            return JSRegExpErrorHitLimit;
+
         if (btrack)
             BACKTRACK();
 
+        if (pattern->m_containsBeginChars && isBody)
+            lookupForBeginChars();
+
         context->matchBegin = input.getPos();
         context->term = 0;
 
@@ -972,14 +1058,14 @@ public:
             MATCH_NEXT();
         case ByteTerm::TypeSubpatternEnd:
             context->matchEnd = input.getPos();
-            return true;
+            return JSRegExpMatch;
 
         case ByteTerm::TypeBodyAlternativeBegin:
             MATCH_NEXT();
         case ByteTerm::TypeBodyAlternativeDisjunction:
         case ByteTerm::TypeBodyAlternativeEnd:
             context->matchEnd = input.getPos();
-            return true;
+            return JSRegExpMatch;
 
         case ByteTerm::TypeAlternativeBegin:
             MATCH_NEXT();
@@ -1069,10 +1155,16 @@ public:
             if (matchBackReference(currentTerm(), context))
                 MATCH_NEXT();
             BACKTRACK();
-        case ByteTerm::TypeParenthesesSubpattern:
-            if (matchParentheses(currentTerm(), context))
+        case ByteTerm::TypeParenthesesSubpattern: {
+            JSRegExpResult result = matchParentheses(currentTerm(), context);
+
+            if (result == JSRegExpMatch) {
                 MATCH_NEXT();
+            }  else if (result != JSRegExpNoMatch)
+                return result;
+
             BACKTRACK();
+        }
         case ByteTerm::TypeParenthesesSubpatternOnceBegin:
             if (matchParenthesesOnceBegin(currentTerm(), context))
                 MATCH_NEXT();
@@ -1104,7 +1196,7 @@ public:
 
         switch (currentTerm().type) {
         case ByteTerm::TypeSubpatternBegin:
-            return false;
+            return JSRegExpNoMatch;
         case ByteTerm::TypeSubpatternEnd:
             ASSERT_NOT_REACHED();
 
@@ -1116,9 +1208,13 @@ public:
                 MATCH_NEXT();
 
             if (input.atEnd())
-                return false;
+                return JSRegExpNoMatch;
 
             input.next();
+
+            if (pattern->m_containsBeginChars && isBody)
+                lookupForBeginChars();
+
             context->matchBegin = input.getPos();
 
             if (currentTerm().alternative.onceThrough)
@@ -1172,10 +1268,16 @@ public:
                 if (backtrackBackReference(currentTerm(), context))
                     MATCH_NEXT();
                 BACKTRACK();
-            case ByteTerm::TypeParenthesesSubpattern:
-                if (backtrackParentheses(currentTerm(), context))
+            case ByteTerm::TypeParenthesesSubpattern: {
+                JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+
+                if (result == JSRegExpMatch) {
                     MATCH_NEXT();
+                } else if (result != JSRegExpNoMatch)
+                    return result;
+
                 BACKTRACK();
+            }
             case ByteTerm::TypeParenthesesSubpatternOnceBegin:
                 if (backtrackParenthesesOnceBegin(currentTerm(), context))
                     MATCH_NEXT();
@@ -1199,20 +1301,23 @@ public:
         }
 
         ASSERT_NOT_REACHED();
-        return false;
+        return JSRegExpErrorNoMatch;
     }
 
-    bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+    JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
     {
-        if (matchDisjunction(disjunction, context, btrack)) {
+        JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
+
+        if (result == JSRegExpMatch) {
             while (context->matchBegin == context->matchEnd) {
-                if (!matchDisjunction(disjunction, context, true))
-                    return false;
+                result = matchDisjunction(disjunction, context, true);
+                if (result != JSRegExpMatch)
+                    return result;
             }
-            return true;
+            return JSRegExpMatch;
         }
 
-        return false;
+        return result;
     }
 
     int interpret()
@@ -1226,7 +1331,8 @@ public:
 
         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
 
-        if (matchDisjunction(pattern->m_body.get(), context)) {
+        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
+        if (result == JSRegExpMatch) {
             output[0] = context->matchBegin;
             output[1] = context->matchEnd;
         }
@@ -1235,6 +1341,9 @@ public:
 
         pattern->m_allocator->stopAllocator();
 
+        if (output[0] == -1 && result != JSRegExpNoMatch)
+            return result;
+
         return output[0];
     }
 
@@ -1243,6 +1352,7 @@ public:
         , output(output)
         , input(inputChar, start, length)
         , allocatorPool(0)
+        , remainingMatchCount(matchLimit)
     {
     }
 
@@ -1251,6 +1361,7 @@ private:
     int* output;
     InputStream input;
     BumpPointerPool* allocatorPool;
+    unsigned remainingMatchCount;
 };
 
 
diff --git a/JavaScriptCore/yarr/RegexInterpreter.h b/JavaScriptCore/yarr/RegexInterpreter.h
index 37f17fe..dae8f9d 100644
--- a/JavaScriptCore/yarr/RegexInterpreter.h
+++ b/JavaScriptCore/yarr/RegexInterpreter.h
@@ -40,6 +40,21 @@ using WTF::BumpPointerAllocator;
 
 namespace JSC { namespace Yarr {
 
+// TODO move the matchLimit constant and the JSRegExpResult enum to the JSRegExp.h when pcre is removed.
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+enum JSRegExpResult {
+    JSRegExpMatch = 1,
+    JSRegExpNoMatch = 0,
+    JSRegExpErrorNoMatch = -1,
+    JSRegExpErrorHitLimit = -2,
+    JSRegExpErrorNoMemory = -3,
+    JSRegExpErrorInternal = -4
+};
+
 class ByteDisjunction;
 
 struct ByteTerm {
@@ -309,6 +324,7 @@ struct BytecodePattern : FastAllocBase {
         : m_body(body)
         , m_ignoreCase(pattern.m_ignoreCase)
         , m_multiline(pattern.m_multiline)
+        , m_containsBeginChars(pattern.m_containsBeginChars)
         , m_allocator(allocator)
     {
         newlineCharacterClass = pattern.newlineCharacterClass();
@@ -320,6 +336,8 @@ struct BytecodePattern : FastAllocBase {
         // array, so that it won't delete them on destruction.  We'll
         // take responsibility for that.
         pattern.m_userCharacterClasses.clear();
+
+        m_beginChars.append(pattern.m_beginChars);
     }
 
     ~BytecodePattern()
@@ -331,12 +349,16 @@ struct BytecodePattern : FastAllocBase {
     OwnPtr<ByteDisjunction> m_body;
     bool m_ignoreCase;
     bool m_multiline;
+    bool m_containsBeginChars;
     // Each BytecodePattern is associated with a RegExp, each RegExp is associated
     // with a JSGlobalData.  Cache a pointer to out JSGlobalData's m_regexAllocator.
     BumpPointerAllocator* m_allocator;
 
     CharacterClass* newlineCharacterClass;
     CharacterClass* wordcharCharacterClass;
+
+    Vector<BeginChar> m_beginChars;
+
 private:
     Vector<ByteDisjunction*> m_allParenthesesInfo;
     Vector<CharacterClass*> m_userCharacterClasses;
diff --git a/JavaScriptCore/yarr/RegexJIT.h b/JavaScriptCore/yarr/RegexJIT.h
index 5d587b2..02e905d 100644
--- a/JavaScriptCore/yarr/RegexJIT.h
+++ b/JavaScriptCore/yarr/RegexJIT.h
@@ -30,9 +30,9 @@
 
 #include "MacroAssembler.h"
 #include "RegexPattern.h"
-#include <UString.h>
+#include "UString.h"
 
-#include <pcre.h>
+#include "pcre.h"
 struct JSRegExp; // temporary, remove when fallback is removed.
 
 #if CPU(X86) && !COMPILER(MSVC)
diff --git a/JavaScriptCore/yarr/RegexParser.h b/JavaScriptCore/yarr/RegexParser.h
index 8a5eeba..ede9417 100644
--- a/JavaScriptCore/yarr/RegexParser.h
+++ b/JavaScriptCore/yarr/RegexParser.h
@@ -28,7 +28,7 @@
 
 #if ENABLE(YARR)
 
-#include <UString.h>
+#include "UString.h"
 #include <limits.h>
 #include <wtf/ASCIICType.h>
 #include <wtf/unicode/Unicode.h>
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index eecbd43..be31fcd 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -283,11 +284,36 @@ CharacterClass* nondigitsCreate();
 CharacterClass* nonspacesCreate();
 CharacterClass* nonwordcharCreate();
 
+struct TermChain {
+    TermChain(PatternTerm term)
+        : term(term)
+    {}
+
+    PatternTerm term;
+    Vector<TermChain> hotTerms;
+};
+
+struct BeginChar {
+    BeginChar()
+        : value(0)
+        , mask(0)
+    {}
+
+    BeginChar(unsigned value, unsigned mask)
+        : value(value)
+        , mask(mask)
+    {}
+
+    unsigned value;
+    unsigned mask;
+};
+
 struct RegexPattern {
     RegexPattern(bool ignoreCase, bool multiline)
         : m_ignoreCase(ignoreCase)
         , m_multiline(multiline)
         , m_containsBackreferences(false)
+        , m_containsBeginChars(false)
         , m_containsBOL(false)
         , m_numSubpatterns(0)
         , m_maxBackReference(0)
@@ -313,6 +339,7 @@ struct RegexPattern {
         m_maxBackReference = 0;
 
         m_containsBackreferences = false;
+        m_containsBeginChars = false;
         m_containsBOL = false;
 
         newlineCached = 0;
@@ -327,6 +354,7 @@ struct RegexPattern {
         m_disjunctions.clear();
         deleteAllValues(m_userCharacterClasses);
         m_userCharacterClasses.clear();
+        m_beginChars.clear();
     }
 
     bool containsIllegalBackReference()
@@ -380,12 +408,14 @@ struct RegexPattern {
     bool m_ignoreCase : 1;
     bool m_multiline : 1;
     bool m_containsBackreferences : 1;
+    bool m_containsBeginChars : 1;
     bool m_containsBOL : 1;
     unsigned m_numSubpatterns;
     unsigned m_maxBackReference;
     PatternDisjunction* m_body;
     Vector<PatternDisjunction*, 4> m_disjunctions;
     Vector<CharacterClass*> m_userCharacterClasses;
+    Vector<BeginChar> m_beginChars;
 
 private:
     CharacterClass* newlineCached;
author	Russell Brenner <russellbrenner@google.com>	2010-11-18 17:33:13 -0800
committer	Russell Brenner <russellbrenner@google.com>	2010-12-02 13:47:21 -0800
commit	6b70adc33054f8aee8c54d0f460458a9df11b8a5 (patch)
tree	103a13998c33944d6ab3b8318c509a037e639460 /JavaScriptCore/yarr
parent	bdf4ebc8e70b2d221b6ee7a65660918ecb1d33aa (diff)
download	external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.zip external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.gz external_webkit-6b70adc33054f8aee8c54d0f460458a9df11b8a5.tar.bz2