/* * Copyright (C) 2009 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef RegexPattern_h #define RegexPattern_h #include #include namespace JSC { namespace Yarr { #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. #define RegexStackSpaceForBackTrackInfoBackReference 2 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. #define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 #define RegexStackSpaceForBackTrackInfoParentheses 4 struct PatternDisjunction; struct CharacterRange { UChar begin; UChar end; CharacterRange(UChar begin, UChar end) : begin(begin) , end(end) { } }; struct CharacterClassTable : RefCounted { const char* m_table; bool m_inverted; static PassRefPtr create(const char* table, bool inverted) { return adoptRef(new CharacterClassTable(table, inverted)); } private: CharacterClassTable(const char* table, bool inverted) : m_table(table) , m_inverted(inverted) { } }; struct CharacterClass : FastAllocBase { // All CharacterClass instances have to have the full set of matches and ranges, // they may have an optional table for faster lookups (which must match the // specified matches and ranges) CharacterClass(PassRefPtr table) : m_table(table) { } Vector m_matches; Vector m_ranges; Vector m_matchesUnicode; Vector m_rangesUnicode; RefPtr m_table; }; enum QuantifierType { QuantifierFixedCount, QuantifierGreedy, QuantifierNonGreedy, }; struct PatternTerm { enum Type { TypeAssertionBOL, TypeAssertionEOL, TypeAssertionWordBoundary, TypePatternCharacter, TypeCharacterClass, TypeBackReference, TypeForwardReference, TypeParenthesesSubpattern, TypeParentheticalAssertion, } type; bool invertOrCapture; union { UChar patternCharacter; CharacterClass* characterClass; unsigned backReferenceSubpatternId; struct { PatternDisjunction* disjunction; unsigned subpatternId; unsigned lastSubpatternId; bool isCopy; bool isTerminal; } parentheses; }; QuantifierType quantityType; unsigned quantityCount; int inputPosition; unsigned frameLocation; PatternTerm(UChar ch) : type(PatternTerm::TypePatternCharacter) { patternCharacter = ch; quantityType = QuantifierFixedCount; quantityCount = 1; } PatternTerm(CharacterClass* charClass, bool invert) : type(PatternTerm::TypeCharacterClass) , invertOrCapture(invert) { characterClass = charClass; quantityType = QuantifierFixedCount; quantityCount = 1; } PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture) : type(type) , invertOrCapture(invertOrCapture) { parentheses.disjunction = disjunction; parentheses.subpatternId = subpatternId; parentheses.isCopy = false; parentheses.isTerminal = false; quantityType = QuantifierFixedCount; quantityCount = 1; } PatternTerm(Type type, bool invert = false) : type(type) , invertOrCapture(invert) { quantityType = QuantifierFixedCount; quantityCount = 1; } PatternTerm(unsigned spatternId) : type(TypeBackReference) , invertOrCapture(false) { backReferenceSubpatternId = spatternId; quantityType = QuantifierFixedCount; quantityCount = 1; } static PatternTerm ForwardReference() { return PatternTerm(TypeForwardReference); } static PatternTerm BOL() { return PatternTerm(TypeAssertionBOL); } static PatternTerm EOL() { return PatternTerm(TypeAssertionEOL); } static PatternTerm WordBoundary(bool invert) { return PatternTerm(TypeAssertionWordBoundary, invert); } bool invert() { return invertOrCapture; } bool capture() { return invertOrCapture; } void quantify(unsigned count, QuantifierType type) { quantityCount = count; quantityType = type; } }; struct PatternAlternative : FastAllocBase { PatternAlternative(PatternDisjunction* disjunction) : m_parent(disjunction) , m_onceThrough(false) , m_hasFixedSize(false) , m_startsWithBOL(false) , m_containsBOL(false) { } PatternTerm& lastTerm() { ASSERT(m_terms.size()); return m_terms[m_terms.size() - 1]; } void removeLastTerm() { ASSERT(m_terms.size()); m_terms.shrink(m_terms.size() - 1); } void setOnceThrough() { m_onceThrough = true; } bool onceThrough() { return m_onceThrough; } Vector m_terms; PatternDisjunction* m_parent; unsigned m_minimumSize; bool m_onceThrough : 1; bool m_hasFixedSize : 1; bool m_startsWithBOL : 1; bool m_containsBOL : 1; }; struct PatternDisjunction : FastAllocBase { PatternDisjunction(PatternAlternative* parent = 0) : m_parent(parent) , m_hasFixedSize(false) { } ~PatternDisjunction() { deleteAllValues(m_alternatives); } PatternAlternative* addNewAlternative() { PatternAlternative* alternative = new PatternAlternative(this); m_alternatives.append(alternative); return alternative; } Vector m_alternatives; PatternAlternative* m_parent; unsigned m_minimumSize; unsigned m_callFrameSize; bool m_hasFixedSize; }; // You probably don't want to be calling these functions directly // (please to be calling newlineCharacterClass() et al on your // friendly neighborhood RegexPattern instance to get nicely // cached copies). CharacterClass* newlineCreate(); CharacterClass* digitsCreate(); CharacterClass* spacesCreate(); CharacterClass* wordcharCreate(); CharacterClass* nondigitsCreate(); CharacterClass* nonspacesCreate(); CharacterClass* nonwordcharCreate(); struct TermChain { TermChain(PatternTerm term) : term(term) {} PatternTerm term; Vector hotTerms; }; struct BeginChar { BeginChar() : value(0) , mask(0) {} BeginChar(unsigned value, unsigned mask) : value(value) , mask(mask) {} unsigned value; unsigned mask; }; struct RegexPattern { RegexPattern(bool ignoreCase, bool multiline) : m_ignoreCase(ignoreCase) , m_multiline(multiline) , m_containsBackreferences(false) , m_containsBeginChars(false) , m_containsBOL(false) , m_numSubpatterns(0) , m_maxBackReference(0) , newlineCached(0) , digitsCached(0) , spacesCached(0) , wordcharCached(0) , nondigitsCached(0) , nonspacesCached(0) , nonwordcharCached(0) { } ~RegexPattern() { deleteAllValues(m_disjunctions); deleteAllValues(m_userCharacterClasses); } void reset() { m_numSubpatterns = 0; m_maxBackReference = 0; m_containsBackreferences = false; m_containsBeginChars = false; m_containsBOL = false; newlineCached = 0; digitsCached = 0; spacesCached = 0; wordcharCached = 0; nondigitsCached = 0; nonspacesCached = 0; nonwordcharCached = 0; deleteAllValues(m_disjunctions); m_disjunctions.clear(); deleteAllValues(m_userCharacterClasses); m_userCharacterClasses.clear(); m_beginChars.clear(); } bool containsIllegalBackReference() { return m_maxBackReference > m_numSubpatterns; } CharacterClass* newlineCharacterClass() { if (!newlineCached) m_userCharacterClasses.append(newlineCached = newlineCreate()); return newlineCached; } CharacterClass* digitsCharacterClass() { if (!digitsCached) m_userCharacterClasses.append(digitsCached = digitsCreate()); return digitsCached; } CharacterClass* spacesCharacterClass() { if (!spacesCached) m_userCharacterClasses.append(spacesCached = spacesCreate()); return spacesCached; } CharacterClass* wordcharCharacterClass() { if (!wordcharCached) m_userCharacterClasses.append(wordcharCached = wordcharCreate()); return wordcharCached; } CharacterClass* nondigitsCharacterClass() { if (!nondigitsCached) m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); return nondigitsCached; } CharacterClass* nonspacesCharacterClass() { if (!nonspacesCached) m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); return nonspacesCached; } CharacterClass* nonwordcharCharacterClass() { if (!nonwordcharCached) m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); return nonwordcharCached; } bool m_ignoreCase : 1; bool m_multiline : 1; bool m_containsBackreferences : 1; bool m_containsBeginChars : 1; bool m_containsBOL : 1; unsigned m_numSubpatterns; unsigned m_maxBackReference; PatternDisjunction* m_body; Vector m_disjunctions; Vector m_userCharacterClasses; Vector m_beginChars; private: CharacterClass* newlineCached; CharacterClass* digitsCached; CharacterClass* spacesCached; CharacterClass* wordcharCached; CharacterClass* nondigitsCached; CharacterClass* nonspacesCached; CharacterClass* nonwordcharCached; }; } } // namespace JSC::Yarr #endif // RegexPattern_h