diff options
Diffstat (limited to 'Source/JavaScriptCore/yarr/RegexPattern.h')
-rw-r--r-- | Source/JavaScriptCore/yarr/RegexPattern.h | 424 |
1 files changed, 424 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/yarr/RegexPattern.h b/Source/JavaScriptCore/yarr/RegexPattern.h new file mode 100644 index 0000000..6833dd6 --- /dev/null +++ b/Source/JavaScriptCore/yarr/RegexPattern.h @@ -0,0 +1,424 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexPattern_h +#define RegexPattern_h + +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + +#include <UString.h> + +namespace JSC { namespace Yarr { + +#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoBackReference 2 +#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1 +#define RegexStackSpaceForBackTrackInfoParentheses 2 + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClassTable : RefCounted<CharacterClassTable> { + const char* m_table; + bool m_inverted; + static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + +struct CharacterClass : FastAllocBase { + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr<CharacterClassTable> table) + : m_table(table) + { + } + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; + RefPtr<CharacterClassTable> m_table; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + } type; + bool m_capture :1; + bool m_invert :1; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned backReferenceSubpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + bool isTerminal; + } parentheses; + }; + QuantifierType quantityType; + unsigned quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + , m_capture(false) + , m_invert(false) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + parentheses.isTerminal = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , m_capture(false) + , m_invert(false) + { + backReferenceSubpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative : FastAllocBase { + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } + + Vector<PatternTerm> m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; +}; + +struct PatternDisjunction : FastAllocBase { + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + , m_hasFixedSize(false) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector<PatternAlternative*> m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood RegexPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct BeginChar { + BeginChar() + : value(0) + , mask(0) + {} + + BeginChar(unsigned value, unsigned mask) + : value(value) + , mask(mask) + {} + + unsigned value; + unsigned mask; +}; + +struct RegexPattern { + RegexPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); + + ~RegexPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + m_containsBackreferences = false; + m_containsBeginChars = false; + m_containsBOL = false; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + m_beginChars.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBeginChars : 1; + bool m_containsBOL : 1; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector<PatternDisjunction*, 4> m_disjunctions; + Vector<CharacterClass*> m_userCharacterClasses; + Vector<BeginChar> m_beginChars; + +private: + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif // RegexPattern_h |