diff options
author | Steve Block <steveblock@google.com> | 2011-05-13 06:44:40 -0700 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2011-05-13 06:44:40 -0700 |
commit | 08014c20784f3db5df3a89b73cce46037b77eb59 (patch) | |
tree | 47749210d31e19e6e2f64036fa8fae2ad693476f /Source/JavaScriptCore/pcre/pcre_exec.cpp | |
parent | 860220379e56aeb66424861ad602b07ee22b4055 (diff) | |
parent | 4c3661f7918f8b3f139f824efb7855bedccb4c94 (diff) | |
download | external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.zip external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.tar.gz external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.tar.bz2 |
Merge changes Ide388898,Ic49f367c,I1158a808,Iacb6ca5d,I2100dd3a,I5c1abe54,Ib0ef9902,I31dbc523,I570314b3
* changes:
Merge WebKit at r75315: Update WebKit version
Merge WebKit at r75315: Add FrameLoaderClient PageCache stubs
Merge WebKit at r75315: Stub out AXObjectCache::remove()
Merge WebKit at r75315: Fix ImageBuffer
Merge WebKit at r75315: Fix PluginData::initPlugins()
Merge WebKit at r75315: Fix conflicts
Merge WebKit at r75315: Fix Makefiles
Merge WebKit at r75315: Move Android-specific WebCore files to Source
Merge WebKit at r75315: Initial merge by git.
Diffstat (limited to 'Source/JavaScriptCore/pcre/pcre_exec.cpp')
-rw-r--r-- | Source/JavaScriptCore/pcre/pcre_exec.cpp | 2177 |
1 files changed, 2177 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/pcre/pcre_exec.cpp b/Source/JavaScriptCore/pcre/pcre_exec.cpp new file mode 100644 index 0000000..789f80a --- /dev/null +++ b/Source/JavaScriptCore/pcre/pcre_exec.cpp @@ -0,0 +1,2177 @@ +/* This is JavaScriptCore's variant of the PCRE library. While this library +started out as a copy of PCRE, many of the features of PCRE have been +removed. This library now supports only the regular expression features +required by the JavaScript language specification, and has only the functions +needed by JavaScriptCore and the rest of WebKit. + + Originally written by Philip Hazel + Copyright (c) 1997-2006 University of Cambridge + Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. + Copyright (C) 2007 Eric Seidel <eric@webkit.org> + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains jsRegExpExecute(), the externally visible function +that does pattern matching using an NFA algorithm, following the rules from +the JavaScript specification. There are also some supporting functions. */ + +#include "config.h" +#include "pcre_internal.h" + +#include <limits.h> +#include <wtf/ASCIICType.h> +#include <wtf/Vector.h> + +#if REGEXP_HISTOGRAM +#include <wtf/DateMath.h> +#include <runtime/UString.h> +#endif + +using namespace WTF; + +#if HAVE(COMPUTED_GOTO) +#define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION +//#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP +#endif + +/* Avoid warnings on Windows. */ +#undef min +#undef max + +#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION +typedef int ReturnLocation; +#else +typedef void* ReturnLocation; +#endif + +#if !REGEXP_HISTOGRAM + +class HistogramTimeLogger { +public: + HistogramTimeLogger(const JSRegExp*) { } +}; + +#else + +using namespace JSC; + +class Histogram { +public: + ~Histogram(); + void add(const JSRegExp*, double); + +private: + typedef HashMap<RefPtr<StringImpl>, double> Map; + Map times; +}; + +class HistogramTimeLogger { +public: + HistogramTimeLogger(const JSRegExp*); + ~HistogramTimeLogger(); + +private: + const JSRegExp* m_re; + double m_startTime; +}; + +#endif + +/* Structure for building a chain of data for holding the values of +the subject pointer at the start of each bracket, used to detect when +an empty string has been matched by a bracket to break infinite loops. */ +struct BracketChainNode { + BracketChainNode* previousBracket; + const UChar* bracketStart; +}; + +struct MatchFrame : FastAllocBase { + ReturnLocation returnLocation; + struct MatchFrame* previousFrame; + + /* Function arguments that may change */ + struct { + const UChar* subjectPtr; + const unsigned char* instructionPtr; + int offsetTop; + BracketChainNode* bracketChain; + } args; + + + /* PCRE uses "fake" recursion built off of gotos, thus + stack-based local variables are not safe to use. Instead we have to + store local variables on the current MatchFrame. */ + struct { + const unsigned char* data; + const unsigned char* startOfRepeatingBracket; + const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare + const unsigned char* instructionPtrAtStartOfOnce; + + int repeatOthercase; + + int ctype; + int fc; + int fi; + int length; + int max; + int number; + int offset; + int saveOffset1; + int saveOffset2; + int saveOffset3; + + BracketChainNode bracketChainNode; + } locals; +}; + +/* Structure for passing "static" information around between the functions +doing traditional NFA matching, so that they are thread-safe. */ + +struct MatchData { + int* offsetVector; /* Offset vector */ + int offsetEnd; /* One past the end */ + int offsetMax; /* The maximum usable for return data */ + bool offsetOverflow; /* Set if too many extractions */ + const UChar* startSubject; /* Start of the subject string */ + const UChar* endSubject; /* End of the subject string */ + const UChar* endMatchPtr; /* Subject position at end match */ + int endOffsetTop; /* Highwater mark at end of match */ + bool multiline; + bool ignoreCase; +}; + +/* The maximum remaining length of subject we are prepared to search for a +reqByte match. */ + +#define REQ_BYTE_MAX 1000 + +/* The below limit restricts the number of "recursive" match calls in order to +avoid spending exponential time on complex regular expressions. */ + +static const unsigned matchLimit = 1000000; + +#ifdef DEBUG +/************************************************* +* Debugging function to print chars * +*************************************************/ + +/* Print a sequence of chars in printable format, stopping at the end of the +subject if the requested. + +Arguments: + p points to characters + length number to print + isSubject true if printing from within md.startSubject + md pointer to matching data block, if isSubject is true +*/ + +static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md) +{ + if (isSubject && length > md.endSubject - p) + length = md.endSubject - p; + while (length-- > 0) { + int c; + if (isASCIIPrintable(c = *(p++))) + printf("%c", c); + else if (c < 256) + printf("\\x%02x", c); + else + printf("\\x{%x}", c); + } +} +#endif + +/************************************************* +* Match a back-reference * +*************************************************/ + +/* If a back reference hasn't been set, the length that is passed is greater +than the number of characters left in the string, so the match fails. + +Arguments: + offset index into the offset vector + subjectPtr points into the subject + length length to be matched + md points to match data block + +Returns: true if matched +*/ + +static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md) +{ + const UChar* p = md.startSubject + md.offsetVector[offset]; + +#ifdef DEBUG + if (subjectPtr >= md.endSubject) + printf("matching subject <null>"); + else { + printf("matching subject "); + pchars(subjectPtr, length, true, md); + } + printf(" against backref "); + pchars(p, length, false, md); + printf("\n"); +#endif + + /* Always fail if not enough characters left */ + + if (length > md.endSubject - subjectPtr) + return false; + + /* Separate the caselesss case for speed */ + + if (md.ignoreCase) { + while (length-- > 0) { + UChar c = *p++; + int othercase = jsc_pcre_ucp_othercase(c); + UChar d = *subjectPtr++; + if (c != d && othercase != d) + return false; + } + } + else { + while (length-- > 0) + if (*p++ != *subjectPtr++) + return false; + } + + return true; +} + +#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION + +/* Use numbered labels and switch statement at the bottom of the match function. */ + +#define RMATCH_WHERE(num) num +#define RRETURN_LABEL RRETURN_SWITCH + +#else + +/* Use GCC's computed goto extension. */ + +/* For one test case this is more than 40% faster than the switch statement. +We could avoid the use of the num argument entirely by using local labels, +but using it for the GCC case as well as the non-GCC case allows us to share +a bit more code and notice if we use conflicting numbers.*/ + +#define RMATCH_WHERE(num) &&RRETURN_##num +#define RRETURN_LABEL *stack.currentFrame->returnLocation + +#endif + +#define RECURSIVE_MATCH_COMMON(num) \ + goto RECURSE;\ + RRETURN_##num: \ + stack.popCurrentFrame(); + +#define RECURSIVE_MATCH(num, ra, rb) \ + do { \ + stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ + RECURSIVE_MATCH_COMMON(num) \ + } while (0) + +#define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \ + do { \ + stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ + startNewGroup(stack.currentFrame); \ + RECURSIVE_MATCH_COMMON(num) \ + } while (0) + +#define RRETURN goto RRETURN_LABEL + +#define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) + +/************************************************* +* Match from current position * +*************************************************/ + +/* On entry instructionPtr points to the first opcode, and subjectPtr to the first character +in the subject string, while substringStart holds the value of subjectPtr at the start of the +last bracketed group - used for breaking infinite loops matching zero-length +strings. This function is called recursively in many circumstances. Whenever it +returns a negative (error) response, the outer match() call must also return the +same response. + +Arguments: + subjectPtr pointer in subject + instructionPtr position in code + offsetTop current top pointer + md pointer to "static" info for the match + +Returns: 1 if matched ) these values are >= 0 + 0 if failed to match ) + a negative error value if aborted by an error condition + (e.g. stopped by repeated call or recursion limit) +*/ + +static const unsigned numFramesOnStack = 16; + +struct MatchStack { + MatchStack() + : framesEnd(frames + numFramesOnStack) + , currentFrame(frames) + , size(1) // match() creates accesses the first frame w/o calling pushNewFrame + { + ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack); + } + + MatchFrame frames[numFramesOnStack]; + MatchFrame* framesEnd; + MatchFrame* currentFrame; + unsigned size; + + inline bool canUseStackBufferForNextFrame() + { + return size < numFramesOnStack; + } + + inline MatchFrame* allocateNextFrame() + { + if (canUseStackBufferForNextFrame()) + return currentFrame + 1; + return new MatchFrame; + } + + inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) + { + MatchFrame* newframe = allocateNextFrame(); + newframe->previousFrame = currentFrame; + + newframe->args.subjectPtr = currentFrame->args.subjectPtr; + newframe->args.offsetTop = currentFrame->args.offsetTop; + newframe->args.instructionPtr = instructionPtr; + newframe->args.bracketChain = bracketChain; + newframe->returnLocation = returnLocation; + size++; + + currentFrame = newframe; + } + + inline void popCurrentFrame() + { + MatchFrame* oldFrame = currentFrame; + currentFrame = currentFrame->previousFrame; + if (size > numFramesOnStack) + delete oldFrame; + size--; + } + + void popAllFrames() + { + while (size) + popCurrentFrame(); + } +}; + +static int matchError(int errorCode, MatchStack& stack) +{ + stack.popAllFrames(); + return errorCode; +} + +/* Get the next UTF-8 character, not advancing the pointer, incrementing length + if there are extra bytes. This is called when we know we are in UTF-8 mode. */ + +static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len) +{ + c = *subjectPtr; + if ((c & 0xc0) == 0xc0) { + int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ + int gcss = 6 * gcaa; + c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss; + for (int gcii = 1; gcii <= gcaa; gcii++) { + gcss -= 6; + c |= (subjectPtr[gcii] & 0x3f) << gcss; + } + len += gcaa; + } +} + +static inline void startNewGroup(MatchFrame* currentFrame) +{ + /* At the start of a bracketed group, add the current subject pointer to the + stack of such pointers, to be re-instated at the end of the group when we hit + the closing ket. When match() is called in other circumstances, we don't add to + this stack. */ + + currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain; + currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr; + currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; +} + +// FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing +static inline void repeatInformationFromInstructionOffset(int instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats) +{ + // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR + static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 }; + static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 }; + + ASSERT(instructionOffset >= 0); + ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); + + minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2 + minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; + maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; +} + +static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md) +{ + bool isMatch = false; + int min; + bool minimize = false; /* Initialization not really needed, but some compilers think so. */ + unsigned remainingMatchCount = matchLimit; + int othercase; /* Declare here to avoid errors during jumps */ + + MatchStack stack; + + /* The opcode jump table. */ +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP +#define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, + static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; +#undef EMIT_JUMP_TABLE_ENTRY +#endif + + /* One-time setup of the opcode jump table. */ +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP + for (int i = 255; !opcodeJumpTable[i]; i--) + opcodeJumpTable[i] = &&CAPTURING_BRACKET; +#endif + +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION + // Shark shows this as a hot line + // Using a static const here makes this line disappear, but makes later access hotter (not sure why) + stack.currentFrame->returnLocation = &&RETURN; +#else + stack.currentFrame->returnLocation = 0; +#endif + stack.currentFrame->args.subjectPtr = subjectPtr; + stack.currentFrame->args.instructionPtr = instructionPtr; + stack.currentFrame->args.offsetTop = offsetTop; + stack.currentFrame->args.bracketChain = 0; + startNewGroup(stack.currentFrame); + + /* This is where control jumps back to to effect "recursion" */ + +RECURSE: + if (!--remainingMatchCount) + return matchError(JSRegExpErrorHitLimit, stack); + + /* Now start processing the operations. */ + +#ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP + while (true) +#endif + { + +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP +#define BEGIN_OPCODE(opcode) LABEL_OP_##opcode +#define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr] +#else +#define BEGIN_OPCODE(opcode) case OP_##opcode +#define NEXT_OPCODE continue +#endif + +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP + NEXT_OPCODE; +#else + switch (*stack.currentFrame->args.instructionPtr) +#endif + { + /* Non-capturing bracket: optimized */ + + BEGIN_OPCODE(BRA): + NON_CAPTURING_BRACKET: + DPRINTF(("start bracket 0\n")); + do { + RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); + } while (*stack.currentFrame->args.instructionPtr == OP_ALT); + DPRINTF(("bracket 0 failed\n")); + RRETURN; + + /* Skip over large extraction number data if encountered. */ + + BEGIN_OPCODE(BRANUMBER): + stack.currentFrame->args.instructionPtr += 3; + NEXT_OPCODE; + + /* End of the pattern. */ + + BEGIN_OPCODE(END): + md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */ + md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */ + isMatch = true; + RRETURN; + + /* Assertion brackets. Check the alternative branches in turn - the + matching won't pass the KET for an assertion. If any one branch matches, + the assertion is true. Lookbehind assertions have an OP_REVERSE item at the + start of each branch to move the current point backwards, so the code at + this level is identical to the lookahead case. */ + + BEGIN_OPCODE(ASSERT): + do { + RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); + if (isMatch) + break; + stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); + } while (*stack.currentFrame->args.instructionPtr == OP_ALT); + if (*stack.currentFrame->args.instructionPtr == OP_KET) + RRETURN_NO_MATCH; + + /* Continue from after the assertion, updating the offsets high water + mark, since extracts may have been taken during the assertion. */ + + advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); + stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; + stack.currentFrame->args.offsetTop = md.endOffsetTop; + NEXT_OPCODE; + + /* Negative assertion: all branches must fail to match */ + + BEGIN_OPCODE(ASSERT_NOT): + do { + RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); + if (isMatch) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); + } while (*stack.currentFrame->args.instructionPtr == OP_ALT); + + stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; + NEXT_OPCODE; + + /* An alternation is the end of a branch; scan along to find the end of the + bracketed group and go to there. */ + + BEGIN_OPCODE(ALT): + advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); + NEXT_OPCODE; + + /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating + that it may occur zero times. It may repeat infinitely, or not at all - + i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper + repeat limits are compiled as a number of copies, with the optional ones + preceded by BRAZERO or BRAMINZERO. */ + + BEGIN_OPCODE(BRAZERO): { + stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; + RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); + stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE; + NEXT_OPCODE; + } + + BEGIN_OPCODE(BRAMINZERO): { + stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; + advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); + RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + } + + /* End of a group, repeated or non-repeating. If we are at the end of + an assertion "group", stop matching and return 1, but record the + current high water mark for use by positive assertions. Do this also + for the "once" (not-backup up) groups. */ + + BEGIN_OPCODE(KET): + BEGIN_OPCODE(KETRMIN): + BEGIN_OPCODE(KETRMAX): + stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1); + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart; + + /* Back up the stack of bracket start pointers. */ + + stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket; + + if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) { + md.endOffsetTop = stack.currentFrame->args.offsetTop; + isMatch = true; + RRETURN; + } + + /* In all other cases except a conditional group we have to check the + group number back at the start and if necessary complete handling an + extraction by setting the offsets and bumping the high water mark. */ + + stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA; + + /* For extended extraction brackets (large number), we have to fish out + the number from a dummy opcode at the start. */ + + if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) + stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); + stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; + +#ifdef DEBUG + printf("end bracket %d", stack.currentFrame->locals.number); + printf("\n"); +#endif + + /* Test for a numbered group. This includes groups called as a result + of recursion. Note that whole-pattern recursion is coded as a recurse + into group 0, so it won't be picked up here. Instead, we catch it when + the OP_END is reached. */ + + if (stack.currentFrame->locals.number > 0) { + if (stack.currentFrame->locals.offset >= md.offsetMax) + md.offsetOverflow = true; + else { + md.offsetVector[stack.currentFrame->locals.offset] = + md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; + md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject; + if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset) + stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2; + } + } + + /* For a non-repeating ket, just continue at this level. This also + happens for a repeating ket if no characters were matched in the group. + This is the forcible breaking of infinite loops as implemented in Perl + 5.005. If there is an options reset, it will get obeyed in the normal + course of events. */ + + if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { + stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; + NEXT_OPCODE; + } + + /* The repeating kets try the rest of the pattern or restart from the + preceding bracket, in the appropriate order. */ + + if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { + RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + } else { /* OP_KETRMAX */ + RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + } + RRETURN; + + /* Start of subject. */ + + BEGIN_OPCODE(CIRC): + if (stack.currentFrame->args.subjectPtr != md.startSubject) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + /* After internal newline if multiline. */ + + BEGIN_OPCODE(BOL): + if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1])) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + /* End of subject. */ + + BEGIN_OPCODE(DOLL): + if (stack.currentFrame->args.subjectPtr < md.endSubject) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + /* Before internal newline if multiline. */ + + BEGIN_OPCODE(EOL): + if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + /* Word boundary assertions */ + + BEGIN_OPCODE(NOT_WORD_BOUNDARY): + BEGIN_OPCODE(WORD_BOUNDARY): { + bool currentCharIsWordChar = false; + bool previousCharIsWordChar = false; + + if (stack.currentFrame->args.subjectPtr > md.startSubject) + previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]); + if (stack.currentFrame->args.subjectPtr < md.endSubject) + currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr); + + /* Now see if the situation is what we want */ + bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY); + if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar) + RRETURN_NO_MATCH; + NEXT_OPCODE; + } + + /* Match a single character type; inline for speed */ + + BEGIN_OPCODE(NOT_NEWLINE): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (isNewline(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(NOT_DIGIT): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(DIGIT): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(NOT_WHITESPACE): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (isSpaceChar(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(WHITESPACE): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(NOT_WORDCHAR): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (isWordChar(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + BEGIN_OPCODE(WORDCHAR): + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (!isWordChar(*stack.currentFrame->args.subjectPtr++)) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr++; + NEXT_OPCODE; + + /* Match a back reference, possibly repeatedly. Look past the end of the + item to see if there is repeat information following. The code is similar + to that for character classes, but repeated for efficiency. Then obey + similar code to character type repeats - written out again for speed. + However, if the referenced string is the empty string, always treat + it as matched, any number of times (otherwise there could be infinite + loops). */ + + BEGIN_OPCODE(REF): + stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */ + stack.currentFrame->args.instructionPtr += 3; /* Advance past item */ + + /* If the reference is unset, set the length to be longer than the amount + of subject left; this ensures that every attempt at a match fails. We + can't just fail here, because of the possibility of quantifiers with zero + minima. */ + + if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) + stack.currentFrame->locals.length = 0; + else + stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset]; + + /* Set up for repetition, or handle the non-repeated case */ + + switch (*stack.currentFrame->args.instructionPtr) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); + min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); + if (stack.currentFrame->locals.max == 0) + stack.currentFrame->locals.max = INT_MAX; + stack.currentFrame->args.instructionPtr += 5; + break; + + default: /* No repeat follows */ + if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) + RRETURN_NO_MATCH; + stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; + NEXT_OPCODE; + } + + /* If the length of the reference is zero, just continue with the + main loop. */ + + if (stack.currentFrame->locals.length == 0) + NEXT_OPCODE; + + /* First, ensure the minimum number of matches are present. */ + + for (int i = 1; i <= min; i++) { + if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) + RRETURN_NO_MATCH; + stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; + } + + /* If min = max, continue at the same level without recursion. + They are not both allowed to be zero. */ + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + /* If minimizing, keep trying and advancing the pointer */ + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) + RRETURN; + stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; + } + /* Control never reaches here */ + } + + /* If maximizing, find the longest string and work backwards */ + + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) + break; + stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; + } + while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { + RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length; + } + RRETURN_NO_MATCH; + } + /* Control never reaches here */ + + /* Match a bit-mapped character class, possibly repeatedly. This op code is + used when all the characters in the class have values in the range 0-255, + and either the matching is caseful, or the characters are in the range + 0-127 when UTF-8 processing is enabled. The only difference between + OP_CLASS and OP_NCLASS occurs when a data character outside the range is + encountered. + + First, look past the end of the item to see if there is repeat information + following. Then obey similar code to character type repeats - written out + again for speed. */ + + BEGIN_OPCODE(NCLASS): + BEGIN_OPCODE(CLASS): + stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */ + stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */ + + switch (*stack.currentFrame->args.instructionPtr) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); + min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); + if (stack.currentFrame->locals.max == 0) + stack.currentFrame->locals.max = INT_MAX; + stack.currentFrame->args.instructionPtr += 5; + break; + + default: /* No repeat follows */ + min = stack.currentFrame->locals.max = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + + for (int i = 1; i <= min; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + int c = *stack.currentFrame->args.subjectPtr++; + if (c > 255) { + if (stack.currentFrame->locals.data[-1] == OP_CLASS) + RRETURN_NO_MATCH; + } else { + if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) + RRETURN_NO_MATCH; + } + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. */ + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN; + int c = *stack.currentFrame->args.subjectPtr++; + if (c > 255) { + if (stack.currentFrame->locals.data[-1] == OP_CLASS) + RRETURN; + } else { + if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0) + RRETURN; + } + } + /* Control never reaches here */ + } + /* If maximizing, find the longest possible run, then work backwards. */ + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (c > 255) { + if (stack.currentFrame->locals.data[-1] == OP_CLASS) + break; + } else { + if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) + break; + } + ++stack.currentFrame->args.subjectPtr; + } + for (;;) { + RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) + break; /* Stop if tried at original pos */ + } + + RRETURN; + } + /* Control never reaches here */ + + /* Match an extended character class. */ + + BEGIN_OPCODE(XCLASS): + stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */ + stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */ + + switch (*stack.currentFrame->args.instructionPtr) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); + min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); + if (stack.currentFrame->locals.max == 0) + stack.currentFrame->locals.max = INT_MAX; + stack.currentFrame->args.instructionPtr += 5; + break; + + default: /* No repeat follows */ + min = stack.currentFrame->locals.max = 1; + } + + /* First, ensure the minimum number of matches are present. */ + + for (int i = 1; i <= min; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + int c = *stack.currentFrame->args.subjectPtr++; + if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) + RRETURN_NO_MATCH; + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. */ + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN; + int c = *stack.currentFrame->args.subjectPtr++; + if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) + RRETURN; + } + /* Control never reaches here */ + } + + /* If maximizing, find the longest possible run, then work backwards. */ + + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) + break; + ++stack.currentFrame->args.subjectPtr; + } + for(;;) { + RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) + break; /* Stop if tried at original pos */ + } + RRETURN; + } + + /* Control never reaches here */ + + /* Match a single character, casefully */ + + BEGIN_OPCODE(CHAR): + stack.currentFrame->locals.length = 1; + stack.currentFrame->args.instructionPtr++; + getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); + stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++) + RRETURN_NO_MATCH; + NEXT_OPCODE; + + /* Match a single character, caselessly */ + + BEGIN_OPCODE(CHAR_IGNORING_CASE): { + stack.currentFrame->locals.length = 1; + stack.currentFrame->args.instructionPtr++; + getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); + stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + int dc = *stack.currentFrame->args.subjectPtr++; + if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc) + RRETURN_NO_MATCH; + NEXT_OPCODE; + } + + /* Match a single ASCII character. */ + + BEGIN_OPCODE(ASCII_CHAR): + if (md.endSubject == stack.currentFrame->args.subjectPtr) + RRETURN_NO_MATCH; + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1]) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + stack.currentFrame->args.instructionPtr += 2; + NEXT_OPCODE; + + /* Match one of two cases of an ASCII letter. */ + + BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): + if (md.endSubject == stack.currentFrame->args.subjectPtr) + RRETURN_NO_MATCH; + if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1]) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + stack.currentFrame->args.instructionPtr += 2; + NEXT_OPCODE; + + /* Match a single character repeatedly; different opcodes share code. */ + + BEGIN_OPCODE(EXACT): + min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = false; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATCHAR; + + BEGIN_OPCODE(UPTO): + BEGIN_OPCODE(MINUPTO): + min = 0; + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATCHAR; + + BEGIN_OPCODE(STAR): + BEGIN_OPCODE(MINSTAR): + BEGIN_OPCODE(PLUS): + BEGIN_OPCODE(MINPLUS): + BEGIN_OPCODE(QUERY): + BEGIN_OPCODE(MINQUERY): + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max); + + /* Common code for all repeated single-character matches. We can give + up quickly if there are fewer than the minimum number of characters left in + the subject. */ + + REPEATCHAR: + + stack.currentFrame->locals.length = 1; + getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); + if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr) + RRETURN_NO_MATCH; + stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; + + if (stack.currentFrame->locals.fc <= 0xFFFF) { + othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1; + + for (int i = 1; i <= min; i++) { + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + if (minimize) { + stack.currentFrame->locals.repeatOthercase = othercase; + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN; + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase) + RRETURN; + ++stack.currentFrame->args.subjectPtr; + } + /* Control never reaches here */ + } else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) + break; + ++stack.currentFrame->args.subjectPtr; + } + while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { + RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + --stack.currentFrame->args.subjectPtr; + } + RRETURN_NO_MATCH; + } + /* Control never reaches here */ + } else { + /* No case on surrogate pairs, so no need to bother with "othercase". */ + + for (int i = 1; i <= min; i++) { + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) + RRETURN_NO_MATCH; + stack.currentFrame->args.subjectPtr += 2; + } + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN; + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) + RRETURN; + stack.currentFrame->args.subjectPtr += 2; + } + /* Control never reaches here */ + } else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr > md.endSubject - 2) + break; + if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) + break; + stack.currentFrame->args.subjectPtr += 2; + } + while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { + RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + stack.currentFrame->args.subjectPtr -= 2; + } + RRETURN_NO_MATCH; + } + /* Control never reaches here */ + } + /* Control never reaches here */ + + /* Match a negated single one-byte character. */ + + BEGIN_OPCODE(NOT): { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN_NO_MATCH; + int b = stack.currentFrame->args.instructionPtr[1]; + int c = *stack.currentFrame->args.subjectPtr++; + stack.currentFrame->args.instructionPtr += 2; + if (md.ignoreCase) { + if (c < 128) + c = toLowerCase(c); + if (toLowerCase(b) == c) + RRETURN_NO_MATCH; + } else { + if (b == c) + RRETURN_NO_MATCH; + } + NEXT_OPCODE; + } + + /* Match a negated single one-byte character repeatedly. This is almost a + repeat of the code for a repeated single character, but I haven't found a + nice way of commoning these up that doesn't require a test of the + positive/negative option for each character match. Maybe that wouldn't add + very much to the time taken, but character matching *is* what this is all + about... */ + + BEGIN_OPCODE(NOTEXACT): + min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = false; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATNOTCHAR; + + BEGIN_OPCODE(NOTUPTO): + BEGIN_OPCODE(NOTMINUPTO): + min = 0; + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATNOTCHAR; + + BEGIN_OPCODE(NOTSTAR): + BEGIN_OPCODE(NOTMINSTAR): + BEGIN_OPCODE(NOTPLUS): + BEGIN_OPCODE(NOTMINPLUS): + BEGIN_OPCODE(NOTQUERY): + BEGIN_OPCODE(NOTMINQUERY): + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max); + + /* Common code for all repeated single-byte matches. We can give up quickly + if there are fewer than the minimum number of bytes left in the + subject. */ + + REPEATNOTCHAR: + if (min > md.endSubject - stack.currentFrame->args.subjectPtr) + RRETURN_NO_MATCH; + stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++; + + /* The code is duplicated for the caseless and caseful cases, for speed, + since matching characters is likely to be quite common. First, ensure the + minimum number of matches are present. If min = max, continue at the same + level without recursing. Otherwise, if minimizing, keep trying the rest of + the expression and advancing one matching character if failing, up to the + maximum. Alternatively, if maximizing, find the maximum number of + characters and work backwards. */ + + DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max)); + + if (md.ignoreCase) { + if (stack.currentFrame->locals.fc < 128) + stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc); + + for (int i = 1; i <= min; i++) { + int d = *stack.currentFrame->args.subjectPtr++; + if (d < 128) + d = toLowerCase(d); + if (stack.currentFrame->locals.fc == d) + RRETURN_NO_MATCH; + } + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + int d = *stack.currentFrame->args.subjectPtr++; + if (d < 128) + d = toLowerCase(d); + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) + RRETURN; + } + /* Control never reaches here */ + } + + /* Maximize case */ + + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int d = *stack.currentFrame->args.subjectPtr; + if (d < 128) + d = toLowerCase(d); + if (stack.currentFrame->locals.fc == d) + break; + ++stack.currentFrame->args.subjectPtr; + } + for (;;) { + RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) + break; /* Stop if tried at original pos */ + } + + RRETURN; + } + /* Control never reaches here */ + } + + /* Caseful comparisons */ + + else { + for (int i = 1; i <= min; i++) { + int d = *stack.currentFrame->args.subjectPtr++; + if (stack.currentFrame->locals.fc == d) + RRETURN_NO_MATCH; + } + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + int d = *stack.currentFrame->args.subjectPtr++; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) + RRETURN; + } + /* Control never reaches here */ + } + + /* Maximize case */ + + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; + + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int d = *stack.currentFrame->args.subjectPtr; + if (stack.currentFrame->locals.fc == d) + break; + ++stack.currentFrame->args.subjectPtr; + } + for (;;) { + RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) + break; /* Stop if tried at original pos */ + } + + RRETURN; + } + } + /* Control never reaches here */ + + /* Match a single character type repeatedly; several different opcodes + share code. This is very similar to the code for single characters, but we + repeat it in the interests of efficiency. */ + + BEGIN_OPCODE(TYPEEXACT): + min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = true; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATTYPE; + + BEGIN_OPCODE(TYPEUPTO): + BEGIN_OPCODE(TYPEMINUPTO): + min = 0; + stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); + minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO; + stack.currentFrame->args.instructionPtr += 3; + goto REPEATTYPE; + + BEGIN_OPCODE(TYPESTAR): + BEGIN_OPCODE(TYPEMINSTAR): + BEGIN_OPCODE(TYPEPLUS): + BEGIN_OPCODE(TYPEMINPLUS): + BEGIN_OPCODE(TYPEQUERY): + BEGIN_OPCODE(TYPEMINQUERY): + repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max); + + /* Common code for all repeated single character type matches. Note that + in UTF-8 mode, '.' matches a character of any length, but for the other + character types, the valid characters are all one-byte long. */ + + REPEATTYPE: + stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */ + + /* First, ensure the minimum number of matches are present. Use inline + code for maximizing the speed, and do the type test once at the start + (i.e. keep it out of the loop). Also we can test that there are at least + the minimum number of characters before we start. */ + + if (min > md.endSubject - stack.currentFrame->args.subjectPtr) + RRETURN_NO_MATCH; + if (min > 0) { + switch (stack.currentFrame->locals.ctype) { + case OP_NOT_NEWLINE: + for (int i = 1; i <= min; i++) { + if (isNewline(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_NOT_DIGIT: + for (int i = 1; i <= min; i++) { + if (isASCIIDigit(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_DIGIT: + for (int i = 1; i <= min; i++) { + if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_NOT_WHITESPACE: + for (int i = 1; i <= min; i++) { + if (isSpaceChar(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_WHITESPACE: + for (int i = 1; i <= min; i++) { + if (!isSpaceChar(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_NOT_WORDCHAR: + for (int i = 1; i <= min; i++) { + if (isWordChar(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_WORDCHAR: + for (int i = 1; i <= min; i++) { + if (!isWordChar(*stack.currentFrame->args.subjectPtr)) + RRETURN_NO_MATCH; + ++stack.currentFrame->args.subjectPtr; + } + break; + + default: + ASSERT_NOT_REACHED(); + return matchError(JSRegExpErrorInternal, stack); + } /* End switch(stack.currentFrame->locals.ctype) */ + } + + /* If min = max, continue at the same level without recursing */ + + if (min == stack.currentFrame->locals.max) + NEXT_OPCODE; + + /* If minimizing, we have to test the rest of the pattern before each + subsequent match. */ + + if (minimize) { + for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { + RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) + RRETURN; + + int c = *stack.currentFrame->args.subjectPtr++; + switch (stack.currentFrame->locals.ctype) { + case OP_NOT_NEWLINE: + if (isNewline(c)) + RRETURN; + break; + + case OP_NOT_DIGIT: + if (isASCIIDigit(c)) + RRETURN; + break; + + case OP_DIGIT: + if (!isASCIIDigit(c)) + RRETURN; + break; + + case OP_NOT_WHITESPACE: + if (isSpaceChar(c)) + RRETURN; + break; + + case OP_WHITESPACE: + if (!isSpaceChar(c)) + RRETURN; + break; + + case OP_NOT_WORDCHAR: + if (isWordChar(c)) + RRETURN; + break; + + case OP_WORDCHAR: + if (!isWordChar(c)) + RRETURN; + break; + + default: + ASSERT_NOT_REACHED(); + return matchError(JSRegExpErrorInternal, stack); + } + } + /* Control never reaches here */ + } + + /* If maximizing it is worth using inline code for speed, doing the type + test once at the start (i.e. keep it out of the loop). */ + + else { + stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */ + + switch (stack.currentFrame->locals.ctype) { + case OP_NOT_NEWLINE: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr)) + break; + stack.currentFrame->args.subjectPtr++; + } + break; + + case OP_NOT_DIGIT: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (isASCIIDigit(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_DIGIT: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (!isASCIIDigit(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_NOT_WHITESPACE: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (isSpaceChar(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_WHITESPACE: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (!isSpaceChar(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_NOT_WORDCHAR: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (isWordChar(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + case OP_WORDCHAR: + for (int i = min; i < stack.currentFrame->locals.max; i++) { + if (stack.currentFrame->args.subjectPtr >= md.endSubject) + break; + int c = *stack.currentFrame->args.subjectPtr; + if (!isWordChar(c)) + break; + ++stack.currentFrame->args.subjectPtr; + } + break; + + default: + ASSERT_NOT_REACHED(); + return matchError(JSRegExpErrorInternal, stack); + } + + /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */ + + for (;;) { + RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) + break; /* Stop if tried at original pos */ + } + + /* Get here if we can't make it match with any permitted repetitions */ + + RRETURN; + } + /* Control never reaches here */ + + BEGIN_OPCODE(CRMINPLUS): + BEGIN_OPCODE(CRMINQUERY): + BEGIN_OPCODE(CRMINRANGE): + BEGIN_OPCODE(CRMINSTAR): + BEGIN_OPCODE(CRPLUS): + BEGIN_OPCODE(CRQUERY): + BEGIN_OPCODE(CRRANGE): + BEGIN_OPCODE(CRSTAR): + ASSERT_NOT_REACHED(); + return matchError(JSRegExpErrorInternal, stack); + +#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP + CAPTURING_BRACKET: +#else + default: +#endif + /* Opening capturing bracket. If there is space in the offset vector, save + the current subject position in the working slot at the top of the vector. We + mustn't change the current values of the data slot, because they may be set + from a previous iteration of this group, and be referred to by a reference + inside the group. + + If the bracket fails to match, we need to restore this value and also the + values of the final offsets, in case they were set by a previous iteration of + the same bracket. + + If there isn't enough space in the offset vector, treat this as if it were a + non-capturing bracket. Don't worry about setting the flag for the error case + here; that is handled in the code for KET. */ + + ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); + + stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA; + + /* For extended extraction brackets (large number), we have to fish out the + number from a dummy opcode at the start. */ + + if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) + stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE); + stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; + +#ifdef DEBUG + printf("start bracket %d subject=", stack.currentFrame->locals.number); + pchars(stack.currentFrame->args.subjectPtr, 16, true, md); + printf("\n"); +#endif + + if (stack.currentFrame->locals.offset < md.offsetMax) { + stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset]; + stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1]; + stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; + + DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3)); + md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject; + + do { + RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); + if (isMatch) + RRETURN; + stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); + } while (*stack.currentFrame->args.instructionPtr == OP_ALT); + + DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number)); + + md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1; + md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2; + md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3; + + RRETURN; + } + + /* Insufficient room for saving captured contents */ + + goto NON_CAPTURING_BRACKET; + } + + /* Do not stick any code in here without much thought; it is assumed + that "continue" in the code above comes out to here to repeat the main + loop. */ + + } /* End of main loop */ + + ASSERT_NOT_REACHED(); + +#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION + +RRETURN_SWITCH: + switch (stack.currentFrame->returnLocation) { + case 0: goto RETURN; + case 1: goto RRETURN_1; + case 2: goto RRETURN_2; + case 6: goto RRETURN_6; + case 7: goto RRETURN_7; + case 14: goto RRETURN_14; + case 15: goto RRETURN_15; + case 16: goto RRETURN_16; + case 17: goto RRETURN_17; + case 18: goto RRETURN_18; + case 19: goto RRETURN_19; + case 20: goto RRETURN_20; + case 21: goto RRETURN_21; + case 22: goto RRETURN_22; + case 24: goto RRETURN_24; + case 26: goto RRETURN_26; + case 27: goto RRETURN_27; + case 28: goto RRETURN_28; + case 29: goto RRETURN_29; + case 30: goto RRETURN_30; + case 31: goto RRETURN_31; + case 38: goto RRETURN_38; + case 40: goto RRETURN_40; + case 42: goto RRETURN_42; + case 44: goto RRETURN_44; + case 48: goto RRETURN_48; + case 52: goto RRETURN_52; + } + + ASSERT_NOT_REACHED(); + return matchError(JSRegExpErrorInternal, stack); + +#endif + +RETURN: + return isMatch; +} + + +/************************************************* +* Execute a Regular Expression * +*************************************************/ + +/* This function applies a compiled re to a subject string and picks out +portions of the string if it matches. Two elements in the vector are set for +each substring: the offsets to the start and end of the substring. + +Arguments: + re points to the compiled expression + extra_data points to extra data or is NULL + subject points to the subject string + length length of subject string (may contain binary zeros) + start_offset where to start in the subject string + options option bits + offsets points to a vector of ints to be filled in with offsets + offsetCount the number of elements in the vector + +Returns: > 0 => success; value is the number of elements filled in + = 0 => success, but offsets is not big enough + -1 => failed to match + < -1 => some kind of unexpected problem +*/ + +static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) +{ + // If firstByte is set, try scanning to the first instance of that byte + // no need to try and match against any earlier part of the subject string. + if (firstByte >= 0) { + UChar firstChar = firstByte; + if (firstByteIsCaseless) + while (subjectPtr < endSubject) { + int c = *subjectPtr; + if (c > 127) + break; + if (toLowerCase(c) == firstChar) + break; + subjectPtr++; + } + else { + while (subjectPtr < endSubject && *subjectPtr != firstChar) + subjectPtr++; + } + } else if (useMultiLineFirstCharOptimization) { + /* Or to just after \n for a multiline match if possible */ + // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 + if (subjectPtr > originalSubjectStart) { + while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) + subjectPtr++; + } + } +} + +static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr) +{ + /* If reqByte is set, we know that that character must appear in the subject + for the match to succeed. If the first character is set, reqByte must be + later in the subject; otherwise the test starts at the match point. This + optimization can save a huge amount of backtracking in patterns with nested + unlimited repeats that aren't going to match. Writing separate code for + cased/caseless versions makes it go faster, as does using an autoincrement + and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end can + take a long time, and give bad performance on quite ordinary patterns. This + showed up when somebody was matching /^C/ on a 32-megabyte string... so we + don't do this when the string is sufficiently long. + */ + + if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { + const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. */ + + if (p > reqBytePtr) { + if (reqByteIsCaseless) { + while (p < endSubject) { + int pp = *p++; + if (pp == reqByte || pp == reqByte2) { + p--; + break; + } + } + } else { + while (p < endSubject) { + if (*p++ == reqByte) { + p--; + break; + } + } + } + + /* If we can't find the required character, break the matching loop */ + + if (p >= endSubject) + return true; + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this character yet. */ + + reqBytePtr = p; + } + } + return false; +} + +int jsRegExpExecute(const JSRegExp* re, + const UChar* subject, int length, int start_offset, int* offsets, + int offsetCount) +{ + ASSERT(re); + ASSERT(subject || !length); + ASSERT(offsetCount >= 0); + ASSERT(offsets || offsetCount == 0); + + HistogramTimeLogger logger(re); + + MatchData matchBlock; + matchBlock.startSubject = subject; + matchBlock.endSubject = matchBlock.startSubject + length; + const UChar* endSubject = matchBlock.endSubject; + + matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); + matchBlock.ignoreCase = (re->options & IgnoreCaseOption); + + /* If the expression has got more back references than the offsets supplied can + hold, we get a temporary chunk of working store to use during the matching. + Otherwise, we can use the vector supplied, rounding down its size to a multiple + of 3. */ + + int ocount = offsetCount - (offsetCount % 3); + + // FIXME: This is lame that we have to second-guess our caller here. + // The API should change to either fail-hard when we don't have enough offset space + // or that we shouldn't ask our callers to pre-allocate in the first place. + bool usingTemporaryOffsets = false; + if (re->topBackref > 0 && re->topBackref >= ocount/3) { + ocount = re->topBackref * 3 + 3; + matchBlock.offsetVector = new int[ocount]; + if (!matchBlock.offsetVector) + return JSRegExpErrorNoMemory; + usingTemporaryOffsets = true; + } else + matchBlock.offsetVector = offsets; + + matchBlock.offsetEnd = ocount; + matchBlock.offsetMax = (2*ocount)/3; + matchBlock.offsetOverflow = false; + + /* Compute the minimum number of offsets that we need to reset each time. Doing + this makes a huge difference to execution time when there aren't many brackets + in the pattern. */ + + int resetCount = 2 + re->topBracket * 2; + if (resetCount > offsetCount) + resetCount = ocount; + + /* Reset the working variable associated with each extraction. These should + never be used unless previously set, but they get saved and restored, and so we + initialize them to avoid reading uninitialized locations. */ + + if (matchBlock.offsetVector) { + int* iptr = matchBlock.offsetVector + ocount; + int* iend = iptr - resetCount/2 + 1; + while (--iptr >= iend) + *iptr = -1; + } + + /* Set up the first character to match, if available. The firstByte value is + never set for an anchored regular expression, but the anchoring may be forced + at run time, so we have to test for anchoring. The first char may be unset for + an unanchored pattern, of course. If there's no first char and the pattern was + studied, there may be a bitmap of possible first characters. */ + + bool firstByteIsCaseless = false; + int firstByte = -1; + if (re->options & UseFirstByteOptimizationOption) { + firstByte = re->firstByte & 255; + if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE))) + firstByte = toLowerCase(firstByte); + } + + /* For anchored or unanchored matches, there may be a "last known required + character" set. */ + + bool reqByteIsCaseless = false; + int reqByte = -1; + int reqByte2 = -1; + if (re->options & UseRequiredByteOptimizationOption) { + reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... + reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE); + reqByte2 = flipCase(reqByte); + } + + /* Loop for handling unanchored repeated matching attempts; for anchored regexs + the loop runs just once. */ + + const UChar* startMatch = subject + start_offset; + const UChar* reqBytePtr = startMatch - 1; + bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption; + + do { + /* Reset the maximum number of extractions we might see. */ + if (matchBlock.offsetVector) { + int* iptr = matchBlock.offsetVector; + int* iend = iptr + resetCount; + while (iptr < iend) + *iptr++ = -1; + } + + tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset); + if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr)) + break; + + /* When a match occurs, substrings will be set for all internal extractions; + we just need to set up the whole thing as substring 0 before returning. If + there were too many extractions, set the return code to zero. In the case + where we had to get some local store to hold offsets for backreferences, copy + those back references that we can. In this case there need not be overflow + if certain parts of the pattern were not used. */ + + /* The code starts after the JSRegExp block and the capture name table. */ + const unsigned char* start_code = (const unsigned char*)(re + 1); + + int returnCode = match(startMatch, start_code, 2, matchBlock); + + /* When the result is no match, advance the pointer to the next character + and continue. */ + if (returnCode == 0) { + startMatch++; + continue; + } + + if (returnCode != 1) { + ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory); + DPRINTF((">>>> error: returning %d\n", returnCode)); + return returnCode; + } + + /* We have a match! Copy the offset information from temporary store if + necessary */ + + if (usingTemporaryOffsets) { + if (offsetCount >= 4) { + memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int)); + DPRINTF(("Copied offsets from temporary memory\n")); + } + if (matchBlock.endOffsetTop > offsetCount) + matchBlock.offsetOverflow = true; + + DPRINTF(("Freeing temporary memory\n")); + delete [] matchBlock.offsetVector; + } + + returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2; + + if (offsetCount < 2) + returnCode = 0; + else { + offsets[0] = startMatch - matchBlock.startSubject; + offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; + } + + DPRINTF((">>>> returning %d\n", returnCode)); + return returnCode; + } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); + + if (usingTemporaryOffsets) { + DPRINTF(("Freeing temporary memory\n")); + delete [] matchBlock.offsetVector; + } + + DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); + return JSRegExpErrorNoMatch; +} + +#if REGEXP_HISTOGRAM + +class CompareHistogramEntries { +public: + bool operator()(const pair<UString, double>& a, const pair<UString, double>& b) + { + if (a.second == b.second) + return a.first < b.first; + return a.second < b.second; + } +}; + +Histogram::~Histogram() +{ + Vector<pair<UString, double> > values; + Map::iterator end = times.end(); + for (Map::iterator it = times.begin(); it != end; ++it) + values.append(*it); + sort(values.begin(), values.end(), CompareHistogramEntries()); + size_t size = values.size(); + printf("Regular Expressions, sorted by time spent evaluating them:\n"); + for (size_t i = 0; i < size; ++i) + printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.utf8().c_str()); +} + +void Histogram::add(const JSRegExp* re, double elapsedTime) +{ + UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength); + if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption) + string += " (multi-line, ignore case)"; + else { + if (re->options & IgnoreCaseOption) + string += " (ignore case)"; + if (re->options & MatchAcrossMultipleLinesOption) + string += " (multi-line)"; + } + pair<Map::iterator, bool> result = times.add(string.impl(), elapsedTime); + if (!result.second) + result.first->second += elapsedTime; +} + +HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re) + : m_re(re) + , m_startTime(currentTimeMS()) +{ +} + +HistogramTimeLogger::~HistogramTimeLogger() +{ + static Histogram histogram; + histogram.add(m_re, currentTimeMS() - m_startTime); +} + +#endif |