/*
 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
 *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
 *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"
#include "Lexer.h"

#include "JSFunction.h"

#include "JSGlobalObjectFunctions.h"
#include "Identifier.h"
#include "NodeInfo.h"
#include "Nodes.h"
#include "dtoa.h"
// NOTE(review): the names of the next four headers were lost when this file
// was extracted (only bare "#include" tokens remained). <limits.h> is needed
// for USHRT_MAX and <wtf/Assertions.h> presumably supplies ASSERT; <ctype.h>
// and <string.h> are assumed -- confirm against the upstream file.
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wtf/Assertions.h>

using namespace WTF;
using namespace Unicode;

#include "JSParser.h"
#include "Lookup.h"
#include "Lexer.lut.h"

namespace JSC {

// Classification of a source character. The lexer's main switch dispatches on
// these values, which are looked up in AsciiCharacters for code units < 128.
enum CharacterTypes {
    // Types for the main switch
    CharacterInvalid,
    CharacterAlpha,
    CharacterZero,
    CharacterNumber,

    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterSimple,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types (only one so far)
    CharacterWhiteSpace,
};

// 128 ascii codes
// Maps each ASCII code unit to its CharacterTypes value. The table is only
// ever read, so it is declared const.
static const unsigned char AsciiCharacters[128] = {
/*   0 - Null               */ CharacterInvalid,
/*   1 - Start of Heading   */ CharacterInvalid,
/*   2 - Start of Text      */ CharacterInvalid,
/*   3 - End of Text        */ CharacterInvalid,
/*   4 - End of Transm.     */ CharacterInvalid,
/*   5 - Enquiry            */ CharacterInvalid,
/*   6 - Acknowledgment     */ CharacterInvalid,
/*   7 - Bell               */ CharacterInvalid,
/*   8 - Back Space         */ CharacterInvalid,
/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
/*  10 - Line Feed          */ CharacterLineTerminator,
/*  11 - Vertical Tab       */ CharacterWhiteSpace,
/*  12 - Form Feed          */ CharacterWhiteSpace,
/*  13 - Carriage Return    */ CharacterLineTerminator,
/*  14 - Shift Out          */ CharacterInvalid,
/*  15 - Shift In           */ CharacterInvalid,
/*  16 - Data Line Escape   */ CharacterInvalid,
/*  17 - Device Control 1   */ CharacterInvalid,
/*  18 - Device Control 2   */ CharacterInvalid,
/*  19 - Device Control 3   */ CharacterInvalid,
/*  20 - Device Control 4   */ CharacterInvalid,
/*  21 - Negative Ack.      */ CharacterInvalid,
/*  22 - Synchronous Idle   */ CharacterInvalid,
/*  23 - End of Transmit    */ CharacterInvalid,
/*  24 - Cancel             */ CharacterInvalid,
/*  25 - End of Medium      */ CharacterInvalid,
/*  26 - Substitute         */ CharacterInvalid,
/*  27 - Escape             */ CharacterInvalid,
/*  28 - File Separator     */ CharacterInvalid,
/*  29 - Group Separator    */ CharacterInvalid,
/*  30 - Record Separator   */ CharacterInvalid,
/*  31 - Unit Separator     */ CharacterInvalid,
/*  32 - Space              */ CharacterWhiteSpace,
/*  33 - !                  */ CharacterExclamationMark,
/*  34 - "                  */ CharacterQuote,
/*  35 - #                  */ CharacterInvalid,
/*  36 - $                  */ CharacterAlpha,
/*  37 - %                  */ CharacterModulo,
/*  38 - &                  */ CharacterAnd,
/*  39 - '                  */ CharacterQuote,
/*  40 - (                  */ CharacterSimple,
/*  41 - )                  */ CharacterSimple,
/*  42 - *                  */ CharacterMultiply,
/*  43 - +                  */ CharacterAdd,
/*  44 - ,                  */ CharacterSimple,
/*  45 - -                  */ CharacterSub,
/*  46 - .                  */ CharacterDot,
/*  47 - /                  */ CharacterSlash,
/*  48 - 0                  */ CharacterZero,
/*  49 - 1                  */ CharacterNumber,
/*  50 - 2                  */ CharacterNumber,
/*  51 - 3                  */ CharacterNumber,
/*  52 - 4                  */ CharacterNumber,
/*  53 - 5                  */ CharacterNumber,
/*  54 - 6                  */ CharacterNumber,
/*  55 - 7                  */ CharacterNumber,
/*  56 - 8                  */ CharacterNumber,
/*  57 - 9                  */ CharacterNumber,
/*  58 - :                  */ CharacterSimple,
/*  59 - ;                  */ CharacterSemicolon,
/*  60 - <                  */ CharacterLess,
/*  61 - =                  */ CharacterEqual,
/*  62 - >                  */ CharacterGreater,
/*  63 - ?                  */ CharacterSimple,
/*  64 - @                  */ CharacterInvalid,
/*  65 - A                  */ CharacterAlpha,
/*  66 - B                  */ CharacterAlpha,
/*  67 - C                  */ CharacterAlpha,
/*  68 - D                  */ CharacterAlpha,
/*  69 - E                  */ CharacterAlpha,
/*  70 - F                  */ CharacterAlpha,
/*  71 - G                  */ CharacterAlpha,
/*  72 - H                  */ CharacterAlpha,
/*  73 - I                  */ CharacterAlpha,
/*  74 - J                  */ CharacterAlpha,
/*  75 - K                  */ CharacterAlpha,
/*  76 - L                  */ CharacterAlpha,
/*  77 - M                  */ CharacterAlpha,
/*  78 - N                  */ CharacterAlpha,
/*  79 - O                  */ CharacterAlpha,
/*  80 - P                  */ CharacterAlpha,
/*  81 - Q                  */ CharacterAlpha,
/*  82 - R                  */ CharacterAlpha,
/*  83 - S                  */ CharacterAlpha,
/*  84 - T                  */ CharacterAlpha,
/*  85 - U                  */ CharacterAlpha,
/*  86 - V                  */ CharacterAlpha,
/*  87 - W                  */ CharacterAlpha,
/*  88 - X                  */ CharacterAlpha,
/*  89 - Y                  */ CharacterAlpha,
/*  90 - Z                  */ CharacterAlpha,
/*  91 - [                  */ CharacterSimple,
/*  92 - \                  */ CharacterBackSlash,
/*  93 - ]                  */ CharacterSimple,
/*  94 - ^                  */ CharacterXor,
/*  95 - _                  */ CharacterAlpha,
/*  96 - `                  */ CharacterInvalid,
/*  97 - a                  */ CharacterAlpha,
/*  98 - b                  */ CharacterAlpha,
/*  99 - c                  */ CharacterAlpha,
/* 100 - d                  */ CharacterAlpha,
/* 101 - e                  */ CharacterAlpha,
/* 102 - f                  */ CharacterAlpha,
/* 103 - g                  */ CharacterAlpha,
/* 104 - h                  */ CharacterAlpha,
/* 105 - i                  */ CharacterAlpha,
/* 106 - j                  */ CharacterAlpha,
/* 107 - k                  */ CharacterAlpha,
/* 108 - l                  */ CharacterAlpha,
/* 109 - m                  */ CharacterAlpha,
/* 110 - n                  */ CharacterAlpha,
/* 111 - o                  */ CharacterAlpha,
/* 112 - p                  */ CharacterAlpha,
/* 113 - q                  */ CharacterAlpha,
/* 114 - r                  */ CharacterAlpha,
/* 115 - s                  */ CharacterAlpha,
/* 116 - t                  */ CharacterAlpha,
/* 117 - u                  */ CharacterAlpha,
/* 118 - v                  */ CharacterAlpha,
/* 119 - w                  */ CharacterAlpha,
/* 120 - x                  */ CharacterAlpha,
/* 121 - y                  */ CharacterAlpha,
/* 122 - z                  */ CharacterAlpha,
/* 123 - {                  */ CharacterOpenBrace,
/* 124 - |                  */ CharacterOr,
/* 125 - }                  */ CharacterCloseBrace,
/* 126 - ~                  */ CharacterSimple,
/* 127 - Delete             */ CharacterInvalid,
};

// Constructs a lexer bound to |globalData|; the keyword table is initialized
// from JSC::mainTable and the lexer starts out not reparsing.
Lexer::Lexer(JSGlobalData* globalData)
    : m_isReparsing(false)
    , m_globalData(globalData)
    , m_keywordTable(JSC::mainTable)
{
}

// Releases the keyword table via deleteTable().
Lexer::~Lexer()
{
    m_keywordTable.deleteTable();
}

// Returns the current read position; it may equal m_codeEnd but never pass it.
ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
{
    ASSERT(m_code <= m_codeEnd);
    return m_code;
}

// Returns the current read position as an offset from m_codeStart.
ALWAYS_INLINE int Lexer::currentOffset() const
{
    return currentCharacter() - m_codeStart;
}

// Points the lexer at |source| and resets the per-source state: line number,
// delimiter/last-token/error flags, read pointers and the scratch buffers'
// initial capacity. Identifiers are allocated from |arena|'s identifier arena.
void Lexer::setCode(const SourceCode& source, ParserArena& arena)
{
    m_arena = &arena.identifierArena();

    m_lineNumber = source.firstLine();
    m_delimited = false;
    m_lastToken = -1;

    const UChar* data = source.provider()->data();

    m_source = &source;
    m_codeStart = data;
    m_code = data + source.startOffset();
    m_codeEnd = data + source.endOffset();
    m_error = false;
    m_atLineStart = true;

    m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
    m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);

    // m_current caches the code unit at m_code; -1 stands for end of input.
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
    else
        m_current = -1;
    ASSERT(currentOffset() == source.startOffset());
}

// Advances by one code unit and refreshes m_current (-1 once the end of the
// input is reached). Must not be called when already at end of input.
ALWAYS_INLINE void Lexer::shift()
{
    // Faster than an if-else sequence
    ASSERT(m_current != -1);
    m_current = -1;
    ++m_code;
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
}

// Returns the code unit |offset| positions ahead of the current one without
// consuming anything, or -1 if that position is at or beyond the end of the
// input. |offset| must be in [1, 4].
ALWAYS_INLINE int Lexer::peek(int offset)
{
    // Only use if necessary
    ASSERT(offset > 0 && offset < 5);
    const UChar* code = m_code + offset;
    return (code < m_codeEnd) ? *code : -1;
}

// Reads four hexadecimal digits starting at the current character and returns
// convertUnicode() of them, consuming all four. If any of the four is not an
// ASCII hex digit, returns -1 and consumes nothing.
int Lexer::getUnicodeCharacter()
{
    int char1 = peek(1);
    int char2 = peek(2);
    int char3 = peek(3);

    if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
        return -1;

    int result = convertUnicode(m_current, char1, char2, char3);
    shift();
    shift();
    shift();
    shift();
    return result;
}

// Consumes the line terminator at the current position (treating a two
// character '\n'/'\r' pair as one terminator) and bumps the line number.
void Lexer::shiftLineTerminator()
{
    ASSERT(isLineTerminator(m_current));

    // A plain local; it was previously named m_prev, which wrongly suggested
    // a data member.
    int prev = m_current;
    shift();

    // Allow both CRLF and LFCR.
    // The sum test accepts the pair in either order with a single comparison.
    if (prev + m_current == '\n' + '\r')
        shift();

    ++m_lineNumber;
}

// Creates an Identifier for |length| code units at |characters| in the
// current identifier arena and returns a pointer to it.
ALWAYS_INLINE const Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
{
    return &m_arena->makeIdentifier(m_globalData, characters, length);
}

// True if the last token was one of continue, break, return or throw.
ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
{
    return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
}

// Non-ASCII identifier start: any character whose Unicode category is one of
// the letter categories listed below.
static NEVER_INLINE bool isNonASCIIIdentStart(int c)
{
    return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
}

// True if |c| may start an identifier: an ASCII letter, '$' or '_', or a
// non-ASCII character accepted by isNonASCIIIdentStart().
static inline bool isIdentStart(int c)
{
    return isASCII(c) ? isASCIIAlpha(c) || c == '$' || c == '_' : isNonASCIIIdentStart(c);
}

// Non-ASCII identifier part: the letter categories plus combining marks,
// decimal digits and connector punctuation.
static NEVER_INLINE bool isNonASCIIIdentPart(int c)
{
    return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
        | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
}

// True if |c| may continue an identifier: an ASCII letter or digit, '$' or
// '_', or a non-ASCII character accepted by isNonASCIIIdentPart().
static inline bool isIdentPart(int c)
{
    return isASCII(c) ? isASCIIAlphanumeric(c) || c == '$' || c == '_' : isNonASCIIIdentPart(c);
}

// Maps the character following a backslash to the control character it
// denotes (b, t, n, v, f, r); any other character is returned unchanged.
static inline int singleEscape(int c)
{
    switch (c) {
    case 'b':
        return 0x08;
    case 't':
        return 0x09;
    case 'n':
        return 0x0A;
    case 'v':
        return 0x0B;
    case 'f':
        return 0x0C;
    case 'r':
        return 0x0D;
    default:
        return c;
    }
}

// Appends |c| (which must fit in 8 bits) to the 8-bit scratch buffer.
inline void Lexer::record8(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= 0xFF);
    // NOTE(review): the cast's target type was lost in extraction; char is
    // assumed -- confirm against the element type of m_buffer8.
    m_buffer8.append(static_cast<char>(c));
}

// Appends |c| to the 16-bit scratch buffer.
inline void Lexer::record16(UChar c)
{
    m_buffer16.append(c);
}

// Appends |c| (which must be in [0, USHRT_MAX]) to the 16-bit scratch buffer.
inline void Lexer::record16(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= USHRT_MAX);
    record16(UChar(static_cast<unsigned short>(c)));
}

// Scans the next token. |p1| and |p2| are the parser's semantic-value and
// location slots, passed as void* and cast back to YYSTYPE* / YYLTYPE*.
// Returns 0 at end of input, unless a ';' is inserted automatically first.
int Lexer::lex(void* p1, void* p2)
{
    ASSERT(!m_error);
    ASSERT(m_buffer8.isEmpty());
    ASSERT(m_buffer16.isEmpty());

    YYSTYPE* lvalp = static_cast<YYSTYPE*>(p1);
    YYLTYPE* llocp = static_cast<YYLTYPE*>(p2);
    int token = 0;
    m_terminator = false;

start:
    while (isWhiteSpace(m_current))
        shift();

    int startOffset = currentOffset();

    if (UNLIKELY(m_current == -1)) {
        if (!m_terminator && !m_delimited && !m_isReparsing) {
            // automatic semicolon insertion if program incomplete
            token = ';';
            goto doneSemicolon;
        }
        return 0;
    }

    m_delimited = false;
    ASSERT(m_current >= 0);

    if (m_current < 128) {
        ASSERT(isASCII(m_current));

        switch (AsciiCharacters[m_current]) {
        case CharacterGreater:
            shift();
            if (m_current == '>') {
                shift();
                if (m_current == '>') {
                    shift();
                    if (m_current == '=') {
                        shift();
                        token = URSHIFTEQUAL;
                        break;
                    }
                    token = URSHIFT;
                    break;
                }
                if (m_current == '=') {
                    shift();
                    token = RSHIFTEQUAL;
                    break;
                }
                token = RSHIFT;
                break;
            }
            if (m_current == '=') {
                shift();
                token = GE;
                break;
            }
            token = '>';
            break;
        case CharacterEqual:
            shift();
            if (m_current == '=') {
                shift();
                if (m_current == '=') {
                    shift();
                    token = STREQ;
                    break;
                }
                token = EQEQ;
                break;
            }
            token = '=';
            break;
        case CharacterLess:
            shift();
            if (m_current == '!' && peek(1) == '-' && peek(2) == '-') { //