/* Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 1999 Lars Knoll (knoll@kde.org) (C) 1999 Antti Koivisto (koivisto@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com) Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "config.h" #include "HTMLTokenizer.h" #include "CSSHelper.h" #include "Cache.h" #include "CachedScript.h" #include "DocLoader.h" #include "DocumentFragment.h" #include "EventNames.h" #include "Frame.h" #include "FrameLoader.h" #include "FrameView.h" #include "HTMLElement.h" #include "HTMLNames.h" #include "HTMLParser.h" #include "HTMLScriptElement.h" #include "HTMLViewSourceDocument.h" #include "MappedAttribute.h" #include "Page.h" #include "PreloadScanner.h" #include "ScriptController.h" #include "ScriptSourceCode.h" #include "ScriptValue.h" #include "XSSAuditor.h" #include #include #include "HTMLEntityNames.c" #ifdef ANDROID_INSTRUMENT #include "TimeCounter.h" #endif #define PRELOAD_SCANNER_ENABLED 1 // #define INSTRUMENT_LAYOUT_SCHEDULING 1 using namespace WTF; using namespace std; namespace WebCore { using namespace HTMLNames; #if MOBILE // The mobile device needs to be responsive, as such the tokenizer chunk size is reduced. // This value is used to define how many characters the tokenizer will process before // yeilding control. static const int defaultTokenizerChunkSize = 256; #else static const int defaultTokenizerChunkSize = 4096; #endif #if MOBILE // As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise // it will take way to long to load a page. static const double defaultTokenizerTimeDelay = 0.300; #else // FIXME: We would like this constant to be 200ms. // Yielding more aggressively results in increased responsiveness and better incremental rendering. // It slows down overall page-load on slower machines, though, so for now we set a value of 500. static const double defaultTokenizerTimeDelay = 0.500; #endif static const char commentStart [] = " as a close comment, even though it's // not technically valid. 
endCharsCount = 4; } if (handleBrokenComments || endCharsCount > 1) { src.advancePastNonNewline(); if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) { checkScriptBuffer(); m_scriptCode[m_scriptCodeSize] = 0; m_scriptCode[m_scriptCodeSize + 1] = 0; m_currentToken.tagName = commentAtom; m_currentToken.beginTag = true; state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state); processToken(); m_currentToken.tagName = commentAtom; m_currentToken.beginTag = false; processToken(); m_scriptCodeSize = 0; } state.setInComment(false); return state; // Finished parsing comment } } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state) { checkScriptBuffer(src.length()); while (!src.isEmpty()) { UChar ch = *src; m_scriptCode[m_scriptCodeSize++] = ch; if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') { src.advancePastNonNewline(); state.setInServer(false); m_scriptCodeSize = 0; return state; // Finished parsing server include } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state) { UChar oldchar = 0; while (!src.isEmpty()) { UChar chbegin = *src; if (chbegin == '\'') tquote = tquote == SingleQuote ? NoQuote : SingleQuote; else if (chbegin == '\"') tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; // Look for '?>' // Some crappy sites omit the "?" before it, so // we look for an unquoted '>' instead. (IE compatible) else if (chbegin == '>' && (!tquote || oldchar == '?')) { // We got a '?>' sequence state.setInProcessingInstruction(false); src.advancePastNonNewline(); state.setDiscardLF(true); return state; // Finished parsing comment! 
}
        src.advance(m_lineNumber);
        oldchar = chbegin; // remembered so the next iteration can recognise "?>"
    }
    return state;
}

// Copies the remaining input to the output position m_dest, normalising line
// endings on the way: a '\r' is written as '\n', and a '\n' seen while the
// skip-LF flag is set (i.e. right after such a '\r') is consumed without
// being written.
HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
{
    while (!src.isEmpty()) {
        UChar cc = *src;

        if (state.skipLF()) {
            // The flag only applies to the very next character.
            state.setSkipLF(false);
            if (cc == '\n') {
                src.advancePastNewline(m_lineNumber);
                continue;
            }
        }

        // do we need to enlarge the buffer?
        checkBuffer();

        if (cc == '\r') {
            state.setSkipLF(true);
            *m_dest++ = '\n';
        } else
            *m_dest++ = cc;
        src.advance(m_lineNumber);
    }

    return state;
}

// Parses a character reference: a named entity, "#" + decimal digits, or
// "#x"/"#X" + hex digits.
//
// src          input; consumed characters are also recorded in m_cBuffer.
// dest         output cursor; only written when the raw text is emitted
//              (view-source mode, or an unrecognised/out-of-range entity).
// state        tokenizer state; its entity sub-state drives the switch below.
// cBufferPos   number of characters recorded in m_cBuffer so far.
// start        when true, resets cBufferPos, EntityUnicodeValue and the
//              entity sub-state before parsing.
// parsingTag   enables the IE-compatibility rule in the EntityName case.
//
// Returns the updated state. If src runs out before the reference is
// resolved, the entity sub-state is left as-is in the returned state
// (presumably so a later call with start == false can resume -- confirm
// against callers).
HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
{
    if (start) {
        cBufferPos = 0;
        state.setEntityState(SearchEntity);
        EntityUnicodeValue = 0;
    }

    while (!src.isEmpty()) {
        UChar cc = *src;
        switch (state.entityState()) {
        case NoEntity:
            // Should never be entered with no entity in progress.
            ASSERT(state.entityState() != NoEntity);
            return state;

        case SearchEntity:
            // '#' introduces a numeric reference; anything else is treated as
            // the first character of a name (and is not consumed here).
            if (cc == '#') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(NumericSearch);
            } else
                state.setEntityState(EntityName);
            break;

        case NumericSearch:
            if (cc == 'x' || cc == 'X') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(Hexadecimal);
            } else if (cc >= '0' && cc <= '9')
                state.setEntityState(Decimal);
            else
                state.setEntityState(SearchSemicolon);
            break;

        case Hexadecimal: {
            // Consume hex digits until m_cBuffer holds 10 characters in total
            // or the available input runs out.
            int ll = min(src.length(), 10 - cBufferPos);
            while (ll--) {
                cc = *src;
                if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                int digit;
                if (cc < 'A')
                    digit = cc - '0';
                else
                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal: {
            // Same as above with a 9-character cap on m_cBuffer.
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;
                if (!(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName: {
            // Collect up to 9 ASCII alphanumeric characters.
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;
                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            if (state.entityState() == SearchSemicolon) {
                if (cBufferPos > 1) {
                    // The name is capped at 9 characters, so a 10-element
                    // stack buffer is enough. A character above 0x7e stops
                    // the copy early, which makes the lengths differ below
                    // and skips the table lookup (illegal entity name).
                    unsigned testedEntityNameLen = 0;
                    char tmpEntityNameBuffer[10];
                    ASSERT(cBufferPos < 10);
                    for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
                        if (m_cBuffer[testedEntityNameLen] > 0x7e)
                            break;
                        tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
                    }
                    const Entity *e;
                    if (testedEntityNameLen == cBufferPos)
                        e = findEntity(tmpEntityNameBuffer, cBufferPos);
                    else
                        e = 0;
                    if (e)
                        EntityUnicodeValue = e->code;
                    // be IE compatible: inside a tag, an entity above 255 is
                    // only honoured when it is terminated by ';'.
                    // NOTE(review): when the 9-character cap ended the loop
                    // above, src may already be empty here; confirm that
                    // dereferencing an empty SegmentedString is safe.
                    if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            } else
                break;
            // Deliberate fall through: the name is complete, resolve it now.
        }
        case SearchSemicolon:
            // Don't allow values that are more than 21 bits.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (!inViewSourceMode()) {
                    // NOTE(review): same possibly-empty src dereference as in
                    // the EntityName case -- confirm.
                    if (*src == ';')
                        src.advancePastNonNewline();
                    // The decoded character is pushed back onto src rather
                    // than written to dest.
                    if (EntityUnicodeValue <= 0xFFFF) {
                        checkBuffer();
                        src.push(fixUpChar(EntityUnicodeValue));
                    } else {
                        // Convert to UTF-16, using surrogate code points.
                        checkBuffer(2);
                        src.push(U16_LEAD(EntityUnicodeValue));
                        src.push(U16_TRAIL(EntityUnicodeValue));
                    }
                } else {
                    // FIXME: We should eventually colorize entities by sending them as a special token.
                    // 12 bytes required: up to 10 bytes in m_cBuffer plus the
                    // leading '&' and trailing ';'
                    checkBuffer(12);
                    *dest++ = '&';
                    for (unsigned i = 0; i < cBufferPos; i++)
                        dest[i] = m_cBuffer[i];
                    dest += cBufferPos;
                    if (*src == ';') {
                        *dest++ = ';';
                        src.advancePastNonNewline();
                    }
                }
            } else {
                // 11 bytes required: up to 10 bytes in m_cBuffer plus the
                // leading '&'
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for (unsigned i = 0; i < cBufferPos; i++)
                    dest[i] = m_cBuffer[i];
                dest += cBufferPos;
            }

            state.setEntityState(NoEntity);
            return state;
        }
    }

    return state;
}

// Consumes a DOCTYPE declaration one character at a time, driven by
// m_doctypeToken.state(), until a '>' clears state.inDoctype() or src is
// exhausted. In view-source mode every consumed character other than the
// terminating '>' is also appended to m_doctypeToken.m_source.
HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
{
    ASSERT(state.inDoctype());
    while (!src.isEmpty() && state.inDoctype()) {
        UChar c = *src;
        bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
        switch (m_doctypeToken.state()) {
        case DoctypeBegin: {
            // Always moves on to DoctypeBeforeName; a whitespace character is
            // consumed here, anything else is left for the next state.
            m_doctypeToken.setState(DoctypeBeforeName);
            if (isWhitespace) {
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            }
            break;
        }
        case DoctypeBeforeName: {
            if (c == '>') {
                // Malformed. Just exit.
                src.advancePastNonNewline();
                state.setInDoctype(false);
                if (inViewSourceMode())
                    processDoctypeToken();
            } else if (isWhitespace) {
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            } else
                m_doctypeToken.setState(DoctypeName);
            break;
        }
        case DoctypeName: {
            if (c == '>') {
                // Valid doctype. Emit it.
src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (isWhitespace) { m_doctypeSearchCount = 0; // Used now to scan for PUBLIC m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM m_doctypeToken.setState(DoctypeAfterName); src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else { src.advancePastNonNewline(); m_doctypeToken.m_name.append(c); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeAfterName: { if (c == '>') { // Valid doctype. Emit it. src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (!isWhitespace) { src.advancePastNonNewline(); if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) { m_doctypeSearchCount++; if (m_doctypeSearchCount == 6) // Found 'PUBLIC' sequence m_doctypeToken.setState(DoctypeBeforePublicID); } else if (m_doctypeSearchCount > 0) { m_doctypeSearchCount = 0; m_doctypeToken.setState(DoctypeBogus); } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) { m_doctypeSecondarySearchCount++; if (m_doctypeSecondarySearchCount == 6) // Found 'SYSTEM' sequence m_doctypeToken.setState(DoctypeBeforeSystemID); } else { m_doctypeSecondarySearchCount = 0; m_doctypeToken.setState(DoctypeBogus); } if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else { src.advance(m_lineNumber); // Whitespace keeps us in the after name state. if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeBeforePublicID: { if (c == '\"' || c == '\'') { tquote = c == '\"' ? DoubleQuote : SingleQuote; m_doctypeToken.setState(DoctypePublicID); src.advancePastNonNewline(); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Considered bogus. Don't process the doctype. 
src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeBogus); break; } case DoctypePublicID: { if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { src.advancePastNonNewline(); m_doctypeToken.setState(DoctypeAfterPublicID); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Considered bogus. Don't process the doctype. src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else { m_doctypeToken.m_publicID.append(c); src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeAfterPublicID: if (c == '\"' || c == '\'') { tquote = c == '\"' ? DoubleQuote : SingleQuote; m_doctypeToken.setState(DoctypeSystemID); src.advancePastNonNewline(); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Valid doctype. Emit it now. src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeBogus); break; case DoctypeBeforeSystemID: if (c == '\"' || c == '\'') { tquote = c == '\"' ? DoubleQuote : SingleQuote; m_doctypeToken.setState(DoctypeSystemID); src.advancePastNonNewline(); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Considered bogus. Don't process the doctype. 
src.advancePastNonNewline(); state.setInDoctype(false); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeBogus); break; case DoctypeSystemID: if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { src.advancePastNonNewline(); m_doctypeToken.setState(DoctypeAfterSystemID); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else if (c == '>') { // Considered bogus. Don't process the doctype. src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else { m_doctypeToken.m_systemID.append(c); src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; case DoctypeAfterSystemID: if (c == '>') { // Valid doctype. Emit it now. src.advancePastNonNewline(); state.setInDoctype(false); processDoctypeToken(); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeBogus); break; case DoctypeBogus: if (c == '>') { // Done with the bogus doctype. src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else { src.advance(m_lineNumber); // Just keep scanning for '>' if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; default: break; } } return state; } HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state) { ASSERT(!state.hasEntityState()); unsigned cBufferPos = m_cBufferPos; bool lastIsSlash = false; while (!src.isEmpty()) { checkBuffer(); switch (state.tagState()) { case NoTag: { m_cBufferPos = cBufferPos; return state; } case TagName: { if (searchCount > 0) { if (*src == commentStart[searchCount]) { searchCount++; if (searchCount == 2) m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well. 
else m_doctypeSearchCount = 0; if (searchCount == 4) { // Found ' as a valid comment, since both mozilla and IE on windows // can handle this case. Only do this in quirks mode. -dwh if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) { state.setInComment(false); src.advancePastNonNewline(); if (!src.isEmpty()) m_cBuffer[cBufferPos++] = *src; } else state = parseComment(src, state); m_cBufferPos = cBufferPos; return state; // Finished parsing tag! } m_cBuffer[cBufferPos++] = *src; src.advancePastNonNewline(); break; } else searchCount = 0; // Stop looking for ' or searchCount = 1; // Look for '