diff options
Diffstat (limited to 'WebCore/loader/TextResourceDecoder.cpp')
-rw-r--r-- | WebCore/loader/TextResourceDecoder.cpp | 152 |
1 files changed, 106 insertions, 46 deletions
diff --git a/WebCore/loader/TextResourceDecoder.cpp b/WebCore/loader/TextResourceDecoder.cpp index f37d8f7..ee81326 100644 --- a/WebCore/loader/TextResourceDecoder.cpp +++ b/WebCore/loader/TextResourceDecoder.cpp @@ -1,6 +1,6 @@ /* Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) - Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved. + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) This library is free software; you can redistribute it and/or @@ -26,6 +26,9 @@ #include "DOMImplementation.h" #include "HTMLNames.h" #include "TextCodec.h" +#include "TextEncoding.h" +#include "TextEncodingDetector.h" +#include "TextEncodingRegistry.h" #include <wtf/ASCIICType.h> #include <wtf/StringExtras.h> @@ -320,14 +323,17 @@ const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType return specifiedDefaultEncoding; } -TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding) +TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) : m_contentType(determineContentType(mimeType)) - , m_decoder(defaultEncoding(m_contentType, specifiedDefaultEncoding)) + , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) , m_source(DefaultEncoding) + , m_hintEncoding(0) , m_checkedForBOM(false) , m_checkedForCSSCharset(false) , m_checkedForHeadCharset(false) + , m_useLenientXMLDecoding(false) , m_sawError(false) + , m_usesEncodingDetector(usesEncodingDetector) { } @@ -344,12 +350,13 @@ void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSour // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR), // treat x-user-defined as windows-1252 (bug 18270) if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0) - m_decoder.reset("windows-1252"); + m_encoding = "windows-1252"; else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset) - m_decoder.reset(encoding.closestByteBasedEquivalent()); + m_encoding = encoding.closestByteBasedEquivalent(); else - m_decoder.reset(encoding); + m_encoding = encoding; + m_codec.clear(); m_source = source; } @@ -401,51 +408,54 @@ static inline bool skipWhitespace(const char*& pos, const char* dataEnd) return pos != dataEnd; } -void TextResourceDecoder::checkForBOM(const char* data, size_t len) +size_t TextResourceDecoder::checkForBOM(const char* data, size_t len) { // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. + // We let it override even a user-chosen encoding. + ASSERT(!m_checkedForBOM); - if (m_source == UserChosenEncoding) { - // FIXME: Maybe a BOM should override even a user-chosen encoding. - m_checkedForBOM = true; - return; - } + size_t lengthOfBOM = 0; - // Check if we have enough data. size_t bufferLength = m_buffer.size(); - if (bufferLength + len < 4) - return; - - m_checkedForBOM = true; - // Extract the first four bytes. - // Handle the case where some of bytes are already in the buffer. - // The last byte is always guaranteed to not be in the buffer. - const unsigned char* udata = reinterpret_cast<const unsigned char*>(data); - unsigned char c1 = bufferLength >= 1 ? m_buffer[0] : *udata++; - unsigned char c2 = bufferLength >= 2 ? m_buffer[1] : *udata++; - unsigned char c3 = bufferLength >= 3 ? m_buffer[2] : *udata++; - ASSERT(bufferLength < 4); - unsigned char c4 = *udata; + size_t buf1Len = bufferLength; + size_t buf2Len = len; + const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); + const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); + unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; + unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; + unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; + unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; // Check for the BOM. if (c1 == 0xFF && c2 == 0xFE) { - if (c3 !=0 || c4 != 0) + if (c3 != 0 || c4 != 0) { setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); - else + lengthOfBOM = 2; + } else { setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); - } - else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) + lengthOfBOM = 4; + } + } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { setEncoding(UTF8Encoding(), AutoDetectedEncoding); - else if (c1 == 0xFE && c2 == 0xFF) + lengthOfBOM = 3; + } else if (c1 == 0xFE && c2 == 0xFF) { setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); - else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) + lengthOfBOM = 2; + } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); + lengthOfBOM = 4; + } + + if (lengthOfBOM || bufferLength + len >= 4) + m_checkedForBOM = true; + + return lengthOfBOM; } bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer) { - if (m_source != DefaultEncoding) { + if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { m_checkedForCSSCharset = true; return true; } @@ -526,7 +536,7 @@ const int bytesToCheckUnconditionally = 1024; // That many input bytes will be c bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer) { - if (m_source != DefaultEncoding) { + if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { m_checkedForHeadCharset = true; return true; } @@ -636,7 +646,7 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool ptr++; continue; } - if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') + if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) ; else if (c >= 'A' && c <= 'Z') c += 'a' - 'A'; @@ -695,8 +705,8 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool break; if (str[pos++] != '=') continue; - while (pos < length && - (str[pos] <= ' ') || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'') + while ((pos < length) && + (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')) pos++; // end ? @@ -753,10 +763,28 @@ void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len) } } +// We use the encoding detector in two cases: +// 1. Encoding detector is turned ON and no other encoding source is +// available (that is, it's DefaultEncoding). +// 2. Encoding detector is turned ON and the encoding is set to +// the encoding of the parent frame, which is also auto-detected. +// Note that condition #2 is NOT satisfied unless parent-child frame +// relationship is compliant to the same-origin policy. If they're from +// different domains, |m_source| would not be set to EncodingFromParentFrame +// in the first place. +bool TextResourceDecoder::shouldAutoDetect() const +{ + // Just checking m_hintEncoding suffices here because it's only set + // in setHintEncoding when the source is AutoDetectedEncoding. + return m_usesEncodingDetector + && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); +} + String TextResourceDecoder::decode(const char* data, size_t len) { + size_t lengthOfBOM = 0; if (!m_checkedForBOM) - checkForBOM(data, len); + lengthOfBOM = checkForBOM(data, len); bool movedDataToBuffer = false; @@ -768,15 +796,32 @@ String TextResourceDecoder::decode(const char* data, size_t len) if (!checkForHeadCharset(data, len, movedDataToBuffer)) return ""; - // Do the auto-detect if our default encoding is one of the Japanese ones. - // FIXME: It seems wrong to change our encoding downstream after we have already done some decoding. - if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && encoding().isJapanese()) + // FIXME: It seems wrong to change our encoding downstream after + // we have already done some decoding. However, it's not possible + // to avoid in a sense in two cases below because triggering conditions + // for both cases depend on the information that won't be available + // until we do partial read. + // The first case had better be removed altogether (see bug 21990) + // or at least be made to be invoked only when the encoding detection + // is turned on. + // Do the auto-detect 1) using Japanese detector if our default encoding is + // one of the Japanese detector or 2) using detectTextEncoding if encoding + // detection is turned on. + if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && m_encoding.isJapanese()) detectJapaneseEncoding(data, len); + else if (shouldAutoDetect()) { + TextEncoding detectedEncoding; + if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) + setEncoding(detectedEncoding, AutoDetectedEncoding); + } + + ASSERT(m_encoding.isValid()); - ASSERT(encoding().isValid()); + if (!m_codec) + m_codec.set(newTextCodec(m_encoding).release()); if (m_buffer.isEmpty()) - return m_decoder.decode(data, len, false, m_contentType == XML, m_sawError); + return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError); if (!movedDataToBuffer) { size_t oldSize = m_buffer.size(); @@ -784,16 +829,31 @@ String TextResourceDecoder::decode(const char* data, size_t len) memcpy(m_buffer.data() + oldSize, data, len); } - String result = m_decoder.decode(m_buffer.data(), m_buffer.size(), false, m_contentType == XML, m_sawError); + String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); m_buffer.clear(); return result; } String TextResourceDecoder::flush() { - String result = m_decoder.decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML, m_sawError); + // If we can not identify the encoding even after a document is completely + // loaded, we need to detect the encoding if other conditions for + // autodetection is satisfied. + if (m_buffer.size() && shouldAutoDetect() + && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) { + TextEncoding detectedEncoding; + if (detectTextEncoding(m_buffer.data(), m_buffer.size(), + m_hintEncoding, &detectedEncoding)) + setEncoding(detectedEncoding, AutoDetectedEncoding); + } + + if (!m_codec) + m_codec.set(newTextCodec(m_encoding).release()); + + String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); m_buffer.clear(); - m_decoder.reset(m_decoder.encoding()); + m_codec.clear(); + m_checkedForBOM = false; // Skip BOM again when re-decoding. return result; } |