summaryrefslogtreecommitdiffstats
path: root/WebCore/loader/TextResourceDecoder.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/loader/TextResourceDecoder.cpp')
-rw-r--r--WebCore/loader/TextResourceDecoder.cpp152
1 files changed, 106 insertions, 46 deletions
diff --git a/WebCore/loader/TextResourceDecoder.cpp b/WebCore/loader/TextResourceDecoder.cpp
index f37d8f7..ee81326 100644
--- a/WebCore/loader/TextResourceDecoder.cpp
+++ b/WebCore/loader/TextResourceDecoder.cpp
@@ -1,6 +1,6 @@
/*
Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
This library is free software; you can redistribute it and/or
@@ -26,6 +26,9 @@
#include "DOMImplementation.h"
#include "HTMLNames.h"
#include "TextCodec.h"
+#include "TextEncoding.h"
+#include "TextEncodingDetector.h"
+#include "TextEncodingRegistry.h"
#include <wtf/ASCIICType.h>
#include <wtf/StringExtras.h>
@@ -320,14 +323,17 @@ const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType
return specifiedDefaultEncoding;
}
-TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding)
+TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
: m_contentType(determineContentType(mimeType))
- , m_decoder(defaultEncoding(m_contentType, specifiedDefaultEncoding))
+ , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
, m_source(DefaultEncoding)
+ , m_hintEncoding(0)
, m_checkedForBOM(false)
, m_checkedForCSSCharset(false)
, m_checkedForHeadCharset(false)
+ , m_useLenientXMLDecoding(false)
, m_sawError(false)
+ , m_usesEncodingDetector(usesEncodingDetector)
{
}
@@ -344,12 +350,13 @@ void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSour
// When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
// treat x-user-defined as windows-1252 (bug 18270)
if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
- m_decoder.reset("windows-1252");
+ m_encoding = "windows-1252";
else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
- m_decoder.reset(encoding.closestByteBasedEquivalent());
+ m_encoding = encoding.closestByteBasedEquivalent();
else
- m_decoder.reset(encoding);
+ m_encoding = encoding;
+ m_codec.clear();
m_source = source;
}
@@ -401,51 +408,54 @@ static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
return pos != dataEnd;
}
-void TextResourceDecoder::checkForBOM(const char* data, size_t len)
+size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
{
// Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
+ // We let it override even a user-chosen encoding.
+ ASSERT(!m_checkedForBOM);
- if (m_source == UserChosenEncoding) {
- // FIXME: Maybe a BOM should override even a user-chosen encoding.
- m_checkedForBOM = true;
- return;
- }
+ size_t lengthOfBOM = 0;
- // Check if we have enough data.
size_t bufferLength = m_buffer.size();
- if (bufferLength + len < 4)
- return;
-
- m_checkedForBOM = true;
- // Extract the first four bytes.
- // Handle the case where some of bytes are already in the buffer.
- // The last byte is always guaranteed to not be in the buffer.
- const unsigned char* udata = reinterpret_cast<const unsigned char*>(data);
- unsigned char c1 = bufferLength >= 1 ? m_buffer[0] : *udata++;
- unsigned char c2 = bufferLength >= 2 ? m_buffer[1] : *udata++;
- unsigned char c3 = bufferLength >= 3 ? m_buffer[2] : *udata++;
- ASSERT(bufferLength < 4);
- unsigned char c4 = *udata;
+ size_t buf1Len = bufferLength;
+ size_t buf2Len = len;
+ const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
+ const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
+ unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+ unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+ unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
+ unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
// Check for the BOM.
if (c1 == 0xFF && c2 == 0xFE) {
- if (c3 !=0 || c4 != 0)
+ if (c3 != 0 || c4 != 0) {
setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
- else
+ lengthOfBOM = 2;
+ } else {
setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
- }
- else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
+ lengthOfBOM = 4;
+ }
+ } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
setEncoding(UTF8Encoding(), AutoDetectedEncoding);
- else if (c1 == 0xFE && c2 == 0xFF)
+ lengthOfBOM = 3;
+ } else if (c1 == 0xFE && c2 == 0xFF) {
setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
- else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
+ lengthOfBOM = 2;
+ } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
+ lengthOfBOM = 4;
+ }
+
+ if (lengthOfBOM || bufferLength + len >= 4)
+ m_checkedForBOM = true;
+
+ return lengthOfBOM;
}
bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
{
- if (m_source != DefaultEncoding) {
+ if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
m_checkedForCSSCharset = true;
return true;
}
@@ -526,7 +536,7 @@ const int bytesToCheckUnconditionally = 1024; // That many input bytes will be c
bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
{
- if (m_source != DefaultEncoding) {
+ if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
m_checkedForHeadCharset = true;
return true;
}
@@ -636,7 +646,7 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool
ptr++;
continue;
}
- if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9')
+ if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
;
else if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
@@ -695,8 +705,8 @@ bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool
break;
if (str[pos++] != '=')
continue;
- while (pos < length &&
- (str[pos] <= ' ') || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
+ while ((pos < length) &&
+ (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
pos++;
// end ?
@@ -753,10 +763,28 @@ void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
}
}
+// We use the encoding detector in two cases:
+// 1. Encoding detector is turned ON and no other encoding source is
+// available (that is, it's DefaultEncoding).
+// 2. Encoding detector is turned ON and the encoding is set to
+// the encoding of the parent frame, which is also auto-detected.
+// Note that condition #2 is NOT satisfied unless the parent-child frame
+// relationship complies with the same-origin policy. If they're from
+// different domains, |m_source| would not be set to EncodingFromParentFrame
+// in the first place.
+bool TextResourceDecoder::shouldAutoDetect() const
+{
+ // Just checking m_hintEncoding suffices here because it's only set
+ // in setHintEncoding when the source is AutoDetectedEncoding.
+ return m_usesEncodingDetector
+ && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
+}
+
String TextResourceDecoder::decode(const char* data, size_t len)
{
+ size_t lengthOfBOM = 0;
if (!m_checkedForBOM)
- checkForBOM(data, len);
+ lengthOfBOM = checkForBOM(data, len);
bool movedDataToBuffer = false;
@@ -768,15 +796,32 @@ String TextResourceDecoder::decode(const char* data, size_t len)
if (!checkForHeadCharset(data, len, movedDataToBuffer))
return "";
- // Do the auto-detect if our default encoding is one of the Japanese ones.
- // FIXME: It seems wrong to change our encoding downstream after we have already done some decoding.
- if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && encoding().isJapanese())
+ // FIXME: It seems wrong to change our encoding downstream after
+ // we have already done some decoding. However, in the two cases below
+ // it cannot be avoided, because the triggering conditions for both
+ // depend on information that is not available until we have done a
+ // partial read.
+ // The first case had better be removed altogether (see bug 21990)
+ // or at least be made to be invoked only when the encoding detection
+ // is turned on.
+ // Do the auto-detect 1) using the Japanese detector if our default encoding
+ // is one of the Japanese encodings or 2) using detectTextEncoding if
+ // encoding detection is turned on.
+ if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && m_encoding.isJapanese())
detectJapaneseEncoding(data, len);
+ else if (shouldAutoDetect()) {
+ TextEncoding detectedEncoding;
+ if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
+ setEncoding(detectedEncoding, AutoDetectedEncoding);
+ }
+
+ ASSERT(m_encoding.isValid());
- ASSERT(encoding().isValid());
+ if (!m_codec)
+ m_codec.set(newTextCodec(m_encoding).release());
if (m_buffer.isEmpty())
- return m_decoder.decode(data, len, false, m_contentType == XML, m_sawError);
+ return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
if (!movedDataToBuffer) {
size_t oldSize = m_buffer.size();
@@ -784,16 +829,31 @@ String TextResourceDecoder::decode(const char* data, size_t len)
memcpy(m_buffer.data() + oldSize, data, len);
}
- String result = m_decoder.decode(m_buffer.data(), m_buffer.size(), false, m_contentType == XML, m_sawError);
+ String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
m_buffer.clear();
return result;
}
String TextResourceDecoder::flush()
{
- String result = m_decoder.decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML, m_sawError);
+ // If we cannot identify the encoding even after a document is completely
+ // loaded, we need to detect the encoding if the other conditions for
+ // autodetection are satisfied.
+ if (m_buffer.size() && shouldAutoDetect()
+ && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
+ TextEncoding detectedEncoding;
+ if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
+ m_hintEncoding, &detectedEncoding))
+ setEncoding(detectedEncoding, AutoDetectedEncoding);
+ }
+
+ if (!m_codec)
+ m_codec.set(newTextCodec(m_encoding).release());
+
+ String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
m_buffer.clear();
- m_decoder.reset(m_decoder.encoding());
+ m_codec.clear();
+ m_checkedForBOM = false; // Skip BOM again when re-decoding.
return result;
}