summaryrefslogtreecommitdiffstats
path: root/Source/WebCore/platform/text/TextEncoding.cpp
diff options
context:
space:
mode:
authorSteve Block <steveblock@google.com>2011-05-06 11:45:16 +0100
committerSteve Block <steveblock@google.com>2011-05-12 13:44:10 +0100
commitcad810f21b803229eb11403f9209855525a25d57 (patch)
tree29a6fd0279be608e0fe9ffe9841f722f0f4e4269 /Source/WebCore/platform/text/TextEncoding.cpp
parent121b0cf4517156d0ac5111caf9830c51b69bae8f (diff)
downloadexternal_webkit-cad810f21b803229eb11403f9209855525a25d57.zip
external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.gz
external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.bz2
Merge WebKit at r75315: Initial merge by git.
Change-Id: I570314b346ce101c935ed22a626b48c2af266b84
Diffstat (limited to 'Source/WebCore/platform/text/TextEncoding.cpp')
-rw-r--r--Source/WebCore/platform/text/TextEncoding.cpp265
1 files changed, 265 insertions, 0 deletions
diff --git a/Source/WebCore/platform/text/TextEncoding.cpp b/Source/WebCore/platform/text/TextEncoding.cpp
new file mode 100644
index 0000000..33313a0
--- /dev/null
+++ b/Source/WebCore/platform/text/TextEncoding.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ * Copyright (C) 2007-2009 Torch Mobile, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextEncoding.h"
+
+#include "PlatformString.h"
+#include "TextCodec.h"
+#include "TextEncodingRegistry.h"
+#if USE(ICU_UNICODE)
+#include <unicode/unorm.h>
+#elif USE(QT4_UNICODE)
+#include <QString>
+#elif USE(GLIB_UNICODE)
+#include <glib.h>
+#include "GOwnPtr.h"
+#endif
+#include <wtf/text/CString.h>
+#include <wtf/OwnPtr.h>
+#include <wtf/StdLibExtras.h>
+
+namespace WebCore {
+
+static const TextEncoding& UTF7Encoding()
+{
+ static TextEncoding globalUTF7Encoding("UTF-7");
+ return globalUTF7Encoding;
+}
+
+TextEncoding::TextEncoding(const char* name)
+ : m_name(atomicCanonicalTextEncodingName(name))
+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
+{
+}
+
+TextEncoding::TextEncoding(const String& name)
+ : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
+{
+}
+
+String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
+{
+ if (!m_name)
+ return String();
+
+ return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
+}
+
+CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
+{
+ if (!m_name)
+ return CString();
+
+ if (!length)
+ return "";
+
+#if USE(ICU_UNICODE)
+ // FIXME: What's the right place to do normalization?
+ // It's a little strange to do it inside the encode function.
+ // Perhaps normalization should be an explicit step done before calling encode.
+
+ const UChar* source = characters;
+ size_t sourceLength = length;
+
+ Vector<UChar> normalizedCharacters;
+
+ UErrorCode err = U_ZERO_ERROR;
+ if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
+ // First try using the length of the original string, since normalization to NFC rarely increases length.
+ normalizedCharacters.grow(sourceLength);
+ int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
+ if (err == U_BUFFER_OVERFLOW_ERROR) {
+ err = U_ZERO_ERROR;
+ normalizedCharacters.resize(normalizedLength);
+ normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
+ }
+ ASSERT(U_SUCCESS(err));
+
+ source = normalizedCharacters.data();
+ sourceLength = normalizedLength;
+ }
+ return newTextCodec(*this)->encode(source, sourceLength, handling);
+#elif USE(QT4_UNICODE)
+ QString str(reinterpret_cast<const QChar*>(characters), length);
+ str = str.normalized(QString::NormalizationForm_C);
+ return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
+#elif USE(GLIB_UNICODE)
+ GOwnPtr<char> UTF8Source;
+ UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
+ if (!UTF8Source) {
+ // If conversion to UTF-8 failed, try with the string without normalization
+ return newTextCodec(*this)->encode(characters, length, handling);
+ }
+
+ GOwnPtr<char> UTF8Normalized;
+ UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
+
+ long UTF16Length;
+ GOwnPtr<UChar> UTF16Normalized;
+ UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
+
+ return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
+#elif OS(WINCE)
+ // normalization will be done by Windows CE API
+ OwnPtr<TextCodec> textCodec = newTextCodec(*this);
+ return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
+#elif USE(BREWMP_UNICODE)
+ // FIXME: not sure if Brew MP normalizes the input string automatically
+ OwnPtr<TextCodec> textCodec = newTextCodec(*this);
+ return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
+#endif
+}
+
+const char* TextEncoding::domName() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return m_name;
+
+ // We treat EUC-KR as windows-949 (its superset), but need to expose
+ // the name 'EUC-KR' because the name 'windows-949' is not recognized by
+ // most Korean web servers even though they do use the encoding
+ // 'windows-949' with the name 'EUC-KR'.
+ // FIXME: This is not thread-safe. At the moment, this function is
+ // only accessed in a single thread, but eventually has to be made
+ // thread-safe along with usesVisualOrdering().
+ static const char* const a = atomicCanonicalTextEncodingName("windows-949");
+ if (m_name == a)
+ return "EUC-KR";
+ return m_name;
+}
+
+bool TextEncoding::usesVisualOrdering() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return false;
+
+ static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
+ return m_name == a;
+}
+
+bool TextEncoding::isJapanese() const
+{
+ return isJapaneseEncoding(m_name);
+}
+
+UChar TextEncoding::backslashAsCurrencySymbol() const
+{
+ return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
+}
+
+bool TextEncoding::isNonByteBasedEncoding() const
+{
+ if (noExtendedTextEncodingNameUsed()) {
+ return *this == UTF16LittleEndianEncoding()
+ || *this == UTF16BigEndianEncoding();
+ }
+
+ return *this == UTF16LittleEndianEncoding()
+ || *this == UTF16BigEndianEncoding()
+ || *this == UTF32BigEndianEncoding()
+ || *this == UTF32LittleEndianEncoding();
+}
+
+bool TextEncoding::isUTF7Encoding() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return false;
+
+ return *this == UTF7Encoding();
+}
+
+const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
+{
+ if (isNonByteBasedEncoding())
+ return UTF8Encoding();
+ return *this;
+}
+
+// HTML5 specifies that UTF-8 be used in form submission when a form is
+// is a part of a document in UTF-16 probably because UTF-16 is not a
+// byte-based encoding and can contain 0x00. By extension, the same
+// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
+// but it's fraught with problems and we'd rather steer clear of it.
+const TextEncoding& TextEncoding::encodingForFormSubmission() const
+{
+ if (isNonByteBasedEncoding() || isUTF7Encoding())
+ return UTF8Encoding();
+ return *this;
+}
+
+const TextEncoding& ASCIIEncoding()
+{
+ static TextEncoding globalASCIIEncoding("ASCII");
+ return globalASCIIEncoding;
+}
+
+const TextEncoding& Latin1Encoding()
+{
+ static TextEncoding globalLatin1Encoding("latin1");
+ return globalLatin1Encoding;
+}
+
+const TextEncoding& UTF16BigEndianEncoding()
+{
+ static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
+ return globalUTF16BigEndianEncoding;
+}
+
+const TextEncoding& UTF16LittleEndianEncoding()
+{
+ static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
+ return globalUTF16LittleEndianEncoding;
+}
+
+const TextEncoding& UTF32BigEndianEncoding()
+{
+ static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
+ return globalUTF32BigEndianEncoding;
+}
+
+const TextEncoding& UTF32LittleEndianEncoding()
+{
+ static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
+ return globalUTF32LittleEndianEncoding;
+}
+
+const TextEncoding& UTF8Encoding()
+{
+ static TextEncoding globalUTF8Encoding("UTF-8");
+ ASSERT(globalUTF8Encoding.isValid());
+ return globalUTF8Encoding;
+}
+
+const TextEncoding& WindowsLatin1Encoding()
+{
+ static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
+ return globalWindowsLatin1Encoding;
+}
+
+} // namespace WebCore