From cad810f21b803229eb11403f9209855525a25d57 Mon Sep 17 00:00:00 2001 From: Steve Block Date: Fri, 6 May 2011 11:45:16 +0100 Subject: Merge WebKit at r75315: Initial merge by git. Change-Id: I570314b346ce101c935ed22a626b48c2af266b84 --- .../WebCore/platform/text/TextEncodingRegistry.cpp | 402 +++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 Source/WebCore/platform/text/TextEncodingRegistry.cpp (limited to 'Source/WebCore/platform/text/TextEncodingRegistry.cpp') diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.cpp b/Source/WebCore/platform/text/TextEncodingRegistry.cpp new file mode 100644 index 0000000..c0c0255 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingRegistry.cpp @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2006, 2007 Apple Inc. All rights reserved. + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextEncodingRegistry.h" + +#include "PlatformString.h" +#include "TextCodecLatin1.h" +#include "TextCodecUserDefined.h" +#include "TextCodecUTF16.h" +#include "TextEncoding.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#if USE(ICU_UNICODE) +#include "TextCodecICU.h" +#endif +#if PLATFORM(MAC) +#include "TextCodecMac.h" +#endif +#if PLATFORM(QT) +#include "qt/TextCodecQt.h" +#endif +#if USE(GLIB_UNICODE) +#include "gtk/TextCodecGtk.h" +#endif +#if USE(BREWMP_UNICODE) +#include "brew/TextCodecBrew.h" +#endif +#if OS(WINCE) && !PLATFORM(QT) +#include "TextCodecWinCE.h" +#endif + +using namespace WTF; + +namespace WebCore { + +const size_t maxEncodingNameLength = 63; + +// Hash for all-ASCII strings that does case folding. +struct TextEncodingNameHash { + + static bool equal(const char* s1, const char* s2) + { + char c1; + char c2; + do { + c1 = *s1++; + c2 = *s2++; + if (toASCIILower(c1) != toASCIILower(c2)) + return false; + } while (c1 && c2); + return !c1 && !c2; + } + + // This algorithm is the one-at-a-time hash from: + // http://burtleburtle.net/bob/hash/hashfaq.html + // http://burtleburtle.net/bob/hash/doobs.html + static unsigned hash(const char* s) + { + unsigned h = WTF::stringHashingStartValue; + for (;;) { + char c = *s++; + if (!c) { + h += (h << 3); + h ^= (h >> 11); + h += (h << 15); + return h; + } + h += toASCIILower(c); + h += (h << 10); + h ^= (h >> 6); + } + } + + static const bool safeToCompareToEmptyOrDeleted = false; +}; + +struct TextCodecFactory { + NewTextCodecFunction function; + const void* additionalData; + TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } +}; + +typedef HashMap TextEncodingNameMap; +typedef HashMap TextCodecMap; + +static Mutex& encodingRegistryMutex() +{ + // We don't have to use AtomicallyInitializedStatic here because + // this function is called on the main thread for any page before + // it is used in worker threads. + DEFINE_STATIC_LOCAL(Mutex, mutex, ()); + return mutex; +} + +static TextEncodingNameMap* textEncodingNameMap; +static TextCodecMap* textCodecMap; +static bool didExtendTextCodecMaps; +static HashSet* japaneseEncodings; +static HashSet* nonBackslashEncodings; + +static const char* const textEncodingNameBlacklist[] = { + "UTF-7" +}; + +#if ERROR_DISABLED + +static inline void checkExistingName(const char*, const char*) { } + +#else + +static void checkExistingName(const char* alias, const char* atomicName) +{ + const char* oldAtomicName = textEncodingNameMap->get(alias); + if (!oldAtomicName) + return; + if (oldAtomicName == atomicName) + return; + // Keep the warning silent about one case where we know this will happen. + if (strcmp(alias, "ISO-8859-8-I") == 0 + && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 + && strcasecmp(atomicName, "iso-8859-8") == 0) + return; + LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); +} + +#endif + +static bool isUndesiredAlias(const char* alias) +{ + // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). + for (const char* p = alias; *p; ++p) { + if (*p == ',') + return true; + } + // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility + // problem, see bug 43554. + if (0 == strcmp(alias, "8859_1")) + return true; + return false; +} + +static void addToTextEncodingNameMap(const char* alias, const char* name) +{ + ASSERT(strlen(alias) <= maxEncodingNameLength); + if (isUndesiredAlias(alias)) + return; + const char* atomicName = textEncodingNameMap->get(name); + ASSERT(strcmp(alias, name) == 0 || atomicName); + if (!atomicName) + atomicName = name; + checkExistingName(alias, atomicName); + textEncodingNameMap->add(alias, atomicName); +} + +static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) +{ + const char* atomicName = textEncodingNameMap->get(name); + ASSERT(atomicName); + textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); +} + +static void pruneBlacklistedCodecs() +{ + for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { + const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); + if (!atomicName) + continue; + + Vector names; + TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); + TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); + for (; it != end; ++it) { + if (it->second == atomicName) + names.append(it->first); + } + + size_t length = names.size(); + for (size_t j = 0; j < length; ++j) + textEncodingNameMap->remove(names[j]); + + textCodecMap->remove(atomicName); + } +} + +static void buildBaseTextCodecMaps() +{ + ASSERT(isMainThread()); + ASSERT(!textCodecMap); + ASSERT(!textEncodingNameMap); + + textCodecMap = new TextCodecMap; + textEncodingNameMap = new TextEncodingNameMap; + + TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); + TextCodecLatin1::registerCodecs(addToTextCodecMap); + + TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); + TextCodecUTF16::registerCodecs(addToTextCodecMap); + + TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); + TextCodecUserDefined::registerCodecs(addToTextCodecMap); + +#if USE(ICU_UNICODE) + TextCodecICU::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecICU::registerBaseCodecs(addToTextCodecMap); +#endif + +#if USE(GLIB_UNICODE) + TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecGtk::registerBaseCodecs(addToTextCodecMap); +#endif + +#if USE(BREWMP_UNICODE) + TextCodecBrew::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecBrew::registerBaseCodecs(addToTextCodecMap); +#endif + +#if OS(WINCE) && !PLATFORM(QT) + TextCodecWinCE::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecWinCE::registerBaseCodecs(addToTextCodecMap); +#endif +} + +static void addEncodingName(HashSet* set, const char* name) +{ + // We must not use atomicCanonicalTextEncodingName() because this function is called in it. + const char* atomicName = textEncodingNameMap->get(name); + if (atomicName) + set->add(atomicName); +} + +static void buildQuirksSets() +{ + // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() + // and initializing the sets for them in TextEncodingRegistry.cpp look strange. + + ASSERT(!japaneseEncodings); + ASSERT(!nonBackslashEncodings); + + japaneseEncodings = new HashSet(); + addEncodingName(japaneseEncodings, "EUC-JP"); + addEncodingName(japaneseEncodings, "ISO-2022-JP"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); + addEncodingName(japaneseEncodings, "JIS_C6226-1978"); + addEncodingName(japaneseEncodings, "JIS_X0201"); + addEncodingName(japaneseEncodings, "JIS_X0208-1983"); + addEncodingName(japaneseEncodings, "JIS_X0208-1990"); + addEncodingName(japaneseEncodings, "JIS_X0212-1990"); + addEncodingName(japaneseEncodings, "Shift_JIS"); + addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); + addEncodingName(japaneseEncodings, "cp932"); + addEncodingName(japaneseEncodings, "x-mac-japanese"); + + nonBackslashEncodings = new HashSet(); + // The text encodings below treat backslash as a currency symbol for IE compatibility. + // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. + addEncodingName(nonBackslashEncodings, "x-mac-japanese"); + addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); + addEncodingName(nonBackslashEncodings, "EUC-JP"); + // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. + addEncodingName(nonBackslashEncodings, "Shift_JIS"); + addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); +} + +bool isJapaneseEncoding(const char* canonicalEncodingName) +{ + return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); +} + +bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) +{ + return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); +} + +static void extendTextCodecMaps() +{ +#if USE(ICU_UNICODE) + TextCodecICU::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecICU::registerExtendedCodecs(addToTextCodecMap); +#endif + +#if USE(QT4_UNICODE) + TextCodecQt::registerEncodingNames(addToTextEncodingNameMap); + TextCodecQt::registerCodecs(addToTextCodecMap); +#endif + +#if PLATFORM(MAC) + TextCodecMac::registerEncodingNames(addToTextEncodingNameMap); + TextCodecMac::registerCodecs(addToTextCodecMap); +#endif + +#if USE(GLIB_UNICODE) + TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecGtk::registerExtendedCodecs(addToTextCodecMap); +#endif + +#if OS(WINCE) && !PLATFORM(QT) + TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap); +#endif + + pruneBlacklistedCodecs(); + buildQuirksSets(); +} + +PassOwnPtr newTextCodec(const TextEncoding& encoding) +{ + MutexLocker lock(encodingRegistryMutex()); + + ASSERT(textCodecMap); + TextCodecFactory factory = textCodecMap->get(encoding.name()); + ASSERT(factory.function); + return factory.function(encoding, factory.additionalData); +} + +const char* atomicCanonicalTextEncodingName(const char* name) +{ + if (!name || !name[0]) + return 0; + if (!textEncodingNameMap) + buildBaseTextCodecMaps(); + + MutexLocker lock(encodingRegistryMutex()); + + if (const char* atomicName = textEncodingNameMap->get(name)) + return atomicName; + if (didExtendTextCodecMaps) + return 0; + extendTextCodecMaps(); + didExtendTextCodecMaps = true; + return textEncodingNameMap->get(name); +} + +const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length) +{ + char buffer[maxEncodingNameLength + 1]; + size_t j = 0; + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + if (j == maxEncodingNameLength) + return 0; + buffer[j++] = c; + } + buffer[j] = 0; + return atomicCanonicalTextEncodingName(buffer); +} + +bool noExtendedTextEncodingNameUsed() +{ + // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. + return !didExtendTextCodecMaps; +} + +#ifndef NDEBUG +void dumpTextEncodingNameMap() +{ + unsigned size = textEncodingNameMap->size(); + fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size); + + MutexLocker lock(encodingRegistryMutex()); + + TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); + TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); + for (; it != end; ++it) + fprintf(stderr, "'%s' => '%s'\n", it->first, it->second); +} +#endif + +} // namespace WebCore -- cgit v1.1