summaryrefslogtreecommitdiffstats
path: root/WebCore/platform/text/wince/TextCodecWinCE.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/platform/text/wince/TextCodecWinCE.cpp')
-rw-r--r--WebCore/platform/text/wince/TextCodecWinCE.cpp483
1 files changed, 483 insertions, 0 deletions
diff --git a/WebCore/platform/text/wince/TextCodecWinCE.cpp b/WebCore/platform/text/wince/TextCodecWinCE.cpp
new file mode 100644
index 0000000..644b12f
--- /dev/null
+++ b/WebCore/platform/text/wince/TextCodecWinCE.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * This library is distributed in the hope that i will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+#include "TextCodecWinCE.h"
+
+#include "ce_textcodecs.h"
+#include "FontCache.h"
+#include "PlatformString.h"
+#include "StringHash.h"
+#include <mlang.h>
+#include <winbase.h>
+#include <winnls.h>
+#include <wtf/HashMap.h>
+#include <wtf/HashSet.h>
+#include <wtf/text/CString.h>
+#include <wtf/unicode/UTF8.h>
+
+namespace WebCore {
+
+struct CharsetInfo {
+ CString m_name;
+ String m_friendlyName;
+ UINT m_codePage;
+ Vector<CString> m_aliases;
+ bool m_usesNativeCodec;
+};
+
+class LanguageManager {
+private:
+ LanguageManager();
+
+ friend LanguageManager& languageManager();
+};
+
+// Usage: a lookup table used to get CharsetInfo with code page ID.
+// Key: code page ID. Value: charset information.
+static HashMap<UINT, CString>& codePageCharsets()
+{
+ static HashMap<UINT, CString> cc;
+ return cc;
+}
+
+static HashMap<String, CharsetInfo>& knownCharsets()
+{
+ static HashMap<String, CharsetInfo> kc;
+ return kc;
+}
+
+// Usage: a map that stores charsets that are supported by system. Sorted by name.
+// Key: charset. Value: code page ID.
+typedef HashSet<String> CharsetSet;
+static CharsetSet& supportedCharsets()
+{
+ static CharsetSet sl;
+ return sl;
+}
+
+static LanguageManager& languageManager()
+{
+ static LanguageManager lm;
+ return lm;
+}
+
+static void addCharset(UINT codePage, const char* charsetName, const wchar_t* friendlyName, const char* charsetAliases, bool nativeSupport = false)
+{
+ CharsetInfo info;
+ info.m_codePage = codePage;
+ info.m_name = charsetName;
+ info.m_friendlyName = friendlyName;
+ info.m_usesNativeCodec = nativeSupport;
+ const char* begin = charsetAliases;
+ for (;;) {
+ const char* end = strchr(begin, '|');
+ CString alias = end ? CString(begin, end - begin) : begin;
+ if (alias.length())
+ info.m_aliases.append(alias);
+ if (!end)
+ break;
+ begin = end + 1;
+ }
+ knownCharsets().set(info.m_name.data(), info);
+ if (codePage != CP_ACP)
+ codePageCharsets().set(codePage, info.m_name);
+}
+
+LanguageManager::LanguageManager()
+{
+ // 437, 708, 709, 710, 720, 737, 775, 850, 852
+ addCharset(932, "SHIFT_JIS", L"Japanese (SHIFT_JIS)", "shift_jis");
+ addCharset(936, "GBK", L"Chinese Simplified (GBK)", "gbk|gb2312");
+ addCharset(949, "KSC5601", L"Korean (KSC5601)", "ks_c_5601-1987|ksc5601|euc-kr|euckr|x-euc-kr");
+ addCharset(950, "BIG5", L"Chinese Traditional (BIG5)", "big5");
+ addCharset(1361, "JOHAB", L"Korean (Johab)", "johab|korean.johab");
+ addCharset(51932, "EUC-JP", L"Japanese (EUC)", "euc-jp|eucjp|x-euc-jp", true);
+ addCharset(874, "CP874", L"Thai (Windows)", "cp874|windows-874", true);
+ addCharset(CP_ACP, "TIS620", L"Thai (TIS 620)", "tis620|ISO-8859-11|ISO-IR-166|TIS-620|TIS620-0TIS620.2529-1|TIS620.2533-0|TIS620.2533-1|thai8", true);
+ addCharset(CP_ACP, "MACTHAI", L"Thai (Mac OS)", "macthai|x-mac-thai|mac-thai", true);
+ supportedCharsets().add("EUC-JP");
+ supportedCharsets().add("CP874");
+ supportedCharsets().add("TIS620");
+ supportedCharsets().add("MACTHAI");
+
+ IEnumCodePage* enumInterface;
+ IMultiLanguage* mli = FontCache::getMultiLanguageInterface();
+ if (mli && S_OK == mli->EnumCodePages(MIMECONTF_BROWSER, &enumInterface)) {
+ MIMECPINFO cpInfo;
+ ULONG ccpInfo;
+ while (S_OK == enumInterface->Next(1, &cpInfo, &ccpInfo) && ccpInfo) {
+ if (!IsValidCodePage(cpInfo.uiCodePage))
+ continue;
+
+ HashMap<UINT, CString>::iterator i = codePageCharsets().find(cpInfo.uiCodePage);
+
+ CString name(String(cpInfo.wszWebCharset).latin1());
+ if (i == codePageCharsets().end()) {
+ CharsetInfo info;
+ info.m_codePage = cpInfo.uiCodePage;
+ knownCharsets().set(name.data(), info);
+ i = codePageCharsets().set(cpInfo.uiCodePage, name).first;
+ }
+ if (i != codePageCharsets().end()) {
+ HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(String(i->second.data(), i->second.length()));
+ ASSERT(j != knownCharsets().end());
+ CharsetInfo& info = j->second;
+ info.m_name = i->second.data();
+ info.m_friendlyName = cpInfo.wszDescription;
+ info.m_aliases.append(name);
+ info.m_aliases.append(String(cpInfo.wszHeaderCharset).latin1());
+ info.m_aliases.append(String(cpInfo.wszBodyCharset).latin1());
+ info.m_usesNativeCodec = false;
+ String cpName = String::format("cp%d", cpInfo.uiCodePage);
+ info.m_aliases.append(cpName.latin1());
+ supportedCharsets().add(i->second.data());
+ }
+ }
+ enumInterface->Release();
+ }
+}
+
+static UINT getCodePage(const char* name)
+{
+ if (!strcmp(name, "UTF-8"))
+ return CP_UTF8;
+
+ // Explicitly use a "const" reference to fix the silly VS build error
+ // saying "==" is not found for const_iterator and iterator
+ const HashMap<String, CharsetInfo>& charsets = knownCharsets();
+ HashMap<String, CharsetInfo>::const_iterator i = charsets.find(name);
+ return i == charsets.end() ? CP_ACP : i->second.m_codePage;
+}
+
+static PassOwnPtr<TextCodec> newTextCodecWinCE(const TextEncoding& encoding, const void*)
+{
+ return new TextCodecWinCE(encoding);
+}
+
+TextCodecWinCE::TextCodecWinCE(const TextEncoding& encoding)
+ : m_encoding(encoding)
+{
+}
+
+TextCodecWinCE::~TextCodecWinCE()
+{
+}
+
+void TextCodecWinCE::registerBaseEncodingNames(EncodingNameRegistrar registrar)
+{
+ registrar("UTF-8", "UTF-8");
+}
+
+void TextCodecWinCE::registerBaseCodecs(TextCodecRegistrar registrar)
+{
+ registrar("UTF-8", newTextCodecWinCE, 0);
+}
+
+void TextCodecWinCE::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
+{
+ languageManager();
+ for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) {
+ HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i);
+ if (j != knownCharsets().end()) {
+ registrar(j->second.m_name.data(), j->second.m_name.data());
+ for (Vector<CString>::const_iterator alias = j->second.m_aliases.begin(); alias != j->second.m_aliases.end(); ++alias)
+ registrar(alias->data(), j->second.m_name.data());
+ }
+ }
+}
+
+void TextCodecWinCE::registerExtendedCodecs(TextCodecRegistrar registrar)
+{
+ languageManager();
+ for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) {
+ HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i);
+ if (j != knownCharsets().end())
+ registrar(j->second.m_name.data(), newTextCodecWinCE, 0);
+ }
+}
+
+static DWORD getCodePageFlags(UINT codePage)
+{
+ if (codePage == CP_UTF8)
+ return MB_ERR_INVALID_CHARS;
+
+ if (codePage == 42) // Symbol
+ return 0;
+
+ // Microsoft says the flag must be 0 for the following code pages
+ if (codePage > 50000) {
+ if ((codePage >= 50220 && codePage <= 50222)
+ || codePage == 50225
+ || codePage == 50227
+ || codePage == 50229
+ || codePage == 52936
+ || codePage == 54936
+ || (codePage >= 57002 && codePage <= 57001)
+ || codePage == 65000 // UTF-7
+ )
+ return 0;
+ }
+
+ return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS;
+}
+
+static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length)
+{
+ for (const char* bytesEnd = bytes + length; bytes < bytesEnd; ++bytes) {
+ if (*bytes & 0x80)
+ break;
+ }
+ return bytes;
+}
+
+static void decode(Vector<UChar, 8192>& result, const char* encodingName, const char* bytes, size_t length, size_t* left, bool canBeFirstTime, bool& sawInvalidChar)
+{
+ *left = length;
+ if (!bytes || !length)
+ return;
+
+ UINT codePage;
+
+ HashMap<String, CharsetInfo>::iterator i = knownCharsets().find(encodingName);
+ if (i == knownCharsets().end()) {
+ if (!strcmp(encodingName, "UTF-8"))
+ codePage = CP_UTF8;
+ else
+ codePage = CP_ACP;
+ } else {
+ codePage = i->second.m_codePage;
+ if (i->second.m_usesNativeCodec) {
+ typedef int (*FuncEucMbToWc)(wchar_t *pwc, const unsigned char *s, int n);
+ FuncEucMbToWc encMbToWc = 0;
+ if (!strcmp(encodingName, "EUC-JP"))
+ encMbToWc = TextCodecsCE::euc_jp_mbtowc;
+ else if (!strcmp(encodingName, "CP874"))
+ encMbToWc = TextCodecsCE::cp874_mbtowc;
+ else if (!strcmp(encodingName, "TIS620"))
+ encMbToWc = TextCodecsCE::tis620_mbtowc;
+ else if (!strcmp(encodingName, "MACTHAI"))
+ encMbToWc = TextCodecsCE::mac_thai_mbtowc;
+
+ if (encMbToWc) {
+ const char* const srcStart = bytes;
+ const char* const srcEnd = bytes + length;
+ int lastSize = result.size();
+ result.resize(lastSize + length);
+ for (;;) {
+ UChar* dst = result.data() + lastSize;
+ const UChar* const dstEnd = result.data() + result.size();
+ for (; dst < dstEnd && bytes < srcEnd; ++dst) {
+ int numberEncoded = encMbToWc(dst, (const unsigned char*)bytes, srcEnd - bytes);
+ if (numberEncoded >= 0)
+ bytes += numberEncoded;
+ else {
+ if (numberEncoded == RET_ILSEQ)
+ sawInvalidChar = true;
+ break;
+ }
+ }
+ if (bytes == srcEnd || dst != dstEnd) {
+ *left = srcEnd - bytes;
+ result.resize(dst - result.data());
+ return;
+ }
+ lastSize = result.size();
+ result.resize(result.size() + 256);
+ }
+ } else {
+ *left = 0;
+ result.append(bytes, length);
+ return;
+ }
+ }
+ }
+
+ DWORD flags = getCodePageFlags(codePage);
+
+ if (codePage == CP_UTF8) {
+ if (canBeFirstTime) {
+ // Handle BOM.
+ if (length > 3) {
+ if (bytes[0] == (char)0xEF && bytes[1] == (char)0xBB && bytes[2] == (char)0xBF) {
+ // BOM found!
+ length -= 3;
+ bytes += 3;
+ *left = length;
+ }
+ } else if (bytes[0] == 0xEF && (length < 2 || bytes[1] == (char)0xBB) && (length < 3 || bytes[2] == (char)0xBF)) {
+ if (length == 3)
+ *left = 0;
+ return;
+ }
+ }
+
+ // Process ASCII characters at beginning.
+ const char* firstNonAsciiChar = findFirstNonAsciiCharacter(bytes, length);
+ int numAsciiCharacters = firstNonAsciiChar - bytes;
+ if (numAsciiCharacters) {
+ result.append(bytes, numAsciiCharacters);
+ length -= numAsciiCharacters;
+ if (!length) {
+ *left = 0;
+ return;
+ }
+ bytes = firstNonAsciiChar;
+ }
+
+ int oldSize = result.size();
+ result.resize(oldSize + length);
+ UChar* resultStart = result.data() + oldSize;
+ const char* sourceStart = bytes;
+ const char* const sourceEnd = bytes + length;
+ for (;;) {
+ using namespace WTF::Unicode;
+ ConversionResult convRes = convertUTF8ToUTF16(&sourceStart
+ , sourceEnd
+ , &resultStart
+ , result.data() + result.size()
+ , true);
+
+ // FIXME: is it possible?
+ if (convRes == targetExhausted && sourceStart < sourceEnd) {
+ oldSize = result.size();
+ result.resize(oldSize + 256);
+ resultStart = result.data() + oldSize;
+ continue;
+ }
+
+ if (convRes != conversionOK)
+ sawInvalidChar = true;
+
+ break;
+ }
+
+ *left = sourceEnd - sourceStart;
+ result.resize(resultStart - result.data());
+ } else {
+ int testLength = length;
+ int untestedLength = length;
+ for (;;) {
+ int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0);
+
+ if (resultLength > 0) {
+ int oldSize = result.size();
+ result.resize(oldSize + resultLength);
+
+ MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength);
+
+ if (testLength == untestedLength) {
+ *left = length - testLength;
+ break;
+ }
+ untestedLength -= testLength;
+ length -= testLength;
+ bytes += testLength;
+ } else {
+ untestedLength = testLength - 1;
+ if (!untestedLength) {
+ *left = length;
+ break;
+ }
+ }
+ testLength = (untestedLength + 1) / 2;
+ }
+ }
+}
+
+String TextCodecWinCE::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
+{
+ if (!m_decodeBuffer.isEmpty()) {
+ m_decodeBuffer.append(bytes, length);
+ bytes = m_decodeBuffer.data();
+ length = m_decodeBuffer.size();
+ }
+
+ size_t left;
+ Vector<UChar, 8192> result;
+ for (;;) {
+ bool sawInvalidChar = false;
+ WebCore::decode(result, m_encoding.name(), bytes, length, &left, m_decodeBuffer.isEmpty(), sawInvalidChar);
+ if (!left)
+ break;
+
+ if (!sawInvalidChar && !flush && left < 16)
+ break;
+
+ result.append(L'?');
+ sawError = true;
+ if (stopOnError)
+ return String(result.data(), result.size());
+
+
+ if (left == 1)
+ break;
+
+ bytes += length - left + 1;
+ length = left - 1;
+ }
+ if (left && !flush) {
+ if (m_decodeBuffer.isEmpty())
+ m_decodeBuffer.append(bytes + length - left, left);
+ else {
+ memmove(m_decodeBuffer.data(), bytes + length - left, left);
+ m_decodeBuffer.resize(left);
+ }
+ } else
+ m_decodeBuffer.clear();
+ return String(result.data(), result.size());
+}
+
+CString TextCodecWinCE::encode(const UChar* characters, size_t length, UnencodableHandling)
+{
+ if (!characters || !length)
+ return CString();
+
+ UINT codePage = getCodePage(m_encoding.name());
+ DWORD flags = codePage == CP_UTF8 ? 0 : WC_COMPOSITECHECK;
+
+ int resultLength = WideCharToMultiByte(codePage, flags, characters, length, 0, 0, 0, 0);
+
+ // FIXME: We need to implement UnencodableHandling: QuestionMarksForUnencodables, EntitiesForUnencodables, and URLEncodedEntitiesForUnencodables.
+
+ if (resultLength <= 0)
+ return "?";
+
+ Vector<char> result(resultLength);
+
+ WideCharToMultiByte(codePage, flags, characters, length, result.data(), resultLength, 0, 0);
+
+ return CString(result.data(), result.size());
+}
+
+void TextCodecWinCE::enumerateSupportedEncodings(EncodingReceiver& receiver)
+{
+ languageManager();
+ for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) {
+ HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i);
+ if (j != knownCharsets().end() && !receiver.receive(j->second.m_name.data(), j->second.m_friendlyName.charactersWithNullTermination(), j->second.m_codePage))
+ break;
+ }
+}
+
+} // namespace WebCore