diff options
author | John Reck <jreck@google.com> | 2010-11-04 12:00:17 -0700 |
---|---|---|
committer | John Reck <jreck@google.com> | 2010-11-09 11:35:04 -0800 |
commit | e14391e94c850b8bd03680c23b38978db68687a8 (patch) | |
tree | 3fed87e6620fecaf3edc7259ae58a11662bedcb2 /WebCore/platform/text | |
parent | 1bd705833a68f07850cf7e204b26f8d328d16951 (diff) | |
download | external_webkit-e14391e94c850b8bd03680c23b38978db68687a8.zip external_webkit-e14391e94c850b8bd03680c23b38978db68687a8.tar.gz external_webkit-e14391e94c850b8bd03680c23b38978db68687a8.tar.bz2 |
Merge Webkit at r70949: Initial merge by git.
Change-Id: I77b8645c083b5d0da8dba73ed01d4014aab9848e
Diffstat (limited to 'WebCore/platform/text')
-rw-r--r-- | WebCore/platform/text/TextEncoding.cpp | 4 | ||||
-rw-r--r-- | WebCore/platform/text/brew/StringBrew.cpp | 44 | ||||
-rw-r--r-- | WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp | 278 | ||||
-rw-r--r-- | WebCore/platform/text/gtk/TextCodecGtk.cpp | 689 | ||||
-rw-r--r-- | WebCore/platform/text/gtk/TextCodecGtk.h | 87 |
5 files changed, 667 insertions, 435 deletions
diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp index 921ceeb..58e691f 100644 --- a/WebCore/platform/text/TextEncoding.cpp +++ b/WebCore/platform/text/TextEncoding.cpp @@ -120,6 +120,10 @@ CString TextEncoding::encode(const UChar* characters, size_t length, Unencodable #elif USE(GLIB_UNICODE) GOwnPtr<char> UTF8Source; UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); + if (!UTF8Source) { + // If conversion to UTF-8 failed, try with the string without normalization + return newTextCodec(*this)->encode(characters, length, handling); + } GOwnPtr<char> UTF8Normalized; UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); diff --git a/WebCore/platform/text/brew/StringBrew.cpp b/WebCore/platform/text/brew/StringBrew.cpp new file mode 100644 index 0000000..7869e0f --- /dev/null +++ b/WebCore/platform/text/brew/StringBrew.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2010 Company 100, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "PlatformString.h" + +#include <AEEStdLib.h> + +namespace WTF { + +// String conversions +String::String(const AECHAR* string) +{ + // It is safe to cast AECHAR to UChar as both of them use 16 bits representation. + const UChar* str = reinterpret_cast<const UChar*>(string); + const size_t len = WSTRLEN(string); + + m_impl = StringImpl::create(str, len); +} + +} // namespace WebCore + diff --git a/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp b/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp index 3be0c70..be3f302 100644 --- a/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp +++ b/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp @@ -3,6 +3,7 @@ * Copyright (C) 2007 Apple Inc. All rights reserved. * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> + * Copyright (C) 2010 Igalia S.L. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -22,13 +23,160 @@ */ #include "config.h" -#include "GOwnPtr.h" + #include "TextBreakIterator.h" +#include "GOwnPtr.h" #include <pango/pango.h> +using namespace std; + +#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) namespace WebCore { +class CharacterIterator { +public: + bool setText(const UChar* string, int length); + const gchar* getText() { return m_utf8.get(); } + int getLength() { return m_length; } + glong getSize() { return m_size; } + void setIndex(int index); + int getIndex() { return m_index; } + void setUTF16Index(int index); + int getUTF16Index() { return m_utf16Index; } + int getUTF16Length() { return m_utf16Length; } + int first(); + int last(); + int next(); + int previous(); +private: + int characterSize(int index); + + GOwnPtr<char> m_utf8; + int m_length; + long m_size; + int m_index; + int m_utf16Index; + int m_utf16Length; +}; + +int CharacterIterator::characterSize(int index) +{ + if (index == m_length || index < 0) + return 0; + if (m_length == m_utf16Length) + return 1; + + gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index); + gunichar character = g_utf8_get_char(indexPtr); + return UTF8_IS_SURROGATE(character) ? 2 : 1; +} + +bool CharacterIterator::setText(const UChar* string, int length) +{ + long utf8Size = 0; + m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0)); + if (!utf8Size) + return false; + + m_utf16Length = length; + m_length = g_utf8_strlen(m_utf8.get(), utf8Size); + m_size = utf8Size; + m_index = 0; + m_utf16Index = 0; + + return true; +} + +void CharacterIterator::setIndex(int index) +{ + if (index == m_index) + return; + if (index <= 0) + m_index = m_utf16Index = 0; + else if (index >= m_length) { + m_index = m_length; + m_utf16Index = m_utf16Length; + } else if (m_length == m_utf16Length) + m_index = m_utf16Index = index; + else { + m_index = index; + int utf16Index = 0; + int utf8Index = 0; + while (utf8Index < index) { + utf16Index += characterSize(utf8Index); + utf8Index++; + } + m_utf16Index = utf16Index; + } +} + +void CharacterIterator::setUTF16Index(int index) +{ + if (index == m_utf16Index) + return; + if (index <= 0) + m_utf16Index = m_index = 0; + else if (index >= m_utf16Length) { + m_utf16Index = m_utf16Length; + m_index = m_length; + } else if (m_length == m_utf16Length) + m_utf16Index = m_index = index; + else { + m_utf16Index = index; + int utf16Index = 0; + int utf8Index = 0; + while (utf16Index < index) { + utf16Index += characterSize(utf8Index); + utf8Index++; + } + m_index = utf8Index; + } +} + +int CharacterIterator::first() +{ + m_index = m_utf16Index = 0; + return m_index; +} + +int CharacterIterator::last() +{ + m_index = m_length; + m_utf16Index = m_utf16Length; + return m_index; +} + +int CharacterIterator::next() +{ + int next = m_index + 1; + + if (next <= m_length) { + m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length); + m_index = next; + } else { + m_index = TextBreakDone; + m_utf16Index = TextBreakDone; + } + + return m_index; +} + +int CharacterIterator::previous() +{ + int previous = m_index - 1; + + if (previous >= 0) { + m_utf16Index = max(m_utf16Index - characterSize(previous), 0); + m_index = previous; + } else { + m_index = TextBreakDone; + m_utf16Index = TextBreakDone; + } + + return m_index; +} + enum UBreakIteratorType { UBRK_CHARACTER, UBRK_WORD, @@ -39,9 +187,8 @@ enum UBreakIteratorType { class TextBreakIterator { public: UBreakIteratorType m_type; - int m_length; PangoLogAttr* m_logAttrs; - int m_index; + CharacterIterator m_charIterator; }; static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, @@ -57,19 +204,17 @@ static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator if (!iterator) return 0; - long utf8len; - GOwnPtr<char> utf8; - utf8.set(g_utf16_to_utf8(string, length, 0, &utf8len, 0)); + if (!iterator->m_charIterator.setText(string, length)) + return 0; - // FIXME: assumes no surrogate pairs + int charLength = iterator->m_charIterator.getLength(); iterator->m_type = type; - iterator->m_length = length; if (createdIterator) g_free(iterator->m_logAttrs); - iterator->m_logAttrs = g_new0(PangoLogAttr, length + 1); - iterator->m_index = -1; - pango_get_log_attrs(utf8.get(), utf8len, -1, 0, iterator->m_logAttrs, length + 1); + iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1); + pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), + -1, 0, iterator->m_logAttrs, charLength + 1); return iterator; } @@ -108,21 +253,13 @@ TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); } -int textBreakFirst(TextBreakIterator* bi) +int textBreakFirst(TextBreakIterator* iterator) { - // see textBreakLast - - int firstCursorPosition = -1; - int pos = 0; - while (pos <= bi->m_length && (firstCursorPosition < 0)) { - if (bi->m_logAttrs[pos].is_cursor_position) - firstCursorPosition = pos; - } - bi->m_index = firstCursorPosition; - return firstCursorPosition; + iterator->m_charIterator.first(); + return iterator->m_charIterator.getUTF16Index(); } -int textBreakLast(TextBreakIterator* bi) +int textBreakLast(TextBreakIterator* iterator) { // TextBreakLast is not meant to find just any break according to bi->m_type // but really the one near the last character. @@ -137,81 +274,92 @@ int textBreakLast(TextBreakIterator* bi) // Otherwise return m_length, as "the first character beyond the last" is outside our string. bool whiteSpaceAtTheEnd = true; - int nextWhiteSpacePos = bi->m_length; - - int pos = bi->m_length; + int nextWhiteSpacePos = iterator->m_charIterator.getLength(); + + int pos = iterator->m_charIterator.last(); while (pos >= 0 && whiteSpaceAtTheEnd) { - if (bi->m_logAttrs[pos].is_cursor_position) { - if (whiteSpaceAtTheEnd = bi->m_logAttrs[pos].is_white) + if (iterator->m_logAttrs[pos].is_cursor_position) { + if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white) nextWhiteSpacePos = pos; } - pos--; + pos = iterator->m_charIterator.previous(); } - bi->m_index = nextWhiteSpacePos; - return nextWhiteSpacePos; + iterator->m_charIterator.setIndex(nextWhiteSpacePos); + return iterator->m_charIterator.getUTF16Index(); } -int textBreakNext(TextBreakIterator* bi) +int textBreakNext(TextBreakIterator* iterator) { - for (int i = bi->m_index + 1; i <= bi->m_length; i++) { + while (iterator->m_charIterator.next() != TextBreakDone) { + int index = iterator->m_charIterator.getIndex(); // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €, // are not marked as word_start & word_end as opposed to the way ICU does it. // This leads to - for example - different word selection behaviour when right clicking. - if ((bi->m_type == UBRK_LINE && bi->m_logAttrs[i].is_line_break) - || (bi->m_type == UBRK_WORD && (bi->m_logAttrs[i].is_word_start || bi->m_logAttrs[i].is_word_end)) - || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[i].is_cursor_position) - || (bi->m_type == UBRK_SENTENCE && (bi->m_logAttrs[i].is_sentence_start || bi->m_logAttrs[i].is_sentence_end)) ) { - bi->m_index = i; - return i; + if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) + || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) + || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) + || (iterator->m_type == UBRK_SENTENCE && (iterator->m_logAttrs[index].is_sentence_start || iterator->m_logAttrs[index].is_sentence_end)) ) { + break; } } - return TextBreakDone; + return iterator->m_charIterator.getUTF16Index(); } -int textBreakPrevious(TextBreakIterator* bi) +int textBreakPrevious(TextBreakIterator* iterator) { - for (int i = bi->m_index - 1; i >= 0; i--) { - if ((bi->m_type == UBRK_LINE && bi->m_logAttrs[i].is_line_break) - || (bi->m_type == UBRK_WORD && (bi->m_logAttrs[i].is_word_start || bi->m_logAttrs[i].is_word_end)) - || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[i].is_cursor_position) - || (bi->m_type == UBRK_SENTENCE && (bi->m_logAttrs[i].is_sentence_start || bi->m_logAttrs[i].is_sentence_end)) ) { - bi->m_index = i; - return i; + while (iterator->m_charIterator.previous() != TextBreakDone) { + int index = iterator->m_charIterator.getIndex(); + + if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) + || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) + || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) + || (iterator->m_type == UBRK_SENTENCE && (iterator->m_logAttrs[index].is_sentence_start || iterator->m_logAttrs[index].is_sentence_end)) ) { + break; } } - return textBreakFirst(bi); + return iterator->m_charIterator.getUTF16Index(); } -int textBreakPreceding(TextBreakIterator* bi, int pos) +int textBreakPreceding(TextBreakIterator* iterator, int offset) { - bi->m_index = pos; - return textBreakPrevious(bi); + if (offset > iterator->m_charIterator.getUTF16Length()) + return TextBreakDone; + if (offset < 0) + return 0; + iterator->m_charIterator.setUTF16Index(offset); + return textBreakPrevious(iterator); } -int textBreakFollowing(TextBreakIterator* bi, int pos) +int textBreakFollowing(TextBreakIterator* iterator, int offset) { - if (pos < 0) - pos = -1; - bi->m_index = pos; - return textBreakNext(bi); + if (offset > iterator->m_charIterator.getUTF16Length()) + return TextBreakDone; + if (offset < 0) + return 0; + iterator->m_charIterator.setUTF16Index(offset); + return textBreakNext(iterator); } -int textBreakCurrent(TextBreakIterator* bi) +int textBreakCurrent(TextBreakIterator* iterator) { - return bi->m_index; + return iterator->m_charIterator.getUTF16Index(); } -bool isTextBreak(TextBreakIterator* bi, int pos) +bool isTextBreak(TextBreakIterator* iterator, int offset) { - if (bi->m_index < 0) + if (!offset) + return true; + if (offset > iterator->m_charIterator.getUTF16Length()) return false; - return ((bi->m_type == UBRK_LINE && bi->m_logAttrs[bi->m_index].is_line_break) - || (bi->m_type == UBRK_WORD && bi->m_logAttrs[bi->m_index].is_word_end) - || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[bi->m_index].is_char_break) - || (bi->m_type == UBRK_SENTENCE && bi->m_logAttrs[bi->m_index].is_sentence_end) ); + iterator->m_charIterator.setUTF16Index(offset); + + int index = iterator->m_charIterator.getIndex(); + iterator->m_charIterator.previous(); + textBreakNext(iterator); + return iterator->m_charIterator.getIndex() == index; } } diff --git a/WebCore/platform/text/gtk/TextCodecGtk.cpp b/WebCore/platform/text/gtk/TextCodecGtk.cpp index 4224c0c..bf6afcd 100644 --- a/WebCore/platform/text/gtk/TextCodecGtk.cpp +++ b/WebCore/platform/text/gtk/TextCodecGtk.cpp @@ -29,6 +29,7 @@ #include "config.h" #include "TextCodecGtk.h" +#include <gio/gio.h> #include "GOwnPtr.h" #include "Logging.h" #include "PlatformString.h" @@ -45,164 +46,12 @@ namespace WebCore { // This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib. #if (G_BYTE_ORDER == G_BIG_ENDIAN) - const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16BE"; -#else - const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16LE"; +static const gchar* internalEncodingName = "UTF-16BE"; +#else +static const gchar* internalEncodingName = "UTF-16LE"; #endif -// We're specifying the list of text codecs and their aliases here. -// For each codec the first entry is the canonical name, remaining ones are used as aliases. -// Each alias list must be terminated by a 0. - -// Unicode -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_UTF_8 = { "UTF-8", 0 }; - -// Western -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_1 = { "ISO-8859-1", "CP819", "IBM819", "ISO-IR-100", "ISO8859-1", "ISO_8859-1", "ISO_8859-1:1987", "L1", "LATIN1", "CSISOLATIN1", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACROMAN = { "MACROMAN", "MAC", "MACINTOSH", "CSMACINTOSH", 0 }; - -// Japanese -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_SHIFT_JIS = { "Shift_JIS", "MS_KANJI", "SHIFT-JIS", "SJIS", "CSSHIFTJIS", 0 }; - TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_JP = { "EUC-JP", "EUC_JP", "EUCJP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE", "CSEUCPKDFMTJAPANESE", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_JP = { "ISO-2022-JP", 0 }; - -// Traditional Chinese -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5 = { "BIG5", "BIG-5", "BIG-FIVE", "BIG5", "BIGFIVE", "CN-BIG5", "CSBIG5", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5_HKSCS = { "BIG5-HKSCS", "BIG5-HKSCS:2004", "BIG5HKSCS", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP950 = { "CP950", 0 }; - -// Korean -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_KR = { "ISO-2022-KR", "CSISO2022KR", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP949 = { "CP949", "UHC", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_KR = { "EUC-KR", "CSEUCKR", 0 }; - -// Arabic -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_6 = { "ISO-8859-6", "ARABIC", "ASMO-708", "ECMA-114", "ISO-IR-127", "ISO8859-6", "ISO_8859-6", "ISO_8859-6:1987", "CSISOLATINARABIC", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1256 = { "windows-1256", "CP1256", "MS-ARAB", 0 }; // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case - -// Hebrew -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_8 = { "ISO-8859-8", "HEBREW", "ISO-8859-8", "ISO-IR-138", "ISO8859-8", "ISO_8859-8", "ISO_8859-8:1988", "CSISOLATINHEBREW", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1255 = { "windows-1255", "CP1255", "MS-HEBR", 0 }; // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html - -// Greek -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_7 = { "ISO-8859-7", "ECMA-118", "ELOT_928", "GREEK", "GREEK8", "ISO-IR-126", "ISO8859-7", "ISO_8859-7", "ISO_8859-7:1987", "ISO_8859-7:2003", "CSI", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP869 = { "CP869", "869", "CP-GR", "IBM869", "CSIBM869", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1253 = { "WINDOWS-1253", 0 }; - -// Cyrillic -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_5 = { "ISO-8859-5", "CYRILLIC", "ISO-IR-144", "ISO8859-5", "ISO_8859-5", "ISO_8859-5:1988", "CSISOLATINCYRILLIC", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_R = { "KOI8-R", "CSKOI8R", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP866 = { "CP866", "866", "IBM866", "CSIBM866", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_U = { "KOI8-U", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1251 = { "windows-1251", "CP1251", 0 }; // CP1251 added to pass /fast/encoding/charset-cp1251.html -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCYRILLIC = { "mac-cyrillic", "MACCYRILLIC", "x-mac-cyrillic", 0 }; - -// Thai -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP874 = { "CP874", "WINDOWS-874", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_TIS_620 = { "TIS-620", 0 }; - -// Simplified Chinese -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GBK = { "GBK", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_HZ = { "HZ", "HZ-GB-2312", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GB18030 = { "GB18030", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_CN = { "EUC-CN", "EUCCN", "GB2312", "CN-GB", "CSGB2312", "EUC_CN", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_2312_80 = { "GB_2312-80", "CHINESE", "csISO58GB231280", "GB2312.1980-0", "ISO-IR-58" }; - -// Central European -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_2 = { "ISO-8859-2", "ISO-IR-101", "ISO8859-2", "ISO_8859-2", "ISO_8859-2:1987", "L2", "LATIN2", "CSISOLATIN2", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1250 = { "CP1250", "MS-EE", "WINDOWS-1250", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCENTRALEUROPE = { "MAC-CENTRALEUROPE", 0 }; - -// Vietnamese -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1258 = { "CP1258", "WINDOWS-1258", 0 }; - -// Turkish -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1254 = { "CP1254", "MS-TURK", "WINDOWS-1254", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_9 = { "ISO-8859-9", "ISO-IR-148", "ISO8859-9", "ISO_8859-9", "ISO_8859-9:1989", "L5", "LATIN5", "CSISOLATIN5", 0 }; - -// Baltic -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1257 = { "CP1257", "WINBALTRIM", "WINDOWS-1257", 0 }; -TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_4 = { "ISO-8859-4", "ISO-IR-110", "ISO8859-4", "ISO_8859-4", "ISO_8859-4:1988", "L4", "LATIN4", "CSISOLATIN4", 0 }; - -gconstpointer const TextCodecGtk::m_iconvBaseCodecList[] = { - // Unicode - &m_codecAliases_UTF_8, - - // Western - &m_codecAliases_ISO_8859_1 -}; - -gconstpointer const TextCodecGtk::m_iconvExtendedCodecList[] = -{ - // Western - &m_codecAliases_MACROMAN, - - // Japanese - &m_codecAliases_SHIFT_JIS, - &m_codecAliases_EUC_JP, - &m_codecAliases_ISO_2022_JP, - - // Simplified Chinese - &m_codecAliases_BIG5, - &m_codecAliases_BIG5_HKSCS, - &m_codecAliases_CP950, - - // Korean - &m_codecAliases_ISO_2022_KR, - &m_codecAliases_CP949, - &m_codecAliases_EUC_KR, - - // Arabic - &m_codecAliases_ISO_8859_6, - &m_codecAliases_CP1256, - - // Hebrew - &m_codecAliases_ISO_8859_8, - &m_codecAliases_CP1255, - - // Greek - &m_codecAliases_ISO_8859_7, - &m_codecAliases_CP869, - &m_codecAliases_WINDOWS_1253, - - // Cyrillic - &m_codecAliases_ISO_8859_5, - &m_codecAliases_KOI8_R, - &m_codecAliases_CP866, - &m_codecAliases_KOI8_U, - &m_codecAliases_WINDOWS_1251, - &m_codecAliases_MACCYRILLIC, - - // Thai - &m_codecAliases_CP874, - &m_codecAliases_TIS_620, - - // Traditional Chinese - &m_codecAliases_GBK, - &m_codecAliases_HZ, - &m_codecAliases_GB18030, - &m_codecAliases_EUC_CN, - &m_codecAliases_2312_80, - - // Central European - &m_codecAliases_ISO_8859_2, - &m_codecAliases_CP1250, - &m_codecAliases_MACCENTRALEUROPE, - - // Vietnamese - &m_codecAliases_CP1258, - - // Turkish - &m_codecAliases_CP1254, - &m_codecAliases_ISO_8859_9, - - // Baltic - &m_codecAliases_CP1257, - &m_codecAliases_ISO_8859_4 -}; - - const size_t ConversionBufferSize = 16384; @@ -211,17 +60,17 @@ static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const return new TextCodecGtk(encoding); } -gboolean TextCodecGtk::isEncodingAvailable(const gchar* encName) +static bool isEncodingAvailable(const gchar* encodingName) { GIConv tester; // test decoding - tester = g_iconv_open(m_internalEncodingName, encName); + tester = g_iconv_open(internalEncodingName, encodingName); if (tester == reinterpret_cast<GIConv>(-1)) { return false; } else { g_iconv_close(tester); // test encoding - tester = g_iconv_open(encName, m_internalEncodingName); + tester = g_iconv_open(encodingName, internalEncodingName); if (tester == reinterpret_cast<GIConv>(-1)) { return false; } else { @@ -231,186 +80,435 @@ gboolean TextCodecGtk::isEncodingAvailable(const gchar* encName) } } -void TextCodecGtk::registerEncodingNames(EncodingNameRegistrar registrar, bool extended) +static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName) { - const void* const* encodingList; - unsigned int listLength = 0; - if (extended) { - encodingList = m_iconvExtendedCodecList; - listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer); - } else { - encodingList = m_iconvBaseCodecList; - listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer); - } - - for (unsigned int i = 0; i < listLength; ++i) { - codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]); - - // Our convention is, the first entry in codecAliases is the canonical name, - // see above in the list of declarations. - // Probe GLib for this one first. If it's not available, we skip the whole group of aliases. - - int codecCount = 0; - const char *canonicalName; - canonicalName = (*codecAliases)[codecCount]; - - if (!isEncodingAvailable(canonicalName)) - continue; + if (isEncodingAvailable(canonicalName)) { registrar(canonicalName, canonicalName); - - const char *currentAlias; - while ((currentAlias = (*codecAliases)[++codecCount])) { - if (isEncodingAvailable(currentAlias)) - registrar(currentAlias, canonicalName); - } - + return true; } + + return false; } -void TextCodecGtk::registerCodecs(TextCodecRegistrar registrar, bool extended) +static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName) { - const void* const* encodingList; - unsigned int listLength = 0; - if (extended) { - encodingList = m_iconvExtendedCodecList; - listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer); - } else { - encodingList = m_iconvBaseCodecList; - listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer); - } + if (isEncodingAvailable(aliasName)) + registrar(aliasName, canonicalName); +} - for (unsigned int i = 0; i < listLength; ++i) { - codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]); - // by convention, the first "alias" should be the canonical name, see the definition of the alias lists - const gchar *codecName = (*codecAliases)[0]; - if (isEncodingAvailable(codecName)) - registrar(codecName, newTextCodecGtk, 0); - } +static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName) +{ + if (isEncodingAvailable(codecName)) + registrar(codecName, newTextCodecGtk, 0); } void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar) { - registerEncodingNames(registrar, false); + // Unicode + registerEncodingNameIfAvailable(registrar, "UTF-8"); + registerEncodingNameIfAvailable(registrar, "UTF-32"); + registerEncodingNameIfAvailable(registrar, "UTF-32BE"); + registerEncodingNameIfAvailable(registrar, "UTF-32LE"); + + // Western + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1"); + } } void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar) { - registerCodecs(registrar, false); + // Unicode + registerCodecIfAvailable(registrar, "UTF-8"); + registerCodecIfAvailable(registrar, "UTF-32"); + registerCodecIfAvailable(registrar, "UTF-32BE"); + registerCodecIfAvailable(registrar, "UTF-32LE"); + + // Western + registerCodecIfAvailable(registrar, "ISO-8859-1"); } void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar) { - registerEncodingNames(registrar, true); + // Western + if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) { + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH"); + } + + // Japanese + if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) { + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS"); + } + if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) { + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE"); + } + registerEncodingNameIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + if (registerEncodingNameIfAvailable(registrar, "BIG5")) { + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5"); + } + if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) { + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004"); + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS"); + } + registerEncodingNameIfAvailable(registrar, "CP950"); + + // Korean + if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR")) + registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR"); + if (registerEncodingNameIfAvailable(registrar, "CP949")) + registerEncodingAliasIfAvailable(registrar, "CP949", "UHC"); + if (registerEncodingNameIfAvailable(registrar, "EUC-KR")) + registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR"); + + // Arabic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC"); + } + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case + if (registerEncodingNameIfAvailable(registrar, "windows-1256")) { + registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256"); + registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB"); + } + + // Hebrew + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW"); + } + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + if (registerEncodingNameIfAvailable(registrar, "windows-1255")) { + registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255"); + registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR"); + } + + // Greek + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI"); + } + if (registerEncodingNameIfAvailable(registrar, "CP869")) { + registerEncodingAliasIfAvailable(registrar, "CP869", "869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR"); + registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869"); + } + registerEncodingNameIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC"); + } + if (registerEncodingNameIfAvailable(registrar, "KOI8-R")) + registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R"); + if (registerEncodingNameIfAvailable(registrar, "CP866")) { + registerEncodingAliasIfAvailable(registrar, "CP866", "866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866"); + } + registerEncodingNameIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + if (registerEncodingNameIfAvailable(registrar, "windows-1251")) + registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251"); + if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) { + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic"); + } + + // Thai + if (registerEncodingNameIfAvailable(registrar, "CP874")) + registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874"); + registerEncodingNameIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerEncodingNameIfAvailable(registrar, "GBK"); + if (registerEncodingNameIfAvailable(registrar, "HZ")) + registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312"); + registerEncodingNameIfAvailable(registrar, "GB18030"); + if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) { + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN"); + } + if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) { + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58"); + } + + // Central European + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2"); + } + if (registerEncodingNameIfAvailable(registrar, "CP1250")) { + registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE"); + registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250"); + } + registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + if (registerEncodingNameIfAvailable(registrar, "CP1258")) + registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258"); + + // Turkish + if (registerEncodingNameIfAvailable(registrar, "CP1254")) { + registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK"); + registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5"); + } + + // Baltic + if (registerEncodingNameIfAvailable(registrar, "CP1257")) { + registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM"); + registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4"); + } } void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar) { - registerCodecs(registrar, true); + // Western + registerCodecIfAvailable(registrar, "MACROMAN"); + + // Japanese + registerCodecIfAvailable(registrar, "Shift_JIS"); + registerCodecIfAvailable(registrar, "EUC-JP"); + registerCodecIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + registerCodecIfAvailable(registrar, "BIG5"); + registerCodecIfAvailable(registrar, "BIG5-HKSCS"); + registerCodecIfAvailable(registrar, "CP950"); + + // Korean + registerCodecIfAvailable(registrar, "ISO-2022-KR"); + registerCodecIfAvailable(registrar, "CP949"); + registerCodecIfAvailable(registrar, "EUC-KR"); + + // Arabic + registerCodecIfAvailable(registrar, "ISO-8859-6"); + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case + registerCodecIfAvailable(registrar, "windows-1256"); + + // Hebrew + registerCodecIfAvailable(registrar, "ISO-8859-8"); + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + registerCodecIfAvailable(registrar, "windows-1255"); + + // Greek + registerCodecIfAvailable(registrar, "ISO-8859-7"); + registerCodecIfAvailable(registrar, "CP869"); + registerCodecIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + registerCodecIfAvailable(registrar, "ISO-8859-5"); + registerCodecIfAvailable(registrar, "KOI8-R"); + registerCodecIfAvailable(registrar, "CP866"); + registerCodecIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + registerCodecIfAvailable(registrar, "windows-1251"); + registerCodecIfAvailable(registrar, "mac-cyrillic"); + + // Thai + registerCodecIfAvailable(registrar, "CP874"); + registerCodecIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerCodecIfAvailable(registrar, "GBK"); + registerCodecIfAvailable(registrar, "HZ"); + registerCodecIfAvailable(registrar, "GB18030"); + registerCodecIfAvailable(registrar, "EUC-CN"); + registerCodecIfAvailable(registrar, "GB_2312-80"); + + // Central European + registerCodecIfAvailable(registrar, "ISO-8859-2"); + registerCodecIfAvailable(registrar, "CP1250"); + registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + registerCodecIfAvailable(registrar, "CP1258"); + + // Turkish + registerCodecIfAvailable(registrar, "CP1254"); + registerCodecIfAvailable(registrar, "ISO-8859-9"); + + // Baltic + registerCodecIfAvailable(registrar, "CP1257"); + registerCodecIfAvailable(registrar, "ISO-8859-4"); } TextCodecGtk::TextCodecGtk(const TextEncoding& encoding) : m_encoding(encoding) , m_numBufferedBytes(0) - , m_iconvDecoder(reinterpret_cast<GIConv>(-1)) - , m_iconvEncoder(reinterpret_cast<GIConv>(-1)) { } TextCodecGtk::~TextCodecGtk() { - if (m_iconvDecoder != reinterpret_cast<GIConv>(-1)) { - g_iconv_close(m_iconvDecoder); - m_iconvDecoder = reinterpret_cast<GIConv>(-1); - } - if (m_iconvEncoder != reinterpret_cast<GIConv>(-1)) { - g_iconv_close(m_iconvEncoder); - m_iconvEncoder = reinterpret_cast<GIConv>(-1); - } } void TextCodecGtk::createIConvDecoder() const { - ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1)); + ASSERT(!m_iconvDecoder); - m_iconvDecoder = g_iconv_open(m_internalEncodingName, m_encoding.name()); + m_iconvDecoder = adoptPlatformRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0)); } void TextCodecGtk::createIConvEncoder() const { - ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1)); + ASSERT(!m_iconvEncoder); - m_iconvEncoder = g_iconv_open(m_encoding.name(), m_internalEncodingName); + m_iconvEncoder = adoptPlatformRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0)); } String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) { // Get a converter for the passed-in encoding. - if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) { + if (!m_iconvDecoder) createIConvDecoder(); - ASSERT(m_iconvDecoder != reinterpret_cast<GIConv>(-1)); - if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) { - LOG_ERROR("Error creating IConv encoder even though encoding was in table."); - return String(); - } + if (!m_iconvDecoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); + return String(); } - size_t countWritten, countRead, conversionLength; - const char* conversionBytes; + Vector<UChar> result; + + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = bytes; + gsize inputLength = length; + gchar buffer[ConversionBufferSize]; + int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS; + if (flush) + flags |= G_CONVERTER_FLUSH; + + bool bufferWasFull = false; char* prefixedBytes = 0; if (m_numBufferedBytes) { - conversionLength = length + m_numBufferedBytes; - prefixedBytes = static_cast<char*>(fastMalloc(conversionLength)); + inputLength = length + m_numBufferedBytes; + prefixedBytes = static_cast<char*>(fastMalloc(inputLength)); memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes); memcpy(prefixedBytes + m_numBufferedBytes, bytes, length); - - conversionBytes = prefixedBytes; - + + input = prefixedBytes; + // all buffered bytes are consumed now m_numBufferedBytes = 0; - } else { - // no previously buffered partial data, - // just convert the data that was passed in - conversionBytes = bytes; - conversionLength = length; } - GOwnPtr<GError> err; - GOwnPtr<UChar> buffer; - - buffer.outPtr() = reinterpret_cast<UChar*>(g_convert_with_iconv(conversionBytes, conversionLength, m_iconvDecoder, &countRead, &countWritten, &err.outPtr())); - - - if (err) { - LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message); - m_numBufferedBytes = 0; // reset state for subsequent calls to decode - fastFree(prefixedBytes); - sawError = true; - return String(); - } - - // Partial input at the end of the string may not result in an error being raised. - // From the gnome library documentation on g_convert_with_iconv: - // "Even if the conversion was successful, this may be less than len if there were partial characters at the end of the input." - // That's why we need to compare conversionLength against countRead - - m_numBufferedBytes = conversionLength - countRead; - if (m_numBufferedBytes > 0) { - if (flush) { - LOG_ERROR("Partial bytes at end of input while flush requested."); - m_numBufferedBytes = 0; // reset state for subsequent calls to decode - fastFree(prefixedBytes); - sawError = true; - return String(); + do { + GOwnPtr<GError> error; + GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()), + input, inputLength, + buffer, sizeof(buffer), + static_cast<GConverterFlags>(flags), + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + + if (res == G_CONVERTER_ERROR) { + if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) { + // There is not enough input to fully determine what the conversion should produce, + // save it to a buffer to prepend it to the next input. + memcpy(m_bufferedBytes, input, inputLength); + m_numBufferedBytes = inputLength; + inputLength = 0; + } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE)) + bufferWasFull = true; + else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + if (stopOnError) + sawError = true; + if (inputLength) { + // Ignore invalid character. + input += 1; + inputLength -= 1; + } + } else { + sawError = true; + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); + m_numBufferedBytes = 0; // Reset state for subsequent calls to decode. + fastFree(prefixedBytes); + return String(); + } } - memcpy(m_bufferedBytes, conversionBytes + countRead, m_numBufferedBytes); - } - fastFree(prefixedBytes); - - Vector<UChar> result; + result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar)); + } while ((inputLength || bufferWasFull) && !sawError); - result.append(buffer.get(), countWritten / sizeof(UChar)); + fastFree(prefixedBytes); return String::adopt(result); } @@ -420,23 +518,42 @@ CString TextCodecGtk::encode(const UChar* characters, size_t length, Unencodable if (!length) return ""; - if (m_iconvEncoder == reinterpret_cast<GIConv>(-1)) + if (!m_iconvEncoder) createIConvEncoder(); - if (m_iconvEncoder == reinterpret_cast<GIConv>(-1)) + if (!m_iconvEncoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); return CString(); + } - size_t count; - - GOwnPtr<GError> err; - GOwnPtr<char> buffer; - - buffer.outPtr() = g_convert_with_iconv(reinterpret_cast<const char*>(characters), length * sizeof(UChar), m_iconvEncoder, 0, &count, &err.outPtr()); - if (err) { - LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message); + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = reinterpret_cast<const char*>(characters); + gsize inputLength = length * sizeof(UChar); + gchar buffer[ConversionBufferSize]; + Vector<char> result; + GOwnPtr<GError> error; + + size_t size = 0; + do { + g_converter_convert(G_CONVERTER(m_iconvEncoder.get()), + input, inputLength, + buffer, sizeof(buffer), + G_CONVERTER_INPUT_AT_END, + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + result.grow(size + bytesWritten); + memcpy(result.data() + size, buffer, bytesWritten); + size += bytesWritten; + } while (inputLength && !error.get()); + + if (error) { + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); return CString(); } - return CString(buffer.get(), count); + return CString(result.data(), size); } } // namespace WebCore diff --git a/WebCore/platform/text/gtk/TextCodecGtk.h b/WebCore/platform/text/gtk/TextCodecGtk.h index a8af752..1fb8df9 100644 --- a/WebCore/platform/text/gtk/TextCodecGtk.h +++ b/WebCore/platform/text/gtk/TextCodecGtk.h @@ -29,6 +29,7 @@ #ifndef TextCodecGTK_h #define TextCodecGTK_h +#include "GRefPtr.h" #include <glib.h> #include "TextCodec.h" #include "TextEncoding.h" @@ -53,93 +54,11 @@ namespace WebCore { void createIConvDecoder() const; void createIConvEncoder() const; - static void registerEncodingNames(EncodingNameRegistrar registrar, bool extended); - static void registerCodecs(TextCodecRegistrar registrar, bool extended); - static gboolean isEncodingAvailable(const gchar*); - TextEncoding m_encoding; size_t m_numBufferedBytes; unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character - mutable GIConv m_iconvDecoder; - mutable GIConv m_iconvEncoder; - - static const gchar* m_internalEncodingName; - - typedef const gchar* const codecAliasList[]; - - // Unicode - static codecAliasList m_codecAliases_UTF_8; - - // Western - static codecAliasList m_codecAliases_ISO_8859_1; - static codecAliasList m_codecAliases_MACROMAN; - - // Japanese - static codecAliasList m_codecAliases_SHIFT_JIS; - static codecAliasList m_codecAliases_EUC_JP; - static codecAliasList m_codecAliases_ISO_2022_JP; - - // Traditional Chinese - static codecAliasList m_codecAliases_BIG5; - static codecAliasList m_codecAliases_BIG5_HKSCS; - static codecAliasList m_codecAliases_CP950; - - // Korean - static codecAliasList m_codecAliases_ISO_2022_KR; - static codecAliasList m_codecAliases_CP949; - static codecAliasList m_codecAliases_EUC_KR; - - // Arabic - static codecAliasList m_codecAliases_ISO_8859_6; - static codecAliasList m_codecAliases_CP1256; - - // Hebrew - static codecAliasList m_codecAliases_ISO_8859_8; - static codecAliasList m_codecAliases_CP1255; - - // Greek - static codecAliasList m_codecAliases_ISO_8859_7; - static codecAliasList m_codecAliases_CP869; - static codecAliasList m_codecAliases_WINDOWS_1253; - - // Cyrillic - static codecAliasList m_codecAliases_ISO_8859_5; - static codecAliasList m_codecAliases_KOI8_R; - static codecAliasList m_codecAliases_CP866; - static codecAliasList m_codecAliases_KOI8_U; - static codecAliasList m_codecAliases_WINDOWS_1251; - static codecAliasList m_codecAliases_MACCYRILLIC; - - // Thai - static codecAliasList m_codecAliases_CP874; - static codecAliasList m_codecAliases_TIS_620; - - // Simplified Chinese - static codecAliasList m_codecAliases_GBK; - static codecAliasList m_codecAliases_HZ; - static codecAliasList m_codecAliases_GB18030; - static codecAliasList m_codecAliases_EUC_CN; - static codecAliasList m_codecAliases_2312_80; - - // Central European - static codecAliasList m_codecAliases_ISO_8859_2; - static codecAliasList m_codecAliases_CP1250; - static codecAliasList m_codecAliases_MACCENTRALEUROPE; - - // Vietnamese - static codecAliasList m_codecAliases_CP1258; - - // Turkish - static codecAliasList m_codecAliases_CP1254; - static codecAliasList m_codecAliases_ISO_8859_9; - - // Baltic - static codecAliasList m_codecAliases_CP1257; - static codecAliasList m_codecAliases_ISO_8859_4; - - static gconstpointer const m_iconvBaseCodecList[]; - static gconstpointer const m_iconvExtendedCodecList[]; - + mutable PlatformRefPtr<GCharsetConverter> m_iconvDecoder; + mutable PlatformRefPtr<GCharsetConverter> m_iconvEncoder; }; } // namespace WebCore |