From cad810f21b803229eb11403f9209855525a25d57 Mon Sep 17 00:00:00 2001 From: Steve Block Date: Fri, 6 May 2011 11:45:16 +0100 Subject: Merge WebKit at r75315: Initial merge by git. Change-Id: I570314b346ce101c935ed22a626b48c2af266b84 --- .../platform/text/gtk/TextBreakIteratorGtk.cpp | 365 +++++++++++++ .../text/gtk/TextBreakIteratorInternalICUGtk.cpp | 37 ++ Source/WebCore/platform/text/gtk/TextCodecGtk.cpp | 578 +++++++++++++++++++++ Source/WebCore/platform/text/gtk/TextCodecGtk.h | 66 +++ 4 files changed, 1046 insertions(+) create mode 100644 Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp create mode 100644 Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp create mode 100644 Source/WebCore/platform/text/gtk/TextCodecGtk.cpp create mode 100644 Source/WebCore/platform/text/gtk/TextCodecGtk.h (limited to 'Source/WebCore/platform/text/gtk') diff --git a/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp b/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp new file mode 100644 index 0000000..990e331 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (C) 2006 Lars Knoll + * Copyright (C) 2007 Apple Inc. All rights reserved. + * Copyright (C) 2008 Jürg Billeter + * Copyright (C) 2008 Dominik Röttsches + * Copyright (C) 2010 Igalia S.L. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "config.h"
+
+#include "TextBreakIterator.h"
+
+#include "GOwnPtr.h"
+#include <pango/pango.h>
+using namespace std;
+
+// True for code points above U+FFFF, i.e. those that take two UTF-16 code units.
+// The argument is parenthesised so that expressions can be passed safely.
+#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)
+
+namespace WebCore {
+
+// Cursor over a text that is handed in as UTF-16 and kept internally as UTF-8.
+// It tracks the current position both as a code point index (getIndex) and as
+// a UTF-16 code unit offset (getUTF16Index) and keeps the two in step.
+class CharacterIterator {
+public:
+    bool setText(const UChar* string, int length);
+    const gchar* getText() { return m_utf8.get(); }
+    int getLength() { return m_length; }
+    glong getSize() { return m_size; }
+    void setIndex(int index);
+    int getIndex() { return m_index; }
+    void setUTF16Index(int index);
+    int getUTF16Index() { return m_utf16Index; }
+    int getUTF16Length() { return m_utf16Length; }
+    int first();
+    int last();
+    int next();
+    int previous();
+private:
+    int characterSize(int index);
+
+    GOwnPtr<gchar> m_utf8; // UTF-8 conversion of the text.
+    int m_length; // Text length in code points.
+    long m_size; // Size of m_utf8 in bytes.
+    int m_index; // Current position in code points.
+    int m_utf16Index; // Current position in UTF-16 code units.
+    int m_utf16Length; // Text length in UTF-16 code units.
+};
+
+// Returns the number of UTF-16 code units (1 or 2) used by the code point at
+// |index|, or 0 when |index| is negative or equal to the text length.
+int CharacterIterator::characterSize(int index)
+{
+    if (index == m_length || index < 0)
+        return 0;
+    // Same number of code points and code units: nothing needs two units.
+    if (m_length == m_utf16Length)
+        return 1;
+
+    gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
+    gunichar character = g_utf8_get_char(indexPtr);
+    return UTF8_IS_SURROGATE(character) ? 2 : 1;
+}
+
+// Converts |string| (|length| UTF-16 code units) to UTF-8 and resets the cursor
+// to the start. Returns false when the conversion fails or yields no bytes.
+bool CharacterIterator::setText(const UChar* string, int length)
+{
+    long utf8Size = 0;
+    m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
+    // Check the pointer as well as the size so a failed conversion is never
+    // dereferenced below.
+    if (!m_utf8.get() || !utf8Size)
+        return false;
+
+    m_utf16Length = length;
+    m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
+    m_size = utf8Size;
+    m_index = 0;
+    m_utf16Index = 0;
+
+    return true;
+}
+
+// Moves the cursor to code point |index| (clamped to [0, length]) and updates
+// the UTF-16 offset to match.
+void CharacterIterator::setIndex(int index)
+{
+    if (index == m_index)
+        return;
+    if (index <= 0)
+        m_index = m_utf16Index = 0;
+    else if (index >= m_length) {
+        m_index = m_length;
+        m_utf16Index = m_utf16Length;
+    } else if (m_length == m_utf16Length)
+        m_index = m_utf16Index = index;
+    else {
+        // Sum the UTF-16 width of every code point before |index|. Walking a
+        // pointer keeps this linear instead of rescanning from the start of
+        // the string for each code point.
+        m_index = index;
+        const gchar* position = m_utf8.get();
+        int utf16Index = 0;
+        for (int utf8Index = 0; utf8Index < index; ++utf8Index) {
+            utf16Index += UTF8_IS_SURROGATE(g_utf8_get_char(position)) ? 2 : 1;
+            position = g_utf8_next_char(position);
+        }
+        m_utf16Index = utf16Index;
+    }
+}
+
+// Moves the cursor to UTF-16 offset |index| (clamped to [0, UTF-16 length]) and
+// updates the code point index to match.
+void CharacterIterator::setUTF16Index(int index)
+{
+    if (index == m_utf16Index)
+        return;
+    if (index <= 0)
+        m_utf16Index = m_index = 0;
+    else if (index >= m_utf16Length) {
+        m_utf16Index = m_utf16Length;
+        m_index = m_length;
+    } else if (m_length == m_utf16Length)
+        m_utf16Index = m_index = index;
+    else {
+        // Count code points until their accumulated UTF-16 width reaches
+        // |index|. The walk is linear and bounded by the text length.
+        m_utf16Index = index;
+        const gchar* position = m_utf8.get();
+        int utf16Index = 0;
+        int utf8Index = 0;
+        while (utf16Index < index && utf8Index < m_length) {
+            utf16Index += UTF8_IS_SURROGATE(g_utf8_get_char(position)) ? 2 : 1;
+            position = g_utf8_next_char(position);
+            utf8Index++;
+        }
+        m_index = utf8Index;
+    }
+}
+
+// Moves the cursor to the start of the text and returns the new index (0).
+int CharacterIterator::first()
+{
+    m_index = m_utf16Index = 0;
+    return m_index;
+}
+
+// Moves the cursor just past the last code point and returns the new index.
+int CharacterIterator::last()
+{
+    m_index = m_length;
+    m_utf16Index = m_utf16Length;
+    return m_index;
+}
+
+// Advances by one code point. Returns the new index, or TextBreakDone (which is
+// then also stored in both positions) when the cursor was already at the end.
+int CharacterIterator::next()
+{
+    int next = m_index + 1;
+
+    if (next <= m_length) {
+        m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
+        m_index = next;
+    } else {
+        m_index = TextBreakDone;
+        m_utf16Index = TextBreakDone;
+    }
+
+    return m_index;
+}
+
+// Steps back by one code point. Returns the new index, or TextBreakDone (which
+// is then also stored in both positions) when the cursor was already at 0.
+int CharacterIterator::previous()
+{
+    int previous = m_index - 1;
+
+    if (previous >= 0) {
+        m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
+        m_index = previous;
+    } else {
+        m_index = TextBreakDone;
+        m_utf16Index = TextBreakDone;
+    }
+
+    return m_index;
+}
+
+// Kind of boundary an iterator reports; the names mirror ICU's constants.
+enum UBreakIteratorType {
+    UBRK_CHARACTER,
+    UBRK_WORD,
+    UBRK_LINE,
+    UBRK_SENTENCE
+};
+
+// Break iterator state: the requested boundary type, the Pango attributes
+// computed for the current text, and the cursor over that text.
+class TextBreakIterator {
+public:
+    UBreakIteratorType m_type;
+    PangoLogAttr* m_logAttrs;
+    CharacterIterator m_charIterator;
+};
+
+// Lazily creates |iterator| on first use (recorded in |createdIterator|), loads
+// |string| into it and recomputes the Pango attributes for the new text.
+// Returns the iterator, or 0 when |string| is null or cannot be loaded.
+static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
+    UBreakIteratorType type, const UChar* string, int length)
+{
+    if (!string)
+        return 0;
+
+    if (!createdIterator) {
+        iterator = new TextBreakIterator();
+        // Make it explicit that there is nothing to free on first use.
+        iterator->m_logAttrs = 0;
+        createdIterator = true;
+    }
+    if (!iterator)
+        return 0;
+
+    if (!iterator->m_charIterator.setText(string, length))
+        return 0;
+
+    int charLength = iterator->m_charIterator.getLength();
+
+    iterator->m_type = type;
+    // Release the attributes of the previous text; g_free() accepts null.
+    g_free(iterator->m_logAttrs);
+    // One attribute per code point plus one for the position after the last.
+    iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
+    pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(),
+        -1, 0, iterator->m_logAttrs, charLength + 1);
+
+    return iterator;
+}
+
+// Returns the shared character break iterator, reset to |string|.
+TextBreakIterator* characterBreakIterator(const UChar* string, int length)
+{
+    static bool createdCharacterBreakIterator = false;
+    static TextBreakIterator* staticCharacterBreakIterator;
+    return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
+}
+
+// Currently identical to characterBreakIterator().
+TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
+{
+    // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
+    return characterBreakIterator(string, length);
+}
+
+// Returns the shared word break iterator, reset to |string|.
+TextBreakIterator* wordBreakIterator(const UChar* string, int length)
+{
+    static bool createdWordBreakIterator = false;
+    static TextBreakIterator* staticWordBreakIterator;
+    return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
+}
+
+// Returns the shared line break iterator, reset to |string|.
+TextBreakIterator* lineBreakIterator(const UChar* string, int length)
+{
+    static bool createdLineBreakIterator = false;
+    static TextBreakIterator* staticLineBreakIterator;
+    return setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
+}
+
+// Returns the shared sentence break iterator, reset to |string|.
+TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
+{
+    static bool createdSentenceBreakIterator = false;
+    static TextBreakIterator* staticSentenceBreakIterator;
+    return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
+}
+
+// Moves to the start of the text and returns the UTF-16 offset there.
+int textBreakFirst(TextBreakIterator* iterator)
+{
+    iterator->m_charIterator.first();
+    return iterator->m_charIterator.getUTF16Index();
+}
+
+int textBreakLast(TextBreakIterator* iterator)
+{
+    // TextBreakLast is not meant to find just any break according to iterator->m_type
+    // but really the one near the last character.
+    // (cmp ICU documentation for ubrk_first and ubrk_last)
+    // From ICU docs for ubrk_last:
+    // "Determine the index immediately beyond the last character in the text being scanned."
+
+    // So we should advance or traverse back based on iterator->m_logAttrs cursor positions.
+    // If last character position in the original string is a whitespace,
+    // traverse to the left until the first non-white character position is found
+    // and return the position of the first white-space char after this one.
+    // Otherwise return m_length, as "the first character beyond the last" is outside our string.
+
+    bool whiteSpaceAtTheEnd = true;
+    int nextWhiteSpacePos = iterator->m_charIterator.getLength();
+
+    int pos = iterator->m_charIterator.last();
+    while (pos >= 0 && whiteSpaceAtTheEnd) {
+        if (iterator->m_logAttrs[pos].is_cursor_position) {
+            // Remember the leftmost white cursor position seen so far and stop
+            // at the first cursor position that is not white.
+            whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white;
+            if (whiteSpaceAtTheEnd)
+                nextWhiteSpacePos = pos;
+        }
+        pos = iterator->m_charIterator.previous();
+    }
+    iterator->m_charIterator.setIndex(nextWhiteSpacePos);
+    return iterator->m_charIterator.getUTF16Index();
+}
+
+// Returns true when the Pango attributes mark code point |index| as a boundary
+// of the kind |iterator| was set up for.
+static bool isBreakPosition(TextBreakIterator* iterator, int index)
+{
+    // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €,
+    // are not marked as word_start & word_end as opposed to the way ICU does it.
+    // This leads to - for example - different word selection behaviour when right clicking.
+
+    return (iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
+        || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
+        || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
+        || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary);
+}
+
+// Advances to the next boundary after the current position and returns its
+// UTF-16 offset, or TextBreakDone when the end of the text is passed.
+int textBreakNext(TextBreakIterator* iterator)
+{
+    while (iterator->m_charIterator.next() != TextBreakDone) {
+        if (isBreakPosition(iterator, iterator->m_charIterator.getIndex()))
+            break;
+    }
+    return iterator->m_charIterator.getUTF16Index();
+}
+
+// Steps back to the closest boundary before the current position and returns
+// its UTF-16 offset, or TextBreakDone when the start of the text is passed.
+int textBreakPrevious(TextBreakIterator* iterator)
+{
+    while (iterator->m_charIterator.previous() != TextBreakDone) {
+        if (isBreakPosition(iterator, iterator->m_charIterator.getIndex()))
+            break;
+    }
+    return iterator->m_charIterator.getUTF16Index();
+}
+
+// Returns the boundary before UTF-16 offset |offset|: TextBreakDone when
+// |offset| is past the end of the text, 0 when it is negative.
+int textBreakPreceding(TextBreakIterator* iterator, int offset)
+{
+    if (offset > iterator->m_charIterator.getUTF16Length())
+        return TextBreakDone;
+    if (offset < 0)
+        return 0;
+    iterator->m_charIterator.setUTF16Index(offset);
+    return textBreakPrevious(iterator);
+}
+
+// Returns the boundary after UTF-16 offset |offset|: TextBreakDone when
+// |offset| is past the end of the text, 0 when it is negative.
+int textBreakFollowing(TextBreakIterator* iterator, int offset)
+{
+    if (offset > iterator->m_charIterator.getUTF16Length())
+        return TextBreakDone;
+    if (offset < 0)
+        return 0;
+    iterator->m_charIterator.setUTF16Index(offset);
+    return textBreakNext(iterator);
+}
+
+// Returns the current position as a UTF-16 offset.
+int textBreakCurrent(TextBreakIterator* iterator)
+{
+    return iterator->m_charIterator.getUTF16Index();
+}
+
+// Returns whether UTF-16 offset |offset| is a boundary. Offset 0 always is;
+// offsets past the end of the text never are.
+bool isTextBreak(TextBreakIterator* iterator, int offset)
+{
+    if (!offset)
+        return true;
+    if (offset > iterator->m_charIterator.getUTF16Length())
+        return false;
+
+    iterator->m_charIterator.setUTF16Index(offset);
+
+    // Step back one code point and search forwards again: |offset| is a
+    // boundary only if that search stops exactly where we started.
+    int index = iterator->m_charIterator.getIndex();
+    iterator->m_charIterator.previous();
+    textBreakNext(iterator);
+    return iterator->m_charIterator.getIndex() == index;
+}
+
+}
diff --git a/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp b/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp
new file mode 100644
index 0000000..35e5a05
--- /dev/null
+++ b/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2007 Alp Toker
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+#include "TextBreakIteratorInternalICU.h"
+
+namespace WebCore {
+
+// Returns the locale ID for searching. It is currently the hard-coded empty string.
+const char* currentSearchLocaleID()
+{
+    // FIXME: Should use system locale.
+    return "";
+}
+
+// Returns the locale ID for text breaking. It is currently hard-coded to "en_us".
+const char* currentTextBreakLocaleID()
+{
+    // FIXME: Should use system locale.
+    return "en_us";
+}
+
+}
diff --git a/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp b/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp
new file mode 100644
index 0000000..c5bd7e8
--- /dev/null
+++ b/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp
@@ -0,0 +1,578 @@
+/*
+ * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov
+ * Copyright (C) 2008 Jürg Billeter
+ * Copyright (C) 2009 Dominik Röttsches
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC.
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecGtk.h" + +#include +#include "GOwnPtr.h" +#include "Logging.h" +#include "PlatformString.h" +#include +#include +#include + +using std::min; + +namespace WebCore { + +// TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380). +// That's why we need to avoid generating extra BOM's for the conversion result. +// This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib. + +#if (G_BYTE_ORDER == G_BIG_ENDIAN) +static const gchar* internalEncodingName = "UTF-16BE"; +#else +static const gchar* internalEncodingName = "UTF-16LE"; +#endif + + +const size_t ConversionBufferSize = 16384; + + +static PassOwnPtr newTextCodecGtk(const TextEncoding& encoding, const void*) +{ + return new TextCodecGtk(encoding); +} + +static bool isEncodingAvailable(const gchar* encodingName) +{ + GIConv tester; + // test decoding + tester = g_iconv_open(internalEncodingName, encodingName); + if (tester == reinterpret_cast(-1)) { + return false; + } else { + g_iconv_close(tester); + // test encoding + tester = g_iconv_open(encodingName, internalEncodingName); + if (tester == reinterpret_cast(-1)) { + return false; + } else { + g_iconv_close(tester); + return true; + } + } +} + +static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName) +{ + if (isEncodingAvailable(canonicalName)) { + registrar(canonicalName, canonicalName); + return true; + } + + 
return false; +} + +static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName) +{ + if (isEncodingAvailable(aliasName)) + registrar(aliasName, canonicalName); +} + +static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName) +{ + if (isEncodingAvailable(codecName)) + registrar(codecName, newTextCodecGtk, 0); +} + +void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar) +{ + // Unicode + registerEncodingNameIfAvailable(registrar, "UTF-8"); + registerEncodingNameIfAvailable(registrar, "UTF-32"); + registerEncodingNameIfAvailable(registrar, "UTF-32BE"); + registerEncodingNameIfAvailable(registrar, "UTF-32LE"); + + // Western + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1"); + } +} + +void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar) +{ + // Unicode + registerCodecIfAvailable(registrar, "UTF-8"); + registerCodecIfAvailable(registrar, "UTF-32"); + registerCodecIfAvailable(registrar, "UTF-32BE"); + registerCodecIfAvailable(registrar, "UTF-32LE"); + + // Western + registerCodecIfAvailable(registrar, "ISO-8859-1"); +} + +void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar) +{ + // Western + if (registerEncodingNameIfAvailable(registrar, 
"MACROMAN")) { + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH"); + } + + // Japanese + if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) { + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS"); + } + if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) { + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE"); + } + registerEncodingNameIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + if (registerEncodingNameIfAvailable(registrar, "BIG5")) { + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5"); + } + if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) { + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004"); + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS"); + } + registerEncodingNameIfAvailable(registrar, "CP950"); + + // Korean + if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR")) + registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR"); + if (registerEncodingNameIfAvailable(registrar, "CP949")) + 
registerEncodingAliasIfAvailable(registrar, "CP949", "UHC"); + if (registerEncodingNameIfAvailable(registrar, "EUC-KR")) + registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR"); + + // Arabic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC"); + } + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case + if (registerEncodingNameIfAvailable(registrar, "windows-1256")) { + registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256"); + registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB"); + } + + // Hebrew + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW"); + } + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + if (registerEncodingNameIfAvailable(registrar, 
"windows-1255")) { + registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255"); + registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR"); + } + + // Greek + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI"); + } + if (registerEncodingNameIfAvailable(registrar, "CP869")) { + registerEncodingAliasIfAvailable(registrar, "CP869", "869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR"); + registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869"); + } + registerEncodingNameIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC"); + } + if (registerEncodingNameIfAvailable(registrar, "KOI8-R")) + 
registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R"); + if (registerEncodingNameIfAvailable(registrar, "CP866")) { + registerEncodingAliasIfAvailable(registrar, "CP866", "866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866"); + } + registerEncodingNameIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + if (registerEncodingNameIfAvailable(registrar, "windows-1251")) + registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251"); + if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) { + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic"); + } + + // Thai + if (registerEncodingNameIfAvailable(registrar, "CP874")) + registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874"); + registerEncodingNameIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerEncodingNameIfAvailable(registrar, "GBK"); + if (registerEncodingNameIfAvailable(registrar, "HZ")) + registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312"); + registerEncodingNameIfAvailable(registrar, "GB18030"); + if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) { + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN"); + } + if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) { + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0"); + 
registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58"); + } + + // Central European + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2"); + } + if (registerEncodingNameIfAvailable(registrar, "CP1250")) { + registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE"); + registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250"); + } + registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + if (registerEncodingNameIfAvailable(registrar, "CP1258")) + registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258"); + + // Turkish + if (registerEncodingNameIfAvailable(registrar, "CP1254")) { + registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK"); + registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5"); + } + + // Baltic + if (registerEncodingNameIfAvailable(registrar, "CP1257")) { + 
registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM"); + registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4"); + } +} + +void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar) +{ + // Western + registerCodecIfAvailable(registrar, "MACROMAN"); + + // Japanese + registerCodecIfAvailable(registrar, "Shift_JIS"); + registerCodecIfAvailable(registrar, "EUC-JP"); + registerCodecIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + registerCodecIfAvailable(registrar, "BIG5"); + registerCodecIfAvailable(registrar, "BIG5-HKSCS"); + registerCodecIfAvailable(registrar, "CP950"); + + // Korean + registerCodecIfAvailable(registrar, "ISO-2022-KR"); + registerCodecIfAvailable(registrar, "CP949"); + registerCodecIfAvailable(registrar, "EUC-KR"); + + // Arabic + registerCodecIfAvailable(registrar, "ISO-8859-6"); + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case + registerCodecIfAvailable(registrar, "windows-1256"); + + // Hebrew + registerCodecIfAvailable(registrar, "ISO-8859-8"); + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + registerCodecIfAvailable(registrar, "windows-1255"); + + // Greek + registerCodecIfAvailable(registrar, "ISO-8859-7"); + registerCodecIfAvailable(registrar, "CP869"); + 
registerCodecIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + registerCodecIfAvailable(registrar, "ISO-8859-5"); + registerCodecIfAvailable(registrar, "KOI8-R"); + registerCodecIfAvailable(registrar, "CP866"); + registerCodecIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + registerCodecIfAvailable(registrar, "windows-1251"); + registerCodecIfAvailable(registrar, "mac-cyrillic"); + + // Thai + registerCodecIfAvailable(registrar, "CP874"); + registerCodecIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerCodecIfAvailable(registrar, "GBK"); + registerCodecIfAvailable(registrar, "HZ"); + registerCodecIfAvailable(registrar, "GB18030"); + registerCodecIfAvailable(registrar, "EUC-CN"); + registerCodecIfAvailable(registrar, "GB_2312-80"); + + // Central European + registerCodecIfAvailable(registrar, "ISO-8859-2"); + registerCodecIfAvailable(registrar, "CP1250"); + registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + registerCodecIfAvailable(registrar, "CP1258"); + + // Turkish + registerCodecIfAvailable(registrar, "CP1254"); + registerCodecIfAvailable(registrar, "ISO-8859-9"); + + // Baltic + registerCodecIfAvailable(registrar, "CP1257"); + registerCodecIfAvailable(registrar, "ISO-8859-4"); +} + +TextCodecGtk::TextCodecGtk(const TextEncoding& encoding) + : m_encoding(encoding) + , m_numBufferedBytes(0) +{ +} + +TextCodecGtk::~TextCodecGtk() +{ +} + +void TextCodecGtk::createIConvDecoder() const +{ + ASSERT(!m_iconvDecoder); + + m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0)); +} + +void TextCodecGtk::createIConvEncoder() const +{ + ASSERT(!m_iconvEncoder); + + m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0)); +} + +String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + // Get a converter for the passed-in encoding. 
+ if (!m_iconvDecoder) + createIConvDecoder(); + if (!m_iconvDecoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); + return String(); + } + + Vector result; + + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = bytes; + gsize inputLength = length; + gchar buffer[ConversionBufferSize]; + int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS; + if (flush) + flags |= G_CONVERTER_FLUSH; + + bool bufferWasFull = false; + char* prefixedBytes = 0; + + if (m_numBufferedBytes) { + inputLength = length + m_numBufferedBytes; + prefixedBytes = static_cast(fastMalloc(inputLength)); + memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes); + memcpy(prefixedBytes + m_numBufferedBytes, bytes, length); + + input = prefixedBytes; + + // all buffered bytes are consumed now + m_numBufferedBytes = 0; + } + + do { + GOwnPtr error; + GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()), + input, inputLength, + buffer, sizeof(buffer), + static_cast(flags), + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + + if (res == G_CONVERTER_ERROR) { + if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) { + // There is not enough input to fully determine what the conversion should produce, + // save it to a buffer to prepend it to the next input. + memcpy(m_bufferedBytes, input, inputLength); + m_numBufferedBytes = inputLength; + inputLength = 0; + } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE)) + bufferWasFull = true; + else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + if (stopOnError) + sawError = true; + if (inputLength) { + // Ignore invalid character. 
+ input += 1; + inputLength -= 1; + } + } else { + sawError = true; + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); + m_numBufferedBytes = 0; // Reset state for subsequent calls to decode. + fastFree(prefixedBytes); + return String(); + } + } + + result.append(reinterpret_cast(buffer), bytesWritten / sizeof(UChar)); + } while ((inputLength || bufferWasFull) && !sawError); + + fastFree(prefixedBytes); + + return String::adopt(result); +} + +CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + if (!length) + return ""; + + if (!m_iconvEncoder) + createIConvEncoder(); + if (!m_iconvEncoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); + return CString(); + } + + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = reinterpret_cast(characters); + gsize inputLength = length * sizeof(UChar); + gchar buffer[ConversionBufferSize]; + Vector result; + GOwnPtr error; + + size_t size = 0; + do { + g_converter_convert(G_CONVERTER(m_iconvEncoder.get()), + input, inputLength, + buffer, sizeof(buffer), + G_CONVERTER_INPUT_AT_END, + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + if (bytesWritten > 0) { + result.grow(size + bytesWritten); + memcpy(result.data() + size, buffer, bytesWritten); + size += bytesWritten; + } + + if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + UChar codePoint = reinterpret_cast(input)[0]; + UnencodableReplacementArray replacement; + int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement); + + // Consume the invalid character. + input += sizeof(UChar); + inputLength -= sizeof(UChar); + + // Append replacement string to result buffer. 
+ result.grow(size + replacementLength); + memcpy(result.data() + size, replacement, replacementLength); + size += replacementLength; + + error.clear(); + } + } while (inputLength && !error.get()); + + if (error) { + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); + return CString(); + } + + return CString(result.data(), size); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/gtk/TextCodecGtk.h b/Source/WebCore/platform/text/gtk/TextCodecGtk.h new file mode 100644 index 0000000..bb3a445 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextCodecGtk.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov + * Copyright (C) 2008 Jürg Billeter + * Copyright (C) 2009 Dominik Röttsches + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodecGTK_h +#define TextCodecGTK_h + +#include "GRefPtr.h" +#include +#include "TextCodec.h" +#include "TextEncoding.h" + +namespace WebCore { + + class TextCodecGtk : public TextCodec { + public: + static void registerBaseEncodingNames(EncodingNameRegistrar); + static void registerBaseCodecs(TextCodecRegistrar); + + static void registerExtendedEncodingNames(EncodingNameRegistrar); + static void registerExtendedCodecs(TextCodecRegistrar); + + TextCodecGtk(const TextEncoding&); + virtual ~TextCodecGtk(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + void createIConvDecoder() const; + void createIConvEncoder() const; + + TextEncoding m_encoding; + size_t m_numBufferedBytes; + unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character + mutable GRefPtr m_iconvDecoder; + mutable GRefPtr m_iconvEncoder; + }; + +} // namespace WebCore + +#endif // TextCodecGTK_h -- cgit v1.1