summary | refs | log | tree | commit | diff | stats
path: root/WebCore/platform/text/TextEncoding.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/platform/text/TextEncoding.cpp')
-rw-r--r-- WebCore/platform/text/TextEncoding.cpp | 213
1 files changed, 213 insertions, 0 deletions
diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp
new file mode 100644
index 0000000..9026049
--- /dev/null
+++ b/WebCore/platform/text/TextEncoding.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextEncoding.h"
+
+#include "CString.h"
+#include "PlatformString.h"
+#include "TextCodec.h"
+#include "TextDecoder.h"
+#include "TextEncodingRegistry.h"
+#if USE(ICU_UNICODE)
+#include <unicode/unorm.h>
+#elif USE(QT4_UNICODE)
+#include <QString>
+#endif
+#include <wtf/HashSet.h>
+#include <wtf/OwnPtr.h>
+
+namespace WebCore {
+
+// Looks up the atomic canonical name for |name| and inserts it into |set|.
+// Nothing is inserted when atomicCanonicalTextEncodingName() returns null.
+static void addEncodingName(HashSet<const char*>& set, const char* name)
+{
+    if (const char* canonicalName = atomicCanonicalTextEncodingName(name))
+        set.add(canonicalName);
+}
+
+// Builds an encoding from a C-string name. m_name holds whatever
+// atomicCanonicalTextEncodingName() returns for it; the other members of this
+// file treat a null m_name as "no usable encoding".
+TextEncoding::TextEncoding(const char* name)
+    : m_name(atomicCanonicalTextEncodingName(name))
+{
+}
+
+// Builds an encoding from a String name by passing its character buffer and
+// length to the (characters, length) overload of
+// atomicCanonicalTextEncodingName().
+TextEncoding::TextEncoding(const String& name)
+    : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
+{
+}
+
+// Decodes |length| bytes starting at |data| using this encoding.
+// Returns a default-constructed String when this encoding has no name.
+// |stopOnError| and |sawError| are forwarded untouched to TextDecoder::decode().
+String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
+{
+    if (m_name) {
+        TextDecoder decoder(*this);
+        // The literal 'true' is TextDecoder::decode()'s third argument
+        // (presumably "flush"; confirm against TextDecoder.h).
+        return decoder.decode(data, length, true, stopOnError, sawError);
+    }
+
+    return String();
+}
+
+// Encodes |length| UTF-16 code units starting at |characters| into this
+// encoding, after normalizing the text to Unicode Normalization Form C when a
+// normalization backend (ICU or Qt4) is compiled in.
+//
+// Returns a default-constructed CString when this encoding has no name, and an
+// empty ("") CString when |length| is zero. |handling| is forwarded to the
+// codec and selects what happens to characters the encoding cannot represent.
+//
+// If ICU normalization fails, the original, un-normalized characters are
+// encoded rather than whatever happens to be in the scratch buffer.
+CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
+{
+    if (!m_name)
+        return CString();
+
+    if (!length)
+        return "";
+
+#if USE(ICU_UNICODE)
+    // FIXME: What's the right place to do normalization?
+    // It's a little strange to do it inside the encode function.
+    // Perhaps normalization should be an explicit step done before calling encode.
+
+    const UChar* source = characters;
+    size_t sourceLength = length;
+
+    Vector<UChar> normalizedCharacters;
+
+    UErrorCode err = U_ZERO_ERROR;
+    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
+        // First try using the length of the original string, since normalization to NFC rarely increases length.
+        normalizedCharacters.grow(sourceLength);
+        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
+        if (err == U_BUFFER_OVERFLOW_ERROR) {
+            // The first pass reported the required size; retry with a buffer of exactly that size.
+            err = U_ZERO_ERROR;
+            normalizedCharacters.resize(normalizedLength);
+            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
+        }
+        ASSERT(U_SUCCESS(err));
+
+        // Only switch to the normalized buffer when ICU actually produced it.
+        // When assertions are compiled out, a failure must not cause us to
+        // encode an empty or partially written buffer; fall back to the
+        // caller's characters instead.
+        if (U_SUCCESS(err)) {
+            source = normalizedCharacters.data();
+            sourceLength = normalizedLength;
+        }
+    }
+    return newTextCodec(*this)->encode(source, sourceLength, handling);
+#elif USE(QT4_UNICODE)
+    QString str(reinterpret_cast<const QChar*>(characters), length);
+    str = str.normalized(QString::NormalizationForm_C);
+    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
+#else
+    // No normalization backend is available in this configuration. Encode the
+    // characters as given so that the function always returns a value.
+    return newTextCodec(*this)->encode(characters, length, handling);
+#endif
+}
+
+// Returns true only when this encoding's canonical name is the one registered
+// for "ISO-8859-8". Always false while noExtendedTextEncodingNameUsed()
+// reports true.
+bool TextEncoding::usesVisualOrdering() const
+{
+    if (noExtendedTextEncodingNameUsed())
+        return false;
+
+    // Canonical names are atomic, so comparing pointers is sufficient.
+    static const char* const visualOrderingName = atomicCanonicalTextEncodingName("ISO-8859-8");
+    return visualOrderingName == m_name;
+}
+
+// Returns true when this encoding's canonical name belongs to the set of
+// Japanese encodings listed below. Always false while
+// noExtendedTextEncodingNameUsed() reports true, or when this encoding has no
+// name.
+bool TextEncoding::isJapanese() const
+{
+    if (noExtendedTextEncodingNameUsed())
+        return false;
+
+    static const char* const japaneseEncodingNames[] = {
+        "x-mac-japanese",
+        "cp932",
+        "JIS_X0201",
+        "JIS_X0208-1983",
+        "JIS_X0208-1990",
+        "JIS_X0212-1990",
+        "JIS_C6226-1978",
+        "Shift_JIS_X0213-2000",
+        "ISO-2022-JP",
+        "ISO-2022-JP-2",
+        "ISO-2022-JP-1",
+        "ISO-2022-JP-3",
+        "EUC-JP",
+        "Shift_JIS"
+    };
+
+    // Filled on first use (and again on any later call that finds it empty).
+    static HashSet<const char*> japaneseEncodings;
+    if (japaneseEncodings.isEmpty()) {
+        const size_t nameCount = sizeof(japaneseEncodingNames) / sizeof(japaneseEncodingNames[0]);
+        for (size_t i = 0; i < nameCount; ++i)
+            addEncodingName(japaneseEncodings, japaneseEncodingNames[i]);
+    }
+
+    if (!m_name)
+        return false;
+    return japaneseEncodings.contains(m_name);
+}
+
+// Returns the character that should stand in for a backslash in this
+// encoding: U+00A5 for the two encodings named below, a plain backslash for
+// everything else (and whenever noExtendedTextEncodingNameUsed() is true).
+UChar TextEncoding::backslashAsCurrencySymbol() const
+{
+    if (noExtendedTextEncodingNameUsed())
+        return '\\';
+
+    // The text encodings below treat backslash as a currency symbol.
+    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
+    static const char* const shiftJISX0213Name = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
+    static const char* const eucJPName = atomicCanonicalTextEncodingName("EUC-JP");
+
+    if (m_name == shiftJISX0213Name || m_name == eucJPName)
+        return 0x00A5;
+    return '\\';
+}
+
+// Maps either UTF-16 flavor (big- or little-endian) to UTF-8; every other
+// encoding is returned unchanged.
+const TextEncoding& TextEncoding::closest8BitEquivalent() const
+{
+    const bool isUTF16 = *this == UTF16BigEndianEncoding() || *this == UTF16LittleEndianEncoding();
+    if (!isUTF16)
+        return *this;
+    return UTF8Encoding();
+}
+
+// Returns the shared TextEncoding constructed from the name "ASCII".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& ASCIIEncoding()
+{
+    static TextEncoding encoding("ASCII");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "Latin-1".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& Latin1Encoding()
+{
+    static TextEncoding encoding("Latin-1");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "UTF-16BE".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& UTF16BigEndianEncoding()
+{
+    static TextEncoding encoding("UTF-16BE");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "UTF-16LE".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& UTF16LittleEndianEncoding()
+{
+    static TextEncoding encoding("UTF-16LE");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "UTF-32BE".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& UTF32BigEndianEncoding()
+{
+    static TextEncoding encoding("UTF-32BE");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "UTF-32LE".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& UTF32LittleEndianEncoding()
+{
+    static TextEncoding encoding("UTF-32LE");
+    return encoding;
+}
+
+
+// Returns the shared TextEncoding constructed from the name "UTF-8".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& UTF8Encoding()
+{
+    static TextEncoding encoding("UTF-8");
+    return encoding;
+}
+
+// Returns the shared TextEncoding constructed from the name "WinLatin-1".
+// The instance is created on the first call and reused afterwards.
+const TextEncoding& WindowsLatin1Encoding()
+{
+    static TextEncoding encoding("WinLatin-1");
+    return encoding;
+}
+
+} // namespace WebCore