diff options
author | Steve Block <steveblock@google.com> | 2011-05-06 11:45:16 +0100 |
---|---|---|
committer | Steve Block <steveblock@google.com> | 2011-05-12 13:44:10 +0100 |
commit | cad810f21b803229eb11403f9209855525a25d57 (patch) | |
tree | 29a6fd0279be608e0fe9ffe9841f722f0f4e4269 /Source/WebCore/platform/text | |
parent | 121b0cf4517156d0ac5111caf9830c51b69bae8f (diff) | |
download | external_webkit-cad810f21b803229eb11403f9209855525a25d57.zip external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.gz external_webkit-cad810f21b803229eb11403f9209855525a25d57.tar.bz2 |
Merge WebKit at r75315: Initial merge by git.
Change-Id: I570314b346ce101c935ed22a626b48c2af266b84
Diffstat (limited to 'Source/WebCore/platform/text')
85 files changed, 13885 insertions, 0 deletions
diff --git a/Source/WebCore/platform/text/AtomicStringKeyedMRUCache.h b/Source/WebCore/platform/text/AtomicStringKeyedMRUCache.h new file mode 100644 index 0000000..b3004f7 --- /dev/null +++ b/Source/WebCore/platform/text/AtomicStringKeyedMRUCache.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef AtomicStringKeyedMRUCache_h +#define AtomicStringKeyedMRUCache_h + +#include <wtf/text/AtomicString.h> + +namespace WebCore { + +template<typename T, size_t capacity = 4> +class AtomicStringKeyedMRUCache { +public: + T get(const AtomicString& key) + { + if (key.isNull()) { + DEFINE_STATIC_LOCAL(T, valueForNull, (createValueForNullKey())); + return valueForNull; + } + + for (size_t i = 0; i < m_cache.size(); ++i) { + if (m_cache[i].first == key) { + size_t foundIndex = i; + if (foundIndex + 1 < m_cache.size()) { + Entry entry = m_cache[foundIndex]; + m_cache.remove(foundIndex); + foundIndex = m_cache.size(); + m_cache.append(entry); + } + return m_cache[foundIndex].second; + } + } + if (m_cache.size() == capacity) + m_cache.remove(0); + + m_cache.append(std::make_pair(key, createValueForKey(key))); + return m_cache.last().second; + } + +private: + T createValueForNullKey(); + T createValueForKey(const AtomicString&); + + typedef pair<AtomicString, T> Entry; + typedef Vector<Entry, capacity> Cache; + Cache m_cache; +}; + +} + +#endif // AtomicStringKeyedMRUCache_h diff --git a/Source/WebCore/platform/text/Base64.cpp b/Source/WebCore/platform/text/Base64.cpp new file mode 100644 index 0000000..98b537a --- /dev/null +++ b/Source/WebCore/platform/text/Base64.cpp @@ -0,0 +1,210 @@ +/* + Copyright (C) 2000-2001 Dawit Alemayehu <adawit@kde.org> + Copyright (C) 2006 Alexey Proskuryakov <ap@webkit.org> + Copyright (C) 2007, 2008 Apple Inc. All rights reserved. + Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License (LGPL) + version 2 as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU Library General Public + License along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + This code is based on the java implementation in HTTPClient + package by Ronald Tschalär Copyright (C) 1996-1999. +*/ + +#include "config.h" +#include "Base64.h" + +#include <limits.h> +#include <wtf/StringExtras.h> +#include <wtf/text/WTFString.h> + +namespace WebCore { + +static const char base64EncMap[64] = { + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x30, 0x31, 0x32, 0x33, + 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, 0x2F +}; + +static const char base64DecMap[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x00, 0x3F, + 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, + 0x3C, 0x3D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, + 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, + 0x17, 0x18, 0x19, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, + 0x31, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs) +{ + base64Encode(in.data(), in.size(), out, insertLFs); +} + +void base64Encode(const char* data, unsigned len, Vector<char>& out, 
bool insertLFs) +{ + out.clear(); + if (!len) + return; + + // If the input string is pathologically large, just return nothing. + // Note: Keep this in sync with the "outLength" computation below. + // Rather than being perfectly precise, this is a bit conservative. + const unsigned maxInputBufferSize = UINT_MAX / 77 * 76 / 4 * 3 - 2; + if (len > maxInputBufferSize) + return; + + unsigned sidx = 0; + unsigned didx = 0; + + unsigned outLength = ((len + 2) / 3) * 4; + + // Deal with the 76 character per line limit specified in RFC 2045. + insertLFs = (insertLFs && outLength > 76); + if (insertLFs) + outLength += ((outLength - 1) / 76); + + int count = 0; + out.grow(outLength); + + // 3-byte to 4-byte conversion + 0-63 to ascii printable conversion + if (len > 1) { + while (sidx < len - 2) { + if (insertLFs) { + if (count && !(count % 76)) + out[didx++] = '\n'; + count += 4; + } + out[didx++] = base64EncMap[(data[sidx] >> 2) & 077]; + out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)]; + out[didx++] = base64EncMap[((data[sidx + 2] >> 6) & 003) | ((data[sidx + 1] << 2) & 077)]; + out[didx++] = base64EncMap[data[sidx + 2] & 077]; + sidx += 3; + } + } + + if (sidx < len) { + if (insertLFs && (count > 0) && !(count % 76)) + out[didx++] = '\n'; + + out[didx++] = base64EncMap[(data[sidx] >> 2) & 077]; + if (sidx < len - 1) { + out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)]; + out[didx++] = base64EncMap[(data[sidx + 1] << 2) & 077]; + } else + out[didx++] = base64EncMap[(data[sidx] << 4) & 077]; + } + + // Add padding + while (didx < out.size()) { + out[didx] = '='; + didx++; + } +} + +bool base64Decode(const Vector<char>& in, Vector<char>& out, Base64DecodePolicy policy) +{ + out.clear(); + + // If the input string is pathologically large, just return nothing. 
+ if (in.size() > UINT_MAX) + return false; + + return base64Decode(in.data(), in.size(), out, policy); +} + +template<typename T> +static inline bool base64DecodeInternal(const T* data, unsigned len, Vector<char>& out, Base64DecodePolicy policy) +{ + out.clear(); + if (!len) + return true; + + out.grow(len); + + bool sawEqualsSign = false; + unsigned outLength = 0; + for (unsigned idx = 0; idx < len; idx++) { + unsigned ch = data[idx]; + if (ch == '=') + sawEqualsSign = true; + else if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ch == '+' || ch == '/') { + if (sawEqualsSign) + return false; + out[outLength] = base64DecMap[ch]; + outLength++; + } else if (policy == FailOnInvalidCharacter || (policy == IgnoreWhitespace && !isSpaceOrNewline(ch))) + return false; + } + + if (!outLength) + return !sawEqualsSign; + + // Valid data is (n * 4 + [0,2,3]) characters long. + if ((outLength % 4) == 1) + return false; + + // 4-byte to 3-byte conversion + outLength -= (outLength + 3) / 4; + if (!outLength) + return false; + + unsigned sidx = 0; + unsigned didx = 0; + if (outLength > 1) { + while (didx < outLength - 2) { + out[didx] = (((out[sidx] << 2) & 255) | ((out[sidx + 1] >> 4) & 003)); + out[didx + 1] = (((out[sidx + 1] << 4) & 255) | ((out[sidx + 2] >> 2) & 017)); + out[didx + 2] = (((out[sidx + 2] << 6) & 255) | (out[sidx + 3] & 077)); + sidx += 4; + didx += 3; + } + } + + if (didx < outLength) + out[didx] = (((out[sidx] << 2) & 255) | ((out[sidx + 1] >> 4) & 003)); + + if (++didx < outLength) + out[didx] = (((out[sidx + 1] << 4) & 255) | ((out[sidx + 2] >> 2) & 017)); + + if (outLength < out.size()) + out.shrink(outLength); + + return true; +} + +bool base64Decode(const char* data, unsigned len, Vector<char>& out, Base64DecodePolicy policy) +{ + return base64DecodeInternal<char>(data, len, out, policy); +} + +bool base64Decode(const String& in, Vector<char>& out, Base64DecodePolicy policy) +{ + return 
base64DecodeInternal<UChar>(in.characters(), in.length(), out, policy); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/Base64.h b/Source/WebCore/platform/text/Base64.h new file mode 100644 index 0000000..211bd3c --- /dev/null +++ b/Source/WebCore/platform/text/Base64.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2006 Alexey Proskuryakov <ap@webkit.org> + * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef Base64_h +#define Base64_h + +#include <wtf/Forward.h> +#include <wtf/Vector.h> + +namespace WebCore { + +enum Base64DecodePolicy { FailOnInvalidCharacter, IgnoreWhitespace, IgnoreInvalidCharacters }; + +void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false); +void base64Encode(const char*, unsigned, Vector<char>&, bool insertLFs = false); + +bool base64Decode(const String&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter); +bool base64Decode(const Vector<char>&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter); +bool base64Decode(const char*, unsigned, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter); + +} + +#endif // Base64_h diff --git a/Source/WebCore/platform/text/BidiContext.cpp b/Source/WebCore/platform/text/BidiContext.cpp new file mode 100644 index 0000000..fb6b8cf --- /dev/null +++ b/Source/WebCore/platform/text/BidiContext.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2000 Lars Knoll (knoll@kde.org) + * Copyright (C) 2003, 2004, 2006, 2007, 2009, 2010 Apple Inc. All right reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "BidiContext.h" + +namespace WebCore { + +using namespace WTF::Unicode; + +inline PassRefPtr<BidiContext> BidiContext::createUncached(unsigned char level, Direction direction, bool override, BidiContext* parent) +{ + return adoptRef(new BidiContext(level, direction, override, parent)); +} + +PassRefPtr<BidiContext> BidiContext::create(unsigned char level, Direction direction, bool override, BidiContext* parent) +{ + ASSERT(direction == (level % 2 ? RightToLeft : LeftToRight)); + + if (parent) + return createUncached(level, direction, override, parent); + + ASSERT(level <= 1); + if (!level) { + if (!override) { + static BidiContext* ltrContext = createUncached(0, LeftToRight, false, 0).releaseRef(); + return ltrContext; + } + + static BidiContext* ltrOverrideContext = createUncached(0, LeftToRight, true, 0).releaseRef(); + return ltrOverrideContext; + } + + if (!override) { + static BidiContext* rtlContext = createUncached(1, RightToLeft, false, 0).releaseRef(); + return rtlContext; + } + + static BidiContext* rtlOverrideContext = createUncached(1, RightToLeft, true, 0).releaseRef(); + return rtlOverrideContext; +} + +bool operator==(const BidiContext& c1, const BidiContext& c2) +{ + if (&c1 == &c2) + return true; + if (c1.level() != c2.level() || c1.override() != c2.override() || c1.dir() != c2.dir()) + return false; + if (!c1.parent()) + return !c2.parent(); + return c2.parent() && *c1.parent() == *c2.parent(); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/BidiContext.h b/Source/WebCore/platform/text/BidiContext.h new file mode 100644 index 0000000..b52815f --- /dev/null +++ b/Source/WebCore/platform/text/BidiContext.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2000 Lars Knoll (knoll@kde.org) + * Copyright (C) 2003, 2004, 2006, 2007, 2009, 2010 Apple Inc. All right reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef BidiContext_h +#define BidiContext_h + +#include <wtf/Assertions.h> +#include <wtf/PassRefPtr.h> +#include <wtf/RefCounted.h> +#include <wtf/RefPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +// Used to keep track of explicit embeddings. 
+class BidiContext : public RefCounted<BidiContext> { +public: + static PassRefPtr<BidiContext> create(unsigned char level, WTF::Unicode::Direction direction, bool override = false, BidiContext* parent = 0); + + BidiContext* parent() const { return m_parent.get(); } + unsigned char level() const { return m_level; } + WTF::Unicode::Direction dir() const { return static_cast<WTF::Unicode::Direction>(m_direction); } + bool override() const { return m_override; } + +private: + BidiContext(unsigned char level, WTF::Unicode::Direction direction, bool override, BidiContext* parent) + : m_level(level) + , m_direction(direction) + , m_override(override) + , m_parent(parent) + { + } + + static PassRefPtr<BidiContext> createUncached(unsigned char level, WTF::Unicode::Direction, bool override, BidiContext* parent); + + unsigned char m_level; + unsigned m_direction : 5; // Direction + bool m_override : 1; + RefPtr<BidiContext> m_parent; +}; + +bool operator==(const BidiContext&, const BidiContext&); + +} // namespace WebCore + +#endif // BidiContext_h diff --git a/Source/WebCore/platform/text/BidiResolver.h b/Source/WebCore/platform/text/BidiResolver.h new file mode 100644 index 0000000..1f87115 --- /dev/null +++ b/Source/WebCore/platform/text/BidiResolver.h @@ -0,0 +1,968 @@ +/* + * Copyright (C) 2000 Lars Knoll (knoll@kde.org) + * Copyright (C) 2003, 2004, 2006, 2007, 2008 Apple Inc. All right reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef BidiResolver_h +#define BidiResolver_h + +#include "BidiContext.h" +#include <wtf/Noncopyable.h> +#include <wtf/PassRefPtr.h> +#include <wtf/Vector.h> + +namespace WebCore { + +template <class Iterator> struct MidpointState { + MidpointState() + { + reset(); + } + + void reset() + { + numMidpoints = 0; + currentMidpoint = 0; + betweenMidpoints = false; + } + + // The goal is to reuse the line state across multiple + // lines so we just keep an array around for midpoints and never clear it across multiple + // lines. We track the number of items and position using the two other variables. + Vector<Iterator> midpoints; + unsigned numMidpoints; + unsigned currentMidpoint; + bool betweenMidpoints; +}; + +// The BidiStatus at a given position (typically the end of a line) can +// be cached and then used to restart bidi resolution at that position. 
+struct BidiStatus { + BidiStatus() + : eor(WTF::Unicode::OtherNeutral) + , lastStrong(WTF::Unicode::OtherNeutral) + , last(WTF::Unicode::OtherNeutral) + { + } + + BidiStatus(WTF::Unicode::Direction eorDir, WTF::Unicode::Direction lastStrongDir, WTF::Unicode::Direction lastDir, PassRefPtr<BidiContext> bidiContext) + : eor(eorDir) + , lastStrong(lastStrongDir) + , last(lastDir) + , context(bidiContext) + { + } + + WTF::Unicode::Direction eor; + WTF::Unicode::Direction lastStrong; + WTF::Unicode::Direction last; + RefPtr<BidiContext> context; +}; + +inline bool operator==(const BidiStatus& status1, const BidiStatus& status2) +{ + return status1.eor == status2.eor && status1.last == status2.last && status1.lastStrong == status2.lastStrong && *(status1.context) == *(status2.context); +} + +inline bool operator!=(const BidiStatus& status1, const BidiStatus& status2) +{ + return !(status1 == status2); +} + +struct BidiCharacterRun { + BidiCharacterRun(int start, int stop, BidiContext* context, WTF::Unicode::Direction dir) + : m_start(start) + , m_stop(stop) + , m_override(context->override()) + , m_next(0) + { + if (dir == WTF::Unicode::OtherNeutral) + dir = context->dir(); + + m_level = context->level(); + + // add level of run (cases I1 & I2) + if (m_level % 2) { + if (dir == WTF::Unicode::LeftToRight || dir == WTF::Unicode::ArabicNumber || dir == WTF::Unicode::EuropeanNumber) + m_level++; + } else { + if (dir == WTF::Unicode::RightToLeft) + m_level++; + else if (dir == WTF::Unicode::ArabicNumber || dir == WTF::Unicode::EuropeanNumber) + m_level += 2; + } + } + + void destroy() { delete this; } + + int start() const { return m_start; } + int stop() const { return m_stop; } + unsigned char level() const { return m_level; } + bool reversed(bool visuallyOrdered) { return m_level % 2 && !visuallyOrdered; } + bool dirOverride(bool visuallyOrdered) { return m_override || visuallyOrdered; } + + BidiCharacterRun* next() const { return m_next; } + + unsigned char m_level; + int 
m_start; + int m_stop; + bool m_override; + BidiCharacterRun* m_next; +}; + +template <class Iterator, class Run> class BidiResolver : public Noncopyable { +public : + BidiResolver() + : m_direction(WTF::Unicode::OtherNeutral) + , reachedEndOfLine(false) + , emptyRun(true) + , m_firstRun(0) + , m_lastRun(0) + , m_logicallyLastRun(0) + , m_runCount(0) + { + } + + const Iterator& position() const { return current; } + void setPosition(const Iterator& position) { current = position; } + + void increment() { current.increment(); } + + BidiContext* context() const { return m_status.context.get(); } + void setContext(PassRefPtr<BidiContext> c) { m_status.context = c; } + + void setLastDir(WTF::Unicode::Direction lastDir) { m_status.last = lastDir; } + void setLastStrongDir(WTF::Unicode::Direction lastStrongDir) { m_status.lastStrong = lastStrongDir; } + void setEorDir(WTF::Unicode::Direction eorDir) { m_status.eor = eorDir; } + + WTF::Unicode::Direction dir() const { return m_direction; } + void setDir(WTF::Unicode::Direction d) { m_direction = d; } + + const BidiStatus& status() const { return m_status; } + void setStatus(const BidiStatus s) { m_status = s; } + + MidpointState<Iterator>& midpointState() { return m_midpointState; } + + void embed(WTF::Unicode::Direction); + void commitExplicitEmbedding(); + + void createBidiRunsForLine(const Iterator& end, bool visualOrder = false, bool hardLineBreak = false); + + Run* firstRun() const { return m_firstRun; } + Run* lastRun() const { return m_lastRun; } + Run* logicallyLastRun() const { return m_logicallyLastRun; } + unsigned runCount() const { return m_runCount; } + + void addRun(Run*); + void prependRun(Run*); + + void moveRunToEnd(Run*); + void moveRunToBeginning(Run*); + + void deleteRuns(); + +protected: + void appendRun(); + void reverseRuns(unsigned start, unsigned end); + + Iterator current; + Iterator sor; + Iterator eor; + Iterator last; + BidiStatus m_status; + WTF::Unicode::Direction m_direction; + Iterator 
endOfLine; + bool reachedEndOfLine; + Iterator lastBeforeET; + bool emptyRun; + + Run* m_firstRun; + Run* m_lastRun; + Run* m_logicallyLastRun; + unsigned m_runCount; + MidpointState<Iterator> m_midpointState; + +private: + void raiseExplicitEmbeddingLevel(WTF::Unicode::Direction from, WTF::Unicode::Direction to); + void lowerExplicitEmbeddingLevel(WTF::Unicode::Direction from); + void checkDirectionInLowerRaiseEmbeddingLevel(); + + Vector<WTF::Unicode::Direction, 8> m_currentExplicitEmbeddingSequence; +}; + +template <class Iterator, class Run> +inline void BidiResolver<Iterator, Run>::addRun(Run* run) +{ + if (!m_firstRun) + m_firstRun = run; + else + m_lastRun->m_next = run; + m_lastRun = run; + m_runCount++; +} + +template <class Iterator, class Run> +inline void BidiResolver<Iterator, Run>::prependRun(Run* run) +{ + ASSERT(!run->m_next); + + if (!m_lastRun) + m_lastRun = run; + else + run->m_next = m_firstRun; + m_firstRun = run; + m_runCount++; +} + +template <class Iterator, class Run> +inline void BidiResolver<Iterator, Run>::moveRunToEnd(Run* run) +{ + ASSERT(m_firstRun); + ASSERT(m_lastRun); + ASSERT(run->m_next); + + Run* current = 0; + Run* next = m_firstRun; + while (next != run) { + current = next; + next = current->next(); + } + + if (!current) + m_firstRun = run->next(); + else + current->m_next = run->m_next; + + run->m_next = 0; + m_lastRun->m_next = run; + m_lastRun = run; +} + +template <class Iterator, class Run> +inline void BidiResolver<Iterator, Run>::moveRunToBeginning(Run* run) +{ + ASSERT(m_firstRun); + ASSERT(m_lastRun); + ASSERT(run != m_firstRun); + + Run* current = m_firstRun; + Run* next = current->next(); + while (next != run) { + current = next; + next = current->next(); + } + + current->m_next = run->m_next; + if (run == m_lastRun) + m_lastRun = current; + + run->m_next = m_firstRun; + m_firstRun = run; +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::appendRun() +{ + if (!emptyRun && !eor.atEnd()) { + 
unsigned startOffset = sor.offset(); + unsigned endOffset = eor.offset(); + + if (!endOfLine.atEnd() && endOffset >= endOfLine.offset()) { + reachedEndOfLine = true; + endOffset = endOfLine.offset(); + } + + if (endOffset >= startOffset) + addRun(new Run(startOffset, endOffset + 1, context(), m_direction)); + + eor.increment(); + sor = eor; + } + + m_direction = WTF::Unicode::OtherNeutral; + m_status.eor = WTF::Unicode::OtherNeutral; +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::embed(WTF::Unicode::Direction d) +{ + using namespace WTF::Unicode; + + ASSERT(d == PopDirectionalFormat || d == LeftToRightEmbedding || d == LeftToRightOverride || d == RightToLeftEmbedding || d == RightToLeftOverride); + m_currentExplicitEmbeddingSequence.append(d); +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::checkDirectionInLowerRaiseEmbeddingLevel() +{ + using namespace WTF::Unicode; + + ASSERT(m_status.eor != OtherNeutral || eor.atEnd()); + // bidi.sor ... bidi.eor ... bidi.last eor; need to append the bidi.sor-bidi.eor run or extend it through bidi.last + // Bidi control characters are included into BidiRun, so last direction + // could be one of the bidi embeddings when there are nested embeddings. + // For example: "‪‫....." + ASSERT(m_status.last == EuropeanNumberSeparator + || m_status.last == EuropeanNumberTerminator + || m_status.last == CommonNumberSeparator + || m_status.last == BoundaryNeutral + || m_status.last == BlockSeparator + || m_status.last == SegmentSeparator + || m_status.last == WhiteSpaceNeutral + || m_status.last == OtherNeutral + || m_status.last == RightToLeftEmbedding + || m_status.last == LeftToRightEmbedding + || m_status.last == RightToLeftOverride + || m_status.last == LeftToRightOverride + || m_status.last == PopDirectionalFormat); + if (m_direction == OtherNeutral) + m_direction = m_status.lastStrong == LeftToRight ? 
LeftToRight : RightToLeft; +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::lowerExplicitEmbeddingLevel(WTF::Unicode::Direction from) +{ + using namespace WTF::Unicode; + + if (!emptyRun && eor != last) { + checkDirectionInLowerRaiseEmbeddingLevel(); + if (from == LeftToRight) { + // bidi.sor ... bidi.eor ... bidi.last L + if (m_status.eor == EuropeanNumber) { + if (m_status.lastStrong != LeftToRight) { + m_direction = EuropeanNumber; + appendRun(); + } + } else if (m_status.eor == ArabicNumber) { + m_direction = ArabicNumber; + appendRun(); + } else if (m_status.lastStrong != LeftToRight) { + appendRun(); + m_direction = LeftToRight; + } + } else if (m_status.eor == EuropeanNumber || m_status.eor == ArabicNumber || m_status.lastStrong == LeftToRight) { + appendRun(); + m_direction = RightToLeft; + } + eor = last; + } + appendRun(); + emptyRun = true; + // sor for the new run is determined by the higher level (rule X10) + setLastDir(from); + setLastStrongDir(from); + eor = Iterator(); +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Direction from, WTF::Unicode::Direction to) +{ + using namespace WTF::Unicode; + + if (!emptyRun && eor != last) { + checkDirectionInLowerRaiseEmbeddingLevel(); + if (to == LeftToRight) { + // bidi.sor ... bidi.eor ... 
bidi.last L + if (m_status.eor == EuropeanNumber) { + if (m_status.lastStrong != LeftToRight) { + m_direction = EuropeanNumber; + appendRun(); + } + } else if (m_status.eor == ArabicNumber) { + m_direction = ArabicNumber; + appendRun(); + } else if (m_status.lastStrong != LeftToRight && from == LeftToRight) { + appendRun(); + m_direction = LeftToRight; + } + } else if (m_status.eor == ArabicNumber + || (m_status.eor == EuropeanNumber && (m_status.lastStrong != LeftToRight || from == RightToLeft)) + || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && from == RightToLeft)) { + appendRun(); + m_direction = RightToLeft; + } + eor = last; + } + appendRun(); + emptyRun = true; + setLastDir(to); + setLastStrongDir(to); + eor = Iterator(); +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::commitExplicitEmbedding() +{ + using namespace WTF::Unicode; + + unsigned char fromLevel = context()->level(); + RefPtr<BidiContext> toContext = context(); + + for (size_t i = 0; i < m_currentExplicitEmbeddingSequence.size(); ++i) { + Direction embedding = m_currentExplicitEmbeddingSequence[i]; + if (embedding == PopDirectionalFormat) { + if (BidiContext* parentContext = toContext->parent()) + toContext = parentContext; + } else { + Direction direction = (embedding == RightToLeftEmbedding || embedding == RightToLeftOverride) ? RightToLeft : LeftToRight; + bool override = embedding == LeftToRightOverride || embedding == RightToLeftOverride; + unsigned char level = toContext->level(); + if (direction == RightToLeft) { + // Go to the least greater odd integer + level += 1; + level |= 1; + } else { + // Go to the least greater even integer + level += 2; + level &= ~1; + } + if (level < 61) + toContext = BidiContext::create(level, direction, override, toContext.get()); + } + } + + unsigned char toLevel = toContext->level(); + + if (toLevel > fromLevel) + raiseExplicitEmbeddingLevel(fromLevel % 2 ? RightToLeft : LeftToRight, toLevel % 2 ? 
RightToLeft : LeftToRight); + else if (toLevel < fromLevel) + lowerExplicitEmbeddingLevel(fromLevel % 2 ? RightToLeft : LeftToRight); + + setContext(toContext); + + m_currentExplicitEmbeddingSequence.clear(); +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::deleteRuns() +{ + emptyRun = true; + if (!m_firstRun) + return; + + Run* curr = m_firstRun; + while (curr) { + Run* s = curr->next(); + curr->destroy(); + curr = s; + } + + m_firstRun = 0; + m_lastRun = 0; + m_runCount = 0; +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::reverseRuns(unsigned start, unsigned end) +{ + if (start >= end) + return; + + ASSERT(end < m_runCount); + + // Get the item before the start of the runs to reverse and put it in + // |beforeStart|. |curr| should point to the first run to reverse. + Run* curr = m_firstRun; + Run* beforeStart = 0; + unsigned i = 0; + while (i < start) { + i++; + beforeStart = curr; + curr = curr->next(); + } + + Run* startRun = curr; + while (i < end) { + i++; + curr = curr->next(); + } + Run* endRun = curr; + Run* afterEnd = curr->next(); + + i = start; + curr = startRun; + Run* newNext = afterEnd; + while (i <= end) { + // Do the reversal. + Run* next = curr->next(); + curr->m_next = newNext; + newNext = curr; + curr = next; + i++; + } + + // Now hook up beforeStart and afterEnd to the startRun and endRun. 
+ if (beforeStart) + beforeStart->m_next = endRun; + else + m_firstRun = endRun; + + startRun->m_next = afterEnd; + if (!afterEnd) + m_lastRun = startRun; +} + +template <class Iterator, class Run> +void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, bool visualOrder, bool hardLineBreak) +{ + using namespace WTF::Unicode; + + ASSERT(m_direction == OtherNeutral); + + emptyRun = true; + + eor = Iterator(); + + last = current; + bool pastEnd = false; + BidiResolver<Iterator, Run> stateAtEnd; + + while (true) { + Direction dirCurrent; + if (pastEnd && (hardLineBreak || current.atEnd())) { + BidiContext* c = context(); + while (c->parent()) + c = c->parent(); + dirCurrent = c->dir(); + if (hardLineBreak) { + // A deviation from the Unicode Bidi Algorithm in order to match + // Mac OS X text and WinIE: a hard line break resets bidi state. + stateAtEnd.setContext(c); + stateAtEnd.setEorDir(dirCurrent); + stateAtEnd.setLastDir(dirCurrent); + stateAtEnd.setLastStrongDir(dirCurrent); + } + } else { + dirCurrent = current.direction(); + if (context()->override() + && dirCurrent != RightToLeftEmbedding + && dirCurrent != LeftToRightEmbedding + && dirCurrent != RightToLeftOverride + && dirCurrent != LeftToRightOverride + && dirCurrent != PopDirectionalFormat) + dirCurrent = context()->dir(); + else if (dirCurrent == NonSpacingMark) + dirCurrent = m_status.last; + } + + ASSERT(m_status.eor != OtherNeutral || eor.atEnd()); + switch (dirCurrent) { + + // embedding and overrides (X1-X9 in the Bidi specs) + case RightToLeftEmbedding: + case LeftToRightEmbedding: + case RightToLeftOverride: + case LeftToRightOverride: + case PopDirectionalFormat: + embed(dirCurrent); + commitExplicitEmbedding(); + break; + + // strong types + case LeftToRight: + switch(m_status.last) { + case RightToLeft: + case RightToLeftArabic: + case EuropeanNumber: + case ArabicNumber: + if (m_status.last != EuropeanNumber || m_status.lastStrong != LeftToRight) + appendRun(); + break; + 
case LeftToRight: + break; + case EuropeanNumberSeparator: + case EuropeanNumberTerminator: + case CommonNumberSeparator: + case BoundaryNeutral: + case BlockSeparator: + case SegmentSeparator: + case WhiteSpaceNeutral: + case OtherNeutral: + if (m_status.eor == EuropeanNumber) { + if (m_status.lastStrong != LeftToRight) { + // the numbers need to be on a higher embedding level, so let's close that run + m_direction = EuropeanNumber; + appendRun(); + if (context()->dir() != LeftToRight) { + // the neutrals take the embedding direction, which is R + eor = last; + m_direction = RightToLeft; + appendRun(); + } + } + } else if (m_status.eor == ArabicNumber) { + // Arabic numbers are always on a higher embedding level, so let's close that run + m_direction = ArabicNumber; + appendRun(); + if (context()->dir() != LeftToRight) { + // the neutrals take the embedding direction, which is R + eor = last; + m_direction = RightToLeft; + appendRun(); + } + } else if (m_status.lastStrong != LeftToRight) { + //last stuff takes embedding dir + if (context()->dir() == RightToLeft) { + eor = last; + m_direction = RightToLeft; + } + appendRun(); + } + default: + break; + } + eor = current; + m_status.eor = LeftToRight; + m_status.lastStrong = LeftToRight; + m_direction = LeftToRight; + break; + case RightToLeftArabic: + case RightToLeft: + switch (m_status.last) { + case LeftToRight: + case EuropeanNumber: + case ArabicNumber: + appendRun(); + case RightToLeft: + case RightToLeftArabic: + break; + case EuropeanNumberSeparator: + case EuropeanNumberTerminator: + case CommonNumberSeparator: + case BoundaryNeutral: + case BlockSeparator: + case SegmentSeparator: + case WhiteSpaceNeutral: + case OtherNeutral: + if (m_status.eor == EuropeanNumber) { + if (m_status.lastStrong == LeftToRight && context()->dir() == LeftToRight) + eor = last; + appendRun(); + } else if (m_status.eor == ArabicNumber) + appendRun(); + else if (m_status.lastStrong == LeftToRight) { + if (context()->dir() == 
LeftToRight) + eor = last; + appendRun(); + } + default: + break; + } + eor = current; + m_status.eor = RightToLeft; + m_status.lastStrong = dirCurrent; + m_direction = RightToLeft; + break; + + // weak types: + + case EuropeanNumber: + if (m_status.lastStrong != RightToLeftArabic) { + // if last strong was AL change EN to AN + switch (m_status.last) { + case EuropeanNumber: + case LeftToRight: + break; + case RightToLeft: + case RightToLeftArabic: + case ArabicNumber: + eor = last; + appendRun(); + m_direction = EuropeanNumber; + break; + case EuropeanNumberSeparator: + case CommonNumberSeparator: + if (m_status.eor == EuropeanNumber) + break; + case EuropeanNumberTerminator: + case BoundaryNeutral: + case BlockSeparator: + case SegmentSeparator: + case WhiteSpaceNeutral: + case OtherNeutral: + if (m_status.eor == EuropeanNumber) { + if (m_status.lastStrong == RightToLeft) { + // ENs on both sides behave like Rs, so the neutrals should be R. + // Terminate the EN run. + appendRun(); + // Make an R run. + eor = m_status.last == EuropeanNumberTerminator ? lastBeforeET : last; + m_direction = RightToLeft; + appendRun(); + // Begin a new EN run. + m_direction = EuropeanNumber; + } + } else if (m_status.eor == ArabicNumber) { + // Terminate the AN run. + appendRun(); + if (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft) { + // Make an R run. + eor = m_status.last == EuropeanNumberTerminator ? lastBeforeET : last; + m_direction = RightToLeft; + appendRun(); + // Begin a new EN run. + m_direction = EuropeanNumber; + } + } else if (m_status.lastStrong == RightToLeft) { + // Extend the R run to include the neutrals. + eor = m_status.last == EuropeanNumberTerminator ? lastBeforeET : last; + m_direction = RightToLeft; + appendRun(); + // Begin a new EN run. 
+ m_direction = EuropeanNumber; + } + default: + break; + } + eor = current; + m_status.eor = EuropeanNumber; + if (m_direction == OtherNeutral) + m_direction = LeftToRight; + break; + } + case ArabicNumber: + dirCurrent = ArabicNumber; + switch (m_status.last) { + case LeftToRight: + if (context()->dir() == LeftToRight) + appendRun(); + break; + case ArabicNumber: + break; + case RightToLeft: + case RightToLeftArabic: + case EuropeanNumber: + eor = last; + appendRun(); + break; + case CommonNumberSeparator: + if (m_status.eor == ArabicNumber) + break; + case EuropeanNumberSeparator: + case EuropeanNumberTerminator: + case BoundaryNeutral: + case BlockSeparator: + case SegmentSeparator: + case WhiteSpaceNeutral: + case OtherNeutral: + if (m_status.eor == ArabicNumber + || (m_status.eor == EuropeanNumber && (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft)) + || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && context()->dir() == RightToLeft)) { + // Terminate the run before the neutrals. + appendRun(); + // Begin an R run for the neutrals. + m_direction = RightToLeft; + } else if (m_direction == OtherNeutral) + m_direction = m_status.lastStrong == LeftToRight ? LeftToRight : RightToLeft; + eor = last; + appendRun(); + default: + break; + } + eor = current; + m_status.eor = ArabicNumber; + if (m_direction == OtherNeutral) + m_direction = ArabicNumber; + break; + case EuropeanNumberSeparator: + case CommonNumberSeparator: + break; + case EuropeanNumberTerminator: + if (m_status.last == EuropeanNumber) { + dirCurrent = EuropeanNumber; + eor = current; + m_status.eor = dirCurrent; + } else if (m_status.last != EuropeanNumberTerminator) + lastBeforeET = emptyRun ? eor : last; + break; + + // boundary neutrals should be ignored + case BoundaryNeutral: + if (eor == last) + eor = current; + break; + // neutrals + case BlockSeparator: + // ### what do we do with newline and paragraph seperators that come to here? 
+ break; + case SegmentSeparator: + // ### implement rule L1 + break; + case WhiteSpaceNeutral: + break; + case OtherNeutral: + break; + default: + break; + } + + if (pastEnd && eor == current) { + if (!reachedEndOfLine) { + eor = endOfLine; + switch (m_status.eor) { + case LeftToRight: + case RightToLeft: + case ArabicNumber: + m_direction = m_status.eor; + break; + case EuropeanNumber: + m_direction = m_status.lastStrong == LeftToRight ? LeftToRight : EuropeanNumber; + break; + default: + ASSERT(false); + } + appendRun(); + } + current = end; + m_status = stateAtEnd.m_status; + sor = stateAtEnd.sor; + eor = stateAtEnd.eor; + last = stateAtEnd.last; + reachedEndOfLine = stateAtEnd.reachedEndOfLine; + lastBeforeET = stateAtEnd.lastBeforeET; + emptyRun = stateAtEnd.emptyRun; + m_direction = OtherNeutral; + break; + } + + // set m_status.last as needed. + switch (dirCurrent) { + case EuropeanNumberTerminator: + if (m_status.last != EuropeanNumber) + m_status.last = EuropeanNumberTerminator; + break; + case EuropeanNumberSeparator: + case CommonNumberSeparator: + case SegmentSeparator: + case WhiteSpaceNeutral: + case OtherNeutral: + switch(m_status.last) { + case LeftToRight: + case RightToLeft: + case RightToLeftArabic: + case EuropeanNumber: + case ArabicNumber: + m_status.last = dirCurrent; + break; + default: + m_status.last = OtherNeutral; + } + break; + case NonSpacingMark: + case BoundaryNeutral: + // ignore these + break; + case EuropeanNumber: + // fall through + default: + m_status.last = dirCurrent; + } + + last = current; + + if (emptyRun) { + sor = current; + emptyRun = false; + } + + increment(); + if (!m_currentExplicitEmbeddingSequence.isEmpty()) { + commitExplicitEmbedding(); + if (pastEnd) { + current = end; + m_status = stateAtEnd.m_status; + sor = stateAtEnd.sor; + eor = stateAtEnd.eor; + last = stateAtEnd.last; + reachedEndOfLine = stateAtEnd.reachedEndOfLine; + lastBeforeET = stateAtEnd.lastBeforeET; + emptyRun = stateAtEnd.emptyRun; + 
m_direction = OtherNeutral; + break; + } + } + + if (!pastEnd && (current == end || current.atEnd())) { + if (emptyRun) + break; + stateAtEnd.m_status = m_status; + stateAtEnd.sor = sor; + stateAtEnd.eor = eor; + stateAtEnd.last = last; + stateAtEnd.reachedEndOfLine = reachedEndOfLine; + stateAtEnd.lastBeforeET = lastBeforeET; + stateAtEnd.emptyRun = emptyRun; + endOfLine = last; + pastEnd = true; + } + } + + m_logicallyLastRun = m_lastRun; + + // reorder line according to run structure... + // do not reverse for visually ordered web sites + if (!visualOrder) { + + // first find highest and lowest levels + unsigned char levelLow = 128; + unsigned char levelHigh = 0; + Run* r = firstRun(); + while (r) { + if (r->m_level > levelHigh) + levelHigh = r->m_level; + if (r->m_level < levelLow) + levelLow = r->m_level; + r = r->next(); + } + + // implements reordering of the line (L2 according to Bidi spec): + // L2. From the highest level found in the text to the lowest odd level on each line, + // reverse any contiguous sequence of characters that are at that level or higher. 
+ + // reversing is only done up to the lowest odd level + if (!(levelLow % 2)) + levelLow++; + + unsigned count = runCount() - 1; + + while (levelHigh >= levelLow) { + unsigned i = 0; + Run* currRun = firstRun(); + while (i < count) { + while (i < count && currRun && currRun->m_level < levelHigh) { + i++; + currRun = currRun->next(); + } + unsigned start = i; + while (i <= count && currRun && currRun->m_level >= levelHigh) { + i++; + currRun = currRun->next(); + } + unsigned end = i - 1; + reverseRuns(start, end); + } + levelHigh--; + } + } + endOfLine = Iterator(); +} + +} // namespace WebCore + +#endif // BidiResolver_h diff --git a/Source/WebCore/platform/text/CharacterNames.h b/Source/WebCore/platform/text/CharacterNames.h new file mode 100644 index 0000000..c4b496e --- /dev/null +++ b/Source/WebCore/platform/text/CharacterNames.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CharacterNames_h +#define CharacterNames_h + +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +// Names here are taken from the Unicode standard. + +// Most of these are UChar constants, not UChar32, which makes them +// more convenient for WebCore code that mostly uses UTF-16. + +const UChar32 aegeanWordSeparatorLine = 0x10100; +const UChar32 aegeanWordSeparatorDot = 0x10101; +const UChar blackCircle = 0x25CF; +const UChar blackSquare = 0x25A0; +const UChar blackUpPointingTriangle = 0x25B2; +const UChar bullet = 0x2022; +const UChar bullseye = 0x25CE; +const UChar carriageReturn = 0x000D; +const UChar ethiopicPrefaceColon = 0x1366; +const UChar ethiopicWordspace = 0x1361; +const UChar fisheye = 0x25C9; +const UChar hebrewPunctuationGeresh = 0x05F3; +const UChar hebrewPunctuationGershayim = 0x05F4; +const UChar horizontalEllipsis = 0x2026; +const UChar hyphen = 0x2010; +const UChar hyphenMinus = 0x002D; +const UChar ideographicComma = 0x3001; +const UChar ideographicFullStop = 0x3002; +const UChar ideographicSpace = 0x3000; +const UChar leftDoubleQuotationMark = 0x201C; +const UChar leftSingleQuotationMark = 0x2018; +const UChar leftToRightEmbed = 0x202A; +const UChar leftToRightMark = 0x200E; +const UChar leftToRightOverride = 0x202D; +const UChar minusSign = 0x2212; +const UChar newlineCharacter = 0x000A; +const UChar noBreakSpace = 0x00A0; +const UChar objectReplacementCharacter = 0xFFFC; +const UChar popDirectionalFormatting = 0x202C; +const 
UChar replacementCharacter = 0xFFFD; +const UChar rightDoubleQuotationMark = 0x201D; +const UChar rightSingleQuotationMark = 0x2019; +const UChar rightToLeftEmbed = 0x202B; +const UChar rightToLeftMark = 0x200F; +const UChar rightToLeftOverride = 0x202E; +const UChar sesameDot = 0xFE45; +const UChar softHyphen = 0x00AD; +const UChar space = 0x0020; +const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B; +const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C; +const UChar32 ugariticWordDivider = 0x1039F; +const UChar whiteBullet = 0x25E6; +const UChar whiteCircle = 0x25CB; +const UChar whiteSesameDot = 0xFE46; +const UChar whiteUpPointingTriangle = 0x25B3; +const UChar yenSign = 0x00A5; +const UChar zeroWidthJoiner = 0x200D; +const UChar zeroWidthNonJoiner = 0x200C; +const UChar zeroWidthSpace = 0x200B; + +} + +#endif // CharacterNames_h diff --git a/Source/WebCore/platform/text/Hyphenation.cpp b/Source/WebCore/platform/text/Hyphenation.cpp new file mode 100644 index 0000000..89f6438 --- /dev/null +++ b/Source/WebCore/platform/text/Hyphenation.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "Hyphenation.h" + +#include "NotImplemented.h" + +namespace WebCore { + +bool canHyphenate(const AtomicString& /* localeIdentifier */) +{ + return false; +} + +size_t lastHyphenLocation(const UChar* /* characters */, size_t /* length */, size_t /* beforeIndex */, const AtomicString& /* localeIdentifier */) +{ + ASSERT_NOT_REACHED(); + return 0; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/Hyphenation.h b/Source/WebCore/platform/text/Hyphenation.h new file mode 100644 index 0000000..a99bff0 --- /dev/null +++ b/Source/WebCore/platform/text/Hyphenation.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef Hyphenation_h +#define Hyphenation_h + +#include <wtf/Forward.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +bool canHyphenate(const AtomicString& localeIdentifier); +size_t lastHyphenLocation(const UChar*, size_t length, size_t beforeIndex, const AtomicString& localeIdentifier); + +} // namespace WebCore + +#endif // Hyphenation_h diff --git a/Source/WebCore/platform/text/LineEnding.cpp b/Source/WebCore/platform/text/LineEnding.cpp new file mode 100644 index 0000000..00a90eb --- /dev/null +++ b/Source/WebCore/platform/text/LineEnding.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2005, 2006, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2010 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "LineEnding.h" + +#include "PlatformString.h" +#include <wtf/text/CString.h> + +namespace { + +class OutputBuffer { +public: + virtual char* allocate(size_t size) = 0; + virtual void copy(const CString&) = 0; + virtual ~OutputBuffer() { } +}; + +class CStringBuffer : public OutputBuffer { +public: + CStringBuffer(CString& buffer) + : m_buffer(buffer) + { + } + virtual ~CStringBuffer() { } + + virtual char* allocate(size_t size) + { + char* ptr; + m_buffer = CString::newUninitialized(size, ptr); + return ptr; + } + + virtual void copy(const CString& source) + { + m_buffer = source; + } + + const CString& buffer() const { return m_buffer; } + +private: + CString m_buffer; +}; + +class VectorCharAppendBuffer : public OutputBuffer { +public: + VectorCharAppendBuffer(Vector<char>& buffer) + : m_buffer(buffer) + { + } + virtual ~VectorCharAppendBuffer() { } + + virtual char* allocate(size_t size) + { + size_t oldSize = m_buffer.size(); + m_buffer.grow(oldSize + size); + return m_buffer.data() + oldSize; + } + + virtual void copy(const CString& source) + { + m_buffer.append(source.data(), source.length()); + } + +private: + Vector<char>& m_buffer; 
+}; + +void internalNormalizeLineEndingsToCRLF(const CString& from, OutputBuffer& buffer) +{ + // Compute the new length. + size_t newLen = 0; + const char* p = from.data(); + while (char c = *p++) { + if (c == '\r') { + // Safe to look ahead because of trailing '\0'. + if (*p != '\n') { + // Turn CR into CRLF. + newLen += 2; + } + } else if (c == '\n') { + // Turn LF into CRLF. + newLen += 2; + } else { + // Leave other characters alone. + newLen += 1; + } + } + if (newLen < from.length()) + return; + + if (newLen == from.length()) { + buffer.copy(from); + return; + } + + p = from.data(); + char* q = buffer.allocate(newLen); + + // Make a copy of the string. + while (char c = *p++) { + if (c == '\r') { + // Safe to look ahead because of trailing '\0'. + if (*p != '\n') { + // Turn CR into CRLF. + *q++ = '\r'; + *q++ = '\n'; + } + } else if (c == '\n') { + // Turn LF into CRLF. + *q++ = '\r'; + *q++ = '\n'; + } else { + // Leave other characters alone. + *q++ = c; + } + } +} + +}; + +namespace WebCore { + +void normalizeToCROrLF(const CString& from, Vector<char>& result, bool toCR); + +// Normalize all line-endings to CR or LF. +void normalizeToCROrLF(const CString& from, Vector<char>& result, bool toCR) +{ + // Compute the new length. + size_t newLen = 0; + bool needFix = false; + const char* p = from.data(); + char fromEndingChar = toCR ? '\n' : '\r'; + char toEndingChar = toCR ? '\r' : '\n'; + while (char c = *p++) { + if (c == '\r' && *p == '\n') { + // Turn CRLF into CR or LF. + p++; + needFix = true; + } else if (c == fromEndingChar) { + // Turn CR/LF into LF/CR. + needFix = true; + } + newLen += 1; + } + + // Grow the result buffer. + p = from.data(); + size_t oldResultSize = result.size(); + result.grow(oldResultSize + newLen); + char* q = result.data() + oldResultSize; + + // If no need to fix the string, just copy the string over. + if (!needFix) { + memcpy(q, p, from.length()); + return; + } + + // Make a copy of the string. 
+ while (char c = *p++) { + if (c == '\r' && *p == '\n') { + // Turn CRLF or CR into CR or LF. + p++; + *q++ = toEndingChar; + } else if (c == fromEndingChar) { + // Turn CR/LF into LF/CR. + *q++ = toEndingChar; + } else { + // Leave other characters alone. + *q++ = c; + } + } +} + +CString normalizeLineEndingsToCRLF(const CString& from) +{ + CString result; + CStringBuffer buffer(result); + internalNormalizeLineEndingsToCRLF(from, buffer); + return buffer.buffer(); +} + +void normalizeLineEndingsToCR(const CString& from, Vector<char>& result) +{ + normalizeToCROrLF(from, result, true); +} + +void normalizeLineEndingsToLF(const CString& from, Vector<char>& result) +{ + normalizeToCROrLF(from, result, false); +} + +void normalizeLineEndingsToNative(const CString& from, Vector<char>& result) +{ +#if OS(WINDOWS) + VectorCharAppendBuffer buffer(result); + internalNormalizeLineEndingsToCRLF(from, buffer); +#else + normalizeLineEndingsToLF(from, result); +#endif +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/LineEnding.h b/Source/WebCore/platform/text/LineEnding.h new file mode 100644 index 0000000..4306ce8 --- /dev/null +++ b/Source/WebCore/platform/text/LineEnding.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2005, 2006, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2010 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LineEnding_h +#define LineEnding_h + +#include <wtf/Forward.h> +#include <wtf/Vector.h> + +namespace WebCore { + +// Normalize all line-endings in the given string to CRLF. +CString normalizeLineEndingsToCRLF(const CString& from); + +// Normalize all line-endings in the given string to CR and append the result to the given buffer. +void normalizeLineEndingsToCR(const CString& from, Vector<char>& result); + +// Normalize all line-endings in the given string to LF and append the result to the given buffer. +void normalizeLineEndingsToLF(const CString& from, Vector<char>& result); + +// Normalize all line-endings in the given string to the native line-endings and append the result to the given buffer. +// (Normalize to CRLF on Windows and normalize to LF on all other platforms.) 
+void normalizeLineEndingsToNative(const CString& from, Vector<char>& result); + +} // namespace WebCore + +#endif // LineEnding_h diff --git a/Source/WebCore/platform/text/ParserUtilities.h b/Source/WebCore/platform/text/ParserUtilities.h new file mode 100644 index 0000000..3105214 --- /dev/null +++ b/Source/WebCore/platform/text/ParserUtilities.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2002, 2003 The Karbon Developers + * Copyright (C) 2006, 2007 Rob Buis <buis@kde.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef ParserUtilities_h +#define ParserUtilities_h + +#include "PlatformString.h" + +namespace WebCore { + + inline bool skipString(const UChar*& ptr, const UChar* end, const UChar* name, int length) + { + if (end - ptr < length) + return false; + if (memcmp(name, ptr, sizeof(UChar) * length)) + return false; + ptr += length; + return true; + } + + inline bool skipString(const UChar*& ptr, const UChar* end, const char* str) + { + int length = strlen(str); + if (end - ptr < length) + return false; + for (int i = 0; i < length; ++i) { + if (ptr[i] != str[i]) + return false; + } + ptr += length; + return true; + } + +} // namspace WebCore + +#endif // ParserUtilities_h diff --git a/Source/WebCore/platform/text/PlatformString.h b/Source/WebCore/platform/text/PlatformString.h new file mode 100644 index 0000000..e525bd4 --- /dev/null +++ b/Source/WebCore/platform/text/PlatformString.h @@ -0,0 +1,45 @@ +/* + * (C) 1999 Lars Knoll (knoll@kde.org) + * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#ifndef PlatformString_h +#define PlatformString_h + +// This file would be called String.h, but that conflicts with <string.h> +// on systems without case-sensitive file systems. + +#include <wtf/text/WTFString.h> + +namespace WebCore { + +class SharedBuffer; + +PassRefPtr<SharedBuffer> utf8Buffer(const String&); +// Counts the number of grapheme clusters. A surrogate pair or a sequence +// of a non-combining character and following combining characters is +// counted as 1 grapheme cluster. +unsigned numGraphemeClusters(const String& s); +// Returns the number of characters which will be less than or equal to +// the specified grapheme cluster length. +unsigned numCharactersInGraphemeClusters(const String& s, unsigned); + +} // namespace WebCore + +#endif diff --git a/Source/WebCore/platform/text/RegularExpression.cpp b/Source/WebCore/platform/text/RegularExpression.cpp new file mode 100644 index 0000000..9b063c9 --- /dev/null +++ b/Source/WebCore/platform/text/RegularExpression.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2004, 2008, 2009 Apple Inc. All rights reserved. + * Copyright (C) 2008 Collabora Ltd. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "RegularExpression.h"
+
+#include "Logging.h"
+#include <pcre/pcre.h>
+
+namespace WebCore {
+
+// Shared, ref-counted state behind RegularExpression: the compiled pattern
+// plus the length of the most recent match.
+class RegularExpression::Private : public RefCounted<RegularExpression::Private> {
+public:
+    static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity);
+    ~Private();
+
+    JSRegExp* regexp() const { return m_regexp; }
+    int lastMatchLength;
+
+private:
+    Private(const String& pattern, TextCaseSensitivity);
+    static JSRegExp* compile(const String& pattern, TextCaseSensitivity);
+
+    JSRegExp* m_regexp;
+};
+
+// Compiles |pattern|; logs and returns null when compilation fails.
+inline JSRegExp* RegularExpression::Private::compile(const String& pattern, TextCaseSensitivity caseSensitivity)
+{
+    const char* errorMessage;
+    JSRegExp* compiled = jsRegExpCompile(pattern.characters(), pattern.length(),
+        caseSensitivity == TextCaseSensitive ? JSRegExpDoNotIgnoreCase : JSRegExpIgnoreCase, JSRegExpSingleLine,
+        0, &errorMessage);
+    if (compiled)
+        return compiled;
+
+    LOG_ERROR("RegularExpression: pcre_compile failed with '%s'", errorMessage);
+    return 0;
+}
+
+inline RegularExpression::Private::Private(const String& pattern, TextCaseSensitivity caseSensitivity)
+    : lastMatchLength(-1)
+    , m_regexp(compile(pattern, caseSensitivity))
+{
+}
+
+inline PassRefPtr<RegularExpression::Private> RegularExpression::Private::create(const String& pattern, TextCaseSensitivity caseSensitivity)
+{
+    return adoptRef(new Private(pattern, caseSensitivity));
+}
+
+RegularExpression::Private::~Private()
+{
+    jsRegExpFree(m_regexp);
+}
+
+RegularExpression::RegularExpression(const String& pattern, TextCaseSensitivity caseSensitivity)
+    : d(Private::create(pattern, caseSensitivity))
+{
+}
+
+// Copies share the same Private object.
+RegularExpression::RegularExpression(const RegularExpression& re)
+    : d(re.d)
+{
+}
+
+RegularExpression::~RegularExpression()
+{
+}
+
+RegularExpression& RegularExpression::operator=(const RegularExpression& re)
+{
+    d = re.d;
+    return *this;
+}
+
+// Runs the pattern against |str| beginning at |startFrom|. Returns the start
+// offset of the match, or -1 when nothing matched; the match length is
+// remembered and also stored through |matchLength| when that is non-null.
+int RegularExpression::match(const String& str, int startFrom, int* matchLength) const
+{
+    // Nothing can match without a compiled pattern or with a null subject.
+    if (!d->regexp() || str.isNull())
+        return -1;
+
+    // First 2 offsets are start and end offsets; 3rd entry is used internally by pcre
+    static const size_t maxOffsets = 3;
+    int offsets[maxOffsets];
+    const int status = jsRegExpExecute(d->regexp(), str.characters(), str.length(), startFrom, offsets, maxOffsets);
+
+    if (status >= 0) {
+        // 1 means 1 match; 0 means more than one match. First match is recorded in offsets.
+        const int length = offsets[1] - offsets[0];
+        d->lastMatchLength = length;
+        if (matchLength)
+            *matchLength = length;
+        return offsets[0];
+    }
+
+    if (status != JSRegExpErrorNoMatch)
+        LOG_ERROR("RegularExpression: pcre_exec() failed with result %d", status);
+    d->lastMatchLength = -1;
+    return -1;
+}
+
+// Returns the start offset of the last match in |str|, or -1 if there is none.
+int RegularExpression::searchRev(const String& str) const
+{
+    // FIXME: This could be faster if it actually searched backwards.
+    // Instead, it just searches forwards, multiple times until it finds the last match.
+
+    int bestPosition = -1;
+    int bestLength = -1;
+    int searchFrom = 0;
+    while (true) {
+        int candidateLength;
+        const int candidate = match(str, searchFrom, &candidateLength);
+        if (candidate >= 0) {
+            // Keep this match only if it ends later than the one we already
+            // have, i.e. it is not a subset of the previous match.
+            if (candidate + candidateLength > bestPosition + bestLength) {
+                bestPosition = candidate;
+                bestLength = candidateLength;
+            }
+            searchFrom = candidate + 1;
+        } else if (candidate == -1)
+            break;
+    }
+    d->lastMatchLength = bestLength;
+    return bestPosition;
+}
+
+int RegularExpression::matchedLength() const
+{
+    return d->lastMatchLength;
+}
+
+// Replaces matches of |target| in |string| with |replacement|, scanning
+// forward and resuming after each inserted replacement.
+void replace(String& string, const RegularExpression& target, const String& replacement)
+{
+    for (int cursor = 0; cursor < static_cast<int>(string.length()); ) {
+        int matchedLength;
+        const int found = target.match(string, cursor, &matchedLength);
+        if (found < 0)
+            return;
+        string.replace(found, matchedLength, replacement);
+        if (!matchedLength)
+            return; // Avoid infinite loop on 0-length matches, e.g. [a-z]*
+        cursor = found + replacement.length();
+    }
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/RegularExpression.h b/Source/WebCore/platform/text/RegularExpression.h
new file mode 100644
index 0000000..f1611e5
--- /dev/null
+++ b/Source/WebCore/platform/text/RegularExpression.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2003, 2008, 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RegularExpression_h
+#define RegularExpression_h
+
+#include "PlatformString.h"
+
+namespace WebCore {
+
+// A compiled regular expression. The compiled state lives in a ref-counted
+// Private object, so copies of a RegularExpression share it (including the
+// remembered length of the last match).
+class RegularExpression : public FastAllocBase {
+public:
+    // Compiles the given pattern with the requested case sensitivity.
+    RegularExpression(const String&, TextCaseSensitivity);
+    ~RegularExpression();
+
+    RegularExpression(const RegularExpression&);
+    RegularExpression& operator=(const RegularExpression&);
+
+    // Returns the offset of the first match at or after startFrom, or -1 if
+    // there is none. On success the match length is written to matchLength
+    // when it is non-null.
+    int match(const String&, int startFrom = 0, int* matchLength = 0) const;
+    // Returns the offset of the last match in the string, or -1 if there is none.
+    int searchRev(const String&) const;
+
+    // Length of the match found by the most recent match()/searchRev() call,
+    // or -1 when that call found nothing.
+    int matchedLength() const;
+
+private:
+    class Private;
+    RefPtr<Private> d;
+};
+
+// Replaces matches of the expression inside the first argument with the
+// third argument, modifying the string in place.
+void replace(String&, const RegularExpression&, const String&);
+
+} // namespace WebCore
+
+#endif // RegularExpression_h
diff --git a/Source/WebCore/platform/text/SegmentedString.cpp b/Source/WebCore/platform/text/SegmentedString.cpp
new file mode 100644
index 0000000..a371582
--- /dev/null
+++ b/Source/WebCore/platform/text/SegmentedString.cpp
@@ -0,0 +1,274 @@
+/*
+    Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc.
All rights reserved.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB. If not, write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA 02110-1301, USA.
+*/
+
+#include "config.h"
+#include "SegmentedString.h"
+
+namespace WebCore {
+
+// Copy constructor. The position counters and the current line are copied as
+// well (as operator= does); leaving them uninitialized made
+// numberOfCharactersConsumed(), currentLine() and currentColumn() read
+// indeterminate values on a copied object.
+SegmentedString::SegmentedString(const SegmentedString &other)
+    : m_pushedChar1(other.m_pushedChar1)
+    , m_pushedChar2(other.m_pushedChar2)
+    , m_currentString(other.m_currentString)
+    , m_numberOfCharactersConsumedPriorToCurrentString(other.m_numberOfCharactersConsumedPriorToCurrentString)
+    , m_numberOfCharactersConsumedPriorToCurrentLine(other.m_numberOfCharactersConsumedPriorToCurrentLine)
+    , m_currentLine(other.m_currentLine)
+    , m_substrings(other.m_substrings)
+    , m_composite(other.m_composite)
+    , m_closed(other.m_closed)
+{
+    // m_currentChar may point at one of the pushed characters inside |other|;
+    // redirect it to our own member so it does not dangle.
+    if (other.m_currentChar == &other.m_pushedChar1)
+        m_currentChar = &m_pushedChar1;
+    else if (other.m_currentChar == &other.m_pushedChar2)
+        m_currentChar = &m_pushedChar2;
+    else
+        m_currentChar = other.m_currentChar;
+}
+
+const SegmentedString& SegmentedString::operator=(const SegmentedString &other)
+{
+    m_pushedChar1 = other.m_pushedChar1;
+    m_pushedChar2 = other.m_pushedChar2;
+    m_currentString = other.m_currentString;
+    m_substrings = other.m_substrings;
+    m_composite = other.m_composite;
+    // Same pointer fix-up as in the copy constructor.
+    if (other.m_currentChar == &other.m_pushedChar1)
+        m_currentChar = &m_pushedChar1;
+    else if (other.m_currentChar == &other.m_pushedChar2)
+        m_currentChar = &m_pushedChar2;
+    else
+        m_currentChar = other.m_currentChar;
+    m_closed = other.m_closed;
+    m_numberOfCharactersConsumedPriorToCurrentString = other.m_numberOfCharactersConsumedPriorToCurrentString;
+    m_numberOfCharactersConsumedPriorToCurrentLine = other.m_numberOfCharactersConsumedPriorToCurrentLine;
+    m_currentLine = other.m_currentLine;
+
+    return *this;
+}
+
+// Number of characters still to be consumed: pushed characters, the rest of
+// the current substring and all queued substrings.
+unsigned SegmentedString::length() const
+{
+    unsigned length = m_currentString.m_length;
+    if (m_pushedChar1) {
+        ++length;
+        if (m_pushedChar2)
+            ++length;
+    }
+    if (m_composite) {
+        Deque<SegmentedSubstring>::const_iterator it = m_substrings.begin();
+        Deque<SegmentedSubstring>::const_iterator e = m_substrings.end();
+        for (; it != e; ++it)
+            length += it->m_length;
+    }
+    return length;
+}
+
+// Marks the current and all queued substrings so that their newlines no
+// longer bump the line counters.
+void SegmentedString::setExcludeLineNumbers()
+{
+    m_currentString.setExcludeLineNumbers();
+    if (m_composite) {
+        Deque<SegmentedSubstring>::iterator it = m_substrings.begin();
+        Deque<SegmentedSubstring>::iterator e = m_substrings.end();
+        for (; it != e; ++it)
+            it->setExcludeLineNumbers();
+    }
+}
+
+// Drops all pending input and reopens the string.
+// NOTE(review): the consumed-character counters and m_currentLine are left
+// untouched here; confirm that callers expect position tracking to survive
+// clear().
+void SegmentedString::clear()
+{
+    m_pushedChar1 = 0;
+    m_pushedChar2 = 0;
+    m_currentChar = 0;
+    m_currentString.clear();
+    m_substrings.clear();
+    m_composite = false;
+    m_closed = false;
+}
+
+// Queues |s| after the existing input. Empty substrings are ignored.
+void SegmentedString::append(const SegmentedSubstring &s)
+{
+    ASSERT(!m_closed);
+    if (s.m_length) {
+        if (!m_currentString.m_length) {
+            // The current substring is exhausted: account for what it
+            // consumed before replacing it.
+            m_numberOfCharactersConsumedPriorToCurrentString += m_currentString.numberOfCharactersConsumed();
+            m_currentString = s;
+        } else {
+            m_substrings.append(s);
+            m_composite = true;
+        }
+    }
+}
+
+// Puts |s| in front of the existing input. Empty substrings are ignored.
+void SegmentedString::prepend(const SegmentedSubstring &s)
+{
+    ASSERT(!escaped());
+    ASSERT(!s.numberOfCharactersConsumed());
+    if (s.m_length) {
+        // FIXME: We're assuming that the prepend were originally consumed by
+        //        this SegmentedString. We're also ASSERTing that s is a fresh
+        //        SegmentedSubstring. These assumptions are sufficient for our
+        //        current use, but we might need to handle the more elaborate
+        //        cases in the future.
+        m_numberOfCharactersConsumedPriorToCurrentString += m_currentString.numberOfCharactersConsumed();
+        m_numberOfCharactersConsumedPriorToCurrentString -= s.m_length;
+        if (!m_currentString.m_length)
+            m_currentString = s;
+        else {
+            // Shift our m_currentString into our list.
+            m_substrings.prepend(m_currentString);
+            m_currentString = s;
+            m_composite = true;
+        }
+    }
+}
+
+void SegmentedString::close()
+{
+    // Closing a stream twice is likely a coding mistake.
+    ASSERT(!m_closed);
+    m_closed = true;
+}
+
+// Appends every substring of |s| in order, then refreshes m_currentChar.
+void SegmentedString::append(const SegmentedString &s)
+{
+    ASSERT(!m_closed);
+    ASSERT(!s.escaped());
+    append(s.m_currentString);
+    if (s.m_composite) {
+        Deque<SegmentedSubstring>::const_iterator it = s.m_substrings.begin();
+        Deque<SegmentedSubstring>::const_iterator e = s.m_substrings.end();
+        for (; it != e; ++it)
+            append(*it);
+    }
+    m_currentChar = m_pushedChar1 ? &m_pushedChar1 : m_currentString.m_current;
+}
+
+// Prepends every substring of |s|, last one first so that their relative
+// order is preserved, then refreshes m_currentChar.
+void SegmentedString::prepend(const SegmentedString &s)
+{
+    ASSERT(!escaped());
+    ASSERT(!s.escaped());
+    if (s.m_composite) {
+        Deque<SegmentedSubstring>::const_reverse_iterator it = s.m_substrings.rbegin();
+        Deque<SegmentedSubstring>::const_reverse_iterator e = s.m_substrings.rend();
+        for (; it != e; ++it)
+            prepend(*it);
+    }
+    prepend(s.m_currentString);
+    m_currentChar = m_pushedChar1 ? &m_pushedChar1 : m_currentString.m_current;
+}
+
+// Moves on to the next queued substring, or empties the current one when
+// nothing is queued.
+void SegmentedString::advanceSubstring()
+{
+    if (m_composite) {
+        m_numberOfCharactersConsumedPriorToCurrentString += m_currentString.numberOfCharactersConsumed();
+        m_currentString = m_substrings.takeFirst();
+        // If we've previously consumed some characters of the non-current
+        // string, we now account for those characters as part of the current
+        // string, not as part of "prior to current string."
+        m_numberOfCharactersConsumedPriorToCurrentString -= m_currentString.numberOfCharactersConsumed();
+        if (m_substrings.isEmpty())
+            m_composite = false;
+    } else {
+        m_currentString.clear();
+    }
+}
+
+// Like numberOfCharactersConsumed(), but also valid while characters are
+// pushed back: each pushed character is subtracted from the count.
+int SegmentedString::numberOfCharactersConsumedSlow() const
+{
+    int result = m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
+    if (m_pushedChar1) {
+        --result;
+        if (m_pushedChar2)
+            --result;
+    }
+    return result;
+}
+
+// Returns the not-yet-consumed input as a single String.
+String SegmentedString::toString() const
+{
+    String result;
+    if (m_pushedChar1) {
+        result.append(m_pushedChar1);
+        if (m_pushedChar2)
+            result.append(m_pushedChar2);
+    }
+    m_currentString.appendTo(result);
+    if (m_composite) {
+        Deque<SegmentedSubstring>::const_iterator it = m_substrings.begin();
+        Deque<SegmentedSubstring>::const_iterator e = m_substrings.end();
+        for (; it != e; ++it)
+            it->appendTo(result);
+    }
+    return result;
+}
+
+// Consumes |count| characters, copying them into |consumedCharacters|.
+void SegmentedString::advance(unsigned count, UChar* consumedCharacters)
+{
+    ASSERT(count <= length());
+    for (unsigned i = 0; i < count; ++i) {
+        consumedCharacters[i] = *current();
+        advance();
+    }
+}
+
+// Slow path of advance(): handles pushed characters and substring boundaries.
+void SegmentedString::advanceSlowCase()
+{
+    if (m_pushedChar1) {
+        m_pushedChar1 = m_pushedChar2;
+        m_pushedChar2 = 0;
+    } else if (m_currentString.m_current) {
+        ++m_currentString.m_current;
+        if (--m_currentString.m_length == 0)
+            advanceSubstring();
+    }
+    m_currentChar = m_pushedChar1 ? &m_pushedChar1 : m_currentString.m_current;
+}
+
+// Slow path of advance(int&): same as above, plus line bookkeeping.
+void SegmentedString::advanceSlowCase(int& lineNumber)
+{
+    if (m_pushedChar1) {
+        m_pushedChar1 = m_pushedChar2;
+        m_pushedChar2 = 0;
+    } else if (m_currentString.m_current) {
+        if (*m_currentString.m_current++ == '\n' && m_currentString.doNotExcludeLineNumbers()) {
+            ++lineNumber;
+            ++m_currentLine;
+            // Plus 1 because numberOfCharactersConsumed() does not include the
+            // newline yet: m_length is only decremented below. This matches
+            // the fast path of advance(int&).
+            m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + 1;
+        }
+        if (--m_currentString.m_length == 0)
+            advanceSubstring();
+    }
+    m_currentChar = m_pushedChar1 ? &m_pushedChar1 : m_currentString.m_current;
+}
+
+WTF::ZeroBasedNumber SegmentedString::currentLine() const
+{
+    return WTF::ZeroBasedNumber::fromZeroBasedInt(m_currentLine);
+}
+
+// Column = characters consumed so far minus characters consumed before the
+// current line started.
+WTF::ZeroBasedNumber SegmentedString::currentColumn() const
+{
+    int zeroBasedColumn = numberOfCharactersConsumedSlow() - m_numberOfCharactersConsumedPriorToCurrentLine;
+    return WTF::ZeroBasedNumber::fromZeroBasedInt(zeroBasedColumn);
+}
+
+// Sets the line directly and derives the line-start offset so that the column
+// equals |columnAftreProlog| once |prologLength| more characters are consumed.
+void SegmentedString::setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength)
+{
+    m_currentLine = line.zeroBasedInt();
+    m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumedSlow() + prologLength - columnAftreProlog.zeroBasedInt();
+}
+
+}
diff --git a/Source/WebCore/platform/text/SegmentedString.h b/Source/WebCore/platform/text/SegmentedString.h
new file mode 100644
index 0000000..5f548c7
--- /dev/null
+++ b/Source/WebCore/platform/text/SegmentedString.h
@@ -0,0 +1,282 @@
+/*
+    Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB. If not, write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA 02110-1301, USA.
+*/
+
+#ifndef SegmentedString_h
+#define SegmentedString_h
+
+#include "PlatformString.h"
+#include <wtf/Deque.h>
+#include <wtf/text/TextPosition.h>
+
+namespace WebCore {
+
+class SegmentedString;
+
+// One piece of a SegmentedString: a String plus a cursor (m_current) and the
+// number of characters left after the cursor (m_length).
+class SegmentedSubstring {
+public:
+    SegmentedSubstring() : m_length(0), m_current(0), m_doNotExcludeLineNumbers(true) {}
+    SegmentedSubstring(const String& str)
+        : m_length(str.length())
+        , m_current(str.isEmpty() ? 0 : str.characters())
+        , m_string(str)
+        , m_doNotExcludeLineNumbers(true)
+    {
+    }
+
+    // Forgets the remaining characters. m_string itself is kept, so
+    // numberOfCharactersConsumed() then reports the whole string as consumed.
+    void clear() { m_length = 0; m_current = 0; }
+
+    bool excludeLineNumbers() const { return !m_doNotExcludeLineNumbers; }
+    bool doNotExcludeLineNumbers() const { return m_doNotExcludeLineNumbers; }
+
+    void setExcludeLineNumbers() { m_doNotExcludeLineNumbers = false; }
+
+    int numberOfCharactersConsumed() const { return m_string.length() - m_length; }
+
+    // Appends the unconsumed characters to |str|, reusing m_string as a whole
+    // when nothing has been consumed yet.
+    void appendTo(String& str) const
+    {
+        if (m_string.characters() == m_current) {
+            if (str.isEmpty())
+                str = m_string;
+            else
+                str.append(m_string);
+        } else {
+            str.append(String(m_current, m_length));
+        }
+    }
+
+public:
+    int m_length;
+    const UChar* m_current;
+
+private:
+    String m_string;
+    bool m_doNotExcludeLineNumbers;
+};
+
+// A queue of substrings consumed one character at a time, with up to two
+// pushed-back characters and line/column bookkeeping.
+class SegmentedString {
+public:
+    SegmentedString()
+        : m_pushedChar1(0)
+        , m_pushedChar2(0)
+        , m_currentChar(0)
+        , m_numberOfCharactersConsumedPriorToCurrentString(0)
+        , m_numberOfCharactersConsumedPriorToCurrentLine(0)
+        , m_currentLine(0)
+        , m_composite(false)
+        , m_closed(false)
+    {
+    }
+
+    SegmentedString(const String& str)
+        : m_pushedChar1(0)
+        , m_pushedChar2(0)
+        , m_currentString(str)
+        , m_currentChar(m_currentString.m_current)
+        , m_numberOfCharactersConsumedPriorToCurrentString(0)
+        , m_numberOfCharactersConsumedPriorToCurrentLine(0)
+        , m_currentLine(0)
+        , m_composite(false)
+        , m_closed(false)
+    {
+    }
+
+    SegmentedString(const SegmentedString&);
+
+    const SegmentedString& operator=(const SegmentedString&);
+
+    void clear();
+    void close();
+
+    void append(const SegmentedString&);
+    void prepend(const SegmentedString&);
+
+    bool excludeLineNumbers() const { return m_currentString.excludeLineNumbers(); }
+    void setExcludeLineNumbers();
+
+    // Pushes a character back in front of the input. At most two characters
+    // can be pending at once.
+    void push(UChar c)
+    {
+        if (!m_pushedChar1) {
+            m_pushedChar1 = c;
+            m_currentChar = m_pushedChar1 ? &m_pushedChar1 : m_currentString.m_current;
+        } else {
+            ASSERT(!m_pushedChar2);
+            m_pushedChar2 = c;
+        }
+    }
+
+    bool isEmpty() const { return !current(); }
+    unsigned length() const;
+
+    bool isClosed() const { return m_closed; }
+
+    enum LookAheadResult {
+        DidNotMatch,
+        DidMatch,
+        NotEnoughCharacters,
+    };
+
+    // Check whether the upcoming input starts with |string| without consuming it.
+    LookAheadResult lookAhead(const String& string) { return lookAheadInline<SegmentedString::equalsLiterally>(string); }
+    LookAheadResult lookAheadIgnoringCase(const String& string) { return lookAheadInline<SegmentedString::equalsIgnoringCase>(string); }
+
+    // Consumes one character without line bookkeeping.
+    void advance()
+    {
+        if (!m_pushedChar1 && m_currentString.m_length > 1) {
+            --m_currentString.m_length;
+            m_currentChar = ++m_currentString.m_current;
+            return;
+        }
+        advanceSlowCase();
+    }
+
+    void advanceAndASSERT(UChar expectedCharacter)
+    {
+        ASSERT_UNUSED(expectedCharacter, *current() == expectedCharacter);
+        advance();
+    }
+
+    void advanceAndASSERTIgnoringCase(UChar expectedCharacter)
+    {
+        ASSERT_UNUSED(expectedCharacter, WTF::Unicode::foldCase(*current()) == WTF::Unicode::foldCase(expectedCharacter));
+        advance();
+    }
+
+    // Consumes a character the caller knows to be '\n', updating the line
+    // counters unless line numbers are excluded for the current substring.
+    void advancePastNewline(int& lineNumber)
+    {
+        ASSERT(*current() == '\n');
+        if (!m_pushedChar1 && m_currentString.m_length > 1) {
+            int newLineFlag = m_currentString.doNotExcludeLineNumbers();
+            lineNumber += newLineFlag;
+            m_currentLine += newLineFlag;
+            // Plus 1 because the newline itself is not counted as consumed
+            // until m_length is decremented below; the new line starts right
+            // after it. This matches advance(int&).
+            if (newLineFlag)
+                m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + 1;
+            --m_currentString.m_length;
+            m_currentChar = ++m_currentString.m_current;
+            return;
+        }
+        advanceSlowCase(lineNumber);
+    }
+
+    // Consumes a character the caller knows not to be '\n'.
+    void advancePastNonNewline()
+    {
+        ASSERT(*current() != '\n');
+        if (!m_pushedChar1 && m_currentString.m_length > 1) {
+            --m_currentString.m_length;
+            m_currentChar = ++m_currentString.m_current;
+            return;
+        }
+        advanceSlowCase();
+    }
+
+    // Consumes one character, updating the line counters when it is a newline
+    // and line numbers are not excluded for the current substring.
+    void advance(int& lineNumber)
+    {
+        if (!m_pushedChar1 && m_currentString.m_length > 1) {
+            int newLineFlag = (*m_currentString.m_current == '\n') & m_currentString.doNotExcludeLineNumbers();
+            lineNumber += newLineFlag;
+            m_currentLine += newLineFlag;
+            if (newLineFlag)
+                m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + 1;
+            --m_currentString.m_length;
+            m_currentChar = ++m_currentString.m_current;
+            return;
+        }
+        advanceSlowCase(lineNumber);
+    }
+
+    // Writes the consumed characters into consumedCharacters, which must
+    // have space for at least |count| characters.
+    void advance(unsigned count, UChar* consumedCharacters);
+
+    bool escaped() const { return m_pushedChar1; }
+
+    int numberOfCharactersConsumed() const
+    {
+        // We don't currently handle the case when there are pushed character.
+        ASSERT(!m_pushedChar1);
+        return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
+    }
+
+    int numberOfCharactersConsumedSlow() const;
+
+    String toString() const;
+
+    const UChar& operator*() const { return *current(); }
+    const UChar* operator->() const { return current(); }
+
+
+    // The method is moderately slow, comparing to currentLine method.
+    WTF::ZeroBasedNumber currentColumn() const;
+    WTF::ZeroBasedNumber currentLine() const;
+    // Sets value of line/column variables. Column is specified indirectly by a parameter columnAftreProlog
+    // which is a value of column that we should get after a prolog (first prologLength characters) has been consumed.
+    void setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength);
+
+private:
+    void append(const SegmentedSubstring&);
+    void prepend(const SegmentedSubstring&);
+
+    void advanceSlowCase();
+    void advanceSlowCase(int& lineNumber);
+    void advanceSubstring();
+    const UChar* current() const { return m_currentChar; }
+
+    static bool equalsLiterally(const UChar* str1, const UChar* str2, size_t count) { return !memcmp(str1, str2, count * sizeof(UChar)); }
+    static bool equalsIgnoringCase(const UChar* str1, const UChar* str2, size_t count) { return !WTF::Unicode::umemcasecmp(str1, str2, count); }
+
+    // Fast path: the whole lookahead fits inside the current substring and no
+    // pushed character is pending, so the comparison runs in place.
+    template<bool equals(const UChar* str1, const UChar* str2, size_t count)>
+    inline LookAheadResult lookAheadInline(const String& string)
+    {
+        if (!m_pushedChar1 && string.length() <= static_cast<unsigned>(m_currentString.m_length)) {
+            if (equals(string.characters(), m_currentString.m_current, string.length()))
+                return DidMatch;
+            return DidNotMatch;
+        }
+        return lookAheadSlowCase<equals>(string);
+    }
+
+    // Slow path: consumes the characters into a temporary string, compares,
+    // then prepends them again so the input is left as it was.
+    template<bool equals(const UChar* str1, const UChar* str2, size_t count)>
+    LookAheadResult lookAheadSlowCase(const String& string)
+    {
+        unsigned count = string.length();
+        if (count > length())
+            return NotEnoughCharacters;
+        UChar* consumedCharacters;
+        String consumedString = String::createUninitialized(count, consumedCharacters);
+        advance(count, consumedCharacters);
+        LookAheadResult result = DidNotMatch;
+        if (equals(string.characters(), consumedCharacters, count))
+            result = DidMatch;
+        prepend(SegmentedString(consumedString));
+        return result;
+    }
+
+    UChar m_pushedChar1;
+    UChar m_pushedChar2;
+    SegmentedSubstring m_currentString;
+    const UChar* m_currentChar;
+    int m_numberOfCharactersConsumedPriorToCurrentString;
+    int m_numberOfCharactersConsumedPriorToCurrentLine;
+    int m_currentLine;
+    Deque<SegmentedSubstring> m_substrings;
+    bool m_composite;
+    bool m_closed;
+};
+
+}
+
+#endif
diff --git
a/Source/WebCore/platform/text/String.cpp b/Source/WebCore/platform/text/String.cpp
new file mode 100644
index 0000000..f2f8d2e
--- /dev/null
+++ b/Source/WebCore/platform/text/String.cpp
@@ -0,0 +1,77 @@
+/*
+ * (C) 1999 Lars Knoll (knoll@kde.org)
+ * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2007-2009 Torch Mobile, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+#include "PlatformString.h"
+
+#include "SharedBuffer.h"
+#include "TextBreakIterator.h"
+#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/Unicode.h>
+
+using namespace WTF;
+using namespace WTF::Unicode;
+
+namespace WebCore {
+
+// Encodes |string| as UTF-8 into a SharedBuffer; returns null if the
+// conversion reports anything other than conversionOK.
+PassRefPtr<SharedBuffer> utf8Buffer(const String& string)
+{
+    // Reserve three bytes per UTF-16 code unit for the encoded output.
+    const int utf16Length = string.length();
+    Vector<char> utf8(utf16Length * 3);
+
+    // Convert to runs of 8-bit characters.
+    char* target = utf8.data();
+    const UChar* source = string.characters();
+    if (convertUTF16ToUTF8(&source, source + utf16Length, &target, target + utf8.size(), true) != conversionOK)
+        return 0;
+
+    // |target| now points one past the last byte written.
+    utf8.shrink(target - utf8.data());
+    return SharedBuffer::adoptVector(utf8);
+}
+
+// Counts how many times the character break iterator can step forward over
+// |s|; falls back to the string length when no iterator is available.
+unsigned numGraphemeClusters(const String& s)
+{
+    TextBreakIterator* iterator = characterBreakIterator(s.characters(), s.length());
+    if (!iterator)
+        return s.length();
+
+    unsigned clusters = 0;
+    for (; textBreakNext(iterator) != TextBreakDone; ++clusters) { }
+    return clusters;
+}
+
+// Returns the character offset reached after stepping over
+// |numGraphemeClusters| clusters, or the string length if the string ends
+// first.
+unsigned numCharactersInGraphemeClusters(const String& s, unsigned numGraphemeClusters)
+{
+    TextBreakIterator* iterator = characterBreakIterator(s.characters(), s.length());
+    if (!iterator)
+        return min(s.length(), numGraphemeClusters);
+
+    unsigned remaining = numGraphemeClusters;
+    while (remaining--) {
+        if (textBreakNext(iterator) == TextBreakDone)
+            return s.length();
+    }
+    return textBreakCurrent(iterator);
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/SuffixTree.h b/Source/WebCore/platform/text/SuffixTree.h
new file mode 100644
index 0000000..f11fd23
--- /dev/null
+++ b/Source/WebCore/platform/text/SuffixTree.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2010 Adam Barth. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SuffixTree_h
+#define SuffixTree_h
+
+#include "PlatformString.h"
+#include <wtf/Vector.h>
+
+namespace WebCore {
+
+// Codebook that maps every UChar to itself.
+class UnicodeCodebook {
+public:
+    static int codeWord(UChar c) { return c; }
+    enum { codeSize = 1 << 8 * sizeof(UChar) };
+};
+
+// Codebook that keeps only the low 7 bits of each UChar.
+class ASCIICodebook {
+public:
+    static int codeWord(UChar c) { return c & (codeSize - 1); }
+    enum { codeSize = 1 << (8 * sizeof(char) - 1) };
+};
+
+// A depth-limited trie over all substrings of a text, used to answer
+// "might the text contain this query?". Only the first |depth| characters of
+// a query are checked, so a true answer is not a guarantee.
+template<typename Codebook>
+class SuffixTree {
+public:
+    SuffixTree(const String& text, unsigned depth)
+        : m_depth(depth)
+        , m_leaf(true)
+    {
+        build(text);
+    }
+
+    // Returns false only when the first min(depth, query length) characters of
+    // |query| do not occur in the text the tree was built from.
+    bool mightContain(const String& query)
+    {
+        Node* current = &m_root;
+        int limit = std::min(m_depth, query.length());
+        for (int i = 0; i < limit; ++i) {
+            current = current->at(Codebook::codeWord(query[i]));
+            if (!current)
+                return false;
+        }
+        return true;
+    }
+
+private:
+    // Not copyable: the nodes own heap-allocated children, and child slots may
+    // point at this object's own m_leaf. A member-wise copy would delete the
+    // same nodes twice. Declared but intentionally not defined.
+    SuffixTree(const SuffixTree&);
+    SuffixTree& operator=(const SuffixTree&);
+
+    class Node {
+    public:
+        Node(bool isLeaf = false)
+        {
+            m_children.resize(Codebook::codeSize);
+            m_children.fill(0);
+            m_isLeaf = isLeaf;
+        }
+
+        // Deletes the owned children. Leaf children are skipped because they
+        // all alias the tree's shared m_leaf member.
+        ~Node()
+        {
+            for (unsigned i = 0; i < m_children.size(); ++i) {
+                Node* child = m_children.at(i);
+                if (child && !child->m_isLeaf)
+                    delete child;
+            }
+        }
+
+        Node*& at(int codeWord) { return m_children.at(codeWord); }
+
+    private:
+        // Not copyable, for the same ownership reason as SuffixTree.
+        Node(const Node&);
+        Node& operator=(const Node&);
+
+        typedef Vector<Node*, Codebook::codeSize> ChildrenVector;
+
+        ChildrenVector m_children;
+        bool m_isLeaf;
+    };
+
+    // Inserts, for every start position in |text|, the substring of at most
+    // m_depth characters beginning there.
+    void build(const String& text)
+    {
+        for (unsigned base = 0; base < text.length(); ++base) {
+            Node* current = &m_root;
+            unsigned limit = std::min(base + m_depth, text.length());
+            for (unsigned offset = 0; base + offset < limit; ++offset) {
+                ASSERT(current != &m_leaf);
+                Node*& child = current->at(Codebook::codeWord(text[base + offset]));
+                if (!child)
+                    child = base + offset + 1 == limit ? &m_leaf : new Node();
+                current = child;
+            }
+        }
+    }
+
+    Node m_root;
+    unsigned m_depth;
+
+    // Instead of allocating a fresh empty leaf node for every leaf in the tree
+    // (there can be a lot of these), we alias all the leaves to this "static"
+    // leaf node.
+    Node m_leaf;
+};
+
+} // namespace WebCore
+
+#endif // SuffixTree_h
diff --git a/Source/WebCore/platform/text/TextBoundaries.cpp b/Source/WebCore/platform/text/TextBoundaries.cpp
new file mode 100644
index 0000000..fbb261b
--- /dev/null
+++ b/Source/WebCore/platform/text/TextBoundaries.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2006, 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC.
OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextBoundaries.h"
+
+#include "TextBreakIterator.h"
+#include <wtf/text/StringImpl.h>
+#include <wtf/unicode/Unicode.h>
+
+using namespace WTF;
+using namespace Unicode;
+
+namespace WebCore {
+
+// Returns the index of the first character that does not require context for
+// word-boundary detection, or |length| if every character requires it.
+int endOfFirstWordBoundaryContext(const UChar* characters, int length)
+{
+    for (int i = 0; i < length; ) {
+        int first = i;
+        UChar32 ch;
+        U16_NEXT(characters, i, length, ch);
+        if (!requiresContextForWordBoundary(ch))
+            return first;
+    }
+    return length;
+}
+
+// Scanning backwards from the end, returns the index just past the last
+// character that does not require context for word-boundary detection, or 0
+// if every character requires it.
+int startOfLastWordBoundaryContext(const UChar* characters, int length)
+{
+    for (int i = length; i > 0; ) {
+        int last = i;
+        UChar32 ch;
+        U16_PREV(characters, 0, i, ch);
+        if (!requiresContextForWordBoundary(ch))
+            return last;
+    }
+    return 0;
+}
+
+#if !PLATFORM(BREWMP) && !PLATFORM(MAC) && !PLATFORM(QT)
+
+// Returns the next word break after (forward) or before (!forward) |position|
+// that is adjacent to an alphanumeric character. Returns |len| (forward) or 0
+// (backward) when there is no such break.
+int findNextWordFromIndex(const UChar* chars, int len, int position, bool forward)
+{
+    TextBreakIterator* it = wordBreakIterator(chars, len);
+    // Presumably wordBreakIterator can fail the same way characterBreakIterator
+    // can (String.cpp null-checks that one); report "no break found" rather
+    // than hand a null iterator to the textBreak* functions.
+    if (!it)
+        return forward ? len : 0;
+
+    if (forward) {
+        position = textBreakFollowing(it, position);
+        while (position != TextBreakDone) {
+            // We stop searching when the character preceding the break
+            // is alphanumeric.
+            if (position < len && isAlphanumeric(chars[position - 1]))
+                return position;
+
+            position = textBreakFollowing(it, position);
+        }
+
+        return len;
+    }
+
+    position = textBreakPreceding(it, position);
+    while (position != TextBreakDone) {
+        // We stop searching when the character following the break
+        // is alphanumeric.
+        if (position > 0 && isAlphanumeric(chars[position]))
+            return position;
+
+        position = textBreakPreceding(it, position);
+    }
+
+    return 0;
+}
+
+// Finds the word boundaries around |position|: |*end| is the first break
+// after |position| (or the last break if there is none) and |*start| is the
+// break immediately before |*end|.
+void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
+{
+    TextBreakIterator* it = wordBreakIterator(chars, len);
+    if (!it) {
+        // NOTE(review): no iterator available; an empty range at |position| is
+        // reported. Confirm this fallback suits the callers.
+        *start = position;
+        *end = position;
+        return;
+    }
+    *end = textBreakFollowing(it, position);
+    if (*end < 0)
+        *end = textBreakLast(it);
+    *start = textBreakPrevious(it);
+}
+
+#endif // !PLATFORM(BREWMP) && !PLATFORM(MAC) && !PLATFORM(QT)
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/TextBoundaries.h b/Source/WebCore/platform/text/TextBoundaries.h
new file mode 100644
index 0000000..870ab62
--- /dev/null
+++ b/Source/WebCore/platform/text/TextBoundaries.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC.
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextBoundaries_h +#define TextBoundaries_h + +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + + inline bool requiresContextForWordBoundary(UChar32 ch) + { + return WTF::Unicode::hasLineBreakingPropertyComplexContext(ch); + } + + int endOfFirstWordBoundaryContext(const UChar* characters, int length); + int startOfLastWordBoundaryContext(const UChar* characters, int length); + + void findWordBoundary(const UChar*, int len, int position, int* start, int* end); + int findNextWordFromIndex(const UChar*, int len, int position, bool forward); + +} + +#endif diff --git a/Source/WebCore/platform/text/TextBreakIterator.h b/Source/WebCore/platform/text/TextBreakIterator.h new file mode 100644 index 0000000..17cf5f0 --- /dev/null +++ b/Source/WebCore/platform/text/TextBreakIterator.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef TextBreakIterator_h +#define TextBreakIterator_h + +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + + class TextBreakIterator; + + // Note: The returned iterator is good only until you get another iterator. + + // Iterates over "extended grapheme clusters", as defined in UAX #29. + // Note that platform implementations may be less sophisticated - e.g. ICU prior to + // version 4.0 only supports "legacy grapheme clusters". + // Use this for general text processing, e.g. string truncation. + TextBreakIterator* characterBreakIterator(const UChar*, int length); + + // This is similar to character break iterator in most cases, but is subject to + // platform UI conventions. One notable example where this can be different + // from character break iterator is Thai prepend characters, see bug 24342. + // Use this for insertion point and selection manipulations. 
+ TextBreakIterator* cursorMovementIterator(const UChar*, int length); + + TextBreakIterator* wordBreakIterator(const UChar*, int length); + TextBreakIterator* lineBreakIterator(const UChar*, int length); + TextBreakIterator* sentenceBreakIterator(const UChar*, int length); + + int textBreakFirst(TextBreakIterator*); + int textBreakLast(TextBreakIterator*); + int textBreakNext(TextBreakIterator*); + int textBreakPrevious(TextBreakIterator*); + int textBreakCurrent(TextBreakIterator*); + int textBreakPreceding(TextBreakIterator*, int); + int textBreakFollowing(TextBreakIterator*, int); + bool isTextBreak(TextBreakIterator*, int); + + const int TextBreakDone = -1; + +} + +#endif diff --git a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp b/Source/WebCore/platform/text/TextBreakIteratorICU.cpp new file mode 100644 index 0000000..f5575ee --- /dev/null +++ b/Source/WebCore/platform/text/TextBreakIteratorICU.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "TextBreakIterator.h" + +#include "PlatformString.h" +#include "TextBreakIteratorInternalICU.h" +#include <unicode/ubrk.h> +#include <wtf/Assertions.h> + +namespace WebCore { + +static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, + UBreakIteratorType type, const UChar* string, int length) +{ + if (!string) + return 0; + + if (!createdIterator) { + UErrorCode openStatus = U_ZERO_ERROR; + iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus)); + createdIterator = true; + ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); + } + if (!iterator) + return 0; + + UErrorCode setTextStatus = U_ZERO_ERROR; + ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); + if (U_FAILURE(setTextStatus)) + return 0; + + return iterator; +} + +TextBreakIterator* characterBreakIterator(const UChar* string, int length) +{ + static bool createdCharacterBreakIterator = false; + static TextBreakIterator* staticCharacterBreakIterator; + return setUpIterator(createdCharacterBreakIterator, + staticCharacterBreakIterator, UBRK_CHARACTER, string, length); +} + +TextBreakIterator* wordBreakIterator(const UChar* string, int length) +{ + static bool createdWordBreakIterator = false; + static TextBreakIterator* staticWordBreakIterator; + return setUpIterator(createdWordBreakIterator, + staticWordBreakIterator, UBRK_WORD, string, length); +} + +TextBreakIterator* lineBreakIterator(const UChar* string, int length) +{ + static bool createdLineBreakIterator = false; + static TextBreakIterator* staticLineBreakIterator; + return setUpIterator(createdLineBreakIterator, + staticLineBreakIterator, UBRK_LINE, string, length); +} + +TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) +{ + static bool createdSentenceBreakIterator = false; + 
static TextBreakIterator* staticSentenceBreakIterator; + return setUpIterator(createdSentenceBreakIterator, + staticSentenceBreakIterator, UBRK_SENTENCE, string, length); +} + +int textBreakFirst(TextBreakIterator* iterator) +{ + return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator)); +} + +int textBreakLast(TextBreakIterator* iterator) +{ + return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator)); +} + +int textBreakNext(TextBreakIterator* iterator) +{ + return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator)); +} + +int textBreakPrevious(TextBreakIterator* iterator) +{ + return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator)); +} + +int textBreakPreceding(TextBreakIterator* iterator, int pos) +{ + return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos); +} + +int textBreakFollowing(TextBreakIterator* iterator, int pos) +{ + return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos); +} + +int textBreakCurrent(TextBreakIterator* iterator) +{ + return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator)); +} + +bool isTextBreak(TextBreakIterator* iterator, int position) +{ + return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position); +} + +#ifndef BUILDING_ON_TIGER +static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator, + const char* breakRules, const UChar* string, int length) +{ + if (!string) + return 0; + + if (!createdIterator) { + UParseError parseStatus; + UErrorCode openStatus = U_ZERO_ERROR; + String rules(breakRules); + iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus)); + createdIterator = true; + ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); + } + if (!iterator) + return 0; + + UErrorCode setTextStatus = U_ZERO_ERROR; + 
ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); + if (U_FAILURE(setTextStatus)) + return 0; + + return iterator; +} +#endif // BUILDING_ON_TIGER + +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ +#ifdef BUILDING_ON_TIGER + // ICU 3.2 cannot compile the below rules. + return characterBreakIterator(string, length); +#else + // This rule set is based on character-break iterator rules of ICU 4.0 + // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. + // The major differences from the original ones are listed below: + // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; + // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); + // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; + // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks. 
+ static const char* kRules = + "$CR = [\\p{Grapheme_Cluster_Break = CR}];" + "$LF = [\\p{Grapheme_Cluster_Break = LF}];" + "$Control = [\\p{Grapheme_Cluster_Break = Control}];" + "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks + "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" + "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" + "$L = [\\p{Grapheme_Cluster_Break = L}];" + "$V = [\\p{Grapheme_Cluster_Break = V}];" + "$T = [\\p{Grapheme_Cluster_Break = T}];" + "$LV = [\\p{Grapheme_Cluster_Break = LV}];" + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" + "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha + "$HinV = \\u094D;" // Devanagari Sign Virama + "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha + "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha + "$BenV = \\u09CD;" // Bengali Sign Virama + "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha + "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha + "$PanV = \\u0A4D;" // Gurmukhi Sign Virama + "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha + "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha + "$GujV = \\u0ACD;" // Gujarati Sign Virama + "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha + "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha + "$OriV = \\u0B4D;" // Oriya Sign Virama + "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha + "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha + "$TelV = \\u0C4D;" // Telugu Sign Virama + "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha + "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha + "$KanV = \\u0CCD;" // Kannada Sign Virama + "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha + "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha + "$MalV = \\u0D4D;" // Malayalam Sign Virama + "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha + "!!chain;" + "!!forward;" + "$CR 
$LF;" + "$L ($L | $V | $LV | $LVT);" + "($LV | $V) ($V | $T);" + "($LVT | $T) $T;" + "[^$Control $CR $LF] $Extend;" + "[^$Control $CR $LF] $SpacingMark;" + "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) + "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) + "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) + "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) + "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) + "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) + "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) + "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) + "!!reverse;" + "$LF $CR;" + "($L | $V | $LV | $LVT) $L;" + "($V | $T) ($LV | $V);" + "$T ($LVT | $T);" + "$Extend [^$Control $CR $LF];" + "$SpacingMark [^$Control $CR $LF];" + "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) + "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) + "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) + "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) + "$Ori1 $OriV $Ori0;" // Oriya Virama (backward) + "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) + "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) + "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) + "!!safe_reverse;" + "!!safe_forward;"; + static bool createdCursorMovementIterator = false; + static TextBreakIterator* staticCursorMovementIterator; + return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); +#endif // BUILDING_ON_TIGER +} + +} diff --git a/Source/WebCore/platform/text/TextBreakIteratorInternalICU.h b/Source/WebCore/platform/text/TextBreakIteratorInternalICU.h new file mode 100644 index 0000000..68b7003 --- /dev/null +++ b/Source/WebCore/platform/text/TextBreakIteratorInternalICU.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef TextBreakIteratorInternalICU_h +#define TextBreakIteratorInternalICU_h + +// FIXME: Now that this handles locales for ICU, not just for text breaking, +// this file and the various implementation files should be renamed. + +namespace WebCore { + + const char* currentSearchLocaleID(); + const char* currentTextBreakLocaleID(); + +} + +#endif diff --git a/Source/WebCore/platform/text/TextCodec.cpp b/Source/WebCore/platform/text/TextCodec.cpp new file mode 100644 index 0000000..4222ee1 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodec.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodec.h" + +#include "PlatformString.h" +#include <wtf/StringExtras.h> + +namespace WebCore { + +TextCodec::~TextCodec() +{ +} + +int TextCodec::getUnencodableReplacement(unsigned codePoint, UnencodableHandling handling, UnencodableReplacementArray replacement) +{ + switch (handling) { + case QuestionMarksForUnencodables: + replacement[0] = '?'; + replacement[1] = 0; + return 1; + case EntitiesForUnencodables: + snprintf(replacement, sizeof(UnencodableReplacementArray), "&#%u;", codePoint); + return static_cast<int>(strlen(replacement)); + case URLEncodedEntitiesForUnencodables: + snprintf(replacement, sizeof(UnencodableReplacementArray), "%%26%%23%u%%3B", codePoint); + return static_cast<int>(strlen(replacement)); + } + ASSERT_NOT_REACHED(); + replacement[0] = 0; + return 0; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodec.h b/Source/WebCore/platform/text/TextCodec.h new file mode 100644 index 0000000..c6af38a 
--- /dev/null +++ b/Source/WebCore/platform/text/TextCodec.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodec_h +#define TextCodec_h + +#include <memory> +#include <wtf/Forward.h> +#include <wtf/Noncopyable.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + +#include "PlatformString.h" + +namespace WebCore { + class TextEncoding; + + // Specifies what will happen when a character is encountered that is + // not encodable in the character set. + enum UnencodableHandling { + // Substitutes the replacement character "?". 
+ QuestionMarksForUnencodables, + + // Encodes the character as an XML entity. For example, U+06DE + // would be "&#1758;" (0x6DE = 1758 in decimal). + EntitiesForUnencodables, + + // Encodes the character as an entity as above, but with escaped + // non-alphanumeric characters. This is used in URLs. + // For example, U+6DE would be "%26%231758%3B". + URLEncodedEntitiesForUnencodables, + }; + + typedef char UnencodableReplacementArray[32]; + + class TextCodec : public Noncopyable { + public: + virtual ~TextCodec(); + + String decode(const char* str, size_t length, bool flush = false) + { + bool ignored; + return decode(str, length, flush, false, ignored); + } + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError) = 0; + virtual CString encode(const UChar*, size_t length, UnencodableHandling) = 0; + + // Fills a null-terminated string representation of the given + // unencodable character into the given replacement buffer. + // The length of the string (not including the null) will be returned. + static int getUnencodableReplacement(unsigned codePoint, UnencodableHandling, UnencodableReplacementArray); + }; + + typedef void (*EncodingNameRegistrar)(const char* alias, const char* name); + + typedef PassOwnPtr<TextCodec> (*NewTextCodecFunction)(const TextEncoding&, const void* additionalData); + typedef void (*TextCodecRegistrar)(const char* name, NewTextCodecFunction, const void* additionalData); + +} // namespace WebCore + +#endif // TextCodec_h diff --git a/Source/WebCore/platform/text/TextCodecICU.cpp b/Source/WebCore/platform/text/TextCodecICU.cpp new file mode 100644 index 0000000..6a579f9 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecICU.cpp @@ -0,0 +1,490 @@ +/* + * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved. 
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextCodecICU.h" + +#include "CharacterNames.h" +#include "PlatformString.h" +#include "ThreadGlobalData.h" +#include <unicode/ucnv.h> +#include <unicode/ucnv_cb.h> +#include <wtf/Assertions.h> +#include <wtf/text/CString.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/StringExtras.h> +#include <wtf/Threading.h> + +using std::min; + +namespace WebCore { + +const size_t ConversionBufferSize = 16384; + +ICUConverterWrapper::~ICUConverterWrapper() +{ + if (converter) + ucnv_close(converter); +} + +static UConverter*& cachedConverterICU() +{ + return threadGlobalData().cachedConverterICU().converter; +} + +static PassOwnPtr<TextCodec> newTextCodecICU(const TextEncoding& encoding, const void*) +{ + return new TextCodecICU(encoding); +} + +void TextCodecICU::registerBaseEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("UTF-8", "UTF-8"); +} + +void TextCodecICU::registerBaseCodecs(TextCodecRegistrar registrar) +{ + registrar("UTF-8", newTextCodecICU, 0); +} + +void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar) +{ + // We register Hebrew with logical ordering using a separate name. + // Otherwise, this would share the same canonical name as the + // visual ordering case, and then TextEncoding could not tell them + // apart; ICU treats these names as synonyms. + registrar("ISO-8859-8-I", "ISO-8859-8-I"); + + int32_t numEncodings = ucnv_countAvailable(); + for (int32_t i = 0; i < numEncodings; ++i) { + const char* name = ucnv_getAvailableName(i); + UErrorCode error = U_ZERO_ERROR; + // Try MIME before trying IANA to pick up commonly used names like + // 'EUC-JP' instead of horrendously long names like + // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 
+ const char* standardName = ucnv_getStandardName(name, "MIME", &error); + if (!U_SUCCESS(error) || !standardName) { + error = U_ZERO_ERROR; + // Try IANA to pick up 'windows-12xx' and other names + // which are not preferred MIME names but are widely used. + standardName = ucnv_getStandardName(name, "IANA", &error); + if (!U_SUCCESS(error) || !standardName) + continue; + } + + // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers. + // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding + // for encoding GB_2312-80 and several others. So, we need to override this behavior, too. + if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0) + standardName = "GBK"; + // Similarly, EUC-KR encodings all map to an extended version. + else if (strcmp(standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0) + standardName = "windows-949"; + // And so on. + else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6. + standardName = "windows-1254"; + else if (strcmp(standardName, "TIS-620") == 0) + standardName = "windows-874"; + + registrar(standardName, standardName); + + uint16_t numAliases = ucnv_countAliases(name, &error); + ASSERT(U_SUCCESS(error)); + if (U_SUCCESS(error)) + for (uint16_t j = 0; j < numAliases; ++j) { + error = U_ZERO_ERROR; + const char* alias = ucnv_getAlias(name, j, &error); + ASSERT(U_SUCCESS(error)); + if (U_SUCCESS(error) && alias != standardName) + registrar(alias, standardName); + } + } + + // Additional aliases. + // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4). + registrar("macroman", "macintosh"); + registrar("maccyrillic", "x-mac-cyrillic"); + + // Additional aliases that historically were present in the encoding + // table in WebKit on Macintosh that don't seem to be present in ICU. 
+ // Perhaps we can prove these are not used on the web and remove them. + // Or perhaps we can get them added to ICU. + registrar("x-mac-roman", "macintosh"); + registrar("x-mac-ukrainian", "x-mac-cyrillic"); + registrar("cn-big5", "Big5"); + registrar("x-x-big5", "Big5"); + registrar("cn-gb", "GBK"); + registrar("csgb231280", "GBK"); + registrar("x-euc-cn", "GBK"); + registrar("x-gbk", "GBK"); + registrar("csISO88598I", "ISO-8859-8-I"); + registrar("koi", "KOI8-R"); + registrar("logical", "ISO-8859-8-I"); + registrar("unicode11utf8", "UTF-8"); + registrar("unicode20utf8", "UTF-8"); + registrar("x-unicode20utf8", "UTF-8"); + registrar("visual", "ISO-8859-8"); + registrar("winarabic", "windows-1256"); + registrar("winbaltic", "windows-1257"); + registrar("wincyrillic", "windows-1251"); + registrar("iso-8859-11", "windows-874"); + registrar("iso8859-11", "windows-874"); + registrar("dos-874", "windows-874"); + registrar("wingreek", "windows-1253"); + registrar("winhebrew", "windows-1255"); + registrar("winlatin2", "windows-1250"); + registrar("winturkish", "windows-1254"); + registrar("winvietnamese", "windows-1258"); + registrar("x-cp1250", "windows-1250"); + registrar("x-cp1251", "windows-1251"); + registrar("x-euc", "EUC-JP"); + registrar("x-windows-949", "windows-949"); + registrar("x-uhc", "windows-949"); + registrar("utf8", "UTF-8"); + registrar("shift-jis", "Shift_JIS"); + + // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names. + // They are not present in ICU 3.2. + registrar("dos-720", "cp864"); + registrar("jis7", "ISO-2022-JP"); + + // Alternative spelling of ISO encoding names. 
+ registrar("ISO8859-1", "ISO-8859-1"); + registrar("ISO8859-2", "ISO-8859-2"); + registrar("ISO8859-3", "ISO-8859-3"); + registrar("ISO8859-4", "ISO-8859-4"); + registrar("ISO8859-5", "ISO-8859-5"); + registrar("ISO8859-6", "ISO-8859-6"); + registrar("ISO8859-7", "ISO-8859-7"); + registrar("ISO8859-8", "ISO-8859-8"); + registrar("ISO8859-8-I", "ISO-8859-8-I"); + registrar("ISO8859-9", "ISO-8859-9"); + registrar("ISO8859-10", "ISO-8859-10"); + registrar("ISO8859-13", "ISO-8859-13"); + registrar("ISO8859-14", "ISO-8859-14"); + registrar("ISO8859-15", "ISO-8859-15"); + // Not registering ISO8859-16, because Firefox (as of version 3.6.6) doesn't know this particular alias, + // and because older versions of ICU don't support ISO-8859-16 encoding at all. +}

// Registers newTextCodecICU as the factory for "ISO-8859-8-I" and for every
// converter ICU lists as available, using its MIME standard name or, failing
// that, its IANA standard name. Converters with neither name are skipped.
void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    // See comment above in registerEncodingNames.
    registrar("ISO-8859-8-I", newTextCodecICU, 0);

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        const char* standardName = ucnv_getStandardName(name, "MIME", &error);
        if (!U_SUCCESS(error) || !standardName) {
            error = U_ZERO_ERROR;
            standardName = ucnv_getStandardName(name, "IANA", &error);
            if (!U_SUCCESS(error) || !standardName)
                continue;
        }
        registrar(standardName, newTextCodecICU, 0);
    }
}

// The ICU converter is created lazily by decode()/encode(), not here.
TextCodecICU::TextCodecICU(const TextEncoding& encoding)
    : m_encoding(encoding)
    , m_numBufferedBytes(0)
    , m_converterICU(0)
    , m_needsGBKFallbacks(false)
{
}

TextCodecICU::~TextCodecICU()
{
    releaseICUConverter();
}

// Hands this codec's converter (if any) to the one-slot converter cache,
// closing whatever converter the cache held before.
void TextCodecICU::releaseICUConverter() const
{
    if (!m_converterICU)
        return;

    // The cached converter is handed out again by createICUConverter(), so
    // drop any partially consumed input or pending output first. Without this,
    // a codec destroyed mid-stream would leak its state into the next user.
    ucnv_reset(m_converterICU);

    UConverter*& cachedConverter = cachedConverterICU();
    if (cachedConverter)
        ucnv_close(cachedConverter);
    cachedConverter = m_converterICU;
    m_converterICU = 0;
}

// Obtains a converter for m_encoding: reuses the cached one when its name
// matches, otherwise opens a new one. m_converterICU stays null if ucnv_open
// fails. Also records whether the encoding name is exactly "GBK".
void TextCodecICU::createICUConverter() const
{
    ASSERT(!m_converterICU);

    const char* name = m_encoding.name();
    m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];

    UErrorCode err;

    UConverter*& cachedConverter = cachedConverterICU();
    if (cachedConverter) {
        err = U_ZERO_ERROR;
        const char* cachedName = ucnv_getName(cachedConverter, &err);
        if (U_SUCCESS(err) && m_encoding == cachedName) {
            // Take ownership away from the cache.
            m_converterICU = cachedConverter;
            cachedConverter = 0;
            return;
        }
    }

    err = U_ZERO_ERROR;
    m_converterICU = ucnv_open(m_encoding.name(), &err);
#if !LOG_DISABLED
    if (err == U_AMBIGUOUS_ALIAS_WARNING)
        LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encoding.name());
#endif
    if (m_converterICU)
        ucnv_setFallback(m_converterICU, TRUE);
}

// Runs one ucnv_toUnicode pass into [target, targetLimit). `source` is advanced
// past the consumed bytes, `err` receives the ICU status, and the return value
// is the number of UChars written.
int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err)
{
    UChar* targetStart = target;
    err = U_ZERO_ERROR;
    ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
    return target - targetStart;
}

// Scoped helper: when stopOnError is true, installs the substitute callback
// with the UCNV_SUB_STOP_ON_ILLEGAL context for the lifetime of the object and
// restores the previously installed callback on destruction. Does nothing
// when stopOnError is false.
class ErrorCallbackSetter {
public:
    ErrorCallbackSetter(UConverter* converter, bool stopOnError)
        : m_converter(converter)
        , m_shouldStopOnEncodingErrors(stopOnError)
        , m_savedContext(0)
        , m_savedAction(0)
    {
        if (m_shouldStopOnEncodingErrors) {
            UErrorCode err = U_ZERO_ERROR;
            ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE,
                                UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction,
                                &m_savedContext, &err);
            ASSERT(err == U_ZERO_ERROR);
        }
    }
    ~ErrorCallbackSetter()
    {
        if (m_shouldStopOnEncodingErrors) {
            UErrorCode err = U_ZERO_ERROR;
            const void* oldContext;
            UConverterToUCallback oldAction;
            ucnv_setToUCallBack(m_converter, m_savedAction,
                                m_savedContext, &oldAction,
                                &oldContext, &err);
            ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
            ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
            ASSERT(err == U_ZERO_ERROR);
        }
    }
private:
    UConverter* m_converter;
    bool m_shouldStopOnEncodingErrors;
    const void* m_savedContext;
    UConverterToUCallback m_savedAction;
};

// Decodes `length` bytes into a String through the ICU converter, creating the
// converter on first use. Returns a null String if no converter can be
// created. On an ICU failure the remaining input is drained with flush=true
// and sawError is set to true; sawError is never set to false here.
String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    Vector<UChar> result;

    UChar buffer[ConversionBufferSize];
    UChar* bufferLimit = buffer + ConversionBufferSize;
    const char* source = bytes;
    const char* sourceLimit = source + length;
    int32_t* offsets = 0;
    UErrorCode err = U_ZERO_ERROR;

    // Convert one buffer-full at a time until ICU stops reporting overflow.
    do {
        int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err);
        result.append(buffer, ucharsDecoded);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    if (U_FAILURE(err)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err);
        } while (source < sourceLimit);
        sawError = true;
    }

    String resultString = String::adopt(result);

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    if (strcmp(m_encoding.name(), "GBK") == 0 || strcasecmp(m_encoding.name(), "gb18030") == 0)
        resultString.replace(0xE5E5, ideographicSpace);

    return resultString;
}

// We need to apply these fallbacks ourselves as they are not currently supported by ICU and
// they were provided by the old TEC encoding path
// Needed to fix <rdar://problem/4708689>
// Returns the replacement UChar for the four handled code points, or 0 for any other.
static UChar getGbkEscape(UChar32 codePoint)
{
    switch (codePoint) {
    case 0x01F9:
        return 0xE7C8;
    case 0x1E3F:
        return 0xE7C7;
    case 0x22EF:
        return 0x2026;
    case 0x301C:
        return 0xFF5E;
    default:
        return 0;
    }
}

// Invalid character handler when writing escaped entities for unrepresentable
// characters. See the declaration of TextCodec::encode for more.
// Only UCNV_UNASSIGNED is handled here; every other reason is forwarded to
// ICU's escape callback.
static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                                     UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err)
{
    if (reason == UCNV_UNASSIGNED) {
        *err = U_ZERO_ERROR;

        UnencodableReplacementArray entity;
        int entityLen = TextCodec::getUnencodableReplacement(codePoint, URLEncodedEntitiesForUnencodables, entity);
        ucnv_cbFromUWriteBytes(fromUArgs, entity, entityLen, 0, err);
    } else
        UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
}

// Substitutes special GBK characters, escaping all other unassigned entities.
+static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, + UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) +{ + UChar outChar; + if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) { + const UChar* source = &outChar; + *err = U_ZERO_ERROR; + ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); + return; + } + UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); +} + +// Combines both gbkUrlEscapedEntityCallback and GBK character substitution. +static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, + UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) +{ + if (reason == UCNV_UNASSIGNED) { + if (UChar outChar = getGbkEscape(codePoint)) { + const UChar* source = &outChar; + *err = U_ZERO_ERROR; + ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); + return; + } + urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, err); + return; + } + UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); +} + +static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, + UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) +{ + UChar outChar; + if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) { + const UChar* source = &outChar; + *err = U_ZERO_ERROR; + ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); + return; + } + UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err); +} + +CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + if (!length) + return ""; + + if (!m_converterICU) + createICUConverter(); + if (!m_converterICU) + 
return CString(); + + // FIXME: We should see if there is "force ASCII range" mode in ICU; + // until then, we change the backslash into a yen sign. + // Encoding will change the yen sign back into a backslash. + String copy(characters, length); + copy = m_encoding.displayString(copy.impl()); + + const UChar* source = copy.characters(); + const UChar* sourceLimit = source + copy.length(); + + UErrorCode err = U_ZERO_ERROR; + + switch (handling) { + case QuestionMarksForUnencodables: + ucnv_setSubstChars(m_converterICU, "?", 1, &err); + ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); + break; + case EntitiesForUnencodables: + ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); + break; + case URLEncodedEntitiesForUnencodables: + ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err); + break; + } + + ASSERT(U_SUCCESS(err)); + if (U_FAILURE(err)) + return CString(); + + Vector<char> result; + size_t size = 0; + do { + char buffer[ConversionBufferSize]; + char* target = buffer; + char* targetLimit = target + ConversionBufferSize; + err = U_ZERO_ERROR; + ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err); + size_t count = target - buffer; + result.grow(size + count); + memcpy(result.data() + size, buffer, count); + size += count; + } while (err == U_BUFFER_OVERFLOW_ERROR); + + return CString(result.data(), size); +} + + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodecICU.h b/Source/WebCore/platform/text/TextCodecICU.h new file mode 100644 index 0000000..bf517f7 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecICU.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved. 
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextCodecICU_h +#define TextCodecICU_h + +#include "TextCodec.h" +#include "TextEncoding.h" + +#include <unicode/utypes.h> + +typedef struct UConverter UConverter; + +namespace WebCore { + + /* TextCodec subclass that keeps a UConverter* (m_converterICU) for its TextEncoding; declares decode()/encode() plus static registration hooks for base and extended encoding names and codecs. */ class TextCodecICU : public TextCodec { + public: + static void registerBaseEncodingNames(EncodingNameRegistrar); + static void registerBaseCodecs(TextCodecRegistrar); + + static void registerExtendedEncodingNames(EncodingNameRegistrar); + static void registerExtendedCodecs(TextCodecRegistrar); + + TextCodecICU(const TextEncoding&); + virtual ~TextCodecICU(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + /* Both converter helpers are const and act on the mutable members declared below. */ void createICUConverter() const; + void releaseICUConverter() const; + bool needsGBKFallbacks() const { return m_needsGBKFallbacks; } + void setNeedsGBKFallbacks(bool needsFallbacks) { m_needsGBKFallbacks = needsFallbacks; } + + int decodeToBuffer(UChar* buffer, UChar* bufferLimit, const char*& source, + const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err); + + TextEncoding m_encoding; + unsigned m_numBufferedBytes; + unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character + mutable UConverter* m_converterICU; + mutable bool m_needsGBKFallbacks; + }; + + /* Holder for one UConverter*: the constructor sets it to 0; the destructor is only declared here. */ struct ICUConverterWrapper { + ICUConverterWrapper() + : converter(0) + { + } + ~ICUConverterWrapper(); + + UConverter* converter; + }; + +} // namespace WebCore + +#endif // TextCodecICU_h diff --git a/Source/WebCore/platform/text/TextCodecLatin1.cpp b/Source/WebCore/platform/text/TextCodecLatin1.cpp new file mode 100644 index 0000000..2a217c5 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecLatin1.cpp @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextCodecLatin1.h" + +#include "PlatformString.h" +#include <stdio.h> +#include <wtf/text/CString.h> +#include <wtf/text/StringBuffer.h> +#include <wtf/PassOwnPtr.h> + +namespace WebCore { + +static const UChar table[256] = { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07 + 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F + 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17 + 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27 + 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37 + 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47 + 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57 + 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67 + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77 + 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7 + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7 + 0x00B8, 
0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7 + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF + 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7 + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7 + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7 + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF +}; + +void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("windows-1252", "windows-1252"); + registrar("ISO-8859-1", "ISO-8859-1"); + registrar("US-ASCII", "US-ASCII"); + + registrar("WinLatin1", "windows-1252"); + registrar("ibm-1252", "windows-1252"); + registrar("ibm-1252_P100-2000", "windows-1252"); + + registrar("CP819", "ISO-8859-1"); + registrar("IBM819", "ISO-8859-1"); + registrar("csISOLatin1", "ISO-8859-1"); + registrar("iso-ir-100", "ISO-8859-1"); + registrar("iso_8859-1:1987", "ISO-8859-1"); + registrar("l1", "ISO-8859-1"); + registrar("latin1", "ISO-8859-1"); + + registrar("ANSI_X3.4-1968", "US-ASCII"); + registrar("ANSI_X3.4-1986", "US-ASCII"); + registrar("ASCII", "US-ASCII"); + registrar("IBM367", "US-ASCII"); + registrar("ISO646-US", "US-ASCII"); + registrar("ISO_646.irv:1991", "US-ASCII"); + registrar("cp367", "US-ASCII"); + registrar("csASCII", "US-ASCII"); + registrar("ibm-367_P100-1995", "US-ASCII"); + registrar("iso-ir-6", "US-ASCII"); + registrar("iso-ir-6-us", "US-ASCII"); + registrar("us", "US-ASCII"); + registrar("x-ansi", "US-ASCII"); +} + +static PassOwnPtr<TextCodec> newStreamingTextDecoderWindowsLatin1(const TextEncoding&, const void*) +{ + return new TextCodecLatin1; +} + +void TextCodecLatin1::registerCodecs(TextCodecRegistrar registrar) +{ + 
registrar("windows-1252", newStreamingTextDecoderWindowsLatin1, 0); + + // ASCII and Latin-1 both decode as Windows Latin-1 although they retain unique identities. + registrar("ISO-8859-1", newStreamingTextDecoderWindowsLatin1, 0); + registrar("US-ASCII", newStreamingTextDecoderWindowsLatin1, 0); +} + +template<size_t size> struct NonASCIIMask; +template<> struct NonASCIIMask<4> { + static unsigned value() { return 0x80808080U; } +}; +template<> struct NonASCIIMask<8> { + static unsigned long long value() { return 0x8080808080808080ULL; } +}; + +template<size_t size> struct UCharByteFiller; +template<> struct UCharByteFiller<4> { + static void copy(UChar* dest, const unsigned char* src) + { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = src[3]; + } +}; +template<> struct UCharByteFiller<8> { + static void copy(UChar* dest, const unsigned char* src) + { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = src[3]; + dest[4] = src[4]; + dest[5] = src[5]; + dest[6] = src[6]; + dest[7] = src[7]; + } +}; + +String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&) +{ + UChar* characters; + String result = String::createUninitialized(length, characters); + + const unsigned char* src = reinterpret_cast<const unsigned char*>(bytes); + const unsigned char* end = reinterpret_cast<const unsigned char*>(bytes + length); + const unsigned char* alignedEnd = reinterpret_cast<const unsigned char*>(reinterpret_cast<ptrdiff_t>(end) & ~(sizeof(uintptr_t) - 1)); + UChar* dest = characters; + + while (src < end) { + if (*src < 0x80) { + // Fast path for values < 0x80 (most Latin-1 text will be ASCII) + // Wait until we're at a properly aligned address, then read full CPU words. 
+ if (!(reinterpret_cast<ptrdiff_t>(src) & (sizeof(uintptr_t) - 1))) { + while (src < alignedEnd) { + uintptr_t chunk = *reinterpret_cast_ptr<const uintptr_t*>(src); + + if (chunk & NonASCIIMask<sizeof(uintptr_t)>::value()) + goto useLookupTable; + + UCharByteFiller<sizeof(uintptr_t)>::copy(dest, src); + + src += sizeof(uintptr_t); + dest += sizeof(uintptr_t); + } + + if (src == end) + break; + } + *dest = *src; + } else { +useLookupTable: + *dest = table[*src]; + } + + ++src; + ++dest; + } + + return result; +} + +static CString encodeComplexWindowsLatin1(const UChar* characters, size_t length, UnencodableHandling handling) +{ + Vector<char> result(length); + char* bytes = result.data(); + + size_t resultLength = 0; + for (size_t i = 0; i < length; ) { + UChar32 c; + U16_NEXT(characters, i, length, c); + unsigned char b = c; + // Do an efficient check to detect characters other than 00-7F and A0-FF. + if (b != c || (c & 0xE0) == 0x80) { + // Look for a way to encode this with Windows Latin-1. + for (b = 0x80; b < 0xA0; ++b) + if (table[b] == c) + goto gotByte; + // No way to encode this character with Windows Latin-1. + UnencodableReplacementArray replacement; + int replacementLength = TextCodec::getUnencodableReplacement(c, handling, replacement); + result.grow(resultLength + replacementLength + length - i); + bytes = result.data(); + memcpy(bytes + resultLength, replacement, replacementLength); + resultLength += replacementLength; + continue; + } + gotByte: + bytes[resultLength++] = b; + } + + return CString(bytes, resultLength); +} + +CString TextCodecLatin1::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + { + char* bytes; + CString string = CString::newUninitialized(length, bytes); + + // Convert the string a fast way and simultaneously do an efficient check to see if it's all ASCII. 
+ UChar ored = 0; + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + bytes[i] = c; + ored |= c; + } + + if (!(ored & 0xFF80)) + return string; + } + + // If it wasn't all ASCII, call the function that handles more-complex cases. + return encodeComplexWindowsLatin1(characters, length, handling); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodecLatin1.h b/Source/WebCore/platform/text/TextCodecLatin1.h new file mode 100644 index 0000000..f035d01 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecLatin1.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextCodecLatin1_h +#define TextCodecLatin1_h + +#include "TextCodec.h" + +namespace WebCore { + + /* TextCodec subclass declaring decode()/encode() overrides and two static registration hooks; it declares no data members. */ class TextCodecLatin1 : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + }; + +} // namespace WebCore + +#endif // TextCodecLatin1_h diff --git a/Source/WebCore/platform/text/TextCodecUTF16.cpp b/Source/WebCore/platform/text/TextCodecUTF16.cpp new file mode 100644 index 0000000..e88e83b --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUTF16.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecUTF16.h" + +#include "PlatformString.h" +#include <wtf/text/CString.h> +#include <wtf/text/StringBuffer.h> +#include <wtf/PassOwnPtr.h> + +using namespace std; + +namespace WebCore { + +void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("UTF-16LE", "UTF-16LE"); + registrar("UTF-16BE", "UTF-16BE"); + + registrar("ISO-10646-UCS-2", "UTF-16LE"); + registrar("UCS-2", "UTF-16LE"); + registrar("UTF-16", "UTF-16LE"); + registrar("Unicode", "UTF-16LE"); + registrar("csUnicode", "UTF-16LE"); + registrar("unicodeFEFF", "UTF-16LE"); + + registrar("unicodeFFFE", "UTF-16BE"); +} + +static PassOwnPtr<TextCodec> newStreamingTextDecoderUTF16LE(const TextEncoding&, const void*) +{ + return new TextCodecUTF16(true); +} + +static PassOwnPtr<TextCodec> newStreamingTextDecoderUTF16BE(const TextEncoding&, const void*) +{ + return new TextCodecUTF16(false); +} + +void TextCodecUTF16::registerCodecs(TextCodecRegistrar registrar) +{ + registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0); + registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0); +} + +String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool, bool&) +{ + if (!length) + return String(); + + const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes); + size_t numBytes = length + m_haveBufferedByte; + size_t numChars = numBytes / 2; + + StringBuffer buffer(numChars); + UChar* q = buffer.characters(); + + if 
(m_haveBufferedByte) { + UChar c; + if (m_littleEndian) + c = m_bufferedByte | (p[0] << 8); + else + c = (m_bufferedByte << 8) | p[0]; + *q++ = c; + m_haveBufferedByte = false; + p += 1; + numChars -= 1; + } + + if (m_littleEndian) { + for (size_t i = 0; i < numChars; ++i) { + UChar c = p[0] | (p[1] << 8); + p += 2; + *q++ = c; + } + } else { + for (size_t i = 0; i < numChars; ++i) { + UChar c = (p[0] << 8) | p[1]; + p += 2; + *q++ = c; + } + } + + if (numBytes & 1) { + ASSERT(!m_haveBufferedByte); + m_haveBufferedByte = true; + m_bufferedByte = p[0]; + } + + buffer.shrink(q - buffer.characters()); + + return String::adopt(buffer); +} + +CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling) +{ + // We need to be sure we can double the length without overflowing. + // Since the passed-in length is the length of an actual existing + // character buffer, each character is two bytes, and we know + // the buffer doesn't occupy the entire address space, we can + // assert here that doubling the length does not overflow size_t + // and there's no need for a runtime check. + ASSERT(length <= numeric_limits<size_t>::max() / 2); + + char* bytes; + CString string = CString::newUninitialized(length * 2, bytes); + + // FIXME: CString is not a reasonable data structure for encoded UTF-16, which will have + // null characters inside it. Perhaps the result of encode should not be a CString. 
+ if (m_littleEndian) { + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + bytes[i * 2] = c; + bytes[i * 2 + 1] = c >> 8; + } + } else { + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + bytes[i * 2] = c >> 8; + bytes[i * 2 + 1] = c; + } + } + + return string; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodecUTF16.h b/Source/WebCore/platform/text/TextCodecUTF16.h new file mode 100644 index 0000000..8ce9476 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUTF16.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextCodecUTF16_h +#define TextCodecUTF16_h + +#include "TextCodec.h" + +namespace WebCore { + + /* TextCodec subclass for UTF-16. The constructor records the byte order in m_littleEndian and clears m_haveBufferedByte; it does not initialize m_bufferedByte. */ class TextCodecUTF16 : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + TextCodecUTF16(bool littleEndian) : m_littleEndian(littleEndian), m_haveBufferedByte(false) { } + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + bool m_littleEndian; + bool m_haveBufferedByte; + unsigned char m_bufferedByte; + }; + +} // namespace WebCore + +#endif // TextCodecUTF16_h diff --git a/Source/WebCore/platform/text/TextCodecUserDefined.cpp b/Source/WebCore/platform/text/TextCodecUserDefined.cpp new file mode 100644 index 0000000..70d8673 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUserDefined.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2007, 2008 Apple, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecUserDefined.h" + +#include "PlatformString.h" +#include <stdio.h> +#include <wtf/text/CString.h> +#include <wtf/text/StringBuffer.h> +#include <wtf/PassOwnPtr.h> + +namespace WebCore { + +void TextCodecUserDefined::registerEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("x-user-defined", "x-user-defined"); +} + +static PassOwnPtr<TextCodec> newStreamingTextDecoderUserDefined(const TextEncoding&, const void*) +{ + return new TextCodecUserDefined; +} + +void TextCodecUserDefined::registerCodecs(TextCodecRegistrar registrar) +{ + registrar("x-user-defined", newStreamingTextDecoderUserDefined, 0); +} + +String TextCodecUserDefined::decode(const char* bytes, size_t length, bool, bool, bool&) +{ + UChar* buffer; + String result = String::createUninitialized(length, buffer); + + for (size_t i = 0; i < length; ++i) { + signed char c = bytes[i]; + buffer[i] = c & 0xF7FF; + } + + return result; +} + +static CString encodeComplexUserDefined(const UChar* characters, size_t length, UnencodableHandling handling) +{ + Vector<char> result(length); + char* bytes = result.data(); + + size_t resultLength = 0; + for (size_t i = 0; i < length; ) { + UChar32 c; + U16_NEXT(characters, i, length, c); + signed char signedByte = c; + if ((signedByte & 0xF7FF) == c) + bytes[resultLength++] = signedByte; + else { + // No way to encode this character with x-user-defined. 
+ UnencodableReplacementArray replacement; + int replacementLength = TextCodec::getUnencodableReplacement(c, handling, replacement); + result.grow(resultLength + replacementLength + length - i); + bytes = result.data(); + memcpy(bytes + resultLength, replacement, replacementLength); + resultLength += replacementLength; + } + } + + return CString(bytes, resultLength); +} + +CString TextCodecUserDefined::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + char* bytes; + CString string = CString::newUninitialized(length, bytes); + + // Convert the string a fast way and simultaneously do an efficient check to see if it's all ASCII. + UChar ored = 0; + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + bytes[i] = c; + ored |= c; + } + + if (!(ored & 0xFF80)) + return string; + + // If it wasn't all ASCII, call the function that handles more-complex cases. + return encodeComplexUserDefined(characters, length, handling); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodecUserDefined.h b/Source/WebCore/platform/text/TextCodecUserDefined.h new file mode 100644 index 0000000..d1b3160 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUserDefined.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2007 Apple, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodecUserDefined_h +#define TextCodecUserDefined_h + +#include "TextCodec.h" + +namespace WebCore { + + class TextCodecUserDefined : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + }; + +} // namespace WebCore + +#endif // TextCodecUserDefined_h diff --git a/Source/WebCore/platform/text/TextDirection.h b/Source/WebCore/platform/text/TextDirection.h new file mode 100644 index 0000000..5be416e --- /dev/null +++ b/Source/WebCore/platform/text/TextDirection.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2003, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextDirection_h +#define TextDirection_h + +namespace WebCore { + + enum TextDirection { RTL, LTR }; + +} + +#endif diff --git a/Source/WebCore/platform/text/TextEncoding.cpp b/Source/WebCore/platform/text/TextEncoding.cpp new file mode 100644 index 0000000..33313a0 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncoding.cpp @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextEncoding.h" + +#include "PlatformString.h" +#include "TextCodec.h" +#include "TextEncodingRegistry.h" +#if USE(ICU_UNICODE) +#include <unicode/unorm.h> +#elif USE(QT4_UNICODE) +#include <QString> +#elif USE(GLIB_UNICODE) +#include <glib.h> +#include "GOwnPtr.h" +#endif +#include <wtf/text/CString.h> +#include <wtf/OwnPtr.h> +#include <wtf/StdLibExtras.h> + +namespace WebCore { + +static const TextEncoding& UTF7Encoding() +{ + static TextEncoding globalUTF7Encoding("UTF-7"); + return globalUTF7Encoding; +} + +TextEncoding::TextEncoding(const char* name) + : m_name(atomicCanonicalTextEncodingName(name)) + , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) +{ +} + +TextEncoding::TextEncoding(const String& name) + : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length())) + , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) +{ +} + +String TextEncoding::decode(const char* data, size_t length, bool stopOnError, 
bool& sawError) const +{ + if (!m_name) + return String(); + + return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); +} + +CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const +{ + if (!m_name) + return CString(); + + if (!length) + return ""; + +#if USE(ICU_UNICODE) + // FIXME: What's the right place to do normalization? + // It's a little strange to do it inside the encode function. + // Perhaps normalization should be an explicit step done before calling encode. + + const UChar* source = characters; + size_t sourceLength = length; + + Vector<UChar> normalizedCharacters; + + UErrorCode err = U_ZERO_ERROR; + if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { + // First try using the length of the original string, since normalization to NFC rarely increases length. + normalizedCharacters.grow(sourceLength); + int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); + if (err == U_BUFFER_OVERFLOW_ERROR) { + err = U_ZERO_ERROR; + normalizedCharacters.resize(normalizedLength); + normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); + } + ASSERT(U_SUCCESS(err)); + + source = normalizedCharacters.data(); + sourceLength = normalizedLength; + } + return newTextCodec(*this)->encode(source, sourceLength, handling); +#elif USE(QT4_UNICODE) + QString str(reinterpret_cast<const QChar*>(characters), length); + str = str.normalized(QString::NormalizationForm_C); + return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); +#elif USE(GLIB_UNICODE) + GOwnPtr<char> UTF8Source; + UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); + if (!UTF8Source) { + // If conversion to UTF-8 failed, try with the string without normalization + return newTextCodec(*this)->encode(characters, length, handling); + } + + 
GOwnPtr<char> UTF8Normalized; + UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); + + long UTF16Length; + GOwnPtr<UChar> UTF16Normalized; + UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0)); + + return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling); +#elif OS(WINCE) + // normalization will be done by Windows CE API + OwnPtr<TextCodec> textCodec = newTextCodec(*this); + return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); +#elif USE(BREWMP_UNICODE) + // FIXME: not sure if Brew MP normalizes the input string automatically + OwnPtr<TextCodec> textCodec = newTextCodec(*this); + return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); +#endif +} + +const char* TextEncoding::domName() const +{ + if (noExtendedTextEncodingNameUsed()) + return m_name; + + // We treat EUC-KR as windows-949 (its superset), but need to expose + // the name 'EUC-KR' because the name 'windows-949' is not recognized by + // most Korean web servers even though they do use the encoding + // 'windows-949' with the name 'EUC-KR'. + // FIXME: This is not thread-safe. At the moment, this function is + // only accessed in a single thread, but eventually has to be made + // thread-safe along with usesVisualOrdering(). + static const char* const a = atomicCanonicalTextEncodingName("windows-949"); + if (m_name == a) + return "EUC-KR"; + return m_name; +} + +bool TextEncoding::usesVisualOrdering() const +{ + if (noExtendedTextEncodingNameUsed()) + return false; + + static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); + return m_name == a; +} + +bool TextEncoding::isJapanese() const +{ + return isJapaneseEncoding(m_name); +} + +UChar TextEncoding::backslashAsCurrencySymbol() const +{ + return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 
0x00A5 : '\\'; +} + +bool TextEncoding::isNonByteBasedEncoding() const +{ + if (noExtendedTextEncodingNameUsed()) { + return *this == UTF16LittleEndianEncoding() + || *this == UTF16BigEndianEncoding(); + } + + return *this == UTF16LittleEndianEncoding() + || *this == UTF16BigEndianEncoding() + || *this == UTF32BigEndianEncoding() + || *this == UTF32LittleEndianEncoding(); +} + +bool TextEncoding::isUTF7Encoding() const +{ + if (noExtendedTextEncodingNameUsed()) + return false; + + return *this == UTF7Encoding(); +} + +const TextEncoding& TextEncoding::closestByteBasedEquivalent() const +{ + if (isNonByteBasedEncoding()) + return UTF8Encoding(); + return *this; +} + +// HTML5 specifies that UTF-8 be used in form submission when a form is +// a part of a document in UTF-16, probably because UTF-16 is not a +// byte-based encoding and can contain 0x00. By extension, the same +// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, +// but it's fraught with problems and we'd rather steer clear of it. 
+const TextEncoding& TextEncoding::encodingForFormSubmission() const +{ + if (isNonByteBasedEncoding() || isUTF7Encoding()) + return UTF8Encoding(); + return *this; +} + +const TextEncoding& ASCIIEncoding() +{ + static TextEncoding globalASCIIEncoding("ASCII"); + return globalASCIIEncoding; +} + +const TextEncoding& Latin1Encoding() +{ + static TextEncoding globalLatin1Encoding("latin1"); + return globalLatin1Encoding; +} + +const TextEncoding& UTF16BigEndianEncoding() +{ + static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); + return globalUTF16BigEndianEncoding; +} + +const TextEncoding& UTF16LittleEndianEncoding() +{ + static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); + return globalUTF16LittleEndianEncoding; +} + +const TextEncoding& UTF32BigEndianEncoding() +{ + static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); + return globalUTF32BigEndianEncoding; +} + +const TextEncoding& UTF32LittleEndianEncoding() +{ + static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); + return globalUTF32LittleEndianEncoding; +} + +const TextEncoding& UTF8Encoding() +{ + static TextEncoding globalUTF8Encoding("UTF-8"); + ASSERT(globalUTF8Encoding.isValid()); + return globalUTF8Encoding; +} + +const TextEncoding& WindowsLatin1Encoding() +{ + static TextEncoding globalWindowsLatin1Encoding("WinLatin-1"); + return globalWindowsLatin1Encoding; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextEncoding.h b/Source/WebCore/platform/text/TextEncoding.h new file mode 100644 index 0000000..675625b --- /dev/null +++ b/Source/WebCore/platform/text/TextEncoding.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextEncoding_h +#define TextEncoding_h + +#include "TextCodec.h" +#include <wtf/Forward.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + + class TextEncoding { + public: + TextEncoding() : m_name(0) { } + TextEncoding(const char* name); + TextEncoding(const String& name); + + bool isValid() const { return m_name; } + const char* name() const { return m_name; } + const char* domName() const; // name exposed via DOM + bool usesVisualOrdering() const; + bool isJapanese() const; + + PassRefPtr<StringImpl> displayString(PassRefPtr<StringImpl> str) const + { + if (m_backslashAsCurrencySymbol == '\\' || !str) + return str; + return str->replace('\\', m_backslashAsCurrencySymbol); + } + void displayBuffer(UChar* characters, unsigned len) const + { + if (m_backslashAsCurrencySymbol == '\\') + return; + for (unsigned i = 0; i < len; ++i) { + if (characters[i] == '\\') + characters[i] = m_backslashAsCurrencySymbol; + } + } + + const TextEncoding& closestByteBasedEquivalent() const; + const TextEncoding& encodingForFormSubmission() const; + + String decode(const char* str, size_t length) const + { + bool ignored; + return decode(str, length, false, ignored); + } + String decode(const char*, size_t length, bool stopOnError, bool& sawError) const; + CString encode(const UChar*, size_t length, UnencodableHandling) const; + + UChar backslashAsCurrencySymbol() const; + + private: + bool isNonByteBasedEncoding() const; + bool isUTF7Encoding() const; + + const char* m_name; + UChar m_backslashAsCurrencySymbol; + }; + + inline bool operator==(const TextEncoding& a, const TextEncoding& b) { return a.name() == b.name(); } + inline bool operator!=(const TextEncoding& a, const TextEncoding& b) { return a.name() != b.name(); } + + const TextEncoding& ASCIIEncoding(); + const TextEncoding& Latin1Encoding(); + const TextEncoding& UTF16BigEndianEncoding(); + const TextEncoding& UTF16LittleEndianEncoding(); + const TextEncoding& UTF32BigEndianEncoding(); + const 
TextEncoding& UTF32LittleEndianEncoding(); + const TextEncoding& UTF8Encoding(); + const TextEncoding& WindowsLatin1Encoding(); + +} // namespace WebCore + +#endif // TextEncoding_h diff --git a/Source/WebCore/platform/text/TextEncodingDetector.h b/Source/WebCore/platform/text/TextEncodingDetector.h new file mode 100644 index 0000000..9f16ab0 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingDetector.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2009 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextEncodingDetector_h +#define TextEncodingDetector_h + +namespace WebCore { + + class TextEncoding; + + // Given a sequence of bytes in |data| of length |len| and an optional + // hintEncodingName, detect the most likely character encoding. + // The way hintEncodingName is used is up to an implementation. + // Currently, the only caller sets it to the parent frame encoding. + bool detectTextEncoding(const char* data, size_t len, + const char* hintEncodingName, + TextEncoding* detectedEncoding); + +} // namespace WebCore + +#endif diff --git a/Source/WebCore/platform/text/TextEncodingDetectorICU.cpp b/Source/WebCore/platform/text/TextEncodingDetectorICU.cpp new file mode 100644 index 0000000..c0d11de --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingDetectorICU.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2008, 2009 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextEncodingDetector.h" + +#include "TextEncoding.h" +#include <wtf/UnusedParam.h> + +#ifndef BUILDING_ON_TIGER +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#endif + +namespace WebCore { + +bool detectTextEncoding(const char* data, size_t len, + const char* hintEncodingName, + TextEncoding* detectedEncoding) +{ + *detectedEncoding = TextEncoding(); +#ifdef BUILDING_ON_TIGER + // Tiger came with ICU 3.2 and does not have the encoding detector. + UNUSED_PARAM(data); + UNUSED_PARAM(len); + UNUSED_PARAM(hintEncodingName); + return false; +#else + int matchesCount = 0; + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + if (U_FAILURE(status)) + return false; + ucsdet_enableInputFilter(detector, true); + ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); + if (U_FAILURE(status)) + return false; + + // FIXME: A few things we can do other than improving + // the ICU detector itself. + // 1. 
Use ucsdet_detectAll and pick the most likely one given + // "the context" (parent-encoding, referrer encoding, etc). + // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. + // Chinese, Japanese, Russian, Korean and Hebrew) by picking the + // encoding with a highest confidence among the detector-specific + // limited set of candidate encodings. + // Below is a partial implementation of the first part of what's outlined + // above. + const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); + if (U_FAILURE(status)) { + ucsdet_close(detector); + return false; + } + + const char* encoding = 0; + if (hintEncodingName) { + TextEncoding hintEncoding(hintEncodingName); + // 10 is the minimum confidence value consistent with the codepoint + // allocation in a given encoding. The size of a chunk passed to + // us varies even for the same html file (apparently depending on + // the network load). When we're given a rather short chunk, we + // don't have a sufficiently reliable signal other than the fact that + // the chunk is consistent with a set of encodings. So, instead of + // setting an arbitrary threshold, we have to scan all the encodings + // consistent with the data. + const int32_t kThresold = 10; + for (int i = 0; i < matchesCount; ++i) { + int32_t confidence = ucsdet_getConfidence(matches[i], &status); + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + continue; + } + if (confidence < kThresold) + break; + const char* matchEncoding = ucsdet_getName(matches[i], &status); + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + continue; + } + if (TextEncoding(matchEncoding) == hintEncoding) { + encoding = hintEncodingName; + break; + } + } + } + // If no match is found so far, just pick the top match. + // This can happen, say, when a parent frame in EUC-JP refers to + // a child frame in Shift_JIS and both frames do NOT specify the encoding + // making us resort to auto-detection (when it IS turned on). 
+ if (!encoding && matchesCount > 0) + encoding = ucsdet_getName(matches[0], &status); + if (U_SUCCESS(status)) { + *detectedEncoding = TextEncoding(encoding); + ucsdet_close(detector); + return true; + } + ucsdet_close(detector); + return false; +#endif +} + +} diff --git a/Source/WebCore/platform/text/TextEncodingDetectorNone.cpp b/Source/WebCore/platform/text/TextEncodingDetectorNone.cpp new file mode 100644 index 0000000..3b62bc5 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingDetectorNone.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextEncodingDetector.h" + +#include "TextEncoding.h" + +namespace WebCore { + +bool detectTextEncoding(const char*, size_t, const char*, TextEncoding* detectedEncoding) +{ + *detectedEncoding = TextEncoding(); + return false; +} + +} diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.cpp b/Source/WebCore/platform/text/TextEncodingRegistry.cpp new file mode 100644 index 0000000..c0c0255 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingRegistry.cpp @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2006, 2007 Apple Inc. All rights reserved. + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextEncodingRegistry.h" + +#include "PlatformString.h" +#include "TextCodecLatin1.h" +#include "TextCodecUserDefined.h" +#include "TextCodecUTF16.h" +#include "TextEncoding.h" +#include <wtf/ASCIICType.h> +#include <wtf/Assertions.h> +#include <wtf/HashFunctions.h> +#include <wtf/HashMap.h> +#include <wtf/HashSet.h> +#include <wtf/StdLibExtras.h> +#include <wtf/StringExtras.h> +#include <wtf/Threading.h> + +#if USE(ICU_UNICODE) +#include "TextCodecICU.h" +#endif +#if PLATFORM(MAC) +#include "TextCodecMac.h" +#endif +#if PLATFORM(QT) +#include "qt/TextCodecQt.h" +#endif +#if USE(GLIB_UNICODE) +#include "gtk/TextCodecGtk.h" +#endif +#if USE(BREWMP_UNICODE) +#include "brew/TextCodecBrew.h" +#endif +#if OS(WINCE) && !PLATFORM(QT) +#include "TextCodecWinCE.h" +#endif + +using namespace WTF; + +namespace WebCore { + +const size_t maxEncodingNameLength = 63; + +// Hash for all-ASCII strings that does case folding. 
+struct TextEncodingNameHash { + + static bool equal(const char* s1, const char* s2) + { + char c1; + char c2; + do { + c1 = *s1++; + c2 = *s2++; + if (toASCIILower(c1) != toASCIILower(c2)) + return false; + } while (c1 && c2); + return !c1 && !c2; + } + + // This algorithm is the one-at-a-time hash from: + // http://burtleburtle.net/bob/hash/hashfaq.html + // http://burtleburtle.net/bob/hash/doobs.html + static unsigned hash(const char* s) + { + unsigned h = WTF::stringHashingStartValue; + for (;;) { + char c = *s++; + if (!c) { + h += (h << 3); + h ^= (h >> 11); + h += (h << 15); + return h; + } + h += toASCIILower(c); + h += (h << 10); + h ^= (h >> 6); + } + } + + static const bool safeToCompareToEmptyOrDeleted = false; +}; + +struct TextCodecFactory { + NewTextCodecFunction function; + const void* additionalData; + TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } +}; + +typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; +typedef HashMap<const char*, TextCodecFactory> TextCodecMap; + +static Mutex& encodingRegistryMutex() +{ + // We don't have to use AtomicallyInitializedStatic here because + // this function is called on the main thread for any page before + // it is used in worker threads. 
+ DEFINE_STATIC_LOCAL(Mutex, mutex, ()); + return mutex; +} + +static TextEncodingNameMap* textEncodingNameMap; +static TextCodecMap* textCodecMap; +static bool didExtendTextCodecMaps; +static HashSet<const char*>* japaneseEncodings; +static HashSet<const char*>* nonBackslashEncodings; + +static const char* const textEncodingNameBlacklist[] = { + "UTF-7" +}; + +#if ERROR_DISABLED + +static inline void checkExistingName(const char*, const char*) { } + +#else + +static void checkExistingName(const char* alias, const char* atomicName) +{ + const char* oldAtomicName = textEncodingNameMap->get(alias); + if (!oldAtomicName) + return; + if (oldAtomicName == atomicName) + return; + // Keep the warning silent about one case where we know this will happen. + if (strcmp(alias, "ISO-8859-8-I") == 0 + && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 + && strcasecmp(atomicName, "iso-8859-8") == 0) + return; + LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); +} + +#endif + +static bool isUndesiredAlias(const char* alias) +{ + // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). + for (const char* p = alias; *p; ++p) { + if (*p == ',') + return true; + } + // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility + // problem, see bug 43554. 
+ if (0 == strcmp(alias, "8859_1")) + return true; + return false; +} + +static void addToTextEncodingNameMap(const char* alias, const char* name) +{ + ASSERT(strlen(alias) <= maxEncodingNameLength); + if (isUndesiredAlias(alias)) + return; + const char* atomicName = textEncodingNameMap->get(name); + ASSERT(strcmp(alias, name) == 0 || atomicName); + if (!atomicName) + atomicName = name; + checkExistingName(alias, atomicName); + textEncodingNameMap->add(alias, atomicName); +} + +static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) +{ + const char* atomicName = textEncodingNameMap->get(name); + ASSERT(atomicName); + textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); +} + +static void pruneBlacklistedCodecs() +{ + for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { + const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); + if (!atomicName) + continue; + + Vector<const char*> names; + TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); + TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); + for (; it != end; ++it) { + if (it->second == atomicName) + names.append(it->first); + } + + size_t length = names.size(); + for (size_t j = 0; j < length; ++j) + textEncodingNameMap->remove(names[j]); + + textCodecMap->remove(atomicName); + } +} + +static void buildBaseTextCodecMaps() +{ + ASSERT(isMainThread()); + ASSERT(!textCodecMap); + ASSERT(!textEncodingNameMap); + + textCodecMap = new TextCodecMap; + textEncodingNameMap = new TextEncodingNameMap; + + TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); + TextCodecLatin1::registerCodecs(addToTextCodecMap); + + TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); + TextCodecUTF16::registerCodecs(addToTextCodecMap); + + TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); + 
TextCodecUserDefined::registerCodecs(addToTextCodecMap); + +#if USE(ICU_UNICODE) + TextCodecICU::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecICU::registerBaseCodecs(addToTextCodecMap); +#endif + +#if USE(GLIB_UNICODE) + TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecGtk::registerBaseCodecs(addToTextCodecMap); +#endif + +#if USE(BREWMP_UNICODE) + TextCodecBrew::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecBrew::registerBaseCodecs(addToTextCodecMap); +#endif + +#if OS(WINCE) && !PLATFORM(QT) + TextCodecWinCE::registerBaseEncodingNames(addToTextEncodingNameMap); + TextCodecWinCE::registerBaseCodecs(addToTextCodecMap); +#endif +} + +static void addEncodingName(HashSet<const char*>* set, const char* name) +{ + // We must not use atomicCanonicalTextEncodingName() because this function is called in it. + const char* atomicName = textEncodingNameMap->get(name); + if (atomicName) + set->add(atomicName); +} + +static void buildQuirksSets() +{ + // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() + // and initializing the sets for them in TextEncodingRegistry.cpp look strange. 
+ + ASSERT(!japaneseEncodings); + ASSERT(!nonBackslashEncodings); + + japaneseEncodings = new HashSet<const char*>(); + addEncodingName(japaneseEncodings, "EUC-JP"); + addEncodingName(japaneseEncodings, "ISO-2022-JP"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); + addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); + addEncodingName(japaneseEncodings, "JIS_C6226-1978"); + addEncodingName(japaneseEncodings, "JIS_X0201"); + addEncodingName(japaneseEncodings, "JIS_X0208-1983"); + addEncodingName(japaneseEncodings, "JIS_X0208-1990"); + addEncodingName(japaneseEncodings, "JIS_X0212-1990"); + addEncodingName(japaneseEncodings, "Shift_JIS"); + addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); + addEncodingName(japaneseEncodings, "cp932"); + addEncodingName(japaneseEncodings, "x-mac-japanese"); + + nonBackslashEncodings = new HashSet<const char*>(); + // The text encodings below treat backslash as a currency symbol for IE compatibility. + // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. + addEncodingName(nonBackslashEncodings, "x-mac-japanese"); + addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); + addEncodingName(nonBackslashEncodings, "EUC-JP"); + // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. 
+ addEncodingName(nonBackslashEncodings, "Shift_JIS"); + addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); +} + +bool isJapaneseEncoding(const char* canonicalEncodingName) +{ + return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); +} + +bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) +{ + return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); +} + +static void extendTextCodecMaps() +{ +#if USE(ICU_UNICODE) + TextCodecICU::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecICU::registerExtendedCodecs(addToTextCodecMap); +#endif + +#if USE(QT4_UNICODE) + TextCodecQt::registerEncodingNames(addToTextEncodingNameMap); + TextCodecQt::registerCodecs(addToTextCodecMap); +#endif + +#if PLATFORM(MAC) + TextCodecMac::registerEncodingNames(addToTextEncodingNameMap); + TextCodecMac::registerCodecs(addToTextCodecMap); +#endif + +#if USE(GLIB_UNICODE) + TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecGtk::registerExtendedCodecs(addToTextCodecMap); +#endif + +#if OS(WINCE) && !PLATFORM(QT) + TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap); + TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap); +#endif + + pruneBlacklistedCodecs(); + buildQuirksSets(); +} + +PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) +{ + MutexLocker lock(encodingRegistryMutex()); + + ASSERT(textCodecMap); + TextCodecFactory factory = textCodecMap->get(encoding.name()); + ASSERT(factory.function); + return factory.function(encoding, factory.additionalData); +} + +const char* atomicCanonicalTextEncodingName(const char* name) +{ + if (!name || !name[0]) + return 0; + if (!textEncodingNameMap) + buildBaseTextCodecMaps(); + + MutexLocker lock(encodingRegistryMutex()); + + if (const char* atomicName = textEncodingNameMap->get(name)) + return atomicName; + if 
(didExtendTextCodecMaps) + return 0; + extendTextCodecMaps(); + didExtendTextCodecMaps = true; + return textEncodingNameMap->get(name); +} + +const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length) +{ + char buffer[maxEncodingNameLength + 1]; + size_t j = 0; + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + if (j == maxEncodingNameLength) + return 0; + buffer[j++] = c; + } + buffer[j] = 0; + return atomicCanonicalTextEncodingName(buffer); +} + +bool noExtendedTextEncodingNameUsed() +{ + // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. + return !didExtendTextCodecMaps; +} + +#ifndef NDEBUG +void dumpTextEncodingNameMap() +{ + unsigned size = textEncodingNameMap->size(); + fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size); + + MutexLocker lock(encodingRegistryMutex()); + + TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); + TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); + for (; it != end; ++it) + fprintf(stderr, "'%s' => '%s'\n", it->first, it->second); +} +#endif + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.h b/Source/WebCore/platform/text/TextEncodingRegistry.h new file mode 100644 index 0000000..16844c6 --- /dev/null +++ b/Source/WebCore/platform/text/TextEncodingRegistry.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2006, 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextEncodingRegistry_h +#define TextEncodingRegistry_h + +#include <memory> +#include <wtf/PassOwnPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + + class TextCodec; + class TextEncoding; + + // Use TextResourceDecoder::decode to decode resources, since it handles BOMs. + // Use TextEncoding::encode to encode, since it takes care of normalization. + PassOwnPtr<TextCodec> newTextCodec(const TextEncoding&); + + // Only TextEncoding should use the following functions directly. 
+ const char* atomicCanonicalTextEncodingName(const char* alias); + const char* atomicCanonicalTextEncodingName(const UChar* aliasCharacters, size_t aliasLength); + bool noExtendedTextEncodingNameUsed(); + bool isJapaneseEncoding(const char* canonicalEncodingName); + bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName); + +#ifndef NDEBUG + void dumpTextEncodingNameMap(); +#endif +} + +#endif // TextEncodingRegistry_h diff --git a/Source/WebCore/platform/text/TextStream.cpp b/Source/WebCore/platform/text/TextStream.cpp new file mode 100644 index 0000000..1094fa4 --- /dev/null +++ b/Source/WebCore/platform/text/TextStream.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2004, 2008, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextStream.h" + +#include "PlatformString.h" +#include <wtf/StringExtras.h> + +using namespace std; + +namespace WebCore { + +static const size_t printBufferSize = 100; // large enough for any integer or floating point value in string format, including trailing null character + +TextStream& TextStream::operator<<(bool b) +{ + return *this << (b ? "1" : "0"); +} + +TextStream& TextStream::operator<<(int i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%d", i); + return *this << buffer; +} + +TextStream& TextStream::operator<<(unsigned i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%u", i); + return *this << buffer; +} + +TextStream& TextStream::operator<<(long i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%ld", i); + return *this << buffer; +} + +TextStream& TextStream::operator<<(unsigned long i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%lu", i); + return *this << buffer; +} + +TextStream& TextStream::operator<<(float f) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%.2f", f); + return *this << buffer; +} + +TextStream& TextStream::operator<<(double d) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%.2f", d); + return *this << buffer; +} + +TextStream& TextStream::operator<<(const char* string) +{ + size_t stringLength = strlen(string); + size_t 
textLength = m_text.size(); + if (stringLength > numeric_limits<size_t>::max() - textLength) + CRASH(); + m_text.grow(textLength + stringLength); + for (size_t i = 0; i < stringLength; ++i) + m_text[textLength + i] = string[i]; + return *this; +} + +TextStream& TextStream::operator<<(const void* p) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%p", p); + return *this << buffer; +} + +TextStream& TextStream::operator<<(const String& string) +{ + append(m_text, string); + return *this; +} + +String TextStream::release() +{ + return String::adopt(m_text); +} + +#if OS(WINDOWS) && CPU(X86_64) +TextStream& TextStream::operator<<(__int64 i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%I64i", i); + return *this << buffer; +} +TextStream& TextStream::operator<<(unsigned __int64 i) +{ + char buffer[printBufferSize]; + snprintf(buffer, sizeof(buffer) - 1, "%I64u", i); + return *this << buffer; +} +#endif + +} diff --git a/Source/WebCore/platform/text/TextStream.h b/Source/WebCore/platform/text/TextStream.h new file mode 100644 index 0000000..e7e4cc0 --- /dev/null +++ b/Source/WebCore/platform/text/TextStream.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2004, 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextStream_h +#define TextStream_h + +#include <wtf/Forward.h> +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +class TextStream { +public: + TextStream& operator<<(bool); + TextStream& operator<<(int); + TextStream& operator<<(unsigned); + TextStream& operator<<(long); + TextStream& operator<<(unsigned long); + TextStream& operator<<(float); + TextStream& operator<<(double); + TextStream& operator<<(const char*); + TextStream& operator<<(const void*); + TextStream& operator<<(const String&); +#if OS(WINDOWS) && CPU(X86_64) + TextStream& operator<<(unsigned __int64); + TextStream& operator<<(__int64); +#endif + + String release(); + +private: + Vector<UChar> m_text; +}; + +} + +#endif diff --git a/Source/WebCore/platform/text/UnicodeRange.cpp b/Source/WebCore/platform/text/UnicodeRange.cpp new file mode 100644 index 0000000..0373441 --- /dev/null +++ b/Source/WebCore/platform/text/UnicodeRange.cpp @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2007 Apple Computer, Inc. + * + * Portions are Copyright (C) 1998 Netscape Communications Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Alternatively, the contents of this file may be used under the terms + * of either the Mozilla Public License Version 1.1, found at + * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public + * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html + * (the "GPL"), in which case the provisions of the MPL or the GPL are + * applicable instead of those above. If you wish to allow use of your + * version of this file only under the terms of one of those two + * licenses (the MPL or the GPL) and not to allow others to use your + * version of this file under the LGPL, indicate your decision by + * deletingthe provisions above and replace them with the notice and + * other provisions required by the MPL or the GPL, as the case may be. + * If you do not delete the provisions above, a recipient may use your + * version of this file under any of the LGPL, the MPL or the GPL. + */ + +#include "config.h" +#include "UnicodeRange.h" + +namespace WebCore { + +// This table depends on unicode range definitions. +// Each item's index must correspond to a unicode range value +// eg. 
x-cyrillic = LangGroupTable[cRangeCyrillic] +static const char* gUnicodeRangeToLangGroupTable[] = +{ + "x-cyrillic", + "el", + "tr", + "he", + "ar", + "x-baltic", + "th", + "ko", + "ja", + "zh-CN", + "zh-TW", + "x-devanagari", + "x-tamil", + "x-armn", + "x-beng", + "x-cans", + "x-ethi", + "x-geor", + "x-gujr", + "x-guru", + "x-khmr", + "x-mlym" +}; + +/********************************************************************** + * Unicode subranges as defined in unicode 3.0 + * x-western, x-central-euro, tr, x-baltic -> latin + * 0000 - 036f + * 1e00 - 1eff + * 2000 - 206f (general punctuation) + * 20a0 - 20cf (currency symbols) + * 2100 - 214f (letterlike symbols) + * 2150 - 218f (Number Forms) + * el -> greek + * 0370 - 03ff + * 1f00 - 1fff + * x-cyrillic -> cyrillic + * 0400 - 04ff + * he -> hebrew + * 0590 - 05ff + * ar -> arabic + * 0600 - 06ff + * fb50 - fdff (arabic presentation forms) + * fe70 - feff (arabic presentation forms b) + * th - thai + * 0e00 - 0e7f + * ko -> korean + * ac00 - d7af (hangul Syllables) + * 1100 - 11ff (jamo) + * 3130 - 318f (hangul compatibility jamo) + * ja + * 3040 - 309f (hiragana) + * 30a0 - 30ff (katakana) + * zh-CN + * zh-TW + * + * CJK + * 3100 - 312f (bopomofo) + * 31a0 - 31bf (bopomofo extended) + * 3000 - 303f (CJK Symbols and Punctuation) + * 2e80 - 2eff (CJK radicals supplement) + * 2f00 - 2fdf (Kangxi Radicals) + * 2ff0 - 2fff (Ideographic Description Characters) + * 3190 - 319f (kanbun) + * 3200 - 32ff (Enclosed CJK letters and Months) + * 3300 - 33ff (CJK compatibility) + * 3400 - 4dbf (CJK Unified Ideographs Extension A) + * 4e00 - 9faf (CJK Unified Ideographs) + * f900 - fa5f (CJK Compatibility Ideographs) + * fe30 - fe4f (CJK compatibility Forms) + * ff00 - ffef (halfwidth and fullwidth forms) + * + * Armenian + * 0530 - 058f + * Sriac + * 0700 - 074f + * Thaana + * 0780 - 07bf + * Devanagari + * 0900 - 097f + * Bengali + * 0980 - 09ff + * Gurmukhi + * 0a00 - 0a7f + * Gujarati + * 0a80 - 0aff + * Oriya + * 0b00 - 0b7f + 
* Tamil + * 0b80 - 0bff + * Telugu + * 0c00 - 0c7f + * Kannada + * 0c80 - 0cff + * Malayalam + * 0d00 - 0d7f + * Sinhala + * 0d80 - 0def + * Lao + * 0e80 - 0eff + * Tibetan + * 0f00 - 0fbf + * Myanmar + * 1000 - 109f + * Georgian + * 10a0 - 10ff + * Ethiopic + * 1200 - 137f + * Cherokee + * 13a0 - 13ff + * Canadian Aboriginal Syllabics + * 1400 - 167f + * Ogham + * 1680 - 169f + * Runic + * 16a0 - 16ff + * Khmer + * 1780 - 17ff + * Mongolian + * 1800 - 18af + * Misc - superscripts and subscripts + * 2070 - 209f + * Misc - Combining Diacritical Marks for Symbols + * 20d0 - 20ff + * Misc - Arrows + * 2190 - 21ff + * Misc - Mathematical Operators + * 2200 - 22ff + * Misc - Miscellaneous Technical + * 2300 - 23ff + * Misc - Control picture + * 2400 - 243f + * Misc - Optical character recognition + * 2440 - 2450 + * Misc - Enclose Alphanumerics + * 2460 - 24ff + * Misc - Box Drawing + * 2500 - 257f + * Misc - Block Elements + * 2580 - 259f + * Misc - Geometric Shapes + * 25a0 - 25ff + * Misc - Miscellaneous Symbols + * 2600 - 267f + * Misc - Dingbats + * 2700 - 27bf + * Misc - Braille Patterns + * 2800 - 28ff + * Yi Syllables + * a000 - a48f + * Yi radicals + * a490 - a4cf + * Alphabetic Presentation Forms + * fb00 - fb4f + * Misc - Combining half Marks + * fe20 - fe2f + * Misc - small form variants + * fe50 - fe6f + * Misc - Specials + * fff0 - ffff + *********************************************************************/ + +static const unsigned cNumSubTables = 9; +static const unsigned cSubTableSize = 16; + +static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] = +{ + { // table for X--- + cRangeTableBase+1, //u0xxx + cRangeTableBase+2, //u1xxx + cRangeTableBase+3, //u2xxx + cRangeSetCJK, //u3xxx + cRangeSetCJK, //u4xxx + cRangeSetCJK, //u5xxx + cRangeSetCJK, //u6xxx + cRangeSetCJK, //u7xxx + cRangeSetCJK, //u8xxx + cRangeSetCJK, //u9xxx + cRangeTableBase+4, //uaxxx + cRangeKorean, //ubxxx + cRangeKorean, //ucxxx + cRangeTableBase+5, //udxxx + 
cRangePrivate, //uexxx + cRangeTableBase+6 //ufxxx + }, + { //table for 0X-- + cRangeSetLatin, //u00xx + cRangeSetLatin, //u01xx + cRangeSetLatin, //u02xx + cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks + cRangeCyrillic, //u04xx + cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian + cRangeArabic, //u06xx + cRangeTertiaryTable, //u07xx + cRangeUnassigned, //u08xx + cRangeTertiaryTable, //u09xx + cRangeTertiaryTable, //u0axx + cRangeTertiaryTable, //u0bxx + cRangeTertiaryTable, //u0cxx + cRangeTertiaryTable, //u0dxx + cRangeTertiaryTable, //u0exx + cRangeTibetan, //u0fxx + }, + { //table for 1x-- + cRangeTertiaryTable, //u10xx + cRangeKorean, //u11xx + cRangeEthiopic, //u12xx + cRangeTertiaryTable, //u13xx + cRangeCanadian, //u14xx + cRangeCanadian, //u15xx + cRangeTertiaryTable, //u16xx + cRangeKhmer, //u17xx + cRangeMongolian, //u18xx + cRangeUnassigned, //u19xx + cRangeUnassigned, //u1axx + cRangeUnassigned, //u1bxx + cRangeUnassigned, //u1cxx + cRangeUnassigned, //u1dxx + cRangeSetLatin, //u1exx + cRangeGreek, //u1fxx + }, + { //table for 2x-- + cRangeSetLatin, //u20xx + cRangeSetLatin, //u21xx + cRangeMathOperators, //u22xx + cRangeMiscTechnical, //u23xx + cRangeControlOpticalEnclose, //u24xx + cRangeBoxBlockGeometrics, //u25xx + cRangeMiscSymbols, //u26xx + cRangeDingbats, //u27xx + cRangeBraillePattern, //u28xx + cRangeUnassigned, //u29xx + cRangeUnassigned, //u2axx + cRangeUnassigned, //u2bxx + cRangeUnassigned, //u2cxx + cRangeUnassigned, //u2dxx + cRangeSetCJK, //u2exx + cRangeSetCJK, //u2fxx + }, + { //table for ax-- + cRangeYi, //ua0xx + cRangeYi, //ua1xx + cRangeYi, //ua2xx + cRangeYi, //ua3xx + cRangeYi, //ua4xx + cRangeUnassigned, //ua5xx + cRangeUnassigned, //ua6xx + cRangeUnassigned, //ua7xx + cRangeUnassigned, //ua8xx + cRangeUnassigned, //ua9xx + cRangeUnassigned, //uaaxx + cRangeUnassigned, //uabxx + cRangeKorean, //uacxx + cRangeKorean, //uadxx + cRangeKorean, //uaexx + cRangeKorean, 
//uafxx + }, + { //table for dx-- + cRangeKorean, //ud0xx + cRangeKorean, //ud1xx + cRangeKorean, //ud2xx + cRangeKorean, //ud3xx + cRangeKorean, //ud4xx + cRangeKorean, //ud5xx + cRangeKorean, //ud6xx + cRangeKorean, //ud7xx + cRangeSurrogate, //ud8xx + cRangeSurrogate, //ud9xx + cRangeSurrogate, //udaxx + cRangeSurrogate, //udbxx + cRangeSurrogate, //udcxx + cRangeSurrogate, //uddxx + cRangeSurrogate, //udexx + cRangeSurrogate, //udfxx + }, + { // table for fx-- + cRangePrivate, //uf0xx + cRangePrivate, //uf1xx + cRangePrivate, //uf2xx + cRangePrivate, //uf3xx + cRangePrivate, //uf4xx + cRangePrivate, //uf5xx + cRangePrivate, //uf6xx + cRangePrivate, //uf7xx + cRangePrivate, //uf8xx + cRangeSetCJK, //uf9xx + cRangeSetCJK, //ufaxx + cRangeArabic, //ufbxx, includes alphabic presentation form + cRangeArabic, //ufcxx + cRangeArabic, //ufdxx + cRangeArabic, //ufexx, includes Combining half marks, + // CJK compatibility forms, + // CJK compatibility forms, + // small form variants + cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials + }, + { //table for 0x0500 - 0x05ff + cRangeCyrillic, //u050x + cRangeCyrillic, //u051x + cRangeCyrillic, //u052x + cRangeArmenian, //u053x + cRangeArmenian, //u054x + cRangeArmenian, //u055x + cRangeArmenian, //u056x + cRangeArmenian, //u057x + cRangeArmenian, //u058x + cRangeHebrew, //u059x + cRangeHebrew, //u05ax + cRangeHebrew, //u05bx + cRangeHebrew, //u05cx + cRangeHebrew, //u05dx + cRangeHebrew, //u05ex + cRangeHebrew, //u05fx + }, + { //table for 0xff00 - 0xffff + cRangeSetCJK, //uff0x, fullwidth latin + cRangeSetCJK, //uff1x, fullwidth latin + cRangeSetCJK, //uff2x, fullwidth latin + cRangeSetCJK, //uff3x, fullwidth latin + cRangeSetCJK, //uff4x, fullwidth latin + cRangeSetCJK, //uff5x, fullwidth latin + cRangeSetCJK, //uff6x, halfwidth katakana + cRangeSetCJK, //uff7x, halfwidth katakana + cRangeSetCJK, //uff8x, halfwidth katakana + cRangeSetCJK, //uff9x, halfwidth katakana + cRangeSetCJK, //uffax, 
halfwidth hangul jamo + cRangeSetCJK, //uffbx, halfwidth hangul jamo + cRangeSetCJK, //uffcx, halfwidth hangul jamo + cRangeSetCJK, //uffdx, halfwidth hangul jamo + cRangeSetCJK, //uffex, fullwidth symbols + cRangeSpecials, //ufffx, Specials + }, +}; + +// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) +// code points so that the number of entries in the tertiary range +// table for that range is obtained by dividing (0x1700 - 0x0700) by 128. +// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal +// syllabaries take multiple chunks and Ogham and Runic share a single chunk. +static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80); + +static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] = +{ //table for 0x0700 - 0x1600 + cRangeSyriac, //u070x + cRangeThaana, //u078x + cRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) + cRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) + cRangeDevanagari, //u090x + cRangeBengali, //u098x + cRangeGurmukhi, //u0a0x + cRangeGujarati, //u0a8x + cRangeOriya, //u0b0x + cRangeTamil, //u0b8x + cRangeTelugu, //u0c0x + cRangeKannada, //u0c8x + cRangeMalayalam, //u0d0x + cRangeSinhala, //u0d8x + cRangeThai, //u0e0x + cRangeLao, //u0e8x + cRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) + cRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) + cRangeMyanmar, //u100x + cRangeGeorgian, //u108x + cRangeKorean, //u110x place holder(resolved in the 2ndary tab.) + cRangeKorean, //u118x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u130x + cRangeCherokee, //u138x + cRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) 
+ cRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u160x + cRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic +}; + +// A two level index is almost enough for locating a range, with the +// exception of u03xx and u05xx. Since we don't really care about range for +// combining diacritical marks in our font application, they are +// not discriminated further. Future adoption of this method for other use +// should be aware of this limitation. The implementation can be extended if +// there is such a need. +// For Indic, Southeast Asian scripts and some other scripts between +// U+0700 and U+16FF, it's extended to the third level. +unsigned int findCharUnicodeRange(UChar32 ch) +{ + if (ch >= 0xFFFF) + return 0; + + unsigned int range; + + //search the first table + range = gUnicodeSubrangeTable[0][ch >> 12]; + + if (range < cRangeTableBase) + // we try to get a specific range + return range; + + // otherwise, we have one more table to look at + range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8]; + if (range < cRangeTableBase) + return range; + if (range < cRangeTertiaryTable) + return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4]; + + // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks + return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; +} + +const char* langGroupFromUnicodeRange(unsigned char unicodeRange) +{ + if (cRangeSpecificItemNum > unicodeRange) + return gUnicodeRangeToLangGroupTable[unicodeRange]; + return 0; +} + +} diff --git a/Source/WebCore/platform/text/UnicodeRange.h b/Source/WebCore/platform/text/UnicodeRange.h new file mode 100644 index 0000000..2278a0e --- /dev/null +++ b/Source/WebCore/platform/text/UnicodeRange.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2007 Apple Computer, Inc. + * + * Portions are Copyright (C) 1998 Netscape Communications Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Alternatively, the contents of this file may be used under the terms + * of either the Mozilla Public License Version 1.1, found at + * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public + * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html + * (the "GPL"), in which case the provisions of the MPL or the GPL are + * applicable instead of those above. If you wish to allow use of your + * version of this file only under the terms of one of those two + * licenses (the MPL or the GPL) and not to allow others to use your + * version of this file under the LGPL, indicate your decision by + * deletingthe provisions above and replace them with the notice and + * other provisions required by the MPL or the GPL, as the case may be. + * If you do not delete the provisions above, a recipient may use your + * version of this file under any of the LGPL, the MPL or the GPL. + */ + +#ifndef UnicodeRange_H +#define UnicodeRange_H + +#if PLATFORM(HAIKU) +#include "stdint.h" +#endif + +#include <wtf/unicode/Unicode.h> + +namespace WebCore { + +// The following constants define unicode subranges +// values below cRangeNum must be continuous so that we can map to +// a lang group directly. 
+// All ranges we care about should fit within 32 bits.
+
+// Frequently used range definitions
+const unsigned char cRangeCyrillic = 0;
+const unsigned char cRangeGreek = 1;
+const unsigned char cRangeTurkish = 2;
+const unsigned char cRangeHebrew = 3;
+const unsigned char cRangeArabic = 4;
+const unsigned char cRangeBaltic = 5;
+const unsigned char cRangeThai = 6;
+const unsigned char cRangeKorean = 7;
+const unsigned char cRangeJapanese = 8;
+const unsigned char cRangeSChinese = 9;
+const unsigned char cRangeTChinese = 10;
+const unsigned char cRangeDevanagari = 11;
+const unsigned char cRangeTamil = 12;
+const unsigned char cRangeArmenian = 13;
+const unsigned char cRangeBengali = 14;
+const unsigned char cRangeCanadian = 15;
+const unsigned char cRangeEthiopic = 16;
+const unsigned char cRangeGeorgian = 17;
+const unsigned char cRangeGujarati = 18;
+const unsigned char cRangeGurmukhi = 19;
+const unsigned char cRangeKhmer = 20;
+const unsigned char cRangeMalayalam = 21;
+
+// One past the last frequently used range above (0..21).
+const unsigned char cRangeSpecificItemNum = 22;
+
+// Values 22-29 are left as a gap: range / range-set definitions grow into it.
+
+const unsigned char cRangeSetStart = 30; // range set definition starts from here
+const unsigned char cRangeSetLatin = 30;
+const unsigned char cRangeSetCJK = 31;
+const unsigned char cRangeSetEnd = 31; // range set definition ends here
+
+// Less frequently used range definitions
+const unsigned char cRangeSurrogate = 32;
+const unsigned char cRangePrivate = 33;
+const unsigned char cRangeMisc = 34;
+const unsigned char cRangeUnassigned = 35;
+const unsigned char cRangeSyriac = 36;
+const unsigned char cRangeThaana = 37;
+const unsigned char cRangeOriya = 38;
+const unsigned char cRangeTelugu = 39;
+const unsigned char cRangeKannada = 40;
+const unsigned char cRangeSinhala = 41;
+const unsigned char cRangeLao = 42;
+const unsigned char cRangeTibetan = 43;
+const unsigned char cRangeMyanmar = 44;
+const unsigned char cRangeCherokee = 45;
+const unsigned char cRangeOghamRunic = 46;
+const unsigned char cRangeMongolian = 47;
+const unsigned char cRangeMathOperators = 48;
+const unsigned char cRangeMiscTechnical = 49;
+const unsigned char cRangeControlOpticalEnclose = 50;
+const unsigned char cRangeBoxBlockGeometrics = 51;
+const unsigned char cRangeMiscSymbols = 52;
+const unsigned char cRangeDingbats = 53;
+const unsigned char cRangeBraillePattern = 54;
+const unsigned char cRangeYi = 55;
+const unsigned char cRangeCombiningDiacriticalMarks = 56;
+const unsigned char cRangeSpecials = 57;
+
+const unsigned char cRangeTableBase = 128; // values over 127 are reserved for internal use only
+const unsigned char cRangeTertiaryTable = 145; // leave room for 16 subtable
+                                               // indices (cRangeTableBase + 1 ..
+                                               // cRangeTableBase + 16)
+
+
+
+// NOTE(review): presumably maps a code point to one of the cRange* values
+// above; the definition is not visible here - confirm against the .cpp.
+unsigned int findCharUnicodeRange(UChar32 ch);
+// NOTE(review): presumably maps a cRange* value to a language-group name;
+// confirm against the definition.
+const char* langGroupFromUnicodeRange(unsigned char unicodeRange);
+
+}
+
+#endif // UnicodeRange_H
diff --git a/Source/WebCore/platform/text/android/HyphenationAndroid.cpp b/Source/WebCore/platform/text/android/HyphenationAndroid.cpp
new file mode 100644
index 0000000..d1bd839
--- /dev/null
+++ b/Source/WebCore/platform/text/android/HyphenationAndroid.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2010, The Android Open Source Project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "Hyphenation.h" + +// For external hyphenation library. +#include "hyphen.h" +#include <utils/AssetManager.h> +#include <wtf/text/CString.h> +#include <wtf/text/WTFString.h> + +extern android::AssetManager* globalAssetManager(); + +using namespace WTF; + +namespace WebCore { + +static HyphenDict* loadHyphenationDictionary() +{ + android::AssetManager* am = globalAssetManager(); + // Only support English for now. + android::Asset* a = am->open("webkit/hyph_en_US.dic", + android::Asset::ACCESS_BUFFER); + if (!a) { + // Asset webkit/hyph_en_US.dic not found! + return 0; + } + const CString dictContents = String(static_cast<const char*>(a->getBuffer(false)), + a->getLength()).utf8(); + HyphenDict* dict = hnj_hyphen_load_from_buffer(dictContents.data(), + dictContents.length()); + delete a; + + return dict; +} + +bool canHyphenate(const AtomicString& /* localeIdentifier */) +{ + // FIXME: Check that the locale identifier matches the available dictionary. 
+ return true; +} + +size_t lastHyphenLocation(const UChar* characters, size_t length, size_t beforeIndex, const AtomicString& /* localeIdentifier */) +{ + static const size_t minWordLen = 5; + static const size_t maxWordLen = 100; + if (beforeIndex <= 0 || length < minWordLen || length > maxWordLen) + return 0; + + static HyphenDict* dict = loadHyphenationDictionary(); + if (!dict) + return 0; + + char word[maxWordLen]; + size_t wordLength = 0; + for (size_t i = 0; i < length; ++i) { + const UChar ch = characters[i]; + // Only English for now. + // To really make it language aware, we need something like language + // detection or rely on the langAttr in the html element. Though + // seems right now the langAttr is not used or quite implemented in + // webkit. + if (!isASCIIAlpha(ch)) { + // Bypass leading spaces. + if (isASCIISpace(ch) && !wordLength) + continue; + return 0; + } + word[wordLength++] = ch; + } + if (wordLength < minWordLen) + return 0; + + static const int extraBuffer = 5; + const int leadingSpacesCount = length - wordLength; + char hyphens[maxWordLen + extraBuffer]; + if (!hnj_hyphen_hyphenate(dict, word, wordLength, hyphens)) { + for (size_t i = beforeIndex - 2 - leadingSpacesCount; i > 0; --i) { + if (hyphens[i] & 1) + return i + 1 + leadingSpacesCount; + } + } + + return 0; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp b/Source/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp new file mode 100644 index 0000000..9732e92 --- /dev/null +++ b/Source/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp @@ -0,0 +1,43 @@ +/* + * Copyright 2007, The Android Open Source Project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following 
disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextBreakIteratorInternalICU.h"
+
+namespace WebCore {
+
+// Locale identifier used for text search. Currently hard-coded to the empty
+// string rather than derived from the device settings.
+const char* currentSearchLocaleID()
+{
+    // FIXME: Should use system locale.
+    return "";
+}
+
+// Locale identifier used for text break iteration. Currently hard-coded
+// rather than derived from the device settings.
+const char* currentTextBreakLocaleID()
+{
+    // FIXME: Should use system locale.
+    return "en_us";
+}
+
+}
diff --git a/Source/WebCore/platform/text/brew/TextBoundariesBrew.cpp b/Source/WebCore/platform/text/brew/TextBoundariesBrew.cpp
new file mode 100644
index 0000000..506bdcf
--- /dev/null
+++ b/Source/WebCore/platform/text/brew/TextBoundariesBrew.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2006 Zack Rusin <zack@kde.org>
+ * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextBoundaries.h" + +#include "NotImplemented.h" +#include "PlatformString.h" + +using namespace WTF::Unicode; + +namespace WebCore { + +int findNextWordFromIndex(const UChar* buffer, int len, int position, bool forward) +{ + notImplemented(); + return 0; +} + +void findWordBoundary(const UChar* buffer, int len, int position, int* start, int* end) +{ + if (position > len) { + *start = 0; + *end = 0; + return; + } + + String str(buffer, len); + + int currentPosition = position - 1; + String foundWord; + while (currentPosition >= 0 && isLetter(str[currentPosition])) { + UChar c = str[currentPosition]; + foundWord.insert(&c, 1, 0); + --currentPosition; + } + + // currentPosition == 0 means the first char is not letter + // currentPosition == -1 means we reached the beginning + int startPos = (currentPosition < 0) ? 
0 : ++currentPosition; + currentPosition = position; + while (isLetter(str[currentPosition])) { + foundWord.append(str[currentPosition]); + ++currentPosition; + } + + *start = startPos; + *end = currentPosition; +} + +} diff --git a/Source/WebCore/platform/text/brew/TextBreakIteratorBrew.cpp b/Source/WebCore/platform/text/brew/TextBreakIteratorBrew.cpp new file mode 100644 index 0000000..7f46e4f --- /dev/null +++ b/Source/WebCore/platform/text/brew/TextBreakIteratorBrew.cpp @@ -0,0 +1,312 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +#include "config.h" +#include "TextBreakIterator.h" + +#include "PlatformString.h" +#include <wtf/StdLibExtras.h> +#include <wtf/unicode/Unicode.h> + +using namespace WTF::Unicode; + +namespace WebCore { + +// Hack, not entirely correct +static inline bool isCharStop(UChar c) +{ + CharCategory charCategory = category(c); + return charCategory != Mark_NonSpacing && (charCategory != Other_Surrogate || (c < 0xd800 || c >= 0xdc00)); +} + +static inline bool isLineStop(UChar c) +{ + return category(c) != Separator_Line; +} + +static inline bool isSentenceStop(UChar c) +{ + return isPunct(c); +} + +class TextBreakIterator { +public: + void reset(const UChar* str, int len) + { + string = str; + length = len; + currentPos = 0; + } + virtual int first() = 0; + virtual int next() = 0; + virtual int previous() = 0; + int following(int position) + { + currentPos = position; + return next(); + } + int preceding(int position) + { + currentPos = position; + return previous(); + } + + int currentPos; + const UChar* string; + int length; +}; + +struct WordBreakIterator: TextBreakIterator { + virtual int first(); + virtual int next(); + virtual int previous(); +}; + +struct CharBreakIterator: TextBreakIterator { + virtual int first(); + virtual int next(); + virtual int previous(); +}; + +struct LineBreakIterator: TextBreakIterator { + virtual int first(); + virtual int next(); + virtual int previous(); +}; + +struct SentenceBreakIterator : TextBreakIterator { + virtual int first(); + virtual int next(); + virtual int previous(); +}; + +int WordBreakIterator::first() +{ + currentPos = 0; + return currentPos; +} + +int WordBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isSpace(string[currentPos])) + break; + if (isSpace(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int WordBreakIterator::previous() +{ + 
if (!currentPos) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isSpace(string[currentPos])) + break; + if (isSpace(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +int CharBreakIterator::first() +{ + currentPos = 0; + return currentPos; +} + +int CharBreakIterator::next() +{ + if (currentPos >= length) + return -1; + ++currentPos; + while (currentPos < length && !isCharStop(string[currentPos])) + ++currentPos; + return currentPos; +} + +int CharBreakIterator::previous() +{ + if (currentPos <= 0) + return -1; + if (currentPos > length) + currentPos = length; + --currentPos; + while (currentPos > 0 && !isCharStop(string[currentPos])) + --currentPos; + return currentPos; +} + +int LineBreakIterator::first() +{ + currentPos = 0; + return currentPos; +} + +int LineBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isLineStop(string[currentPos])) + break; + if (isLineStop(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int LineBreakIterator::previous() +{ + if (!currentPos) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isLineStop(string[currentPos])) + break; + if (isLineStop(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +int SentenceBreakIterator::first() +{ + currentPos = 0; + return currentPos; +} + +int SentenceBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isSentenceStop(string[currentPos])) + break; + if (isSentenceStop(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int SentenceBreakIterator::previous() +{ + if 
(!currentPos) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isSentenceStop(string[currentPos])) + break; + if (isSentenceStop(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +TextBreakIterator* wordBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(WordBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* characterBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(CharBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* lineBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(LineBreakIterator , iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(SentenceBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +int textBreakFirst(TextBreakIterator* breakIterator) +{ + return breakIterator->first(); +} + +int textBreakNext(TextBreakIterator* breakIterator) +{ + return breakIterator->next(); +} + +int textBreakPreceding(TextBreakIterator* breakIterator, int position) +{ + return breakIterator->preceding(position); +} + +int textBreakFollowing(TextBreakIterator* breakIterator, int position) +{ + return breakIterator->following(position); +} + +int textBreakCurrent(TextBreakIterator* breakIterator) +{ + return breakIterator->currentPos; +} + +bool isTextBreak(TextBreakIterator*, int) +{ + return true; +} + +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ + return characterBreakIterator(string, length); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/brew/TextCodecBrew.cpp b/Source/WebCore/platform/text/brew/TextCodecBrew.cpp new file mode 100644 index 0000000..1f32298 --- /dev/null +++ 
b/Source/WebCore/platform/text/brew/TextCodecBrew.cpp @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2010 Company 100, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecBrew.h" + +#include "AEEAppGen.h" +#include "AEEICharsetConv.h" +#include "NotImplemented.h" +#include "PlatformString.h" +#include <wtf/Assertions.h> +#include <wtf/text/CString.h> + +namespace WebCore { + +// FIXME: Not sure if there are Brew MP devices which use big endian. 
+const char* WebCore::TextCodecBrew::m_internalEncodingName = "UTF-16LE"; + +static PassOwnPtr<TextCodec> newTextCodecBrew(const TextEncoding& encoding, const void*) +{ + return new TextCodecBrew(encoding); +} + +void TextCodecBrew::registerBaseEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("UTF-8", "UTF-8"); +} + +void TextCodecBrew::registerBaseCodecs(TextCodecRegistrar registrar) +{ + registrar("UTF-8", newTextCodecBrew, 0); +} + +void TextCodecBrew::registerExtendedEncodingNames(EncodingNameRegistrar registrar) +{ + // FIXME: Not sure how to enumerate all available encodings. + notImplemented(); +} + +void TextCodecBrew::registerExtendedCodecs(TextCodecRegistrar registrar) +{ + notImplemented(); +} + +TextCodecBrew::TextCodecBrew(const TextEncoding& encoding) + : m_charsetConverter(0) + , m_encoding(encoding) + , m_numBufferedBytes(0) +{ + String format = String::format("%s>%s", encoding.name(), m_internalEncodingName); + + IShell* shell = reinterpret_cast<AEEApplet*>(GETAPPINSTANCE())->m_pIShell; + AEECLSID classID = ISHELL_GetHandler(shell, AEEIID_ICharsetConv, format.latin1().data()); + ISHELL_CreateInstance(shell, classID, reinterpret_cast<void**>(&m_charsetConverter)); + + ASSERT(m_charsetConverter); +} + +TextCodecBrew::~TextCodecBrew() +{ + if (m_charsetConverter) + ICharsetConv_Release(m_charsetConverter); +} + +String TextCodecBrew::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + int code = ICharsetConv_Initialize(m_charsetConverter, m_encoding.name(), m_internalEncodingName, 0); + ASSERT(code == AEE_SUCCESS); + + Vector<UChar> result; + Vector<unsigned char> prefixedBytes(length); + + int srcSize; + unsigned char* srcBegin; + + if (m_numBufferedBytes) { + srcSize = length + m_numBufferedBytes; + prefixedBytes.grow(srcSize); + memcpy(prefixedBytes.data(), m_bufferedBytes, m_numBufferedBytes); + memcpy(prefixedBytes.data() + m_numBufferedBytes, bytes, length); + + srcBegin = 
prefixedBytes.data(); + + // all buffered bytes are consumed now + m_numBufferedBytes = 0; + } else { + srcSize = length; + srcBegin = const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(bytes)); + } + + unsigned char* src = srcBegin; + unsigned char* srcEnd = srcBegin + srcSize; + + Vector<UChar> dstBuffer(srcSize); + + while (src < srcEnd) { + int numCharsConverted; + unsigned char* dstBegin = reinterpret_cast<unsigned char*>(dstBuffer.data()); + unsigned char* dst = dstBegin; + int dstSize = dstBuffer.size() * sizeof(UChar); + + code = ICharsetConv_CharsetConvert(m_charsetConverter, &src, &srcSize, &dst, &dstSize, &numCharsConverted); + ASSERT(code != AEE_ENOSUCH); + + if (code == AEE_EBUFFERTOOSMALL) { + // Increase the buffer and try it again. + dstBuffer.grow(dstBuffer.size() * 2); + continue; + } + + if (code == AEE_EBADITEM) { + sawError = true; + if (stopOnError) { + result.append(L'?'); + break; + } + + src++; + } + + if (code == AEE_EINCOMPLETEITEM) { + if (flush) { + LOG_ERROR("Partial bytes at end of input while flush requested."); + sawError = true; + return String(); + } + + m_numBufferedBytes = srcEnd - src; + memcpy(m_bufferedBytes, src, m_numBufferedBytes); + break; + } + + int numChars = (dst - dstBegin) / sizeof(UChar); + if (numChars > 0) + result.append(dstBuffer.data(), numChars); + } + + return String::adopt(result); +} + +CString TextCodecBrew::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + if (!length) + return ""; + + unsigned int replacementCharacter = '?'; + + // FIXME: Impossible to handle EntitiesForUnencodables or URLEncodedEntitiesForUnencodables with ICharsetConv. 
+ int code = ICharsetConv_Initialize(m_charsetConverter, m_internalEncodingName, m_encoding.name(), replacementCharacter); + ASSERT(code == AEE_SUCCESS); + + Vector<char> result; + + int srcSize = length * sizeof(UChar); + unsigned char* srcBegin = const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(characters)); + unsigned char* src = srcBegin; + unsigned char* srcEnd = srcBegin + srcSize; + + Vector<unsigned char> dstBuffer(length * sizeof(UChar)); + + while (src < srcEnd) { + int numCharsConverted; + unsigned char* dstBegin = dstBuffer.data(); + unsigned char* dst = dstBegin; + int dstSize = dstBuffer.size(); + + code = ICharsetConv_CharsetConvert(m_charsetConverter, &src, &srcSize, &dst, &dstSize, &numCharsConverted); + ASSERT(code != AEE_EINCOMPLETEITEM); + + if (code == AEE_ENOSUCH) { + LOG_ERROR("Conversion error, Code=%d", code); + return CString(); + } + + if (code == AEE_EBUFFERTOOSMALL) { + // Increase the buffer and try it again. + dstBuffer.grow(dstBuffer.size() * 2); + continue; + } + + if (code == AEE_EBADITEM) + src += sizeof(UChar); // Skip the invalid character + + int numBytes = dst - dstBegin; + if (numBytes > 0) + result.append(dstBuffer.data(), numBytes); + } + + return CString(result.data(), result.size()); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/brew/TextCodecBrew.h b/Source/WebCore/platform/text/brew/TextCodecBrew.h new file mode 100644 index 0000000..97e2c87 --- /dev/null +++ b/Source/WebCore/platform/text/brew/TextCodecBrew.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2010 Company 100, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TextCodecBrew_h
+#define TextCodecBrew_h
+
+#include "TextCodec.h"
+#include "TextEncoding.h"
+
+typedef struct ICharsetConv ICharsetConv;
+
+namespace WebCore {
+
+// TextCodec implementation backed by an ICharsetConv instance.
+class TextCodecBrew : public TextCodec {
+public:
+    static void registerBaseEncodingNames(EncodingNameRegistrar);
+    static void registerBaseCodecs(TextCodecRegistrar);
+
+    static void registerExtendedEncodingNames(EncodingNameRegistrar);
+    static void registerExtendedCodecs(TextCodecRegistrar);
+
+    TextCodecBrew(const TextEncoding&);
+    virtual ~TextCodecBrew();
+
+    virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError);
+    virtual CString encode(const UChar*, size_t length, UnencodableHandling);
+
+private:
+    // Members are initialized in this declaration order, regardless of the
+    // order written in a constructor's initializer list.
+    TextEncoding m_encoding;
+    size_t m_numBufferedBytes; // number of valid bytes in m_bufferedBytes
+    unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character
+    ICharsetConv* m_charsetConverter;
+
+    // Name of the encoding used on the UChar side of every conversion.
+    static const char* m_internalEncodingName;
+};
+
+} // namespace WebCore
+
+#endif // TextCodecBrew_h
diff --git a/Source/WebCore/platform/text/cf/HyphenationCF.cpp b/Source/WebCore/platform/text/cf/HyphenationCF.cpp
new file mode 100644
index 0000000..3adacad
--- /dev/null
+++ b/Source/WebCore/platform/text/cf/HyphenationCF.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "config.h" +#include "Hyphenation.h" + +#if !defined(BUILDING_ON_TIGER) && !defined(BUILDING_ON_LEOPARD) && !defined(BUILDING_ON_SNOW_LEOPARD) + +#include "AtomicStringKeyedMRUCache.h" +#include "TextBreakIteratorInternalICU.h" +#include <wtf/ListHashSet.h> +#include <wtf/RetainPtr.h> + +namespace WebCore { + +#if !PLATFORM(WIN) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) + +template<> +RetainPtr<CFLocaleRef> AtomicStringKeyedMRUCache<RetainPtr<CFLocaleRef> >::createValueForNullKey() +{ + RetainPtr<CFLocaleRef> locale(AdoptCF, CFLocaleCopyCurrent()); + + return CFStringIsHyphenationAvailableForLocale(locale.get()) ? locale : 0; +} + +template<> +RetainPtr<CFLocaleRef> AtomicStringKeyedMRUCache<RetainPtr<CFLocaleRef> >::createValueForKey(const AtomicString& localeIdentifier) +{ + RetainPtr<CFStringRef> cfLocaleIdentifier(AdoptCF, localeIdentifier.createCFString()); + RetainPtr<CFLocaleRef> locale(AdoptCF, CFLocaleCreate(kCFAllocatorDefault, cfLocaleIdentifier.get())); + + return CFStringIsHyphenationAvailableForLocale(locale.get()) ? locale : 0; +} + +static AtomicStringKeyedMRUCache<RetainPtr<CFLocaleRef> >& cfLocaleCache() +{ + DEFINE_STATIC_LOCAL(AtomicStringKeyedMRUCache<RetainPtr<CFLocaleRef> >, cache, ()); + return cache; +} + +bool canHyphenate(const AtomicString& localeIdentifier) +{ + return cfLocaleCache().get(localeIdentifier); +} + +size_t lastHyphenLocation(const UChar* characters, size_t length, size_t beforeIndex, const AtomicString& localeIdentifier) +{ + RetainPtr<CFStringRef> string(AdoptCF, CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, reinterpret_cast<const UniChar*>(characters), length, kCFAllocatorNull)); + + RetainPtr<CFLocaleRef> locale = cfLocaleCache().get(localeIdentifier); + ASSERT(locale); + + CFIndex result = CFStringGetHyphenationLocationBeforeIndex(string.get(), beforeIndex, CFRangeMake(0, length), 0, locale.get(), 0); + return result == kCFNotFound ? 
0 : result; +} + +#else + +bool canHyphenate(const AtomicString&) +{ + return false; +} + +size_t lastHyphenLocation(const UChar*, size_t, size_t, const AtomicString&) +{ + ASSERT_NOT_REACHED(); + return 0; +} + +#endif // PLATFORM(WIN) && (!defined(MAC_OS_X_VERSION_10_7) || MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7) + +} // namespace WebCore + +#endif // !defined(BUILDING_ON_TIGER) && !defined(BUILDING_ON_LEOPARD) && !defined(BUILDING_ON_SNOW_LEOPARD) diff --git a/Source/WebCore/platform/text/cf/StringCF.cpp b/Source/WebCore/platform/text/cf/StringCF.cpp new file mode 100644 index 0000000..dcaf8fb --- /dev/null +++ b/Source/WebCore/platform/text/cf/StringCF.cpp @@ -0,0 +1,55 @@ +/** + * Copyright (C) 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "PlatformString.h" + +#if PLATFORM(CF) + +#include <CoreFoundation/CoreFoundation.h> + +namespace WTF { + +String::String(CFStringRef str) +{ + if (!str) + return; + + CFIndex size = CFStringGetLength(str); + if (size == 0) + m_impl = StringImpl::empty(); + else { + Vector<UChar, 1024> buffer(size); + CFStringGetCharacters(str, CFRangeMake(0, size), (UniChar*)buffer.data()); + m_impl = StringImpl::create(buffer.data(), size); + } +} + +CFStringRef String::createCFString() const +{ + if (!m_impl) + return static_cast<CFStringRef>(CFRetain(CFSTR(""))); + + return m_impl->createCFString(); +} + +} + +#endif // PLATFORM(CF) diff --git a/Source/WebCore/platform/text/cf/StringImplCF.cpp b/Source/WebCore/platform/text/cf/StringImplCF.cpp new file mode 100644 index 0000000..0157918 --- /dev/null +++ b/Source/WebCore/platform/text/cf/StringImplCF.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2006, 2009 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include <wtf/text/StringImpl.h> + +#if PLATFORM(CF) + +#include <CoreFoundation/CoreFoundation.h> +#include <wtf/MainThread.h> +#include <wtf/PassRefPtr.h> +#include <wtf/Threading.h> + +#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER) +#include <objc/objc-auto.h> +#endif + +namespace WTF { + +namespace StringWrapperCFAllocator { + + static StringImpl* currentString; + + static const void* retain(const void* info) + { + return info; + } + + static void release(const void*) + { + ASSERT_NOT_REACHED(); + } + + static CFStringRef copyDescription(const void*) + { + return CFSTR("WTF::String-based allocator"); + } + + static void* allocate(CFIndex size, CFOptionFlags, void*) + { + StringImpl* underlyingString = 0; + if (isMainThread()) { + underlyingString = currentString; + if (underlyingString) { + currentString = 0; + underlyingString->ref(); // Balanced by call to deref in deallocate below. + } + } + StringImpl** header = static_cast<StringImpl**>(fastMalloc(sizeof(StringImpl*) + size)); + *header = underlyingString; + return header + 1; + } + + static void* reallocate(void* pointer, CFIndex newSize, CFOptionFlags, void*) + { + size_t newAllocationSize = sizeof(StringImpl*) + newSize; + StringImpl** header = static_cast<StringImpl**>(pointer) - 1; + ASSERT(!*header); + header = static_cast<StringImpl**>(fastRealloc(header, newAllocationSize)); + return header + 1; + } + + static void deallocateOnMainThread(void* headerPointer) + { + StringImpl** header = static_cast<StringImpl**>(headerPointer); + StringImpl* underlyingString = *header; + ASSERT(underlyingString); + underlyingString->deref(); // Balanced by call to ref in allocate above. 
+ fastFree(header); + } + + static void deallocate(void* pointer, void*) + { + StringImpl** header = static_cast<StringImpl**>(pointer) - 1; + StringImpl* underlyingString = *header; + if (!underlyingString) + fastFree(header); + else { + if (!isMainThread()) + callOnMainThread(deallocateOnMainThread, header); + else { + underlyingString->deref(); // Balanced by call to ref in allocate above. + fastFree(header); + } + } + } + + static CFIndex preferredSize(CFIndex size, CFOptionFlags, void*) + { + // FIXME: If FastMalloc provided a "good size" callback, we'd want to use it here. + // Note that this optimization would help performance for strings created with the + // allocator that are mutable, and those typically are only created by callers who + // make a new string using the old string's allocator, such as some of the call + // sites in CFURL. + return size; + } + + static CFAllocatorRef create() + { +#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER) + // Since garbage collection isn't compatible with custom allocators, don't use this at all when garbage collection is active. + if (objc_collectingEnabled()) + return 0; +#endif + CFAllocatorContext context = { 0, 0, retain, release, copyDescription, allocate, reallocate, deallocate, preferredSize }; + return CFAllocatorCreate(0, &context); + } + + static CFAllocatorRef allocator() + { + static CFAllocatorRef allocator = create(); + return allocator; + } + +} + +CFStringRef StringImpl::createCFString() +{ + CFAllocatorRef allocator = (m_length && isMainThread()) ? StringWrapperCFAllocator::allocator() : 0; + if (!allocator) + return CFStringCreateWithCharacters(0, reinterpret_cast<const UniChar*>(m_data), m_length); + + // Put pointer to the StringImpl in a global so the allocator can store it with the CFString. 
+ ASSERT(!StringWrapperCFAllocator::currentString); + StringWrapperCFAllocator::currentString = this; + + CFStringRef string = CFStringCreateWithCharactersNoCopy(allocator, reinterpret_cast<const UniChar*>(m_data), m_length, kCFAllocatorNull); + + // The allocator cleared the global when it read it, but also clear it here just in case. + ASSERT(!StringWrapperCFAllocator::currentString); + StringWrapperCFAllocator::currentString = 0; + + return string; +} + +// On StringImpl creation we could check if the allocator is the StringWrapperCFAllocator. +// If it is, then we could find the original StringImpl and just return that. But to +// do that we'd have to compute the offset from CFStringRef to the allocated block; +// the CFStringRef is *not* at the start of an allocated block. Testing shows 1000x +// more calls to createCFString than calls to the create functions with the appropriate +// allocator, so it's probably not urgent to optimize that case. + +} + +#endif // PLATFORM(CF) diff --git a/Source/WebCore/platform/text/chromium/TextBreakIteratorInternalICUChromium.cpp b/Source/WebCore/platform/text/chromium/TextBreakIteratorInternalICUChromium.cpp new file mode 100644 index 0000000..e390a65 --- /dev/null +++ b/Source/WebCore/platform/text/chromium/TextBreakIteratorInternalICUChromium.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * Copyright (C) 2008, 2009 Google Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +#include "Language.h" +#include "PlatformString.h" +#include <wtf/StdLibExtras.h> +#include <wtf/text/CString.h> + +namespace WebCore { + +static const char* UILanguage() +{ + // Chrome's UI language can be different from the OS UI language on Windows. + // We want to return Chrome's UI language here. + DEFINE_STATIC_LOCAL(CString, locale, (defaultLanguage().latin1())); + return locale.data(); +} + +const char* currentSearchLocaleID() +{ + return UILanguage(); +} + +const char* currentTextBreakLocaleID() +{ + return UILanguage(); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/efl/TextBreakIteratorInternalICUEfl.cpp b/Source/WebCore/platform/text/efl/TextBreakIteratorInternalICUEfl.cpp new file mode 100644 index 0000000..0056869 --- /dev/null +++ b/Source/WebCore/platform/text/efl/TextBreakIteratorInternalICUEfl.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2007 Alp Toker <alp@atoker.com> + * Copyright (C) 2009-2010 ProFUSION embedded systems + * Copyright (C) 2009-2010 Samsung Electronics + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +namespace WebCore { + +const char* currentSearchLocaleID() +{ + // FIXME: Should use system locale. + return ""; +} + +const char* currentTextBreakLocaleID() +{ + return "en_us"; +} + +} diff --git a/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp b/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp new file mode 100644 index 0000000..990e331 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextBreakIteratorGtk.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2007 Apple Inc. All rights reserved. + * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> + * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> + * Copyright (C) 2010 Igalia S.L. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" + +#include "TextBreakIterator.h" + +#include "GOwnPtr.h" +#include <pango/pango.h> +using namespace std; + +#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) + +namespace WebCore { + +class CharacterIterator { +public: + bool setText(const UChar* string, int length); + const gchar* getText() { return m_utf8.get(); } + int getLength() { return m_length; } + glong getSize() { return m_size; } + void setIndex(int index); + int getIndex() { return m_index; } + void setUTF16Index(int index); + int getUTF16Index() { return m_utf16Index; } + int getUTF16Length() { return m_utf16Length; } + int first(); + int last(); + int next(); + int previous(); +private: + int characterSize(int index); + + GOwnPtr<char> m_utf8; + int m_length; + long m_size; + int m_index; + int m_utf16Index; + int m_utf16Length; +}; + +int CharacterIterator::characterSize(int index) +{ + if (index == m_length || index < 0) + return 0; + if (m_length == m_utf16Length) + return 1; + + gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index); + gunichar character = g_utf8_get_char(indexPtr); + return UTF8_IS_SURROGATE(character) ? 
2 : 1; +} + +bool CharacterIterator::setText(const UChar* string, int length) +{ + long utf8Size = 0; + m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0)); + if (!utf8Size) + return false; + + m_utf16Length = length; + m_length = g_utf8_strlen(m_utf8.get(), utf8Size); + m_size = utf8Size; + m_index = 0; + m_utf16Index = 0; + + return true; +} + +void CharacterIterator::setIndex(int index) +{ + if (index == m_index) + return; + if (index <= 0) + m_index = m_utf16Index = 0; + else if (index >= m_length) { + m_index = m_length; + m_utf16Index = m_utf16Length; + } else if (m_length == m_utf16Length) + m_index = m_utf16Index = index; + else { + m_index = index; + int utf16Index = 0; + int utf8Index = 0; + while (utf8Index < index) { + utf16Index += characterSize(utf8Index); + utf8Index++; + } + m_utf16Index = utf16Index; + } +} + +void CharacterIterator::setUTF16Index(int index) +{ + if (index == m_utf16Index) + return; + if (index <= 0) + m_utf16Index = m_index = 0; + else if (index >= m_utf16Length) { + m_utf16Index = m_utf16Length; + m_index = m_length; + } else if (m_length == m_utf16Length) + m_utf16Index = m_index = index; + else { + m_utf16Index = index; + int utf16Index = 0; + int utf8Index = 0; + while (utf16Index < index) { + utf16Index += characterSize(utf8Index); + utf8Index++; + } + m_index = utf8Index; + } +} + +int CharacterIterator::first() +{ + m_index = m_utf16Index = 0; + return m_index; +} + +int CharacterIterator::last() +{ + m_index = m_length; + m_utf16Index = m_utf16Length; + return m_index; +} + +int CharacterIterator::next() +{ + int next = m_index + 1; + + if (next <= m_length) { + m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length); + m_index = next; + } else { + m_index = TextBreakDone; + m_utf16Index = TextBreakDone; + } + + return m_index; +} + +int CharacterIterator::previous() +{ + int previous = m_index - 1; + + if (previous >= 0) { + m_utf16Index = max(m_utf16Index - characterSize(previous), 0); + m_index 
= previous; + } else { + m_index = TextBreakDone; + m_utf16Index = TextBreakDone; + } + + return m_index; +} + +enum UBreakIteratorType { + UBRK_CHARACTER, + UBRK_WORD, + UBRK_LINE, + UBRK_SENTENCE +}; + +class TextBreakIterator { +public: + UBreakIteratorType m_type; + PangoLogAttr* m_logAttrs; + CharacterIterator m_charIterator; +}; + +static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, + UBreakIteratorType type, const UChar* string, int length) +{ + if (!string) + return 0; + + if (!createdIterator) { + iterator = new TextBreakIterator(); + createdIterator = true; + } + if (!iterator) + return 0; + + if (!iterator->m_charIterator.setText(string, length)) + return 0; + + int charLength = iterator->m_charIterator.getLength(); + + iterator->m_type = type; + if (createdIterator) + g_free(iterator->m_logAttrs); + iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1); + pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), + -1, 0, iterator->m_logAttrs, charLength + 1); + + return iterator; +} + +TextBreakIterator* characterBreakIterator(const UChar* string, int length) +{ + static bool createdCharacterBreakIterator = false; + static TextBreakIterator* staticCharacterBreakIterator; + return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length); +} + +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ + // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version. 
+ return characterBreakIterator(string, length); +} + +TextBreakIterator* wordBreakIterator(const UChar* string, int length) +{ + static bool createdWordBreakIterator = false; + static TextBreakIterator* staticWordBreakIterator; + return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length); +} + +TextBreakIterator* lineBreakIterator(const UChar* string, int length) +{ + static bool createdLineBreakIterator = false; + static TextBreakIterator* staticLineBreakIterator; + return setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); +} + +TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) +{ + static bool createdSentenceBreakIterator = false; + static TextBreakIterator* staticSentenceBreakIterator; + return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); +} + +int textBreakFirst(TextBreakIterator* iterator) +{ + iterator->m_charIterator.first(); + return iterator->m_charIterator.getUTF16Index(); +} + +int textBreakLast(TextBreakIterator* iterator) +{ + // TextBreakLast is not meant to find just any break according to bi->m_type + // but really the one near the last character. + // (cmp ICU documentation for ubrk_first and ubrk_last) + // From ICU docs for ubrk_last: + // "Determine the index immediately beyond the last character in the text being scanned." + + // So we should advance or traverse back based on bi->m_logAttrs cursor positions. + // If last character position in the original string is a whitespace, + // traverse to the left until the first non-white character position is found + // and return the position of the first white-space char after this one. + // Otherwise return m_length, as "the first character beyond the last" is outside our string. 
+ + bool whiteSpaceAtTheEnd = true; + int nextWhiteSpacePos = iterator->m_charIterator.getLength(); + + int pos = iterator->m_charIterator.last(); + while (pos >= 0 && whiteSpaceAtTheEnd) { + if (iterator->m_logAttrs[pos].is_cursor_position) { + if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white) + nextWhiteSpacePos = pos; + } + pos = iterator->m_charIterator.previous(); + } + iterator->m_charIterator.setIndex(nextWhiteSpacePos); + return iterator->m_charIterator.getUTF16Index(); +} + +int textBreakNext(TextBreakIterator* iterator) +{ + while (iterator->m_charIterator.next() != TextBreakDone) { + int index = iterator->m_charIterator.getIndex(); + + // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €, + // are not marked as word_start & word_end as opposed to the way ICU does it. + // This leads to - for example - different word selection behaviour when right clicking. + + if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) + || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) + || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) + || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { + break; + } + } + return iterator->m_charIterator.getUTF16Index(); +} + +int textBreakPrevious(TextBreakIterator* iterator) +{ + while (iterator->m_charIterator.previous() != TextBreakDone) { + int index = iterator->m_charIterator.getIndex(); + + if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) + || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) + || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) + || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { + 
break; + } + } + return iterator->m_charIterator.getUTF16Index(); +} + +int textBreakPreceding(TextBreakIterator* iterator, int offset) +{ + if (offset > iterator->m_charIterator.getUTF16Length()) + return TextBreakDone; + if (offset < 0) + return 0; + iterator->m_charIterator.setUTF16Index(offset); + return textBreakPrevious(iterator); +} + +int textBreakFollowing(TextBreakIterator* iterator, int offset) +{ + if (offset > iterator->m_charIterator.getUTF16Length()) + return TextBreakDone; + if (offset < 0) + return 0; + iterator->m_charIterator.setUTF16Index(offset); + return textBreakNext(iterator); +} + +int textBreakCurrent(TextBreakIterator* iterator) +{ + return iterator->m_charIterator.getUTF16Index(); +} + +bool isTextBreak(TextBreakIterator* iterator, int offset) +{ + if (!offset) + return true; + if (offset > iterator->m_charIterator.getUTF16Length()) + return false; + + iterator->m_charIterator.setUTF16Index(offset); + + int index = iterator->m_charIterator.getIndex(); + iterator->m_charIterator.previous(); + textBreakNext(iterator); + return iterator->m_charIterator.getIndex() == index; +} + +} diff --git a/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp b/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp new file mode 100644 index 0000000..35e5a05 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextBreakIteratorInternalICUGtk.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2007 Alp Toker <alp@atoker.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +namespace WebCore { + +const char* currentSearchLocaleID() +{ + // FIXME: Should use system locale. + return ""; +} + +const char* currentTextBreakLocaleID() +{ + // FIXME: Should use system locale. + return "en_us"; +} + +} diff --git a/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp b/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp new file mode 100644 index 0000000..c5bd7e8 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextCodecGtk.cpp @@ -0,0 +1,578 @@ +/* + * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> + * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecGtk.h" + +#include <gio/gio.h> +#include "GOwnPtr.h" +#include "Logging.h" +#include "PlatformString.h" +#include <wtf/Assertions.h> +#include <wtf/HashMap.h> +#include <wtf/text/CString.h> + +using std::min; + +namespace WebCore { + +// TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380). +// That's why we need to avoid generating extra BOM's for the conversion result. +// This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib. 
+ +#if (G_BYTE_ORDER == G_BIG_ENDIAN) +static const gchar* internalEncodingName = "UTF-16BE"; +#else +static const gchar* internalEncodingName = "UTF-16LE"; +#endif + + +const size_t ConversionBufferSize = 16384; + + +static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*) +{ + return new TextCodecGtk(encoding); +} + +static bool isEncodingAvailable(const gchar* encodingName) +{ + GIConv tester; + // test decoding + tester = g_iconv_open(internalEncodingName, encodingName); + if (tester == reinterpret_cast<GIConv>(-1)) { + return false; + } else { + g_iconv_close(tester); + // test encoding + tester = g_iconv_open(encodingName, internalEncodingName); + if (tester == reinterpret_cast<GIConv>(-1)) { + return false; + } else { + g_iconv_close(tester); + return true; + } + } +} + +static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName) +{ + if (isEncodingAvailable(canonicalName)) { + registrar(canonicalName, canonicalName); + return true; + } + + return false; +} + +static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName) +{ + if (isEncodingAvailable(aliasName)) + registrar(aliasName, canonicalName); +} + +static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName) +{ + if (isEncodingAvailable(codecName)) + registrar(codecName, newTextCodecGtk, 0); +} + +void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar) +{ + // Unicode + registerEncodingNameIfAvailable(registrar, "UTF-8"); + registerEncodingNameIfAvailable(registrar, "UTF-32"); + registerEncodingNameIfAvailable(registrar, "UTF-32BE"); + registerEncodingNameIfAvailable(registrar, "UTF-32LE"); + + // Western + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819"); + 
registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1"); + } +} + +void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar) +{ + // Unicode + registerCodecIfAvailable(registrar, "UTF-8"); + registerCodecIfAvailable(registrar, "UTF-32"); + registerCodecIfAvailable(registrar, "UTF-32BE"); + registerCodecIfAvailable(registrar, "UTF-32LE"); + + // Western + registerCodecIfAvailable(registrar, "ISO-8859-1"); +} + +void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar) +{ + // Western + if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) { + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH"); + registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH"); + } + + // Japanese + if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) { + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS"); + registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS"); + } + if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) { + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"); + registerEncodingAliasIfAvailable(registrar, "EUC-JP", 
"CSEUCPKDFMTJAPANESE"); + } + registerEncodingNameIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + if (registerEncodingNameIfAvailable(registrar, "BIG5")) { + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5"); + registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5"); + } + if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) { + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004"); + registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS"); + } + registerEncodingNameIfAvailable(registrar, "CP950"); + + // Korean + if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR")) + registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR"); + if (registerEncodingNameIfAvailable(registrar, "CP949")) + registerEncodingAliasIfAvailable(registrar, "CP949", "UHC"); + if (registerEncodingNameIfAvailable(registrar, "EUC-KR")) + registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR"); + + // Arabic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC"); + } + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix 
/fast/encoding/ahram-org-eg.html test case + if (registerEncodingNameIfAvailable(registrar, "windows-1256")) { + registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256"); + registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB"); + } + + // Hebrew + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW"); + } + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + if (registerEncodingNameIfAvailable(registrar, "windows-1255")) { + registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255"); + registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR"); + } + + // Greek + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003"); + registerEncodingAliasIfAvailable(registrar, 
"ISO-8859-7", "CSI"); + } + if (registerEncodingNameIfAvailable(registrar, "CP869")) { + registerEncodingAliasIfAvailable(registrar, "CP869", "869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR"); + registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869"); + registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869"); + } + registerEncodingNameIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC"); + } + if (registerEncodingNameIfAvailable(registrar, "KOI8-R")) + registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R"); + if (registerEncodingNameIfAvailable(registrar, "CP866")) { + registerEncodingAliasIfAvailable(registrar, "CP866", "866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866"); + registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866"); + } + registerEncodingNameIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + if (registerEncodingNameIfAvailable(registrar, "windows-1251")) + registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251"); + if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) { + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC"); + registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic"); + } + + // Thai + if (registerEncodingNameIfAvailable(registrar, "CP874")) + registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874"); + 
registerEncodingNameIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerEncodingNameIfAvailable(registrar, "GBK"); + if (registerEncodingNameIfAvailable(registrar, "HZ")) + registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312"); + registerEncodingNameIfAvailable(registrar, "GB18030"); + if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) { + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312"); + registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN"); + } + if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) { + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0"); + registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58"); + } + + // Central European + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2"); + } + if (registerEncodingNameIfAvailable(registrar, "CP1250")) { + registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE"); + registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250"); + } + registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + if 
(registerEncodingNameIfAvailable(registrar, "CP1258")) + registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258"); + + // Turkish + if (registerEncodingNameIfAvailable(registrar, "CP1254")) { + registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK"); + registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5"); + } + + // Baltic + if (registerEncodingNameIfAvailable(registrar, "CP1257")) { + registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM"); + registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257"); + } + if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) { + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4"); + registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4"); + } +} + +void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar) +{ + // Western + registerCodecIfAvailable(registrar, "MACROMAN"); + + // Japanese + registerCodecIfAvailable(registrar, "Shift_JIS"); + 
registerCodecIfAvailable(registrar, "EUC-JP"); + registerCodecIfAvailable(registrar, "ISO-2022-JP"); + + // Traditional Chinese + registerCodecIfAvailable(registrar, "BIG5"); + registerCodecIfAvailable(registrar, "BIG5-HKSCS"); + registerCodecIfAvailable(registrar, "CP950"); + + // Korean + registerCodecIfAvailable(registrar, "ISO-2022-KR"); + registerCodecIfAvailable(registrar, "CP949"); + registerCodecIfAvailable(registrar, "EUC-KR"); + + // Arabic + registerCodecIfAvailable(registrar, "ISO-8859-6"); + // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case + registerCodecIfAvailable(registrar, "windows-1256"); + + // Hebrew + registerCodecIfAvailable(registrar, "ISO-8859-8"); + // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html + registerCodecIfAvailable(registrar, "windows-1255"); + + // Greek + registerCodecIfAvailable(registrar, "ISO-8859-7"); + registerCodecIfAvailable(registrar, "CP869"); + registerCodecIfAvailable(registrar, "WINDOWS-1253"); + + // Cyrillic + registerCodecIfAvailable(registrar, "ISO-8859-5"); + registerCodecIfAvailable(registrar, "KOI8-R"); + registerCodecIfAvailable(registrar, "CP866"); + registerCodecIfAvailable(registrar, "KOI8-U"); + // CP1251 added to pass /fast/encoding/charset-cp1251.html + registerCodecIfAvailable(registrar, "windows-1251"); + registerCodecIfAvailable(registrar, "mac-cyrillic"); + + // Thai + registerCodecIfAvailable(registrar, "CP874"); + registerCodecIfAvailable(registrar, "TIS-620"); + + // Simplified Chinese + registerCodecIfAvailable(registrar, "GBK"); + registerCodecIfAvailable(registrar, "HZ"); + registerCodecIfAvailable(registrar, "GB18030"); + registerCodecIfAvailable(registrar, "EUC-CN"); + registerCodecIfAvailable(registrar, "GB_2312-80"); + + // Central European + registerCodecIfAvailable(registrar, "ISO-8859-2"); + registerCodecIfAvailable(registrar, "CP1250"); + 
registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE"); + + // Vietnamese + registerCodecIfAvailable(registrar, "CP1258"); + + // Turkish + registerCodecIfAvailable(registrar, "CP1254"); + registerCodecIfAvailable(registrar, "ISO-8859-9"); + + // Baltic + registerCodecIfAvailable(registrar, "CP1257"); + registerCodecIfAvailable(registrar, "ISO-8859-4"); +} + +TextCodecGtk::TextCodecGtk(const TextEncoding& encoding) + : m_encoding(encoding) + , m_numBufferedBytes(0) +{ +} + +TextCodecGtk::~TextCodecGtk() +{ +} + +void TextCodecGtk::createIConvDecoder() const +{ + ASSERT(!m_iconvDecoder); + + m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0)); +} + +void TextCodecGtk::createIConvEncoder() const +{ + ASSERT(!m_iconvEncoder); + + m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0)); +} + +String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + // Get a converter for the passed-in encoding. + if (!m_iconvDecoder) + createIConvDecoder(); + if (!m_iconvDecoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); + return String(); + } + + Vector<UChar> result; + + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = bytes; + gsize inputLength = length; + gchar buffer[ConversionBufferSize]; + int flags = !length ? 
G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS; + if (flush) + flags |= G_CONVERTER_FLUSH; + + bool bufferWasFull = false; + char* prefixedBytes = 0; + + if (m_numBufferedBytes) { + inputLength = length + m_numBufferedBytes; + prefixedBytes = static_cast<char*>(fastMalloc(inputLength)); + memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes); + memcpy(prefixedBytes + m_numBufferedBytes, bytes, length); + + input = prefixedBytes; + + // all buffered bytes are consumed now + m_numBufferedBytes = 0; + } + + do { + GOwnPtr<GError> error; + GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()), + input, inputLength, + buffer, sizeof(buffer), + static_cast<GConverterFlags>(flags), + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + + if (res == G_CONVERTER_ERROR) { + if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) { + // There is not enough input to fully determine what the conversion should produce, + // save it to a buffer to prepend it to the next input. + memcpy(m_bufferedBytes, input, inputLength); + m_numBufferedBytes = inputLength; + inputLength = 0; + } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE)) + bufferWasFull = true; + else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + if (stopOnError) + sawError = true; + if (inputLength) { + // Ignore invalid character. + input += 1; + inputLength -= 1; + } + } else { + sawError = true; + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); + m_numBufferedBytes = 0; // Reset state for subsequent calls to decode. 
+ fastFree(prefixedBytes); + return String(); + } + } + + result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar)); + } while ((inputLength || bufferWasFull) && !sawError); + + fastFree(prefixedBytes); + + return String::adopt(result); +} + +CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + if (!length) + return ""; + + if (!m_iconvEncoder) + createIConvEncoder(); + if (!m_iconvEncoder) { + LOG_ERROR("Error creating IConv encoder even though encoding was in table."); + return CString(); + } + + gsize bytesRead = 0; + gsize bytesWritten = 0; + const gchar* input = reinterpret_cast<const char*>(characters); + gsize inputLength = length * sizeof(UChar); + gchar buffer[ConversionBufferSize]; + Vector<char> result; + GOwnPtr<GError> error; + + size_t size = 0; + do { + g_converter_convert(G_CONVERTER(m_iconvEncoder.get()), + input, inputLength, + buffer, sizeof(buffer), + G_CONVERTER_INPUT_AT_END, + &bytesRead, &bytesWritten, + &error.outPtr()); + input += bytesRead; + inputLength -= bytesRead; + if (bytesWritten > 0) { + result.grow(size + bytesWritten); + memcpy(result.data() + size, buffer, bytesWritten); + size += bytesWritten; + } + + if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + UChar codePoint = reinterpret_cast<const UChar*>(input)[0]; + UnencodableReplacementArray replacement; + int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement); + + // Consume the invalid character. + input += sizeof(UChar); + inputLength -= sizeof(UChar); + + // Append replacement string to result buffer. 
+ result.grow(size + replacementLength); + memcpy(result.data() + size, replacement, replacementLength); + size += replacementLength; + + error.clear(); + } + } while (inputLength && !error.get()); + + if (error) { + LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); + return CString(); + } + + return CString(result.data(), size); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/gtk/TextCodecGtk.h b/Source/WebCore/platform/text/gtk/TextCodecGtk.h new file mode 100644 index 0000000..bb3a445 --- /dev/null +++ b/Source/WebCore/platform/text/gtk/TextCodecGtk.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * Copyright (C) 2008 JĂĽrg Billeter <j@bitron.ch> + * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodecGTK_h +#define TextCodecGTK_h + +#include "GRefPtr.h" +#include <glib.h> +#include "TextCodec.h" +#include "TextEncoding.h" + +namespace WebCore { + + class TextCodecGtk : public TextCodec { + public: + static void registerBaseEncodingNames(EncodingNameRegistrar); + static void registerBaseCodecs(TextCodecRegistrar); + + static void registerExtendedEncodingNames(EncodingNameRegistrar); + static void registerExtendedCodecs(TextCodecRegistrar); + + TextCodecGtk(const TextEncoding&); + virtual ~TextCodecGtk(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + void createIConvDecoder() const; + void createIConvEncoder() const; + + TextEncoding m_encoding; + size_t m_numBufferedBytes; + unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character + mutable GRefPtr<GCharsetConverter> m_iconvDecoder; + mutable GRefPtr<GCharsetConverter> m_iconvEncoder; + }; + +} // namespace WebCore + +#endif // TextCodecGTK_h diff --git a/Source/WebCore/platform/text/haiku/TextBreakIteratorInternalICUHaiku.cpp b/Source/WebCore/platform/text/haiku/TextBreakIteratorInternalICUHaiku.cpp new file mode 100644 index 0000000..8bb8c70 --- /dev/null +++ b/Source/WebCore/platform/text/haiku/TextBreakIteratorInternalICUHaiku.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +#include "NotImplemented.h" + + +namespace WebCore { + +const char* currentSearchLocaleID() +{ + notImplemented(); + return ""; +} + +const char* currentTextBreakLocaleID() +{ + notImplemented(); + return "en_us"; +} + +} // namespace WebCore + diff --git a/Source/WebCore/platform/text/mac/CharsetData.h b/Source/WebCore/platform/text/mac/CharsetData.h new file mode 100644 index 0000000..458cecb --- /dev/null +++ b/Source/WebCore/platform/text/mac/CharsetData.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2003, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +namespace WebCore { + + #define kTextEncodingISOLatinThai kCFStringEncodingISOLatinThai + + struct CharsetEntry { + const char* name; + ::TextEncoding encoding; + }; + + extern const CharsetEntry CharsetTable[]; + +} diff --git a/Source/WebCore/platform/text/mac/HyphenationMac.mm b/Source/WebCore/platform/text/mac/HyphenationMac.mm new file mode 100644 index 0000000..d5c9283 --- /dev/null +++ b/Source/WebCore/platform/text/mac/HyphenationMac.mm @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#import "config.h" +#import "Hyphenation.h" + +#if defined(BUILDING_ON_TIGER) || defined(BUILDING_ON_LEOPARD) || defined(BUILDING_ON_SNOW_LEOPARD) + +#import "AtomicStringKeyedMRUCache.h" +#import "TextBreakIteratorInternalICU.h" +#import "WebCoreSystemInterface.h" +#import <wtf/RetainPtr.h> + +namespace WebCore { + +template<> +bool AtomicStringKeyedMRUCache<bool>::createValueForNullKey() +{ + return !strcmp(currentSearchLocaleID(), "en"); +} + +template<> +bool AtomicStringKeyedMRUCache<bool>::createValueForKey(const AtomicString& localeIdentifier) +{ + RetainPtr<CFStringRef> cfLocaleIdentifier(AdoptCF, localeIdentifier.createCFString()); + RetainPtr<CFDictionaryRef> components(AdoptCF, CFLocaleCreateComponentsFromLocaleIdentifier(kCFAllocatorDefault, cfLocaleIdentifier.get())); + CFStringRef language = reinterpret_cast<CFStringRef>(CFDictionaryGetValue(components.get(), kCFLocaleLanguageCode)); + static CFStringRef englishLanguage = CFSTR("en"); + return language && CFEqual(language, englishLanguage); +} + +bool canHyphenate(const AtomicString& localeIdentifier) +{ + DEFINE_STATIC_LOCAL(AtomicStringKeyedMRUCache<bool>, isEnglishCache, ()); + return isEnglishCache.get(localeIdentifier); +} + +size_t lastHyphenLocation(const 
UChar* characters, size_t length, size_t beforeIndex, const AtomicString& localeIdentifier) +{ + ASSERT_UNUSED(localeIdentifier, canHyphenate(localeIdentifier)); + + RetainPtr<CFStringRef> string(AdoptCF, CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, characters, length, kCFAllocatorNull)); + return wkGetHyphenationLocationBeforeIndex(string.get(), beforeIndex); +} + +} // namespace WebCore + +#endif // defined(BUILDING_ON_TIGER) || defined(BUILDING_ON_LEOPARD) || defined(BUILDING_ON_SNOW_LEOPARD) diff --git a/Source/WebCore/platform/text/mac/ShapeArabic.c b/Source/WebCore/platform/text/mac/ShapeArabic.c new file mode 100644 index 0000000..dd61ce5 --- /dev/null +++ b/Source/WebCore/platform/text/mac/ShapeArabic.c @@ -0,0 +1,556 @@ +/* +****************************************************************************** +* +* Copyright (C) 2000-2004, International Business Machines +* Corporation and others. All Rights Reserved. +* Copyright (C) 2007 Apple Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy of this +* software and associated documentation files (the "Software"), to deal in the Software +* without restriction, including without limitation the rights to use, copy, modify, +* merge, publish, distribute, and/or sell copies of the Software, and to permit persons +* to whom the Software is furnished to do so, provided that the above copyright notice(s) +* and this permission notice appear in all copies of the Software and that both the above +* copyright notice(s) and this permission notice appear in supporting documentation. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER +* OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR +* CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR +* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +* +* Except as contained in this notice, the name of a copyright holder shall not be used in +* advertising or otherwise to promote the sale, use or other dealings in this Software +* without prior written authorization of the copyright holder. +* +****************************************************************************** +* +* Arabic letter shaping implemented by Ayman Roshdy +*/ + +#include "config.h" + +#if USE(ATSUI) + +#include "ShapeArabic.h" + +#include <stdbool.h> +#include <string.h> +#include <unicode/utypes.h> +#include <unicode/uchar.h> +#include <unicode/ustring.h> +#include <unicode/ushape.h> +#include <wtf/Assertions.h> + +/* + * ### TODO in general for letter shaping: + * - the letter shaping code is UTF-16-unaware; needs update + * + especially invertBuffer()?! 
+ * - needs to handle the "Arabic Tail" that is used in some legacy codepages + * as a glyph fragment of wide-glyph letters + * + IBM Unicode conversion tables map it to U+200B (ZWSP) + * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms + */ + +/* definitions for Arabic letter shaping ------------------------------------ */ + +#define IRRELEVANT 4 +#define LAMTYPE 16 +#define ALEFTYPE 32 +#define LINKR 1 +#define LINKL 2 + +static const UChar IrrelevantPos[] = { + 0x0, 0x2, 0x4, 0x6, + 0x8, 0xA, 0xC, 0xE, +}; + +static const UChar araLink[178]= +{ + 1 + 32 + 256 * 0x11,/*0x0622*/ + 1 + 32 + 256 * 0x13,/*0x0623*/ + 1 + 256 * 0x15,/*0x0624*/ + 1 + 32 + 256 * 0x17,/*0x0625*/ + 1 + 2 + 256 * 0x19,/*0x0626*/ + 1 + 32 + 256 * 0x1D,/*0x0627*/ + 1 + 2 + 256 * 0x1F,/*0x0628*/ + 1 + 256 * 0x23,/*0x0629*/ + 1 + 2 + 256 * 0x25,/*0x062A*/ + 1 + 2 + 256 * 0x29,/*0x062B*/ + 1 + 2 + 256 * 0x2D,/*0x062C*/ + 1 + 2 + 256 * 0x31,/*0x062D*/ + 1 + 2 + 256 * 0x35,/*0x062E*/ + 1 + 256 * 0x39,/*0x062F*/ + 1 + 256 * 0x3B,/*0x0630*/ + 1 + 256 * 0x3D,/*0x0631*/ + 1 + 256 * 0x3F,/*0x0632*/ + 1 + 2 + 256 * 0x41,/*0x0633*/ + 1 + 2 + 256 * 0x45,/*0x0634*/ + 1 + 2 + 256 * 0x49,/*0x0635*/ + 1 + 2 + 256 * 0x4D,/*0x0636*/ + 1 + 2 + 256 * 0x51,/*0x0637*/ + 1 + 2 + 256 * 0x55,/*0x0638*/ + 1 + 2 + 256 * 0x59,/*0x0639*/ + 1 + 2 + 256 * 0x5D,/*0x063A*/ + 0, 0, 0, 0, 0, /*0x063B-0x063F*/ + 1 + 2, /*0x0640*/ + 1 + 2 + 256 * 0x61,/*0x0641*/ + 1 + 2 + 256 * 0x65,/*0x0642*/ + 1 + 2 + 256 * 0x69,/*0x0643*/ + 1 + 2 + 16 + 256 * 0x6D,/*0x0644*/ + 1 + 2 + 256 * 0x71,/*0x0645*/ + 1 + 2 + 256 * 0x75,/*0x0646*/ + 1 + 2 + 256 * 0x79,/*0x0647*/ + 1 + 256 * 0x7D,/*0x0648*/ + 1 + 256 * 0x7F,/*0x0649*/ + 1 + 2 + 256 * 0x81,/*0x064A*/ + 4, 4, 4, 4, /*0x064B-0x064E*/ + 4, 4, 4, 4, /*0x064F-0x0652*/ + 4, 4, 4, 0, 0, /*0x0653-0x0657*/ + 0, 0, 0, 0, /*0x0658-0x065B*/ + 1 + 256 * 0x85,/*0x065C*/ + 1 + 256 * 0x87,/*0x065D*/ + 1 + 256 * 0x89,/*0x065E*/ + 1 + 256 * 0x8B,/*0x065F*/ + 0, 0, 0, 
0, 0, /*0x0660-0x0664*/ + 0, 0, 0, 0, 0, /*0x0665-0x0669*/ + 0, 0, 0, 0, 0, 0, /*0x066A-0x066F*/ + 4, /*0x0670*/ + 0, /*0x0671*/ + 1 + 32, /*0x0672*/ + 1 + 32, /*0x0673*/ + 0, /*0x0674*/ + 1 + 32, /*0x0675*/ + 1, 1, /*0x0676-0x0677*/ + 1+2, /*0x0678*/ + 1+2 + 256 * 0x16,/*0x0679*/ + 1+2 + 256 * 0x0E,/*0x067A*/ + 1+2 + 256 * 0x02,/*0x067B*/ + 1+2, 1+2, /*0x067C-0x067D*/ + 1+2 + 256 * 0x06,/*0x067E*/ + 1+2 + 256 * 0x12,/*0x067F*/ + 1+2 + 256 * 0x0A,/*0x0680*/ + 1+2, 1+2, /*0x0681-0x0682*/ + 1+2 + 256 * 0x26,/*0x0683*/ + 1+2 + 256 * 0x22,/*0x0684*/ + 1+2, /*0x0685*/ + 1+2 + 256 * 0x2A,/*0x0686*/ + 1+2 + 256 * 0x2E,/*0x0687*/ + 1 + 256 * 0x38,/*0x0688*/ + 1, 1, 1, /*0x0689-0x068B*/ + 1 + 256 * 0x34,/*0x068C*/ + 1 + 256 * 0x32,/*0x068D*/ + 1 + 256 * 0x36,/*0x068E*/ + 1, 1, /*0x068F-0x0690*/ + 1 + 256 * 0x3C,/*0x0691*/ + 1, 1, 1, 1, 1, 1, /*0x0692-0x0697*/ + 1 + 256 * 0x3A,/*0x0698*/ + 1, /*0x0699*/ + 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x069A-0x069F*/ + 1+2, 1+2, 1+2, 1+2, /*0x06A0-0x06A3*/ + 1+2 + 256 * 0x2E,/*0x06A4*/ + 1+2, /*0x06A5*/ + 1+2 + 256 * 0x1E,/*0x06A6*/ + 1+2, 1+2, /*0x06A7-0x06A8*/ + 1+2 + 256 * 0x3E,/*0x06A9*/ + 1+2, 1+2, 1+2, /*0x06AA-0x06AC*/ + 1+2 + 256 * 0x83,/*0x06AD*/ + 1+2, /*0x06AE*/ + 1+2 + 256 * 0x42,/*0x06AF*/ + 1+2, /*0x06B0*/ + 1+2 + 256 * 0x4A,/*0x06B1*/ + 1+2, /*0x06B2*/ + 1+2 + 256 * 0x46,/*0x06B3*/ + 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06B4-0x06B9*/ + 1+2, /*0x06BA*/ // FIXME: Seems to have a final form + 1+2 + 256 * 0x50,/*0x06BB*/ + 1+2, 1+2, /*0x06BC-0x06BD*/ + 1+2 + 256 * 0x5A,/*0x06BE*/ + 1+2, /*0x06BF*/ + 1, /*0x06C0*/ + 1+2 + 256 * 0x56,/*0x06C1*/ + 1+2, /*0x06C2*/ + 1, 1, /*0x06C3-0x06C4*/ + 1 + 256 * 0x90,/*0x06C5*/ + 1 + 256 * 0x89,/*0x06C6*/ + 1 + 256 * 0x87,/*0x06C7*/ + 1 + 256 * 0x8B,/*0x06C8*/ + 1 + 256 * 0x92,/*0x06C9*/ + 1, /*0x06CA*/ + 1 + 256 * 0x8E,/*0x06CB*/ + 1+2 + 256 * 0xAC,/*0x06CC*/ + 1, /*0x06CD*/ + 1+2, /*0x06CE*/ + 1, /*0x06CF*/ + 1+2 + 256 * 0x94,/*0x06D0*/ + 1+2, /*0x06D1*/ + 1 + 256 * 0x5E,/*0x06D2*/ + 1 + 256 * 
0x60 /*0x06D3*/ +}; + +static const UChar presLink[141]= +{ + 1 + 2, /*0xFE70*/ + 1 + 2, /*0xFE71*/ + 1 + 2, 0, 1+ 2, 0, 1+ 2, /*0xFE72-0xFE76*/ + 1 + 2, /*0xFE77*/ + 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE78-0xFE81*/ + 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE82-0xFE85*/ + 0, 0 + 32, 1 + 32, 0 + 32, /*0xFE86-0xFE89*/ + 1 + 32, 0, 1, 0 + 32, /*0xFE8A-0xFE8D*/ + 1 + 32, 0, 2, 1 + 2, /*0xFE8E-0xFE91*/ + 1, 0 + 32, 1 + 32, 0, /*0xFE92-0xFE95*/ + 2, 1 + 2, 1, 0, /*0xFE96-0xFE99*/ + 1, 0, 2, 1 + 2, /*0xFE9A-0xFE9D*/ + 1, 0, 2, 1 + 2, /*0xFE9E-0xFEA1*/ + 1, 0, 2, 1 + 2, /*0xFEA2-0xFEA5*/ + 1, 0, 2, 1 + 2, /*0xFEA6-0xFEA9*/ + 1, 0, 2, 1 + 2, /*0xFEAA-0xFEAD*/ + 1, 0, 1, 0, /*0xFEAE-0xFEB1*/ + 1, 0, 1, 0, /*0xFEB2-0xFEB5*/ + 1, 0, 2, 1+2, /*0xFEB6-0xFEB9*/ + 1, 0, 2, 1+2, /*0xFEBA-0xFEBD*/ + 1, 0, 2, 1+2, /*0xFEBE-0xFEC1*/ + 1, 0, 2, 1+2, /*0xFEC2-0xFEC5*/ + 1, 0, 2, 1+2, /*0xFEC6-0xFEC9*/ + 1, 0, 2, 1+2, /*0xFECA-0xFECD*/ + 1, 0, 2, 1+2, /*0xFECE-0xFED1*/ + 1, 0, 2, 1+2, /*0xFED2-0xFED5*/ + 1, 0, 2, 1+2, /*0xFED6-0xFED9*/ + 1, 0, 2, 1+2, /*0xFEDA-0xFEDD*/ + 1, 0, 2, 1+2, /*0xFEDE-0xFEE1*/ + 1, 0 + 16, 2 + 16, 1 + 2 +16, /*0xFEE2-0xFEE5*/ + 1 + 16, 0, 2, 1+2, /*0xFEE6-0xFEE9*/ + 1, 0, 2, 1+2, /*0xFEEA-0xFEED*/ + 1, 0, 2, 1+2, /*0xFEEE-0xFEF1*/ + 1, 0, 1, 0, /*0xFEF2-0xFEF5*/ + 1, 0, 2, 1+2, /*0xFEF6-0xFEF9*/ + 1, 0, 1, 0, /*0xFEFA-0xFEFD*/ + 1, 0, 1, 0, + 1 +}; + +static const UChar convertFEto06[] = +{ +/***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/ +/*FE7*/ 0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F, 0x650, 0x650, 0x651, 0x651, 0x652, 0x652, +/*FE8*/ 0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626, 0x626, 0x626, 0x626, 0x627, 0x627, 0x628, +/*FE9*/ 0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B, 0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C, +/*FEA*/ 0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F, 0x62F, 0x630, 0x630, 0x631, 
0x631, 0x632, +/*FEB*/ 0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635, 0x635, 0x635, 0x635, 0x636, 0x636, 0x636, +/*FEC*/ 0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639, 0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A, +/*FED*/ 0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643, 0x643, 0x643, 0x643, 0x644, 0x644, 0x644, +/*FEE*/ 0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647, 0x647, 0x647, 0x647, 0x648, 0x648, 0x649, +/*FEF*/ 0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E, 0x65E, 0x65F, 0x65F +}; + +static const UChar shapeTable[4][4][4]= +{ + { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,1} }, + { {0,0,2,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} }, + { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,3} }, + { {0,0,1,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} } +}; + +/* + *Name : changeLamAlef + *Function : Converts the Alef characters into an equivalent + * LamAlef location in the 0x06xx Range, this is an + * intermediate stage in the operation of the program + * later it'll be converted into the 0xFExx LamAlefs + * in the shaping function. 
+ */ +static UChar +changeLamAlef(UChar ch) { + + switch(ch) { + case 0x0622 : + return(0x065C); + break; + case 0x0623 : + return(0x065D); + break; + case 0x0625 : + return(0x065E); + break; + case 0x0627 : + return(0x065F); + break; + default : + return(0); + break; + } +} + +/* + *Name : specialChar + *Function : Special Arabic characters need special handling in the shapeUnicode + * function, this function returns 1 or 2 for these special characters + */ +static int32_t +specialChar(UChar ch) { + + if( (ch>0x0621 && ch<0x0626)||(ch==0x0627)||(ch>0x062e && ch<0x0633)|| + (ch>0x0647 && ch<0x064a)||(ch==0x0629) ) { + return (1); + } + else + if( ch>=0x064B && ch<= 0x0652 ) + return (2); + else + if( (ch>=0x0653 && ch<= 0x0655) || ch == 0x0670 || + (ch>=0xFE70 && ch<= 0xFE7F) ) + return (3); + else + return (0); +} + +/* + *Name : getLink + *Function : Resolves the link between the characters as + * Arabic characters have four forms : + * Isolated, Initial, Middle and Final Form + */ +static UChar +getLink(UChar ch) { + + if(ch >= 0x0622 && ch <= 0x06D3) { + return(araLink[ch-0x0622]); + } else if(ch == 0x200D) { + return(3); + } else if(ch >= 0x206D && ch <= 0x206F) { + return(4); + } else if(ch >= 0xFE70 && ch <= 0xFEFC) { + return(presLink[ch-0xFE70]); + } else { + return(0); + } +} + +/* + *Name : isTashkeelChar + *Function : Returns 1 for Tashkeel characters else return 0 + */ +static int32_t +isTashkeelChar(UChar ch) { + + if( ch>=0x064B && ch<= 0x0652 ) + return (1); + else + return (0); +} + +/* + *Name : shapeUnicode + *Function : Converts an Arabic Unicode buffer in 06xx Range into a shaped + * arabic Unicode buffer in FExx Range + */ +static int32_t +shapeUnicode(UChar *dest, int32_t sourceLength, + int32_t destSize, + int tashkeelFlag) { + + int32_t i, iend; + int32_t prevPos, lastPos,Nx, Nw; + unsigned int Shape; + int32_t flag; + int32_t lamalef_found = 0; + UChar prevLink = 0, lastLink = 0, currLink, nextLink = 0; + UChar wLamalef; + + /* + * 
Converts the input buffer from FExx Range into 06xx Range + * to make sure that all characters are in the 06xx range + * even the lamalef is converted to the special region in + * the 06xx range + */ + for (i = 0; i < sourceLength; i++) { + UChar inputChar = dest[i]; + if ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) { + dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ; + } + } + + /* sets the index to the end of the buffer, together with the step point to -1 */ + i = 0; + iend = sourceLength; + + /* + * This function resolves the link between the characters . + * Arabic characters have four forms : + * Isolated Form, Initial Form, Middle Form and Final Form + */ + currLink = getLink(dest[i]); + + prevPos = i; + lastPos = i; + Nx = sourceLength + 2, Nw = 0; + + while (i != iend) { + /* If high byte of currLink > 0 then more than one shape */ + if ((currLink & 0xFF00) > 0 || isTashkeelChar(dest[i])) { + Nw = i + 1; + while (Nx >= sourceLength) { /* we need to know about next char */ + if(Nw == iend) { + nextLink = 0; + Nx = -1; + } else { + nextLink = getLink(dest[Nw]); + if((nextLink & IRRELEVANT) == 0) { + Nx = Nw; + } else { + Nw = Nw + 1; + } + } + } + + if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) { + lamalef_found = 1; + wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */ + if ( wLamalef != 0) { + dest[i] = ' '; /* The default case is to drop the Alef and replace */ + dest[lastPos] =wLamalef; /* it by a space. */ + i=lastPos; + } + lastLink = prevLink; + currLink = getLink(wLamalef); + } + /* + * get the proper shape according to link ability of neighbors + * and of character; depends on the order of the shapes + * (isolated, initial, middle, final) in the compatibility area + */ + flag = specialChar(dest[i]); + + Shape = shapeTable[nextLink & (LINKR + LINKL)] + [lastLink & (LINKR + LINKL)] + [currLink & (LINKR + LINKL)]; + + if (flag == 1) { + Shape = (Shape == 1 || Shape == 3) ? 
1 : 0; + } + else + if(flag == 2) { + if( (lastLink & LINKL) && (nextLink & LINKR) && (tashkeelFlag == 1) && + dest[i] != 0x064C && dest[i] != 0x064D ) { + Shape = 1; + if( (nextLink&ALEFTYPE) == ALEFTYPE && (lastLink&LAMTYPE) == LAMTYPE ) + Shape = 0; + } + else { + Shape = 0; + } + } + + if(flag == 2) { + dest[i] = 0xFE70 + IrrelevantPos[(dest[i] - 0x064B)] + Shape; + } + else + dest[i] = (UChar)((dest[i] < 0x0670 ? 0xFE70 : 0xFB50) + (currLink >> 8) + Shape); + } + + /* move one notch forward */ + if ((currLink & IRRELEVANT) == 0) { + prevLink = lastLink; + lastLink = currLink; + prevPos = lastPos; + lastPos = i; + } + + i++; + if (i == Nx) { + currLink = nextLink; + Nx = sourceLength + 2; + } + else if(i != iend) { + currLink = getLink(dest[i]); + } + } + + destSize = sourceLength; + + return destSize; +} + +int32_t shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destCapacity, uint32_t options, UErrorCode *pErrorCode) { + int32_t destLength; + + /* usual error checking */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */ + if( source==NULL || sourceLength<-1 || + (dest==NULL && destCapacity!=0) || destCapacity<0 || + options>=U_SHAPE_DIGIT_TYPE_RESERVED || + (options&U_SHAPE_DIGITS_MASK)>=U_SHAPE_DIGITS_RESERVED + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* determine the source length */ + if(sourceLength==-1) { + sourceLength=u_strlen(source); + } + if(sourceLength==0) { + return 0; + } + + /* check that source and destination do not overlap */ + if( dest!=NULL && + ((source<=dest && dest<source+sourceLength) || + (dest<=source && source<dest+destCapacity)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if((options&U_SHAPE_LETTERS_MASK)!=U_SHAPE_LETTERS_NOOP) { + int32_t outputSize = sourceLength; + + /* calculate destination size */ + /* TODO: do we ever need to do this pure 
preflighting? */ + ASSERT((options&U_SHAPE_LENGTH_MASK) != U_SHAPE_LENGTH_GROW_SHRINK); + + if(outputSize>destCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return outputSize; + } + + /* Start of Arabic letter shaping part */ + memcpy(dest, source, sourceLength*U_SIZEOF_UCHAR); + + ASSERT((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL); + + switch(options&U_SHAPE_LETTERS_MASK) { + case U_SHAPE_LETTERS_SHAPE : + /* Call the shaping function with tashkeel flag == 1 */ + destLength = shapeUnicode(dest,sourceLength,destCapacity,1); + break; + case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED : + /* Call the shaping function with tashkeel flag == 0 */ + destLength = shapeUnicode(dest,sourceLength,destCapacity,0); + break; + case U_SHAPE_LETTERS_UNSHAPE : + ASSERT_NOT_REACHED(); + break; + default : + /* will never occur because of validity checks above */ + destLength = 0; + break; + } + + /* End of Arabic letter shaping part */ + } else + ASSERT_NOT_REACHED(); + + ASSERT((options & U_SHAPE_DIGITS_MASK) == U_SHAPE_DIGITS_NOOP); + + return sourceLength; +} + +#endif // USE(ATSUI) diff --git a/Source/WebCore/platform/text/mac/ShapeArabic.h b/Source/WebCore/platform/text/mac/ShapeArabic.h new file mode 100644 index 0000000..8aa577d --- /dev/null +++ b/Source/WebCore/platform/text/mac/ShapeArabic.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ShapeArabic_h +#define ShapeArabic_h + +#if USE(ATSUI) + +#include <unicode/ushape.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destCapacity, uint32_t options, UErrorCode *pErrorCode); + +#ifdef __cplusplus +} +#endif + +#endif // USE(ATSUI) +#endif // ShapeArabic_h diff --git a/Source/WebCore/platform/text/mac/StringImplMac.mm b/Source/WebCore/platform/text/mac/StringImplMac.mm new file mode 100644 index 0000000..6f5e953 --- /dev/null +++ b/Source/WebCore/platform/text/mac/StringImplMac.mm @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2006, 2009 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include "config.h" +#include <wtf/text/StringImpl.h> + +#include "FoundationExtras.h" + +namespace WTF { + +StringImpl::operator NSString *() +{ + return HardAutorelease(createCFString()); +} + +} diff --git a/Source/WebCore/platform/text/mac/StringMac.mm b/Source/WebCore/platform/text/mac/StringMac.mm new file mode 100644 index 0000000..7e98b2b --- /dev/null +++ b/Source/WebCore/platform/text/mac/StringMac.mm @@ -0,0 +1,42 @@ +/** + * Copyright (C) 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "PlatformString.h" +#include <CoreFoundation/CFString.h> + +namespace WTF { + +String::String(NSString* str) +{ + if (!str) + return; + + CFIndex size = CFStringGetLength(reinterpret_cast<CFStringRef>(str)); + if (size == 0) + m_impl = StringImpl::empty(); + else { + Vector<UChar, 1024> buffer(size); + CFStringGetCharacters(reinterpret_cast<CFStringRef>(str), CFRangeMake(0, size), buffer.data()); + m_impl = StringImpl::create(buffer.data(), size); + } +} + +} diff --git a/Source/WebCore/platform/text/mac/TextBoundaries.mm b/Source/WebCore/platform/text/mac/TextBoundaries.mm new file mode 100644 index 0000000..bd7ddf8 --- /dev/null +++ b/Source/WebCore/platform/text/mac/TextBoundaries.mm @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#import "config.h" +#import "TextBoundaries.h" + +using namespace WTF::Unicode; + +namespace WebCore { + +void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end) +{ + NSString* string = [[NSString alloc] initWithCharactersNoCopy:const_cast<unichar*>(chars) + length:len freeWhenDone:NO]; + NSAttributedString* attr = [[NSAttributedString alloc] initWithString:string]; + NSRange range = [attr doubleClickAtIndex:(position >= len) ? len - 1 : position]; + [attr release]; + [string release]; + *start = range.location; + *end = range.location + range.length; +} + +int findNextWordFromIndex(const UChar* chars, int len, int position, bool forward) +{ + NSString* string = [[NSString alloc] initWithCharactersNoCopy:const_cast<unichar*>(chars) + length:len freeWhenDone:NO]; + NSAttributedString* attr = [[NSAttributedString alloc] initWithString:string]; + int result = [attr nextWordFromIndex:position forward:forward]; + [attr release]; + [string release]; + return result; +} + +} diff --git a/Source/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm b/Source/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm new file mode 100644 index 0000000..6af5616 --- /dev/null +++ b/Source/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2007, 2009 Apple Inc. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +#include <wtf/RetainPtr.h> + +namespace WebCore { + +static const int maxLocaleStringLength = 32; + +static inline RetainPtr<CFStringRef> textBreakLocalePreference() +{ + RetainPtr<CFPropertyListRef> locale(AdoptCF, CFPreferencesCopyValue(CFSTR("AppleTextBreakLocale"), + kCFPreferencesAnyApplication, kCFPreferencesCurrentUser, kCFPreferencesAnyHost)); + if (!locale || CFGetTypeID(locale.get()) != CFStringGetTypeID()) + return 0; + return static_cast<CFStringRef>(locale.get()); +} + +static RetainPtr<CFStringRef> topLanguagePreference() +{ + NSArray *languagesArray = [[NSUserDefaults standardUserDefaults] arrayForKey:@"AppleLanguages"]; + if (!languagesArray) + return 0; + if ([languagesArray count] < 1) + return 0; + NSString *value = [languagesArray objectAtIndex:0]; + if (![value isKindOfClass:[NSString class]]) + return 0; + return reinterpret_cast<CFStringRef>(value); +} + +static RetainPtr<CFStringRef> canonicalLanguageIdentifier(CFStringRef locale) +{ + if (!locale) + return 0; + RetainPtr<CFStringRef> canonicalLocale(AdoptCF, + CFLocaleCreateCanonicalLanguageIdentifierFromString(kCFAllocatorDefault, locale)); 
+ if (!canonicalLocale) + return locale; + return canonicalLocale; +} + +// Copies |locale| into |localeStringBuffer| as a NUL-terminated ASCII C string. +// The buffer holds the empty string when |locale| is null or when the conversion fails. +static void getLocale(CFStringRef locale, char localeStringBuffer[maxLocaleStringLength]) +{ + // Empty string means "root locale", and that is what we use if we can't get a preference. + localeStringBuffer[0] = 0; + if (!locale) + return; + // CFStringGetCString reports failure through its return value; restore the empty string in that case + // so callers never see a partially written buffer. + if (!CFStringGetCString(locale, localeStringBuffer, maxLocaleStringLength, kCFStringEncodingASCII)) + localeStringBuffer[0] = 0; +} + +static void getSearchLocale(char localeStringBuffer[maxLocaleStringLength]) +{ + getLocale(canonicalLanguageIdentifier(topLanguagePreference().get()).get(), localeStringBuffer); +} + +const char* currentSearchLocaleID() +{ + static char localeStringBuffer[maxLocaleStringLength]; + static bool gotSearchLocale = false; + if (!gotSearchLocale) { + getSearchLocale(localeStringBuffer); + gotSearchLocale = true; + } + return localeStringBuffer; +} + +static void getTextBreakLocale(char localeStringBuffer[maxLocaleStringLength]) +{ + // If there is no text break locale, use the top language preference. + RetainPtr<CFStringRef> locale = textBreakLocalePreference(); + if (!locale) + locale = topLanguagePreference(); + getLocale(canonicalLanguageIdentifier(locale.get()).get(), localeStringBuffer); +} + +const char* currentTextBreakLocaleID() +{ + static char localeStringBuffer[maxLocaleStringLength]; + static bool gotTextBreakLocale = false; + if (!gotTextBreakLocale) { + getTextBreakLocale(localeStringBuffer); + gotTextBreakLocale = true; + } + return localeStringBuffer; +} + +} diff --git a/Source/WebCore/platform/text/mac/TextCodecMac.cpp b/Source/WebCore/platform/text/mac/TextCodecMac.cpp new file mode 100644 index 0000000..b743f3d --- /dev/null +++ b/Source/WebCore/platform/text/mac/TextCodecMac.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved. 
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecMac.h" + +#include "CharacterNames.h" +#include "CharsetData.h" +#include "PlatformString.h" +#include "ThreadGlobalData.h" +#include <wtf/Assertions.h> +#include <wtf/text/CString.h> +#include <wtf/PassOwnPtr.h> +#include <wtf/RetainPtr.h> +#include <wtf/Threading.h> + +using namespace std; + +namespace WebCore { + +// We need to keep this because ICU doesn't support some of the encodings that we need: +// <http://bugs.webkit.org/show_bug.cgi?id=4195>. 
+ +const size_t ConversionBufferSize = 16384; + +static TECConverterWrapper& cachedConverterTEC() +{ + return threadGlobalData().cachedConverterTEC(); +} + +void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar) +{ + TECTextEncodingID lastEncoding = invalidEncoding; + const char* lastName = 0; + + for (size_t i = 0; CharsetTable[i].name; ++i) { + if (CharsetTable[i].encoding != lastEncoding) { + lastEncoding = CharsetTable[i].encoding; + lastName = CharsetTable[i].name; + } + registrar(CharsetTable[i].name, lastName); + } +} + +static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData) +{ + return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData)); +} + +void TextCodecMac::registerCodecs(TextCodecRegistrar registrar) +{ + TECTextEncodingID lastEncoding = invalidEncoding; + + for (size_t i = 0; CharsetTable[i].name; ++i) + if (CharsetTable[i].encoding != lastEncoding) { + registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding); + lastEncoding = CharsetTable[i].encoding; + } +} + +// Creates a codec for |encoding|; the TEC converter itself is created lazily on first decode. +// encode() reads m_backslashAsCurrencySymbol, so it must not be left indeterminate. A plain backslash +// makes the replacement in encode() a no-op. +// NOTE(review): if an encoding-specific currency symbol is intended here, supply it instead -- confirm with the owner. +TextCodecMac::TextCodecMac(TECTextEncodingID encoding) + : m_encoding(encoding) + , m_backslashAsCurrencySymbol('\\') + , m_numBufferedBytes(0) + , m_converterTEC(0) +{ +} + +TextCodecMac::~TextCodecMac() +{ + releaseTECConverter(); +} + +void TextCodecMac::releaseTECConverter() const +{ + if (m_converterTEC) { + TECConverterWrapper& cachedConverter = cachedConverterTEC(); + if (cachedConverter.converter) + TECDisposeConverter(cachedConverter.converter); + cachedConverter.converter = m_converterTEC; + cachedConverter.encoding = m_encoding; + m_converterTEC = 0; + } +} + +OSStatus TextCodecMac::createTECConverter() const +{ + TECConverterWrapper& cachedConverter = cachedConverterTEC(); + + bool cachedEncodingEqual = cachedConverter.encoding == m_encoding; + cachedConverter.encoding = invalidEncoding; + + if (cachedEncodingEqual && cachedConverter.converter) { + m_converterTEC = cachedConverter.converter; + cachedConverter.converter = 
0; + + TECClearConverterContextInfo(m_converterTEC); + } else { + OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding, + CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat)); + if (status) + return status; + + TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask); + } + + return noErr; +} + +OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, + void *outputBuffer, int outputBufferLength, int& outputLength) +{ + OSStatus status; + unsigned long bytesRead = 0; + unsigned long bytesWritten = 0; + + if (m_numBufferedBytes != 0) { + // Finish converting a partial character that's in our buffer. + + // First, fill the partial character buffer with as many bytes as are available. + ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes)); + const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes; + const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength); + ASSERT(bytesToPutInBuffer != 0); + memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer); + + // Now, do a conversion on the buffer. + status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead, + reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); + ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer); + + if (status == kTECPartialCharErr && bytesRead == 0) { + // Handle the case where the partial character was not converted. + if (bytesToPutInBuffer >= spaceInBuffer) { + LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes)); + m_numBufferedBytes = 0; + status = kTECUnmappableElementErr; // should never happen, but use this error code + } else { + // Tell the caller we read all the source bytes and keep them in the buffer. 
+ m_numBufferedBytes += bytesToPutInBuffer; + bytesRead = bytesToPutInBuffer; + status = noErr; + } + } else { + // We are done with the partial character buffer. + // Also, we have read some of the bytes from the main buffer. + if (bytesRead > m_numBufferedBytes) { + bytesRead -= m_numBufferedBytes; + } else { + LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr"); + bytesRead = 0; + } + m_numBufferedBytes = 0; + if (status == kTECPartialCharErr) { + // While there may be a partial character problem in the small buffer, + // we have to try again and not get confused and think there is a partial + // character problem in the large buffer. + status = noErr; + } + } + } else { + status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead, + static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); + ASSERT(static_cast<int>(bytesRead) <= inputBufferLength); + } + + // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus. + if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) + status = kTECOutputBufferFullStatus; + + inputLength = bytesRead; + outputLength = bytesWritten; + return status; +} + +String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + // Get a converter for the passed-in encoding. 
+ if (!m_converterTEC && createTECConverter() != noErr) + return String(); + + Vector<UChar> result; + + const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes); + int sourceLength = length; + bool bufferWasFull = false; + UniChar buffer[ConversionBufferSize]; + + while ((sourceLength || bufferWasFull) && !sawError) { + int bytesRead = 0; + int bytesWritten = 0; + OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten); + ASSERT(bytesRead <= sourceLength); + sourcePointer += bytesRead; + sourceLength -= bytesRead; + + switch (status) { + case noErr: + case kTECOutputBufferFullStatus: + break; + case kTextMalformedInputErr: + case kTextUndefinedElementErr: + // FIXME: Put FFFD character into the output string in this case? + TECClearConverterContextInfo(m_converterTEC); + if (stopOnError) { + sawError = true; + break; + } + if (sourceLength) { + sourcePointer += 1; + sourceLength -= 1; + } + break; + case kTECPartialCharErr: { + // Put the partial character into the buffer. 
+ ASSERT(m_numBufferedBytes == 0); + // The limit is the capacity of the m_bufferedBytes array, not the size of its length counter. + const int bufferSize = sizeof(m_bufferedBytes); + if (sourceLength < bufferSize) { + memcpy(m_bufferedBytes, sourcePointer, sourceLength); + m_numBufferedBytes = sourceLength; + } else { + LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength); + } + sourceLength = 0; + break; + } + default: + sawError = true; + return String(); + } + + ASSERT(!(bytesWritten % sizeof(UChar))); + result.append(buffer, bytesWritten / sizeof(UChar)); + + bufferWasFull = status == kTECOutputBufferFullStatus; + } + + if (flush) { + unsigned long bytesWritten = 0; + TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten); + ASSERT(!(bytesWritten % sizeof(UChar))); + result.append(buffer, bytesWritten / sizeof(UChar)); + } + + String resultString = String::adopt(result); + + // <rdar://problem/3225472> + // Simplified Chinese pages use the code A3A0 to mean "full-width space". + // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice. + // To work around, just change all occurrences of U+E5E5 to U+3000 (ideographic space). + if (m_encoding == kCFStringEncodingGB_18030_2000) + resultString.replace(0xE5E5, ideographicSpace); + + return resultString; +} + +CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + // FIXME: We should really use TEC here instead of CFString for consistency with the other direction. + + // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign. + // Encoding will change the yen sign back into a backslash. 
+ String copy(characters, length); + copy.replace('\\', m_backslashAsCurrencySymbol); + RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString()); + + CFIndex startPos = 0; + CFIndex charactersLeft = CFStringGetLength(cfs.get()); + Vector<char> result; + size_t size = 0; + UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0; + while (charactersLeft > 0) { + CFRange range = CFRangeMake(startPos, charactersLeft); + CFIndex bufferLength; + CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength); + + result.grow(size + bufferLength); + unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size); + CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength); + size += bufferLength; + + if (charactersConverted != charactersLeft) { + unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); + ++charactersConverted; + if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate + UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); + if ((low & 0xFC00) == 0xDC00) { // is low surrogate + badChar <<= 10; + badChar += low; + badChar += 0x10000 - (0xD800 << 10) - 0xDC00; + ++charactersConverted; + } + } + UnencodableReplacementArray entity; + int entityLength = getUnencodableReplacement(badChar, handling, entity); + result.grow(size + entityLength); + memcpy(result.data() + size, entity, entityLength); + size += entityLength; + } + + startPos += charactersConverted; + charactersLeft -= charactersConverted; + } + return CString(result.data(), size); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/mac/TextCodecMac.h b/Source/WebCore/platform/text/mac/TextCodecMac.h new file mode 100644 index 0000000..3e7a237 --- /dev/null +++ b/Source/WebCore/platform/text/mac/TextCodecMac.h @@ -0,0 +1,73 @@ +/* + * Copyright 
(C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextCodecMac_h +#define TextCodecMac_h + +#include "TextCodec.h" +#include <CoreServices/CoreServices.h> + +namespace WebCore { + + typedef ::TextEncoding TECTextEncodingID; + const TECTextEncodingID invalidEncoding = kCFStringEncodingInvalidId; + + class TextCodecMac : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + explicit TextCodecMac(TECTextEncodingID); + virtual ~TextCodecMac(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + OSStatus decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, + void* outputBuffer, int outputBufferLength, int& outputLength); + + OSStatus createTECConverter() const; + void releaseTECConverter() const; + + TECTextEncodingID m_encoding; + UChar m_backslashAsCurrencySymbol; + unsigned m_numBufferedBytes; + unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character + mutable TECObjectRef m_converterTEC; + }; + + struct TECConverterWrapper { + TECConverterWrapper() : converter(0), encoding(invalidEncoding) { } + ~TECConverterWrapper() { if (converter) TECDisposeConverter(converter); } + + TECObjectRef converter; + TECTextEncodingID encoding; + }; + +} // namespace WebCore + +#endif // TextCodecMac_h diff --git a/Source/WebCore/platform/text/mac/character-sets.txt b/Source/WebCore/platform/text/mac/character-sets.txt new file mode 100644 index 0000000..475e78e --- /dev/null +++ b/Source/WebCore/platform/text/mac/character-sets.txt @@ -0,0 +1,1868 @@ + +=================================================================== +CHARACTER SETS + +(last updated 28 January 2005) + +These are the official names for character sets that may be used in +the Internet and may be referred to in Internet documentation. 
These +names are expressed in ANSI_X3.4-1968 which is commonly called +US-ASCII or simply ASCII. The character set most commonly used in the +Internet and used especially in protocol standards is US-ASCII, this +is strongly encouraged. The use of the name US-ASCII is also +encouraged. + +The character set names may be up to 40 characters taken from the +printable characters of US-ASCII. However, no distinction is made +between use of upper and lower case letters. + +The MIBenum value is a unique value for use in MIBs to identify coded +character sets. + +The value space for MIBenum values has been divided into three +regions. The first region (3-999) consists of coded character sets +that have been standardized by some standard setting organization. +This region is intended for standards that do not have subset +implementations. The second region (1000-1999) is for the Unicode and +ISO/IEC 10646 coded character sets together with a specification of a +(set of) sub-repertoires that may occur. The third region (>1999) is +intended for vendor specific coded character sets. + + Assigned MIB enum Numbers + ------------------------- + 0-2 Reserved + 3-999 Set By Standards Organizations + 1000-1999 Unicode / 10646 + 2000-2999 Vendor + +The aliases that start with "cs" have been added for use with the +IANA-CHARSET-MIB as originally defined in RFC3808, and as currently +maintained by IANA at http://www.iana.org/assignments/ianacharset-mib. +Note that the ianacharset-mib needs to be kept in sync with this +registry. These aliases that start with "cs" contain the standard +numbers along with suggestive names in order to facilitate applications +that want to display the names in user interfaces. The "cs" stands +for character set and is provided for applications that need a lower +case first letter but want to use mixed case thereafter that cannot +contain any special characters, such as underbar ("_") and dash ("-"). 
+ +If the character set is from an ISO standard, its cs alias is the ISO +standard number or name. If the character set is not from an ISO +standard, but is registered with ISO (IPSJ/ITSCJ is the current ISO +Registration Authority), the ISO Registry number is specified as +ISOnnn followed by letters suggestive of the name or standards number +of the code set. When a national or international standard is +revised, the year of revision is added to the cs alias of the new +character set entry in the IANA Registry in order to distinguish the +revised character set from the original character set. + + +Character Set Reference +------------- --------- + +Name: ANSI_X3.4-1968 [RFC1345,KXS2] +MIBenum: 3 +Source: ECMA registry +Alias: iso-ir-6 +Alias: ANSI_X3.4-1986 +Alias: ISO_646.irv:1991 +Alias: ASCII +Alias: ISO646-US +Alias: US-ASCII (preferred MIME name) +Alias: us +Alias: IBM367 +Alias: cp367 +Alias: csASCII + +Name: ISO-10646-UTF-1 +MIBenum: 27 +Source: Universal Transfer Format (1), this is the multibyte + encoding, that subsets ASCII-7. It does not have byte + ordering issues. 
+Alias: csISO10646UTF1 + +Name: ISO_646.basic:1983 [RFC1345,KXS2] +MIBenum: 28 +Source: ECMA registry +Alias: ref +Alias: csISO646basic1983 + +Name: INVARIANT [RFC1345,KXS2] +MIBenum: 29 +Alias: csINVARIANT + +Name: ISO_646.irv:1983 [RFC1345,KXS2] +MIBenum: 30 +Source: ECMA registry +Alias: iso-ir-2 +Alias: irv +Alias: csISO2IntlRefVersion + +Name: BS_4730 [RFC1345,KXS2] +MIBenum: 20 +Source: ECMA registry +Alias: iso-ir-4 +Alias: ISO646-GB +Alias: gb +Alias: uk +Alias: csISO4UnitedKingdom + +Name: NATS-SEFI [RFC1345,KXS2] +MIBenum: 31 +Source: ECMA registry +Alias: iso-ir-8-1 +Alias: csNATSSEFI + +Name: NATS-SEFI-ADD [RFC1345,KXS2] +MIBenum: 32 +Source: ECMA registry +Alias: iso-ir-8-2 +Alias: csNATSSEFIADD + +Name: NATS-DANO [RFC1345,KXS2] +MIBenum: 33 +Source: ECMA registry +Alias: iso-ir-9-1 +Alias: csNATSDANO + +Name: NATS-DANO-ADD [RFC1345,KXS2] +MIBenum: 34 +Source: ECMA registry +Alias: iso-ir-9-2 +Alias: csNATSDANOADD + +Name: SEN_850200_B [RFC1345,KXS2] +MIBenum: 35 +Source: ECMA registry +Alias: iso-ir-10 +Alias: FI +Alias: ISO646-FI +Alias: ISO646-SE +Alias: se +Alias: csISO10Swedish + +Name: SEN_850200_C [RFC1345,KXS2] +MIBenum: 21 +Source: ECMA registry +Alias: iso-ir-11 +Alias: ISO646-SE2 +Alias: se2 +Alias: csISO11SwedishForNames + +Name: KS_C_5601-1987 [RFC1345,KXS2] +MIBenum: 36 +Source: ECMA registry +Alias: iso-ir-149 +Alias: KS_C_5601-1989 +Alias: KSC_5601 +Alias: korean +Alias: csKSC56011987 + +Name: ISO-2022-KR (preferred MIME name) [RFC1557,Choi] +MIBenum: 37 +Source: RFC-1557 (see also KS_C_5601-1987) +Alias: csISO2022KR + +Name: EUC-KR (preferred MIME name) [RFC1557,Choi] +MIBenum: 38 +Source: RFC-1557 (see also KS_C_5861-1992) +Alias: csEUCKR + +Name: ISO-2022-JP (preferred MIME name) [RFC1468,Murai] +MIBenum: 39 +Source: RFC-1468 (see also RFC-2237) +Alias: csISO2022JP + +Name: ISO-2022-JP-2 (preferred MIME name) [RFC1554,Ohta] +MIBenum: 40 +Source: RFC-1554 +Alias: csISO2022JP2 + +Name: ISO-2022-CN [RFC1922] +MIBenum: 104 +Source: 
RFC-1922 + +Name: ISO-2022-CN-EXT [RFC1922] +MIBenum: 105 +Source: RFC-1922 + +Name: JIS_C6220-1969-jp [RFC1345,KXS2] +MIBenum: 41 +Source: ECMA registry +Alias: JIS_C6220-1969 +Alias: iso-ir-13 +Alias: katakana +Alias: x0201-7 +Alias: csISO13JISC6220jp + +Name: JIS_C6220-1969-ro [RFC1345,KXS2] +MIBenum: 42 +Source: ECMA registry +Alias: iso-ir-14 +Alias: jp +Alias: ISO646-JP +Alias: csISO14JISC6220ro + +Name: IT [RFC1345,KXS2] +MIBenum: 22 +Source: ECMA registry +Alias: iso-ir-15 +Alias: ISO646-IT +Alias: csISO15Italian + +Name: PT [RFC1345,KXS2] +MIBenum: 43 +Source: ECMA registry +Alias: iso-ir-16 +Alias: ISO646-PT +Alias: csISO16Portuguese + +Name: ES [RFC1345,KXS2] +MIBenum: 23 +Source: ECMA registry +Alias: iso-ir-17 +Alias: ISO646-ES +Alias: csISO17Spanish + +Name: greek7-old [RFC1345,KXS2] +MIBenum: 44 +Source: ECMA registry +Alias: iso-ir-18 +Alias: csISO18Greek7Old + +Name: latin-greek [RFC1345,KXS2] +MIBenum: 45 +Source: ECMA registry +Alias: iso-ir-19 +Alias: csISO19LatinGreek + +Name: DIN_66003 [RFC1345,KXS2] +MIBenum: 24 +Source: ECMA registry +Alias: iso-ir-21 +Alias: de +Alias: ISO646-DE +Alias: csISO21German + +Name: NF_Z_62-010_(1973) [RFC1345,KXS2] +MIBenum: 46 +Source: ECMA registry +Alias: iso-ir-25 +Alias: ISO646-FR1 +Alias: csISO25French + +Name: Latin-greek-1 [RFC1345,KXS2] +MIBenum: 47 +Source: ECMA registry +Alias: iso-ir-27 +Alias: csISO27LatinGreek1 + +Name: ISO_5427 [RFC1345,KXS2] +MIBenum: 48 +Source: ECMA registry +Alias: iso-ir-37 +Alias: csISO5427Cyrillic + +Name: JIS_C6226-1978 [RFC1345,KXS2] +MIBenum: 49 +Source: ECMA registry +Alias: iso-ir-42 +Alias: csISO42JISC62261978 + +Name: BS_viewdata [RFC1345,KXS2] +MIBenum: 50 +Source: ECMA registry +Alias: iso-ir-47 +Alias: csISO47BSViewdata + +Name: INIS [RFC1345,KXS2] +MIBenum: 51 +Source: ECMA registry +Alias: iso-ir-49 +Alias: csISO49INIS + +Name: INIS-8 [RFC1345,KXS2] +MIBenum: 52 +Source: ECMA registry +Alias: iso-ir-50 +Alias: csISO50INIS8 + +Name: INIS-cyrillic [RFC1345,KXS2] 
+MIBenum: 53 +Source: ECMA registry +Alias: iso-ir-51 +Alias: csISO51INISCyrillic + +Name: ISO_5427:1981 [RFC1345,KXS2] +MIBenum: 54 +Source: ECMA registry +Alias: iso-ir-54 +Alias: ISO5427Cyrillic1981 + +Name: ISO_5428:1980 [RFC1345,KXS2] +MIBenum: 55 +Source: ECMA registry +Alias: iso-ir-55 +Alias: csISO5428Greek + +Name: GB_1988-80 [RFC1345,KXS2] +MIBenum: 56 +Source: ECMA registry +Alias: iso-ir-57 +Alias: cn +Alias: ISO646-CN +Alias: csISO57GB1988 + +Name: GB_2312-80 [RFC1345,KXS2] +MIBenum: 57 +Source: ECMA registry +Alias: iso-ir-58 +Alias: chinese +Alias: csISO58GB231280 + +Name: NS_4551-1 [RFC1345,KXS2] +MIBenum: 25 +Source: ECMA registry +Alias: iso-ir-60 +Alias: ISO646-NO +Alias: no +Alias: csISO60DanishNorwegian +Alias: csISO60Norwegian1 + +Name: NS_4551-2 [RFC1345,KXS2] +MIBenum: 58 +Source: ECMA registry +Alias: ISO646-NO2 +Alias: iso-ir-61 +Alias: no2 +Alias: csISO61Norwegian2 + +Name: NF_Z_62-010 [RFC1345,KXS2] +MIBenum: 26 +Source: ECMA registry +Alias: iso-ir-69 +Alias: ISO646-FR +Alias: fr +Alias: csISO69French + +Name: videotex-suppl [RFC1345,KXS2] +MIBenum: 59 +Source: ECMA registry +Alias: iso-ir-70 +Alias: csISO70VideotexSupp1 + +Name: PT2 [RFC1345,KXS2] +MIBenum: 60 +Source: ECMA registry +Alias: iso-ir-84 +Alias: ISO646-PT2 +Alias: csISO84Portuguese2 + +Name: ES2 [RFC1345,KXS2] +MIBenum: 61 +Source: ECMA registry +Alias: iso-ir-85 +Alias: ISO646-ES2 +Alias: csISO85Spanish2 + +Name: MSZ_7795.3 [RFC1345,KXS2] +MIBenum: 62 +Source: ECMA registry +Alias: iso-ir-86 +Alias: ISO646-HU +Alias: hu +Alias: csISO86Hungarian + +Name: JIS_C6226-1983 [RFC1345,KXS2] +MIBenum: 63 +Source: ECMA registry +Alias: iso-ir-87 +Alias: x0208 +Alias: JIS_X0208-1983 +Alias: csISO87JISX0208 + +Name: greek7 [RFC1345,KXS2] +MIBenum: 64 +Source: ECMA registry +Alias: iso-ir-88 +Alias: csISO88Greek7 + +Name: ASMO_449 [RFC1345,KXS2] +MIBenum: 65 +Source: ECMA registry +Alias: ISO_9036 +Alias: arabic7 +Alias: iso-ir-89 +Alias: csISO89ASMO449 + +Name: iso-ir-90 
[RFC1345,KXS2] +MIBenum: 66 +Source: ECMA registry +Alias: csISO90 + +Name: JIS_C6229-1984-a [RFC1345,KXS2] +MIBenum: 67 +Source: ECMA registry +Alias: iso-ir-91 +Alias: jp-ocr-a +Alias: csISO91JISC62291984a + +Name: JIS_C6229-1984-b [RFC1345,KXS2] +MIBenum: 68 +Source: ECMA registry +Alias: iso-ir-92 +Alias: ISO646-JP-OCR-B +Alias: jp-ocr-b +Alias: csISO92JISC62991984b + +Name: JIS_C6229-1984-b-add [RFC1345,KXS2] +MIBenum: 69 +Source: ECMA registry +Alias: iso-ir-93 +Alias: jp-ocr-b-add +Alias: csISO93JIS62291984badd + +Name: JIS_C6229-1984-hand [RFC1345,KXS2] +MIBenum: 70 +Source: ECMA registry +Alias: iso-ir-94 +Alias: jp-ocr-hand +Alias: csISO94JIS62291984hand + +Name: JIS_C6229-1984-hand-add [RFC1345,KXS2] +MIBenum: 71 +Source: ECMA registry +Alias: iso-ir-95 +Alias: jp-ocr-hand-add +Alias: csISO95JIS62291984handadd + +Name: JIS_C6229-1984-kana [RFC1345,KXS2] +MIBenum: 72 +Source: ECMA registry +Alias: iso-ir-96 +Alias: csISO96JISC62291984kana + +Name: ISO_2033-1983 [RFC1345,KXS2] +MIBenum: 73 +Source: ECMA registry +Alias: iso-ir-98 +Alias: e13b +Alias: csISO2033 + +Name: ANSI_X3.110-1983 [RFC1345,KXS2] +MIBenum: 74 +Source: ECMA registry +Alias: iso-ir-99 +Alias: CSA_T500-1983 +Alias: NAPLPS +Alias: csISO99NAPLPS + +Name: ISO_8859-1:1987 [RFC1345,KXS2] +MIBenum: 4 +Source: ECMA registry +Alias: iso-ir-100 +Alias: ISO_8859-1 +Alias: ISO-8859-1 (preferred MIME name) +Alias: latin1 +Alias: l1 +Alias: IBM819 +Alias: CP819 +Alias: csISOLatin1 + +Name: ISO_8859-2:1987 [RFC1345,KXS2] +MIBenum: 5 +Source: ECMA registry +Alias: iso-ir-101 +Alias: ISO_8859-2 +Alias: ISO-8859-2 (preferred MIME name) +Alias: latin2 +Alias: l2 +Alias: csISOLatin2 + +Name: T.61-7bit [RFC1345,KXS2] +MIBenum: 75 +Source: ECMA registry +Alias: iso-ir-102 +Alias: csISO102T617bit + +Name: T.61-8bit [RFC1345,KXS2] +MIBenum: 76 +Alias: T.61 +Source: ECMA registry +Alias: iso-ir-103 +Alias: csISO103T618bit + +Name: ISO_8859-3:1988 [RFC1345,KXS2] +MIBenum: 6 +Source: ECMA registry +Alias: 
iso-ir-109 +Alias: ISO_8859-3 +Alias: ISO-8859-3 (preferred MIME name) +Alias: latin3 +Alias: l3 +Alias: csISOLatin3 + +Name: ISO_8859-4:1988 [RFC1345,KXS2] +MIBenum: 7 +Source: ECMA registry +Alias: iso-ir-110 +Alias: ISO_8859-4 +Alias: ISO-8859-4 (preferred MIME name) +Alias: latin4 +Alias: l4 +Alias: csISOLatin4 + +Name: ECMA-cyrillic +MIBenum: 77 +Source: ISO registry (formerly ECMA registry) + http://www.itscj.ipsj.jp/ISO-IR/111.pdf +Alias: iso-ir-111 +Alias: KOI8-E +Alias: csISO111ECMACyrillic + +Name: CSA_Z243.4-1985-1 [RFC1345,KXS2] +MIBenum: 78 +Source: ECMA registry +Alias: iso-ir-121 +Alias: ISO646-CA +Alias: csa7-1 +Alias: ca +Alias: csISO121Canadian1 + +Name: CSA_Z243.4-1985-2 [RFC1345,KXS2] +MIBenum: 79 +Source: ECMA registry +Alias: iso-ir-122 +Alias: ISO646-CA2 +Alias: csa7-2 +Alias: csISO122Canadian2 + +Name: CSA_Z243.4-1985-gr [RFC1345,KXS2] +MIBenum: 80 +Source: ECMA registry +Alias: iso-ir-123 +Alias: csISO123CSAZ24341985gr + +Name: ISO_8859-6:1987 [RFC1345,KXS2] +MIBenum: 9 +Source: ECMA registry +Alias: iso-ir-127 +Alias: ISO_8859-6 +Alias: ISO-8859-6 (preferred MIME name) +Alias: ECMA-114 +Alias: ASMO-708 +Alias: arabic +Alias: csISOLatinArabic + +Name: ISO_8859-6-E [RFC1556,IANA] +MIBenum: 81 +Source: RFC1556 +Alias: csISO88596E +Alias: ISO-8859-6-E (preferred MIME name) + +Name: ISO_8859-6-I [RFC1556,IANA] +MIBenum: 82 +Source: RFC1556 +Alias: csISO88596I +Alias: ISO-8859-6-I (preferred MIME name) + +Name: ISO_8859-7:1987 [RFC1947,RFC1345,KXS2] +MIBenum: 10 +Source: ECMA registry +Alias: iso-ir-126 +Alias: ISO_8859-7 +Alias: ISO-8859-7 (preferred MIME name) +Alias: ELOT_928 +Alias: ECMA-118 +Alias: greek +Alias: greek8 +Alias: csISOLatinGreek + +Name: T.101-G2 [RFC1345,KXS2] +MIBenum: 83 +Source: ECMA registry +Alias: iso-ir-128 +Alias: csISO128T101G2 + +Name: ISO_8859-8:1988 [RFC1345,KXS2] +MIBenum: 11 +Source: ECMA registry +Alias: iso-ir-138 +Alias: ISO_8859-8 +Alias: ISO-8859-8 (preferred MIME name) +Alias: hebrew +Alias: 
csISOLatinHebrew + +Name: ISO_8859-8-E [RFC1556,Nussbacher] +MIBenum: 84 +Source: RFC1556 +Alias: csISO88598E +Alias: ISO-8859-8-E (preferred MIME name) + +Name: ISO_8859-8-I [RFC1556,Nussbacher] +MIBenum: 85 +Source: RFC1556 +Alias: csISO88598I +Alias: ISO-8859-8-I (preferred MIME name) + +Name: CSN_369103 [RFC1345,KXS2] +MIBenum: 86 +Source: ECMA registry +Alias: iso-ir-139 +Alias: csISO139CSN369103 + +Name: JUS_I.B1.002 [RFC1345,KXS2] +MIBenum: 87 +Source: ECMA registry +Alias: iso-ir-141 +Alias: ISO646-YU +Alias: js +Alias: yu +Alias: csISO141JUSIB1002 + +Name: ISO_6937-2-add [RFC1345,KXS2] +MIBenum: 14 +Source: ECMA registry and ISO 6937-2:1983 +Alias: iso-ir-142 +Alias: csISOTextComm + +Name: IEC_P27-1 [RFC1345,KXS2] +MIBenum: 88 +Source: ECMA registry +Alias: iso-ir-143 +Alias: csISO143IECP271 + +Name: ISO_8859-5:1988 [RFC1345,KXS2] +MIBenum: 8 +Source: ECMA registry +Alias: iso-ir-144 +Alias: ISO_8859-5 +Alias: ISO-8859-5 (preferred MIME name) +Alias: cyrillic +Alias: csISOLatinCyrillic + +Name: JUS_I.B1.003-serb [RFC1345,KXS2] +MIBenum: 89 +Source: ECMA registry +Alias: iso-ir-146 +Alias: serbian +Alias: csISO146Serbian + +Name: JUS_I.B1.003-mac [RFC1345,KXS2] +MIBenum: 90 +Source: ECMA registry +Alias: macedonian +Alias: iso-ir-147 +Alias: csISO147Macedonian + +Name: ISO_8859-9:1989 [RFC1345,KXS2] +MIBenum: 12 +Source: ECMA registry +Alias: iso-ir-148 +Alias: ISO_8859-9 +Alias: ISO-8859-9 (preferred MIME name) +Alias: latin5 +Alias: l5 +Alias: csISOLatin5 + +Name: greek-ccitt [RFC1345,KXS2] +MIBenum: 91 +Source: ECMA registry +Alias: iso-ir-150 +Alias: csISO150 +Alias: csISO150GreekCCITT + +Name: NC_NC00-10:81 [RFC1345,KXS2] +MIBenum: 92 +Source: ECMA registry +Alias: cuba +Alias: iso-ir-151 +Alias: ISO646-CU +Alias: csISO151Cuba + +Name: ISO_6937-2-25 [RFC1345,KXS2] +MIBenum: 93 +Source: ECMA registry +Alias: iso-ir-152 +Alias: csISO6937Add + +Name: GOST_19768-74 [RFC1345,KXS2] +MIBenum: 94 +Source: ECMA registry +Alias: ST_SEV_358-88 +Alias: iso-ir-153 
+Alias: csISO153GOST1976874 + +Name: ISO_8859-supp [RFC1345,KXS2] +MIBenum: 95 +Source: ECMA registry +Alias: iso-ir-154 +Alias: latin1-2-5 +Alias: csISO8859Supp + +Name: ISO_10367-box [RFC1345,KXS2] +MIBenum: 96 +Source: ECMA registry +Alias: iso-ir-155 +Alias: csISO10367Box + +Name: ISO-8859-10 (preferred MIME name) [RFC1345,KXS2] +MIBenum: 13 +Source: ECMA registry +Alias: iso-ir-157 +Alias: l6 +Alias: ISO_8859-10:1992 +Alias: csISOLatin6 +Alias: latin6 + +Name: latin-lap [RFC1345,KXS2] +MIBenum: 97 +Source: ECMA registry +Alias: lap +Alias: iso-ir-158 +Alias: csISO158Lap + +Name: JIS_X0212-1990 [RFC1345,KXS2] +MIBenum: 98 +Source: ECMA registry +Alias: x0212 +Alias: iso-ir-159 +Alias: csISO159JISX02121990 + +Name: DS_2089 [RFC1345,KXS2] +MIBenum: 99 +Source: Danish Standard, DS 2089, February 1974 +Alias: DS2089 +Alias: ISO646-DK +Alias: dk +Alias: csISO646Danish + +Name: us-dk [RFC1345,KXS2] +MIBenum: 100 +Alias: csUSDK + +Name: dk-us [RFC1345,KXS2] +MIBenum: 101 +Alias: csDKUS + +Name: JIS_X0201 [RFC1345,KXS2] +MIBenum: 15 +Source: JIS X 0201-1976. One byte only, this is equivalent to + JIS/Roman (similar to ASCII) plus eight-bit half-width + Katakana +Alias: X0201 +Alias: csHalfWidthKatakana + +Name: KSC5636 [RFC1345,KXS2] +MIBenum: 102 +Alias: ISO646-KR +Alias: csKSC5636 + +Name: ISO-10646-UCS-2 +MIBenum: 1000 +Source: the 2-octet Basic Multilingual Plane, aka Unicode + this needs to specify network byte order: the standard + does not specify (it is a 16-bit integer space) +Alias: csUnicode + +Name: ISO-10646-UCS-4 +MIBenum: 1001 +Source: the full code space. (same comment about byte order, + these are 31-bit numbers.) +Alias: csUCS4 + +Name: DEC-MCS [RFC1345,KXS2] +MIBenum: 2008 +Source: VAX/VMS User's Manual, + Order Number: AI-Y517A-TE, April 1986. +Alias: dec +Alias: csDECMCS + +Name: hp-roman8 [HP-PCL5,RFC1345,KXS2] +MIBenum: 2004 +Source: LaserJet IIP Printer User's Manual, + HP part no 33471-90901, Hewlett-Packard, June 1989. 
+Alias: roman8 +Alias: r8 +Alias: csHPRoman8 + +Name: macintosh [RFC1345,KXS2] +MIBenum: 2027 +Source: The Unicode Standard ver1.0, ISBN 0-201-56788-1, Oct 1991 +Alias: mac +Alias: csMacintosh + +Name: IBM037 [RFC1345,KXS2] +MIBenum: 2028 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp037 +Alias: ebcdic-cp-us +Alias: ebcdic-cp-ca +Alias: ebcdic-cp-wt +Alias: ebcdic-cp-nl +Alias: csIBM037 + +Name: IBM038 [RFC1345,KXS2] +MIBenum: 2029 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-INT +Alias: cp038 +Alias: csIBM038 + +Name: IBM273 [RFC1345,KXS2] +MIBenum: 2030 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP273 +Alias: csIBM273 + +Name: IBM274 [RFC1345,KXS2] +MIBenum: 2031 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-BE +Alias: CP274 +Alias: csIBM274 + +Name: IBM275 [RFC1345,KXS2] +MIBenum: 2032 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: EBCDIC-BR +Alias: cp275 +Alias: csIBM275 + +Name: IBM277 [RFC1345,KXS2] +MIBenum: 2033 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: EBCDIC-CP-DK +Alias: EBCDIC-CP-NO +Alias: csIBM277 + +Name: IBM278 [RFC1345,KXS2] +MIBenum: 2034 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP278 +Alias: ebcdic-cp-fi +Alias: ebcdic-cp-se +Alias: csIBM278 + +Name: IBM280 [RFC1345,KXS2] +MIBenum: 2035 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP280 +Alias: ebcdic-cp-it +Alias: csIBM280 + +Name: IBM281 [RFC1345,KXS2] +MIBenum: 2036 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-JP-E +Alias: cp281 +Alias: csIBM281 + +Name: IBM284 [RFC1345,KXS2] +MIBenum: 2037 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP284 +Alias: ebcdic-cp-es +Alias: csIBM284 + +Name: IBM285 [RFC1345,KXS2] +MIBenum: 2038 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP285 +Alias: ebcdic-cp-gb +Alias: csIBM285 + +Name: IBM290 [RFC1345,KXS2] +MIBenum: 2039 +Source: IBM 3174 Character Set 
Ref, GA27-3831-02, March 1990 +Alias: cp290 +Alias: EBCDIC-JP-kana +Alias: csIBM290 + +Name: IBM297 [RFC1345,KXS2] +MIBenum: 2040 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp297 +Alias: ebcdic-cp-fr +Alias: csIBM297 + +Name: IBM420 [RFC1345,KXS2] +MIBenum: 2041 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990, + IBM NLS RM p 11-11 +Alias: cp420 +Alias: ebcdic-cp-ar1 +Alias: csIBM420 + +Name: IBM423 [RFC1345,KXS2] +MIBenum: 2042 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp423 +Alias: ebcdic-cp-gr +Alias: csIBM423 + +Name: IBM424 [RFC1345,KXS2] +MIBenum: 2043 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp424 +Alias: ebcdic-cp-he +Alias: csIBM424 + +Name: IBM437 [RFC1345,KXS2] +MIBenum: 2011 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp437 +Alias: 437 +Alias: csPC8CodePage437 + +Name: IBM500 [RFC1345,KXS2] +MIBenum: 2044 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP500 +Alias: ebcdic-cp-be +Alias: ebcdic-cp-ch +Alias: csIBM500 + +Name: IBM775 [HP-PCL5] +MIBenum: 2087 +Source: HP PCL 5 Comparison Guide (P/N 5021-0329) pp B-13, 1996 +Alias: cp775 +Alias: csPC775Baltic + +Name: IBM850 [RFC1345,KXS2] +MIBenum: 2009 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp850 +Alias: 850 +Alias: csPC850Multilingual + +Name: IBM851 [RFC1345,KXS2] +MIBenum: 2045 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp851 +Alias: 851 +Alias: csIBM851 + +Name: IBM852 [RFC1345,KXS2] +MIBenum: 2010 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp852 +Alias: 852 +Alias: csPCp852 + +Name: IBM855 [RFC1345,KXS2] +MIBenum: 2046 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp855 +Alias: 855 +Alias: csIBM855 + +Name: IBM857 [RFC1345,KXS2] +MIBenum: 2047 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp857 +Alias: 857 +Alias: csIBM857 + +Name: IBM860 [RFC1345,KXS2] +MIBenum: 2048 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp860 +Alias: 860 +Alias: 
csIBM860 + +Name: IBM861 [RFC1345,KXS2] +MIBenum: 2049 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp861 +Alias: 861 +Alias: cp-is +Alias: csIBM861 + +Name: IBM862 [RFC1345,KXS2] +MIBenum: 2013 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp862 +Alias: 862 +Alias: csPC862LatinHebrew + +Name: IBM863 [RFC1345,KXS2] +MIBenum: 2050 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp863 +Alias: 863 +Alias: csIBM863 + +Name: IBM864 [RFC1345,KXS2] +MIBenum: 2051 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp864 +Alias: csIBM864 + +Name: IBM865 [RFC1345,KXS2] +MIBenum: 2052 +Source: IBM DOS 3.3 Ref (Abridged), 94X9575 (Feb 1987) +Alias: cp865 +Alias: 865 +Alias: csIBM865 + +Name: IBM866 [Pond] +MIBenum: 2086 +Source: IBM NLDG Volume 2 (SE09-8002-03) August 1994 +Alias: cp866 +Alias: 866 +Alias: csIBM866 + +Name: IBM868 [RFC1345,KXS2] +MIBenum: 2053 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP868 +Alias: cp-ar +Alias: csIBM868 + +Name: IBM869 [RFC1345,KXS2] +MIBenum: 2054 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp869 +Alias: 869 +Alias: cp-gr +Alias: csIBM869 + +Name: IBM870 [RFC1345,KXS2] +MIBenum: 2055 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP870 +Alias: ebcdic-cp-roece +Alias: ebcdic-cp-yu +Alias: csIBM870 + +Name: IBM871 [RFC1345,KXS2] +MIBenum: 2056 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP871 +Alias: ebcdic-cp-is +Alias: csIBM871 + +Name: IBM880 [RFC1345,KXS2] +MIBenum: 2057 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp880 +Alias: EBCDIC-Cyrillic +Alias: csIBM880 + +Name: IBM891 [RFC1345,KXS2] +MIBenum: 2058 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp891 +Alias: csIBM891 + +Name: IBM903 [RFC1345,KXS2] +MIBenum: 2059 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp903 +Alias: csIBM903 + +Name: IBM904 [RFC1345,KXS2] +MIBenum: 2060 +Source: IBM NLS RM Vol2 
SE09-8002-01, March 1990 +Alias: cp904 +Alias: 904 +Alias: csIBBM904 + +Name: IBM905 [RFC1345,KXS2] +MIBenum: 2061 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: CP905 +Alias: ebcdic-cp-tr +Alias: csIBM905 + +Name: IBM918 [RFC1345,KXS2] +MIBenum: 2062 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP918 +Alias: ebcdic-cp-ar2 +Alias: csIBM918 + +Name: IBM1026 [RFC1345,KXS2] +MIBenum: 2063 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP1026 +Alias: csIBM1026 + +Name: EBCDIC-AT-DE [RFC1345,KXS2] +MIBenum: 2064 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csIBMEBCDICATDE + +Name: EBCDIC-AT-DE-A [RFC1345,KXS2] +MIBenum: 2065 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICATDEA + +Name: EBCDIC-CA-FR [RFC1345,KXS2] +MIBenum: 2066 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICCAFR + +Name: EBCDIC-DK-NO [RFC1345,KXS2] +MIBenum: 2067 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICDKNO + +Name: EBCDIC-DK-NO-A [RFC1345,KXS2] +MIBenum: 2068 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICDKNOA + +Name: EBCDIC-FI-SE [RFC1345,KXS2] +MIBenum: 2069 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFISE + +Name: EBCDIC-FI-SE-A [RFC1345,KXS2] +MIBenum: 2070 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFISEA + +Name: EBCDIC-FR [RFC1345,KXS2] +MIBenum: 2071 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFR + +Name: EBCDIC-IT [RFC1345,KXS2] +MIBenum: 2072 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICIT + +Name: EBCDIC-PT [RFC1345,KXS2] +MIBenum: 2073 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICPT + +Name: EBCDIC-ES [RFC1345,KXS2] +MIBenum: 2074 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICES + +Name: 
EBCDIC-ES-A [RFC1345,KXS2] +MIBenum: 2075 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICESA + +Name: EBCDIC-ES-S [RFC1345,KXS2] +MIBenum: 2076 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICESS + +Name: EBCDIC-UK [RFC1345,KXS2] +MIBenum: 2077 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICUK + +Name: EBCDIC-US [RFC1345,KXS2] +MIBenum: 2078 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICUS + +Name: UNKNOWN-8BIT [RFC1428] +MIBenum: 2079 +Alias: csUnknown8BiT + +Name: MNEMONIC [RFC1345,KXS2] +MIBenum: 2080 +Source: RFC 1345, also known as "mnemonic+ascii+38" +Alias: csMnemonic + +Name: MNEM [RFC1345,KXS2] +MIBenum: 2081 +Source: RFC 1345, also known as "mnemonic+ascii+8200" +Alias: csMnem + +Name: VISCII [RFC1456] +MIBenum: 2082 +Source: RFC 1456 +Alias: csVISCII + +Name: VIQR [RFC1456] +MIBenum: 2083 +Source: RFC 1456 +Alias: csVIQR + +Name: KOI8-R (preferred MIME name) [RFC1489] +MIBenum: 2084 +Source: RFC 1489, based on GOST-19768-74, ISO-6937/8, + INIS-Cyrillic, ISO-5427. 
+Alias: csKOI8R + +Name: KOI8-U [RFC2319] +MIBenum: 2088 +Source: RFC 2319 + +Name: IBM00858 +MIBenum: 2089 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM00858) [Mahdi] +Alias: CCSID00858 +Alias: CP00858 +Alias: PC-Multilingual-850+euro + +Name: IBM00924 +MIBenum: 2090 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM00924) [Mahdi] +Alias: CCSID00924 +Alias: CP00924 +Alias: ebcdic-Latin9--euro + +Name: IBM01140 +MIBenum: 2091 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01140) [Mahdi] +Alias: CCSID01140 +Alias: CP01140 +Alias: ebcdic-us-37+euro + +Name: IBM01141 +MIBenum: 2092 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01141) [Mahdi] +Alias: CCSID01141 +Alias: CP01141 +Alias: ebcdic-de-273+euro + +Name: IBM01142 +MIBenum: 2093 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01142) [Mahdi] +Alias: CCSID01142 +Alias: CP01142 +Alias: ebcdic-dk-277+euro +Alias: ebcdic-no-277+euro + +Name: IBM01143 +MIBenum: 2094 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01143) [Mahdi] +Alias: CCSID01143 +Alias: CP01143 +Alias: ebcdic-fi-278+euro +Alias: ebcdic-se-278+euro + +Name: IBM01144 +MIBenum: 2095 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01144) [Mahdi] +Alias: CCSID01144 +Alias: CP01144 +Alias: ebcdic-it-280+euro + +Name: IBM01145 +MIBenum: 2096 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01145) [Mahdi] +Alias: CCSID01145 +Alias: CP01145 +Alias: ebcdic-es-284+euro + +Name: IBM01146 +MIBenum: 2097 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01146) [Mahdi] +Alias: CCSID01146 +Alias: CP01146 +Alias: ebcdic-gb-285+euro + +Name: IBM01147 +MIBenum: 2098 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01147) [Mahdi] +Alias: CCSID01147 +Alias: CP01147 +Alias: ebcdic-fr-297+euro + +Name: IBM01148 +MIBenum: 2099 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01148) [Mahdi] 
+Alias: CCSID01148 +Alias: CP01148 +Alias: ebcdic-international-500+euro + +Name: IBM01149 +MIBenum: 2100 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01149) [Mahdi] +Alias: CCSID01149 +Alias: CP01149 +Alias: ebcdic-is-871+euro + +Name: Big5-HKSCS [Yick] +MIBenum: 2101 +Source: See (http://www.iana.org/assignments/charset-reg/Big5-HKSCS) +Alias: None + +Name: IBM1047 [Robrigado] +MIBenum: 2102 +Source: IBM1047 (EBCDIC Latin 1/Open Systems) +http://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf +Alias: IBM-1047 + +Name: PTCP154 [Uskov] +MIBenum: 2103 +Source: See (http://www.iana.org/assignments/charset-reg/PTCP154) +Alias: csPTCP154 +Alias: PT154 +Alias: CP154 +Alias: Cyrillic-Asian + +Name: Amiga-1251 +MIBenum: 2104 +Source: See (http://www.amiga.ultranet.ru/Amiga-1251.html) +Alias: Ami1251 +Alias: Amiga1251 +Alias: Ami-1251 +(Aliases are provided for historical reasons and should not be used) + [Malyshev] + +Name: KOI7-switched +MIBenum: 2105 +Source: See <http://www.iana.org/assignments/charset-reg/KOI7-switched> +Aliases: None + +Name: UNICODE-1-1 [RFC1641] +MIBenum: 1010 +Source: RFC 1641 +Alias: csUnicode11 + +Name: SCSU +MIBenum: 1011 +Source: SCSU See (http://www.iana.org/assignments/charset-reg/SCSU) [Scherer] +Alias: None + +Name: UTF-7 [RFC2152] +MIBenum: 1012 +Source: RFC 2152 +Alias: None + +Name: UTF-16BE [RFC2781] +MIBenum: 1013 +Source: RFC 2781 +Alias: None + +Name: UTF-16LE [RFC2781] +MIBenum: 1014 +Source: RFC 2781 +Alias: None + +Name: UTF-16 [RFC2781] +MIBenum: 1015 +Source: RFC 2781 +Alias: None + +Name: CESU-8 [Phipps] +MIBenum: 1016 +Source: <http://www.unicode.org/unicode/reports/tr26> +Alias: csCESU-8 + +Name: UTF-32 [Davis] +MIBenum: 1017 +Source: <http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: UTF-32BE [Davis] +MIBenum: 1018 +Source: <http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: UTF-32LE [Davis] +MIBenum: 1019 +Source: 
<http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: BOCU-1 [Scherer] +MIBenum: 1020 +Source: http://www.unicode.org/notes/tn6/ +Alias: csBOCU-1 + +Name: UNICODE-1-1-UTF-7 [RFC1642] +MIBenum: 103 +Source: RFC 1642 +Alias: csUnicode11UTF7 + +Name: UTF-8 [RFC3629] +MIBenum: 106 +Source: RFC 3629 +Alias: None + +Name: ISO-8859-13 +MIBenum: 109 +Source: ISO See (http://www.iana.org/assignments/charset-reg/iso-8859-13)[Tumasonis] +Alias: None + +Name: ISO-8859-14 +MIBenum: 110 +Source: ISO See (http://www.iana.org/assignments/charset-reg/iso-8859-14) [Simonsen] +Alias: iso-ir-199 +Alias: ISO_8859-14:1998 +Alias: ISO_8859-14 +Alias: latin8 +Alias: iso-celtic +Alias: l8 + +Name: ISO-8859-15 +MIBenum: 111 +Source: ISO + Please see: <http://www.iana.org/assignments/charset-reg/ISO-8859-15> +Alias: ISO_8859-15 +Alias: Latin-9 + +Name: ISO-8859-16 +MIBenum: 112 +Source: ISO +Alias: iso-ir-226 +Alias: ISO_8859-16:2001 +Alias: ISO_8859-16 +Alias: latin10 +Alias: l10 + +Name: GBK +MIBenum: 113 +Source: Chinese IT Standardization Technical Committee + Please see: <http://www.iana.org/assignments/charset-reg/GBK> +Alias: CP936 +Alias: MS936 +Alias: windows-936 + +Name: GB18030 +MIBenum: 114 +Source: Chinese IT Standardization Technical Committee + Please see: <http://www.iana.org/assignments/charset-reg/GB18030> +Alias: None + +Name: OSD_EBCDIC_DF04_15 +MIBenum: 115 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15> +Alias: None + +Name: OSD_EBCDIC_DF03_IRV +MIBenum: 116 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV> +Alias: None + +Name: OSD_EBCDIC_DF04_1 +MIBenum: 117 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1> +Alias: None + +Name: JIS_Encoding +MIBenum: 16 +Source: JIS X 0202-1991. 
Uses ISO 2022 escape sequences to + shift code sets as documented in JIS X 0202-1991. +Alias: csJISEncoding + +Name: Shift_JIS (preferred MIME name) +MIBenum: 17 +Source: This charset is an extension of csHalfWidthKatakana by + adding graphic characters in JIS X 0208. The CCS's are + JIS X0201:1997 and JIS X0208:1997. The + complete definition is shown in Appendix 1 of JIS + X0208:1997. + This charset can be used for the top-level media type "text". +Alias: MS_Kanji +Alias: csShiftJIS + +Name: Extended_UNIX_Code_Packed_Format_for_Japanese +MIBenum: 18 +Source: Standardized by OSF, UNIX International, and UNIX Systems + Laboratories Pacific. Uses ISO 2022 rules to select + code set 0: US-ASCII (a single 7-bit byte set) + code set 1: JIS X0208-1990 (a double 8-bit byte set) + restricted to A0-FF in both bytes + code set 2: Half Width Katakana (a single 7-bit byte set) + requiring SS2 as the character prefix + code set 3: JIS X0212-1990 (a double 7-bit byte set) + restricted to A0-FF in both bytes + requiring SS3 as the character prefix +Alias: csEUCPkdFmtJapanese +Alias: EUC-JP (preferred MIME name) + +Name: Extended_UNIX_Code_Fixed_Width_for_Japanese +MIBenum: 19 +Source: Used in Japan. Each character is 2 octets. + code set 0: US-ASCII (a single 7-bit byte set) + 1st byte = 00 + 2nd byte = 20-7E + code set 1: JIS X0208-1990 (a double 7-bit byte set) + restricted to A0-FF in both bytes + code set 2: Half Width Katakana (a single 7-bit byte set) + 1st byte = 00 + 2nd byte = A0-FF + code set 3: JIS X0212-1990 (a double 7-bit byte set) + restricted to A0-FF in + the first byte + and 21-7E in the second byte +Alias: csEUCFixWidJapanese + +Name: ISO-10646-UCS-Basic +MIBenum: 1002 +Source: ASCII subset of Unicode. Basic Latin = collection 1 + See ISO 10646, Appendix A +Alias: csUnicodeASCII + +Name: ISO-10646-Unicode-Latin1 +MIBenum: 1003 +Source: ISO Latin-1 subset of Unicode. Basic Latin and Latin-1 + Supplement = collections 1 and 2. See ISO 10646, + Appendix A. 
See RFC 1815. +Alias: csUnicodeLatin1 +Alias: ISO-10646 + +Name: ISO-10646-J-1 +Source: ISO 10646 Japanese, see RFC 1815. + +Name: ISO-Unicode-IBM-1261 +MIBenum: 1005 +Source: IBM Latin-2, -3, -5, Extended Presentation Set, GCSGID: 1261 +Alias: csUnicodeIBM1261 + +Name: ISO-Unicode-IBM-1268 +MIBenum: 1006 +Source: IBM Latin-4 Extended Presentation Set, GCSGID: 1268 +Alias: csUnicodeIBM1268 + +Name: ISO-Unicode-IBM-1276 +MIBenum: 1007 +Source: IBM Cyrillic Greek Extended Presentation Set, GCSGID: 1276 +Alias: csUnicodeIBM1276 + +Name: ISO-Unicode-IBM-1264 +MIBenum: 1008 +Source: IBM Arabic Presentation Set, GCSGID: 1264 +Alias: csUnicodeIBM1264 + +Name: ISO-Unicode-IBM-1265 +MIBenum: 1009 +Source: IBM Hebrew Presentation Set, GCSGID: 1265 +Alias: csUnicodeIBM1265 + +Name: ISO-8859-1-Windows-3.0-Latin-1 [HP-PCL5] +MIBenum: 2000 +Source: Extended ISO 8859-1 Latin-1 for Windows 3.0. + PCL Symbol Set id: 9U +Alias: csWindows30Latin1 + +Name: ISO-8859-1-Windows-3.1-Latin-1 [HP-PCL5] +MIBenum: 2001 +Source: Extended ISO 8859-1 Latin-1 for Windows 3.1. + PCL Symbol Set id: 19U +Alias: csWindows31Latin1 + +Name: ISO-8859-2-Windows-Latin-2 [HP-PCL5] +MIBenum: 2002 +Source: Extended ISO 8859-2. Latin-2 for Windows 3.1. + PCL Symbol Set id: 9E +Alias: csWindows31Latin2 + +Name: ISO-8859-9-Windows-Latin-5 [HP-PCL5] +MIBenum: 2003 +Source: Extended ISO 8859-9. Latin-5 for Windows 3.1 + PCL Symbol Set id: 5T +Alias: csWindows31Latin5 + +Name: Adobe-Standard-Encoding [Adobe] +MIBenum: 2005 +Source: PostScript Language Reference Manual + PCL Symbol Set id: 10J +Alias: csAdobeStandardEncoding + +Name: Ventura-US [HP-PCL5] +MIBenum: 2006 +Source: Ventura US. ASCII plus characters typically used in + publishing, like pilcrow, copyright, registered, trade mark, + section, dagger, and double dagger in the range A0 (hex) + to FF (hex). + PCL Symbol Set id: 14J +Alias: csVenturaUS + +Name: Ventura-International [HP-PCL5] +MIBenum: 2007 +Source: Ventura International. 
ASCII plus coded characters similar + to Roman8. + PCL Symbol Set id: 13J +Alias: csVenturaInternational + +Name: PC8-Danish-Norwegian [HP-PCL5] +MIBenum: 2012 +Source: PC Danish Norwegian + 8-bit PC set for Danish Norwegian + PCL Symbol Set id: 11U +Alias: csPC8DanishNorwegian + +Name: PC8-Turkish [HP-PCL5] +MIBenum: 2014 +Source: PC Latin Turkish. PCL Symbol Set id: 9T +Alias: csPC8Turkish + +Name: IBM-Symbols [IBM-CIDT] +MIBenum: 2015 +Source: Presentation Set, CPGID: 259 +Alias: csIBMSymbols + +Name: IBM-Thai [IBM-CIDT] +MIBenum: 2016 +Source: Presentation Set, CPGID: 838 +Alias: csIBMThai + +Name: HP-Legal [HP-PCL5] +MIBenum: 2017 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 1U +Alias: csHPLegal + +Name: HP-Pi-font [HP-PCL5] +MIBenum: 2018 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 15U +Alias: csHPPiFont + +Name: HP-Math8 [HP-PCL5] +MIBenum: 2019 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 8M +Alias: csHPMath8 + +Name: Adobe-Symbol-Encoding [Adobe] +MIBenum: 2020 +Source: PostScript Language Reference Manual + PCL Symbol Set id: 5M +Alias: csHPPSMath + +Name: HP-DeskTop [HP-PCL5] +MIBenum: 2021 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 7J +Alias: csHPDesktop + +Name: Ventura-Math [HP-PCL5] +MIBenum: 2022 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 6M +Alias: csVenturaMath + +Name: Microsoft-Publishing [HP-PCL5] +MIBenum: 2023 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 6J +Alias: csMicrosoftPublishing + +Name: Windows-31J +MIBenum: 2024 +Source: Windows Japanese. 
A further extension of Shift_JIS + to include NEC special characters (Row 13), NEC + selection of IBM extensions (Rows 89 to 92), and IBM + extensions (Rows 115 to 119). The CCS's are + JIS X0201:1997, JIS X0208:1997, and these extensions. + This charset can be used for the top-level media type "text", + but it is of limited or specialized use (see RFC2278). + PCL Symbol Set id: 19K +Alias: csWindows31J + +Name: GB2312 (preferred MIME name) +MIBenum: 2025 +Source: Chinese for People's Republic of China (PRC) mixed one byte, + two byte set: + 20-7E = one byte ASCII + A1-FE = two byte PRC Kanji + See GB 2312-80 + PCL Symbol Set Id: 18C +Alias: csGB2312 + +Name: Big5 (preferred MIME name) +MIBenum: 2026 +Source: Chinese for Taiwan Multi-byte set. + PCL Symbol Set Id: 18T +Alias: csBig5 + +Name: windows-1250 +MIBenum: 2250 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1250) [Lazhintseva] +Alias: None + +Name: windows-1251 +MIBenum: 2251 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1251) [Lazhintseva] +Alias: None + +Name: windows-1252 +MIBenum: 2252 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1252) [Wendt] +Alias: None + +Name: windows-1253 +MIBenum: 2253 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1253) [Lazhintseva] +Alias: None + +Name: windows-1254 +MIBenum: 2254 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1254) [Lazhintseva] +Alias: None + +Name: windows-1255 +MIBenum: 2255 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1255) [Lazhintseva] +Alias: None + +Name: windows-1256 +MIBenum: 2256 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1256) [Lazhintseva] +Alias: None + +Name: windows-1257 +MIBenum: 2257 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1257) [Lazhintseva] +Alias: None + +Name: windows-1258 +MIBenum: 2258 +Source: Microsoft 
(http://www.iana.org/assignments/charset-reg/windows-1258) [Lazhintseva] +Alias: None + +Name: TIS-620 +MIBenum: 2259 +Source: Thai Industrial Standards Institute (TISI) [Tantsetthi] + +Name: HZ-GB-2312 +MIBenum: 2085 +Source: RFC 1842, RFC 1843 [RFC1842, RFC1843] + + +REFERENCES +---------- + +[RFC1345] Simonsen, K., "Character Mnemonics & Character Sets", + RFC 1345, Rationel Almen Planlaegning, Rationel Almen + Planlaegning, June 1992. + +[RFC1428] Vaudreuil, G., "Transition of Internet Mail from + Just-Send-8 to 8bit-SMTP/MIME", RFC1428, CNRI, February + 1993. + +[RFC1456] Vietnamese Standardization Working Group, "Conventions for + Encoding the Vietnamese Language VISCII: VIetnamese + Standard Code for Information Interchange VIQR: VIetnamese + Quoted-Readable Specification Revision 1.1", RFC 1456, May + 1993. + +[RFC1468] Murai, J., Crispin, M., and E. van der Poel, "Japanese + Character Encoding for Internet Messages", RFC 1468, + Keio University, Panda Programming, June 1993. + +[RFC1489] Chernov, A., "Registration of a Cyrillic Character Set", + RFC1489, RELCOM Development Team, July 1993. + +[RFC1554] Ohta, M., and K. Handa, "ISO-2022-JP-2: Multilingual + Extension of ISO-2022-JP", RFC1554, Tokyo Institute of + Technology, ETL, December 1993. + +[RFC1556] Nussbacher, H., "Handling of Bi-directional Texts in MIME", + RFC1556, Israeli Inter-University, December 1993. + +[RFC1557] Choi, U., Chon, K., and H. Park, "Korean Character Encoding + for Internet Messages", KAIST, Solvit Chosun Media, + December 1993. + +[RFC1641] Goldsmith, D., and M. Davis, "Using Unicode with MIME", + RFC1641, Taligent, Inc., July 1994. + +[RFC1642] Goldsmith, D., and M. Davis, "UTF-7", RFC1642, Taligent, + Inc., July 1994. + +[RFC1815] Ohta, M., "Character Sets ISO-10646 and ISO-10646-J-1", + RFC 1815, Tokyo Institute of Technology, July 1995. 
+ + +[Adobe] Adobe Systems Incorporated, PostScript Language Reference + Manual, second edition, Addison-Wesley Publishing Company, + Inc., 1990. + +[ECMA Registry] ISO-IR: International Register of Escape Sequences + http://www.itscj.ipsj.or.jp/ISO-IE/ Note: The current + registration authority is IPSJ/ITSCJ, Japan. + +[HP-PCL5] Hewlett-Packard Company, "HP PCL 5 Comparison Guide", + (P/N 5021-0329) pp B-13, 1996. + +[IBM-CIDT] IBM Corporation, "ABOUT TYPE: IBM's Technical Reference + for Core Interchange Digitized Type", Publication number + S544-3708-01 + +[RFC1842] Wei, Y., J. Li, and Y. Jiang, "ASCII Printable + Characters-Based Chinese Character Encoding for Internet + Messages", RFC 1842, Harvard University, Rice University, + University of Maryland, August 1995. + +[RFC1843] Lee, F., "HZ - A Data Format for Exchanging Files of + Arbitrarily Mixed Chinese and ASCII Characters", RFC 1843, + Stanford University, August 1995. + +[RFC2152] Goldsmith, D., M. Davis, "UTF-7: A Mail-Safe Transformation + Format of Unicode", RFC 2152, Apple Computer, Inc., + Taligent Inc., May 1997. + +[RFC2279] Yergeau, F., "UTF-8, A Transformation Format of ISO 10646", + RFC 2279, Alis Technologies, January, 1998. + +[RFC2781] Hoffman, P., Yergeau, F., "UTF-16, an encoding of ISO 10646", + RFC 2781, February 2000. + +[RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO 10646", + RFC3629, November 2003. + +PEOPLE +------ + +[KXS2] Keld Simonsen <Keld.Simonsen@dkuug.dk> + +[Choi] Woohyong Choi <whchoi@cosmos.kaist.ac.kr> + +[Davis] Mark Davis, <mark@unicode.org>, April 2002. + +[Lazhintseva] Katya Lazhintseva, <katyal@MICROSOFT.com>, May 1996. + +[Mahdi] Tamer Mahdi, <tamer@ca.ibm.com>, August 2000. + +[Malyshev] Michael Malyshev, <michael_malyshev@mail.ru>, January 2004 + +[Murai] Jun Murai <jun@wide.ad.jp> + +[Nussbacher] Hank Nussbacher, <hank@vm.tau.ac.il> + +[Ohta] Masataka Ohta, <mohta@cc.titech.ac.jp>, July 1995. 
+ +[Phipps] Toby Phipps, <tphipps@peoplesoft.com>, March 2002. + +[Pond] Rick Pond, <rickpond@vnet.ibm.com>, March 1997. + +[Robrigado] Reuel Robrigado, <reuelr@ca.ibm.com>, September 2002. + +[Scherer] Markus Scherer, <markus.scherer@jtcsv.com>, August 2000, + September 2002. + +[Simonsen] Keld Simonsen, <Keld.Simonsen@rap.dk>, August 2000. + +[Tantsetthi] Trin Tantsetthi, <trin@mozart.inet.co.th>, September 1998. + +[Tumasonis] Vladas Tumasonis, <vladas.tumasonis@maf.vu.lt>, August 2000. + +[Uskov] Alexander Uskov, <auskov@idc.kz>, September 2002. + +[Wendt] Chris Wendt, <christw@microsoft.com>, December 1999. + +[Yick] Nicky Yick, <cliac@itsd.gcn.gov.hk>, October 2000. + +[] + + + + + + + diff --git a/Source/WebCore/platform/text/mac/mac-encodings.txt b/Source/WebCore/platform/text/mac/mac-encodings.txt new file mode 100644 index 0000000..bb45e22 --- /dev/null +++ b/Source/WebCore/platform/text/mac/mac-encodings.txt @@ -0,0 +1,45 @@ +# We'd like to eliminate this file. +# It would be nice to get rid of dependence on the TextEncodingConvert entirely. +# Perhaps we can prove these are not used on the web and remove them. +# Or perhaps we can get them added to ICU. + +# The items on the left are names of TEC TextEncoding values (without the leading kTextEncoding). +# The items on the right are IANA character set names. Names listed in character-sets.txt are not +# repeated here; mentioning any one character set from a group in there pulls in all the aliases in +# that group. 
+ +DOSChineseTrad: cp950 +DOSGreek: cp737, ibm737 +EUC_TW: EUC-TW +ISOLatin10: ISO-8859-16 +ISOLatin6: ISO-8859-10 +ISOLatin8: ISO-8859-14 +ISOLatinThai: ISO-8859-11 +ISO_2022_JP_3: ISO-2022-JP-3 +JIS_C6226_78: JIS_C6226-1978 +JIS_X0208_83: JIS_X0208-1983 +JIS_X0208_90: JIS_X0208-1990 +JIS_X0212_90: JIS_X0212-1990 +KOI8_U: KOI8-U +MacArabic: x-mac-arabic +MacChineseSimp: x-mac-chinesesimp, xmacsimpchinese +MacChineseTrad: x-mac-chinesetrad, xmactradchinese +MacCroatian: x-mac-croatian +MacDevanagari: x-mac-devanagari +MacDingbats: x-mac-dingbats +MacFarsi: x-mac-farsi +MacGujarati: x-mac-gujarati +MacGurmukhi: x-mac-gurmukhi +MacHebrew: x-mac-hebrew +MacIcelandic: x-mac-icelandic +MacJapanese: x-mac-japanese +MacKorean: x-mac-korean +MacRomanLatin1: x-mac-roman-latin1 +MacRomanian: x-mac-romanian +MacSymbol: x-mac-symbol +MacThai: x-mac-thai +MacTibetan: x-mac-tibetan +MacVT100: x-mac-vt100 +NextStepLatin: x-nextstep +ShiftJIS_X0213_00: Shift_JIS_X0213-2000 +WindowsKoreanJohab: johab diff --git a/Source/WebCore/platform/text/mac/make-charset-table.pl b/Source/WebCore/platform/text/mac/make-charset-table.pl new file mode 100755 index 0000000..16fd25a --- /dev/null +++ b/Source/WebCore/platform/text/mac/make-charset-table.pl @@ -0,0 +1,225 @@ +#!/usr/bin/perl -w + +# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of Apple Computer, Inc. 
("Apple") nor the names of +# its contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +

use strict;

# Canonical charset name => reference to the sorted list of every name in the
# same character-sets.txt entry. All names of one entry share one list.
my %aliasesFromCharsetsFile;

# Names already appended to $output; used to report duplicate table rows.
my %namesWritten;

# Accumulated rows of the generated CharsetTable initializer.
my $output = "";

# Set to 1 by error(); makes the script exit with status 1 before printing.
my $error = 0;

# Prints a diagnostic to STDERR and records that the run has failed.
# Processing continues so that all problems are reported in one run.
sub error ($)
{
    print STDERR @_, "\n";
    $error = 1;
}

# Appends one { "name", <prefix><encoding> } row to the generated table.
# $flags is accepted for the callers' benefit but is not written out.
sub emit_line
{
    my ($name, $prefix, $encoding, $flags) = @_;

    error "$name shows up twice in output" if $namesWritten{$name};
    $namesWritten{$name} = 1;

    $output .= "        { \"$name\", $prefix$encoding },\n";
}

# Lower-cases a charset name and strips '-' and '_' so it can be compared with
# the keys of %aliasesFromCharsetsFile.
sub canonicalize_platform_name
{
    my ($name) = @_;

    my $canonicalName = lc $name;
    $canonicalName =~ tr/-_//d;
    return $canonicalName;
}

# Reads the platform encodings file ($filename) and emits one table row per
# charset name, each mapped to "<PlatformPrefix><PlatformName>".
#
# Accepted line forms (after '#' comments and trailing blanks are removed):
#   PlatformName[, flags]: IANAName[, alias, ...]
#   PlatformName[, flags]
# Anything else that is not empty is reported as a syntax error.
sub process_platform_encodings
{
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;

    my %seenPlatformNames;
    my %seenIANANames;

    open my $platformEncodings, '<', $filename or die "cannot open $filename: $!";

    while (<$platformEncodings>) {
        chomp;
        s/\#.*$//;
        s/\s+$//;
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
            my %aliases;

            my $PlatformNameWithFlags = $PlatformName;
            if ($flags) {
                $PlatformNameWithFlags .= ", " . $flags;
            } else {
                $flags = "NoEncodingFlags";
            }
            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
            $seenPlatformNames{$PlatformNameWithFlags} = 1;

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
            my @IANANames = split ", ", $IANANames;
            my $firstName = "";
            my $canonicalFirstName = "";
            my $prevName = "";
            for my $name (@IANANames) {
                if ($firstName eq "") {
                    # The first name is emitted verbatim, so mixed case, '-' and '_' are allowed.
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it";
                        next;
                    }
                    $firstName = $name;
                } else {
                    # Every later name must already be canonical and listed in sorted order.
                    if ($name !~ /^[a-z0-9]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
                        next;
                    }
                    if ($name le $prevName) {
                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
                    }
                    $prevName = $name;
                }

                my $canonicalName = canonicalize_platform_name($name);

                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
                $seenIANANames{$canonicalName} = 1;

                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                    $aliases{$alias} = 1;
                }
                for my $otherName (@IANANames) {
                    # The keys of %aliasesFromCharsetsFile are canonical, so the other name has
                    # to be canonicalized before it is looked up; otherwise a first name that
                    # contains upper case, '-' or '_' would never be found.
                    my $canonicalOtherName = canonicalize_platform_name($otherName);
                    next if $canonicalName eq $canonicalOtherName;
                    # Both names share one list reference exactly when they come from the same
                    # character-sets.txt entry. The "le" test reports each pair only once.
                    if ($aliasesFromCharsetsFile{$canonicalOtherName}
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$canonicalOtherName}
                        && $canonicalName le $canonicalOtherName) {
                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
                    }
                }
            }

            # write out
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
            for my $alias (sort keys %aliases) {
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
            }
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
            # A platform name without any charset names: only checked for duplicates.
            my $PlatformName = $1;

            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
        } elsif (/./) {
            error "syntax error in $baseFilename, line $.";
        }
    }

    close $platformEncodings;
}

# Records one character-sets.txt entry: every name of the entry (canonical name
# plus aliases) is mapped to the same sorted list of all names of that entry.
# Does nothing when no canonical name has been seen yet.
sub process_iana_charset
{
    my ($canonical_name, @aliases) = @_;

    return if !$canonical_name;

    my @names = sort($canonical_name, @aliases);

    for my $name (@names) {
        $aliasesFromCharsetsFile{$name} = \@names;
    }
}

# Parses the IANA registry file ($filename): each "Name:" line starts a new
# entry and each "Alias:" line adds a name to the current one. Names are
# lower-cased and reduced to [a-z0-9] before use.
sub process_iana_charsets
{
    my ($filename) = @_;

    open my $charsets, '<', $filename or die "cannot open $filename: $!";

    # Canonical name or alias => canonical name of the entry it was first seen in.
    my %seen;

    my $canonical_name;
    my @aliases;

    # Aliases that may appear under more than one entry without being reported.
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );

    while (<$charsets>) {
        chomp;
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
            $new_canonical_name = lc $new_canonical_name;
            $new_canonical_name =~ tr/a-z0-9//cd;

            error "saw $new_canonical_name twice in character-sets.txt" if $seen{$new_canonical_name};
            $seen{$new_canonical_name} = $new_canonical_name;

            # A new "Name:" line closes the previous entry.
            process_iana_charset $canonical_name, @aliases;

            $canonical_name = $new_canonical_name;
            @aliases = ();
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
            $new_alias = lc $new_alias;
            $new_alias =~ tr/a-z0-9//cd;

            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name" if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;
        }
    }

    # Flush the last entry of the file.
    process_iana_charset $canonical_name, @aliases;

    close $charsets;
}

# Program body
#
# Arguments: <character-sets file> <platform encodings file> <platform prefix>

process_iana_charsets($ARGV[0]);
process_platform_encodings($ARGV[1], $ARGV[2]);

# Nothing is printed when any problem was reported.
exit 1 if $error;

print <<EOF
// File generated by make-charset-table.pl. Do not edit!

#include "config.h"
#include "CharsetData.h"

namespace WebCore {

    const CharsetEntry CharsetTable[] = {
$output
        { 0, 0 }
    };

}
EOF
diff --git a/Source/WebCore/platform/text/qt/TextBoundariesQt.cpp b/Source/WebCore/platform/text/qt/TextBoundariesQt.cpp new file mode 100644 index 0000000..a354ca6 --- /dev/null +++ b/Source/WebCore/platform/text/qt/TextBoundariesQt.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2006 Zack Rusin <zack@kde.org> + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC.
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "TextBoundaries.h" +#include "NotImplemented.h" + +#include <QString> +#include <QChar> + +#include <QDebug> +#include <stdio.h> + +#include <qtextboundaryfinder.h> + +namespace WebCore { + +int findNextWordFromIndex(UChar const* buffer, int len, int position, bool forward) +{ + QString str(reinterpret_cast<QChar const*>(buffer), len); + QTextBoundaryFinder iterator(QTextBoundaryFinder::Word, str); + iterator.setPosition(position >= len ? len - 1 : position); + if (forward) { + int pos = iterator.toNextBoundary(); + while (pos > 0) { + if (QChar(buffer[pos-1]).isLetterOrNumber()) + return pos; + pos = iterator.toNextBoundary(); + } + return len; + } else { + int pos = iterator.toPreviousBoundary(); + while (pos > 0) { + if (QChar(buffer[pos]).isLetterOrNumber()) + return pos; + pos = iterator.toPreviousBoundary(); + } + return 0; + } +} + +void findWordBoundary(UChar const* buffer, int len, int position, int* start, int* end) +{ + QString str(reinterpret_cast<QChar const*>(buffer), len); + QTextBoundaryFinder iterator(QTextBoundaryFinder::Word, str); + iterator.setPosition(position); + *start = position > 0 ? iterator.toPreviousBoundary() : 0; + *end = position == len ? 
len : iterator.toNextBoundary(); +} + +} + diff --git a/Source/WebCore/platform/text/qt/TextBreakIteratorQt.cpp b/Source/WebCore/platform/text/qt/TextBreakIteratorQt.cpp new file mode 100644 index 0000000..b9f5a9e --- /dev/null +++ b/Source/WebCore/platform/text/qt/TextBreakIteratorQt.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "TextBreakIterator.h" + +#include <QtCore/qtextboundaryfinder.h> +#include <qdebug.h> + +// #define DEBUG_TEXT_ITERATORS +#ifdef DEBUG_TEXT_ITERATORS +#define DEBUG qDebug +#else +#define DEBUG if (1) {} else qDebug +#endif + +namespace WebCore { + +#if USE(QT_ICU_TEXT_BREAKING) +const char* currentTextBreakLocaleID() +{ + return QLocale::system().name().toLatin1(); +} +#else + static unsigned char buffer[1024]; + + class TextBreakIterator : public QTextBoundaryFinder { + public: + TextBreakIterator(QTextBoundaryFinder::BoundaryType type, const UChar* string, int length) + : QTextBoundaryFinder(type, (const QChar*)string, length, buffer, sizeof(buffer)) + , length(length) + , string(string) {} + TextBreakIterator() + : QTextBoundaryFinder() + , length(0) + , string(0) {} + + int length; + const UChar* string; + }; + + TextBreakIterator* setUpIterator(TextBreakIterator& iterator, QTextBoundaryFinder::BoundaryType type, const UChar* string, int length) + { + if (!string || !length) + return 0; + + if (iterator.isValid() && type == iterator.type() && length == iterator.length + && memcmp(string, iterator.string, length) == 0) { + iterator.toStart(); + return &iterator; + } + + iterator = TextBreakIterator(type, string, length); + + return &iterator; + } + + TextBreakIterator* wordBreakIterator(const UChar* string, int length) + { + static TextBreakIterator staticWordBreakIterator; + return setUpIterator(staticWordBreakIterator, QTextBoundaryFinder::Word, string, length); + } + + TextBreakIterator* characterBreakIterator(const UChar* string, int length) + { + static TextBreakIterator staticCharacterBreakIterator; + return setUpIterator(staticCharacterBreakIterator, QTextBoundaryFinder::Grapheme, string, length); + } + + TextBreakIterator* cursorMovementIterator(const UChar* string, int length) + { + return characterBreakIterator(string, length); + } + + TextBreakIterator* lineBreakIterator(const UChar* string, int length) + { + 
static TextBreakIterator staticLineBreakIterator; + return setUpIterator(staticLineBreakIterator, QTextBoundaryFinder::Line, string, length); + } + + TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) + { + static TextBreakIterator staticSentenceBreakIterator; + return setUpIterator(staticSentenceBreakIterator, QTextBoundaryFinder::Sentence, string, length); + + } + + int textBreakFirst(TextBreakIterator* bi) + { + bi->toStart(); + DEBUG() << "textBreakFirst" << bi->position(); + return bi->position(); + } + + int textBreakNext(TextBreakIterator* bi) + { + int pos = bi->toNextBoundary(); + DEBUG() << "textBreakNext" << pos; + return pos; + } + + int textBreakPreceding(TextBreakIterator* bi, int pos) + { + bi->setPosition(pos); + int newpos = bi->toPreviousBoundary(); + DEBUG() << "textBreakPreceding" << pos << newpos; + return newpos; + } + + int textBreakFollowing(TextBreakIterator* bi, int pos) + { + bi->setPosition(pos); + int newpos = bi->toNextBoundary(); + DEBUG() << "textBreakFollowing" << pos << newpos; + return newpos; + } + + int textBreakCurrent(TextBreakIterator* bi) + { + return bi->position(); + } + + bool isTextBreak(TextBreakIterator*, int) + { + return true; + } +#endif + +} diff --git a/Source/WebCore/platform/text/qt/TextCodecQt.cpp b/Source/WebCore/platform/text/qt/TextCodecQt.cpp new file mode 100644 index 0000000..1e95d87 --- /dev/null +++ b/Source/WebCore/platform/text/qt/TextCodecQt.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2008 Holger Hans Peter Freyther + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecQt.h" +#include "PlatformString.h" +#include <wtf/text/CString.h> +#include <qset.h> + +namespace WebCore { + +static QSet<QByteArray> *unique_names = 0; + +static const char *getAtomicName(const QByteArray &name) +{ + if (!unique_names) + unique_names = new QSet<QByteArray>; + + unique_names->insert(name); + return unique_names->find(name)->constData(); +} + +void TextCodecQt::registerEncodingNames(EncodingNameRegistrar registrar) +{ + QList<int> mibs = QTextCodec::availableMibs(); +// qDebug() << ">>>>>>>>> registerEncodingNames"; + + for (int i = 0; i < mibs.size(); ++i) { + QTextCodec *c = QTextCodec::codecForMib(mibs.at(i)); + const char *name = getAtomicName(c->name()); + registrar(name, name); +// qDebug() << " " << name << name; + QList<QByteArray> aliases = c->aliases(); + for (int i = 0; i < aliases.size(); ++i) { + const char *a = getAtomicName(aliases.at(i)); +// qDebug() << " (a) " << a << name; + registrar(a, name); + } + } 
+} + +static PassOwnPtr<TextCodec> newTextCodecQt(const TextEncoding& encoding, const void*) +{ + return new TextCodecQt(encoding); +} + +void TextCodecQt::registerCodecs(TextCodecRegistrar registrar) +{ + QList<int> mibs = QTextCodec::availableMibs(); +// qDebug() << ">>>>>>>>> registerCodecs"; + + for (int i = 0; i < mibs.size(); ++i) { + QTextCodec *c = QTextCodec::codecForMib(mibs.at(i)); + const char *name = getAtomicName(c->name()); +// qDebug() << " " << name; + registrar(name, newTextCodecQt, 0); + } +} + +TextCodecQt::TextCodecQt(const TextEncoding& encoding) + : m_encoding(encoding) +{ + m_codec = QTextCodec::codecForName(m_encoding.name()); +} + +TextCodecQt::~TextCodecQt() +{ +} + + +String TextCodecQt::decode(const char* bytes, size_t length, bool flush, bool /*stopOnError*/, bool& sawError) +{ + // We chop input buffer to smaller buffers to avoid excessive memory consumption + // when the input buffer is big. This helps reduce peak memory consumption in + // mobile devices where system RAM is limited. 
+#if OS(SYMBIAN) + static const int MaxInputChunkSize = 32 * 1024; +#else + static const int MaxInputChunkSize = 1024 * 1024; +#endif + const char* buf = bytes; + const char* end = buf + length; + String unicode(""); // a non-null string is expected + + while (buf < end) { + int size = end - buf; + size = qMin(size, MaxInputChunkSize); + QString decoded = m_codec->toUnicode(buf, size, &m_state); + unicode.append(reinterpret_cast_ptr<const UChar*>(decoded.unicode()), decoded.length()); + buf += size; + } + + sawError = m_state.invalidChars != 0; + + if (flush) { + m_state.flags = QTextCodec::DefaultConversion; + m_state.remainingChars = 0; + m_state.invalidChars = 0; + } + + return unicode; +} + +CString TextCodecQt::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + QTextCodec::ConverterState state; + state.flags = QTextCodec::ConversionFlags(QTextCodec::ConvertInvalidToNull | QTextCodec::IgnoreHeader); + + if (!length) + return ""; + + QByteArray ba = m_codec->fromUnicode(reinterpret_cast<const QChar*>(characters), length, &state); + + // If some <b> characters </b> are unencodable, escape them as specified by <b> handling </b> + // We append one valid encoded chunk to a QByteArray at a time. When we encounter an unencodable chunk we + // escape it with getUnencodableReplacement, append it, then move to the next chunk. 
+ if (state.invalidChars) { + state.invalidChars = 0; + state.remainingChars = 0; + int len = 0; + ba.clear(); + for (size_t pos = 0; pos < length; ++pos) { + QByteArray tba = m_codec->fromUnicode(reinterpret_cast<const QChar*>(characters), ++len, &state); + if (state.remainingChars) + continue; + if (state.invalidChars) { + UnencodableReplacementArray replacement; + getUnencodableReplacement(characters[0], handling, replacement); + tba.replace('\0', replacement); + state.invalidChars = 0; + } + ba.append(tba); + characters += len; + len = 0; + state.remainingChars = 0; + } + } + + return CString(ba.constData(), ba.length()); +} + + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/qt/TextCodecQt.h b/Source/WebCore/platform/text/qt/TextCodecQt.h new file mode 100644 index 0000000..f28f0bb --- /dev/null +++ b/Source/WebCore/platform/text/qt/TextCodecQt.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodecQt_h +#define TextCodecQt_h + +#include "TextCodec.h" +#include "TextEncoding.h" +#include <QTextCodec> + +namespace WebCore { + + class TextCodecQt : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + TextCodecQt(const TextEncoding&); + virtual ~TextCodecQt(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + TextEncoding m_encoding; + QTextCodec *m_codec; + QTextCodec::ConverterState m_state; + }; + +} // namespace WebCore + +#endif // TextCodecICU_h diff --git a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp new file mode 100644 index 0000000..68601f9 --- /dev/null +++ b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010, Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "FontTranscoder.h" + +#include "CharacterNames.h" +#include "FontDescription.h" +#include "TextEncoding.h" + +namespace WebCore { + +FontTranscoder::FontTranscoder() +{ + m_converterTypes.add("MS PGothic", BackslashToYenSign); + UChar unicodeNameMSPGothic[] = {0xFF2D, 0xFF33, 0x0020, 0xFF30, 0x30B4, 0x30B7, 0x30C3, 0x30AF}; + m_converterTypes.add(AtomicString(unicodeNameMSPGothic, WTF_ARRAY_LENGTH(unicodeNameMSPGothic)), BackslashToYenSign); + + m_converterTypes.add("MS PMincho", BackslashToYenSign); + UChar unicodeNameMSPMincho[] = {0xFF2D, 0xFF33, 0x0020, 0xFF30, 0x660E, 0x671D}; + m_converterTypes.add(AtomicString(unicodeNameMSPMincho, WTF_ARRAY_LENGTH(unicodeNameMSPMincho)), BackslashToYenSign); + + m_converterTypes.add("MS Gothic", BackslashToYenSign); + UChar unicodeNameMSGothic[] = {0xFF2D, 0xFF33, 0x0020, 0x30B4, 0x30B7, 0x30C3, 0x30AF}; + m_converterTypes.add(AtomicString(unicodeNameMSGothic, WTF_ARRAY_LENGTH(unicodeNameMSGothic)), BackslashToYenSign); + + m_converterTypes.add("MS Mincho", BackslashToYenSign); + UChar unicodeNameMSMincho[] = {0xFF2D, 0xFF33, 0x0020, 0x660E, 0x671D}; + m_converterTypes.add(AtomicString(unicodeNameMSMincho, WTF_ARRAY_LENGTH(unicodeNameMSMincho)), BackslashToYenSign); + + m_converterTypes.add("Meiryo", BackslashToYenSign); + UChar unicodeNameMeiryo[] = {0x30E1, 0x30A4, 0x30EA, 0x30AA}; + m_converterTypes.add(AtomicString(unicodeNameMeiryo, WTF_ARRAY_LENGTH(unicodeNameMeiryo)), BackslashToYenSign); +} + +FontTranscoder::ConverterType FontTranscoder::converterType(const FontDescription& fontDescription, const TextEncoding* encoding) const +{ + const AtomicString& fontFamily = fontDescription.family().family().string(); + if (!fontFamily.isNull()) { + HashMap<AtomicString, ConverterType>::const_iterator found = m_converterTypes.find(fontFamily); + if (found != m_converterTypes.end()) + return found->second; + } + + // IE's default fonts for Japanese encodings change backslashes into yen 
signs. + // We emulate this behavior only when no font is explicitly specified. + if (encoding && encoding->backslashAsCurrencySymbol() != '\\' && !fontDescription.isSpecifiedFont()) + return BackslashToYenSign; + + return NoConversion; +} + +void FontTranscoder::convert(String& text, const FontDescription& fontDescription, const TextEncoding* encoding) const +{ + switch (converterType(fontDescription, encoding)) { + case BackslashToYenSign: { + // FIXME: TextEncoding.h has similar code. We need to factor them out. + text.replace('\\', yenSign); + break; + } + case NoConversion: + default: + ASSERT_NOT_REACHED(); + } +} + +bool FontTranscoder::needsTranscoding(const FontDescription& fontDescription, const TextEncoding* encoding) const +{ + ConverterType type = converterType(fontDescription, encoding); + return type != NoConversion; +} + +FontTranscoder& fontTranscoder() +{ + static FontTranscoder* transcoder = new FontTranscoder; + return *transcoder; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/transcoder/FontTranscoder.h b/Source/WebCore/platform/text/transcoder/FontTranscoder.h new file mode 100644 index 0000000..67db977 --- /dev/null +++ b/Source/WebCore/platform/text/transcoder/FontTranscoder.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010, Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FontTranscoder_h +#define FontTranscoder_h + +#include <wtf/HashMap.h> +#include <wtf/Noncopyable.h> +#include <wtf/text/AtomicStringHash.h> + +namespace WebCore { + +class FontDescription; +class TextEncoding; + +class FontTranscoder : public Noncopyable { +public: + void convert(String& text, const FontDescription&, const TextEncoding* = 0) const; + bool needsTranscoding(const FontDescription&, const TextEncoding* = 0) const; + +private: + FontTranscoder(); + ~FontTranscoder(); // Not implemented to make sure nobody accidentally calls delete -- WebCore does not delete singletons. 
+ + enum ConverterType { + NoConversion, BackslashToYenSign, + }; + + ConverterType converterType(const FontDescription&, const TextEncoding*) const; + + HashMap<AtomicString, ConverterType> m_converterTypes; + + friend FontTranscoder& fontTranscoder(); +}; + +FontTranscoder& fontTranscoder(); + +} // namespace WebCore + +#endif // FontTranscoder_h diff --git a/Source/WebCore/platform/text/win/TextBreakIteratorInternalICUWin.cpp b/Source/WebCore/platform/text/win/TextBreakIteratorInternalICUWin.cpp new file mode 100644 index 0000000..e417e17 --- /dev/null +++ b/Source/WebCore/platform/text/win/TextBreakIteratorInternalICUWin.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +namespace WebCore { + +const char* currentSearchLocaleID() +{ + // FIXME: Should use system locale. + return ""; +} + +const char* currentTextBreakLocaleID() +{ + // Using en_US_POSIX now so word selection in address field works as expected as before (double-clicking + // in a URL selects a word delimited by periods rather than selecting the entire URL). 
+ // However, this is not entirely correct - we should honor the system locale in the normal case. + // FIXME: <rdar://problem/6786703> Should use system locale for text breaking + return "en_US_POSIX"; +} + +} diff --git a/Source/WebCore/platform/text/wince/TextBreakIteratorWinCE.cpp b/Source/WebCore/platform/text/wince/TextBreakIteratorWinCE.cpp new file mode 100644 index 0000000..96488c0 --- /dev/null +++ b/Source/WebCore/platform/text/wince/TextBreakIteratorWinCE.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +#include "config.h" +#include "TextBreakIterator.h" + +#include "PlatformString.h" +#include <wtf/StdLibExtras.h> +#include <wtf/unicode/Unicode.h> + +using namespace WTF::Unicode; + +namespace WebCore { + +// Hack, not entirely correct +static inline bool isCharStop(UChar c) +{ + CharCategory charCategory = category(c); + return charCategory != Mark_NonSpacing && (charCategory != Other_Surrogate || (c < 0xd800 || c >= 0xdc00)); +} + +static inline bool isLineStop(UChar c) +{ + return category(c) != Separator_Line; +} + +static inline bool isSentenceStop(UChar c) +{ + return isPunct(c); +} + +class TextBreakIterator { +public: + void reset(const UChar* str, int len) + { + string = str; + length = len; + currentPos = 0; + } + int first() + { + currentPos = 0; + return currentPos; + } + int last() + { + currentPos = length; + return currentPos; + } + virtual int next() = 0; + virtual int previous() = 0; + int following(int position) + { + currentPos = position; + return next(); + } + int preceding(int position) + { + currentPos = position; + return previous(); + } + + int currentPos; + const UChar* string; + int length; +}; + +struct WordBreakIterator: TextBreakIterator { + virtual int next(); + virtual int previous(); +}; + +struct CharBreakIterator: TextBreakIterator { + virtual int next(); + virtual int previous(); +}; + +struct LineBreakIterator: TextBreakIterator { + virtual int next(); + virtual int previous(); +}; + +struct SentenceBreakIterator : TextBreakIterator { + virtual int next(); + virtual int previous(); +}; + +int WordBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isSpace(string[currentPos])) + break; + if (isSpace(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int WordBreakIterator::previous() +{ + if (!currentPos) { + currentPos = -1; + return currentPos; + } + bool 
haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isSpace(string[currentPos])) + break; + if (isSpace(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +int CharBreakIterator::next() +{ + if (currentPos >= length) + return -1; + ++currentPos; + while (currentPos < length && !isCharStop(string[currentPos])) + ++currentPos; + return currentPos; +} + +int CharBreakIterator::previous() +{ + if (currentPos <= 0) + return -1; + if (currentPos > length) + currentPos = length; + --currentPos; + while (currentPos > 0 && !isCharStop(string[currentPos])) + --currentPos; + return currentPos; +} + +int LineBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isLineStop(string[currentPos])) + break; + if (isLineStop(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int LineBreakIterator::previous() +{ + if (!currentPos) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isLineStop(string[currentPos])) + break; + if (isLineStop(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +int SentenceBreakIterator::next() +{ + if (currentPos == length) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos < length) { + if (haveSpace && !isSentenceStop(string[currentPos])) + break; + if (isSentenceStop(string[currentPos])) + haveSpace = true; + ++currentPos; + } + return currentPos; +} + +int SentenceBreakIterator::previous() +{ + if (!currentPos) { + currentPos = -1; + return currentPos; + } + bool haveSpace = false; + while (currentPos > 0) { + if (haveSpace && !isSentenceStop(string[currentPos])) + break; + if (isSentenceStop(string[currentPos])) + haveSpace = true; + --currentPos; + } + return currentPos; +} + +TextBreakIterator* 
wordBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(WordBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* characterBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(CharBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* lineBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(LineBreakIterator , iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) +{ + DEFINE_STATIC_LOCAL(SentenceBreakIterator, iterator, ()); + iterator.reset(string, length); + return &iterator; +} + +int textBreakFirst(TextBreakIterator* breakIterator) +{ + return breakIterator->first(); +} + +int textBreakLast(TextBreakIterator* breakIterator) +{ + return breakIterator->last(); +} + +int textBreakNext(TextBreakIterator* breakIterator) +{ + return breakIterator->next(); +} + +int textBreakPrevious(TextBreakIterator* breakIterator) +{ + return breakIterator->previous(); +} + +int textBreakPreceding(TextBreakIterator* breakIterator, int position) +{ + return breakIterator->preceding(position); +} + +int textBreakFollowing(TextBreakIterator* breakIterator, int position) +{ + return breakIterator->following(position); +} + +int textBreakCurrent(TextBreakIterator* breakIterator) +{ + return breakIterator->currentPos; +} + +bool isTextBreak(TextBreakIterator*, int) +{ + return true; +} + +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ + return characterBreakIterator(string, length); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/wince/TextCodecWinCE.cpp b/Source/WebCore/platform/text/wince/TextCodecWinCE.cpp new file mode 100644 index 0000000..3532e74 --- /dev/null +++ b/Source/WebCore/platform/text/wince/TextCodecWinCE.cpp @@ -0,0 +1,389 @@ +/* + * Copyright (C) 2007-2009 
Torch Mobile, Inc. All rights reserved. + * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * This library is distributed in the hope that i will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include "config.h" +#include "TextCodecWinCE.h" + +#include "FontCache.h" +#include "PlatformString.h" +#include <mlang.h> +#include <winbase.h> +#include <winnls.h> +#include <wtf/HashMap.h> +#include <wtf/HashSet.h> +#include <wtf/text/CString.h> +#include <wtf/text/StringConcatenate.h> +#include <wtf/text/StringHash.h> +#include <wtf/unicode/UTF8.h> + +namespace WebCore { + +struct CharsetInfo { + CString m_name; + String m_friendlyName; + UINT m_codePage; + Vector<CString> m_aliases; +}; + +class LanguageManager { +private: + LanguageManager(); + + friend LanguageManager& languageManager(); +}; + +// Usage: a lookup table used to get CharsetInfo with code page ID. +// Key: code page ID. Value: charset information. 
+static HashMap<UINT, CString>& codePageCharsets() +{ + static HashMap<UINT, CString> cc; + return cc; +} + +static HashMap<String, CharsetInfo>& knownCharsets() +{ + static HashMap<String, CharsetInfo> kc; + return kc; +} + +// Usage: a map that stores charsets that are supported by system. Sorted by name. +// Key: charset. Value: code page ID. +typedef HashSet<String> CharsetSet; +static CharsetSet& supportedCharsets() +{ + static CharsetSet sl; + return sl; +} + +static LanguageManager& languageManager() +{ + static LanguageManager lm; + return lm; +} + +LanguageManager::LanguageManager() +{ + IEnumCodePage* enumInterface; + IMultiLanguage* mli = FontCache::getMultiLanguageInterface(); + if (mli && S_OK == mli->EnumCodePages(MIMECONTF_BROWSER, &enumInterface)) { + MIMECPINFO cpInfo; + ULONG ccpInfo; + while (S_OK == enumInterface->Next(1, &cpInfo, &ccpInfo) && ccpInfo) { + if (!IsValidCodePage(cpInfo.uiCodePage)) + continue; + + HashMap<UINT, CString>::iterator i = codePageCharsets().find(cpInfo.uiCodePage); + + CString name(String(cpInfo.wszWebCharset).latin1()); + if (i == codePageCharsets().end()) { + CharsetInfo info; + info.m_codePage = cpInfo.uiCodePage; + knownCharsets().set(name.data(), info); + i = codePageCharsets().set(cpInfo.uiCodePage, name).first; + } + if (i != codePageCharsets().end()) { + HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(String(i->second.data(), i->second.length())); + ASSERT(j != knownCharsets().end()); + CharsetInfo& info = j->second; + info.m_name = i->second.data(); + info.m_friendlyName = cpInfo.wszDescription; + info.m_aliases.append(name); + info.m_aliases.append(String(cpInfo.wszHeaderCharset).latin1()); + info.m_aliases.append(String(cpInfo.wszBodyCharset).latin1()); + String cpName = makeString("cp", String::number(cpInfo.uiCodePage)); + info.m_aliases.append(cpName.latin1()); + supportedCharsets().add(i->second.data()); + } + } + enumInterface->Release(); + } +} + +static UINT getCodePage(const char* 
name) +{ + if (!strcmp(name, "UTF-8")) + return CP_UTF8; + + // Explicitly use a "const" reference to fix the silly VS build error + // saying "==" is not found for const_iterator and iterator + const HashMap<String, CharsetInfo>& charsets = knownCharsets(); + HashMap<String, CharsetInfo>::const_iterator i = charsets.find(name); + return i == charsets.end() ? CP_ACP : i->second.m_codePage; +} + +static PassOwnPtr<TextCodec> newTextCodecWinCE(const TextEncoding& encoding, const void*) +{ + return new TextCodecWinCE(getCodePage(encoding.name())); +} + +TextCodecWinCE::TextCodecWinCE(UINT codePage) + : m_codePage(codePage) +{ +} + +TextCodecWinCE::~TextCodecWinCE() +{ +} + +void TextCodecWinCE::registerBaseEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("UTF-8", "UTF-8"); +} + +void TextCodecWinCE::registerBaseCodecs(TextCodecRegistrar registrar) +{ + registrar("UTF-8", newTextCodecWinCE, 0); +} + +void TextCodecWinCE::registerExtendedEncodingNames(EncodingNameRegistrar registrar) +{ + languageManager(); + for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { + HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); + if (j != knownCharsets().end()) { + registrar(j->second.m_name.data(), j->second.m_name.data()); + for (Vector<CString>::const_iterator alias = j->second.m_aliases.begin(); alias != j->second.m_aliases.end(); ++alias) + registrar(alias->data(), j->second.m_name.data()); + } + } +} + +void TextCodecWinCE::registerExtendedCodecs(TextCodecRegistrar registrar) +{ + languageManager(); + for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { + HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); + if (j != knownCharsets().end()) + registrar(j->second.m_name.data(), newTextCodecWinCE, 0); + } +} + +static DWORD getCodePageFlags(UINT codePage) +{ + if (codePage == CP_UTF8) + return MB_ERR_INVALID_CHARS; + + if (codePage == 42) // 
Symbol + return 0; + + // Microsoft says the flag must be 0 for the following code pages + if (codePage > 50000) { + if ((codePage >= 50220 && codePage <= 50222) + || codePage == 50225 + || codePage == 50227 + || codePage == 50229 + || codePage == 52936 + || codePage == 54936 + || (codePage >= 57002 && codePage <= 57001) + || codePage == 65000 // UTF-7 + ) + return 0; + } + + return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS; +} + +static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length) +{ + for (const char* bytesEnd = bytes + length; bytes < bytesEnd; ++bytes) { + if (*bytes & 0x80) + break; + } + return bytes; +} + +static void decode(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left, bool canBeFirstTime, bool& sawInvalidChar) +{ + *left = length; + if (!bytes || !length) + return; + + DWORD flags = getCodePageFlags(codePage); + + if (codePage == CP_UTF8) { + if (canBeFirstTime) { + // Handle BOM. + if (length > 3) { + if (bytes[0] == (char)0xEF && bytes[1] == (char)0xBB && bytes[2] == (char)0xBF) { + // BOM found! + length -= 3; + bytes += 3; + *left = length; + } + } else if (bytes[0] == 0xEF && (length < 2 || bytes[1] == (char)0xBB) && (length < 3 || bytes[2] == (char)0xBF)) { + if (length == 3) + *left = 0; + return; + } + } + + // Process ASCII characters at beginning. 
+ const char* firstNonAsciiChar = findFirstNonAsciiCharacter(bytes, length); + int numAsciiCharacters = firstNonAsciiChar - bytes; + if (numAsciiCharacters) { + result.append(bytes, numAsciiCharacters); + length -= numAsciiCharacters; + if (!length) { + *left = 0; + return; + } + bytes = firstNonAsciiChar; + } + + int oldSize = result.size(); + result.resize(oldSize + length); + UChar* resultStart = result.data() + oldSize; + const char* sourceStart = bytes; + const char* const sourceEnd = bytes + length; + for (;;) { + using namespace WTF::Unicode; + ConversionResult convRes = convertUTF8ToUTF16(&sourceStart + , sourceEnd + , &resultStart + , result.data() + result.size() + , true); + + // FIXME: is it possible? + if (convRes == targetExhausted && sourceStart < sourceEnd) { + oldSize = result.size(); + result.resize(oldSize + 256); + resultStart = result.data() + oldSize; + continue; + } + + if (convRes != conversionOK) + sawInvalidChar = true; + + break; + } + + *left = sourceEnd - sourceStart; + result.resize(resultStart - result.data()); + } else { + int testLength = length; + int untestedLength = length; + for (;;) { + int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0); + + if (resultLength > 0) { + int oldSize = result.size(); + result.resize(oldSize + resultLength); + + MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength); + + if (testLength == untestedLength) { + *left = length - testLength; + break; + } + untestedLength -= testLength; + length -= testLength; + bytes += testLength; + } else { + untestedLength = testLength - 1; + if (!untestedLength) { + *left = length; + break; + } + } + testLength = (untestedLength + 1) / 2; + } + } +} + +String TextCodecWinCE::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + if (!m_decodeBuffer.isEmpty()) { + m_decodeBuffer.append(bytes, length); + bytes = m_decodeBuffer.data(); + length = 
m_decodeBuffer.size(); + } + + size_t left; + Vector<UChar, 8192> result; + for (;;) { + bool sawInvalidChar = false; + WebCore::decode(result, m_codePage, bytes, length, &left, m_decodeBuffer.isEmpty(), sawInvalidChar); + if (!left) + break; + + if (!sawInvalidChar && !flush && left < 16) + break; + + result.append(L'?'); + sawError = true; + if (stopOnError) + return String::adopt(result); + + if (left == 1) + break; + + bytes += length - left + 1; + length = left - 1; + } + if (left && !flush) { + if (m_decodeBuffer.isEmpty()) + m_decodeBuffer.append(bytes + length - left, left); + else { + memmove(m_decodeBuffer.data(), bytes + length - left, left); + m_decodeBuffer.resize(left); + } + } else + m_decodeBuffer.clear(); + + return String::adopt(result); +} + +CString TextCodecWinCE::encode(const UChar* characters, size_t length, UnencodableHandling) +{ + if (!characters || !length) + return CString(); + + DWORD flags = m_codePage == CP_UTF8 ? 0 : WC_COMPOSITECHECK; + + int resultLength = WideCharToMultiByte(m_codePage, flags, characters, length, 0, 0, 0, 0); + + // FIXME: We need to implement UnencodableHandling: QuestionMarksForUnencodables, EntitiesForUnencodables, and URLEncodedEntitiesForUnencodables. 
+ + if (resultLength <= 0) + return "?"; + + char* characterBuffer; + CString result = CString::newUninitialized(resultLength, characterBuffer); + + WideCharToMultiByte(m_codePage, flags, characters, length, characterBuffer, resultLength, 0, 0); + + return result; +} + +void TextCodecWinCE::enumerateSupportedEncodings(EncodingReceiver& receiver) +{ + languageManager(); + for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { + HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); + if (j != knownCharsets().end() && !receiver.receive(j->second.m_name.data(), j->second.m_friendlyName.charactersWithNullTermination(), j->second.m_codePage)) + break; + } +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/wince/TextCodecWinCE.h b/Source/WebCore/platform/text/wince/TextCodecWinCE.h new file mode 100644 index 0000000..8d332a6 --- /dev/null +++ b/Source/WebCore/platform/text/wince/TextCodecWinCE.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * Copyright (C) 2007-2009 Torch Mobile, Inc. + * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextCodecWinCE_h +#define TextCodecWinCE_h + +#include "PlatformString.h" +#include "TextCodec.h" +#include "TextEncoding.h" +#include <wtf/Vector.h> +#include <windows.h> + +namespace WebCore { + +class TextCodecWinCE : public TextCodec { +public: + static void registerBaseEncodingNames(EncodingNameRegistrar); + static void registerBaseCodecs(TextCodecRegistrar); + + static void registerExtendedEncodingNames(EncodingNameRegistrar); + static void registerExtendedCodecs(TextCodecRegistrar); + + TextCodecWinCE(UINT codePage); + virtual ~TextCodecWinCE(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + struct EncodingInfo { + String m_encoding; + String m_friendlyName; + }; + + struct EncodingReceiver { + // Return false to stop enumerating. + virtual bool receive(const char* encoding, const wchar_t* friendlyName, unsigned int codePage) = 0; + }; + + static void enumerateSupportedEncodings(EncodingReceiver& receiver); + +private: + UINT m_codePage; + Vector<char> m_decodeBuffer; +}; + +} // namespace WebCore + +#endif // TextCodecWinCE_h |