summary | refs | log | tree | commit | diff | stats
path: root/WebCore/platform/text
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/platform/text')
-rw-r--r-- WebCore/platform/text/AtomicString.cpp 2
-rw-r--r-- WebCore/platform/text/Base64.cpp 6
-rw-r--r-- WebCore/platform/text/BidiResolver.h 19
-rw-r--r-- WebCore/platform/text/CString.cpp 25
-rw-r--r-- WebCore/platform/text/CString.h 17
-rw-r--r-- WebCore/platform/text/PlatformString.h 12
-rw-r--r-- WebCore/platform/text/String.cpp 9
-rw-r--r-- WebCore/platform/text/StringImpl.cpp 87
-rw-r--r-- WebCore/platform/text/StringImpl.h 40
-rw-r--r-- WebCore/platform/text/TextBreakIterator.h 12
-rw-r--r-- WebCore/platform/text/TextBreakIteratorICU.cpp 116
-rw-r--r-- WebCore/platform/text/TextCodecICU.cpp 2
-rw-r--r-- WebCore/platform/text/TextDecoder.cpp 129
-rw-r--r-- WebCore/platform/text/TextDecoder.h 64
-rw-r--r-- WebCore/platform/text/TextEncoding.cpp 26
-rw-r--r-- WebCore/platform/text/TextEncoding.h 11
-rw-r--r-- WebCore/platform/text/TextEncodingDetector.h 48
-rw-r--r-- WebCore/platform/text/TextEncodingDetectorICU.cpp 129
-rw-r--r-- WebCore/platform/text/TextEncodingDetectorNone.cpp 51
-rw-r--r-- WebCore/platform/text/TextEncodingRegistry.h 7
-rw-r--r-- WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp 36
-rw-r--r-- WebCore/platform/text/cf/StringImplCF.cpp 131
-rw-r--r-- WebCore/platform/text/mac/ShapeArabic.c 2
-rw-r--r-- WebCore/platform/text/mac/StringImplMac.mm 8
-rw-r--r-- WebCore/platform/text/mac/StringMac.mm 1
-rw-r--r-- WebCore/platform/text/qt/TextBreakIteratorQt.cpp 10
26 files changed, 714 insertions, 286 deletions
diff --git a/WebCore/platform/text/AtomicString.cpp b/WebCore/platform/text/AtomicString.cpp
index 5f9abfd..d85f5ee 100644
--- a/WebCore/platform/text/AtomicString.cpp
+++ b/WebCore/platform/text/AtomicString.cpp
@@ -101,7 +101,7 @@ static inline bool equal(StringImpl* string, const UChar* characters, unsigned l
if (string->length() != length)
return false;
-#if PLATFORM(ARM)
+#if PLATFORM(ARM) || PLATFORM(SH4)
const UChar* stringCharacters = string->characters();
for (unsigned i = 0; i != length; ++i) {
if (*stringCharacters++ != *characters++)
diff --git a/WebCore/platform/text/Base64.cpp b/WebCore/platform/text/Base64.cpp
index 920fa89..be19164 100644
--- a/WebCore/platform/text/Base64.cpp
+++ b/WebCore/platform/text/Base64.cpp
@@ -97,8 +97,8 @@ void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
count += 4;
}
out[didx++] = base64EncMap[(data[sidx] >> 2) & 077];
- out[didx++] = base64EncMap[(data[sidx + 1] >> 4) & 017 | (data[sidx] << 4) & 077];
- out[didx++] = base64EncMap[(data[sidx + 2] >> 6) & 003 | (data[sidx + 1] << 2) & 077];
+ out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)];
+ out[didx++] = base64EncMap[((data[sidx + 2] >> 6) & 003) | ((data[sidx + 1] << 2) & 077)];
out[didx++] = base64EncMap[data[sidx + 2] & 077];
sidx += 3;
}
@@ -110,7 +110,7 @@ void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
out[didx++] = base64EncMap[(data[sidx] >> 2) & 077];
if (sidx < len - 1) {
- out[didx++] = base64EncMap[(data[sidx + 1] >> 4) & 017 | (data[sidx] << 4) & 077];
+ out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)];
out[didx++] = base64EncMap[(data[sidx + 1] << 2) & 077];
} else
out[didx++] = base64EncMap[(data[sidx] << 4) & 077];
diff --git a/WebCore/platform/text/BidiResolver.h b/WebCore/platform/text/BidiResolver.h
index ffd3d51..8288be4 100644
--- a/WebCore/platform/text/BidiResolver.h
+++ b/WebCore/platform/text/BidiResolver.h
@@ -254,7 +254,16 @@ template <class Iterator, class Run>
void BidiResolver<Iterator, Run>::appendRun()
{
if (!emptyRun && !eor.atEnd()) {
- addRun(new Run(sor.offset(), eor.offset() + 1, context(), m_direction));
+ unsigned startOffset = sor.offset();
+ unsigned endOffset = eor.offset();
+
+ if (!endOfLine.atEnd() && endOffset >= endOfLine.offset()) {
+ reachedEndOfLine = true;
+ endOffset = endOfLine.offset();
+ }
+
+ if (endOffset >= startOffset)
+ addRun(new Run(startOffset, endOffset + 1, context(), m_direction));
eor.increment();
sor = eor;
@@ -352,8 +361,8 @@ void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Dire
m_direction = LeftToRight;
}
} else if (m_status.eor == ArabicNumber
- || m_status.eor == EuropeanNumber && (m_status.lastStrong != LeftToRight || from == RightToLeft)
- || m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && from == RightToLeft) {
+ || (m_status.eor == EuropeanNumber && (m_status.lastStrong != LeftToRight || from == RightToLeft))
+ || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && from == RightToLeft)) {
appendRun();
m_direction = RightToLeft;
}
@@ -722,8 +731,8 @@ void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, boo
case WhiteSpaceNeutral:
case OtherNeutral:
if (m_status.eor == ArabicNumber
- || m_status.eor == EuropeanNumber && (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft)
- || m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && context()->dir() == RightToLeft) {
+ || (m_status.eor == EuropeanNumber && (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft))
+ || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && context()->dir() == RightToLeft)) {
// Terminate the run before the neutrals.
appendRun();
// Begin an R run for the neutrals.
diff --git a/WebCore/platform/text/CString.cpp b/WebCore/platform/text/CString.cpp
index 8e68628..90990f8 100644
--- a/WebCore/platform/text/CString.cpp
+++ b/WebCore/platform/text/CString.cpp
@@ -47,8 +47,8 @@ void CString::init(const char* str, unsigned length)
return;
m_buffer = CStringBuffer::create(length + 1);
- memcpy(m_buffer->data(), str, length);
- m_buffer->data()[length] = '\0';
+ memcpy(m_buffer->mutableData(), str, length);
+ m_buffer->mutableData()[length] = '\0';
}
const char* CString::data() const
@@ -61,7 +61,7 @@ char* CString::mutableData()
copyBufferIfNeeded();
if (!m_buffer)
return 0;
- return m_buffer->data();
+ return m_buffer->mutableData();
}
unsigned CString::length() const
@@ -73,7 +73,7 @@ CString CString::newUninitialized(size_t length, char*& characterBuffer)
{
CString result;
result.m_buffer = CStringBuffer::create(length + 1);
- char* bytes = result.m_buffer->data();
+ char* bytes = result.m_buffer->mutableData();
bytes[length] = '\0';
characterBuffer = bytes;
return result;
@@ -87,7 +87,7 @@ void CString::copyBufferIfNeeded()
int len = m_buffer->length();
RefPtr<CStringBuffer> m_temp = m_buffer;
m_buffer = CStringBuffer::create(len);
- memcpy(m_buffer->data(), m_temp->data(), len);
+ memcpy(m_buffer->mutableData(), m_temp->data(), len);
}
bool operator==(const CString& a, const CString& b)
@@ -99,17 +99,4 @@ bool operator==(const CString& a, const CString& b)
return !strncmp(a.data(), b.data(), min(a.length(), b.length()));
}
-PassRefPtr<SharedBuffer> CString::releaseBuffer()
-{
- if (!m_buffer)
- return 0;
-
- copyBufferIfNeeded();
-
- RefPtr<SharedBuffer> result = m_buffer->releaseBuffer();
- m_buffer = 0;
- return result.release();
-}
-
-
-}
+} // namespace WebCore
diff --git a/WebCore/platform/text/CString.h b/WebCore/platform/text/CString.h
index 09f112f..f084ddf 100644
--- a/WebCore/platform/text/CString.h
+++ b/WebCore/platform/text/CString.h
@@ -36,15 +36,15 @@ namespace WebCore {
class CStringBuffer : public RefCounted<CStringBuffer> {
public:
- static PassRefPtr<CStringBuffer> create(unsigned length) { return adoptRef(new CStringBuffer(length)); }
-
- char* data() { return m_vector.data(); }
- size_t length() const { return m_vector.size(); }
+ const char* data() { return m_vector.data(); }
+ size_t length() { return m_vector.size(); }
- PassRefPtr<SharedBuffer> releaseBuffer() { return SharedBuffer::adoptVector(m_vector); }
-
private:
+ friend class CString;
+
+ static PassRefPtr<CStringBuffer> create(unsigned length) { return adoptRef(new CStringBuffer(length)); }
CStringBuffer(unsigned length) : m_vector(length) { }
+ char* mutableData() { return m_vector.data(); }
Vector<char> m_vector;
};
@@ -56,6 +56,7 @@ namespace WebCore {
CString() { }
CString(const char*);
CString(const char*, unsigned length);
+ CString(CStringBuffer* buffer) : m_buffer(buffer) { }
static CString newUninitialized(size_t length, char*& characterBuffer);
const char* data() const;
@@ -63,8 +64,8 @@ namespace WebCore {
unsigned length() const;
bool isNull() const { return !m_buffer; }
-
- PassRefPtr<SharedBuffer> releaseBuffer();
+
+ CStringBuffer* buffer() const { return m_buffer.get(); }
private:
void copyBufferIfNeeded();
diff --git a/WebCore/platform/text/PlatformString.h b/WebCore/platform/text/PlatformString.h
index 35d3079..a1541d2 100644
--- a/WebCore/platform/text/PlatformString.h
+++ b/WebCore/platform/text/PlatformString.h
@@ -27,15 +27,18 @@
#include "StringImpl.h"
-#include <wtf/PassRefPtr.h>
+#ifdef __OBJC__
+#include <objc/objc.h>
+#endif
#if USE(JSC)
#include <runtime/Identifier.h>
#else
-// runtime/Identifier.h includes HashMap.h and HashSet.h. We explicitly include
-// them in the case of non-JSC builds to keep things consistent.
+// runtime/Identifier.h brings in a variety of wtf headers. We explicitly
+// include them in the case of non-JSC builds to keep things consistent.
#include <wtf/HashMap.h>
#include <wtf/HashSet.h>
+#include <wtf/OwnPtr.h>
#endif
#if PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN))
@@ -228,6 +231,9 @@ public:
static String fromUTF8(const char*, size_t);
static String fromUTF8(const char*);
+ // Tries to decode the passed-in string as UTF-8, but will fall back to Latin-1 if the string is not valid UTF-8.
+ static String fromUTF8WithLatin1Fallback(const char*, size_t);
+
// Determines the writing direction using the Unicode Bidi Algorithm rules P2 and P3.
WTF::Unicode::Direction defaultWritingDirection() const { return m_impl ? m_impl->defaultWritingDirection() : WTF::Unicode::LeftToRight; }
diff --git a/WebCore/platform/text/String.cpp b/WebCore/platform/text/String.cpp
index 638e45f..733b661 100644
--- a/WebCore/platform/text/String.cpp
+++ b/WebCore/platform/text/String.cpp
@@ -623,6 +623,15 @@ String String::fromUTF8(const char* string)
return UTF8Encoding().decode(string, strlen(string));
}
+String String::fromUTF8WithLatin1Fallback(const char* string, size_t size)
+{
+ String result = fromUTF8(string, size);
+ if (!result)
+ result = String(string, size);
+
+ return result;
+}
+
#if USE(JSC)
String::String(const Identifier& str)
{
diff --git a/WebCore/platform/text/StringImpl.cpp b/WebCore/platform/text/StringImpl.cpp
index 0556f8e..6bba990 100644
--- a/WebCore/platform/text/StringImpl.cpp
+++ b/WebCore/platform/text/StringImpl.cpp
@@ -2,7 +2,7 @@
* Copyright (C) 1999 Lars Knoll (knoll@kde.org)
* (C) 1999 Antti Koivisto (koivisto@kde.org)
* (C) 2001 Dirk Mueller ( mueller@kde.org )
- * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
* Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net)
*
* This library is free software; you can redistribute it and/or
@@ -54,6 +54,27 @@ static inline void deleteUCharVector(const UChar* p)
fastFree(const_cast<UChar*>(p));
}
+// Some of the factory methods create buffers using fastMalloc.
+// We must ensure that all allocations of StringImpl are allocated using
+// fastMalloc so that we don't have mis-matched frees. We accomplish
+// this by overriding the new and delete operators.
+void* StringImpl::operator new(size_t size, void* address)
+{
+ if (address)
+ return address; // Allocating using an internal buffer
+ return fastMalloc(size);
+}
+
+void* StringImpl::operator new(size_t size)
+{
+ return fastMalloc(size);
+}
+
+void StringImpl::operator delete(void* address)
+{
+ fastFree(address);
+}
+
// This constructor is used only to create the empty string.
StringImpl::StringImpl()
: m_length(0)
@@ -61,6 +82,7 @@ StringImpl::StringImpl()
, m_hash(0)
, m_inTable(false)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
// Ensure that the hash is computed so that AtomicStringHash can call existingHash()
// with impunity. The empty string is special because it is never entered into
@@ -76,6 +98,7 @@ inline StringImpl::StringImpl(const UChar* characters, unsigned length)
, m_hash(0)
, m_inTable(false)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
UChar* data = newUCharVector(length);
memcpy(data, characters, length * sizeof(UChar));
@@ -87,6 +110,7 @@ inline StringImpl::StringImpl(const StringImpl& str, WithTerminatingNullCharacte
, m_hash(str.m_hash)
, m_inTable(false)
, m_hasTerminatingNullCharacter(true)
+ , m_bufferIsInternal(false)
{
UChar* data = newUCharVector(str.m_length + 1);
memcpy(data, str.m_data, str.m_length * sizeof(UChar));
@@ -99,6 +123,7 @@ inline StringImpl::StringImpl(const char* characters, unsigned length)
, m_hash(0)
, m_inTable(false)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
ASSERT(characters);
ASSERT(length);
@@ -117,6 +142,7 @@ inline StringImpl::StringImpl(UChar* characters, unsigned length, AdoptBuffer)
, m_hash(0)
, m_inTable(false)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
ASSERT(characters);
ASSERT(length);
@@ -128,6 +154,7 @@ StringImpl::StringImpl(const UChar* characters, unsigned length, unsigned hash)
, m_hash(hash)
, m_inTable(true)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
ASSERT(hash);
ASSERT(characters);
@@ -144,6 +171,7 @@ StringImpl::StringImpl(const char* characters, unsigned length, unsigned hash)
, m_hash(hash)
, m_inTable(true)
, m_hasTerminatingNullCharacter(false)
+ , m_bufferIsInternal(false)
{
ASSERT(hash);
ASSERT(characters);
@@ -161,7 +189,8 @@ StringImpl::~StringImpl()
{
if (m_inTable)
AtomicString::remove(this);
- deleteUCharVector(m_data);
+ if (!m_bufferIsInternal)
+ deleteUCharVector(m_data);
}
StringImpl* StringImpl::empty()
@@ -907,26 +936,8 @@ WTF::Unicode::Direction StringImpl::defaultWritingDirection()
}
// This is a hot function because it's used when parsing HTML.
-PassRefPtr<StringImpl> StringImpl::createStrippingNullCharacters(const UChar* characters, unsigned length)
+PassRefPtr<StringImpl> StringImpl::createStrippingNullCharactersSlowCase(const UChar* characters, unsigned length)
{
- ASSERT(characters);
- ASSERT(length);
-
- // Optimize for the case where there are no Null characters by quickly
- // searching for nulls, and then using StringImpl::create, which will
- // memcpy the whole buffer. This is faster than assigning character by
- // character during the loop.
-
- // Fast case.
- int foundNull = 0;
- for (unsigned i = 0; !foundNull && i < length; i++) {
- int c = characters[i]; // more efficient than using UChar here (at least on Intel Mac OS)
- foundNull |= !c;
- }
- if (!foundNull)
- return StringImpl::create(characters, length);
-
- // Slow case.
StringBuffer strippedCopy(length);
unsigned strippedLength = 0;
for (unsigned i = 0; i < length; i++) {
@@ -958,24 +969,44 @@ PassRefPtr<StringImpl> StringImpl::create(const UChar* characters, unsigned leng
{
if (!characters || !length)
return empty();
- return adoptRef(new StringImpl(characters, length));
+
+ // Allocate a single buffer large enough to contain the StringImpl
+ // struct as well as the data which it contains. This removes one
+ // heap allocation from this call.
+ size_t size = sizeof(StringImpl) + length * sizeof(UChar);
+ char* buffer = static_cast<char*>(fastMalloc(size));
+ UChar* data = reinterpret_cast<UChar*>(buffer + sizeof(StringImpl));
+ memcpy(data, characters, length * sizeof(UChar));
+ StringImpl* string = new (buffer) StringImpl(data, length, AdoptBuffer());
+ string->m_bufferIsInternal = true;
+ return adoptRef(string);
}
PassRefPtr<StringImpl> StringImpl::create(const char* characters, unsigned length)
{
if (!characters || !length)
return empty();
- return adoptRef(new StringImpl(characters, length));
+
+ // Allocate a single buffer large enough to contain the StringImpl
+ // struct as well as the data which it contains. This removes one
+ // heap allocation from this call.
+ size_t size = sizeof(StringImpl) + length * sizeof(UChar);
+ char* buffer = static_cast<char*>(fastMalloc(size));
+ UChar* data = reinterpret_cast<UChar*>(buffer + sizeof(StringImpl));
+ for (unsigned i = 0; i != length; ++i) {
+ unsigned char c = characters[i];
+ data[i] = c;
+ }
+ StringImpl* string = new (buffer) StringImpl(data, length, AdoptBuffer());
+ string->m_bufferIsInternal = true;
+ return adoptRef(string);
}
PassRefPtr<StringImpl> StringImpl::create(const char* string)
{
if (!string)
return empty();
- unsigned length = strlen(string);
- if (!length)
- return empty();
- return adoptRef(new StringImpl(string, length));
+ return create(string, strlen(string));
}
PassRefPtr<StringImpl> StringImpl::createWithTerminatingNullCharacter(const StringImpl& string)
@@ -985,7 +1016,7 @@ PassRefPtr<StringImpl> StringImpl::createWithTerminatingNullCharacter(const Stri
PassRefPtr<StringImpl> StringImpl::copy()
{
- return adoptRef(new StringImpl(m_data, m_length));
+ return create(m_data, m_length);
}
} // namespace WebCore
diff --git a/WebCore/platform/text/StringImpl.h b/WebCore/platform/text/StringImpl.h
index 281aa37..1242f27 100644
--- a/WebCore/platform/text/StringImpl.h
+++ b/WebCore/platform/text/StringImpl.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) 1999 Lars Knoll (knoll@kde.org)
- * Copyright (C) 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
@@ -24,7 +24,7 @@
#include <limits.h>
#include <wtf/ASCIICType.h>
-#include <wtf/Forward.h>
+#include <wtf/PassRefPtr.h>
#include <wtf/RefCounted.h>
#include <wtf/Vector.h>
#include <wtf/unicode/Unicode.h>
@@ -166,12 +166,25 @@ public:
operator NSString*();
#endif
+ void operator delete(void*);
+
private:
+ // Allocation from a custom buffer is only allowed internally to avoid
+ // mismatched allocators. Callers should use create().
+ void* operator new(size_t size);
+ void* operator new(size_t size, void* address);
+
+ static PassRefPtr<StringImpl> createStrippingNullCharactersSlowCase(const UChar*, unsigned length);
+
unsigned m_length;
const UChar* m_data;
mutable unsigned m_hash;
bool m_inTable;
bool m_hasTerminatingNullCharacter;
+ // In some cases, we allocate the StringImpl struct and its data
+ // within a single heap buffer. In this case, the m_data pointer
+ // is an "internal buffer", and does not need to be deallocated.
+ bool m_bufferIsInternal;
};
bool equal(StringImpl*, StringImpl*);
@@ -274,6 +287,29 @@ static inline bool isSpaceOrNewline(UChar c)
return c <= 0x7F ? WTF::isASCIISpace(c) : WTF::Unicode::direction(c) == WTF::Unicode::WhiteSpaceNeutral;
}
+// This is a hot function because it's used when parsing HTML.
+inline PassRefPtr<StringImpl> StringImpl::createStrippingNullCharacters(const UChar* characters, unsigned length)
+{
+ ASSERT(characters);
+ ASSERT(length);
+
+ // Optimize for the case where there are no Null characters by quickly
+ // searching for nulls, and then using StringImpl::create, which will
+ // memcpy the whole buffer. This is faster than assigning character by
+ // character during the loop.
+
+ // Fast case.
+ int foundNull = 0;
+ for (unsigned i = 0; !foundNull && i < length; i++) {
+ int c = characters[i]; // more efficient than using UChar here (at least on Intel Mac OS)
+ foundNull |= !c;
+ }
+ if (!foundNull)
+ return StringImpl::create(characters, length);
+
+ return StringImpl::createStrippingNullCharactersSlowCase(characters, length);
+}
+
}
namespace WTF {
diff --git a/WebCore/platform/text/TextBreakIterator.h b/WebCore/platform/text/TextBreakIterator.h
index 64717a4..7b3b963 100644
--- a/WebCore/platform/text/TextBreakIterator.h
+++ b/WebCore/platform/text/TextBreakIterator.h
@@ -29,7 +29,19 @@ namespace WebCore {
class TextBreakIterator;
// Note: The returned iterator is good only until you get another iterator.
+
+ // Iterates over "extended grapheme clusters", as defined in UAX #29.
+ // Note that platform implementations may be less sophisticated - e.g. ICU prior to
+ // version 4.0 only supports "legacy grapheme clusters".
+ // Use this for general text processing, e.g. string truncation.
TextBreakIterator* characterBreakIterator(const UChar*, int length);
+
+ // This is similar to character break iterator in most cases, but is subject to
+ // platform UI conventions. One notable example where this can be different
+ // from character break iterator is Thai prepend characters, see bug 24342.
+ // Use this for insertion point and selection manipulations.
+ TextBreakIterator* cursorMovementIterator(const UChar*, int length);
+
TextBreakIterator* wordBreakIterator(const UChar*, int length);
TextBreakIterator* lineBreakIterator(const UChar*, int length);
TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
diff --git a/WebCore/platform/text/TextBreakIteratorICU.cpp b/WebCore/platform/text/TextBreakIteratorICU.cpp
index 9941f58..c4fc1b0 100644
--- a/WebCore/platform/text/TextBreakIteratorICU.cpp
+++ b/WebCore/platform/text/TextBreakIteratorICU.cpp
@@ -22,6 +22,7 @@
#include "config.h"
#include "TextBreakIterator.h"
+#include "PlatformString.h"
#include "TextBreakIteratorInternalICU.h"
#include <unicode/ubrk.h>
@@ -114,4 +115,119 @@ bool isTextBreak(TextBreakIterator* bi, int pos)
return ubrk_isBoundary(bi, pos);
}
+#ifndef BUILDING_ON_TIGER
+static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
+ const char* breakRules, const UChar* string, int length)
+{
+ if (!string)
+ return 0;
+
+ if (!createdIterator) {
+ UParseError parseStatus;
+ UErrorCode openStatus = U_ZERO_ERROR;
+ String rules(breakRules);
+ iterator = static_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
+ createdIterator = true;
+ ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
+ }
+ if (!iterator)
+ return 0;
+
+ UErrorCode setTextStatus = U_ZERO_ERROR;
+ ubrk_setText(iterator, string, length, &setTextStatus);
+ if (U_FAILURE(setTextStatus))
+ return 0;
+
+ return iterator;
+}
+#endif // BUILDING_ON_TIGER
+
+TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
+{
+#ifdef BUILDING_ON_TIGER
+ // ICU 3.2 cannot compile the below rules.
+ return characterBreakIterator(string, length);
+#else
+ // This rule set is based on character-break iterator rules of ICU 4.0
+ // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
+ // The major differences from the original ones are listed below:
+ // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
+ // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
+ // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
+ // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
+ static const char* kRules =
+ "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
+ "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
+ "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
+ "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
+ "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks];"
+ "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
+ "$L = [\\p{Grapheme_Cluster_Break = L}];"
+ "$V = [\\p{Grapheme_Cluster_Break = V}];"
+ "$T = [\\p{Grapheme_Cluster_Break = T}];"
+ "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
+ "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
+ "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
+ "$HinV = \\u094D;" // Devanagari Sign Virama
+ "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
+ "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
+ "$BenV = \\u09CD;" // Bengali Sign Virama
+ "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
+ "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
+ "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
+ "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
+ "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
+ "$GujV = \\u0ACD;" // Gujarati Sign Virama
+ "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
+ "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
+ "$OriV = \\u0B4D;" // Oriya Sign Virama
+ "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
+ "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
+ "$TelV = \\u0C4D;" // Telugu Sign Virama
+ "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Au,...,Ha (NOTE(review): Ka is U+0C15; other scripts start this range at Ka - confirm U+0C14 is intended)
+ "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
+ "$KanV = \\u0CCD;" // Kannada Sign Virama
+ "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
+ "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
+ "$MalV = \\u0D4D;" // Malayalam Sign Virama
+ "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
+ "!!chain;"
+ "!!forward;"
+ "$CR $LF;"
+ "$L ($L | $V | $LV | $LVT);"
+ "($LV | $V) ($V | $T);"
+ "($LVT | $T) $T;"
+ "[^$Control $CR $LF] $Extend;"
+ "[^$Control $CR $LF] $SpacingMark;"
+ "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
+ "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
+ "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
+ "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
+ "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
+ "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
+ "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
+ "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
+ "!!reverse;"
+ "$LF $CR;"
+ "($L | $V | $LV | $LVT) $L;"
+ "($V | $T) ($LV | $V);"
+ "$T ($LVT | $T);"
+ "$Extend [^$Control $CR $LF];"
+ "$SpacingMark [^$Control $CR $LF];"
+ "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
+ "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
+ "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
+ "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
+ "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
+ "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
+ "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
+ "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
+ "!!safe_reverse;"
+ "!!safe_forward;";
+ static bool createdCursorMovementIterator = false;
+ static TextBreakIterator* staticCursorMovementIterator;
+ return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
+#endif // BUILDING_ON_TIGER
+}
+
}
diff --git a/WebCore/platform/text/TextCodecICU.cpp b/WebCore/platform/text/TextCodecICU.cpp
index 72d45ad..72054fa 100644
--- a/WebCore/platform/text/TextCodecICU.cpp
+++ b/WebCore/platform/text/TextCodecICU.cpp
@@ -334,7 +334,7 @@ String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool s
// <http://bugs.webkit.org/show_bug.cgi?id=17014>
// Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
- if (m_encoding == "GBK" || m_encoding == "gb18030")
+ if (strcmp(m_encoding.name(), "GBK") == 0 || strcasecmp(m_encoding.name(), "gb18030") == 0)
resultString.replace(0xE5E5, ideographicSpace);
return resultString;
diff --git a/WebCore/platform/text/TextDecoder.cpp b/WebCore/platform/text/TextDecoder.cpp
deleted file mode 100644
index e39a6b7..0000000
--- a/WebCore/platform/text/TextDecoder.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-#include "TextDecoder.h"
-
-#include "TextEncodingRegistry.h"
-
-// FIXME: Would be nice to also handle BOM for UTF-7 and UTF-32.
-
-namespace WebCore {
-
-TextDecoder::TextDecoder(const TextEncoding& encoding)
- : m_encoding(encoding)
- , m_checkedForBOM(false)
- , m_numBufferedBytes(0)
-{
-}
-
-void TextDecoder::reset(const TextEncoding& encoding)
-{
- m_encoding = encoding;
- m_codec.clear();
- m_checkedForBOM = false;
- m_numBufferedBytes = 0;
-}
-
-String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
-{
- ASSERT(!m_checkedForBOM);
-
- // Check to see if we found a BOM.
- size_t numBufferedBytes = m_numBufferedBytes;
- size_t buf1Len = numBufferedBytes;
- size_t buf2Len = length;
- const unsigned char* buf1 = m_bufferedBytes;
- const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
- unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
- unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
- unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
- unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
-
- const TextEncoding* encodingConsideringBOM = &m_encoding;
- bool foundBOM = true;
- size_t lengthOfBOM = 0;
- if (c1 == 0xFF && c2 == 0xFE) {
- if (c3 != 0 || c4 != 0) {
- encodingConsideringBOM = &UTF16LittleEndianEncoding();
- lengthOfBOM = 2;
- } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
- encodingConsideringBOM = &UTF32LittleEndianEncoding();
- lengthOfBOM = 4;
- } else
- foundBOM = false;
- } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
- encodingConsideringBOM = &UTF8Encoding();
- lengthOfBOM = 3;
- } else if (c1 == 0xFE && c2 == 0xFF) {
- encodingConsideringBOM = &UTF16BigEndianEncoding();
- lengthOfBOM = 2;
- } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
- encodingConsideringBOM = &UTF32BigEndianEncoding();
- lengthOfBOM = 4;
- } else
- foundBOM = false;
-
- if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
- // Continue to look for the BOM.
- memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
- m_numBufferedBytes += length;
- return "";
- }
-
- // Done checking for BOM.
- m_codec.set(newTextCodec(*encodingConsideringBOM).release());
- if (!m_codec)
- return String();
- m_checkedForBOM = true;
-
- // Skip the BOM.
- if (foundBOM) {
- ASSERT(numBufferedBytes < lengthOfBOM);
- size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
- ASSERT(numUnbufferedBOMBytes <= length);
-
- data += numUnbufferedBOMBytes;
- length -= numUnbufferedBOMBytes;
- numBufferedBytes = 0;
- m_numBufferedBytes = 0;
- }
-
- // Handle case where we have some buffered bytes to deal with.
- if (numBufferedBytes) {
- char bufferedBytes[sizeof(m_bufferedBytes)];
- memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
- m_numBufferedBytes = 0;
-
- String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError);
- if (stopOnError && sawError)
- return bufferedResult;
- return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError);
- }
-
- return m_codec->decode(data, length, flush, stopOnError, sawError);
-}
-
-} // namespace WebCore
diff --git a/WebCore/platform/text/TextDecoder.h b/WebCore/platform/text/TextDecoder.h
deleted file mode 100644
index 171cb59..0000000
--- a/WebCore/platform/text/TextDecoder.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef TextDecoder_h
-#define TextDecoder_h
-
-#include "PlatformString.h"
-#include "TextCodec.h"
-#include "TextEncoding.h"
-#include <wtf/OwnPtr.h>
-
-namespace WebCore {
-
- class TextCodec;
-
- class TextDecoder {
- public:
- TextDecoder(const TextEncoding&);
- void reset(const TextEncoding&);
- const TextEncoding& encoding() const { return m_encoding; };
-
- String decode(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
- {
- if (!m_checkedForBOM)
- return checkForBOM(data, length, flush, stopOnError, sawError);
- return m_codec->decode(data, length, flush, stopOnError, sawError);
- }
-
- private:
- String checkForBOM(const char*, size_t length, bool flush, bool stopOnError, bool& sawError);
-
- TextEncoding m_encoding;
- OwnPtr<TextCodec> m_codec;
-
- bool m_checkedForBOM;
- unsigned char m_numBufferedBytes;
- unsigned char m_bufferedBytes[3];
- };
-
-} // namespace WebCore
-
-#endif // TextDecoder_h
diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp
index 063d96b..ed58412 100644
--- a/WebCore/platform/text/TextEncoding.cpp
+++ b/WebCore/platform/text/TextEncoding.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
+ * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
*
* Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,6 @@
#include "CString.h"
#include "PlatformString.h"
#include "TextCodec.h"
-#include "TextDecoder.h"
#include "TextEncodingRegistry.h"
#if USE(ICU_UNICODE)
#include <unicode/unorm.h>
@@ -73,7 +72,7 @@ String TextEncoding::decode(const char* data, size_t length, bool stopOnError, b
if (!m_name)
return String();
- return TextDecoder(*this).decode(data, length, true, stopOnError, sawError);
+ return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
@@ -165,10 +164,23 @@ UChar TextEncoding::backslashAsCurrencySymbol() const
bool TextEncoding::isNonByteBasedEncoding() const
{
+ if (noExtendedTextEncodingNameUsed()) { // In this case only the UTF-16 encodings are compared; the UTF-32 comparisons below are skipped.
+ return *this == UTF16LittleEndianEncoding()
+ || *this == UTF16BigEndianEncoding();
+ }
+
return *this == UTF16LittleEndianEncoding()
- || *this == UTF16BigEndianEncoding()
- || *this == UTF32BigEndianEncoding()
- || *this == UTF32LittleEndianEncoding();
+ || *this == UTF16BigEndianEncoding()
+ || *this == UTF32BigEndianEncoding()
+ || *this == UTF32LittleEndianEncoding(); // True for any of the four UTF-16/UTF-32 encodings.
+}
+
+bool TextEncoding::isUTF7Encoding() const
+{
+ if (noExtendedTextEncodingNameUsed()) // Answer false without calling UTF7Encoding() in this case.
+ return false;
+
+ return *this == UTF7Encoding(); // Otherwise compare this encoding against UTF-7 directly.
}
const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
@@ -185,7 +197,7 @@ const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
// but it's fraught with problems and we'd rather steer clear of it.
const TextEncoding& TextEncoding::encodingForFormSubmission() const
{
- if (isNonByteBasedEncoding() || *this == UTF7Encoding())
+ if (isNonByteBasedEncoding() || isUTF7Encoding()) // UTF-16/UTF-32 and UTF-7 are replaced by UTF-8; every other encoding is returned unchanged.
return UTF8Encoding();
return *this;
}
diff --git a/WebCore/platform/text/TextEncoding.h b/WebCore/platform/text/TextEncoding.h
index b2bb816..b3909f7 100644
--- a/WebCore/platform/text/TextEncoding.h
+++ b/WebCore/platform/text/TextEncoding.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -45,12 +45,14 @@ namespace WebCore {
bool usesVisualOrdering() const;
bool isJapanese() const;
- PassRefPtr<StringImpl> displayString(PassRefPtr<StringImpl> str) const {
+ PassRefPtr<StringImpl> displayString(PassRefPtr<StringImpl> str) const // Returns str with each backslash replaced by m_backslashAsCurrencySymbol; str is returned as-is when it is null or the symbol is a backslash.
+ {
if (m_backslashAsCurrencySymbol == '\\' || !str)
return str;
return str->replace('\\', m_backslashAsCurrencySymbol);
}
- void displayBuffer(UChar* characters, unsigned len) const {
+ void displayBuffer(UChar* characters, unsigned len) const
+ {
if (m_backslashAsCurrencySymbol == '\\')
return;
for (unsigned i = 0; i < len; ++i) {
@@ -72,10 +74,11 @@ namespace WebCore {
private:
UChar backslashAsCurrencySymbol() const;
+ bool isNonByteBasedEncoding() const;
+ bool isUTF7Encoding() const;
const char* m_name;
UChar m_backslashAsCurrencySymbol;
- bool isNonByteBasedEncoding() const;
};
inline bool operator==(const TextEncoding& a, const TextEncoding& b) { return a.name() == b.name(); }
diff --git a/WebCore/platform/text/TextEncodingDetector.h b/WebCore/platform/text/TextEncodingDetector.h
new file mode 100644
index 0000000..9f16ab0
--- /dev/null
+++ b/WebCore/platform/text/TextEncodingDetector.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2009 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TextEncodingDetector_h
+#define TextEncodingDetector_h
+
+namespace WebCore {
+
+ class TextEncoding;
+
+ // Given a sequence of bytes in |data| of length |len| and an optional
+ // hintEncodingName, detect the most likely character encoding.
+ // The way hintEncodingName is used is up to an implementation.
+ // Currently, the only caller sets it to the parent frame encoding.
+ bool detectTextEncoding(const char* data, size_t len,
+ const char* hintEncodingName,
+ TextEncoding* detectedEncoding);
+
+} // namespace WebCore
+
+#endif
diff --git a/WebCore/platform/text/TextEncodingDetectorICU.cpp b/WebCore/platform/text/TextEncodingDetectorICU.cpp
new file mode 100644
index 0000000..26c997e
--- /dev/null
+++ b/WebCore/platform/text/TextEncodingDetectorICU.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextEncodingDetector.h"
+
+#include "TextEncoding.h"
+#include "UnusedParam.h"
+
+#ifndef BUILDING_ON_TIGER
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#endif
+
+namespace WebCore {
+
+// Detects the most likely character encoding of the |len| bytes at |data|
+// using the ICU charset detector and stores it in |*detectedEncoding|.
+//
+// If |hintEncodingName| is non-null and one of the ICU candidates whose
+// confidence is at least the minimum threshold names the same encoding as the
+// hint, the hint is chosen. Otherwise the top-ranked ICU candidate is used.
+//
+// |*detectedEncoding| is reset to a default-constructed TextEncoding on entry,
+// so that is its value whenever false is returned. Returns false when ICU
+// reports an error (or always, on Tiger builds, which lack the detector) and
+// true otherwise. Note that true is also returned when ICU yields no
+// candidates at all; |*detectedEncoding| is then built from a null name.
+bool detectTextEncoding(const char* data, size_t len,
+                        const char* hintEncodingName,
+                        TextEncoding* detectedEncoding)
+{
+    *detectedEncoding = TextEncoding();
+#ifdef BUILDING_ON_TIGER
+    // Tiger came with ICU 3.2 and does not have the encoding detector.
+    UNUSED_PARAM(data);
+    UNUSED_PARAM(len);
+    UNUSED_PARAM(hintEncodingName);
+    return false;
+#else
+    int matchesCount = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector* detector = ucsdet_open(&status);
+    if (U_FAILURE(status))
+        return false;
+    ucsdet_enableInputFilter(detector, true);
+    ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
+    if (U_FAILURE(status)) {
+        // The detector was opened successfully above, so it has to be closed
+        // on this error path as well; returning directly would leak it.
+        ucsdet_close(detector);
+        return false;
+    }
+
+    // FIXME: A few things we can do other than improving
+    // the ICU detector itself.
+    // 1. Use ucsdet_detectAll and pick the most likely one given
+    // "the context" (parent-encoding, referrer encoding, etc).
+    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
+    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
+    // encoding with a highest confidence among the detector-specific
+    // limited set of candidate encodings.
+    // Below is a partial implementation of the first part of what's outlined
+    // above.
+    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
+    if (U_FAILURE(status)) {
+        ucsdet_close(detector);
+        return false;
+    }
+
+    const char* encoding = 0;
+    if (hintEncodingName) {
+        TextEncoding hintEncoding(hintEncodingName);
+        // 10 is the minimum confidence value consistent with the codepoint
+        // allocation in a given encoding. The size of a chunk passed to
+        // us varies even for the same html file (apparently depending on
+        // the network load). When we're given a rather short chunk, we
+        // don't have a sufficiently reliable signal other than the fact that
+        // the chunk is consistent with a set of encodings. So, instead of
+        // setting an arbitrary threshold, we have to scan all the encodings
+        // consistent with the data.
+        const int32_t kThreshold = 10;
+        for (int i = 0; i < matchesCount; ++i) {
+            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
+            if (U_FAILURE(status)) {
+                // Skip a candidate we cannot query, but keep scanning.
+                status = U_ZERO_ERROR;
+                continue;
+            }
+            // Stop at the first candidate below the threshold.
+            if (confidence < kThreshold)
+                break;
+            const char* matchEncoding = ucsdet_getName(matches[i], &status);
+            if (U_FAILURE(status)) {
+                status = U_ZERO_ERROR;
+                continue;
+            }
+            if (TextEncoding(matchEncoding) == hintEncoding) {
+                encoding = hintEncodingName;
+                break;
+            }
+        }
+    }
+    // If no match is found so far, just pick the top match.
+    // This can happen, say, when a parent frame in EUC-JP refers to
+    // a child frame in Shift_JIS and both frames do NOT specify the encoding
+    // making us resort to auto-detection (when it IS turned on).
+    if (!encoding && matchesCount > 0)
+        encoding = ucsdet_getName(matches[0], &status);
+    if (U_SUCCESS(status)) {
+        *detectedEncoding = TextEncoding(encoding);
+        ucsdet_close(detector);
+        return true;
+    }
+    ucsdet_close(detector);
+    return false;
+#endif
+}
+
+}
diff --git a/WebCore/platform/text/TextEncodingDetectorNone.cpp b/WebCore/platform/text/TextEncodingDetectorNone.cpp
new file mode 100644
index 0000000..2655f08
--- /dev/null
+++ b/WebCore/platform/text/TextEncodingDetectorNone.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2009 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextEncodingDetector.h"
+
+#include "TextEncoding.h"
+#include "UnusedParam.h"
+
+namespace WebCore {
+
+// Stub used when no encoding detector is built in: performs no detection.
+// Resets |*detectedEncoding| to a default-constructed TextEncoding and always
+// returns false. |data|, |len| and |hintEncodingName| are ignored.
+bool detectTextEncoding(const char* data, size_t len,
+                        const char* hintEncodingName,
+                        TextEncoding* detectedEncoding)
+{
+    // Each UNUSED_PARAM is terminated with a semicolon so the statements stay
+    // well-formed even if the macro expands to a bare expression.
+    UNUSED_PARAM(data);
+    UNUSED_PARAM(len);
+    UNUSED_PARAM(hintEncodingName);
+
+    *detectedEncoding = TextEncoding();
+    return false;
+}
+
+}
diff --git a/WebCore/platform/text/TextEncodingRegistry.h b/WebCore/platform/text/TextEncodingRegistry.h
index 5ca2039..d204734 100644
--- a/WebCore/platform/text/TextEncodingRegistry.h
+++ b/WebCore/platform/text/TextEncodingRegistry.h
@@ -34,11 +34,8 @@ namespace WebCore {
class TextCodec;
class TextEncoding;
- // Only TextEncoding and TextDecoder should use this function directly.
- // - Use TextDecoder::decode to decode, since it handles BOMs.
- // - Use TextEncoding::decode to decode if you have all the data at once.
- // It's implemented by calling TextDecoder::decode so works just as well.
- // - Use TextEncoding::encode to encode, since it takes care of normalization.
+ // Use TextResourceDecoder::decode to decode resources, since it handles BOMs.
+ // Use TextEncoding::encode to encode, since it takes care of normalization.
std::auto_ptr<TextCodec> newTextCodec(const TextEncoding&);
// Only TextEncoding should use this function directly.
diff --git a/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp b/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp
new file mode 100644
index 0000000..9bebe74
--- /dev/null
+++ b/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2007, The Android Open Source Project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextBreakIteratorInternalICU.h"
+
+namespace WebCore {
+
+// Returns the locale identifier used for text break iteration in the Android
+// port. The value is a fixed string rather than one queried at run time.
+const char* currentTextBreakLocaleID()
+{
+    static const char fixedLocaleID[] = "en_us";
+    return fixedLocaleID;
+}
+
+}
diff --git a/WebCore/platform/text/cf/StringImplCF.cpp b/WebCore/platform/text/cf/StringImplCF.cpp
index ff595a5..8a2ae79 100644
--- a/WebCore/platform/text/cf/StringImplCF.cpp
+++ b/WebCore/platform/text/cf/StringImplCF.cpp
@@ -1,5 +1,5 @@
-/**
- * Copyright (C) 2006 Apple Computer, Inc.
+/*
+ * Copyright (C) 2006, 2009 Apple Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
@@ -24,14 +24,139 @@
#if PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN))
#include <CoreFoundation/CoreFoundation.h>
+#include <wtf/MainThread.h>
+#include <wtf/PassRefPtr.h>
+#include <wtf/Threading.h>
+
+#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER)
+#include <objc/objc-auto.h>
+#endif
namespace WebCore {
+namespace StringWrapperCFAllocator { // Callbacks for a custom CFAllocator (built in create()) that stores a StringImpl* header in front of every block it hands out.
+
+ static StringImpl* currentString; // Set by StringImpl::createCFString() just before it creates a CFString; read and cleared by allocate().
+
+ static const void* retain(const void* info)
+ {
+ return info; // No reference counting is done on the context info.
+ }
+
+ static void release(const void*)
+ {
+ ASSERT_NOT_REACHED(); // This callback is not expected to be invoked.
+ }
+
+ static CFStringRef copyDescription(const void*)
+ {
+ return CFSTR("WebCore::String-based allocator");
+ }
+
+ static void* allocate(CFIndex size, CFOptionFlags, void*)
+ {
+ StringImpl* underlyingString = 0;
+ if (isMainThread()) { // currentString is only consulted on the main thread.
+ underlyingString = currentString;
+ if (underlyingString) {
+ currentString = 0; // Consume the global so later allocations are not tied to this string.
+ underlyingString->ref(); // Balanced by call to deref in deallocate below.
+ }
+ }
+ StringImpl** header = static_cast<StringImpl**>(fastMalloc(sizeof(StringImpl*) + size)); // Requested size plus room for one StringImpl* header.
+ *header = underlyingString; // Null when this block is not tied to a StringImpl.
+ return header + 1; // The caller receives the address just past the header.
+ }
+
+ static void* reallocate(void* pointer, CFIndex newSize, CFOptionFlags, void*)
+ {
+ size_t newAllocationSize = sizeof(StringImpl*) + newSize;
+ StringImpl** header = static_cast<StringImpl**>(pointer) - 1; // Step back from the caller's pointer to the header.
+ ASSERT(!*header); // Only blocks with a null header are expected to be reallocated.
+ header = static_cast<StringImpl**>(fastRealloc(header, newAllocationSize));
+ return header + 1;
+ }
+
+ static void deallocateOnMainThread(void* headerPointer) // Scheduled by deallocate(); receives the header pointer, not the pointer that was handed to the caller of allocate().
+ {
+ StringImpl** header = static_cast<StringImpl**>(headerPointer);
+ StringImpl* underlyingString = *header;
+ ASSERT(underlyingString);
+ underlyingString->deref(); // Balanced by call to ref in allocate above.
+ fastFree(header);
+ }
+
+ static void deallocate(void* pointer, void*)
+ {
+ StringImpl** header = static_cast<StringImpl**>(pointer) - 1;
+ StringImpl* underlyingString = *header;
+ if (!underlyingString)
+ fastFree(header); // No StringImpl attached: just free the block, on whatever thread this is.
+ else {
+ if (!isMainThread())
+ callOnMainThread(deallocateOnMainThread, header); // Defer the deref and the free to the main thread.
+ else {
+ underlyingString->deref(); // Balanced by call to ref in allocate above.
+ fastFree(header);
+ }
+ }
+ }
+
+ static CFIndex preferredSize(CFIndex size, CFOptionFlags, void*)
+ {
+ // FIXME: If FastMalloc provided a "good size" callback, we'd want to use it here.
+ // Note that this optimization would help performance for strings created with the
+ // allocator that are mutable, and those typically are only created by callers who
+ // make a new string using the old string's allocator, such as some of the call
+ // sites in CFURL.
+ return size;
+ }
+
+ static CFAllocatorRef create()
+ {
+#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER)
+ // Since garbage collection isn't compatible with custom allocators, don't use this at all when garbage collection is active.
+ if (objc_collectingEnabled())
+ return 0; // Callers treat a null allocator as "do not wrap".
+#endif
+ CFAllocatorContext context = { 0, 0, retain, release, copyDescription, allocate, reallocate, deallocate, preferredSize };
+ return CFAllocatorCreate(0, &context);
+ }
+
+ static CFAllocatorRef allocator()
+ {
+ static CFAllocatorRef allocator = create(); // Created on the first call and reused afterwards; may be null (see create()).
+ return allocator;
+ }
+
+}
+
CFStringRef StringImpl::createCFString()
{
- return CFStringCreateWithCharacters(NULL, reinterpret_cast<const UniChar*>(m_data), m_length);
+ CFAllocatorRef allocator = (m_length && isMainThread()) ? StringWrapperCFAllocator::allocator() : 0; // The wrapping allocator is only considered for non-empty strings on the main thread.
+ if (!allocator)
+ return CFStringCreateWithCharacters(0, reinterpret_cast<const UniChar*>(m_data), m_length); // Fallback path that does not involve the wrapping allocator.
+
+ // Put pointer to the StringImpl in a global so the allocator can store it with the CFString.
+ ASSERT(!StringWrapperCFAllocator::currentString);
+ StringWrapperCFAllocator::currentString = this;
+
+ CFStringRef string = CFStringCreateWithCharactersNoCopy(allocator, reinterpret_cast<const UniChar*>(m_data), m_length, kCFAllocatorNull); // NoCopy variant; kCFAllocatorNull is passed as the contents deallocator.
+
+ // The allocator cleared the global when it read it, but also clear it here just in case.
+ ASSERT(!StringWrapperCFAllocator::currentString);
+ StringWrapperCFAllocator::currentString = 0;
+
+ return string;
}
+// On StringImpl creation we could check if the allocator is the StringWrapperCFAllocator.
+// If it is, then we could find the original StringImpl and just return that. But to
+// do that we'd have to compute the offset from CFStringRef to the allocated block;
+// the CFStringRef is *not* at the start of an allocated block. Testing shows 1000x
+// more calls to createCFString than calls to the create functions with the appropriate
+// allocator, so it's probably not urgent to optimize that case.
+
}
#endif // PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN))
diff --git a/WebCore/platform/text/mac/ShapeArabic.c b/WebCore/platform/text/mac/ShapeArabic.c
index 1e0d91b..dd61ce5 100644
--- a/WebCore/platform/text/mac/ShapeArabic.c
+++ b/WebCore/platform/text/mac/ShapeArabic.c
@@ -36,6 +36,8 @@
#include "ShapeArabic.h"
+#include <stdbool.h>
+#include <string.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>
diff --git a/WebCore/platform/text/mac/StringImplMac.mm b/WebCore/platform/text/mac/StringImplMac.mm
index 3e0731c..d14c6d8 100644
--- a/WebCore/platform/text/mac/StringImplMac.mm
+++ b/WebCore/platform/text/mac/StringImplMac.mm
@@ -1,5 +1,5 @@
-/**
- * Copyright (C) 2006 Apple Computer, Inc.
+/*
+ * Copyright (C) 2006, 2009 Apple Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
@@ -21,13 +21,13 @@
#include "config.h"
#include "StringImpl.h"
-#include <Foundation/Foundation.h>
+#include "FoundationExtras.h"
namespace WebCore {
StringImpl::operator NSString *()
{
- return [NSString stringWithCharacters:m_data length:m_length];
+ return HardAutorelease(createCFString()); // The NSString now comes from createCFString(); HardAutorelease presumably balances the reference that call returns — TODO confirm against FoundationExtras.h.
}
}
diff --git a/WebCore/platform/text/mac/StringMac.mm b/WebCore/platform/text/mac/StringMac.mm
index 77942ea..758ae1d 100644
--- a/WebCore/platform/text/mac/StringMac.mm
+++ b/WebCore/platform/text/mac/StringMac.mm
@@ -20,6 +20,7 @@
#include "config.h"
#include "PlatformString.h"
+#include <CoreFoundation/CFString.h>
namespace WebCore {
diff --git a/WebCore/platform/text/qt/TextBreakIteratorQt.cpp b/WebCore/platform/text/qt/TextBreakIteratorQt.cpp
index 88b9680..4dc23ee 100644
--- a/WebCore/platform/text/qt/TextBreakIteratorQt.cpp
+++ b/WebCore/platform/text/qt/TextBreakIteratorQt.cpp
@@ -63,6 +63,11 @@ namespace WebCore {
return static_cast<TextBreakIterator*>(iterator);
}
+    // Cursor movement uses the same iterator as character breaking in this
+    // port, so the call is forwarded to characterBreakIterator().
+    TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
+    {
+        TextBreakIterator* characterIterator = characterBreakIterator(string, length);
+        return characterIterator;
+    }
+
TextBreakIterator* lineBreakIterator(const UChar* string, int length)
{
static QTextBoundaryFinder *iterator = 0;
@@ -250,6 +255,11 @@ TextBreakIterator* characterBreakIterator(const UChar* string, int length)
return iterator;
}
+// Cursor movement uses the same iterator as character breaking in this
+// port, so the call is forwarded to characterBreakIterator().
+TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
+{
+    TextBreakIterator* characterIterator = characterBreakIterator(string, length);
+    return characterIterator;
+}
+
TextBreakIterator* lineBreakIterator(const UChar*, int)
{
// not yet implemented