diff options
Diffstat (limited to 'WebCore/platform/text')
26 files changed, 714 insertions, 286 deletions
diff --git a/WebCore/platform/text/AtomicString.cpp b/WebCore/platform/text/AtomicString.cpp index 5f9abfd..d85f5ee 100644 --- a/WebCore/platform/text/AtomicString.cpp +++ b/WebCore/platform/text/AtomicString.cpp @@ -101,7 +101,7 @@ static inline bool equal(StringImpl* string, const UChar* characters, unsigned l if (string->length() != length) return false; -#if PLATFORM(ARM) +#if PLATFORM(ARM) || PLATFORM(SH4) const UChar* stringCharacters = string->characters(); for (unsigned i = 0; i != length; ++i) { if (*stringCharacters++ != *characters++) diff --git a/WebCore/platform/text/Base64.cpp b/WebCore/platform/text/Base64.cpp index 920fa89..be19164 100644 --- a/WebCore/platform/text/Base64.cpp +++ b/WebCore/platform/text/Base64.cpp @@ -97,8 +97,8 @@ void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs) count += 4; } out[didx++] = base64EncMap[(data[sidx] >> 2) & 077]; - out[didx++] = base64EncMap[(data[sidx + 1] >> 4) & 017 | (data[sidx] << 4) & 077]; - out[didx++] = base64EncMap[(data[sidx + 2] >> 6) & 003 | (data[sidx + 1] << 2) & 077]; + out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)]; + out[didx++] = base64EncMap[((data[sidx + 2] >> 6) & 003) | ((data[sidx + 1] << 2) & 077)]; out[didx++] = base64EncMap[data[sidx + 2] & 077]; sidx += 3; } @@ -110,7 +110,7 @@ void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs) out[didx++] = base64EncMap[(data[sidx] >> 2) & 077]; if (sidx < len - 1) { - out[didx++] = base64EncMap[(data[sidx + 1] >> 4) & 017 | (data[sidx] << 4) & 077]; + out[didx++] = base64EncMap[((data[sidx + 1] >> 4) & 017) | ((data[sidx] << 4) & 077)]; out[didx++] = base64EncMap[(data[sidx + 1] << 2) & 077]; } else out[didx++] = base64EncMap[(data[sidx] << 4) & 077]; diff --git a/WebCore/platform/text/BidiResolver.h b/WebCore/platform/text/BidiResolver.h index ffd3d51..8288be4 100644 --- a/WebCore/platform/text/BidiResolver.h +++ b/WebCore/platform/text/BidiResolver.h @@ -254,7 
+254,16 @@ template <class Iterator, class Run> void BidiResolver<Iterator, Run>::appendRun() { if (!emptyRun && !eor.atEnd()) { - addRun(new Run(sor.offset(), eor.offset() + 1, context(), m_direction)); + unsigned startOffset = sor.offset(); + unsigned endOffset = eor.offset(); + + if (!endOfLine.atEnd() && endOffset >= endOfLine.offset()) { + reachedEndOfLine = true; + endOffset = endOfLine.offset(); + } + + if (endOffset >= startOffset) + addRun(new Run(startOffset, endOffset + 1, context(), m_direction)); eor.increment(); sor = eor; @@ -352,8 +361,8 @@ void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Dire m_direction = LeftToRight; } } else if (m_status.eor == ArabicNumber - || m_status.eor == EuropeanNumber && (m_status.lastStrong != LeftToRight || from == RightToLeft) - || m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && from == RightToLeft) { + || (m_status.eor == EuropeanNumber && (m_status.lastStrong != LeftToRight || from == RightToLeft)) + || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && from == RightToLeft)) { appendRun(); m_direction = RightToLeft; } @@ -722,8 +731,8 @@ void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, boo case WhiteSpaceNeutral: case OtherNeutral: if (m_status.eor == ArabicNumber - || m_status.eor == EuropeanNumber && (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft) - || m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && context()->dir() == RightToLeft) { + || (m_status.eor == EuropeanNumber && (m_status.lastStrong == RightToLeft || context()->dir() == RightToLeft)) + || (m_status.eor != EuropeanNumber && m_status.lastStrong == LeftToRight && context()->dir() == RightToLeft)) { // Terminate the run before the neutrals. appendRun(); // Begin an R run for the neutrals. 
diff --git a/WebCore/platform/text/CString.cpp b/WebCore/platform/text/CString.cpp index 8e68628..90990f8 100644 --- a/WebCore/platform/text/CString.cpp +++ b/WebCore/platform/text/CString.cpp @@ -47,8 +47,8 @@ void CString::init(const char* str, unsigned length) return; m_buffer = CStringBuffer::create(length + 1); - memcpy(m_buffer->data(), str, length); - m_buffer->data()[length] = '\0'; + memcpy(m_buffer->mutableData(), str, length); + m_buffer->mutableData()[length] = '\0'; } const char* CString::data() const @@ -61,7 +61,7 @@ char* CString::mutableData() copyBufferIfNeeded(); if (!m_buffer) return 0; - return m_buffer->data(); + return m_buffer->mutableData(); } unsigned CString::length() const @@ -73,7 +73,7 @@ CString CString::newUninitialized(size_t length, char*& characterBuffer) { CString result; result.m_buffer = CStringBuffer::create(length + 1); - char* bytes = result.m_buffer->data(); + char* bytes = result.m_buffer->mutableData(); bytes[length] = '\0'; characterBuffer = bytes; return result; @@ -87,7 +87,7 @@ void CString::copyBufferIfNeeded() int len = m_buffer->length(); RefPtr<CStringBuffer> m_temp = m_buffer; m_buffer = CStringBuffer::create(len); - memcpy(m_buffer->data(), m_temp->data(), len); + memcpy(m_buffer->mutableData(), m_temp->data(), len); } bool operator==(const CString& a, const CString& b) @@ -99,17 +99,4 @@ bool operator==(const CString& a, const CString& b) return !strncmp(a.data(), b.data(), min(a.length(), b.length())); } -PassRefPtr<SharedBuffer> CString::releaseBuffer() -{ - if (!m_buffer) - return 0; - - copyBufferIfNeeded(); - - RefPtr<SharedBuffer> result = m_buffer->releaseBuffer(); - m_buffer = 0; - return result.release(); -} - - -} +} // namespace WebCore diff --git a/WebCore/platform/text/CString.h b/WebCore/platform/text/CString.h index 09f112f..f084ddf 100644 --- a/WebCore/platform/text/CString.h +++ b/WebCore/platform/text/CString.h @@ -36,15 +36,15 @@ namespace WebCore { class CStringBuffer : public 
RefCounted<CStringBuffer> { public: - static PassRefPtr<CStringBuffer> create(unsigned length) { return adoptRef(new CStringBuffer(length)); } - - char* data() { return m_vector.data(); } - size_t length() const { return m_vector.size(); } + const char* data() { return m_vector.data(); } + size_t length() { return m_vector.size(); } - PassRefPtr<SharedBuffer> releaseBuffer() { return SharedBuffer::adoptVector(m_vector); } - private: + friend class CString; + + static PassRefPtr<CStringBuffer> create(unsigned length) { return adoptRef(new CStringBuffer(length)); } CStringBuffer(unsigned length) : m_vector(length) { } + char* mutableData() { return m_vector.data(); } Vector<char> m_vector; }; @@ -56,6 +56,7 @@ namespace WebCore { CString() { } CString(const char*); CString(const char*, unsigned length); + CString(CStringBuffer* buffer) : m_buffer(buffer) { } static CString newUninitialized(size_t length, char*& characterBuffer); const char* data() const; @@ -63,8 +64,8 @@ namespace WebCore { unsigned length() const; bool isNull() const { return !m_buffer; } - - PassRefPtr<SharedBuffer> releaseBuffer(); + + CStringBuffer* buffer() const { return m_buffer.get(); } private: void copyBufferIfNeeded(); diff --git a/WebCore/platform/text/PlatformString.h b/WebCore/platform/text/PlatformString.h index 35d3079..a1541d2 100644 --- a/WebCore/platform/text/PlatformString.h +++ b/WebCore/platform/text/PlatformString.h @@ -27,15 +27,18 @@ #include "StringImpl.h" -#include <wtf/PassRefPtr.h> +#ifdef __OBJC__ +#include <objc/objc.h> +#endif #if USE(JSC) #include <runtime/Identifier.h> #else -// runtime/Identifier.h includes HashMap.h and HashSet.h. We explicitly include -// them in the case of non-JSC builds to keep things consistent. +// runtime/Identifier.h brings in a variety of wtf headers. We explicitly +// include them in the case of non-JSC builds to keep things consistent. 
#include <wtf/HashMap.h> #include <wtf/HashSet.h> +#include <wtf/OwnPtr.h> #endif #if PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN)) @@ -228,6 +231,9 @@ public: static String fromUTF8(const char*, size_t); static String fromUTF8(const char*); + // Tries to convert the passed in string to UTF-8, but will fall back to Latin-1 if the string is not valid UTF-8. + static String fromUTF8WithLatin1Fallback(const char*, size_t); + // Determines the writing direction using the Unicode Bidi Algorithm rules P2 and P3. WTF::Unicode::Direction defaultWritingDirection() const { return m_impl ? m_impl->defaultWritingDirection() : WTF::Unicode::LeftToRight; } diff --git a/WebCore/platform/text/String.cpp b/WebCore/platform/text/String.cpp index 638e45f..733b661 100644 --- a/WebCore/platform/text/String.cpp +++ b/WebCore/platform/text/String.cpp @@ -623,6 +623,15 @@ String String::fromUTF8(const char* string) return UTF8Encoding().decode(string, strlen(string)); } +String String::fromUTF8WithLatin1Fallback(const char* string, size_t size) +{ + String result = fromUTF8(string, size); + if (!result) + result = String(string, size); + + return result; +} + #if USE(JSC) String::String(const Identifier& str) { diff --git a/WebCore/platform/text/StringImpl.cpp b/WebCore/platform/text/StringImpl.cpp index 0556f8e..6bba990 100644 --- a/WebCore/platform/text/StringImpl.cpp +++ b/WebCore/platform/text/StringImpl.cpp @@ -2,7 +2,7 @@ * Copyright (C) 1999 Lars Knoll (knoll@kde.org) * (C) 1999 Antti Koivisto (koivisto@kde.org) * (C) 2001 Dirk Mueller ( mueller@kde.org ) - * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 
* Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net) * * This library is free software; you can redistribute it and/or @@ -54,6 +54,27 @@ static inline void deleteUCharVector(const UChar* p) fastFree(const_cast<UChar*>(p)); } +// Some of the factory methods create buffers using fastMalloc. +// We must ensure that all allocations of StringImpl are allocated using +// fastMalloc so that we don't have mis-matched frees. We accomplish +// this by overriding the new and delete operators. +void* StringImpl::operator new(size_t size, void* address) +{ + if (address) + return address; // Allocating using an internal buffer + return fastMalloc(size); +} + +void* StringImpl::operator new(size_t size) +{ + return fastMalloc(size); +} + +void StringImpl::operator delete(void* address) +{ + fastFree(address); +} + // This constructor is used only to create the empty string. StringImpl::StringImpl() : m_length(0) @@ -61,6 +82,7 @@ StringImpl::StringImpl() , m_hash(0) , m_inTable(false) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { // Ensure that the hash is computed so that AtomicStringHash can call existingHash() // with impunity.
The empty string is special because it is never entered into @@ -76,6 +98,7 @@ inline StringImpl::StringImpl(const UChar* characters, unsigned length) , m_hash(0) , m_inTable(false) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { UChar* data = newUCharVector(length); memcpy(data, characters, length * sizeof(UChar)); @@ -87,6 +110,7 @@ inline StringImpl::StringImpl(const StringImpl& str, WithTerminatingNullCharacte , m_hash(str.m_hash) , m_inTable(false) , m_hasTerminatingNullCharacter(true) + , m_bufferIsInternal(false) { UChar* data = newUCharVector(str.m_length + 1); memcpy(data, str.m_data, str.m_length * sizeof(UChar)); @@ -99,6 +123,7 @@ inline StringImpl::StringImpl(const char* characters, unsigned length) , m_hash(0) , m_inTable(false) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { ASSERT(characters); ASSERT(length); @@ -117,6 +142,7 @@ inline StringImpl::StringImpl(UChar* characters, unsigned length, AdoptBuffer) , m_hash(0) , m_inTable(false) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { ASSERT(characters); ASSERT(length); @@ -128,6 +154,7 @@ StringImpl::StringImpl(const UChar* characters, unsigned length, unsigned hash) , m_hash(hash) , m_inTable(true) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { ASSERT(hash); ASSERT(characters); @@ -144,6 +171,7 @@ StringImpl::StringImpl(const char* characters, unsigned length, unsigned hash) , m_hash(hash) , m_inTable(true) , m_hasTerminatingNullCharacter(false) + , m_bufferIsInternal(false) { ASSERT(hash); ASSERT(characters); @@ -161,7 +189,8 @@ StringImpl::~StringImpl() { if (m_inTable) AtomicString::remove(this); - deleteUCharVector(m_data); + if (!m_bufferIsInternal) + deleteUCharVector(m_data); } StringImpl* StringImpl::empty() @@ -907,26 +936,8 @@ WTF::Unicode::Direction StringImpl::defaultWritingDirection() } // This is a hot function because it's used when parsing HTML. 
-PassRefPtr<StringImpl> StringImpl::createStrippingNullCharacters(const UChar* characters, unsigned length) +PassRefPtr<StringImpl> StringImpl::createStrippingNullCharactersSlowCase(const UChar* characters, unsigned length) { - ASSERT(characters); - ASSERT(length); - - // Optimize for the case where there are no Null characters by quickly - // searching for nulls, and then using StringImpl::create, which will - // memcpy the whole buffer. This is faster than assigning character by - // character during the loop. - - // Fast case. - int foundNull = 0; - for (unsigned i = 0; !foundNull && i < length; i++) { - int c = characters[i]; // more efficient than using UChar here (at least on Intel Mac OS) - foundNull |= !c; - } - if (!foundNull) - return StringImpl::create(characters, length); - - // Slow case. StringBuffer strippedCopy(length); unsigned strippedLength = 0; for (unsigned i = 0; i < length; i++) { @@ -958,24 +969,44 @@ PassRefPtr<StringImpl> StringImpl::create(const UChar* characters, unsigned leng { if (!characters || !length) return empty(); - return adoptRef(new StringImpl(characters, length)); + + // Allocate a single buffer large enough to contain the StringImpl + // struct as well as the data which it contains. This removes one + // heap allocation from this call. + size_t size = sizeof(StringImpl) + length * sizeof(UChar); + char* buffer = static_cast<char*>(fastMalloc(size)); + UChar* data = reinterpret_cast<UChar*>(buffer + sizeof(StringImpl)); + memcpy(data, characters, length * sizeof(UChar)); + StringImpl* string = new (buffer) StringImpl(data, length, AdoptBuffer()); + string->m_bufferIsInternal = true; + return adoptRef(string); } PassRefPtr<StringImpl> StringImpl::create(const char* characters, unsigned length) { if (!characters || !length) return empty(); - return adoptRef(new StringImpl(characters, length)); + + // Allocate a single buffer large enough to contain the StringImpl + // struct as well as the data which it contains. 
This removes one + // heap allocation from this call. + size_t size = sizeof(StringImpl) + length * sizeof(UChar); + char* buffer = static_cast<char*>(fastMalloc(size)); + UChar* data = reinterpret_cast<UChar*>(buffer + sizeof(StringImpl)); + for (unsigned i = 0; i != length; ++i) { + unsigned char c = characters[i]; + data[i] = c; + } + StringImpl* string = new (buffer) StringImpl(data, length, AdoptBuffer()); + string->m_bufferIsInternal = true; + return adoptRef(string); } PassRefPtr<StringImpl> StringImpl::create(const char* string) { if (!string) return empty(); - unsigned length = strlen(string); - if (!length) - return empty(); - return adoptRef(new StringImpl(string, length)); + return create(string, strlen(string)); } PassRefPtr<StringImpl> StringImpl::createWithTerminatingNullCharacter(const StringImpl& string) @@ -985,7 +1016,7 @@ PassRefPtr<StringImpl> StringImpl::createWithTerminatingNullCharacter(const Stri PassRefPtr<StringImpl> StringImpl::copy() { - return adoptRef(new StringImpl(m_data, m_length)); + return create(m_data, m_length); } } // namespace WebCore diff --git a/WebCore/platform/text/StringImpl.h b/WebCore/platform/text/StringImpl.h index 281aa37..1242f27 100644 --- a/WebCore/platform/text/StringImpl.h +++ b/WebCore/platform/text/StringImpl.h @@ -1,6 +1,6 @@ /* * Copyright (C) 1999 Lars Knoll (knoll@kde.org) - * Copyright (C) 2005, 2006, 2007, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 
* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -24,7 +24,7 @@ #include <limits.h> #include <wtf/ASCIICType.h> -#include <wtf/Forward.h> +#include <wtf/PassRefPtr.h> #include <wtf/RefCounted.h> #include <wtf/Vector.h> #include <wtf/unicode/Unicode.h> @@ -166,12 +166,25 @@ public: operator NSString*(); #endif + void operator delete(void*); + private: + // Allocation from a custom buffer is only allowed internally to avoid + // mismatched allocators. Callers should use create(). + void* operator new(size_t size); + void* operator new(size_t size, void* address); + + static PassRefPtr<StringImpl> createStrippingNullCharactersSlowCase(const UChar*, unsigned length); + unsigned m_length; const UChar* m_data; mutable unsigned m_hash; bool m_inTable; bool m_hasTerminatingNullCharacter; + // In some cases, we allocate the StringImpl struct and its data + // within a single heap buffer. In this case, the m_data pointer + // is an "internal buffer", and does not need to be deallocated. + bool m_bufferIsInternal; }; bool equal(StringImpl*, StringImpl*); @@ -274,6 +287,29 @@ static inline bool isSpaceOrNewline(UChar c) return c <= 0x7F ? WTF::isASCIISpace(c) : WTF::Unicode::direction(c) == WTF::Unicode::WhiteSpaceNeutral; } +// This is a hot function because it's used when parsing HTML. +inline PassRefPtr<StringImpl> StringImpl::createStrippingNullCharacters(const UChar* characters, unsigned length) +{ + ASSERT(characters); + ASSERT(length); + + // Optimize for the case where there are no Null characters by quickly + // searching for nulls, and then using StringImpl::create, which will + // memcpy the whole buffer. This is faster than assigning character by + // character during the loop. + + // Fast case. 
+ int foundNull = 0; + for (unsigned i = 0; !foundNull && i < length; i++) { + int c = characters[i]; // more efficient than using UChar here (at least on Intel Mac OS) + foundNull |= !c; + } + if (!foundNull) + return StringImpl::create(characters, length); + + return StringImpl::createStrippingNullCharactersSlowCase(characters, length); +} + } namespace WTF { diff --git a/WebCore/platform/text/TextBreakIterator.h b/WebCore/platform/text/TextBreakIterator.h index 64717a4..7b3b963 100644 --- a/WebCore/platform/text/TextBreakIterator.h +++ b/WebCore/platform/text/TextBreakIterator.h @@ -29,7 +29,19 @@ namespace WebCore { class TextBreakIterator; // Note: The returned iterator is good only until you get another iterator. + + // Iterates over "extended grapheme clusters", as defined in UAX #29. + // Note that platform implementations may be less sophisticated - e.g. ICU prior to + // version 4.0 only supports "legacy grapheme clusters". + // Use this for general text processing, e.g. string truncation. TextBreakIterator* characterBreakIterator(const UChar*, int length); + + // This is similar to character break iterator in most cases, but is subject to + // platform UI conventions. One notable example where this can be different + // from character break iterator is Thai prepend characters, see bug 24342. + // Use this for insertion point and selection manipulations. 
+ TextBreakIterator* cursorMovementIterator(const UChar*, int length); + TextBreakIterator* wordBreakIterator(const UChar*, int length); TextBreakIterator* lineBreakIterator(const UChar*, int length); TextBreakIterator* sentenceBreakIterator(const UChar*, int length); diff --git a/WebCore/platform/text/TextBreakIteratorICU.cpp b/WebCore/platform/text/TextBreakIteratorICU.cpp index 9941f58..c4fc1b0 100644 --- a/WebCore/platform/text/TextBreakIteratorICU.cpp +++ b/WebCore/platform/text/TextBreakIteratorICU.cpp @@ -22,6 +22,7 @@ #include "config.h" #include "TextBreakIterator.h" +#include "PlatformString.h" #include "TextBreakIteratorInternalICU.h" #include <unicode/ubrk.h> @@ -114,4 +115,119 @@ bool isTextBreak(TextBreakIterator* bi, int pos) return ubrk_isBoundary(bi, pos); } +#ifndef BUILDING_ON_TIGER +static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator, + const char* breakRules, const UChar* string, int length) +{ + if (!string) + return 0; + + if (!createdIterator) { + UParseError parseStatus; + UErrorCode openStatus = U_ZERO_ERROR; + String rules(breakRules); + iterator = static_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus)); + createdIterator = true; + ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); + } + if (!iterator) + return 0; + + UErrorCode setTextStatus = U_ZERO_ERROR; + ubrk_setText(iterator, string, length, &setTextStatus); + if (U_FAILURE(setTextStatus)) + return 0; + + return iterator; +} +#endif // BUILDING_ON_TIGER + +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ +#ifdef BUILDING_ON_TIGER + // ICU 3.2 cannot compile the below rules. 
+ return characterBreakIterator(string, length); +#else + // This rule set is based on character-break iterator rules of ICU 4.0 + // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. + // The major differences from the original ones are listed below: + // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; + // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); + // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; + // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks. + static const char* kRules = + "$CR = [\\p{Grapheme_Cluster_Break = CR}];" + "$LF = [\\p{Grapheme_Cluster_Break = LF}];" + "$Control = [\\p{Grapheme_Cluster_Break = Control}];" + "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks + "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks];" + "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" + "$L = [\\p{Grapheme_Cluster_Break = L}];" + "$V = [\\p{Grapheme_Cluster_Break = V}];" + "$T = [\\p{Grapheme_Cluster_Break = T}];" + "$LV = [\\p{Grapheme_Cluster_Break = LV}];" + "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" + "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha + "$HinV = \\u094D;" // Devanagari Sign Virama + "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha + "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha + "$BenV = \\u09CD;" // Bengali Sign Virama + "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha + "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha + "$PanV = \\u0A4D;" // Gurmukhi Sign Virama + "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha + "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha + "$GujV = \\u0ACD;" // Gujarati Sign Virama + "$Guj1 =
[\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha + "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha + "$OriV = \\u0B4D;" // Oriya Sign Virama + "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha + "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha + "$TelV = \\u0C4D;" // Telugu Sign Virama + "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha + "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha + "$KanV = \\u0CCD;" // Kannada Sign Virama + "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha + "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha + "$MalV = \\u0D4D;" // Malayalam Sign Virama + "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha + "!!chain;" + "!!forward;" + "$CR $LF;" + "$L ($L | $V | $LV | $LVT);" + "($LV | $V) ($V | $T);" + "($LVT | $T) $T;" + "[^$Control $CR $LF] $Extend;" + "[^$Control $CR $LF] $SpacingMark;" + "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) + "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) + "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) + "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) + "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) + "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) + "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) + "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) + "!!reverse;" + "$LF $CR;" + "($L | $V | $LV | $LVT) $L;" + "($V | $T) ($LV | $V);" + "$T ($LVT | $T);" + "$Extend [^$Control $CR $LF];" + "$SpacingMark [^$Control $CR $LF];" + "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) + "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) + "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) + "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) + "$Ori1 $OriV $Ori0;" // Oriya Virama (backward) + "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) + "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) + "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) + "!!safe_reverse;" + "!!safe_forward;"; + static bool createdCursorMovementIterator = false; + static
TextBreakIterator* staticCursorMovementIterator; + return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); +#endif // BUILDING_ON_TIGER +} + } diff --git a/WebCore/platform/text/TextCodecICU.cpp b/WebCore/platform/text/TextCodecICU.cpp index 72d45ad..72054fa 100644 --- a/WebCore/platform/text/TextCodecICU.cpp +++ b/WebCore/platform/text/TextCodecICU.cpp @@ -334,7 +334,7 @@ String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool s // <http://bugs.webkit.org/show_bug.cgi?id=17014> // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. - if (m_encoding == "GBK" || m_encoding == "gb18030") + if (strcmp(m_encoding.name(), "GBK") == 0 || strcasecmp(m_encoding.name(), "gb18030") == 0) resultString.replace(0xE5E5, ideographicSpace); return resultString; diff --git a/WebCore/platform/text/TextDecoder.cpp b/WebCore/platform/text/TextDecoder.cpp deleted file mode 100644 index e39a6b7..0000000 --- a/WebCore/platform/text/TextDecoder.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "TextDecoder.h" - -#include "TextEncodingRegistry.h" - -// FIXME: Would be nice to also handle BOM for UTF-7 and UTF-32. - -namespace WebCore { - -TextDecoder::TextDecoder(const TextEncoding& encoding) - : m_encoding(encoding) - , m_checkedForBOM(false) - , m_numBufferedBytes(0) -{ -} - -void TextDecoder::reset(const TextEncoding& encoding) -{ - m_encoding = encoding; - m_codec.clear(); - m_checkedForBOM = false; - m_numBufferedBytes = 0; -} - -String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError) -{ - ASSERT(!m_checkedForBOM); - - // Check to see if we found a BOM. - size_t numBufferedBytes = m_numBufferedBytes; - size_t buf1Len = numBufferedBytes; - size_t buf2Len = length; - const unsigned char* buf1 = m_bufferedBytes; - const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); - unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; - unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; - unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; - unsigned char c4 = buf2Len ? 
(--buf2Len, *buf2++) : 0; - - const TextEncoding* encodingConsideringBOM = &m_encoding; - bool foundBOM = true; - size_t lengthOfBOM = 0; - if (c1 == 0xFF && c2 == 0xFE) { - if (c3 != 0 || c4 != 0) { - encodingConsideringBOM = &UTF16LittleEndianEncoding(); - lengthOfBOM = 2; - } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) { - encodingConsideringBOM = &UTF32LittleEndianEncoding(); - lengthOfBOM = 4; - } else - foundBOM = false; - } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { - encodingConsideringBOM = &UTF8Encoding(); - lengthOfBOM = 3; - } else if (c1 == 0xFE && c2 == 0xFF) { - encodingConsideringBOM = &UTF16BigEndianEncoding(); - lengthOfBOM = 2; - } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { - encodingConsideringBOM = &UTF32BigEndianEncoding(); - lengthOfBOM = 4; - } else - foundBOM = false; - - if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) { - // Continue to look for the BOM. - memcpy(&m_bufferedBytes[numBufferedBytes], data, length); - m_numBufferedBytes += length; - return ""; - } - - // Done checking for BOM. - m_codec.set(newTextCodec(*encodingConsideringBOM).release()); - if (!m_codec) - return String(); - m_checkedForBOM = true; - - // Skip the BOM. - if (foundBOM) { - ASSERT(numBufferedBytes < lengthOfBOM); - size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes; - ASSERT(numUnbufferedBOMBytes <= length); - - data += numUnbufferedBOMBytes; - length -= numUnbufferedBOMBytes; - numBufferedBytes = 0; - m_numBufferedBytes = 0; - } - - // Handle case where we have some buffered bytes to deal with. 
- if (numBufferedBytes) { - char bufferedBytes[sizeof(m_bufferedBytes)]; - memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes); - m_numBufferedBytes = 0; - - String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError); - if (stopOnError && sawError) - return bufferedResult; - return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError); - } - - return m_codec->decode(data, length, flush, stopOnError, sawError); -} - -} // namespace WebCore diff --git a/WebCore/platform/text/TextDecoder.h b/WebCore/platform/text/TextDecoder.h deleted file mode 100644 index 171cb59..0000000 --- a/WebCore/platform/text/TextDecoder.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef TextDecoder_h -#define TextDecoder_h - -#include "PlatformString.h" -#include "TextCodec.h" -#include "TextEncoding.h" -#include <wtf/OwnPtr.h> - -namespace WebCore { - - class TextCodec; - - class TextDecoder { - public: - TextDecoder(const TextEncoding&); - void reset(const TextEncoding&); - const TextEncoding& encoding() const { return m_encoding; }; - - String decode(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError) - { - if (!m_checkedForBOM) - return checkForBOM(data, length, flush, stopOnError, sawError); - return m_codec->decode(data, length, flush, stopOnError, sawError); - } - - private: - String checkForBOM(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); - - TextEncoding m_encoding; - OwnPtr<TextCodec> m_codec; - - bool m_checkedForBOM; - unsigned char m_numBufferedBytes; - unsigned char m_bufferedBytes[3]; - }; - -} // namespace WebCore - -#endif // TextDecoder_h diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp index 063d96b..ed58412 100644 --- a/WebCore/platform/text/TextEncoding.cpp +++ b/WebCore/platform/text/TextEncoding.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> * * Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ #include "CString.h" #include "PlatformString.h" #include "TextCodec.h" -#include "TextDecoder.h" #include "TextEncodingRegistry.h" #if USE(ICU_UNICODE) #include <unicode/unorm.h> @@ -73,7 +72,7 @@ String TextEncoding::decode(const char* data, size_t length, bool stopOnError, b if (!m_name) return String(); - return TextDecoder(*this).decode(data, length, true, stopOnError, sawError); + return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); } CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const @@ -165,10 +164,23 @@ UChar TextEncoding::backslashAsCurrencySymbol() const bool TextEncoding::isNonByteBasedEncoding() const { + if (noExtendedTextEncodingNameUsed()) { + return *this == UTF16LittleEndianEncoding() + || *this == UTF16BigEndianEncoding(); + } + return *this == UTF16LittleEndianEncoding() - || *this == UTF16BigEndianEncoding() - || *this == UTF32BigEndianEncoding() - || *this == UTF32LittleEndianEncoding(); + || *this == UTF16BigEndianEncoding() + || *this == UTF32BigEndianEncoding() + || *this == UTF32LittleEndianEncoding(); +} + +bool TextEncoding::isUTF7Encoding() const +{ + if (noExtendedTextEncodingNameUsed()) + return false; + + return *this == UTF7Encoding(); } const TextEncoding& TextEncoding::closestByteBasedEquivalent() const @@ -185,7 +197,7 @@ const TextEncoding& TextEncoding::closestByteBasedEquivalent() const // but it's fraught with problems and we'd rather steer clear of it. 
const TextEncoding& TextEncoding::encodingForFormSubmission() const { - if (isNonByteBasedEncoding() || *this == UTF7Encoding()) + if (isNonByteBasedEncoding() || isUTF7Encoding()) return UTF8Encoding(); return *this; } diff --git a/WebCore/platform/text/TextEncoding.h b/WebCore/platform/text/TextEncoding.h index b2bb816..b3909f7 100644 --- a/WebCore/platform/text/TextEncoding.h +++ b/WebCore/platform/text/TextEncoding.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -45,12 +45,14 @@ namespace WebCore { bool usesVisualOrdering() const; bool isJapanese() const; - PassRefPtr<StringImpl> displayString(PassRefPtr<StringImpl> str) const { + PassRefPtr<StringImpl> displayString(PassRefPtr<StringImpl> str) const + { if (m_backslashAsCurrencySymbol == '\\' || !str) return str; return str->replace('\\', m_backslashAsCurrencySymbol); } - void displayBuffer(UChar* characters, unsigned len) const { + void displayBuffer(UChar* characters, unsigned len) const + { if (m_backslashAsCurrencySymbol == '\\') return; for (unsigned i = 0; i < len; ++i) { @@ -72,10 +74,11 @@ namespace WebCore { private: UChar backslashAsCurrencySymbol() const; + bool isNonByteBasedEncoding() const; + bool isUTF7Encoding() const; const char* m_name; UChar m_backslashAsCurrencySymbol; - bool isNonByteBasedEncoding() const; }; inline bool operator==(const TextEncoding& a, const TextEncoding& b) { return a.name() == b.name(); } diff --git a/WebCore/platform/text/TextEncodingDetector.h b/WebCore/platform/text/TextEncodingDetector.h new file mode 100644 index 0000000..9f16ab0 --- /dev/null +++ b/WebCore/platform/text/TextEncodingDetector.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2009 Google Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TextEncodingDetector_h +#define TextEncodingDetector_h + +namespace WebCore { + + class TextEncoding; + + // Given a sequence of bytes in |data| of length |len| and an optional + // hintEncodingName, detect the most likely character encoding. + // The way hintEncodingName is used is up to an implementation. + // Currently, the only caller sets it to the parent frame encoding. 
+ bool detectTextEncoding(const char* data, size_t len, + const char* hintEncodingName, + TextEncoding* detectedEncoding); + +} // namespace WebCore + +#endif diff --git a/WebCore/platform/text/TextEncodingDetectorICU.cpp b/WebCore/platform/text/TextEncodingDetectorICU.cpp new file mode 100644 index 0000000..26c997e --- /dev/null +++ b/WebCore/platform/text/TextEncodingDetectorICU.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2008, 2009 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextEncodingDetector.h" + +#include "TextEncoding.h" +#include "UnusedParam.h" + +#ifndef BUILDING_ON_TIGER +#include "unicode/ucnv.h" +#include "unicode/ucsdet.h" +#endif + +namespace WebCore { + +bool detectTextEncoding(const char* data, size_t len, + const char* hintEncodingName, + TextEncoding* detectedEncoding) +{ + *detectedEncoding = TextEncoding(); +#ifdef BUILDING_ON_TIGER + // Tiger came with ICU 3.2 and does not have the encoding detector. + UNUSED_PARAM(data); + UNUSED_PARAM(len); + UNUSED_PARAM(hintEncodingName); + return false; +#else + int matchesCount = 0; + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + if (U_FAILURE(status)) + return false; + ucsdet_enableInputFilter(detector, true); + ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); + if (U_FAILURE(status)) + return false; + + // FIXME: A few things we can do other than improving + // the ICU detector itself. + // 1. Use ucsdet_detectAll and pick the most likely one given + // "the context" (parent-encoding, referrer encoding, etc). + // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. + // Chinese, Japanese, Russian, Korean and Hebrew) by picking the + // encoding with the highest confidence among the detector-specific + // limited set of candidate encodings. + // Below is a partial implementation of the first part of what's outlined + // above. + const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); + if (U_FAILURE(status)) { + ucsdet_close(detector); + return false; + } + + const char* encoding = 0; + if (hintEncodingName) { + TextEncoding hintEncoding(hintEncodingName); + // 10 is the minimum confidence value consistent with the codepoint + // allocation in a given encoding. The size of a chunk passed to + // us varies even for the same html file (apparently depending on + // the network load). 
When we're given a rather short chunk, we + // don't have a sufficiently reliable signal other than the fact that + // the chunk is consistent with a set of encodings. So, instead of + // setting an arbitrary threshold, we have to scan all the encodings + // consistent with the data. + const int32_t kThresold = 10; + for (int i = 0; i < matchesCount; ++i) { + int32_t confidence = ucsdet_getConfidence(matches[i], &status); + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + continue; + } + if (confidence < kThresold) + break; + const char* matchEncoding = ucsdet_getName(matches[i], &status); + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + continue; + } + if (TextEncoding(matchEncoding) == hintEncoding) { + encoding = hintEncodingName; + break; + } + } + } + // If no match is found so far, just pick the top match. + // This can happen, say, when a parent frame in EUC-JP refers to + // a child frame in Shift_JIS and both frames do NOT specify the encoding + // making us resort to auto-detection (when it IS turned on). + if (!encoding && matchesCount > 0) + encoding = ucsdet_getName(matches[0], &status); + if (U_SUCCESS(status)) { + *detectedEncoding = TextEncoding(encoding); + ucsdet_close(detector); + return true; + } + ucsdet_close(detector); + return false; +#endif +} + +} diff --git a/WebCore/platform/text/TextEncodingDetectorNone.cpp b/WebCore/platform/text/TextEncodingDetectorNone.cpp new file mode 100644 index 0000000..2655f08 --- /dev/null +++ b/WebCore/platform/text/TextEncodingDetectorNone.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2009 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextEncodingDetector.h" + +#include "TextEncoding.h" +#include "UnusedParam.h" + +namespace WebCore { + +bool detectTextEncoding(const char* data, size_t len, + const char* hintEncodingName, + TextEncoding* detectedEncoding) +{ + UNUSED_PARAM(data) + UNUSED_PARAM(len) + UNUSED_PARAM(hintEncodingName) + + *detectedEncoding = TextEncoding(); + return false; +} + +} diff --git a/WebCore/platform/text/TextEncodingRegistry.h b/WebCore/platform/text/TextEncodingRegistry.h index 5ca2039..d204734 100644 --- a/WebCore/platform/text/TextEncodingRegistry.h +++ b/WebCore/platform/text/TextEncodingRegistry.h @@ -34,11 +34,8 @@ namespace WebCore { class TextCodec; class TextEncoding; - // Only TextEncoding and TextDecoder should use this function directly. - // - Use TextDecoder::decode to decode, since it handles BOMs. - // - Use TextEncoding::decode to decode if you have all the data at once. - // It's implemented by calling TextDecoder::decode so works just as well. - // - Use TextEncoding::encode to encode, since it takes care of normalization. + // Use TextResourceDecoder::decode to decode resources, since it handles BOMs. + // Use TextEncoding::encode to encode, since it takes care of normalization. std::auto_ptr<TextCodec> newTextCodec(const TextEncoding&); // Only TextEncoding should use this function directly. 
diff --git a/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp b/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp new file mode 100644 index 0000000..9bebe74 --- /dev/null +++ b/WebCore/platform/text/android/TextBreakIteratorInternalICU.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2007, The Android Open Source Project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +namespace WebCore { + +const char* currentTextBreakLocaleID() +{ + return "en_us"; +} + +} diff --git a/WebCore/platform/text/cf/StringImplCF.cpp b/WebCore/platform/text/cf/StringImplCF.cpp index ff595a5..8a2ae79 100644 --- a/WebCore/platform/text/cf/StringImplCF.cpp +++ b/WebCore/platform/text/cf/StringImplCF.cpp @@ -1,5 +1,5 @@ -/** - * Copyright (C) 2006 Apple Computer, Inc. +/* + * Copyright (C) 2006, 2009 Apple Inc. All rights reserved. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -24,14 +24,139 @@ #if PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN)) #include <CoreFoundation/CoreFoundation.h> +#include <wtf/MainThread.h> +#include <wtf/PassRefPtr.h> +#include <wtf/Threading.h> + +#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER) +#include <objc/objc-auto.h> +#endif namespace WebCore { +namespace StringWrapperCFAllocator { + + static StringImpl* currentString; + + static const void* retain(const void* info) + { + return info; + } + + static void release(const void*) + { + ASSERT_NOT_REACHED(); + } + + static CFStringRef copyDescription(const void*) + { + return CFSTR("WebCore::String-based allocator"); + } + + static void* allocate(CFIndex size, CFOptionFlags, void*) + { + StringImpl* underlyingString = 0; + if (isMainThread()) { + underlyingString = currentString; + if (underlyingString) { + currentString = 0; + underlyingString->ref(); // Balanced by call to deref in deallocate below. 
+ } + } + StringImpl** header = static_cast<StringImpl**>(fastMalloc(sizeof(StringImpl*) + size)); + *header = underlyingString; + return header + 1; + } + + static void* reallocate(void* pointer, CFIndex newSize, CFOptionFlags, void*) + { + size_t newAllocationSize = sizeof(StringImpl*) + newSize; + StringImpl** header = static_cast<StringImpl**>(pointer) - 1; + ASSERT(!*header); + header = static_cast<StringImpl**>(fastRealloc(header, newAllocationSize)); + return header + 1; + } + + static void deallocateOnMainThread(void* headerPointer) + { + StringImpl** header = static_cast<StringImpl**>(headerPointer); + StringImpl* underlyingString = *header; + ASSERT(underlyingString); + underlyingString->deref(); // Balanced by call to ref in allocate above. + fastFree(header); + } + + static void deallocate(void* pointer, void*) + { + StringImpl** header = static_cast<StringImpl**>(pointer) - 1; + StringImpl* underlyingString = *header; + if (!underlyingString) + fastFree(header); + else { + if (!isMainThread()) + callOnMainThread(deallocateOnMainThread, header); + else { + underlyingString->deref(); // Balanced by call to ref in allocate above. + fastFree(header); + } + } + } + + static CFIndex preferredSize(CFIndex size, CFOptionFlags, void*) + { + // FIXME: If FastMalloc provided a "good size" callback, we'd want to use it here. + // Note that this optimization would help performance for strings created with the + // allocator that are mutable, and those typically are only created by callers who + // make a new string using the old string's allocator, such as some of the call + // sites in CFURL. + return size; + } + + static CFAllocatorRef create() + { +#if PLATFORM(MAC) && !defined(BUILDING_ON_TIGER) + // Since garbage collection isn't compatible with custom allocators, don't use this at all when garbage collection is active. 
+ if (objc_collectingEnabled()) + return 0; +#endif + CFAllocatorContext context = { 0, 0, retain, release, copyDescription, allocate, reallocate, deallocate, preferredSize }; + return CFAllocatorCreate(0, &context); + } + + static CFAllocatorRef allocator() + { + static CFAllocatorRef allocator = create(); + return allocator; + } + +} + CFStringRef StringImpl::createCFString() { - return CFStringCreateWithCharacters(NULL, reinterpret_cast<const UniChar*>(m_data), m_length); + CFAllocatorRef allocator = (m_length && isMainThread()) ? StringWrapperCFAllocator::allocator() : 0; + if (!allocator) + return CFStringCreateWithCharacters(0, reinterpret_cast<const UniChar*>(m_data), m_length); + + // Put pointer to the StringImpl in a global so the allocator can store it with the CFString. + ASSERT(!StringWrapperCFAllocator::currentString); + StringWrapperCFAllocator::currentString = this; + + CFStringRef string = CFStringCreateWithCharactersNoCopy(allocator, reinterpret_cast<const UniChar*>(m_data), m_length, kCFAllocatorNull); + + // The allocator cleared the global when it read it, but also clear it here just in case. + ASSERT(!StringWrapperCFAllocator::currentString); + StringWrapperCFAllocator::currentString = 0; + + return string; } +// On StringImpl creation we could check if the allocator is the StringWrapperCFAllocator. +// If it is, then we could find the original StringImpl and just return that. But to +// do that we'd have to compute the offset from CFStringRef to the allocated block; +// the CFStringRef is *not* at the start of an allocated block. Testing shows 1000x +// more calls to createCFString than calls to the create functions with the appropriate +// allocator, so it's probably not urgent to optimize that case. 
+ } #endif // PLATFORM(CF) || (PLATFORM(QT) && PLATFORM(DARWIN)) diff --git a/WebCore/platform/text/mac/ShapeArabic.c b/WebCore/platform/text/mac/ShapeArabic.c index 1e0d91b..dd61ce5 100644 --- a/WebCore/platform/text/mac/ShapeArabic.c +++ b/WebCore/platform/text/mac/ShapeArabic.c @@ -36,6 +36,8 @@ #include "ShapeArabic.h" +#include <stdbool.h> +#include <string.h> #include <unicode/utypes.h> #include <unicode/uchar.h> #include <unicode/ustring.h> diff --git a/WebCore/platform/text/mac/StringImplMac.mm b/WebCore/platform/text/mac/StringImplMac.mm index 3e0731c..d14c6d8 100644 --- a/WebCore/platform/text/mac/StringImplMac.mm +++ b/WebCore/platform/text/mac/StringImplMac.mm @@ -1,5 +1,5 @@ -/** - * Copyright (C) 2006 Apple Computer, Inc. +/* + * Copyright (C) 2006, 2009 Apple Inc. All rights reserved. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -21,13 +21,13 @@ #include "config.h" #include "StringImpl.h" -#include <Foundation/Foundation.h> +#include "FoundationExtras.h" namespace WebCore { StringImpl::operator NSString *() { - return [NSString stringWithCharacters:m_data length:m_length]; + return HardAutorelease(createCFString()); } } diff --git a/WebCore/platform/text/mac/StringMac.mm b/WebCore/platform/text/mac/StringMac.mm index 77942ea..758ae1d 100644 --- a/WebCore/platform/text/mac/StringMac.mm +++ b/WebCore/platform/text/mac/StringMac.mm @@ -20,6 +20,7 @@ #include "config.h" #include "PlatformString.h" +#include <CoreFoundation/CFString.h> namespace WebCore { diff --git a/WebCore/platform/text/qt/TextBreakIteratorQt.cpp b/WebCore/platform/text/qt/TextBreakIteratorQt.cpp index 88b9680..4dc23ee 100644 --- a/WebCore/platform/text/qt/TextBreakIteratorQt.cpp +++ b/WebCore/platform/text/qt/TextBreakIteratorQt.cpp @@ -63,6 +63,11 @@ namespace WebCore { return static_cast<TextBreakIterator*>(iterator); } + TextBreakIterator* cursorMovementIterator(const UChar* string, int length) + 
{ + return characterBreakIterator(string, length); + } + TextBreakIterator* lineBreakIterator(const UChar* string, int length) { static QTextBoundaryFinder *iterator = 0; @@ -250,6 +255,11 @@ TextBreakIterator* characterBreakIterator(const UChar* string, int length) return iterator; } +TextBreakIterator* cursorMovementIterator(const UChar* string, int length) +{ + return characterBreakIterator(string, length); +} + TextBreakIterator* lineBreakIterator(const UChar*, int) { // not yet implemented |