diff options
author | Steve Block <steveblock@google.com> | 2011-05-18 13:36:51 +0100 |
---|---|---|
committer | Steve Block <steveblock@google.com> | 2011-05-24 15:38:28 +0100 |
commit | 2fc2651226baac27029e38c9d6ef883fa32084db (patch) | |
tree | e396d4bf89dcce6ed02071be66212495b1df1dec /Source/WebCore/platform/text | |
parent | b3725cedeb43722b3b175aaeff70552e562d2c94 (diff) | |
download | external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.zip external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.gz external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.bz2 |
Merge WebKit at r78450: Initial merge by git.
Change-Id: I6d3e5f1f868ec266a0aafdef66182ddc3f265dc1
Diffstat (limited to 'Source/WebCore/platform/text')
-rw-r--r-- | Source/WebCore/platform/text/Base64.cpp | 6 | ||||
-rw-r--r-- | Source/WebCore/platform/text/Base64.h | 29 | ||||
-rw-r--r-- | Source/WebCore/platform/text/BidiResolver.h | 10 | ||||
-rw-r--r-- | Source/WebCore/platform/text/CharacterNames.h | 90 | ||||
-rw-r--r-- | Source/WebCore/platform/text/LocalizedNumber.h | 59 | ||||
-rw-r--r-- | Source/WebCore/platform/text/LocalizedNumberNone.cpp | 55 | ||||
-rw-r--r-- | Source/WebCore/platform/text/RegularExpression.cpp | 91 | ||||
-rw-r--r-- | Source/WebCore/platform/text/SegmentedString.cpp | 15 | ||||
-rw-r--r-- | Source/WebCore/platform/text/SegmentedString.h | 12 | ||||
-rw-r--r-- | Source/WebCore/platform/text/TextCodecICU.cpp | 2 | ||||
-rw-r--r-- | Source/WebCore/platform/text/TextCodecUTF16.cpp | 2 | ||||
-rw-r--r-- | Source/WebCore/platform/text/TextCodecUTF8.cpp | 276 | ||||
-rw-r--r-- | Source/WebCore/platform/text/TextCodecUTF8.h | 52 | ||||
-rw-r--r-- | Source/WebCore/platform/text/TextEncodingRegistry.cpp | 15 | ||||
-rw-r--r-- | Source/WebCore/platform/text/mac/TextCodecMac.cpp | 4 | ||||
-rw-r--r-- | Source/WebCore/platform/text/transcoder/FontTranscoder.cpp | 2 |
16 files changed, 547 insertions, 173 deletions
diff --git a/Source/WebCore/platform/text/Base64.cpp b/Source/WebCore/platform/text/Base64.cpp index 98b537a..bf706f6 100644 --- a/Source/WebCore/platform/text/Base64.cpp +++ b/Source/WebCore/platform/text/Base64.cpp @@ -60,9 +60,11 @@ static const char base64DecMap[128] = { 0x31, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 }; -void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs) +String base64Encode(const char* data, unsigned length, bool insertLFs) { - base64Encode(in.data(), in.size(), out, insertLFs); + Vector<char> result; + base64Encode(data, length, result, insertLFs); + return String(result.data(), result.size()); } void base64Encode(const char* data, unsigned len, Vector<char>& out, bool insertLFs) diff --git a/Source/WebCore/platform/text/Base64.h b/Source/WebCore/platform/text/Base64.h index 211bd3c..70855de 100644 --- a/Source/WebCore/platform/text/Base64.h +++ b/Source/WebCore/platform/text/Base64.h @@ -27,20 +27,45 @@ #ifndef Base64_h #define Base64_h -#include <wtf/Forward.h> #include <wtf/Vector.h> +#include <wtf/text/CString.h> +#include <wtf/text/WTFString.h> namespace WebCore { enum Base64DecodePolicy { FailOnInvalidCharacter, IgnoreWhitespace, IgnoreInvalidCharacters }; -void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false); void base64Encode(const char*, unsigned, Vector<char>&, bool insertLFs = false); +void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false); +void base64Encode(const CString&, Vector<char>&, bool insertLFs = false); +String base64Encode(const char*, unsigned, bool insertLFs = false); +String base64Encode(const Vector<char>&, bool insertLFs = false); +String base64Encode(const CString&, bool insertLFs = false); bool base64Decode(const String&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter); bool base64Decode(const Vector<char>&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter); bool base64Decode(const char*, unsigned, Vector<char>&, 
Base64DecodePolicy = FailOnInvalidCharacter); +inline void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs) +{ + base64Encode(in.data(), in.size(), out, insertLFs); } +inline void base64Encode(const CString& in, Vector<char>& out, bool insertLFs) +{ + base64Encode(in.data(), in.length(), out, insertLFs); +} + +inline String base64Encode(const Vector<char>& in, bool insertLFs) +{ + return base64Encode(in.data(), in.size(), insertLFs); +} + +inline String base64Encode(const CString& in, bool insertLFs) +{ + return base64Encode(in.data(), in.length(), insertLFs); +} + +} // namespace WebCore + #endif // Base64_h diff --git a/Source/WebCore/platform/text/BidiResolver.h b/Source/WebCore/platform/text/BidiResolver.h index 8abd698..72d163c 100644 --- a/Source/WebCore/platform/text/BidiResolver.h +++ b/Source/WebCore/platform/text/BidiResolver.h @@ -161,7 +161,7 @@ public : MidpointState<Iterator>& midpointState() { return m_midpointState; } void embed(WTF::Unicode::Direction); - void commitExplicitEmbedding(); + bool commitExplicitEmbedding(); void createBidiRunsForLine(const Iterator& end, bool visualOrder = false, bool hardLineBreak = false); @@ -400,7 +400,7 @@ void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Dire } template <class Iterator, class Run> -void BidiResolver<Iterator, Run>::commitExplicitEmbedding() +bool BidiResolver<Iterator, Run>::commitExplicitEmbedding() { using namespace WTF::Unicode; @@ -440,6 +440,8 @@ void BidiResolver<Iterator, Run>::commitExplicitEmbedding() setContext(toContext); m_currentExplicitEmbeddingSequence.clear(); + + return fromLevel != toLevel; } template <class Iterator, class Run> @@ -881,8 +883,8 @@ void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, boo increment(); if (!m_currentExplicitEmbeddingSequence.isEmpty()) { - commitExplicitEmbedding(); - if (pastEnd) { + bool committed = commitExplicitEmbedding(); + if (committed && pastEnd) { current = end; 
m_status = stateAtEnd.m_status; sor = stateAtEnd.sor; diff --git a/Source/WebCore/platform/text/CharacterNames.h b/Source/WebCore/platform/text/CharacterNames.h deleted file mode 100644 index c4b496e..0000000 --- a/Source/WebCore/platform/text/CharacterNames.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef CharacterNames_h -#define CharacterNames_h - -#include <wtf/unicode/Unicode.h> - -namespace WebCore { - -// Names here are taken from the Unicode standard. - -// Most of these are UChar constants, not UChar32, which makes them -// more convenient for WebCore code that mostly uses UTF-16. 
- -const UChar32 aegeanWordSeparatorLine = 0x10100; -const UChar32 aegeanWordSeparatorDot = 0x10101; -const UChar blackCircle = 0x25CF; -const UChar blackSquare = 0x25A0; -const UChar blackUpPointingTriangle = 0x25B2; -const UChar bullet = 0x2022; -const UChar bullseye = 0x25CE; -const UChar carriageReturn = 0x000D; -const UChar ethiopicPrefaceColon = 0x1366; -const UChar ethiopicWordspace = 0x1361; -const UChar fisheye = 0x25C9; -const UChar hebrewPunctuationGeresh = 0x05F3; -const UChar hebrewPunctuationGershayim = 0x05F4; -const UChar horizontalEllipsis = 0x2026; -const UChar hyphen = 0x2010; -const UChar hyphenMinus = 0x002D; -const UChar ideographicComma = 0x3001; -const UChar ideographicFullStop = 0x3002; -const UChar ideographicSpace = 0x3000; -const UChar leftDoubleQuotationMark = 0x201C; -const UChar leftSingleQuotationMark = 0x2018; -const UChar leftToRightEmbed = 0x202A; -const UChar leftToRightMark = 0x200E; -const UChar leftToRightOverride = 0x202D; -const UChar minusSign = 0x2212; -const UChar newlineCharacter = 0x000A; -const UChar noBreakSpace = 0x00A0; -const UChar objectReplacementCharacter = 0xFFFC; -const UChar popDirectionalFormatting = 0x202C; -const UChar replacementCharacter = 0xFFFD; -const UChar rightDoubleQuotationMark = 0x201D; -const UChar rightSingleQuotationMark = 0x2019; -const UChar rightToLeftEmbed = 0x202B; -const UChar rightToLeftMark = 0x200F; -const UChar rightToLeftOverride = 0x202E; -const UChar sesameDot = 0xFE45; -const UChar softHyphen = 0x00AD; -const UChar space = 0x0020; -const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B; -const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C; -const UChar32 ugariticWordDivider = 0x1039F; -const UChar whiteBullet = 0x25E6; -const UChar whiteCircle = 0x25CB; -const UChar whiteSesameDot = 0xFE46; -const UChar whiteUpPointingTriangle = 0x25B3; -const UChar yenSign = 0x00A5; -const UChar zeroWidthJoiner = 0x200D; -const UChar zeroWidthNonJoiner = 0x200C; -const UChar zeroWidthSpace = 0x200B; 
- -} - -#endif // CharacterNames_h diff --git a/Source/WebCore/platform/text/LocalizedNumber.h b/Source/WebCore/platform/text/LocalizedNumber.h new file mode 100644 index 0000000..45873b8 --- /dev/null +++ b/Source/WebCore/platform/text/LocalizedNumber.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2011 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LocalizedNumber_h +#define LocalizedNumber_h + +#include <wtf/text/WTFString.h> + +namespace WebCore { + +// Parses a string representation of a floating point number localized +// for the browser's current locale. If the input string is not valid +// or an implementation doesn't support localized numbers, this +// function returns NaN. This function doesn't need to support +// scientific notation, NaN, +Infinity and -Infinity, and doesn't need +// to support the standard representations of ECMAScript and HTML5. +double parseLocalizedNumber(const String&); + +// Serializes the specified floating point number for the browser's +// current locale. If an implementation doesn't support localized +// numbers or the input value is NaN or Infinity, the function should +// return an empty string. +String formatLocalizedNumber(double); + +// Returns true if the input character can be used to represent a +// number in the browser locale. For example, this should return true for 0-9 . +// , + - for en-US locale. +bool isLocalizedNumberCharacter(UChar32); + +} // namespace WebCore + +#endif // LocalizedNumber_h diff --git a/Source/WebCore/platform/text/LocalizedNumberNone.cpp b/Source/WebCore/platform/text/LocalizedNumberNone.cpp new file mode 100644 index 0000000..6f017e9 --- /dev/null +++ b/Source/WebCore/platform/text/LocalizedNumberNone.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2011 Google Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "LocalizedNumber.h" + +#include <limits> + +using namespace std; + +namespace WebCore { + +double parseLocalizedNumber(const String&) +{ + return numeric_limits<double>::quiet_NaN(); +} + +String formatLocalizedNumber(double) +{ + return String(); +} + +bool isLocalizedNumberCharacter(UChar32) +{ + return false; +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/RegularExpression.cpp b/Source/WebCore/platform/text/RegularExpression.cpp index 9b063c9..e020b91 100644 --- a/Source/WebCore/platform/text/RegularExpression.cpp +++ b/Source/WebCore/platform/text/RegularExpression.cpp @@ -1,6 +1,7 @@ /* * Copyright (C) 2004, 2008, 2009 Apple Inc. All rights reserved. * Copyright (C) 2008 Collabora Ltd. 
+ * Copyright (C) 2011 Peter Varga (pvarga@webkit.org), University of Szeged * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,52 +28,48 @@ #include "config.h" #include "RegularExpression.h" +#include <wtf/BumpPointerAllocator.h> +#include <yarr/Yarr.h> #include "Logging.h" -#include <pcre/pcre.h> namespace WebCore { class RegularExpression::Private : public RefCounted<RegularExpression::Private> { public: - static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity); - ~Private(); + static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity) + { + return adoptRef(new Private(pattern, caseSensitivity)); + } - JSRegExp* regexp() const { return m_regexp; } - int lastMatchLength; + int lastMatchLength; -private: - Private(const String& pattern, TextCaseSensitivity); - static JSRegExp* compile(const String& pattern, TextCaseSensitivity); + unsigned m_numSubpatterns; + OwnPtr<JSC::Yarr::BytecodePattern> m_regExpByteCode; - JSRegExp* m_regexp; -}; +private: + Private(const String& pattern, TextCaseSensitivity caseSensitivity) + : lastMatchLength(-1) + , m_regExpByteCode(compile(pattern, caseSensitivity)) + , m_constructionError(0) + { + } -inline JSRegExp* RegularExpression::Private::compile(const String& pattern, TextCaseSensitivity caseSensitivity) -{ - const char* errorMessage; - JSRegExp* regexp = jsRegExpCompile(pattern.characters(), pattern.length(), - caseSensitivity == TextCaseSensitive ? 
JSRegExpDoNotIgnoreCase : JSRegExpIgnoreCase, JSRegExpSingleLine, - 0, &errorMessage); - if (!regexp) - LOG_ERROR("RegularExpression: pcre_compile failed with '%s'", errorMessage); - return regexp; -} + PassOwnPtr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity) + { + JSC::Yarr::YarrPattern pattern(JSC::UString(patternString.impl()), (caseSensitivity == TextCaseInsensitive), false, &m_constructionError); + if (m_constructionError) { + LOG_ERROR("RegularExpression: YARR compile failed with '%s'", m_constructionError); + return PassOwnPtr<JSC::Yarr::BytecodePattern>(); + } -inline RegularExpression::Private::Private(const String& pattern, TextCaseSensitivity caseSensitivity) - : lastMatchLength(-1) - , m_regexp(compile(pattern, caseSensitivity)) -{ -} + m_numSubpatterns = pattern.m_numSubpatterns; -inline PassRefPtr<RegularExpression::Private> RegularExpression::Private::create(const String& pattern, TextCaseSensitivity caseSensitivity) -{ - return adoptRef(new Private(pattern, caseSensitivity)); -} + return JSC::Yarr::byteCompile(pattern, &m_regexAllocator); + } -RegularExpression::Private::~Private() -{ - jsRegExpFree(m_regexp); -} + BumpPointerAllocator m_regexAllocator; + const char* m_constructionError; +}; RegularExpression::RegularExpression(const String& pattern, TextCaseSensitivity caseSensitivity) : d(Private::create(pattern, caseSensitivity)) @@ -96,28 +93,36 @@ RegularExpression& RegularExpression::operator=(const RegularExpression& re) int RegularExpression::match(const String& str, int startFrom, int* matchLength) const { - if (!d->regexp()) + if (!d->m_regExpByteCode) return -1; if (str.isNull()) return -1; - // First 2 offsets are start and end offsets; 3rd entry is used internally by pcre - static const size_t maxOffsets = 3; - int offsets[maxOffsets]; - int result = jsRegExpExecute(d->regexp(), str.characters(), str.length(), startFrom, offsets, maxOffsets); + int offsetVectorSize = 
(d->m_numSubpatterns + 1) * 2; + int* offsetVector; + Vector<int, 32> nonReturnedOvector; + + nonReturnedOvector.resize(offsetVectorSize); + offsetVector = nonReturnedOvector.data(); + + ASSERT(offsetVector); + for (unsigned j = 0, i = 0; i < d->m_numSubpatterns + 1; j += 2, i++) + offsetVector[j] = -1; + + int result = JSC::Yarr::interpret(d->m_regExpByteCode.get(), str.characters(), startFrom, str.length(), offsetVector); + ASSERT(result >= -1); + if (result < 0) { - if (result != JSRegExpErrorNoMatch) - LOG_ERROR("RegularExpression: pcre_exec() failed with result %d", result); d->lastMatchLength = -1; return -1; } - // 1 means 1 match; 0 means more than one match. First match is recorded in offsets. - d->lastMatchLength = offsets[1] - offsets[0]; + // 1 means 1 match; 0 means more than one match. First match is recorded in offsetVector. + d->lastMatchLength = offsetVector[1] - offsetVector[0]; if (matchLength) *matchLength = d->lastMatchLength; - return offsets[0]; + return offsetVector[0]; } int RegularExpression::searchRev(const String& str) const diff --git a/Source/WebCore/platform/text/SegmentedString.cpp b/Source/WebCore/platform/text/SegmentedString.cpp index 5e9755b..7c859dc 100644 --- a/Source/WebCore/platform/text/SegmentedString.cpp +++ b/Source/WebCore/platform/text/SegmentedString.cpp @@ -186,17 +186,6 @@ void SegmentedString::advanceSubstring() } } -int SegmentedString::numberOfCharactersConsumedSlow() const -{ - int result = m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed(); - if (m_pushedChar1) { - --result; - if (m_pushedChar2) - --result; - } - return result; -} - String SegmentedString::toString() const { String result; @@ -262,14 +251,14 @@ WTF::ZeroBasedNumber SegmentedString::currentLine() const WTF::ZeroBasedNumber SegmentedString::currentColumn() const { - int zeroBasedColumn = numberOfCharactersConsumedSlow() - m_numberOfCharactersConsumedPriorToCurrentLine; + int zeroBasedColumn = 
numberOfCharactersConsumed() - m_numberOfCharactersConsumedPriorToCurrentLine; return WTF::ZeroBasedNumber::fromZeroBasedInt(zeroBasedColumn); } void SegmentedString::setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength) { m_currentLine = line.zeroBasedInt(); - m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumedSlow() + prologLength - columnAftreProlog.zeroBasedInt(); + m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + prologLength - columnAftreProlog.zeroBasedInt(); } } diff --git a/Source/WebCore/platform/text/SegmentedString.h b/Source/WebCore/platform/text/SegmentedString.h index 30c899d..3784b50 100644 --- a/Source/WebCore/platform/text/SegmentedString.h +++ b/Source/WebCore/platform/text/SegmentedString.h @@ -206,13 +206,15 @@ public: int numberOfCharactersConsumed() const { - // We don't currently handle the case when there are pushed character. - ASSERT(!m_pushedChar1); - return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed(); + int numberOfPushedCharacters = 0; + if (m_pushedChar1) { + ++numberOfPushedCharacters; + if (m_pushedChar2) + ++numberOfPushedCharacters; + } + return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed() - numberOfPushedCharacters; } - int numberOfCharactersConsumedSlow() const; - String toString() const; const UChar& operator*() const { return *current(); } diff --git a/Source/WebCore/platform/text/TextCodecICU.cpp b/Source/WebCore/platform/text/TextCodecICU.cpp index 6a579f9..92a158a 100644 --- a/Source/WebCore/platform/text/TextCodecICU.cpp +++ b/Source/WebCore/platform/text/TextCodecICU.cpp @@ -27,7 +27,6 @@ #include "config.h" #include "TextCodecICU.h" -#include "CharacterNames.h" #include "PlatformString.h" #include "ThreadGlobalData.h" #include <unicode/ucnv.h> @@ -37,6 +36,7 @@ #include <wtf/PassOwnPtr.h> #include 
<wtf/StringExtras.h> #include <wtf/Threading.h> +#include <wtf/unicode/CharacterNames.h> using std::min; diff --git a/Source/WebCore/platform/text/TextCodecUTF16.cpp b/Source/WebCore/platform/text/TextCodecUTF16.cpp index e88e83b..4ceed23 100644 --- a/Source/WebCore/platform/text/TextCodecUTF16.cpp +++ b/Source/WebCore/platform/text/TextCodecUTF16.cpp @@ -71,6 +71,8 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool, bool if (!length) return String(); + // FIXME: This should generate an error if there is an unpaired surrogate. + const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes); size_t numBytes = length + m_haveBufferedByte; size_t numChars = numBytes / 2; diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp new file mode 100644 index 0000000..8944d68 --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecUTF8.h" + +#include <wtf/text/CString.h> +#include <wtf/text/StringBuffer.h> +#include <wtf/unicode/UTF8.h> + +using namespace WTF::Unicode; +using namespace std; + +namespace WebCore { + +// Assuming that a pointer is the size of a "machine word", then +// uintptr_t is an integer type that is also a machine word. +typedef uintptr_t MachineWord; + +// This constant has type uintptr_t since we will use it to align +// pointers. Not because MachineWord is uintptr_t. 
+const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1; + +template<size_t size> struct NonASCIIMask; +template<> struct NonASCIIMask<4> { + static unsigned value() { return 0x80808080U; } +}; +template<> struct NonASCIIMask<8> { + static unsigned long long value() { return 0x8080808080808080ULL; } +}; + +template<size_t size> struct UCharByteFiller; +template<> struct UCharByteFiller<4> { + static void copy(UChar* destination, const uint8_t* source) + { + destination[0] = source[0]; + destination[1] = source[1]; + destination[2] = source[2]; + destination[3] = source[3]; + } +}; +template<> struct UCharByteFiller<8> { + static void copy(UChar* destination, const uint8_t* source) + { + destination[0] = source[0]; + destination[1] = source[1]; + destination[2] = source[2]; + destination[3] = source[3]; + destination[4] = source[4]; + destination[5] = source[5]; + destination[6] = source[6]; + destination[7] = source[7]; + } +}; + +static inline bool isAlignedToMachineWord(const void* pointer) +{ + return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask); +} + +template<typename T> static inline T* alignToMachineWord(T* pointer) +{ + return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask); +} + +PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) +{ + return adoptPtr(new TextCodecUTF8); +} + +void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) +{ + registrar("UTF-8", "UTF-8"); +} + +void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) +{ + registrar("UTF-8", create, 0); +} + +static inline int nonASCIISequenceLength(unsigned char firstByte) +{ + ASSERT(!isASCII(firstByte)); + switch (firstByte >> 4) { + case 0xF: + return 4; + case 0xE: + return 3; + } + return 2; +} + +static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length) +{ + ASSERT(!isASCII(sequence[0])); + if (length == 2) { + ASSERT(sequence[0] <= 
0xDF); + if (sequence[0] < 0xC2) + return -1; + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return -1; + return ((sequence[0] << 6) + sequence[1]) - 0x00003080; + } + if (length == 3) { + ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); + switch (sequence[0]) { + case 0xE0: + if (sequence[1] < 0xA0 || sequence[1] > 0xBF) + return -1; + break; + case 0xED: + if (sequence[1] < 0x80 || sequence[1] > 0x9F) + return -1; + break; + default: + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return -1; + } + if (sequence[2] < 0x80 || sequence[2] > 0xBF) + return -1; + return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; + } + ASSERT(length == 4); + ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); + switch (sequence[0]) { + case 0xF0: + if (sequence[1] < 0x90 || sequence[1] > 0xBF) + return -1; + break; + case 0xF4: + if (sequence[1] < 0x80 || sequence[1] > 0x8F) + return -1; + break; + default: + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return -1; + } + if (sequence[2] < 0x80 || sequence[2] > 0xBF) + return -1; + if (sequence[3] < 0x80 || sequence[3] > 0xBF) + return -1; + return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; +} + +String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + StringBuffer buffer(length); + + const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); + const uint8_t* end = source + length; + const uint8_t* alignedEnd = alignToMachineWord(end); + UChar* destination = buffer.characters(); + + int count; + int character; + + if (m_partialSequenceSize) { + count = nonASCIISequenceLength(m_partialSequence[0]); + ASSERT(count > m_partialSequenceSize); + if (count - m_partialSequenceSize > end - source) { + memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); + m_partialSequenceSize += end - source; + source = end; + } else { + uint8_t completeSequence[U8_MAX_LENGTH]; + 
memcpy(completeSequence, m_partialSequence, m_partialSequenceSize); + memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); + source += count - m_partialSequenceSize; + m_partialSequenceSize = 0; + character = decodeNonASCIISequence(completeSequence, count); + goto decodedNonASCII; + } + } + + while (source < end) { + if (isASCII(*source)) { + // Fast path for ASCII. Most UTF-8 text will be ASCII. + if (isAlignedToMachineWord(source)) { + while (source < alignedEnd) { + MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); + if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) { + if (isASCII(*source)) + break; + goto nonASCII; + } + UCharByteFiller<sizeof(MachineWord)>::copy(destination, source); + source += sizeof(MachineWord); + destination += sizeof(MachineWord); + } + if (source == end) + break; + } + *destination++ = *source++; + } else { +nonASCII: + count = nonASCIISequenceLength(*source); + ASSERT(count >= 2); + ASSERT(count <= 4); + if (count > end - source) { + ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence))); + ASSERT(!m_partialSequenceSize); + m_partialSequenceSize = end - source; + memcpy(m_partialSequence, source, m_partialSequenceSize); + break; + } + character = decodeNonASCIISequence(source, count); + source += count; +decodedNonASCII: + if (character < 0) { + if (stopOnError) { + sawError = true; + break; + } + } else { + ASSERT(!U_IS_SURROGATE(character)); + if (U_IS_BMP(character)) + *destination++ = character; + else { + *destination++ = U16_LEAD(character); + *destination++ = U16_TRAIL(character); + } + } + } + } + + buffer.shrink(destination - buffer.characters()); + + if (flush && m_partialSequenceSize) + sawError = true; + + return String::adopt(buffer); +} + +CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) +{ + // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. 
+ // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x). + // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x). + if (length > numeric_limits<size_t>::max() / 3) + CRASH(); + Vector<uint8_t> bytes(length * 3); + + size_t i = 0; + size_t bytesWritten = 0; + while (i < length) { + UChar32 character; + U16_NEXT(characters, i, length, character); + U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); + } + + return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); +} + +} // namespace WebCore diff --git a/Source/WebCore/platform/text/TextCodecUTF8.h b/Source/WebCore/platform/text/TextCodecUTF8.h new file mode 100644 index 0000000..f3b6b7a --- /dev/null +++ b/Source/WebCore/platform/text/TextCodecUTF8.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TextCodecUTF8_h
+#define TextCodecUTF8_h
+
+#include "TextCodec.h"
+
+namespace WebCore {
+
+// TextCodec subclass for UTF-8. It declares a decode() that turns bytes into a
+// String and an encode() that turns UTF-16 code units into a CString, and it
+// keeps a small amount of state between decode() calls (see the members below).
+class TextCodecUTF8 : public TextCodec {
+public:
+    // Static registration hooks taking the registrar callbacks by value.
+    // NOTE(review): presumably these register the UTF-8 encoding names and the
+    // create() factory; the definitions are not in this header - confirm.
+    static void registerEncodingNames(EncodingNameRegistrar);
+    static void registerCodecs(TextCodecRegistrar);
+
+    // decode(): consumes |length| bytes; |sawError| is an out-parameter.
+    // encode(): consumes |length| UTF-16 code units.
+    virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError);
+    virtual CString encode(const UChar*, size_t length, UnencodableHandling);
+
+private:
+    // The constructor is private, so only members such as create() can
+    // instantiate the codec. A new codec starts with no buffered bytes.
+    static PassOwnPtr<TextCodec> create(const TextEncoding&, const void*);
+    TextCodecUTF8() : m_partialSequenceSize(0) { }
+
+    // Number of bytes currently held in m_partialSequence (0 when empty).
+    int m_partialSequenceSize;
+    // Storage for U8_MAX_LENGTH - 1 bytes, i.e. one byte short of U8_MAX_LENGTH.
+    // NOTE(review): presumably holds the leading bytes of a multi-byte sequence
+    // that was cut off at the end of a decode() input - confirm in the .cpp.
+    char m_partialSequence[U8_MAX_LENGTH - 1];
+
+};
+
+} // namespace WebCore
+
+#endif // TextCodecUTF8_h
diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.cpp b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
index c0c0255..1dc09ee 100644
--- a/Source/WebCore/platform/text/TextEncodingRegistry.cpp
+++ b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006, 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
  * Copyright (C) 2007-2009 Torch Mobile, Inc.
* * Redistribution and use in source and binary forms, with or without @@ -27,14 +27,12 @@ #include "config.h" #include "TextEncodingRegistry.h" -#include "PlatformString.h" #include "TextCodecLatin1.h" #include "TextCodecUserDefined.h" #include "TextCodecUTF16.h" +#include "TextCodecUTF8.h" #include "TextEncoding.h" #include <wtf/ASCIICType.h> -#include <wtf/Assertions.h> -#include <wtf/HashFunctions.h> #include <wtf/HashMap.h> #include <wtf/HashSet.h> #include <wtf/StdLibExtras.h> @@ -68,7 +66,6 @@ const size_t maxEncodingNameLength = 63; // Hash for all-ASCII strings that does case folding. struct TextEncodingNameHash { - static bool equal(const char* s1, const char* s2) { char c1; @@ -129,9 +126,7 @@ static bool didExtendTextCodecMaps; static HashSet<const char*>* japaneseEncodings; static HashSet<const char*>* nonBackslashEncodings; -static const char* const textEncodingNameBlacklist[] = { - "UTF-7" -}; +static const char* const textEncodingNameBlacklist[] = { "UTF-7" }; #if ERROR_DISABLED @@ -268,7 +263,7 @@ static void buildQuirksSets() ASSERT(!japaneseEncodings); ASSERT(!nonBackslashEncodings); - japaneseEncodings = new HashSet<const char*>(); + japaneseEncodings = new HashSet<const char*>; addEncodingName(japaneseEncodings, "EUC-JP"); addEncodingName(japaneseEncodings, "ISO-2022-JP"); addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); @@ -284,7 +279,7 @@ static void buildQuirksSets() addEncodingName(japaneseEncodings, "cp932"); addEncodingName(japaneseEncodings, "x-mac-japanese"); - nonBackslashEncodings = new HashSet<const char*>(); + nonBackslashEncodings = new HashSet<const char*>; // The text encodings below treat backslash as a currency symbol for IE compatibility. // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 
addEncodingName(nonBackslashEncodings, "x-mac-japanese"); diff --git a/Source/WebCore/platform/text/mac/TextCodecMac.cpp b/Source/WebCore/platform/text/mac/TextCodecMac.cpp index b743f3d..64d0485 100644 --- a/Source/WebCore/platform/text/mac/TextCodecMac.cpp +++ b/Source/WebCore/platform/text/mac/TextCodecMac.cpp @@ -27,15 +27,15 @@ #include "config.h" #include "TextCodecMac.h" -#include "CharacterNames.h" #include "CharsetData.h" #include "PlatformString.h" #include "ThreadGlobalData.h" #include <wtf/Assertions.h> -#include <wtf/text/CString.h> #include <wtf/PassOwnPtr.h> #include <wtf/RetainPtr.h> #include <wtf/Threading.h> +#include <wtf/text/CString.h> +#include <wtf/unicode/CharacterNames.h> using namespace std; diff --git a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp index 68601f9..4e07f50 100644 --- a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp +++ b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp @@ -31,9 +31,9 @@ #include "config.h" #include "FontTranscoder.h" -#include "CharacterNames.h" #include "FontDescription.h" #include "TextEncoding.h" +#include <wtf/unicode/CharacterNames.h> namespace WebCore { |