Merge WebKit at r78450: Initial merge by git.

Change-Id: I6d3e5f1f868ec266a0aafdef66182ddc3f265dc1
author: Steve Block <steveblock@google.com> 2011-05-18 13:36:51 +0100
committer: Steve Block <steveblock@google.com> 2011-05-24 15:38:28 +0100
commit: 2fc2651226baac27029e38c9d6ef883fa32084db (patch)
tree: e396d4bf89dcce6ed02071be66212495b1df1dec /Source/WebCore/platform/text
parent: b3725cedeb43722b3b175aaeff70552e562d2c94 (diff)
download: external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.zip
external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.gz
external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.bz2
16 files changed, 547 insertions, 173 deletions
diff --git a/Source/WebCore/platform/text/Base64.cpp b/Source/WebCore/platform/text/Base64.cpp
index 98b537a..bf706f6 100644
--- a/Source/WebCore/platform/text/Base64.cpp
+++ b/Source/WebCore/platform/text/Base64.cpp
@@ -60,9 +60,11 @@ static const char base64DecMap[128] = {
     0x31, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 
-void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
+String base64Encode(const char* data, unsigned length, bool insertLFs)
 {
-    base64Encode(in.data(), in.size(), out, insertLFs);
+    Vector<char> result;
+    base64Encode(data, length, result, insertLFs);
+    return String(result.data(), result.size());
 }
 
 void base64Encode(const char* data, unsigned len, Vector<char>& out, bool insertLFs)
diff --git a/Source/WebCore/platform/text/Base64.h b/Source/WebCore/platform/text/Base64.h
index 211bd3c..70855de 100644
--- a/Source/WebCore/platform/text/Base64.h
+++ b/Source/WebCore/platform/text/Base64.h
@@ -27,20 +27,45 @@
 #ifndef Base64_h
 #define Base64_h
 
-#include <wtf/Forward.h>
 #include <wtf/Vector.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
 
 namespace WebCore {
 
 enum Base64DecodePolicy { FailOnInvalidCharacter, IgnoreWhitespace, IgnoreInvalidCharacters };
 
-void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false);
 void base64Encode(const char*, unsigned, Vector<char>&, bool insertLFs = false);
+void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false);
+void base64Encode(const CString&, Vector<char>&, bool insertLFs = false);
+String base64Encode(const char*, unsigned, bool insertLFs = false);
+String base64Encode(const Vector<char>&, bool insertLFs = false);
+String base64Encode(const CString&, bool insertLFs = false);
 
 bool base64Decode(const String&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
 bool base64Decode(const Vector<char>&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
 bool base64Decode(const char*, unsigned, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
 
+inline void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
+{
+    base64Encode(in.data(), in.size(), out, insertLFs); // Forwards the vector's storage and size to the pointer/length overload.
 }
 
+inline void base64Encode(const CString& in, Vector<char>& out, bool insertLFs)
+{
+    base64Encode(in.data(), in.length(), out, insertLFs); // Forwards the CString's data() and length() to the pointer/length overload.
+}
+
+inline String base64Encode(const Vector<char>& in, bool insertLFs)
+{
+    return base64Encode(in.data(), in.size(), insertLFs); // String-returning form; delegates to the pointer/length overload.
+}
+
+inline String base64Encode(const CString& in, bool insertLFs)
+{
+    return base64Encode(in.data(), in.length(), insertLFs); // String-returning form; delegates to the pointer/length overload.
+}
+
+} // namespace WebCore
+
 #endif // Base64_h
diff --git a/Source/WebCore/platform/text/BidiResolver.h b/Source/WebCore/platform/text/BidiResolver.h
index 8abd698..72d163c 100644
--- a/Source/WebCore/platform/text/BidiResolver.h
+++ b/Source/WebCore/platform/text/BidiResolver.h
@@ -161,7 +161,7 @@ public :
     MidpointState<Iterator>& midpointState() { return m_midpointState; }
 
     void embed(WTF::Unicode::Direction);
-    void commitExplicitEmbedding();
+    bool commitExplicitEmbedding();
 
     void createBidiRunsForLine(const Iterator& end, bool visualOrder = false, bool hardLineBreak = false);
 
@@ -400,7 +400,7 @@ void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Dire
 }
 
 template <class Iterator, class Run>
-void BidiResolver<Iterator, Run>::commitExplicitEmbedding()
+bool BidiResolver<Iterator, Run>::commitExplicitEmbedding()
 {
     using namespace WTF::Unicode;
 
@@ -440,6 +440,8 @@ void BidiResolver<Iterator, Run>::commitExplicitEmbedding()
     setContext(toContext);
 
     m_currentExplicitEmbeddingSequence.clear();
+
+    return fromLevel != toLevel;
 }
 
 template <class Iterator, class Run>
@@ -881,8 +883,8 @@ void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, boo
 
         increment();
         if (!m_currentExplicitEmbeddingSequence.isEmpty()) {
-            commitExplicitEmbedding();
-            if (pastEnd) {
+            bool committed = commitExplicitEmbedding();
+            if (committed && pastEnd) {
                 current = end;
                 m_status = stateAtEnd.m_status;
                 sor = stateAtEnd.sor; 
diff --git a/Source/WebCore/platform/text/CharacterNames.h b/Source/WebCore/platform/text/CharacterNames.h
deleted file mode 100644
index c4b496e..0000000
--- a/Source/WebCore/platform/text/CharacterNames.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#ifndef CharacterNames_h
-#define CharacterNames_h
-
-#include <wtf/unicode/Unicode.h>
-
-namespace WebCore {
-
-// Names here are taken from the Unicode standard.
-
-// Most of these are UChar constants, not UChar32, which makes them
-// more convenient for WebCore code that mostly uses UTF-16.
-
-const UChar32 aegeanWordSeparatorLine = 0x10100;
-const UChar32 aegeanWordSeparatorDot = 0x10101;
-const UChar blackCircle = 0x25CF;
-const UChar blackSquare = 0x25A0;
-const UChar blackUpPointingTriangle = 0x25B2;
-const UChar bullet = 0x2022;
-const UChar bullseye = 0x25CE;
-const UChar carriageReturn = 0x000D;
-const UChar ethiopicPrefaceColon = 0x1366;
-const UChar ethiopicWordspace = 0x1361;
-const UChar fisheye = 0x25C9;
-const UChar hebrewPunctuationGeresh = 0x05F3;
-const UChar hebrewPunctuationGershayim = 0x05F4;
-const UChar horizontalEllipsis = 0x2026;
-const UChar hyphen = 0x2010;
-const UChar hyphenMinus = 0x002D;
-const UChar ideographicComma = 0x3001;
-const UChar ideographicFullStop = 0x3002;
-const UChar ideographicSpace = 0x3000;
-const UChar leftDoubleQuotationMark = 0x201C;
-const UChar leftSingleQuotationMark = 0x2018;
-const UChar leftToRightEmbed = 0x202A;
-const UChar leftToRightMark = 0x200E;
-const UChar leftToRightOverride = 0x202D;
-const UChar minusSign = 0x2212;
-const UChar newlineCharacter = 0x000A;
-const UChar noBreakSpace = 0x00A0;
-const UChar objectReplacementCharacter = 0xFFFC;
-const UChar popDirectionalFormatting = 0x202C;
-const UChar replacementCharacter = 0xFFFD;
-const UChar rightDoubleQuotationMark = 0x201D;
-const UChar rightSingleQuotationMark = 0x2019;
-const UChar rightToLeftEmbed = 0x202B;
-const UChar rightToLeftMark = 0x200F;
-const UChar rightToLeftOverride = 0x202E;
-const UChar sesameDot = 0xFE45;
-const UChar softHyphen = 0x00AD;
-const UChar space = 0x0020;
-const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B;
-const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C;
-const UChar32 ugariticWordDivider = 0x1039F;
-const UChar whiteBullet = 0x25E6;
-const UChar whiteCircle = 0x25CB;
-const UChar whiteSesameDot = 0xFE46;
-const UChar whiteUpPointingTriangle = 0x25B3;
-const UChar yenSign = 0x00A5;
-const UChar zeroWidthJoiner = 0x200D;
-const UChar zeroWidthNonJoiner = 0x200C;
-const UChar zeroWidthSpace = 0x200B;
-
-}
-
-#endif // CharacterNames_h
diff --git a/Source/WebCore/platform/text/LocalizedNumber.h b/Source/WebCore/platform/text/LocalizedNumber.h
new file mode 100644
index 0000000..45873b8
--- /dev/null
+++ b/Source/WebCore/platform/text/LocalizedNumber.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2011 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LocalizedNumber_h
+#define LocalizedNumber_h
+
+#include <wtf/text/WTFString.h>
+
+namespace WebCore {
+
+// Parses a string representation of a floating point number localized
+// for the browser's current locale. If the input string is not valid
+// or an implementation doesn't support localized numbers, this
+// function returns NaN. This function doesn't need to support
+// scientific notation, NaN, +Infinity and -Infinity, and doesn't need
+// to support the standard representations of ECMAScript and HTML5.
+double parseLocalizedNumber(const String&);
+
+// Serializes the specified floating point number for the browser's
+// current locale.  If an implementation doesn't support localized
+// numbers or the input value is NaN or Infinity, the function should
+// return an empty string.
+String formatLocalizedNumber(double);
+
+// Returns true if the input character can be used to represent a
+// number in the browser locale. For example, this should return true for
+// the digits 0-9 and for '.', ',', '+' and '-' in the en-US locale.
+bool isLocalizedNumberCharacter(UChar32);
+
+} // namespace WebCore
+
+#endif // LocalizedNumber_h
diff --git a/Source/WebCore/platform/text/LocalizedNumberNone.cpp b/Source/WebCore/platform/text/LocalizedNumberNone.cpp
new file mode 100644
index 0000000..6f017e9
--- /dev/null
+++ b/Source/WebCore/platform/text/LocalizedNumberNone.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2011 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "LocalizedNumber.h"
+
+#include <limits>
+
+using namespace std;
+
+namespace WebCore {
+
+double parseLocalizedNumber(const String&)
+{
+    return numeric_limits<double>::quiet_NaN(); // Stub: the input is ignored and NaN is always returned.
+}
+
+String formatLocalizedNumber(double)
+{
+    return String(); // Stub: the value is ignored and a default-constructed String is always returned.
+}
+
+bool isLocalizedNumberCharacter(UChar32)
+{
+    return false; // Stub: no character is ever reported as a localized number character.
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/RegularExpression.cpp b/Source/WebCore/platform/text/RegularExpression.cpp
index 9b063c9..e020b91 100644
--- a/Source/WebCore/platform/text/RegularExpression.cpp
+++ b/Source/WebCore/platform/text/RegularExpression.cpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2004, 2008, 2009 Apple Inc. All rights reserved.
  * Copyright (C) 2008 Collabora Ltd.
+ * Copyright (C) 2011 Peter Varga (pvarga@webkit.org), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -27,52 +28,48 @@
 #include "config.h"
 #include "RegularExpression.h"
 
+#include <wtf/BumpPointerAllocator.h>
+#include <yarr/Yarr.h>
 #include "Logging.h"
-#include <pcre/pcre.h>
 
 namespace WebCore {
 
 class RegularExpression::Private : public RefCounted<RegularExpression::Private> {
 public:
-    static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity);
-    ~Private();
+    static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity)
+    {
+        return adoptRef(new Private(pattern, caseSensitivity));
+    }
 
-    JSRegExp* regexp() const { return m_regexp; }
-    int lastMatchLength;    
+    int lastMatchLength;
 
-private:
-    Private(const String& pattern, TextCaseSensitivity);
-    static JSRegExp* compile(const String& pattern, TextCaseSensitivity);
+    unsigned m_numSubpatterns;
+    OwnPtr<JSC::Yarr::BytecodePattern> m_regExpByteCode;
 
-    JSRegExp* m_regexp;
-};
+private:
+    Private(const String& pattern, TextCaseSensitivity caseSensitivity)
+        : lastMatchLength(-1)
+        , m_regExpByteCode(compile(pattern, caseSensitivity))
+        , m_constructionError(0)
+    {
+    }
 
-inline JSRegExp* RegularExpression::Private::compile(const String& pattern, TextCaseSensitivity caseSensitivity)
-{
-    const char* errorMessage;
-    JSRegExp* regexp = jsRegExpCompile(pattern.characters(), pattern.length(),
-        caseSensitivity == TextCaseSensitive ? JSRegExpDoNotIgnoreCase : JSRegExpIgnoreCase, JSRegExpSingleLine,
-        0, &errorMessage);
-    if (!regexp)
-        LOG_ERROR("RegularExpression: pcre_compile failed with '%s'", errorMessage);
-    return regexp;
-}
+    PassOwnPtr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity)
+    {
+        JSC::Yarr::YarrPattern pattern(JSC::UString(patternString.impl()), (caseSensitivity == TextCaseInsensitive), false, &m_constructionError);
+        if (m_constructionError) {
+            LOG_ERROR("RegularExpression: YARR compile failed with '%s'", m_constructionError);
+            return PassOwnPtr<JSC::Yarr::BytecodePattern>();
+        }
 
-inline RegularExpression::Private::Private(const String& pattern, TextCaseSensitivity caseSensitivity)
-    : lastMatchLength(-1)
-    , m_regexp(compile(pattern, caseSensitivity))
-{
-}
+        m_numSubpatterns = pattern.m_numSubpatterns;
 
-inline PassRefPtr<RegularExpression::Private> RegularExpression::Private::create(const String& pattern, TextCaseSensitivity caseSensitivity)
-{
-    return adoptRef(new Private(pattern, caseSensitivity));
-}
+        return JSC::Yarr::byteCompile(pattern, &m_regexAllocator);
+    }
 
-RegularExpression::Private::~Private()
-{
-    jsRegExpFree(m_regexp);
-}
+    BumpPointerAllocator m_regexAllocator;
+    const char* m_constructionError;
+};
 
 RegularExpression::RegularExpression(const String& pattern, TextCaseSensitivity caseSensitivity)
     : d(Private::create(pattern, caseSensitivity))
@@ -96,28 +93,36 @@ RegularExpression& RegularExpression::operator=(const RegularExpression& re)
 
 int RegularExpression::match(const String& str, int startFrom, int* matchLength) const
 {
-    if (!d->regexp())
+    if (!d->m_regExpByteCode)
         return -1;
 
     if (str.isNull())
         return -1;
 
-    // First 2 offsets are start and end offsets; 3rd entry is used internally by pcre
-    static const size_t maxOffsets = 3;
-    int offsets[maxOffsets];
-    int result = jsRegExpExecute(d->regexp(), str.characters(), str.length(), startFrom, offsets, maxOffsets);
+    int offsetVectorSize = (d->m_numSubpatterns + 1) * 2;
+    int* offsetVector;
+    Vector<int, 32> nonReturnedOvector;
+
+    nonReturnedOvector.resize(offsetVectorSize);
+    offsetVector = nonReturnedOvector.data();
+
+    ASSERT(offsetVector);
+    for (unsigned j = 0, i = 0; i < d->m_numSubpatterns + 1; j += 2, i++)
+        offsetVector[j] = -1;
+
+    int result = JSC::Yarr::interpret(d->m_regExpByteCode.get(), str.characters(), startFrom, str.length(), offsetVector);
+    ASSERT(result >= -1);
+
     if (result < 0) {
-        if (result != JSRegExpErrorNoMatch)
-            LOG_ERROR("RegularExpression: pcre_exec() failed with result %d", result);
         d->lastMatchLength = -1;
         return -1;
     }
 
-    // 1 means 1 match; 0 means more than one match. First match is recorded in offsets.
-    d->lastMatchLength = offsets[1] - offsets[0];
+    // On success the first match's start and end offsets are in offsetVector[0] and offsetVector[1].
+    d->lastMatchLength = offsetVector[1] - offsetVector[0];
     if (matchLength)
         *matchLength = d->lastMatchLength;
-    return offsets[0];
+    return offsetVector[0];
 }
 
 int RegularExpression::searchRev(const String& str) const
diff --git a/Source/WebCore/platform/text/SegmentedString.cpp b/Source/WebCore/platform/text/SegmentedString.cpp
index 5e9755b..7c859dc 100644
--- a/Source/WebCore/platform/text/SegmentedString.cpp
+++ b/Source/WebCore/platform/text/SegmentedString.cpp
@@ -186,17 +186,6 @@ void SegmentedString::advanceSubstring()
     }
 }
 
-int SegmentedString::numberOfCharactersConsumedSlow() const
-{
-    int result = m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
-    if (m_pushedChar1) {
-        --result;
-        if (m_pushedChar2)
-            --result;
-    }
-    return result;
-}
-
 String SegmentedString::toString() const
 {
     String result;
@@ -262,14 +251,14 @@ WTF::ZeroBasedNumber SegmentedString::currentLine() const
 
 WTF::ZeroBasedNumber SegmentedString::currentColumn() const
 {
-    int zeroBasedColumn = numberOfCharactersConsumedSlow() - m_numberOfCharactersConsumedPriorToCurrentLine;
+    int zeroBasedColumn = numberOfCharactersConsumed() - m_numberOfCharactersConsumedPriorToCurrentLine;
     return WTF::ZeroBasedNumber::fromZeroBasedInt(zeroBasedColumn);
 }
 
 void SegmentedString::setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength)
 {
     m_currentLine = line.zeroBasedInt();
-    m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumedSlow() + prologLength - columnAftreProlog.zeroBasedInt();
+    m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + prologLength - columnAftreProlog.zeroBasedInt();
 }
 
 }
diff --git a/Source/WebCore/platform/text/SegmentedString.h b/Source/WebCore/platform/text/SegmentedString.h
index 30c899d..3784b50 100644
--- a/Source/WebCore/platform/text/SegmentedString.h
+++ b/Source/WebCore/platform/text/SegmentedString.h
@@ -206,13 +206,15 @@ public:
 
     int numberOfCharactersConsumed() const
     {
-        // We don't currently handle the case when there are pushed character.
-        ASSERT(!m_pushedChar1);
-        return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
+        int numberOfPushedCharacters = 0;
+        if (m_pushedChar1) {
+            ++numberOfPushedCharacters;
+            if (m_pushedChar2)
+                ++numberOfPushedCharacters;
+        }
+        return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed() - numberOfPushedCharacters;
     }
 
-    int numberOfCharactersConsumedSlow() const;
-
     String toString() const;
 
     const UChar& operator*() const { return *current(); }
diff --git a/Source/WebCore/platform/text/TextCodecICU.cpp b/Source/WebCore/platform/text/TextCodecICU.cpp
index 6a579f9..92a158a 100644
--- a/Source/WebCore/platform/text/TextCodecICU.cpp
+++ b/Source/WebCore/platform/text/TextCodecICU.cpp
@@ -27,7 +27,6 @@
 #include "config.h"
 #include "TextCodecICU.h"
 
-#include "CharacterNames.h"
 #include "PlatformString.h"
 #include "ThreadGlobalData.h"
 #include <unicode/ucnv.h>
@@ -37,6 +36,7 @@
 #include <wtf/PassOwnPtr.h>
 #include <wtf/StringExtras.h>
 #include <wtf/Threading.h>
+#include <wtf/unicode/CharacterNames.h>
 
 using std::min;
 
diff --git a/Source/WebCore/platform/text/TextCodecUTF16.cpp b/Source/WebCore/platform/text/TextCodecUTF16.cpp
index e88e83b..4ceed23 100644
--- a/Source/WebCore/platform/text/TextCodecUTF16.cpp
+++ b/Source/WebCore/platform/text/TextCodecUTF16.cpp
@@ -71,6 +71,8 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool, bool
     if (!length)
         return String();
 
+    // FIXME: This should generate an error if there is an unpaired surrogate.
+
     const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
     size_t numBytes = length + m_haveBufferedByte;
     size_t numChars = numBytes / 2;
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
new file mode 100644
index 0000000..8944d68
--- /dev/null
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "TextCodecUTF8.h"
+
+#include <wtf/text/CString.h>
+#include <wtf/text/StringBuffer.h>
+#include <wtf/unicode/UTF8.h>
+
+using namespace WTF::Unicode;
+using namespace std;
+
+namespace WebCore {
+
+// Assuming that a pointer is the size of a "machine word", then
+// uintptr_t is an integer type that is also a machine word.
+typedef uintptr_t MachineWord;
+
+// This constant has type uintptr_t because we use it to align
+// pointers, not because MachineWord happens to be uintptr_t.
+const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1;
+
+template<size_t size> struct NonASCIIMask; // Mask with the high bit of every byte set, specialized per word size in bytes.
+template<> struct NonASCIIMask<4> {
+    static unsigned value() { return 0x80808080U; } // 0x80 repeated in each of the 4 bytes.
+};
+template<> struct NonASCIIMask<8> {
+    static unsigned long long value() { return 0x8080808080808080ULL; } // 0x80 repeated in each of the 8 bytes.
+};
+
+template<size_t size> struct UCharByteFiller; // Copies `size` consecutive bytes into `size` UChars, specialized per word size.
+template<> struct UCharByteFiller<4> {
+    static void copy(UChar* destination, const uint8_t* source) // Reads source[0..3], writes destination[0..3].
+    {
+        destination[0] = source[0];
+        destination[1] = source[1];
+        destination[2] = source[2];
+        destination[3] = source[3];
+    }
+};
+template<> struct UCharByteFiller<8> {
+    static void copy(UChar* destination, const uint8_t* source) // Reads source[0..7], writes destination[0..7].
+    {
+        destination[0] = source[0];
+        destination[1] = source[1];
+        destination[2] = source[2];
+        destination[3] = source[3];
+        destination[4] = source[4];
+        destination[5] = source[5];
+        destination[6] = source[6];
+        destination[7] = source[7];
+    }
+};
+
+static inline bool isAlignedToMachineWord(const void* pointer)
+{
+    return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask); // True when none of the mask's low address bits are set.
+}
+
+template<typename T> static inline T* alignToMachineWord(T* pointer)
+{
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask); // Clears the mask bits, i.e. rounds the address down.
+}
+
+PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
+{
+    return adoptPtr(new TextCodecUTF8); // Both arguments are unused; every call produces a fresh codec instance.
+}
+
+void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
+{
+    registrar("UTF-8", "UTF-8"); // NOTE(review): presumably (alias, canonical name) - confirm against EncodingNameRegistrar.
+}
+
+void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
+{
+    registrar("UTF-8", create, 0); // Passes the create() factory for "UTF-8"; the trailing 0 is presumably the factory's extra-data argument - confirm.
+}
+
+static inline int nonASCIISequenceLength(unsigned char firstByte)
+{
+    ASSERT(!isASCII(firstByte));
+    switch (firstByte >> 4) { // The high nibble of the first byte selects the sequence length; nothing is validated here.
+    case 0xF:
+        return 4;
+    case 0xE:
+        return 3;
+    }
+    return 2; // Every other non-ASCII first byte (high nibble 0x8-0xD) is treated as starting a 2-byte sequence.
+}
+
+static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length) // Returns the code point, or -1 if malformed.
+{
+    ASSERT(!isASCII(sequence[0]));
+    if (length == 2) {
+        ASSERT(sequence[0] <= 0xDF);
+        if (sequence[0] < 0xC2) // 0x80-0xBF are stray continuation bytes; 0xC0 and 0xC1 would be overlong encodings.
+            return -1;
+        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+            return -1;
+        return ((sequence[0] << 6) + sequence[1]) - 0x00003080; // The constant removes the 0xC0 and 0x80 marker bits.
+    }
+    if (length == 3) {
+        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+        switch (sequence[0]) {
+        case 0xE0: // Second byte below 0xA0 would be an overlong encoding.
+            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+                return -1;
+            break;
+        case 0xED: // Second byte above 0x9F would encode a surrogate (U+D800-U+DFFF).
+            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+                return -1;
+            break;
+        default:
+            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+                return -1;
+        }
+        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+            return -1;
+        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; // Removes the 0xE0/0x80/0x80 marker bits.
+    }
+    ASSERT(length == 4);
+    ASSERT(sequence[0] >= 0xF0); // First bytes above 0xF4 can come from untrusted input, so they are rejected at runtime below.
+    switch (sequence[0]) {
+    case 0xF0: // Second byte below 0x90 would be an overlong encoding.
+        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+            return -1;
+        break;
+    case 0xF4: // Second byte above 0x8F would exceed U+10FFFF.
+        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+            return -1;
+        break;
+    default: // 0xF1-0xF3 are valid first bytes; 0xF5-0xFF can never start well-formed UTF-8.
+        if (sequence[0] > 0xF4 || sequence[1] < 0x80 || sequence[1] > 0xBF)
+            return -1;
+    }
+    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+        return -1;
+    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+        return -1;
+    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; // Removes the 0xF0/0x80/0x80/0x80 marker bits.
+}
+
+String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
+{
+    // Decodes one chunk of UTF-8 to UTF-16; a sequence split across calls is carried over in m_partialSequence.
+    StringBuffer buffer(m_partialSequenceSize + length); // Carried-over bytes produce output too; each byte yields at most one UChar.
+    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
+    const uint8_t* end = source + length;
+    const uint8_t* alignedEnd = alignToMachineWord(end);
+    UChar* destination = buffer.characters();
+
+    int count;
+    int character;
+    // Finish a sequence that a previous call left incomplete before scanning the new bytes.
+    if (m_partialSequenceSize) {
+        count = nonASCIISequenceLength(m_partialSequence[0]);
+        ASSERT(count > m_partialSequenceSize);
+        if (count - m_partialSequenceSize > end - source) {
+            memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
+            m_partialSequenceSize += end - source;
+            source = end;
+        } else {
+            uint8_t completeSequence[U8_MAX_LENGTH];
+            memcpy(completeSequence, m_partialSequence, m_partialSequenceSize);
+            memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
+            source += count - m_partialSequenceSize;
+            m_partialSequenceSize = 0;
+            character = decodeNonASCIISequence(completeSequence, count);
+            goto decodedNonASCII;
+        }
+    }
+
+    while (source < end) {
+        if (isASCII(*source)) {
+            // Fast path for ASCII. Most UTF-8 text will be ASCII.
+            if (isAlignedToMachineWord(source)) {
+                while (source < alignedEnd) {
+                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
+                    if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) {
+                        if (isASCII(*source))
+                            break;
+                        goto nonASCII;
+                    }
+                    UCharByteFiller<sizeof(MachineWord)>::copy(destination, source);
+                    source += sizeof(MachineWord);
+                    destination += sizeof(MachineWord);
+                }
+                if (source == end)
+                    break;
+            }
+            *destination++ = *source++;
+        } else {
+nonASCII:
+            count = nonASCIISequenceLength(*source);
+            ASSERT(count >= 2);
+            ASSERT(count <= 4);
+            if (count > end - source) {
+                ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+                ASSERT(!m_partialSequenceSize);
+                m_partialSequenceSize = end - source;
+                memcpy(m_partialSequence, source, m_partialSequenceSize);
+                break;
+            }
+            character = decodeNonASCIISequence(source, count);
+            source += count;
+decodedNonASCII:
+            if (character < 0) { // Malformed sequence: nothing is emitted, and it is only reported when stopOnError is set.
+                if (stopOnError) {
+                    sawError = true;
+                    break;
+                }
+            } else {
+                ASSERT(!U_IS_SURROGATE(character));
+                if (U_IS_BMP(character))
+                    *destination++ = character;
+                else {
+                    *destination++ = U16_LEAD(character);
+                    *destination++ = U16_TRAIL(character);
+                }
+            }
+        }
+    }
+
+    buffer.shrink(destination - buffer.characters());
+    // When flushing, bytes still waiting in m_partialSequence are reported as an error.
+    if (flush && m_partialSequenceSize)
+        sawError = true;
+
+    return String::adopt(buffer);
+}
+
+CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
+{
+    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
+    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
+    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
+    if (length > numeric_limits<size_t>::max() / 3) // Guards the length * 3 multiplication below against overflow.
+        CRASH();
+    Vector<uint8_t> bytes(length * 3);
+    // Walk the UTF-16 input one code point at a time, appending each as UTF-8; the UnencodableHandling argument is unused.
+    size_t i = 0;
+    size_t bytesWritten = 0;
+    while (i < length) {
+        UChar32 character;
+        U16_NEXT(characters, i, length, character);
+        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); // Unchecked append; relies on the 3x worst-case sizing above.
+    }
+
+    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.h b/Source/WebCore/platform/text/TextCodecUTF8.h
new file mode 100644
index 0000000..f3b6b7a
--- /dev/null
+++ b/Source/WebCore/platform/text/TextCodecUTF8.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef TextCodecUTF8_h
+#define TextCodecUTF8_h
+
+#include "TextCodec.h"
+
+namespace WebCore {
+
+class TextCodecUTF8 : public TextCodec { // TextCodec subclass that converts between UTF-8 bytes and UTF-16 strings.
+public:
+    static void registerEncodingNames(EncodingNameRegistrar);
+    static void registerCodecs(TextCodecRegistrar);
+
+    virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); // Sets sawError when stopOnError is set and a sequence decodes to a negative (error) value, or when flush is set while a partial sequence is still pending.
+    virtual CString encode(const UChar*, size_t length, UnencodableHandling); // The UnencodableHandling argument is left unnamed (unused) in the definition.
+
+private:
+    static PassOwnPtr<TextCodec> create(const TextEncoding&, const void*);
+    TextCodecUTF8() : m_partialSequenceSize(0) { } // Starts with no pending partial sequence.
+
+    int m_partialSequenceSize; // Number of bytes currently held in m_partialSequence; 0 when nothing is pending.
+    char m_partialSequence[U8_MAX_LENGTH - 1]; // Unconsumed tail bytes of an incomplete sequence that decode() copied out of its input.
+    
+};
+
+} // namespace WebCore
+
+#endif // TextCodecUTF8_h
diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.cpp b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
index c0c0255..1dc09ee 100644
--- a/Source/WebCore/platform/text/TextEncodingRegistry.cpp
+++ b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006, 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
  * Copyright (C) 2007-2009 Torch Mobile, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,14 +27,12 @@
 #include "config.h"
 #include "TextEncodingRegistry.h"
 
-#include "PlatformString.h"
 #include "TextCodecLatin1.h"
 #include "TextCodecUserDefined.h"
 #include "TextCodecUTF16.h"
+#include "TextCodecUTF8.h"
 #include "TextEncoding.h"
 #include <wtf/ASCIICType.h>
-#include <wtf/Assertions.h>
-#include <wtf/HashFunctions.h>
 #include <wtf/HashMap.h>
 #include <wtf/HashSet.h>
 #include <wtf/StdLibExtras.h>
@@ -68,7 +66,6 @@ const size_t maxEncodingNameLength = 63;
 
 // Hash for all-ASCII strings that does case folding.
 struct TextEncodingNameHash {
-
     static bool equal(const char* s1, const char* s2)
     {
         char c1;
@@ -129,9 +126,7 @@ static bool didExtendTextCodecMaps;
 static HashSet<const char*>* japaneseEncodings;
 static HashSet<const char*>* nonBackslashEncodings;
 
-static const char* const textEncodingNameBlacklist[] = {
-    "UTF-7"
-};
+static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
 
 #if ERROR_DISABLED
 
@@ -268,7 +263,7 @@ static void buildQuirksSets()
     ASSERT(!japaneseEncodings);
     ASSERT(!nonBackslashEncodings);
 
-    japaneseEncodings = new HashSet<const char*>();
+    japaneseEncodings = new HashSet<const char*>;
     addEncodingName(japaneseEncodings, "EUC-JP");
     addEncodingName(japaneseEncodings, "ISO-2022-JP");
     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
@@ -284,7 +279,7 @@ static void buildQuirksSets()
     addEncodingName(japaneseEncodings, "cp932");
     addEncodingName(japaneseEncodings, "x-mac-japanese");
 
-    nonBackslashEncodings = new HashSet<const char*>();
+    nonBackslashEncodings = new HashSet<const char*>;
     // The text encodings below treat backslash as a currency symbol for IE compatibility.
     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
diff --git a/Source/WebCore/platform/text/mac/TextCodecMac.cpp b/Source/WebCore/platform/text/mac/TextCodecMac.cpp
index b743f3d..64d0485 100644
--- a/Source/WebCore/platform/text/mac/TextCodecMac.cpp
+++ b/Source/WebCore/platform/text/mac/TextCodecMac.cpp
@@ -27,15 +27,15 @@
 #include "config.h"
 #include "TextCodecMac.h"
 
-#include "CharacterNames.h"
 #include "CharsetData.h"
 #include "PlatformString.h"
 #include "ThreadGlobalData.h"
 #include <wtf/Assertions.h>
-#include <wtf/text/CString.h>
 #include <wtf/PassOwnPtr.h>
 #include <wtf/RetainPtr.h>
 #include <wtf/Threading.h>
+#include <wtf/text/CString.h>
+#include <wtf/unicode/CharacterNames.h>
 
 using namespace std;
 
diff --git a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
index 68601f9..4e07f50 100644
--- a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
+++ b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
@@ -31,9 +31,9 @@
 #include "config.h"
 #include "FontTranscoder.h"
 
-#include "CharacterNames.h"
 #include "FontDescription.h"
 #include "TextEncoding.h"
+#include <wtf/unicode/CharacterNames.h>
 
 namespace WebCore {
author	Steve Block <steveblock@google.com>	2011-05-18 13:36:51 +0100
committer	Steve Block <steveblock@google.com>	2011-05-24 15:38:28 +0100
commit	2fc2651226baac27029e38c9d6ef883fa32084db (patch)
tree	e396d4bf89dcce6ed02071be66212495b1df1dec /Source/WebCore/platform/text
parent	b3725cedeb43722b3b175aaeff70552e562d2c94 (diff)
download	external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.zip external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.gz external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.bz2