summaryrefslogtreecommitdiffstats
path: root/Source/WebCore/platform/text
diff options
context:
space:
mode:
authorSteve Block <steveblock@google.com>2011-05-18 13:36:51 +0100
committerSteve Block <steveblock@google.com>2011-05-24 15:38:28 +0100
commit2fc2651226baac27029e38c9d6ef883fa32084db (patch)
treee396d4bf89dcce6ed02071be66212495b1df1dec /Source/WebCore/platform/text
parentb3725cedeb43722b3b175aaeff70552e562d2c94 (diff)
downloadexternal_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.zip
external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.gz
external_webkit-2fc2651226baac27029e38c9d6ef883fa32084db.tar.bz2
Merge WebKit at r78450: Initial merge by git.
Change-Id: I6d3e5f1f868ec266a0aafdef66182ddc3f265dc1
Diffstat (limited to 'Source/WebCore/platform/text')
-rw-r--r--Source/WebCore/platform/text/Base64.cpp6
-rw-r--r--Source/WebCore/platform/text/Base64.h29
-rw-r--r--Source/WebCore/platform/text/BidiResolver.h10
-rw-r--r--Source/WebCore/platform/text/CharacterNames.h90
-rw-r--r--Source/WebCore/platform/text/LocalizedNumber.h59
-rw-r--r--Source/WebCore/platform/text/LocalizedNumberNone.cpp55
-rw-r--r--Source/WebCore/platform/text/RegularExpression.cpp91
-rw-r--r--Source/WebCore/platform/text/SegmentedString.cpp15
-rw-r--r--Source/WebCore/platform/text/SegmentedString.h12
-rw-r--r--Source/WebCore/platform/text/TextCodecICU.cpp2
-rw-r--r--Source/WebCore/platform/text/TextCodecUTF16.cpp2
-rw-r--r--Source/WebCore/platform/text/TextCodecUTF8.cpp276
-rw-r--r--Source/WebCore/platform/text/TextCodecUTF8.h52
-rw-r--r--Source/WebCore/platform/text/TextEncodingRegistry.cpp15
-rw-r--r--Source/WebCore/platform/text/mac/TextCodecMac.cpp4
-rw-r--r--Source/WebCore/platform/text/transcoder/FontTranscoder.cpp2
16 files changed, 547 insertions, 173 deletions
diff --git a/Source/WebCore/platform/text/Base64.cpp b/Source/WebCore/platform/text/Base64.cpp
index 98b537a..bf706f6 100644
--- a/Source/WebCore/platform/text/Base64.cpp
+++ b/Source/WebCore/platform/text/Base64.cpp
@@ -60,9 +60,11 @@ static const char base64DecMap[128] = {
0x31, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00
};
-void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
+String base64Encode(const char* data, unsigned length, bool insertLFs)
{
- base64Encode(in.data(), in.size(), out, insertLFs);
+ Vector<char> result;
+ base64Encode(data, length, result, insertLFs);
+ return String(result.data(), result.size());
}
void base64Encode(const char* data, unsigned len, Vector<char>& out, bool insertLFs)
diff --git a/Source/WebCore/platform/text/Base64.h b/Source/WebCore/platform/text/Base64.h
index 211bd3c..70855de 100644
--- a/Source/WebCore/platform/text/Base64.h
+++ b/Source/WebCore/platform/text/Base64.h
@@ -27,20 +27,45 @@
#ifndef Base64_h
#define Base64_h
-#include <wtf/Forward.h>
#include <wtf/Vector.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
namespace WebCore {
enum Base64DecodePolicy { FailOnInvalidCharacter, IgnoreWhitespace, IgnoreInvalidCharacters };
-void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false);
void base64Encode(const char*, unsigned, Vector<char>&, bool insertLFs = false);
+void base64Encode(const Vector<char>&, Vector<char>&, bool insertLFs = false);
+void base64Encode(const CString&, Vector<char>&, bool insertLFs = false);
+String base64Encode(const char*, unsigned, bool insertLFs = false);
+String base64Encode(const Vector<char>&, bool insertLFs = false);
+String base64Encode(const CString&, bool insertLFs = false);
bool base64Decode(const String&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
bool base64Decode(const Vector<char>&, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
bool base64Decode(const char*, unsigned, Vector<char>&, Base64DecodePolicy = FailOnInvalidCharacter);
+inline void base64Encode(const Vector<char>& in, Vector<char>& out, bool insertLFs)
+{
+ base64Encode(in.data(), in.size(), out, insertLFs);
}
+inline void base64Encode(const CString& in, Vector<char>& out, bool insertLFs)
+{
+ base64Encode(in.data(), in.length(), out, insertLFs);
+}
+
+inline String base64Encode(const Vector<char>& in, bool insertLFs)
+{
+ return base64Encode(in.data(), in.size(), insertLFs);
+}
+
+inline String base64Encode(const CString& in, bool insertLFs)
+{
+ return base64Encode(in.data(), in.length(), insertLFs);
+}
+
+} // namespace WebCore
+
#endif // Base64_h
diff --git a/Source/WebCore/platform/text/BidiResolver.h b/Source/WebCore/platform/text/BidiResolver.h
index 8abd698..72d163c 100644
--- a/Source/WebCore/platform/text/BidiResolver.h
+++ b/Source/WebCore/platform/text/BidiResolver.h
@@ -161,7 +161,7 @@ public :
MidpointState<Iterator>& midpointState() { return m_midpointState; }
void embed(WTF::Unicode::Direction);
- void commitExplicitEmbedding();
+ bool commitExplicitEmbedding();
void createBidiRunsForLine(const Iterator& end, bool visualOrder = false, bool hardLineBreak = false);
@@ -400,7 +400,7 @@ void BidiResolver<Iterator, Run>::raiseExplicitEmbeddingLevel(WTF::Unicode::Dire
}
template <class Iterator, class Run>
-void BidiResolver<Iterator, Run>::commitExplicitEmbedding()
+bool BidiResolver<Iterator, Run>::commitExplicitEmbedding()
{
using namespace WTF::Unicode;
@@ -440,6 +440,8 @@ void BidiResolver<Iterator, Run>::commitExplicitEmbedding()
setContext(toContext);
m_currentExplicitEmbeddingSequence.clear();
+
+ return fromLevel != toLevel;
}
template <class Iterator, class Run>
@@ -881,8 +883,8 @@ void BidiResolver<Iterator, Run>::createBidiRunsForLine(const Iterator& end, boo
increment();
if (!m_currentExplicitEmbeddingSequence.isEmpty()) {
- commitExplicitEmbedding();
- if (pastEnd) {
+ bool committed = commitExplicitEmbedding();
+ if (committed && pastEnd) {
current = end;
m_status = stateAtEnd.m_status;
sor = stateAtEnd.sor;
diff --git a/Source/WebCore/platform/text/CharacterNames.h b/Source/WebCore/platform/text/CharacterNames.h
deleted file mode 100644
index c4b496e..0000000
--- a/Source/WebCore/platform/text/CharacterNames.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef CharacterNames_h
-#define CharacterNames_h
-
-#include <wtf/unicode/Unicode.h>
-
-namespace WebCore {
-
-// Names here are taken from the Unicode standard.
-
-// Most of these are UChar constants, not UChar32, which makes them
-// more convenient for WebCore code that mostly uses UTF-16.
-
-const UChar32 aegeanWordSeparatorLine = 0x10100;
-const UChar32 aegeanWordSeparatorDot = 0x10101;
-const UChar blackCircle = 0x25CF;
-const UChar blackSquare = 0x25A0;
-const UChar blackUpPointingTriangle = 0x25B2;
-const UChar bullet = 0x2022;
-const UChar bullseye = 0x25CE;
-const UChar carriageReturn = 0x000D;
-const UChar ethiopicPrefaceColon = 0x1366;
-const UChar ethiopicWordspace = 0x1361;
-const UChar fisheye = 0x25C9;
-const UChar hebrewPunctuationGeresh = 0x05F3;
-const UChar hebrewPunctuationGershayim = 0x05F4;
-const UChar horizontalEllipsis = 0x2026;
-const UChar hyphen = 0x2010;
-const UChar hyphenMinus = 0x002D;
-const UChar ideographicComma = 0x3001;
-const UChar ideographicFullStop = 0x3002;
-const UChar ideographicSpace = 0x3000;
-const UChar leftDoubleQuotationMark = 0x201C;
-const UChar leftSingleQuotationMark = 0x2018;
-const UChar leftToRightEmbed = 0x202A;
-const UChar leftToRightMark = 0x200E;
-const UChar leftToRightOverride = 0x202D;
-const UChar minusSign = 0x2212;
-const UChar newlineCharacter = 0x000A;
-const UChar noBreakSpace = 0x00A0;
-const UChar objectReplacementCharacter = 0xFFFC;
-const UChar popDirectionalFormatting = 0x202C;
-const UChar replacementCharacter = 0xFFFD;
-const UChar rightDoubleQuotationMark = 0x201D;
-const UChar rightSingleQuotationMark = 0x2019;
-const UChar rightToLeftEmbed = 0x202B;
-const UChar rightToLeftMark = 0x200F;
-const UChar rightToLeftOverride = 0x202E;
-const UChar sesameDot = 0xFE45;
-const UChar softHyphen = 0x00AD;
-const UChar space = 0x0020;
-const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B;
-const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C;
-const UChar32 ugariticWordDivider = 0x1039F;
-const UChar whiteBullet = 0x25E6;
-const UChar whiteCircle = 0x25CB;
-const UChar whiteSesameDot = 0xFE46;
-const UChar whiteUpPointingTriangle = 0x25B3;
-const UChar yenSign = 0x00A5;
-const UChar zeroWidthJoiner = 0x200D;
-const UChar zeroWidthNonJoiner = 0x200C;
-const UChar zeroWidthSpace = 0x200B;
-
-}
-
-#endif // CharacterNames_h
diff --git a/Source/WebCore/platform/text/LocalizedNumber.h b/Source/WebCore/platform/text/LocalizedNumber.h
new file mode 100644
index 0000000..45873b8
--- /dev/null
+++ b/Source/WebCore/platform/text/LocalizedNumber.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2011 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LocalizedNumber_h
+#define LocalizedNumber_h
+
+#include <wtf/text/WTFString.h>
+
+namespace WebCore {
+
+// Parses a string representation of a floating point number localized
+// for the browser's current locale. If the input string is not valid
+// or an implementation doesn't support localized numbers, this
+// function returns NaN. This function doesn't need to support
+// scientific notation, NaN, +Infinity and -Infinity, and doesn't need
+// to support the standard representations of ECMAScript and HTML5.
+double parseLocalizedNumber(const String&);
+
+// Serializes the specified floating point number for the browser's
+// current locale. If an implementation doesn't support localized
+// numbers or the input value is NaN or Infinitiy, the function should
+// return an empty string.
+String formatLocalizedNumber(double);
+
+// Returns true if the input character can be used to represent a
+// number in the browser locale. For example, this should return true for 0-9 .
+// , + - for en-US locale.
+bool isLocalizedNumberCharacter(UChar32);
+
+} // namespace WebCore
+
+#endif // LocalizedNumber_h
diff --git a/Source/WebCore/platform/text/LocalizedNumberNone.cpp b/Source/WebCore/platform/text/LocalizedNumberNone.cpp
new file mode 100644
index 0000000..6f017e9
--- /dev/null
+++ b/Source/WebCore/platform/text/LocalizedNumberNone.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2011 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "LocalizedNumber.h"
+
+#include <limits>
+
+using namespace std;
+
+namespace WebCore {
+
+double parseLocalizedNumber(const String&)
+{
+ return numeric_limits<double>::quiet_NaN();
+}
+
+String formatLocalizedNumber(double)
+{
+ return String();
+}
+
+bool isLocalizedNumberCharacter(UChar32)
+{
+ return false;
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/RegularExpression.cpp b/Source/WebCore/platform/text/RegularExpression.cpp
index 9b063c9..e020b91 100644
--- a/Source/WebCore/platform/text/RegularExpression.cpp
+++ b/Source/WebCore/platform/text/RegularExpression.cpp
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2004, 2008, 2009 Apple Inc. All rights reserved.
* Copyright (C) 2008 Collabora Ltd.
+ * Copyright (C) 2011 Peter Varga (pvarga@webkit.org), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -27,52 +28,48 @@
#include "config.h"
#include "RegularExpression.h"
+#include <wtf/BumpPointerAllocator.h>
+#include <yarr/Yarr.h>
#include "Logging.h"
-#include <pcre/pcre.h>
namespace WebCore {
class RegularExpression::Private : public RefCounted<RegularExpression::Private> {
public:
- static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity);
- ~Private();
+ static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity)
+ {
+ return adoptRef(new Private(pattern, caseSensitivity));
+ }
- JSRegExp* regexp() const { return m_regexp; }
- int lastMatchLength;
+ int lastMatchLength;
-private:
- Private(const String& pattern, TextCaseSensitivity);
- static JSRegExp* compile(const String& pattern, TextCaseSensitivity);
+ unsigned m_numSubpatterns;
+ OwnPtr<JSC::Yarr::BytecodePattern> m_regExpByteCode;
- JSRegExp* m_regexp;
-};
+private:
+ Private(const String& pattern, TextCaseSensitivity caseSensitivity)
+ : lastMatchLength(-1)
+ , m_regExpByteCode(compile(pattern, caseSensitivity))
+ , m_constructionError(0)
+ {
+ }
-inline JSRegExp* RegularExpression::Private::compile(const String& pattern, TextCaseSensitivity caseSensitivity)
-{
- const char* errorMessage;
- JSRegExp* regexp = jsRegExpCompile(pattern.characters(), pattern.length(),
- caseSensitivity == TextCaseSensitive ? JSRegExpDoNotIgnoreCase : JSRegExpIgnoreCase, JSRegExpSingleLine,
- 0, &errorMessage);
- if (!regexp)
- LOG_ERROR("RegularExpression: pcre_compile failed with '%s'", errorMessage);
- return regexp;
-}
+ PassOwnPtr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity)
+ {
+ JSC::Yarr::YarrPattern pattern(JSC::UString(patternString.impl()), (caseSensitivity == TextCaseInsensitive), false, &m_constructionError);
+ if (m_constructionError) {
+ LOG_ERROR("RegularExpression: YARR compile failed with '%s'", m_constructionError);
+ return PassOwnPtr<JSC::Yarr::BytecodePattern>();
+ }
-inline RegularExpression::Private::Private(const String& pattern, TextCaseSensitivity caseSensitivity)
- : lastMatchLength(-1)
- , m_regexp(compile(pattern, caseSensitivity))
-{
-}
+ m_numSubpatterns = pattern.m_numSubpatterns;
-inline PassRefPtr<RegularExpression::Private> RegularExpression::Private::create(const String& pattern, TextCaseSensitivity caseSensitivity)
-{
- return adoptRef(new Private(pattern, caseSensitivity));
-}
+ return JSC::Yarr::byteCompile(pattern, &m_regexAllocator);
+ }
-RegularExpression::Private::~Private()
-{
- jsRegExpFree(m_regexp);
-}
+ BumpPointerAllocator m_regexAllocator;
+ const char* m_constructionError;
+};
RegularExpression::RegularExpression(const String& pattern, TextCaseSensitivity caseSensitivity)
: d(Private::create(pattern, caseSensitivity))
@@ -96,28 +93,36 @@ RegularExpression& RegularExpression::operator=(const RegularExpression& re)
int RegularExpression::match(const String& str, int startFrom, int* matchLength) const
{
- if (!d->regexp())
+ if (!d->m_regExpByteCode)
return -1;
if (str.isNull())
return -1;
- // First 2 offsets are start and end offsets; 3rd entry is used internally by pcre
- static const size_t maxOffsets = 3;
- int offsets[maxOffsets];
- int result = jsRegExpExecute(d->regexp(), str.characters(), str.length(), startFrom, offsets, maxOffsets);
+ int offsetVectorSize = (d->m_numSubpatterns + 1) * 2;
+ int* offsetVector;
+ Vector<int, 32> nonReturnedOvector;
+
+ nonReturnedOvector.resize(offsetVectorSize);
+ offsetVector = nonReturnedOvector.data();
+
+ ASSERT(offsetVector);
+ for (unsigned j = 0, i = 0; i < d->m_numSubpatterns + 1; j += 2, i++)
+ offsetVector[j] = -1;
+
+ int result = JSC::Yarr::interpret(d->m_regExpByteCode.get(), str.characters(), startFrom, str.length(), offsetVector);
+ ASSERT(result >= -1);
+
if (result < 0) {
- if (result != JSRegExpErrorNoMatch)
- LOG_ERROR("RegularExpression: pcre_exec() failed with result %d", result);
d->lastMatchLength = -1;
return -1;
}
- // 1 means 1 match; 0 means more than one match. First match is recorded in offsets.
- d->lastMatchLength = offsets[1] - offsets[0];
+ // 1 means 1 match; 0 means more than one match. First match is recorded in offsetVector.
+ d->lastMatchLength = offsetVector[1] - offsetVector[0];
if (matchLength)
*matchLength = d->lastMatchLength;
- return offsets[0];
+ return offsetVector[0];
}
int RegularExpression::searchRev(const String& str) const
diff --git a/Source/WebCore/platform/text/SegmentedString.cpp b/Source/WebCore/platform/text/SegmentedString.cpp
index 5e9755b..7c859dc 100644
--- a/Source/WebCore/platform/text/SegmentedString.cpp
+++ b/Source/WebCore/platform/text/SegmentedString.cpp
@@ -186,17 +186,6 @@ void SegmentedString::advanceSubstring()
}
}
-int SegmentedString::numberOfCharactersConsumedSlow() const
-{
- int result = m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
- if (m_pushedChar1) {
- --result;
- if (m_pushedChar2)
- --result;
- }
- return result;
-}
-
String SegmentedString::toString() const
{
String result;
@@ -262,14 +251,14 @@ WTF::ZeroBasedNumber SegmentedString::currentLine() const
WTF::ZeroBasedNumber SegmentedString::currentColumn() const
{
- int zeroBasedColumn = numberOfCharactersConsumedSlow() - m_numberOfCharactersConsumedPriorToCurrentLine;
+ int zeroBasedColumn = numberOfCharactersConsumed() - m_numberOfCharactersConsumedPriorToCurrentLine;
return WTF::ZeroBasedNumber::fromZeroBasedInt(zeroBasedColumn);
}
void SegmentedString::setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength)
{
m_currentLine = line.zeroBasedInt();
- m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumedSlow() + prologLength - columnAftreProlog.zeroBasedInt();
+ m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + prologLength - columnAftreProlog.zeroBasedInt();
}
}
diff --git a/Source/WebCore/platform/text/SegmentedString.h b/Source/WebCore/platform/text/SegmentedString.h
index 30c899d..3784b50 100644
--- a/Source/WebCore/platform/text/SegmentedString.h
+++ b/Source/WebCore/platform/text/SegmentedString.h
@@ -206,13 +206,15 @@ public:
int numberOfCharactersConsumed() const
{
- // We don't currently handle the case when there are pushed character.
- ASSERT(!m_pushedChar1);
- return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
+ int numberOfPushedCharacters = 0;
+ if (m_pushedChar1) {
+ ++numberOfPushedCharacters;
+ if (m_pushedChar2)
+ ++numberOfPushedCharacters;
+ }
+ return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed() - numberOfPushedCharacters;
}
- int numberOfCharactersConsumedSlow() const;
-
String toString() const;
const UChar& operator*() const { return *current(); }
diff --git a/Source/WebCore/platform/text/TextCodecICU.cpp b/Source/WebCore/platform/text/TextCodecICU.cpp
index 6a579f9..92a158a 100644
--- a/Source/WebCore/platform/text/TextCodecICU.cpp
+++ b/Source/WebCore/platform/text/TextCodecICU.cpp
@@ -27,7 +27,6 @@
#include "config.h"
#include "TextCodecICU.h"
-#include "CharacterNames.h"
#include "PlatformString.h"
#include "ThreadGlobalData.h"
#include <unicode/ucnv.h>
@@ -37,6 +36,7 @@
#include <wtf/PassOwnPtr.h>
#include <wtf/StringExtras.h>
#include <wtf/Threading.h>
+#include <wtf/unicode/CharacterNames.h>
using std::min;
diff --git a/Source/WebCore/platform/text/TextCodecUTF16.cpp b/Source/WebCore/platform/text/TextCodecUTF16.cpp
index e88e83b..4ceed23 100644
--- a/Source/WebCore/platform/text/TextCodecUTF16.cpp
+++ b/Source/WebCore/platform/text/TextCodecUTF16.cpp
@@ -71,6 +71,8 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool, bool
if (!length)
return String();
+ // FIXME: This should generate an error if there is an unpaired surrogate.
+
const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
size_t numBytes = length + m_haveBufferedByte;
size_t numChars = numBytes / 2;
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
new file mode 100644
index 0000000..8944d68
--- /dev/null
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextCodecUTF8.h"
+
+#include <wtf/text/CString.h>
+#include <wtf/text/StringBuffer.h>
+#include <wtf/unicode/UTF8.h>
+
+using namespace WTF::Unicode;
+using namespace std;
+
+namespace WebCore {
+
+// Assuming that a pointer is the size of a "machine word", then
+// uintptr_t is an integer type that is also a machine word.
+typedef uintptr_t MachineWord;
+
+// This constant has type uintptr_t since we will use it to align
+// pointers. Not because MachineWord is uintptr_t.
+const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1;
+
+template<size_t size> struct NonASCIIMask;
+template<> struct NonASCIIMask<4> {
+ static unsigned value() { return 0x80808080U; }
+};
+template<> struct NonASCIIMask<8> {
+ static unsigned long long value() { return 0x8080808080808080ULL; }
+};
+
+template<size_t size> struct UCharByteFiller;
+template<> struct UCharByteFiller<4> {
+ static void copy(UChar* destination, const uint8_t* source)
+ {
+ destination[0] = source[0];
+ destination[1] = source[1];
+ destination[2] = source[2];
+ destination[3] = source[3];
+ }
+};
+template<> struct UCharByteFiller<8> {
+ static void copy(UChar* destination, const uint8_t* source)
+ {
+ destination[0] = source[0];
+ destination[1] = source[1];
+ destination[2] = source[2];
+ destination[3] = source[3];
+ destination[4] = source[4];
+ destination[5] = source[5];
+ destination[6] = source[6];
+ destination[7] = source[7];
+ }
+};
+
+static inline bool isAlignedToMachineWord(const void* pointer)
+{
+ return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask);
+}
+
+template<typename T> static inline T* alignToMachineWord(T* pointer)
+{
+ return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask);
+}
+
+PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
+{
+ return adoptPtr(new TextCodecUTF8);
+}
+
+void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
+{
+ registrar("UTF-8", "UTF-8");
+}
+
+void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
+{
+ registrar("UTF-8", create, 0);
+}
+
+static inline int nonASCIISequenceLength(unsigned char firstByte)
+{
+ ASSERT(!isASCII(firstByte));
+ switch (firstByte >> 4) {
+ case 0xF:
+ return 4;
+ case 0xE:
+ return 3;
+ }
+ return 2;
+}
+
+static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length)
+{
+ ASSERT(!isASCII(sequence[0]));
+ if (length == 2) {
+ ASSERT(sequence[0] <= 0xDF);
+ if (sequence[0] < 0xC2)
+ return -1;
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
+ }
+ if (length == 3) {
+ ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+ switch (sequence[0]) {
+ case 0xE0:
+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+ return -1;
+ break;
+ case 0xED:
+ if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+ return -1;
+ break;
+ default:
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ }
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return -1;
+ return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
+ }
+ ASSERT(length == 4);
+ ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+ switch (sequence[0]) {
+ case 0xF0:
+ if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+ return -1;
+ break;
+ case 0xF4:
+ if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+ return -1;
+ break;
+ default:
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ }
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return -1;
+ if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ return -1;
+ return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
+}
+
+String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
+{
+ StringBuffer buffer(length);
+
+ const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
+ const uint8_t* end = source + length;
+ const uint8_t* alignedEnd = alignToMachineWord(end);
+ UChar* destination = buffer.characters();
+
+ int count;
+ int character;
+
+ if (m_partialSequenceSize) {
+ count = nonASCIISequenceLength(m_partialSequence[0]);
+ ASSERT(count > m_partialSequenceSize);
+ if (count - m_partialSequenceSize > end - source) {
+ memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
+ m_partialSequenceSize += end - source;
+ source = end;
+ } else {
+ uint8_t completeSequence[U8_MAX_LENGTH];
+ memcpy(completeSequence, m_partialSequence, m_partialSequenceSize);
+ memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
+ source += count - m_partialSequenceSize;
+ m_partialSequenceSize = 0;
+ character = decodeNonASCIISequence(completeSequence, count);
+ goto decodedNonASCII;
+ }
+ }
+
+ while (source < end) {
+ if (isASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (isAlignedToMachineWord(source)) {
+ while (source < alignedEnd) {
+ MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) {
+ if (isASCII(*source))
+ break;
+ goto nonASCII;
+ }
+ UCharByteFiller<sizeof(MachineWord)>::copy(destination, source);
+ source += sizeof(MachineWord);
+ destination += sizeof(MachineWord);
+ }
+ if (source == end)
+ break;
+ }
+ *destination++ = *source++;
+ } else {
+nonASCII:
+ count = nonASCIISequenceLength(*source);
+ ASSERT(count >= 2);
+ ASSERT(count <= 4);
+ if (count > end - source) {
+ ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+ ASSERT(!m_partialSequenceSize);
+ m_partialSequenceSize = end - source;
+ memcpy(m_partialSequence, source, m_partialSequenceSize);
+ break;
+ }
+ character = decodeNonASCIISequence(source, count);
+ source += count;
+decodedNonASCII:
+ if (character < 0) {
+ if (stopOnError) {
+ sawError = true;
+ break;
+ }
+ } else {
+ ASSERT(!U_IS_SURROGATE(character));
+ if (U_IS_BMP(character))
+ *destination++ = character;
+ else {
+ *destination++ = U16_LEAD(character);
+ *destination++ = U16_TRAIL(character);
+ }
+ }
+ }
+ }
+
+ buffer.shrink(destination - buffer.characters());
+
+ if (flush && m_partialSequenceSize)
+ sawError = true;
+
+ return String::adopt(buffer);
+}
+
+CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
+{
+ // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
+ // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
+ // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
+ if (length > numeric_limits<size_t>::max() / 3)
+ CRASH();
+ Vector<uint8_t> bytes(length * 3);
+
+ size_t i = 0;
+ size_t bytesWritten = 0;
+ while (i < length) {
+ UChar32 character;
+ U16_NEXT(characters, i, length, character);
+ U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
+ }
+
+ return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+}
+
+} // namespace WebCore
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.h b/Source/WebCore/platform/text/TextCodecUTF8.h
new file mode 100644
index 0000000..f3b6b7a
--- /dev/null
+++ b/Source/WebCore/platform/text/TextCodecUTF8.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TextCodecUTF8_h
+#define TextCodecUTF8_h
+
+#include "TextCodec.h"
+
+namespace WebCore {
+
+class TextCodecUTF8 : public TextCodec {
+public:
+ static void registerEncodingNames(EncodingNameRegistrar);
+ static void registerCodecs(TextCodecRegistrar);
+
+ virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError);
+ virtual CString encode(const UChar*, size_t length, UnencodableHandling);
+
+private:
+ static PassOwnPtr<TextCodec> create(const TextEncoding&, const void*);
+ TextCodecUTF8() : m_partialSequenceSize(0) { }
+
+ int m_partialSequenceSize;
+ char m_partialSequence[U8_MAX_LENGTH - 1];
+
+};
+
+} // namespace WebCore
+
+#endif // TextCodecUTF8_h
diff --git a/Source/WebCore/platform/text/TextEncodingRegistry.cpp b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
index c0c0255..1dc09ee 100644
--- a/Source/WebCore/platform/text/TextEncodingRegistry.cpp
+++ b/Source/WebCore/platform/text/TextEncodingRegistry.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006, 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
* Copyright (C) 2007-2009 Torch Mobile, Inc.
*
* Redistribution and use in source and binary forms, with or without
@@ -27,14 +27,12 @@
#include "config.h"
#include "TextEncodingRegistry.h"
-#include "PlatformString.h"
#include "TextCodecLatin1.h"
#include "TextCodecUserDefined.h"
#include "TextCodecUTF16.h"
+#include "TextCodecUTF8.h"
#include "TextEncoding.h"
#include <wtf/ASCIICType.h>
-#include <wtf/Assertions.h>
-#include <wtf/HashFunctions.h>
#include <wtf/HashMap.h>
#include <wtf/HashSet.h>
#include <wtf/StdLibExtras.h>
@@ -68,7 +66,6 @@ const size_t maxEncodingNameLength = 63;
// Hash for all-ASCII strings that does case folding.
struct TextEncodingNameHash {
-
static bool equal(const char* s1, const char* s2)
{
char c1;
@@ -129,9 +126,7 @@ static bool didExtendTextCodecMaps;
static HashSet<const char*>* japaneseEncodings;
static HashSet<const char*>* nonBackslashEncodings;
-static const char* const textEncodingNameBlacklist[] = {
- "UTF-7"
-};
+static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
#if ERROR_DISABLED
@@ -268,7 +263,7 @@ static void buildQuirksSets()
ASSERT(!japaneseEncodings);
ASSERT(!nonBackslashEncodings);
- japaneseEncodings = new HashSet<const char*>();
+ japaneseEncodings = new HashSet<const char*>;
addEncodingName(japaneseEncodings, "EUC-JP");
addEncodingName(japaneseEncodings, "ISO-2022-JP");
addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
@@ -284,7 +279,7 @@ static void buildQuirksSets()
addEncodingName(japaneseEncodings, "cp932");
addEncodingName(japaneseEncodings, "x-mac-japanese");
- nonBackslashEncodings = new HashSet<const char*>();
+ nonBackslashEncodings = new HashSet<const char*>;
// The text encodings below treat backslash as a currency symbol for IE compatibility.
// See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
addEncodingName(nonBackslashEncodings, "x-mac-japanese");
diff --git a/Source/WebCore/platform/text/mac/TextCodecMac.cpp b/Source/WebCore/platform/text/mac/TextCodecMac.cpp
index b743f3d..64d0485 100644
--- a/Source/WebCore/platform/text/mac/TextCodecMac.cpp
+++ b/Source/WebCore/platform/text/mac/TextCodecMac.cpp
@@ -27,15 +27,15 @@
#include "config.h"
#include "TextCodecMac.h"
-#include "CharacterNames.h"
#include "CharsetData.h"
#include "PlatformString.h"
#include "ThreadGlobalData.h"
#include <wtf/Assertions.h>
-#include <wtf/text/CString.h>
#include <wtf/PassOwnPtr.h>
#include <wtf/RetainPtr.h>
#include <wtf/Threading.h>
+#include <wtf/text/CString.h>
+#include <wtf/unicode/CharacterNames.h>
using namespace std;
diff --git a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
index 68601f9..4e07f50 100644
--- a/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
+++ b/Source/WebCore/platform/text/transcoder/FontTranscoder.cpp
@@ -31,9 +31,9 @@
#include "config.h"
#include "FontTranscoder.h"
-#include "CharacterNames.h"
#include "FontDescription.h"
#include "TextEncoding.h"
+#include <wtf/unicode/CharacterNames.h>
namespace WebCore {