4 files changed, 188 insertions, 28 deletions
diff --git a/Source/JavaScriptCore/wtf/unicode/CharacterNames.h b/Source/JavaScriptCore/wtf/unicode/CharacterNames.h
new file mode 100644
index 0000000..3d093a6
--- /dev/null
+++ b/Source/JavaScriptCore/wtf/unicode/CharacterNames.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef CharacterNames_h
+#define CharacterNames_h
+
+#include "Unicode.h"
+
+namespace WTF {
+namespace Unicode {
+
+// Names here are taken from the Unicode standard.
+
+// Most of these are UChar constants, not UChar32, which makes them
+// more convenient for WebCore code that mostly uses UTF-16.
+
+const UChar32 aegeanWordSeparatorLine = 0x10100;
+const UChar32 aegeanWordSeparatorDot = 0x10101;
+const UChar blackCircle = 0x25CF;
+const UChar blackSquare = 0x25A0;
+const UChar blackUpPointingTriangle = 0x25B2;
+const UChar bullet = 0x2022;
+const UChar bullseye = 0x25CE;
+const UChar carriageReturn = 0x000D;
+const UChar ethiopicPrefaceColon = 0x1366;
+const UChar ethiopicWordspace = 0x1361;
+const UChar fisheye = 0x25C9;
+const UChar hebrewPunctuationGeresh = 0x05F3;
+const UChar hebrewPunctuationGershayim = 0x05F4;
+const UChar horizontalEllipsis = 0x2026;
+const UChar hyphen = 0x2010;
+const UChar hyphenMinus = 0x002D;
+const UChar ideographicComma = 0x3001;
+const UChar ideographicFullStop = 0x3002;
+const UChar ideographicSpace = 0x3000;
+const UChar leftDoubleQuotationMark = 0x201C;
+const UChar leftSingleQuotationMark = 0x2018;
+const UChar leftToRightEmbed = 0x202A;
+const UChar leftToRightMark = 0x200E;
+const UChar leftToRightOverride = 0x202D;
+const UChar minusSign = 0x2212;
+const UChar newlineCharacter = 0x000A;
+const UChar noBreakSpace = 0x00A0;
+const UChar objectReplacementCharacter = 0xFFFC;
+const UChar popDirectionalFormatting = 0x202C;
+const UChar replacementCharacter = 0xFFFD;
+const UChar rightDoubleQuotationMark = 0x201D;
+const UChar rightSingleQuotationMark = 0x2019;
+const UChar rightToLeftEmbed = 0x202B;
+const UChar rightToLeftMark = 0x200F;
+const UChar rightToLeftOverride = 0x202E;
+const UChar sesameDot = 0xFE45;
+const UChar softHyphen = 0x00AD;
+const UChar space = 0x0020;
+const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B;
+const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C;
+const UChar32 ugariticWordDivider = 0x1039F;
+const UChar whiteBullet = 0x25E6;
+const UChar whiteCircle = 0x25CB;
+const UChar whiteSesameDot = 0xFE46;
+const UChar whiteUpPointingTriangle = 0x25B3;
+const UChar yenSign = 0x00A5;
+const UChar zeroWidthJoiner = 0x200D;
+const UChar zeroWidthNonJoiner = 0x200C;
+const UChar zeroWidthSpace = 0x200B;
+
+} // namespace Unicode
+} // namespace WTF
+
+using WTF::Unicode::aegeanWordSeparatorLine;
+using WTF::Unicode::aegeanWordSeparatorDot;
+using WTF::Unicode::blackCircle;
+using WTF::Unicode::blackSquare;
+using WTF::Unicode::blackUpPointingTriangle;
+using WTF::Unicode::bullet;
+using WTF::Unicode::bullseye;
+using WTF::Unicode::carriageReturn;
+using WTF::Unicode::ethiopicPrefaceColon;
+using WTF::Unicode::ethiopicWordspace;
+using WTF::Unicode::fisheye;
+using WTF::Unicode::hebrewPunctuationGeresh;
+using WTF::Unicode::hebrewPunctuationGershayim;
+using WTF::Unicode::horizontalEllipsis;
+using WTF::Unicode::hyphen;
+using WTF::Unicode::hyphenMinus;
+using WTF::Unicode::ideographicComma;
+using WTF::Unicode::ideographicFullStop;
+using WTF::Unicode::ideographicSpace;
+using WTF::Unicode::leftDoubleQuotationMark;
+using WTF::Unicode::leftSingleQuotationMark;
+using WTF::Unicode::leftToRightEmbed;
+using WTF::Unicode::leftToRightMark;
+using WTF::Unicode::leftToRightOverride;
+using WTF::Unicode::minusSign;
+using WTF::Unicode::newlineCharacter;
+using WTF::Unicode::noBreakSpace;
+using WTF::Unicode::objectReplacementCharacter;
+using WTF::Unicode::popDirectionalFormatting;
+using WTF::Unicode::replacementCharacter;
+using WTF::Unicode::rightDoubleQuotationMark;
+using WTF::Unicode::rightSingleQuotationMark;
+using WTF::Unicode::rightToLeftEmbed;
+using WTF::Unicode::rightToLeftMark;
+using WTF::Unicode::rightToLeftOverride;
+using WTF::Unicode::sesameDot;
+using WTF::Unicode::softHyphen;
+using WTF::Unicode::space;
+using WTF::Unicode::tibetanMarkIntersyllabicTsheg;
+using WTF::Unicode::tibetanMarkDelimiterTshegBstar;
+using WTF::Unicode::ugariticWordDivider;
+using WTF::Unicode::whiteBullet;
+using WTF::Unicode::whiteCircle;
+using WTF::Unicode::whiteSesameDot;
+using WTF::Unicode::whiteUpPointingTriangle;
+using WTF::Unicode::yenSign;
+using WTF::Unicode::zeroWidthJoiner;
+using WTF::Unicode::zeroWidthNonJoiner;
+using WTF::Unicode::zeroWidthSpace;
+
+#endif // CharacterNames_h
diff --git a/Source/JavaScriptCore/wtf/unicode/UTF8.cpp b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp
index dc24ed5..4c3738b 100644
--- a/Source/JavaScriptCore/wtf/unicode/UTF8.cpp
+++ b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -26,16 +26,14 @@
 
 #include "config.h"
 #include "UTF8.h"
-#include <wtf/StringHasher.h>
 
 #include "ASCIICType.h"
+#include <wtf/StringHasher.h>
+#include <wtf/unicode/CharacterNames.h>
 
 namespace WTF {
 namespace Unicode {
 
-// FIXME: Use definition from CharacterNames.h.
-static const UChar replacementCharacter = 0xFFFD;
-
 inline int inlineUTF8SequenceLengthNonASCII(char b0)
 {
     if ((b0 & 0xC0) != 0xC0)
@@ -316,25 +314,33 @@ ConversionResult convertUTF8ToUTF16(
     return result;
 }
 
-unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
+static inline unsigned calculateStringHashAndLengthFromUTF8Internal(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
 {
     if (!data)
         return 0;
 
     WTF::StringHasher stringHasher;
+    dataLength = 0;
     utf16Length = 0;
 
-    while (data < dataEnd) {
+    while (data < dataEnd || (!dataEnd && *data)) {
         if (isASCII(*data)) {
             stringHasher.addCharacter(*data++);
+            dataLength++;
             utf16Length++;
             continue;
         }
 
         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
+        dataLength += utf8SequenceLength;
 
-        if (dataEnd - data < utf8SequenceLength)
-            return false;
+        if (!dataEnd) {
+            for (int i = 1; i < utf8SequenceLength; ++i) {
+                if (!data[i])
+                    return 0;
+            }
+        } else if (dataEnd - data < utf8SequenceLength)
+            return 0;
 
         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
             return 0;
@@ -359,6 +365,17 @@ unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsi
     return stringHasher.hash();
 }
 
+unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
+{
+    unsigned dataLength;
+    return calculateStringHashAndLengthFromUTF8Internal(data, dataEnd, dataLength, utf16Length);
+}
+
+unsigned calculateStringHashAndLengthFromUTF8(const char* data, unsigned& dataLength, unsigned& utf16Length)
+{
+    return calculateStringHashAndLengthFromUTF8Internal(data, 0, dataLength, utf16Length);
+}
+
 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
 {
     while (b < bEnd) {
diff --git a/Source/JavaScriptCore/wtf/unicode/UTF8.h b/Source/JavaScriptCore/wtf/unicode/UTF8.h
index 1f4baca..bbfaa84 100644
--- a/Source/JavaScriptCore/wtf/unicode/UTF8.h
+++ b/Source/JavaScriptCore/wtf/unicode/UTF8.h
@@ -71,6 +71,7 @@ namespace Unicode {
                     char** targetStart, char* targetEnd, bool strict = true);
 
     unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length);
+    unsigned calculateStringHashAndLengthFromUTF8(const char* data, unsigned& dataLength, unsigned& utf16Length);
 
     bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd);
 
diff --git a/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
index 8959912..09a7036 100644
--- a/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
+++ b/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
@@ -1,25 +1,5 @@
 /*
  *  Copyright (C) 1999-2004, International Business Machines Corporation and others.  All Rights Reserved.
- *  Copyright (C) 2006 George Staikos <staikos@kde.org>
- *  Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
- *  Copyright (C) 2007 Apple Computer, Inc. All rights reserved.
- *  Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
- *  Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Library General Public
- *  License as published by the Free Software Foundation; either
- *  version 2 of the License, or (at your option) any later version.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  Library General Public License for more details.
- *
- *  You should have received a copy of the GNU Library General Public License
- *  along with this library; see the file COPYING.LIB.  If not, write to
- *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- *  Boston, MA 02110-1301, USA.
  *
  */
 
@@ -97,4 +77,24 @@
 
 #define U_MASK(x) ((uint32_t)1<<(x))
 
+#define U8_MAX_LENGTH 4
+
+#define U8_APPEND_UNSAFE(s, i, c) { \
+    if((uint32_t)(c)<=0x7f) { \
+        (s)[(i)++]=(uint8_t)(c); \
+    } else { \
+        if((uint32_t)(c)<=0x7ff) { \
+            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
+        } else { \
+            if((uint32_t)(c)<=0xffff) { \
+                (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
+            } else { \
+                (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
+                (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
+            } \
+            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
+        } \
+        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+    } \
+}
 #endif