Merge WebKit at r74534: Initial merge by git.

Change-Id: I6ccd1154fa1b19c2ec2a66878eb675738735f1eb
author: Ben Murdoch <benm@google.com> 2011-05-05 14:36:32 +0100
committer: Ben Murdoch <benm@google.com> 2011-05-10 15:38:30 +0100
commit: f05b935882198ccf7d81675736e3aeb089c5113a (patch)
tree: 4ea0ca838d9ef1b15cf17ddb3928efb427c7e5a1 /JavaScriptCore/wtf/unicode
parent: 60fbdcc62bced8db2cb1fd233cc4d1e4ea17db1b (diff)
download: external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.zip
external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.gz
external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.bz2
3 files changed, 171 insertions, 40 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp
index 40c5609..dc24ed5 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.cpp
+++ b/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Apple Inc.  All rights reserved.
+ * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -25,10 +26,16 @@
 
 #include "config.h"
 #include "UTF8.h"
+#include <wtf/StringHasher.h>
+
+#include "ASCIICType.h"
 
 namespace WTF {
 namespace Unicode {
 
+// FIXME: Use definition from CharacterNames.h.
+static const UChar replacementCharacter = 0xFFFD;
+
 inline int inlineUTF8SequenceLengthNonASCII(char b0)
 {
     if ((b0 & 0xC0) != 0xC0)
@@ -44,12 +51,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0)
 
 inline int inlineUTF8SequenceLength(char b0)
 {
-    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int UTF8SequenceLength(char b0)
 {
-    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int decodeUTF8Sequence(const char* sequence)
@@ -172,7 +179,7 @@ ConversionResult convertUTF16ToUTF8(
             bytesToWrite = 4;
         } else {
             bytesToWrite = 3;
-            ch = 0xFFFD;
+            ch = replacementCharacter;
         }
 
         target += bytesToWrite;
@@ -231,6 +238,23 @@ static bool isLegalUTF8(const unsigned char* source, int length)
 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 
+static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
+{
+    UChar32 character = 0;
+
+    // Sums `length` raw bytes, shifting left by 6 between bytes, and advances `sequence` past them. The cases all fall through.
+    switch (length) {
+        case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 1: character += static_cast<unsigned char>(*sequence++);
+    }
+
+    return character - offsetsFromUTF8[length - 1]; // Removes the per-length constant. NOTE(review): length is not range-checked here (0 or > 6 would index outside the table); callers appear to run isLegalUTF8 first - confirm.
+}
+
 ConversionResult convertUTF8ToUTF16(
     const char** sourceStart, const char* sourceEnd, 
     UChar** targetStart, UChar* targetEnd, bool strict)
@@ -239,60 +263,52 @@ ConversionResult convertUTF8ToUTF16(
     const char* source = *sourceStart;
     UChar* target = *targetStart;
     while (source < sourceEnd) {
-        UChar32 ch = 0;
-        int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1;
-        if (source + extraBytesToRead >= sourceEnd) {
+        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
+        if (sourceEnd - source < utf8SequenceLength)  {
             result = sourceExhausted;
             break;
         }
         // Do this check whether lenient or strict
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
             result = sourceIllegal;
             break;
         }
-        // The cases all fall through.
-        switch (extraBytesToRead) {
-            case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
-            case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
-            case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 0: ch += static_cast<unsigned char>(*source++);
-        }
-        ch -= offsetsFromUTF8[extraBytesToRead];
+
+        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
 
         if (target >= targetEnd) {
-            source -= (extraBytesToRead + 1); // Back up source pointer!
-            result = targetExhausted; break;
+            source -= utf8SequenceLength; // Back up source pointer!
+            result = targetExhausted;
+            break;
         }
-        if (ch <= 0xFFFF) {
+
+        if (U_IS_BMP(character)) {
             // UTF-16 surrogate values are illegal in UTF-32
-            if (ch >= 0xD800 && ch <= 0xDFFF) {
+            if (U_IS_SURROGATE(character)) {
                 if (strict) {
-                    source -= (extraBytesToRead + 1); // return to the illegal value itself
+                    source -= utf8SequenceLength; // return to the illegal value itself
                     result = sourceIllegal;
                     break;
                 } else
-                    *target++ = 0xFFFD;
-            } else
-                *target++ = (UChar)ch; // normal case
-        } else if (ch > 0x10FFFF) {
-            if (strict) {
-                result = sourceIllegal;
-                source -= (extraBytesToRead + 1); // return to the start
-                break; // Bail out; shouldn't continue
+                    *target++ = replacementCharacter;
             } else
-                *target++ = 0xFFFD;
-        } else {
+                *target++ = character; // normal case
+        } else if (U_IS_SUPPLEMENTARY(character)) {
             // target is a character in range 0xFFFF - 0x10FFFF
             if (target + 1 >= targetEnd) {
-                source -= (extraBytesToRead + 1); // Back up source pointer!
+                source -= utf8SequenceLength; // Back up source pointer!
                 result = targetExhausted;
                 break;
             }
-            ch -= 0x0010000UL;
-            *target++ = (UChar)((ch >> 10) + 0xD800);
-            *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
+            *target++ = U16_LEAD(character);
+            *target++ = U16_TRAIL(character);
+        } else {
+            if (strict) {
+                source -= utf8SequenceLength; // return to the start
+                result = sourceIllegal;
+                break; // Bail out; shouldn't continue
+            } else
+                *target++ = replacementCharacter;
         }
     }
     *sourceStart = source;
@@ -300,5 +316,86 @@ ConversionResult convertUTF8ToUTF16(
     return result;
 }
 
+unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
+{
+    // Hashes the UTF-16 form of the UTF-8 range [data, dataEnd) and stores its length in UTF-16 code units in utf16Length. Returns 0 if data is null or the input is truncated or illegal.
+    utf16Length = 0; // Written before any early return so callers never read an unset length.
+    if (!data)
+        return 0;
+    WTF::StringHasher stringHasher;
+
+    while (data < dataEnd) {
+        if (isASCII(*data)) { // Fast path: one ASCII byte is hashed as one UTF-16 unit.
+            stringHasher.addCharacter(*data++);
+            utf16Length++;
+            continue;
+        }
+
+        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
+
+        if (dataEnd - data < utf8SequenceLength)
+            return 0; // Truncated sequence: fail with 0 like every other error path (was `return false`).
+
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
+            return 0;
+
+        UChar32 character = readUTF8Sequence(data, utf8SequenceLength); // Advances data past the sequence.
+        ASSERT(!isASCII(character));
+
+        if (U_IS_BMP(character)) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (U_IS_SURROGATE(character))
+                return 0;
+            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
+            utf16Length++;
+        } else if (U_IS_SUPPLEMENTARY(character)) { // Hashed as a lead/trail surrogate pair, so it counts as two units.
+            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
+                                       static_cast<UChar>(U16_TRAIL(character)));
+            utf16Length += 2;
+        } else
+            return 0; // Neither BMP nor supplementary: not a valid code point.
+    }
+
+    return stringHasher.hash();
 }
+
+bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
+{
+    while (b < bEnd) { // Decode the UTF-8 range [b, bEnd) one code point at a time and compare it with the UTF-16 units in [a, aEnd).
+        if (isASCII(*b)) {
+            if (a >= aEnd || *a++ != *b++) // Never read past aEnd when the UTF-8 side is longer.
+                return false;
+            continue;
+        }
+
+        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
+
+        if (bEnd - b < utf8SequenceLength)
+            return false; // Truncated UTF-8 sequence.
+
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
+            return false; // Was `return 0`; spelled as a bool like the other failure paths.
+
+        UChar32 character = readUTF8Sequence(b, utf8SequenceLength); // Advances b past the sequence.
+        ASSERT(!isASCII(character));
+
+        if (U_IS_BMP(character)) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (U_IS_SURROGATE(character))
+                return false;
+            if (a >= aEnd || *a++ != character)
+                return false;
+        } else if (U_IS_SUPPLEMENTARY(character)) {
+            if (aEnd - a < 2 || *a++ != U16_LEAD(character)) // A supplementary code point needs two UTF-16 units on the a side.
+                return false;
+            if (*a++ != U16_TRAIL(character))
+                return false;
+        } else
+            return false; // Neither BMP nor supplementary: not a valid code point.
+    }
+
+    return a == aEnd; // Equal only if the UTF-16 side was consumed exactly.
 }
+
+} // namespace Unicode
+} // namespace WTF
diff --git a/JavaScriptCore/wtf/unicode/UTF8.h b/JavaScriptCore/wtf/unicode/UTF8.h
index a5ed93e..1f4baca 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.h
+++ b/JavaScriptCore/wtf/unicode/UTF8.h
@@ -29,7 +29,7 @@
 #include "Unicode.h"
 
 namespace WTF {
-  namespace Unicode {
+namespace Unicode {
 
     // Given a first byte, gives the length of the UTF-8 sequence it begins.
     // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
@@ -69,7 +69,12 @@ namespace WTF {
     ConversionResult convertUTF16ToUTF8(
                     const UChar** sourceStart, const UChar* sourceEnd, 
                     char** targetStart, char* targetEnd, bool strict = true);
-  }
-}
+
+    unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length);
+
+    bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd);
+
+} // namespace Unicode
+} // namespace WTF
 
 #endif // WTF_UTF8_h
diff --git a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
index f865ef1..8959912 100644
--- a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
+++ b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
@@ -1,4 +1,5 @@
 /*
+ *  Copyright (C) 1999-2004, International Business Machines Corporation and others.  All Rights Reserved.
  *  Copyright (C) 2006 George Staikos <staikos@kde.org>
  *  Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *  Copyright (C) 2007 Apple Computer, Inc. All rights reserved.
@@ -38,11 +39,28 @@
 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
 #define U16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2)
 
+#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000)<=0xfffff) /* true for 0x10000..0x10ffff; unsigned cast (as in ICU) makes values below 0x10000 wrap high and fail even when UChar32 is signed */
 #define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
 
+#define U16_GET(s, start, i, length, c) { /* reads the code point at index i into c without modifying i */ \
+    (c)=(s)[i]; /* start from the single code unit at i */ \
+    if(U16_IS_SURROGATE(c)) { \
+        uint16_t __c2; \
+        if(U16_IS_SURROGATE_LEAD(c)) { \
+            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { /* lead at i: combine with the next unit if it is in range and passes U16_IS_TRAIL */ \
+                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+            } \
+        } else { \
+            if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { /* otherwise combine with the previous unit if it is in range and passes U16_IS_LEAD */ \
+                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+            } \
+        } \
+    } /* an unpaired surrogate is left in c as-is */ \
+}
+
 #define U16_PREV(s, start, i, c) { \
     (c)=(s)[--(i)]; \
     if(U16_IS_TRAIL(c)) { \
@@ -54,6 +72,12 @@
     } \
 }
 
+#define U16_BACK_1(s, start, i) { /* moves i back by one code unit, or by two when it steps over a trail/lead pair */ \
+    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { /* i is always decremented once; the second step needs i > start */ \
+        --(i); \
+    } \
+}
+
 #define U16_NEXT(s, i, length, c) { \
     (c)=(s)[(i)++]; \
     if(U16_IS_LEAD(c)) { \
@@ -65,7 +89,12 @@
     } \
 }
 
+#define U16_FWD_1(s, i, length) { /* advances i by one code unit, or by two when it steps over a lead/trail pair */ \
+    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { /* i is always incremented once; the second step needs i < length */ \
+        ++(i); \
+    } \
+}
+
 #define U_MASK(x) ((uint32_t)1<<(x))
 
 #endif
-
author	Ben Murdoch <benm@google.com>	2011-05-05 14:36:32 +0100
committer	Ben Murdoch <benm@google.com>	2011-05-10 15:38:30 +0100
commit	f05b935882198ccf7d81675736e3aeb089c5113a (patch)
tree	4ea0ca838d9ef1b15cf17ddb3928efb427c7e5a1 /JavaScriptCore/wtf/unicode
parent	60fbdcc62bced8db2cb1fd233cc4d1e4ea17db1b (diff)
download	external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.zip external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.gz external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.bz2