Merge WebKit at r73109: Initial merge by git.

Change-Id: I61f1a66d9642e3d8405d3ac6ccab2a53421c75d8
author: Shimeng (Simon) Wang <swang@google.com> 2010-12-07 17:22:45 -0800
committer: Shimeng (Simon) Wang <swang@google.com> 2010-12-22 14:15:40 -0800
commit: 4576aa36e9a9671459299c7963ac95aa94beaea9 (patch)
tree: 3863574e050f168c0126ecb47c83319fab0972d8 /JavaScriptCore/wtf/unicode
parent: 55323ac613cc31553107b68603cb627264d22bb0 (diff)
download: external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.zip
external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.gz
external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.bz2
3 files changed, 129 insertions, 109 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp
index 40c5609..ca4fc1c 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.cpp
+++ b/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -26,9 +26,14 @@
 #include "config.h"
 #include "UTF8.h"
 
+#include "ASCIICType.h"
+
 namespace WTF {
 namespace Unicode {
 
+// FIXME: Use definition from CharacterNames.h.
+const UChar replacementCharacter = 0xFFFD;
+
 inline int inlineUTF8SequenceLengthNonASCII(char b0)
 {
     if ((b0 & 0xC0) != 0xC0)
@@ -44,12 +49,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0)
 
 inline int inlineUTF8SequenceLength(char b0)
 {
-    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int UTF8SequenceLength(char b0)
 {
-    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int decodeUTF8Sequence(const char* sequence)
@@ -172,7 +177,7 @@ ConversionResult convertUTF16ToUTF8(
             bytesToWrite = 4;
         } else {
             bytesToWrite = 3;
-            ch = 0xFFFD;
+            ch = replacementCharacter;
         }
 
         target += bytesToWrite;
@@ -231,6 +236,23 @@ static bool isLegalUTF8(const unsigned char* source, int length)
 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 
+static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
+{
+    UChar32 character = 0;
+
+    // The cases all fall through.
+    switch (length) {
+        case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+        case 1: character += static_cast<unsigned char>(*sequence++);
+    }
+
+    return character - offsetsFromUTF8[length - 1];
+}
+
 ConversionResult convertUTF8ToUTF16(
     const char** sourceStart, const char* sourceEnd, 
     UChar** targetStart, UChar* targetEnd, bool strict)
@@ -239,60 +261,52 @@ ConversionResult convertUTF8ToUTF16(
     const char* source = *sourceStart;
     UChar* target = *targetStart;
     while (source < sourceEnd) {
-        UChar32 ch = 0;
-        int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1;
-        if (source + extraBytesToRead >= sourceEnd) {
+        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
+        if (sourceEnd - source < utf8SequenceLength)  {
             result = sourceExhausted;
             break;
         }
         // Do this check whether lenient or strict
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
             result = sourceIllegal;
             break;
         }
-        // The cases all fall through.
-        switch (extraBytesToRead) {
-            case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
-            case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
-            case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
-            case 0: ch += static_cast<unsigned char>(*source++);
-        }
-        ch -= offsetsFromUTF8[extraBytesToRead];
+
+        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
 
         if (target >= targetEnd) {
-            source -= (extraBytesToRead + 1); // Back up source pointer!
-            result = targetExhausted; break;
+            source -= utf8SequenceLength; // Back up source pointer!
+            result = targetExhausted;
+            break;
         }
-        if (ch <= 0xFFFF) {
+
+        if (U_IS_BMP(character)) {
             // UTF-16 surrogate values are illegal in UTF-32
-            if (ch >= 0xD800 && ch <= 0xDFFF) {
+            if (U_IS_SURROGATE(character)) {
                 if (strict) {
-                    source -= (extraBytesToRead + 1); // return to the illegal value itself
+                    source -= utf8SequenceLength; // return to the illegal value itself
                     result = sourceIllegal;
                     break;
                 } else
-                    *target++ = 0xFFFD;
+                    *target++ = replacementCharacter;
             } else
-                *target++ = (UChar)ch; // normal case
-        } else if (ch > 0x10FFFF) {
-            if (strict) {
-                result = sourceIllegal;
-                source -= (extraBytesToRead + 1); // return to the start
-                break; // Bail out; shouldn't continue
-            } else
-                *target++ = 0xFFFD;
-        } else {
+                *target++ = character; // normal case
+        } else if (U_IS_SUPPLEMENTARY(character)) {
             // target is a character in range 0xFFFF - 0x10FFFF
             if (target + 1 >= targetEnd) {
-                source -= (extraBytesToRead + 1); // Back up source pointer!
+                source -= utf8SequenceLength; // Back up source pointer!
                 result = targetExhausted;
                 break;
             }
-            ch -= 0x0010000UL;
-            *target++ = (UChar)((ch >> 10) + 0xD800);
-            *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
+            *target++ = U16_LEAD(character);
+            *target++ = U16_TRAIL(character);
+        } else {
+            if (strict) {
+                source -= utf8SequenceLength; // return to the start
+                result = sourceIllegal;
+                break; // Bail out; shouldn't continue
+            } else
+                *target++ = replacementCharacter;
         }
     }
     *sourceStart = source;
@@ -300,5 +314,5 @@ ConversionResult convertUTF8ToUTF16(
     return result;
 }
 
-}
-}
+} // namespace Unicode
+} // namespace WTF
diff --git a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
index f865ef1..8959912 100644
--- a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
+++ b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
@@ -1,4 +1,5 @@
 /*
+ *  Copyright (C) 1999-2004, International Business Machines Corporation and others.  All Rights Reserved.
  *  Copyright (C) 2006 George Staikos <staikos@kde.org>
  *  Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *  Copyright (C) 2007 Apple Computer, Inc. All rights reserved.
@@ -38,11 +39,28 @@
 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
 #define U16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2)
 
+#define U_IS_SUPPLEMENTARY(c) ((UChar32)((c)-0x10000)<=0xfffff)
 #define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
 
+#define U16_GET(s, start, i, length, c) { \
+    (c)=(s)[i]; \
+    if(U16_IS_SURROGATE(c)) { \
+        uint16_t __c2; \
+        if(U16_IS_SURROGATE_LEAD(c)) { \
+            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
+                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+            } \
+        } else { \
+            if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+            } \
+        } \
+    } \
+}
+
 #define U16_PREV(s, start, i, c) { \
     (c)=(s)[--(i)]; \
     if(U16_IS_TRAIL(c)) { \
@@ -54,6 +72,12 @@
     } \
 }
 
+#define U16_BACK_1(s, start, i) { \
+    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
+        --(i); \
+    } \
+}
+
 #define U16_NEXT(s, i, length, c) { \
     (c)=(s)[(i)++]; \
     if(U16_IS_LEAD(c)) { \
@@ -65,7 +89,12 @@
     } \
 }
 
+#define U16_FWD_1(s, i, length) { \
+    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
+        ++(i); \
+    } \
+}
+
 #define U_MASK(x) ((uint32_t)1<<(x))
 
 #endif
-
diff --git a/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp b/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp
index e20c376..a01c3ee 100644
--- a/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp
+++ b/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp
@@ -1,6 +1,7 @@
 /*
  *  Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
  *  Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
+ *  Copyright (C) 2010 Igalia S.L.
  *
  *  This library is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU Library General Public
@@ -22,6 +23,11 @@
 #include "config.h"
 #include "UnicodeGLib.h"
 
+#include <wtf/Vector.h>
+#include <wtf/unicode/UTF8.h>
+
+#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
+
 namespace WTF {
 namespace Unicode {
 
@@ -43,100 +49,71 @@ UChar32 foldCase(UChar32 ch)
     return *ucs4Result;
 }
 
-int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+static int getUTF16LengthFromUTF8(const gchar* utf8String, int length)
 {
-    *error = false;
-    GOwnPtr<GError> gerror;
+    int utf16Length = 0;
+    const gchar* inputString = utf8String;
 
-    GOwnPtr<char> utf8src;
-    utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr()));
-    if (gerror) {
-        *error = true;
-        return -1;
-    }
-
-    GOwnPtr<char> utf8result;
-    utf8result.set(g_utf8_casefold(utf8src.get(), -1));
+    while ((utf8String + length - inputString > 0) && *inputString) {
+        gunichar character = g_utf8_get_char(inputString);
 
-    long utf16resultLength = -1;
-    GOwnPtr<UChar> utf16result;
-    utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr()));
-    if (gerror) {
-        *error = true;
-        return -1;
+        utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1;
+        inputString = g_utf8_next_char(inputString);
     }
 
-    if (utf16resultLength > resultLength) {
-        *error = true;
-        return utf16resultLength;
-    }
-    memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar));
-
-    return utf16resultLength;
+    return utf16Length;
 }
 
-int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length);
+
+static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction)
 {
     *error = false;
-    GOwnPtr<GError> gerror;
 
-    GOwnPtr<char> utf8src;
-    utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr()));
-    if (gerror) {
+    // Allocate a buffer big enough to hold all the characters.
+    Vector<char> buffer(srcLength * 3);
+    char* utf8Target = buffer.data();
+    const UChar* utf16Source = src;
+    ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true);
+    if (conversionResult != conversionOK) {
         *error = true;
         return -1;
     }
+    buffer.shrink(utf8Target - buffer.data());
 
-    GOwnPtr<char> utf8result;
-    utf8result.set(g_utf8_strdown(utf8src.get(), -1));
+    GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size()));
+    long utf8ResultLength = strlen(utf8Result.get());
 
-    long utf16resultLength = -1;
-    GOwnPtr<UChar> utf16result;
-    utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr()));
-    if (gerror) {
+    // Calculate the destination buffer size.
+    int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength);
+    if (realLength > resultLength) {
         *error = true;
-        return -1;
+        return realLength;
     }
 
-    if (utf16resultLength > resultLength) {
+    // Convert the result to UTF-16.
+    UChar* utf16Target = result;
+    const char* utf8Source = utf8Result.get();
+    conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true);
+    long utf16ResultLength = utf16Target - result;
+    if (conversionResult != conversionOK)
         *error = true;
-        return utf16resultLength;
-    }
-    memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar));
 
-    return utf16resultLength;
+    return utf16ResultLength <= 0 ? -1 : utf16ResultLength;
 }
-
-int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
 {
-    *error = false;
-    GOwnPtr<GError> gerror;
-
-    GOwnPtr<char> utf8src;
-    utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr()));
-    if (gerror) {
-        *error = true;
-        return -1;
-    }
-
-    GOwnPtr<char> utf8result;
-    utf8result.set(g_utf8_strup(utf8src.get(), -1));
-
-    long utf16resultLength = -1;
-    GOwnPtr<UChar> utf16result;
-    utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr()));
-    if (gerror) {
-        *error = true;
-        return -1;
-    }
+    return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold);
+}
 
-    if (utf16resultLength > resultLength) {
-        *error = true;
-        return utf16resultLength;
-    }
-    memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar));
+int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+    return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown);
+}
 
-    return utf16resultLength;
+int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+    return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup);
 }
 
 Direction direction(UChar32 c)
author	Shimeng (Simon) Wang <swang@google.com>	2010-12-07 17:22:45 -0800
committer	Shimeng (Simon) Wang <swang@google.com>	2010-12-22 14:15:40 -0800
commit	4576aa36e9a9671459299c7963ac95aa94beaea9 (patch)
tree	3863574e050f168c0126ecb47c83319fab0972d8 /JavaScriptCore/wtf/unicode
parent	55323ac613cc31553107b68603cb627264d22bb0 (diff)
download	external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.zip external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.gz external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.bz2