Merge WebKit at r72805: Initial merge by Git

Note that this is a backwards merge from Chromium release 9.0.600.0 to 9.0.597.0, to align with the Chromium 9 stable release branch.

Change-Id: I5d2bb4e8cee9d39ae8485abf48bdb55ecf8b3790
author: Steve Block <steveblock@google.com> 2011-01-05 12:15:11 +0000
committer: Steve Block <steveblock@google.com> 2011-01-06 14:14:00 +0000
commit: d06194330da2bb8da887d2e1adeacb3a5c1504b2 (patch)
tree: e0af8413af65a8e30630563441af7bdb8478e513 /JavaScriptCore/wtf/unicode/UTF8.cpp
parent: 419a5cf2f8db6ca014df624865197ffb82caad37 (diff)
download: external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.zip
external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.tar.gz
external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.tar.bz2
1 files changed, 38 insertions, 52 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp
index ca4fc1c..40c5609 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.cpp
+++ b/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -26,14 +26,9 @@
 #include "config.h"
 #include "UTF8.h"
 
-#include "ASCIICType.h"
-
 namespace WTF {
 namespace Unicode {
 
-// FIXME: Use definition from CharacterNames.h.
-const UChar replacementCharacter = 0xFFFD;
-
 inline int inlineUTF8SequenceLengthNonASCII(char b0)
 {
     if ((b0 & 0xC0) != 0xC0)
@@ -49,12 +44,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0)
 
 inline int inlineUTF8SequenceLength(char b0)
 {
-    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int UTF8SequenceLength(char b0)
 {
-    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+    return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
 }
 
 int decodeUTF8Sequence(const char* sequence)
@@ -177,7 +172,7 @@ ConversionResult convertUTF16ToUTF8(
             bytesToWrite = 4;
         } else {
             bytesToWrite = 3;
-            ch = replacementCharacter;
+            ch = 0xFFFD;
         }
 
         target += bytesToWrite;
@@ -236,23 +231,6 @@ static bool isLegalUTF8(const unsigned char* source, int length)
 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 
-static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
-{
-    UChar32 character = 0;
-
-    // The cases all fall through.
-    switch (length) {
-        case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
-        case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
-        case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
-        case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
-        case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
-        case 1: character += static_cast<unsigned char>(*sequence++);
-    }
-
-    return character - offsetsFromUTF8[length - 1];
-}
-
 ConversionResult convertUTF8ToUTF16(
     const char** sourceStart, const char* sourceEnd, 
     UChar** targetStart, UChar* targetEnd, bool strict)
@@ -261,52 +239,60 @@ ConversionResult convertUTF8ToUTF16(
     const char* source = *sourceStart;
     UChar* target = *targetStart;
     while (source < sourceEnd) {
-        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
-        if (sourceEnd - source < utf8SequenceLength)  {
+        UChar32 ch = 0;
+        int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1;
+        if (source + extraBytesToRead >= sourceEnd) {
             result = sourceExhausted;
             break;
         }
         // Do this check whether lenient or strict
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
             result = sourceIllegal;
             break;
         }
-
-        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
+        // The cases all fall through.
+        switch (extraBytesToRead) {
+            case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
+            case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
+            case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+            case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+            case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+            case 0: ch += static_cast<unsigned char>(*source++);
+        }
+        ch -= offsetsFromUTF8[extraBytesToRead];
 
         if (target >= targetEnd) {
-            source -= utf8SequenceLength; // Back up source pointer!
-            result = targetExhausted;
-            break;
+            source -= (extraBytesToRead + 1); // Back up source pointer!
+            result = targetExhausted; break;
         }
-
-        if (U_IS_BMP(character)) {
+        if (ch <= 0xFFFF) {
             // UTF-16 surrogate values are illegal in UTF-32
-            if (U_IS_SURROGATE(character)) {
+            if (ch >= 0xD800 && ch <= 0xDFFF) {
                 if (strict) {
-                    source -= utf8SequenceLength; // return to the illegal value itself
+                    source -= (extraBytesToRead + 1); // return to the illegal value itself
                     result = sourceIllegal;
                     break;
                 } else
-                    *target++ = replacementCharacter;
+                    *target++ = 0xFFFD;
             } else
-                *target++ = character; // normal case
-        } else if (U_IS_SUPPLEMENTARY(character)) {
+                *target++ = (UChar)ch; // normal case
+        } else if (ch > 0x10FFFF) {
+            if (strict) {
+                result = sourceIllegal;
+                source -= (extraBytesToRead + 1); // return to the start
+                break; // Bail out; shouldn't continue
+            } else
+                *target++ = 0xFFFD;
+        } else {
             // target is a character in range 0xFFFF - 0x10FFFF
             if (target + 1 >= targetEnd) {
-                source -= utf8SequenceLength; // Back up source pointer!
+                source -= (extraBytesToRead + 1); // Back up source pointer!
                 result = targetExhausted;
                 break;
             }
-            *target++ = U16_LEAD(character);
-            *target++ = U16_TRAIL(character);
-        } else {
-            if (strict) {
-                source -= utf8SequenceLength; // return to the start
-                result = sourceIllegal;
-                break; // Bail out; shouldn't continue
-            } else
-                *target++ = replacementCharacter;
+            ch -= 0x0010000UL;
+            *target++ = (UChar)((ch >> 10) + 0xD800);
+            *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
         }
     }
     *sourceStart = source;
@@ -314,5 +300,5 @@ ConversionResult convertUTF8ToUTF16(
     return result;
 }
 
-} // namespace Unicode
-} // namespace WTF
+}
+}
author	Steve Block <steveblock@google.com>	2011-01-05 12:15:11 +0000
committer	Steve Block <steveblock@google.com>	2011-01-06 14:14:00 +0000
commit	d06194330da2bb8da887d2e1adeacb3a5c1504b2 (patch)
tree	e0af8413af65a8e30630563441af7bdb8478e513 /JavaScriptCore/wtf/unicode/UTF8.cpp
parent	419a5cf2f8db6ca014df624865197ffb82caad37 (diff)
download	external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.zip external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.tar.gz external_webkit-d06194330da2bb8da887d2e1adeacb3a5c1504b2.tar.bz2