diff options
| author | Shimeng (Simon) Wang <swang@google.com> | 2010-12-07 17:22:45 -0800 |
|---|---|---|
| committer | Shimeng (Simon) Wang <swang@google.com> | 2010-12-22 14:15:40 -0800 |
| commit | 4576aa36e9a9671459299c7963ac95aa94beaea9 (patch) | |
| tree | 3863574e050f168c0126ecb47c83319fab0972d8 /JavaScriptCore/wtf/unicode | |
| parent | 55323ac613cc31553107b68603cb627264d22bb0 (diff) | |
| download | external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.zip external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.gz external_webkit-4576aa36e9a9671459299c7963ac95aa94beaea9.tar.bz2 | |
Merge WebKit at r73109: Initial merge by git.
Change-Id: I61f1a66d9642e3d8405d3ac6ccab2a53421c75d8
Diffstat (limited to 'JavaScriptCore/wtf/unicode')
| -rw-r--r-- | JavaScriptCore/wtf/unicode/UTF8.cpp | 90 | ||||
| -rw-r--r-- | JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h | 31 | ||||
| -rw-r--r-- | JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp | 117 |
3 files changed, 129 insertions, 109 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp index 40c5609..ca4fc1c 100644 --- a/JavaScriptCore/wtf/unicode/UTF8.cpp +++ b/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -26,9 +26,14 @@ #include "config.h" #include "UTF8.h" +#include "ASCIICType.h" + namespace WTF { namespace Unicode { +// FIXME: Use definition from CharacterNames.h. +const UChar replacementCharacter = 0xFFFD; + inline int inlineUTF8SequenceLengthNonASCII(char b0) { if ((b0 & 0xC0) != 0xC0) @@ -44,12 +49,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0) inline int inlineUTF8SequenceLength(char b0) { - return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); + return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); } int UTF8SequenceLength(char b0) { - return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); + return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); } int decodeUTF8Sequence(const char* sequence) @@ -172,7 +177,7 @@ ConversionResult convertUTF16ToUTF8( bytesToWrite = 4; } else { bytesToWrite = 3; - ch = 0xFFFD; + ch = replacementCharacter; } target += bytesToWrite; @@ -231,6 +236,23 @@ static bool isLegalUTF8(const unsigned char* source, int length) static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; +static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) +{ + UChar32 character = 0; + + // The cases all fall through. + switch (length) { + case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 1: character += static_cast<unsigned char>(*sequence++); + } + + return character - offsetsFromUTF8[length - 1]; +} + ConversionResult convertUTF8ToUTF16( const char** sourceStart, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool strict) @@ -239,60 +261,52 @@ ConversionResult convertUTF8ToUTF16( const char* source = *sourceStart; UChar* target = *targetStart; while (source < sourceEnd) { - UChar32 ch = 0; - int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1; - if (source + extraBytesToRead >= sourceEnd) { + int utf8SequenceLength = inlineUTF8SequenceLength(*source); + if (sourceEnd - source < utf8SequenceLength) { result = sourceExhausted; break; } // Do this check whether lenient or strict - if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { result = sourceIllegal; break; } - // The cases all fall through. - switch (extraBytesToRead) { - case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 - case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 - case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; - case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; - case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; - case 0: ch += static_cast<unsigned char>(*source++); - } - ch -= offsetsFromUTF8[extraBytesToRead]; + + UChar32 character = readUTF8Sequence(source, utf8SequenceLength); if (target >= targetEnd) { - source -= (extraBytesToRead + 1); // Back up source pointer! - result = targetExhausted; break; + source -= utf8SequenceLength; // Back up source pointer! + result = targetExhausted; + break; } - if (ch <= 0xFFFF) { + + if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 - if (ch >= 0xD800 && ch <= 0xDFFF) { + if (U_IS_SURROGATE(character)) { if (strict) { - source -= (extraBytesToRead + 1); // return to the illegal value itself + source -= utf8SequenceLength; // return to the illegal value itself result = sourceIllegal; break; } else - *target++ = 0xFFFD; + *target++ = replacementCharacter; } else - *target++ = (UChar)ch; // normal case - } else if (ch > 0x10FFFF) { - if (strict) { - result = sourceIllegal; - source -= (extraBytesToRead + 1); // return to the start - break; // Bail out; shouldn't continue - } else - *target++ = 0xFFFD; - } else { + *target++ = character; // normal case + } else if (U_IS_SUPPLEMENTARY(character)) { // target is a character in range 0xFFFF - 0x10FFFF if (target + 1 >= targetEnd) { - source -= (extraBytesToRead + 1); // Back up source pointer! + source -= utf8SequenceLength; // Back up source pointer! result = targetExhausted; break; } - ch -= 0x0010000UL; - *target++ = (UChar)((ch >> 10) + 0xD800); - *target++ = (UChar)((ch & 0x03FF) + 0xDC00); + *target++ = U16_LEAD(character); + *target++ = U16_TRAIL(character); + } else { + if (strict) { + source -= utf8SequenceLength; // return to the start + result = sourceIllegal; + break; // Bail out; shouldn't continue + } else + *target++ = replacementCharacter; } } *sourceStart = source; @@ -300,5 +314,5 @@ ConversionResult convertUTF8ToUTF16( return result; } -} -} +} // namespace Unicode +} // namespace WTF diff --git a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h index f865ef1..8959912 100644 --- a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h +++ b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h @@ -1,4 +1,5 @@ /* + * Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved. * Copyright (C) 2006 George Staikos <staikos@kde.org> * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> * Copyright (C) 2007 Apple Computer, Inc. All rights reserved. @@ -38,11 +39,28 @@ #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) #define U16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2) +#define U_IS_SUPPLEMENTARY(c) ((UChar32)((c)-0x10000)<=0xfffff) #define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) +#define U16_GET(s, start, i, length, c) { \ + (c)=(s)[i]; \ + if(U16_IS_SURROGATE(c)) { \ + uint16_t __c2; \ + if(U16_IS_SURROGATE_LEAD(c)) { \ + if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } else { \ + if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ + } \ +} + #define U16_PREV(s, start, i, c) { \ (c)=(s)[--(i)]; \ if(U16_IS_TRAIL(c)) { \ @@ -54,6 +72,12 @@ } \ } +#define U16_BACK_1(s, start, i) { \ + if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} + #define U16_NEXT(s, i, length, c) { \ (c)=(s)[(i)++]; \ if(U16_IS_LEAD(c)) { \ @@ -65,7 +89,12 @@ } \ } +#define U16_FWD_1(s, i, length) { \ + if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ +} + #define U_MASK(x) ((uint32_t)1<<(x)) #endif - diff --git a/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp b/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp index e20c376..a01c3ee 100644 --- a/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp +++ b/JavaScriptCore/wtf/unicode/glib/UnicodeGLib.cpp @@ -1,6 +1,7 @@ /* * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> + * Copyright (C) 2010 Igalia S.L. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -22,6 +23,11 @@ #include "config.h" #include "UnicodeGLib.h" +#include <wtf/Vector.h> +#include <wtf/unicode/UTF8.h> + +#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) + namespace WTF { namespace Unicode { @@ -43,100 +49,71 @@ UChar32 foldCase(UChar32 ch) return *ucs4Result; } -int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) +static int getUTF16LengthFromUTF8(const gchar* utf8String, int length) { - *error = false; - GOwnPtr<GError> gerror; + int utf16Length = 0; + const gchar* inputString = utf8String; - GOwnPtr<char> utf8src; - utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr())); - if (gerror) { - *error = true; - return -1; - } - - GOwnPtr<char> utf8result; - utf8result.set(g_utf8_casefold(utf8src.get(), -1)); + while ((utf8String + length - inputString > 0) && *inputString) { + gunichar character = g_utf8_get_char(inputString); - long utf16resultLength = -1; - GOwnPtr<UChar> utf16result; - utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr())); - if (gerror) { - *error = true; - return -1; + utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1; + inputString = g_utf8_next_char(inputString); } - if (utf16resultLength > resultLength) { - *error = true; - return utf16resultLength; - } - memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar)); - - return utf16resultLength; + return utf16Length; } -int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) +typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length); + +static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction) { *error = false; - GOwnPtr<GError> gerror; - GOwnPtr<char> utf8src; - utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr())); - if (gerror) { + // Allocate a buffer big enough to hold all the characters. + Vector<char> buffer(srcLength * 3); + char* utf8Target = buffer.data(); + const UChar* utf16Source = src; + ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true); + if (conversionResult != conversionOK) { *error = true; return -1; } + buffer.shrink(utf8Target - buffer.data()); - GOwnPtr<char> utf8result; - utf8result.set(g_utf8_strdown(utf8src.get(), -1)); + GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size())); + long utf8ResultLength = strlen(utf8Result.get()); - long utf16resultLength = -1; - GOwnPtr<UChar> utf16result; - utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr())); - if (gerror) { + // Calculate the destination buffer size. + int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength); + if (realLength > resultLength) { *error = true; - return -1; + return realLength; } - if (utf16resultLength > resultLength) { + // Convert the result to UTF-16. + UChar* utf16Target = result; + const char* utf8Source = utf8Result.get(); + conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true); + long utf16ResultLength = utf16Target - result; + if (conversionResult != conversionOK) *error = true; - return utf16resultLength; - } - memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar)); - return utf16resultLength; + return utf16ResultLength <= 0 ? -1 : utf16ResultLength; } - -int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) +int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) { - *error = false; - GOwnPtr<GError> gerror; - - GOwnPtr<char> utf8src; - utf8src.set(g_utf16_to_utf8(src, srcLength, 0, 0, &gerror.outPtr())); - if (gerror) { - *error = true; - return -1; - } - - GOwnPtr<char> utf8result; - utf8result.set(g_utf8_strup(utf8src.get(), -1)); - - long utf16resultLength = -1; - GOwnPtr<UChar> utf16result; - utf16result.set(g_utf8_to_utf16(utf8result.get(), -1, 0, &utf16resultLength, &gerror.outPtr())); - if (gerror) { - *error = true; - return -1; - } + return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold); +} - if (utf16resultLength > resultLength) { - *error = true; - return utf16resultLength; - } - memcpy(result, utf16result.get(), utf16resultLength * sizeof(UChar)); +int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) +{ + return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown); +} - return utf16resultLength; +int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) +{ + return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup); } Direction direction(UChar32 c) |
