diff options
Diffstat (limited to 'JavaScriptCore/wtf/unicode')
-rw-r--r-- | JavaScriptCore/wtf/unicode/UTF8.cpp | 90 | ||||
-rw-r--r-- | JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h | 31 |
2 files changed, 39 insertions, 82 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp index ca4fc1c..40c5609 100644 --- a/JavaScriptCore/wtf/unicode/UTF8.cpp +++ b/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -26,14 +26,9 @@ #include "config.h" #include "UTF8.h" -#include "ASCIICType.h" - namespace WTF { namespace Unicode { -// FIXME: Use definition from CharacterNames.h. -const UChar replacementCharacter = 0xFFFD; - inline int inlineUTF8SequenceLengthNonASCII(char b0) { if ((b0 & 0xC0) != 0xC0) @@ -49,12 +44,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0) inline int inlineUTF8SequenceLength(char b0) { - return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); } int UTF8SequenceLength(char b0) { - return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); } int decodeUTF8Sequence(const char* sequence) @@ -177,7 +172,7 @@ ConversionResult convertUTF16ToUTF8( bytesToWrite = 4; } else { bytesToWrite = 3; - ch = replacementCharacter; + ch = 0xFFFD; } target += bytesToWrite; @@ -236,23 +231,6 @@ static bool isLegalUTF8(const unsigned char* source, int length) static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; -static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) -{ - UChar32 character = 0; - - // The cases all fall through. - switch (length) { - case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; - case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; - case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; - case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; - case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; - case 1: character += static_cast<unsigned char>(*sequence++); - } - - return character - offsetsFromUTF8[length - 1]; -} - ConversionResult convertUTF8ToUTF16( const char** sourceStart, const char* sourceEnd, UChar** targetStart, UChar* targetEnd, bool strict) @@ -261,52 +239,60 @@ ConversionResult convertUTF8ToUTF16( const char* source = *sourceStart; UChar* target = *targetStart; while (source < sourceEnd) { - int utf8SequenceLength = inlineUTF8SequenceLength(*source); - if (sourceEnd - source < utf8SequenceLength) { + UChar32 ch = 0; + int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1; + if (source + extraBytesToRead >= sourceEnd) { result = sourceExhausted; break; } // Do this check whether lenient or strict - if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { result = sourceIllegal; break; } - - UChar32 character = readUTF8Sequence(source, utf8SequenceLength); + // The cases all fall through. + switch (extraBytesToRead) { + case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 + case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 + case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 0: ch += static_cast<unsigned char>(*source++); + } + ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { - source -= utf8SequenceLength; // Back up source pointer! - result = targetExhausted; - break; + source -= (extraBytesToRead + 1); // Back up source pointer! + result = targetExhausted; break; } - - if (U_IS_BMP(character)) { + if (ch <= 0xFFFF) { // UTF-16 surrogate values are illegal in UTF-32 - if (U_IS_SURROGATE(character)) { + if (ch >= 0xD800 && ch <= 0xDFFF) { if (strict) { - source -= utf8SequenceLength; // return to the illegal value itself + source -= (extraBytesToRead + 1); // return to the illegal value itself result = sourceIllegal; break; } else - *target++ = replacementCharacter; + *target++ = 0xFFFD; } else - *target++ = character; // normal case - } else if (U_IS_SUPPLEMENTARY(character)) { + *target++ = (UChar)ch; // normal case + } else if (ch > 0x10FFFF) { + if (strict) { + result = sourceIllegal; + source -= (extraBytesToRead + 1); // return to the start + break; // Bail out; shouldn't continue + } else + *target++ = 0xFFFD; + } else { // target is a character in range 0xFFFF - 0x10FFFF if (target + 1 >= targetEnd) { - source -= utf8SequenceLength; // Back up source pointer! + source -= (extraBytesToRead + 1); // Back up source pointer! result = targetExhausted; break; } - *target++ = U16_LEAD(character); - *target++ = U16_TRAIL(character); - } else { - if (strict) { - source -= utf8SequenceLength; // return to the start - result = sourceIllegal; - break; // Bail out; shouldn't continue - } else - *target++ = replacementCharacter; + ch -= 0x0010000UL; + *target++ = (UChar)((ch >> 10) + 0xD800); + *target++ = (UChar)((ch & 0x03FF) + 0xDC00); } } *sourceStart = source; @@ -314,5 +300,5 @@ ConversionResult convertUTF8ToUTF16( return result; } -} // namespace Unicode -} // namespace WTF +} +} diff --git a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h index 8959912..f865ef1 100644 --- a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h +++ b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h @@ -1,5 +1,4 @@ /* - * Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved. * Copyright (C) 2006 George Staikos <staikos@kde.org> * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> * Copyright (C) 2007 Apple Computer, Inc. All rights reserved. @@ -39,28 +38,11 @@ #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) #define U16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2) -#define U_IS_SUPPLEMENTARY(c) ((UChar32)((c)-0x10000)<=0xfffff) #define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) -#define U16_GET(s, start, i, length, c) { \ - (c)=(s)[i]; \ - if(U16_IS_SURROGATE(c)) { \ - uint16_t __c2; \ - if(U16_IS_SURROGATE_LEAD(c)) { \ - if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \ - (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ - } \ - } else { \ - if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ - (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ - } \ - } \ - } \ -} - #define U16_PREV(s, start, i, c) { \ (c)=(s)[--(i)]; \ if(U16_IS_TRAIL(c)) { \ @@ -72,12 +54,6 @@ } \ } -#define U16_BACK_1(s, start, i) { \ - if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ - --(i); \ - } \ -} - #define U16_NEXT(s, i, length, c) { \ (c)=(s)[(i)++]; \ if(U16_IS_LEAD(c)) { \ @@ -89,12 +65,7 @@ } \ } -#define U16_FWD_1(s, i, length) { \ - if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \ - ++(i); \ - } \ -} - #define U_MASK(x) ((uint32_t)1<<(x)) #endif + |