diff options
Diffstat (limited to 'Source/JavaScriptCore/wtf/unicode/UTF8.cpp')
-rw-r--r-- | Source/JavaScriptCore/wtf/unicode/UTF8.cpp | 401 |
1 files changed, 401 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/wtf/unicode/UTF8.cpp b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp new file mode 100644 index 0000000..dc24ed5 --- /dev/null +++ b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "UTF8.h" +#include <wtf/StringHasher.h> + +#include "ASCIICType.h" + +namespace WTF { +namespace Unicode { + +// FIXME: Use definition from CharacterNames.h. +static const UChar replacementCharacter = 0xFFFD; + +inline int inlineUTF8SequenceLengthNonASCII(char b0) +{ + if ((b0 & 0xC0) != 0xC0) + return 0; + if ((b0 & 0xE0) == 0xC0) + return 2; + if ((b0 & 0xF0) == 0xE0) + return 3; + if ((b0 & 0xF8) == 0xF0) + return 4; + return 0; +} + +inline int inlineUTF8SequenceLength(char b0) +{ + return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int UTF8SequenceLength(char b0) +{ + return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int decodeUTF8Sequence(const char* sequence) +{ + // Handle 0-byte sequences (never valid). + const unsigned char b0 = sequence[0]; + const int length = inlineUTF8SequenceLength(b0); + if (length == 0) + return -1; + + // Handle 1-byte sequences (plain ASCII). + const unsigned char b1 = sequence[1]; + if (length == 1) { + if (b1) + return -1; + return b0; + } + + // Handle 2-byte sequences. + if ((b1 & 0xC0) != 0x80) + return -1; + const unsigned char b2 = sequence[2]; + if (length == 2) { + if (b2) + return -1; + const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); + if (c < 0x80) + return -1; + return c; + } + + // Handle 3-byte sequences. + if ((b2 & 0xC0) != 0x80) + return -1; + const unsigned char b3 = sequence[3]; + if (length == 3) { + if (b3) + return -1; + const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + if (c < 0x800) + return -1; + // UTF-16 surrogates should never appear in UTF-8 data. + if (c >= 0xD800 && c <= 0xDFFF) + return -1; + return c; + } + + // Handle 4-byte sequences. + if ((b3 & 0xC0) != 0x80) + return -1; + const unsigned char b4 = sequence[4]; + if (length == 4) { + if (b4) + return -1; + const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); + if (c < 0x10000 || c > 0x10FFFF) + return -1; + return c; + } + + return -1; +} + +// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed +// into the first byte, depending on how many bytes follow. There are +// as many entries in this table as there are UTF-8 sequence types. +// (I.e., one byte sequence, two byte... etc.). Remember that sequencs +// for *legal* UTF-8 will be 4 or fewer bytes total. +static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +ConversionResult convertUTF16ToUTF8( + const UChar** sourceStart, const UChar* sourceEnd, + char** targetStart, char* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const UChar* source = *sourceStart; + char* target = *targetStart; + while (source < sourceEnd) { + UChar32 ch; + unsigned short bytesToWrite = 0; + const UChar32 byteMask = 0xBF; + const UChar32 byteMark = 0x80; + const UChar* oldSource = source; // In case we have to back up because of target overflow. + ch = static_cast<unsigned short>(*source++); + // If we have a surrogate pair, convert to UChar32 first. + if (ch >= 0xD800 && ch <= 0xDBFF) { + // If the 16 bits following the high surrogate are in the source buffer... + if (source < sourceEnd) { + UChar32 ch2 = static_cast<unsigned short>(*source); + // If it's a low surrogate, convert to UChar32. + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; + ++source; + } else if (strict) { // it's an unpaired high surrogate + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } else { // We don't have the 16 bits following the high surrogate. + --source; // return to the high surrogate + result = sourceExhausted; + break; + } + } else if (strict) { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= 0xDC00 && ch <= 0xDFFF) { + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } + // Figure out how many bytes the result will require + if (ch < (UChar32)0x80) { + bytesToWrite = 1; + } else if (ch < (UChar32)0x800) { + bytesToWrite = 2; + } else if (ch < (UChar32)0x10000) { + bytesToWrite = 3; + } else if (ch < (UChar32)0x110000) { + bytesToWrite = 4; + } else { + bytesToWrite = 3; + ch = replacementCharacter; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; // Back up source pointer! + target -= bytesToWrite; + result = targetExhausted; + break; + } + switch (bytesToWrite) { // note: everything falls through. + case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +// This must be called with the length pre-determined by the first byte. +// If presented with a length > 4, this returns false. The Unicode +// definition of UTF-8 goes up to 4-byte sequences. +static bool isLegalUTF8(const unsigned char* source, int length) +{ + unsigned char a; + const unsigned char* srcptr = source + length; + switch (length) { + default: return false; + // Everything else falls through when "true"... + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + // no fall-through in this inner switch + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) + return false; + return true; +} + +// Magic values subtracted from a buffer value during UTF8 conversion. +// This table contains as many values as there might be trailing bytes +// in a UTF-8 sequence. +static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) +{ + UChar32 character = 0; + + // The cases all fall through. + switch (length) { + case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; + case 1: character += static_cast<unsigned char>(*sequence++); + } + + return character - offsetsFromUTF8[length - 1]; +} + +ConversionResult convertUTF8ToUTF16( + const char** sourceStart, const char* sourceEnd, + UChar** targetStart, UChar* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const char* source = *sourceStart; + UChar* target = *targetStart; + while (source < sourceEnd) { + int utf8SequenceLength = inlineUTF8SequenceLength(*source); + if (sourceEnd - source < utf8SequenceLength) { + result = sourceExhausted; + break; + } + // Do this check whether lenient or strict + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { + result = sourceIllegal; + break; + } + + UChar32 character = readUTF8Sequence(source, utf8SequenceLength); + + if (target >= targetEnd) { + source -= utf8SequenceLength; // Back up source pointer! + result = targetExhausted; + break; + } + + if (U_IS_BMP(character)) { + // UTF-16 surrogate values are illegal in UTF-32 + if (U_IS_SURROGATE(character)) { + if (strict) { + source -= utf8SequenceLength; // return to the illegal value itself + result = sourceIllegal; + break; + } else + *target++ = replacementCharacter; + } else + *target++ = character; // normal case + } else if (U_IS_SUPPLEMENTARY(character)) { + // target is a character in range 0xFFFF - 0x10FFFF + if (target + 1 >= targetEnd) { + source -= utf8SequenceLength; // Back up source pointer! + result = targetExhausted; + break; + } + *target++ = U16_LEAD(character); + *target++ = U16_TRAIL(character); + } else { + if (strict) { + source -= utf8SequenceLength; // return to the start + result = sourceIllegal; + break; // Bail out; shouldn't continue + } else + *target++ = replacementCharacter; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) +{ + if (!data) + return 0; + + WTF::StringHasher stringHasher; + utf16Length = 0; + + while (data < dataEnd) { + if (isASCII(*data)) { + stringHasher.addCharacter(*data++); + utf16Length++; + continue; + } + + int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); + + if (dataEnd - data < utf8SequenceLength) + return false; + + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) + return 0; + + UChar32 character = readUTF8Sequence(data, utf8SequenceLength); + ASSERT(!isASCII(character)); + + if (U_IS_BMP(character)) { + // UTF-16 surrogate values are illegal in UTF-32 + if (U_IS_SURROGATE(character)) + return 0; + stringHasher.addCharacter(static_cast<UChar>(character)); // normal case + utf16Length++; + } else if (U_IS_SUPPLEMENTARY(character)) { + stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), + static_cast<UChar>(U16_TRAIL(character))); + utf16Length += 2; + } else + return 0; + } + + return stringHasher.hash(); +} + +bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) +{ + while (b < bEnd) { + if (isASCII(*b)) { + if (*a++ != *b++) + return false; + continue; + } + + int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); + + if (bEnd - b < utf8SequenceLength) + return false; + + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength)) + return 0; + + UChar32 character = readUTF8Sequence(b, utf8SequenceLength); + ASSERT(!isASCII(character)); + + if (U_IS_BMP(character)) { + // UTF-16 surrogate values are illegal in UTF-32 + if (U_IS_SURROGATE(character)) + return false; + if (*a++ != character) + return false; + } else if (U_IS_SUPPLEMENTARY(character)) { + if (*a++ != U16_LEAD(character)) + return false; + if (*a++ != U16_TRAIL(character)) + return false; + } else + return false; + } + + return a == aEnd; +} + +} // namespace Unicode +} // namespace WTF |