diff options
Diffstat (limited to 'Source/WebCore/platform/text/TextCodecUTF8.cpp')
-rw-r--r-- | Source/WebCore/platform/text/TextCodecUTF8.cpp | 312 |
1 files changed, 176 insertions, 136 deletions
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp index 8944d68..5f82092 100644 --- a/Source/WebCore/platform/text/TextCodecUTF8.cpp +++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp @@ -26,64 +26,17 @@ #include "config.h" #include "TextCodecUTF8.h" +#include "TextCodecASCIIFastPath.h" #include <wtf/text/CString.h> #include <wtf/text/StringBuffer.h> -#include <wtf/unicode/UTF8.h> +#include <wtf/unicode/CharacterNames.h> using namespace WTF::Unicode; using namespace std; namespace WebCore { -// Assuming that a pointer is the size of a "machine word", then -// uintptr_t is an integer type that is also a machine word. -typedef uintptr_t MachineWord; - -// This constant has type uintptr_t since we will use it to align -// pointers. Not because MachineWord is uintptr_t. -const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1; - -template<size_t size> struct NonASCIIMask; -template<> struct NonASCIIMask<4> { - static unsigned value() { return 0x80808080U; } -}; -template<> struct NonASCIIMask<8> { - static unsigned long long value() { return 0x8080808080808080ULL; } -}; - -template<size_t size> struct UCharByteFiller; -template<> struct UCharByteFiller<4> { - static void copy(UChar* destination, const uint8_t* source) - { - destination[0] = source[0]; - destination[1] = source[1]; - destination[2] = source[2]; - destination[3] = source[3]; - } -}; -template<> struct UCharByteFiller<8> { - static void copy(UChar* destination, const uint8_t* source) - { - destination[0] = source[0]; - destination[1] = source[1]; - destination[2] = source[2]; - destination[3] = source[3]; - destination[4] = source[4]; - destination[5] = source[5]; - destination[6] = source[6]; - destination[7] = source[7]; - } -}; - -static inline bool isAlignedToMachineWord(const void* pointer) -{ - return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask); -} - -template<typename T> static inline T* alignToMachineWord(T* pointer) -{ - return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask); -} +const int nonCharacter = -1; PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) { @@ -93,6 +46,15 @@ PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { registrar("UTF-8", "UTF-8"); + + // Additional aliases that originally were present in the encoding + // table in WebKit on Macintosh, and subsequently added by + // TextCodecICU. Perhaps we can prove some are not used on the web + // and remove them. + registrar("unicode11utf8", "UTF-8"); + registrar("unicode20utf8", "UTF-8"); + registrar("utf8", "UTF-8"); + registrar("x-unicode20utf8", "UTF-8"); } void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) @@ -100,27 +62,38 @@ void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) registrar("UTF-8", create, 0); } -static inline int nonASCIISequenceLength(unsigned char firstByte) +static inline int nonASCIISequenceLength(uint8_t firstByte) { - ASSERT(!isASCII(firstByte)); - switch (firstByte >> 4) { - case 0xF: - return 4; - case 0xE: - return 3; - } - return 2; + static const uint8_t lengths[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + return lengths[firstByte]; } -static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length) +static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) { ASSERT(!isASCII(sequence[0])); if (length == 2) { ASSERT(sequence[0] <= 0xDF); if (sequence[0] < 0xC2) - return -1; + return nonCharacter; if (sequence[1] < 0x80 || sequence[1] > 0xBF) - return -1; + return nonCharacter; return ((sequence[0] << 6) + sequence[1]) - 0x00003080; } if (length == 3) { @@ -128,18 +101,18 @@ static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned switch (sequence[0]) { case 0xE0: if (sequence[1] < 0xA0 || sequence[1] > 0xBF) - return -1; + return nonCharacter; break; case 0xED: if (sequence[1] < 0x80 || sequence[1] > 0x9F) - return -1; + return nonCharacter; break; default: if (sequence[1] < 0x80 || sequence[1] > 0xBF) - return -1; + return nonCharacter; } if (sequence[2] < 0x80 || sequence[2] > 0xBF) - return -1; + return nonCharacter; return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; } ASSERT(length == 4); @@ -147,109 +120,176 @@ static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned switch (sequence[0]) { case 0xF0: if (sequence[1] < 0x90 || sequence[1] > 0xBF) - return -1; + return nonCharacter; break; case 0xF4: if (sequence[1] < 0x80 || sequence[1] > 0x8F) - return -1; + return nonCharacter; break; default: if (sequence[1] < 0x80 || sequence[1] > 0xBF) - return -1; + return nonCharacter; } if (sequence[2] < 0x80 || sequence[2] > 0xBF) - return -1; + return nonCharacter; if (sequence[3] < 0x80 || sequence[3] > 0xBF) - return -1; + return nonCharacter; return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; } +static inline UChar* appendCharacter(UChar* destination, int character) +{ + ASSERT(character != nonCharacter); + ASSERT(!U_IS_SURROGATE(character)); + if (U_IS_BMP(character)) + *destination++ = character; + else { + *destination++ = U16_LEAD(character); + *destination++ = U16_TRAIL(character); + } + return destination; +} + +void TextCodecUTF8::consumePartialSequenceByte() +{ + --m_partialSequenceSize; + memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); +} + +void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) +{ + sawError = true; + if (stopOnError) + return; + // Each error generates a replacement character and consumes one byte. + *destination++ = replacementCharacter; + consumePartialSequenceByte(); +} + +void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) +{ + ASSERT(m_partialSequenceSize); + do { + if (isASCII(m_partialSequence[0])) { + *destination++ = m_partialSequence[0]; + consumePartialSequenceByte(); + continue; + } + int count = nonASCIISequenceLength(m_partialSequence[0]); + if (!count) { + handleError(destination, stopOnError, sawError); + if (stopOnError) + return; + continue; + } + if (count > m_partialSequenceSize) { + if (count - m_partialSequenceSize > end - source) { + if (!flush) { + // The new data is not enough to complete the sequence, so + // add it to the existing partial sequence. + memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); + m_partialSequenceSize += end - source; + return; + } + // An incomplete partial sequence at the end is an error. + handleError(destination, stopOnError, sawError); + if (stopOnError) + return; + continue; + } + memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); + source += count - m_partialSequenceSize; + m_partialSequenceSize = count; + } + int character = decodeNonASCIISequence(m_partialSequence, count); + if (character == nonCharacter) { + handleError(destination, stopOnError, sawError); + if (stopOnError) + return; + continue; + } + m_partialSequenceSize -= count; + destination = appendCharacter(destination, character); + } while (m_partialSequenceSize); +} + String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) { - StringBuffer buffer(length); + // Each input byte might turn into a character. + // That includes all bytes in the partial-sequence buffer because + // each byte in an invalid sequence will turn into a replacement character. + StringBuffer buffer(m_partialSequenceSize + length); const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); const uint8_t* end = source + length; const uint8_t* alignedEnd = alignToMachineWord(end); UChar* destination = buffer.characters(); - int count; - int character; - - if (m_partialSequenceSize) { - count = nonASCIISequenceLength(m_partialSequence[0]); - ASSERT(count > m_partialSequenceSize); - if (count - m_partialSequenceSize > end - source) { - memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); - m_partialSequenceSize += end - source; - source = end; - } else { - uint8_t completeSequence[U8_MAX_LENGTH]; - memcpy(completeSequence, m_partialSequence, m_partialSequenceSize); - memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); - source += count - m_partialSequenceSize; - m_partialSequenceSize = 0; - character = decodeNonASCIISequence(completeSequence, count); - goto decodedNonASCII; + do { + if (m_partialSequenceSize) { + // Explicitly copy destination and source pointers to avoid taking pointers to the + // local variables, which may harm code generation by disabling some optimizations + // in some compilers. + UChar* destinationForHandlePartialSequence = destination; + const uint8_t* sourceForHandlePartialSequence = source; + handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError); + destination = destinationForHandlePartialSequence; + source = sourceForHandlePartialSequence; + if (m_partialSequenceSize) + break; } - } - while (source < end) { - if (isASCII(*source)) { - // Fast path for ASCII. Most UTF-8 text will be ASCII. - if (isAlignedToMachineWord(source)) { - while (source < alignedEnd) { - MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); - if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) { - if (isASCII(*source)) + while (source < end) { + if (isASCII(*source)) { + // Fast path for ASCII. Most UTF-8 text will be ASCII. + if (isAlignedToMachineWord(source)) { + while (source < alignedEnd) { + MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); + if (!isAllASCII(chunk)) break; - goto nonASCII; + copyASCIIMachineWord(destination, source); + source += sizeof(MachineWord); + destination += sizeof(MachineWord); } - UCharByteFiller<sizeof(MachineWord)>::copy(destination, source); - source += sizeof(MachineWord); - destination += sizeof(MachineWord); + if (source == end) + break; + if (!isASCII(*source)) + continue; } - if (source == end) - break; - } - *destination++ = *source++; - } else { -nonASCII: - count = nonASCIISequenceLength(*source); - ASSERT(count >= 2); - ASSERT(count <= 4); - if (count > end - source) { - ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence))); - ASSERT(!m_partialSequenceSize); - m_partialSequenceSize = end - source; - memcpy(m_partialSequence, source, m_partialSequenceSize); - break; + *destination++ = *source++; + continue; } - character = decodeNonASCIISequence(source, count); - source += count; -decodedNonASCII: - if (character < 0) { - if (stopOnError) { - sawError = true; + int count = nonASCIISequenceLength(*source); + int character; + if (!count) + character = nonCharacter; + else { + if (count > end - source) { + ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); + ASSERT(!m_partialSequenceSize); + m_partialSequenceSize = end - source; + memcpy(m_partialSequence, source, m_partialSequenceSize); + source = end; break; } - } else { - ASSERT(!U_IS_SURROGATE(character)); - if (U_IS_BMP(character)) - *destination++ = character; - else { - *destination++ = U16_LEAD(character); - *destination++ = U16_TRAIL(character); - } + character = decodeNonASCIISequence(source, count); } + if (character == nonCharacter) { + sawError = true; + if (stopOnError) + break; + // Each error generates a replacement character and consumes one byte. + *destination++ = replacementCharacter; + ++source; + continue; + } + source += count; + destination = appendCharacter(destination, character); } - } + } while (flush && m_partialSequenceSize); buffer.shrink(destination - buffer.characters()); - if (flush && m_partialSequenceSize) - sawError = true; - return String::adopt(buffer); } |