summary refs log tree commit diff stats
path: root/Source/WebCore/platform/text/TextCodecUTF8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Source/WebCore/platform/text/TextCodecUTF8.cpp')
-rw-r--r-- Source/WebCore/platform/text/TextCodecUTF8.cpp 312
1 files changed, 176 insertions, 136 deletions
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
index 8944d68..5f82092 100644
--- a/Source/WebCore/platform/text/TextCodecUTF8.cpp
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -26,64 +26,17 @@
#include "config.h"
#include "TextCodecUTF8.h"
+#include "TextCodecASCIIFastPath.h"
#include <wtf/text/CString.h>
#include <wtf/text/StringBuffer.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/CharacterNames.h>
using namespace WTF::Unicode;
using namespace std;
namespace WebCore {
-// Assuming that a pointer is the size of a "machine word", then
-// uintptr_t is an integer type that is also a machine word.
-typedef uintptr_t MachineWord;
-
-// This constant has type uintptr_t since we will use it to align
-// pointers. Not because MachineWord is uintptr_t.
-const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1;
-
-template<size_t size> struct NonASCIIMask;
-template<> struct NonASCIIMask<4> {
- static unsigned value() { return 0x80808080U; }
-};
-template<> struct NonASCIIMask<8> {
- static unsigned long long value() { return 0x8080808080808080ULL; }
-};
-
-template<size_t size> struct UCharByteFiller;
-template<> struct UCharByteFiller<4> {
- static void copy(UChar* destination, const uint8_t* source)
- {
- destination[0] = source[0];
- destination[1] = source[1];
- destination[2] = source[2];
- destination[3] = source[3];
- }
-};
-template<> struct UCharByteFiller<8> {
- static void copy(UChar* destination, const uint8_t* source)
- {
- destination[0] = source[0];
- destination[1] = source[1];
- destination[2] = source[2];
- destination[3] = source[3];
- destination[4] = source[4];
- destination[5] = source[5];
- destination[6] = source[6];
- destination[7] = source[7];
- }
-};
-
-static inline bool isAlignedToMachineWord(const void* pointer)
-{
- return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask);
-}
-
-template<typename T> static inline T* alignToMachineWord(T* pointer)
-{
- return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask);
-}
+const int nonCharacter = -1; // Sentinel returned by decodeNonASCIISequence when the bytes do not form a valid sequence.
PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
{
@@ -93,6 +46,15 @@ PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
{
registrar("UTF-8", "UTF-8");
+
+ // Additional aliases that were originally present in the encoding
+ // table in WebKit on Macintosh, and were subsequently added by
+ // TextCodecICU. Perhaps we can prove some are not used on the web
+ // and remove them.
+ registrar("unicode11utf8", "UTF-8"); // presumably (alias, canonical name) - confirm against EncodingNameRegistrar
+ registrar("unicode20utf8", "UTF-8");
+ registrar("utf8", "UTF-8");
+ registrar("x-unicode20utf8", "UTF-8");
}
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
@@ -100,27 +62,38 @@ void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
registrar("UTF-8", create, 0);
}
-static inline int nonASCIISequenceLength(unsigned char firstByte)
+static inline int nonASCIISequenceLength(uint8_t firstByte) // Table lookup: byte length of the sequence firstByte would begin; 0 where the table has no length (callers treat 0 as an error).
{
- ASSERT(!isASCII(firstByte));
- switch (firstByte >> 4) {
- case 0xF:
- return 4;
- case 0xE:
- return 3;
- }
- return 2;
+ static const uint8_t lengths[256] = { // One entry per possible byte value, 16 per row.
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00-0x0F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x1F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50-0x5F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60-0x6F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70-0x7F
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x80-0x8F: 0x80-0xC1 also get 2 here; decodeNonASCIISequence rejects first bytes below 0xC2.
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x90-0x9F
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xA0-0xAF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xB0-0xBF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF0-0xFF: only 0xF0-0xF4 have a length.
+ };
+ return lengths[firstByte];
}
-static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length)
+static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
{
ASSERT(!isASCII(sequence[0]));
if (length == 2) {
ASSERT(sequence[0] <= 0xDF);
if (sequence[0] < 0xC2)
- return -1;
+ return nonCharacter;
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return -1;
+ return nonCharacter;
return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
}
if (length == 3) {
@@ -128,18 +101,18 @@ static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned
switch (sequence[0]) {
case 0xE0:
if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
- return -1;
+ return nonCharacter;
break;
case 0xED:
if (sequence[1] < 0x80 || sequence[1] > 0x9F)
- return -1;
+ return nonCharacter;
break;
default:
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return -1;
+ return nonCharacter;
}
if (sequence[2] < 0x80 || sequence[2] > 0xBF)
- return -1;
+ return nonCharacter;
return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
}
ASSERT(length == 4);
@@ -147,109 +120,176 @@ static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned
switch (sequence[0]) {
case 0xF0:
if (sequence[1] < 0x90 || sequence[1] > 0xBF)
- return -1;
+ return nonCharacter;
break;
case 0xF4:
if (sequence[1] < 0x80 || sequence[1] > 0x8F)
- return -1;
+ return nonCharacter;
break;
default:
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return -1;
+ return nonCharacter;
}
if (sequence[2] < 0x80 || sequence[2] > 0xBF)
- return -1;
+ return nonCharacter;
if (sequence[3] < 0x80 || sequence[3] > 0xBF)
- return -1;
+ return nonCharacter;
return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
+static inline UChar* appendCharacter(UChar* destination, int character) // Stores character at destination as one UChar, or two when it is outside the BMP, and returns the advanced pointer.
+{
+ ASSERT(character != nonCharacter); // Callers must filter out decode failures first.
+ ASSERT(!U_IS_SURROGATE(character));
+ if (U_IS_BMP(character))
+ *destination++ = character;
+ else {
+ *destination++ = U16_LEAD(character); // ICU macro: lead surrogate for a supplementary code point.
+ *destination++ = U16_TRAIL(character); // ICU macro: trail surrogate for a supplementary code point.
+ }
+ return destination;
+}
+
+void TextCodecUTF8::consumePartialSequenceByte() // Discards the first byte held in m_partialSequence; does not itself check that the buffer is non-empty.
+{
+ --m_partialSequenceSize;
+ memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); // Slide the remaining bytes to the front; memmove because the ranges overlap.
+}
+
+void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) // Records a decoding error for the byte at the front of m_partialSequence.
+{
+ sawError = true;
+ if (stopOnError)
+ return; // Nothing is written or consumed in this mode; callers in this file return right afterwards.
+ // Each error generates a replacement character and consumes one byte.
+ *destination++ = replacementCharacter; // presumably U+FFFD, declared outside this file - confirm
+ consumePartialSequenceByte();
+}
+
+// Decodes the characters that begin in m_partialSequence, pulling any bytes
+// still needed to complete a sequence from [source, end).
+//
+// destination  in/out cursor into the output buffer; advanced past what is written.
+// source       in/out cursor into the new input; advanced past bytes moved into
+//              m_partialSequence.
+// end          one past the last byte of the new input.
+// flush        when true, a sequence the input cannot complete is reported as
+//              an error instead of being kept for a later call.
+// stopOnError  when true, return at the first error without emitting or
+//              consuming anything for it.
+// sawError     set to true when an error is encountered (never cleared here).
+//
+// On return m_partialSequenceSize is 0 unless more input is needed (flush is
+// false) or decoding stopped on an error.
+void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+{
+    ASSERT(m_partialSequenceSize);
+    do {
+        // ASCII can reach the front of the buffer after an error dropped the
+        // lead byte of a longer sequence.
+        if (isASCII(m_partialSequence[0])) {
+            *destination++ = m_partialSequence[0];
+            consumePartialSequenceByte();
+            continue;
+        }
+        int count = nonASCIISequenceLength(m_partialSequence[0]);
+        if (!count) {
+            // The byte at the front cannot begin any sequence.
+            handleError(destination, stopOnError, sawError);
+            if (stopOnError)
+                return;
+            continue;
+        }
+        if (count > m_partialSequenceSize) {
+            if (count - m_partialSequenceSize > end - source) {
+                if (!flush) {
+                    // The new data is not enough to complete the sequence, so
+                    // add it to the existing partial sequence.
+                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
+                    m_partialSequenceSize += end - source;
+                    return;
+                }
+                // An incomplete partial sequence at the end is an error.
+                handleError(destination, stopOnError, sawError);
+                if (stopOnError)
+                    return;
+                continue;
+            }
+            // Top up the buffer with exactly the bytes the sequence is missing.
+            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
+            source += count - m_partialSequenceSize;
+            m_partialSequenceSize = count;
+        }
+        int character = decodeNonASCIISequence(m_partialSequence, count);
+        if (character == nonCharacter) {
+            handleError(destination, stopOnError, sawError);
+            if (stopOnError)
+                return;
+            continue;
+        }
+        // Remove the decoded sequence from the front of the buffer. Bytes can
+        // remain behind it when an earlier error dropped only the lead byte of
+        // a longer sequence (e.g. F0 C3 A9 41: F0 is dropped, C3 A9 decodes,
+        // 41 is still pending). They must be shifted down so the next pass
+        // examines them rather than re-reading the bytes just decoded.
+        m_partialSequenceSize -= count;
+        memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
+        destination = appendCharacter(destination, character);
+    } while (m_partialSequenceSize);
+}
+
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
- StringBuffer buffer(length);
+ // Each input byte might turn into a character.
+ // That includes all bytes in the partial-sequence buffer because
+ // each byte in an invalid sequence will turn into a replacement character.
+ StringBuffer buffer(m_partialSequenceSize + length);
const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
const uint8_t* end = source + length;
const uint8_t* alignedEnd = alignToMachineWord(end);
UChar* destination = buffer.characters();
- int count;
- int character;
-
- if (m_partialSequenceSize) {
- count = nonASCIISequenceLength(m_partialSequence[0]);
- ASSERT(count > m_partialSequenceSize);
- if (count - m_partialSequenceSize > end - source) {
- memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
- m_partialSequenceSize += end - source;
- source = end;
- } else {
- uint8_t completeSequence[U8_MAX_LENGTH];
- memcpy(completeSequence, m_partialSequence, m_partialSequenceSize);
- memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
- source += count - m_partialSequenceSize;
- m_partialSequenceSize = 0;
- character = decodeNonASCIISequence(completeSequence, count);
- goto decodedNonASCII;
+ do { // Repeats only when flushing with bytes still buffered; see the loop condition at the bottom.
+ if (m_partialSequenceSize) {
+ // Explicitly copy destination and source pointers to avoid taking pointers to the
+ // local variables, which may harm code generation by disabling some optimizations
+ // in some compilers.
+ UChar* destinationForHandlePartialSequence = destination;
+ const uint8_t* sourceForHandlePartialSequence = source;
+ handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
+ destination = destinationForHandlePartialSequence;
+ source = sourceForHandlePartialSequence;
+ if (m_partialSequenceSize) // Still buffered: more input is needed, or stopOnError halted decoding.
+ break;
}
- }
- while (source < end) {
- if (isASCII(*source)) {
- // Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
- while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) {
- if (isASCII(*source))
+ while (source < end) {
+ if (isASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (isAlignedToMachineWord(source)) {
+ while (source < alignedEnd) {
+ MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (!isAllASCII(chunk)) // Helper defined outside this file; presumably tests every byte of the word - confirm.
break;
- goto nonASCII;
+ copyASCIIMachineWord(destination, source); // Helper defined outside this file; presumably widens one word of bytes to UChars - confirm.
+ source += sizeof(MachineWord);
+ destination += sizeof(MachineWord);
}
- UCharByteFiller<sizeof(MachineWord)>::copy(destination, source);
- source += sizeof(MachineWord);
- destination += sizeof(MachineWord);
+ if (source == end)
+ break;
+ if (!isASCII(*source)) // The word loop stopped on a non-ASCII byte: restart the byte loop to decode it.
+ continue;
}
- if (source == end)
- break;
- }
- *destination++ = *source++;
- } else {
-nonASCII:
- count = nonASCIISequenceLength(*source);
- ASSERT(count >= 2);
- ASSERT(count <= 4);
- if (count > end - source) {
- ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
- ASSERT(!m_partialSequenceSize);
- m_partialSequenceSize = end - source;
- memcpy(m_partialSequence, source, m_partialSequenceSize);
- break;
+ *destination++ = *source++;
+ continue;
}
- character = decodeNonASCIISequence(source, count);
- source += count;
-decodedNonASCII:
- if (character < 0) {
- if (stopOnError) {
- sawError = true;
+ int count = nonASCIISequenceLength(*source);
+ int character;
+ if (!count) // The byte cannot begin a sequence; handled as an error below.
+ character = nonCharacter;
+ else {
+ if (count > end - source) { // The sequence is cut off by the end of the input.
+ ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+ ASSERT(!m_partialSequenceSize);
+ m_partialSequenceSize = end - source;
+ memcpy(m_partialSequence, source, m_partialSequenceSize); // Keep the unvalidated tail for handlePartialSequence.
+ source = end;
break;
}
- } else {
- ASSERT(!U_IS_SURROGATE(character));
- if (U_IS_BMP(character))
- *destination++ = character;
- else {
- *destination++ = U16_LEAD(character);
- *destination++ = U16_TRAIL(character);
- }
+ character = decodeNonASCIISequence(source, count);
}
+ if (character == nonCharacter) {
+ sawError = true;
+ if (stopOnError)
+ break;
+ // Each error generates a replacement character and consumes one byte.
+ *destination++ = replacementCharacter;
+ ++source;
+ continue;
+ }
+ source += count;
+ destination = appendCharacter(destination, character);
}
- }
+ } while (flush && m_partialSequenceSize); // When flushing, go around again so handlePartialSequence turns the leftover bytes into errors.
buffer.shrink(destination - buffer.characters());
- if (flush && m_partialSequenceSize)
- sawError = true;
-
return String::adopt(buffer);
}