summaryrefslogtreecommitdiffstats
path: root/Source/WebCore/platform/text/TextCodecUTF8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Source/WebCore/platform/text/TextCodecUTF8.cpp')
-rw-r--r--Source/WebCore/platform/text/TextCodecUTF8.cpp276
1 files changed, 276 insertions, 0 deletions
diff --git a/Source/WebCore/platform/text/TextCodecUTF8.cpp b/Source/WebCore/platform/text/TextCodecUTF8.cpp
new file mode 100644
index 0000000..8944d68
--- /dev/null
+++ b/Source/WebCore/platform/text/TextCodecUTF8.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextCodecUTF8.h"
+
+#include <wtf/text/CString.h>
+#include <wtf/text/StringBuffer.h>
+#include <wtf/unicode/UTF8.h>
+
+using namespace WTF::Unicode;
+using namespace std;
+
+namespace WebCore {
+
+// Assuming that a pointer is the size of a "machine word", then
+// uintptr_t is an integer type that is also a machine word.
+typedef uintptr_t MachineWord;
+
+// This constant has type uintptr_t since we will use it to align
+// pointers. Not because MachineWord is uintptr_t.
+const uintptr_t machineWordAlignmentMask = sizeof(MachineWord) - 1;
+
+template<size_t size> struct NonASCIIMask;
+template<> struct NonASCIIMask<4> {
+ static unsigned value() { return 0x80808080U; }
+};
+template<> struct NonASCIIMask<8> {
+ static unsigned long long value() { return 0x8080808080808080ULL; }
+};
+
+template<size_t size> struct UCharByteFiller;
+template<> struct UCharByteFiller<4> {
+ static void copy(UChar* destination, const uint8_t* source)
+ {
+ destination[0] = source[0];
+ destination[1] = source[1];
+ destination[2] = source[2];
+ destination[3] = source[3];
+ }
+};
+template<> struct UCharByteFiller<8> {
+ static void copy(UChar* destination, const uint8_t* source)
+ {
+ destination[0] = source[0];
+ destination[1] = source[1];
+ destination[2] = source[2];
+ destination[3] = source[3];
+ destination[4] = source[4];
+ destination[5] = source[5];
+ destination[6] = source[6];
+ destination[7] = source[7];
+ }
+};
+
+static inline bool isAlignedToMachineWord(const void* pointer)
+{
+ return !(reinterpret_cast<uintptr_t>(pointer) & machineWordAlignmentMask);
+}
+
+template<typename T> static inline T* alignToMachineWord(T* pointer)
+{
+ return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(pointer) & ~machineWordAlignmentMask);
+}
+
+PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
+{
+ return adoptPtr(new TextCodecUTF8);
+}
+
+void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
+{
+ registrar("UTF-8", "UTF-8");
+}
+
+void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
+{
+ registrar("UTF-8", create, 0);
+}
+
+static inline int nonASCIISequenceLength(unsigned char firstByte)
+{
+ ASSERT(!isASCII(firstByte));
+ switch (firstByte >> 4) {
+ case 0xF:
+ return 4;
+ case 0xE:
+ return 3;
+ }
+ return 2;
+}
+
+static inline int decodeNonASCIISequence(const unsigned char* sequence, unsigned length)
+{
+ ASSERT(!isASCII(sequence[0]));
+ if (length == 2) {
+ ASSERT(sequence[0] <= 0xDF);
+ if (sequence[0] < 0xC2)
+ return -1;
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
+ }
+ if (length == 3) {
+ ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+ switch (sequence[0]) {
+ case 0xE0:
+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+ return -1;
+ break;
+ case 0xED:
+ if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+ return -1;
+ break;
+ default:
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ }
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return -1;
+ return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
+ }
+ ASSERT(length == 4);
+ ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+ switch (sequence[0]) {
+ case 0xF0:
+ if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+ return -1;
+ break;
+ case 0xF4:
+ if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+ return -1;
+ break;
+ default:
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return -1;
+ }
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return -1;
+ if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ return -1;
+ return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
+}
+
+String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
+{
+ StringBuffer buffer(length);
+
+ const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
+ const uint8_t* end = source + length;
+ const uint8_t* alignedEnd = alignToMachineWord(end);
+ UChar* destination = buffer.characters();
+
+ int count;
+ int character;
+
+ if (m_partialSequenceSize) {
+ count = nonASCIISequenceLength(m_partialSequence[0]);
+ ASSERT(count > m_partialSequenceSize);
+ if (count - m_partialSequenceSize > end - source) {
+ memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
+ m_partialSequenceSize += end - source;
+ source = end;
+ } else {
+ uint8_t completeSequence[U8_MAX_LENGTH];
+ memcpy(completeSequence, m_partialSequence, m_partialSequenceSize);
+ memcpy(completeSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
+ source += count - m_partialSequenceSize;
+ m_partialSequenceSize = 0;
+ character = decodeNonASCIISequence(completeSequence, count);
+ goto decodedNonASCII;
+ }
+ }
+
+ while (source < end) {
+ if (isASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (isAlignedToMachineWord(source)) {
+ while (source < alignedEnd) {
+ MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (chunk & NonASCIIMask<sizeof(MachineWord)>::value()) {
+ if (isASCII(*source))
+ break;
+ goto nonASCII;
+ }
+ UCharByteFiller<sizeof(MachineWord)>::copy(destination, source);
+ source += sizeof(MachineWord);
+ destination += sizeof(MachineWord);
+ }
+ if (source == end)
+ break;
+ }
+ *destination++ = *source++;
+ } else {
+nonASCII:
+ count = nonASCIISequenceLength(*source);
+ ASSERT(count >= 2);
+ ASSERT(count <= 4);
+ if (count > end - source) {
+ ASSERT(end - source <= static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+ ASSERT(!m_partialSequenceSize);
+ m_partialSequenceSize = end - source;
+ memcpy(m_partialSequence, source, m_partialSequenceSize);
+ break;
+ }
+ character = decodeNonASCIISequence(source, count);
+ source += count;
+decodedNonASCII:
+ if (character < 0) {
+ if (stopOnError) {
+ sawError = true;
+ break;
+ }
+ } else {
+ ASSERT(!U_IS_SURROGATE(character));
+ if (U_IS_BMP(character))
+ *destination++ = character;
+ else {
+ *destination++ = U16_LEAD(character);
+ *destination++ = U16_TRAIL(character);
+ }
+ }
+ }
+ }
+
+ buffer.shrink(destination - buffer.characters());
+
+ if (flush && m_partialSequenceSize)
+ sawError = true;
+
+ return String::adopt(buffer);
+}
+
+CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
+{
+ // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
+ // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
+ // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
+ if (length > numeric_limits<size_t>::max() / 3)
+ CRASH();
+ Vector<uint8_t> bytes(length * 3);
+
+ size_t i = 0;
+ size_t bytesWritten = 0;
+ while (i < length) {
+ UChar32 character;
+ U16_NEXT(characters, i, length, character);
+ U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
+ }
+
+ return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+}
+
+} // namespace WebCore