summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/wtf/unicode
diff options
context:
space:
mode:
authorBen Murdoch <benm@google.com>2011-05-05 14:36:32 +0100
committerBen Murdoch <benm@google.com>2011-05-10 15:38:30 +0100
commitf05b935882198ccf7d81675736e3aeb089c5113a (patch)
tree4ea0ca838d9ef1b15cf17ddb3928efb427c7e5a1 /JavaScriptCore/wtf/unicode
parent60fbdcc62bced8db2cb1fd233cc4d1e4ea17db1b (diff)
downloadexternal_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.zip
external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.gz
external_webkit-f05b935882198ccf7d81675736e3aeb089c5113a.tar.bz2
Merge WebKit at r74534: Initial merge by git.
Change-Id: I6ccd1154fa1b19c2ec2a66878eb675738735f1eb
Diffstat (limited to 'JavaScriptCore/wtf/unicode')
-rw-r--r--JavaScriptCore/wtf/unicode/UTF8.cpp169
-rw-r--r--JavaScriptCore/wtf/unicode/UTF8.h11
-rw-r--r--JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h31
3 files changed, 171 insertions, 40 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp
index 40c5609..dc24ed5 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.cpp
+++ b/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -25,10 +26,16 @@
#include "config.h"
#include "UTF8.h"
+#include <wtf/StringHasher.h>
+
+#include "ASCIICType.h"
namespace WTF {
namespace Unicode {
+// FIXME: Use definition from CharacterNames.h.
+static const UChar replacementCharacter = 0xFFFD;
+
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
if ((b0 & 0xC0) != 0xC0)
@@ -44,12 +51,12 @@ inline int inlineUTF8SequenceLengthNonASCII(char b0)
inline int inlineUTF8SequenceLength(char b0)
{
- return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+ return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}
int UTF8SequenceLength(char b0)
{
- return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+ return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}
int decodeUTF8Sequence(const char* sequence)
@@ -172,7 +179,7 @@ ConversionResult convertUTF16ToUTF8(
bytesToWrite = 4;
} else {
bytesToWrite = 3;
- ch = 0xFFFD;
+ ch = replacementCharacter;
}
target += bytesToWrite;
@@ -231,6 +238,23 @@ static bool isLegalUTF8(const unsigned char* source, int length)
static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
+{
+ UChar32 character = 0;
+
+ // The cases all fall through.
+ switch (length) {
+ case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+ case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+ case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+ case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+ case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
+ case 1: character += static_cast<unsigned char>(*sequence++);
+ }
+
+ return character - offsetsFromUTF8[length - 1];
+}
+
ConversionResult convertUTF8ToUTF16(
const char** sourceStart, const char* sourceEnd,
UChar** targetStart, UChar* targetEnd, bool strict)
@@ -239,60 +263,52 @@ ConversionResult convertUTF8ToUTF16(
const char* source = *sourceStart;
UChar* target = *targetStart;
while (source < sourceEnd) {
- UChar32 ch = 0;
- int extraBytesToRead = inlineUTF8SequenceLength(*source) - 1;
- if (source + extraBytesToRead >= sourceEnd) {
+ int utf8SequenceLength = inlineUTF8SequenceLength(*source);
+ if (sourceEnd - source < utf8SequenceLength) {
result = sourceExhausted;
break;
}
// Do this check whether lenient or strict
- if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
+ if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
result = sourceIllegal;
break;
}
- // The cases all fall through.
- switch (extraBytesToRead) {
- case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
- case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
- case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 0: ch += static_cast<unsigned char>(*source++);
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
+
+ UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
if (target >= targetEnd) {
- source -= (extraBytesToRead + 1); // Back up source pointer!
- result = targetExhausted; break;
+ source -= utf8SequenceLength; // Back up source pointer!
+ result = targetExhausted;
+ break;
}
- if (ch <= 0xFFFF) {
+
+ if (U_IS_BMP(character)) {
// UTF-16 surrogate values are illegal in UTF-32
- if (ch >= 0xD800 && ch <= 0xDFFF) {
+ if (U_IS_SURROGATE(character)) {
if (strict) {
- source -= (extraBytesToRead + 1); // return to the illegal value itself
+ source -= utf8SequenceLength; // return to the illegal value itself
result = sourceIllegal;
break;
} else
- *target++ = 0xFFFD;
- } else
- *target++ = (UChar)ch; // normal case
- } else if (ch > 0x10FFFF) {
- if (strict) {
- result = sourceIllegal;
- source -= (extraBytesToRead + 1); // return to the start
- break; // Bail out; shouldn't continue
+ *target++ = replacementCharacter;
} else
- *target++ = 0xFFFD;
- } else {
+ *target++ = character; // normal case
+ } else if (U_IS_SUPPLEMENTARY(character)) {
// target is a character in range 0xFFFF - 0x10FFFF
if (target + 1 >= targetEnd) {
- source -= (extraBytesToRead + 1); // Back up source pointer!
+ source -= utf8SequenceLength; // Back up source pointer!
result = targetExhausted;
break;
}
- ch -= 0x0010000UL;
- *target++ = (UChar)((ch >> 10) + 0xD800);
- *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
+ *target++ = U16_LEAD(character);
+ *target++ = U16_TRAIL(character);
+ } else {
+ if (strict) {
+ source -= utf8SequenceLength; // return to the start
+ result = sourceIllegal;
+ break; // Bail out; shouldn't continue
+ } else
+ *target++ = replacementCharacter;
}
}
*sourceStart = source;
@@ -300,5 +316,86 @@ ConversionResult convertUTF8ToUTF16(
return result;
}
+unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
+{
+ if (!data)
+ return 0;
+
+ WTF::StringHasher stringHasher;
+ utf16Length = 0;
+
+ while (data < dataEnd) {
+ if (isASCII(*data)) {
+ stringHasher.addCharacter(*data++);
+ utf16Length++;
+ continue;
+ }
+
+ int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
+
+ if (dataEnd - data < utf8SequenceLength)
+ return false;
+
+ if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
+ return 0;
+
+ UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
+ ASSERT(!isASCII(character));
+
+ if (U_IS_BMP(character)) {
+ // UTF-16 surrogate values are illegal in UTF-32
+ if (U_IS_SURROGATE(character))
+ return 0;
+ stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
+ utf16Length++;
+ } else if (U_IS_SUPPLEMENTARY(character)) {
+ stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
+ static_cast<UChar>(U16_TRAIL(character)));
+ utf16Length += 2;
+ } else
+ return 0;
+ }
+
+ return stringHasher.hash();
}
+
+bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
+{
+ while (b < bEnd) {
+ if (isASCII(*b)) {
+ if (*a++ != *b++)
+ return false;
+ continue;
+ }
+
+ int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
+
+ if (bEnd - b < utf8SequenceLength)
+ return false;
+
+ if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
+ return 0;
+
+ UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
+ ASSERT(!isASCII(character));
+
+ if (U_IS_BMP(character)) {
+ // UTF-16 surrogate values are illegal in UTF-32
+ if (U_IS_SURROGATE(character))
+ return false;
+ if (*a++ != character)
+ return false;
+ } else if (U_IS_SUPPLEMENTARY(character)) {
+ if (*a++ != U16_LEAD(character))
+ return false;
+ if (*a++ != U16_TRAIL(character))
+ return false;
+ } else
+ return false;
+ }
+
+ return a == aEnd;
}
+
+} // namespace Unicode
+} // namespace WTF
diff --git a/JavaScriptCore/wtf/unicode/UTF8.h b/JavaScriptCore/wtf/unicode/UTF8.h
index a5ed93e..1f4baca 100644
--- a/JavaScriptCore/wtf/unicode/UTF8.h
+++ b/JavaScriptCore/wtf/unicode/UTF8.h
@@ -29,7 +29,7 @@
#include "Unicode.h"
namespace WTF {
- namespace Unicode {
+namespace Unicode {
// Given a first byte, gives the length of the UTF-8 sequence it begins.
// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
@@ -69,7 +69,12 @@ namespace WTF {
ConversionResult convertUTF16ToUTF8(
const UChar** sourceStart, const UChar* sourceEnd,
char** targetStart, char* targetEnd, bool strict = true);
- }
-}
+
+ unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length);
+
+ bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd);
+
+} // namespace Unicode
+} // namespace WTF
#endif // WTF_UTF8_h
diff --git a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
index f865ef1..8959912 100644
--- a/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
+++ b/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved.
* Copyright (C) 2006 George Staikos <staikos@kde.org>
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
* Copyright (C) 2007 Apple Computer, Inc. All rights reserved.
@@ -38,11 +39,28 @@
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
#define U16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2)
+#define U_IS_SUPPLEMENTARY(c) ((UChar32)((c)-0x10000)<=0xfffff)
#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+#define U16_GET(s, start, i, length, c) { \
+ (c)=(s)[i]; \
+ if(U16_IS_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if(U16_IS_SURROGATE_LEAD(c)) { \
+ if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } else { \
+ if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } \
+ } \
+ } \
+}
+
#define U16_PREV(s, start, i, c) { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
@@ -54,6 +72,12 @@
} \
}
+#define U16_BACK_1(s, start, i) { \
+ if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
+ --(i); \
+ } \
+}
+
#define U16_NEXT(s, i, length, c) { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
@@ -65,7 +89,12 @@
} \
}
+#define U16_FWD_1(s, i, length) { \
+ if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
+}
+
#define U_MASK(x) ((uint32_t)1<<(x))
#endif
-