summaryrefslogtreecommitdiffstats
path: root/JavaScriptCore/wtf/unicode
diff options
context:
space:
mode:
authorThe Android Open Source Project <initial-contribution@android.com>2009-03-03 19:30:52 -0800
committerThe Android Open Source Project <initial-contribution@android.com>2009-03-03 19:30:52 -0800
commit8e35f3cfc7fba1d1c829dc557ebad6409cbe16a2 (patch)
tree11425ea0b299d6fb89c6d3618a22d97d5bf68d0f /JavaScriptCore/wtf/unicode
parent648161bb0edfc3d43db63caed5cc5213bc6cb78f (diff)
downloadexternal_webkit-8e35f3cfc7fba1d1c829dc557ebad6409cbe16a2.zip
external_webkit-8e35f3cfc7fba1d1c829dc557ebad6409cbe16a2.tar.gz
external_webkit-8e35f3cfc7fba1d1c829dc557ebad6409cbe16a2.tar.bz2
auto import from //depot/cupcake/@135843
Diffstat (limited to 'JavaScriptCore/wtf/unicode')
-rw-r--r--JavaScriptCore/wtf/unicode/Collator.h67
-rw-r--r--JavaScriptCore/wtf/unicode/CollatorDefault.cpp75
-rw-r--r--JavaScriptCore/wtf/unicode/UTF8.cpp303
-rw-r--r--JavaScriptCore/wtf/unicode/UTF8.h75
-rw-r--r--JavaScriptCore/wtf/unicode/Unicode.h36
-rw-r--r--JavaScriptCore/wtf/unicode/icu/CollatorICU.cpp144
-rw-r--r--JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h219
-rw-r--r--JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h525
8 files changed, 1444 insertions, 0 deletions
diff --git a/JavaScriptCore/wtf/unicode/Collator.h b/JavaScriptCore/wtf/unicode/Collator.h
new file mode 100644
index 0000000..f04779d
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/Collator.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef WTF_Collator_h
+#define WTF_Collator_h
+
+#include <memory>
+#include <wtf/Noncopyable.h>
+#include <wtf/unicode/Unicode.h>
+
+#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
+struct UCollator;
+#endif
+
+namespace WTF {
+
+ class Collator : Noncopyable {
+ public:
+ enum Result { Equal = 0, Greater = 1, Less = -1 };
+
+ Collator(const char* locale); // Parsing is lenient; e.g. language identifiers (such as "en-US") are accepted, too.
+ ~Collator();
+ void setOrderLowerFirst(bool);
+
+ static std::auto_ptr<Collator> userDefault();
+
+ Result collate(const ::UChar*, size_t, const ::UChar*, size_t) const;
+
+ private:
+#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
+ void createCollator() const;
+ void releaseCollator();
+ mutable UCollator* m_collator;
+#endif
+ char* m_locale;
+ bool m_lowerFirst;
+ };
+}
+
+using WTF::Collator;
+
+#endif
diff --git a/JavaScriptCore/wtf/unicode/CollatorDefault.cpp b/JavaScriptCore/wtf/unicode/CollatorDefault.cpp
new file mode 100644
index 0000000..eddbe53
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/CollatorDefault.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "Collator.h"
+
+#if !USE(ICU_UNICODE) || UCONFIG_NO_COLLATION
+
+namespace WTF {
+
+Collator::Collator(const char*)
+{
+}
+
+Collator::~Collator()
+{
+}
+
+void Collator::setOrderLowerFirst(bool)
+{
+}
+
+std::auto_ptr<Collator> Collator::userDefault()
+{
+ return std::auto_ptr<Collator>(new Collator(0));
+}
+
+// A default implementation for platforms that lack Unicode-aware collation.
+Collator::Result Collator::collate(const UChar* lhs, size_t lhsLength, const UChar* rhs, size_t rhsLength) const
+{
+ int lmin = lhsLength < rhsLength ? lhsLength : rhsLength;
+ int l = 0;
+ while (l < lmin && *lhs == *rhs) {
+ lhs++;
+ rhs++;
+ l++;
+ }
+
+ if (l < lmin)
+ return (*lhs > *rhs) ? Greater : Less;
+
+ if (lhsLength == rhsLength)
+ return Equal;
+
+ return (lhsLength > rhsLength) ? Greater : Less;
+}
+
+}
+
+#endif
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp
new file mode 100644
index 0000000..9e713fe
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/UTF8.cpp
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2007 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "UTF8.h"
+
+namespace WTF {
+namespace Unicode {
+
+inline int inlineUTF8SequenceLengthNonASCII(char b0)
+{
+ if ((b0 & 0xC0) != 0xC0)
+ return 0;
+ if ((b0 & 0xE0) == 0xC0)
+ return 2;
+ if ((b0 & 0xF0) == 0xE0)
+ return 3;
+ if ((b0 & 0xF8) == 0xF0)
+ return 4;
+ return 0;
+}
+
+inline int inlineUTF8SequenceLength(char b0)
+{
+ return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+}
+
+int UTF8SequenceLength(char b0)
+{
+ return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+}
+
+int decodeUTF8Sequence(const char* sequence)
+{
+ // Handle 0-byte sequences (never valid).
+ const unsigned char b0 = sequence[0];
+ const int length = inlineUTF8SequenceLength(b0);
+ if (length == 0)
+ return -1;
+
+ // Handle 1-byte sequences (plain ASCII).
+ const unsigned char b1 = sequence[1];
+ if (length == 1) {
+ if (b1)
+ return -1;
+ return b0;
+ }
+
+ // Handle 2-byte sequences.
+ if ((b1 & 0xC0) != 0x80)
+ return -1;
+ const unsigned char b2 = sequence[2];
+ if (length == 2) {
+ if (b2)
+ return -1;
+ const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+ if (c < 0x80)
+ return -1;
+ return c;
+ }
+
+ // Handle 3-byte sequences.
+ if ((b2 & 0xC0) != 0x80)
+ return -1;
+ const unsigned char b3 = sequence[3];
+ if (length == 3) {
+ if (b3)
+ return -1;
+ const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+ if (c < 0x800)
+ return -1;
+ // UTF-16 surrogates should never appear in UTF-8 data.
+ if (c >= 0xD800 && c <= 0xDFFF)
+ return -1;
+ return c;
+ }
+
+ // Handle 4-byte sequences.
+ if ((b3 & 0xC0) != 0x80)
+ return -1;
+ const unsigned char b4 = sequence[4];
+ if (length == 4) {
+ if (b4)
+ return -1;
+ const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+ if (c < 0x10000 || c > 0x10FFFF)
+ return -1;
+ return c;
+ }
+
+ return -1;
+}
+
+// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
+// into the first byte, depending on how many bytes follow. There are
+// as many entries in this table as there are UTF-8 sequence types.
+// (I.e., one byte sequence, two byte... etc.). Remember that sequencs
+// for *legal* UTF-8 will be 4 or fewer bytes total.
+static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+ConversionResult convertUTF16ToUTF8(
+ const UChar** sourceStart, const UChar* sourceEnd,
+ char** targetStart, char* targetEnd, bool strict)
+{
+ ConversionResult result = conversionOK;
+ const UChar* source = *sourceStart;
+ char* target = *targetStart;
+ while (source < sourceEnd) {
+ UChar32 ch;
+ unsigned short bytesToWrite = 0;
+ const UChar32 byteMask = 0xBF;
+ const UChar32 byteMark = 0x80;
+ const UChar* oldSource = source; // In case we have to back up because of target overflow.
+ ch = static_cast<unsigned short>(*source++);
+ // If we have a surrogate pair, convert to UChar32 first.
+ if (ch >= 0xD800 && ch <= 0xDBFF) {
+ // If the 16 bits following the high surrogate are in the source buffer...
+ if (source < sourceEnd) {
+ UChar32 ch2 = static_cast<unsigned short>(*source);
+ // If it's a low surrogate, convert to UChar32.
+ if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+ ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
+ ++source;
+ } else if (strict) { // it's an unpaired high surrogate
+ --source; // return to the illegal value itself
+ result = sourceIllegal;
+ break;
+ }
+ } else { // We don't have the 16 bits following the high surrogate.
+ --source; // return to the high surrogate
+ result = sourceExhausted;
+ break;
+ }
+ } else if (strict) {
+ // UTF-16 surrogate values are illegal in UTF-32
+ if (ch >= 0xDC00 && ch <= 0xDFFF) {
+ --source; // return to the illegal value itself
+ result = sourceIllegal;
+ break;
+ }
+ }
+ // Figure out how many bytes the result will require
+ if (ch < (UChar32)0x80) {
+ bytesToWrite = 1;
+ } else if (ch < (UChar32)0x800) {
+ bytesToWrite = 2;
+ } else if (ch < (UChar32)0x10000) {
+ bytesToWrite = 3;
+ } else if (ch < (UChar32)0x110000) {
+ bytesToWrite = 4;
+ } else {
+ bytesToWrite = 3;
+ ch = 0xFFFD;
+ }
+
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ source = oldSource; // Back up source pointer!
+ target -= bytesToWrite;
+ result = targetExhausted;
+ break;
+ }
+ switch (bytesToWrite) { // note: everything falls through.
+ case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
+ case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
+ case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
+ case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]);
+ }
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+// This must be called with the length pre-determined by the first byte.
+// If presented with a length > 4, this returns false. The Unicode
+// definition of UTF-8 goes up to 4-byte sequences.
+static bool isLegalUTF8(const unsigned char* source, int length)
+{
+ unsigned char a;
+ const unsigned char* srcptr = source + length;
+ switch (length) {
+ default: return false;
+ // Everything else falls through when "true"...
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+ switch (*source) {
+ // no fall-through in this inner switch
+ case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
+ case 0xF0: if (a < 0x90) return false; break;
+ case 0xF4: if (a > 0x8F) return false; break;
+ default: if (a < 0x80) return false;
+ }
+
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+ }
+ if (*source > 0xF4)
+ return false;
+ return true;
+}
+
+// Magic values subtracted from a buffer value during UTF8 conversion.
+// This table contains as many values as there might be trailing bytes
+// in a UTF-8 sequence.
+static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+ 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+
+ConversionResult convertUTF8ToUTF16(
+ const char** sourceStart, const char* sourceEnd,
+ UChar** targetStart, UChar* targetEnd, bool strict)
+{
+ ConversionResult result = conversionOK;
+ const char* source = *sourceStart;
+ UChar* target = *targetStart;
+ while (source < sourceEnd) {
+ UChar32 ch = 0;
+ int extraBytesToRead = UTF8SequenceLength(*source) - 1;
+ if (source + extraBytesToRead >= sourceEnd) {
+ result = sourceExhausted;
+ break;
+ }
+ // Do this check whether lenient or strict
+ if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
+ result = sourceIllegal;
+ break;
+ }
+ // The cases all fall through.
+ switch (extraBytesToRead) {
+ case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
+ case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
+ case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+ case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+ case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
+ case 0: ch += static_cast<unsigned char>(*source++);
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
+
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead + 1); // Back up source pointer!
+ result = targetExhausted; break;
+ }
+ if (ch <= 0xFFFF) {
+ // UTF-16 surrogate values are illegal in UTF-32
+ if (ch >= 0xD800 && ch <= 0xDFFF) {
+ if (strict) {
+ source -= (extraBytesToRead + 1); // return to the illegal value itself
+ result = sourceIllegal;
+ break;
+ } else
+ *target++ = 0xFFFD;
+ } else
+ *target++ = (UChar)ch; // normal case
+ } else if (ch > 0x10FFFF) {
+ if (strict) {
+ result = sourceIllegal;
+ source -= (extraBytesToRead + 1); // return to the start
+ break; // Bail out; shouldn't continue
+ } else
+ *target++ = 0xFFFD;
+ } else {
+ // target is a character in range 0xFFFF - 0x10FFFF
+ if (target + 1 >= targetEnd) {
+ source -= (extraBytesToRead + 1); // Back up source pointer!
+ result = targetExhausted;
+ break;
+ }
+ ch -= 0x0010000UL;
+ *target++ = (UChar)((ch >> 10) + 0xD800);
+ *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
+ }
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+}
+}
diff --git a/JavaScriptCore/wtf/unicode/UTF8.h b/JavaScriptCore/wtf/unicode/UTF8.h
new file mode 100644
index 0000000..a5ed93e
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/UTF8.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2007 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef WTF_UTF8_h
+#define WTF_UTF8_h
+
+#include "Unicode.h"
+
+namespace WTF {
+ namespace Unicode {
+
+ // Given a first byte, gives the length of the UTF-8 sequence it begins.
+ // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
+ // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
+ int UTF8SequenceLength(char);
+
+ // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
+ // Only allows Unicode characters (U-00000000 to U-0010FFFF).
+ // Returns -1 if the sequence is not valid (including presence of extra bytes).
+ int decodeUTF8Sequence(const char*);
+
+ typedef enum {
+ conversionOK, // conversion successful
+ sourceExhausted, // partial character in source, but hit end
+ targetExhausted, // insuff. room in target for conversion
+ sourceIllegal // source sequence is illegal/malformed
+ } ConversionResult;
+
+ // These conversion functions take a "strict" argument. When this
+ // flag is set to strict, both irregular sequences and isolated surrogates
+ // will cause an error. When the flag is set to lenient, both irregular
+ // sequences and isolated surrogates are converted.
+ //
+ // Whether the flag is strict or lenient, all illegal sequences will cause
+ // an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
+ // or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
+ // must check for illegal sequences.
+ //
+ // When the flag is set to lenient, characters over 0x10FFFF are converted
+ // to the replacement character; otherwise (when the flag is set to strict)
+ // they constitute an error.
+
+ ConversionResult convertUTF8ToUTF16(
+ const char** sourceStart, const char* sourceEnd,
+ UChar** targetStart, UChar* targetEnd, bool strict = true);
+
+ ConversionResult convertUTF16ToUTF8(
+ const UChar** sourceStart, const UChar* sourceEnd,
+ char** targetStart, char* targetEnd, bool strict = true);
+ }
+}
+
+#endif // WTF_UTF8_h
diff --git a/JavaScriptCore/wtf/unicode/Unicode.h b/JavaScriptCore/wtf/unicode/Unicode.h
new file mode 100644
index 0000000..9cd3555
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/Unicode.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of the KDE libraries
+ * Copyright (C) 2006 George Staikos <staikos@kde.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef KJS_UNICODE_H
+#define KJS_UNICODE_H
+
+#include <wtf/Platform.h>
+
+#if USE(QT4_UNICODE)
+#include "qt4/UnicodeQt4.h"
+#elif USE(ICU_UNICODE)
+#include <wtf/unicode/icu/UnicodeIcu.h>
+#else
+#error "Unknown Unicode implementation"
+#endif
+
+#endif
+// vim: ts=2 sw=2 et
diff --git a/JavaScriptCore/wtf/unicode/icu/CollatorICU.cpp b/JavaScriptCore/wtf/unicode/icu/CollatorICU.cpp
new file mode 100644
index 0000000..a8bcc81
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/icu/CollatorICU.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "Collator.h"
+
+#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
+
+#include "Assertions.h"
+#include "Threading.h"
+#include <unicode/ucol.h>
+#include <string.h>
+
+#if PLATFORM(DARWIN)
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+namespace WTF {
+
+static UCollator* cachedCollator;
+static Mutex& cachedCollatorMutex()
+{
+ AtomicallyInitializedStatic(Mutex, mutex);
+ return mutex;
+}
+
+Collator::Collator(const char* locale)
+ : m_collator(0)
+ , m_locale(locale ? strdup(locale) : 0)
+ , m_lowerFirst(false)
+{
+}
+
+std::auto_ptr<Collator> Collator::userDefault()
+{
+#if PLATFORM(DARWIN) && PLATFORM(CF)
+ // Mac OS X doesn't set UNIX locale to match user-selected one, so ICU default doesn't work.
+ CFStringRef collationOrder = (CFStringRef)CFPreferencesCopyValue(CFSTR("AppleCollationOrder"), kCFPreferencesAnyApplication, kCFPreferencesCurrentUser, kCFPreferencesAnyHost);
+ char buf[256];
+ if (collationOrder) {
+ CFStringGetCString(collationOrder, buf, sizeof(buf), kCFStringEncodingASCII);
+ CFRelease(collationOrder);
+ return std::auto_ptr<Collator>(new Collator(buf));
+ } else
+ return std::auto_ptr<Collator>(new Collator(""));
+#else
+ return std::auto_ptr<Collator>(new Collator(0));
+#endif
+}
+
+Collator::~Collator()
+{
+ releaseCollator();
+ free(m_locale);
+}
+
+void Collator::setOrderLowerFirst(bool lowerFirst)
+{
+ m_lowerFirst = lowerFirst;
+}
+
+Collator::Result Collator::collate(const UChar* lhs, size_t lhsLength, const UChar* rhs, size_t rhsLength) const
+{
+ if (!m_collator)
+ createCollator();
+
+ return static_cast<Result>(ucol_strcoll(m_collator, lhs, lhsLength, rhs, rhsLength));
+}
+
+void Collator::createCollator() const
+{
+ ASSERT(!m_collator);
+ UErrorCode status = U_ZERO_ERROR;
+
+ {
+ Locker<Mutex> lock(cachedCollatorMutex());
+ if (cachedCollator) {
+ const char* cachedCollatorLocale = ucol_getLocaleByType(cachedCollator, ULOC_REQUESTED_LOCALE, &status);
+ ASSERT(U_SUCCESS(status));
+ ASSERT(cachedCollatorLocale);
+
+ UColAttributeValue cachedCollatorLowerFirst = ucol_getAttribute(cachedCollator, UCOL_CASE_FIRST, &status);
+ ASSERT(U_SUCCESS(status));
+
+ // FIXME: default locale is never matched, because ucol_getLocaleByType returns the actual one used, not 0.
+ if (m_locale && 0 == strcmp(cachedCollatorLocale, m_locale)
+ && ((UCOL_LOWER_FIRST == cachedCollatorLowerFirst && m_lowerFirst) || (UCOL_UPPER_FIRST == cachedCollatorLowerFirst && !m_lowerFirst))) {
+ m_collator = cachedCollator;
+ cachedCollator = 0;
+ return;
+ }
+ }
+ }
+
+ m_collator = ucol_open(m_locale, &status);
+ if (U_FAILURE(status)) {
+ status = U_ZERO_ERROR;
+ m_collator = ucol_open("", &status); // Fallback to Unicode Collation Algorithm.
+ }
+ ASSERT(U_SUCCESS(status));
+
+ ucol_setAttribute(m_collator, UCOL_CASE_FIRST, m_lowerFirst ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST, &status);
+ ASSERT(U_SUCCESS(status));
+}
+
+void Collator::releaseCollator()
+{
+ {
+ Locker<Mutex> lock(cachedCollatorMutex());
+ if (cachedCollator)
+ ucol_close(cachedCollator);
+ cachedCollator = m_collator;
+ m_collator = 0;
+ }
+}
+
+}
+
+#endif
diff --git a/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h b/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h
new file mode 100644
index 0000000..7cdc55c
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2006 George Staikos <staikos@kde.org>
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ * Copyright (C) 2007 Apple Inc. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef KJS_UNICODE_ICU_H
+#define KJS_UNICODE_ICU_H
+
+#include <stdlib.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#include <unicode/utf16.h>
+
+namespace WTF {
+namespace Unicode {
+
+enum Direction {
+ LeftToRight = U_LEFT_TO_RIGHT,
+ RightToLeft = U_RIGHT_TO_LEFT,
+ EuropeanNumber = U_EUROPEAN_NUMBER,
+ EuropeanNumberSeparator = U_EUROPEAN_NUMBER_SEPARATOR,
+ EuropeanNumberTerminator = U_EUROPEAN_NUMBER_TERMINATOR,
+ ArabicNumber = U_ARABIC_NUMBER,
+ CommonNumberSeparator = U_COMMON_NUMBER_SEPARATOR,
+ BlockSeparator = U_BLOCK_SEPARATOR,
+ SegmentSeparator = U_SEGMENT_SEPARATOR,
+ WhiteSpaceNeutral = U_WHITE_SPACE_NEUTRAL,
+ OtherNeutral = U_OTHER_NEUTRAL,
+ LeftToRightEmbedding = U_LEFT_TO_RIGHT_EMBEDDING,
+ LeftToRightOverride = U_LEFT_TO_RIGHT_OVERRIDE,
+ RightToLeftArabic = U_RIGHT_TO_LEFT_ARABIC,
+ RightToLeftEmbedding = U_RIGHT_TO_LEFT_EMBEDDING,
+ RightToLeftOverride = U_RIGHT_TO_LEFT_OVERRIDE,
+ PopDirectionalFormat = U_POP_DIRECTIONAL_FORMAT,
+ NonSpacingMark = U_DIR_NON_SPACING_MARK,
+ BoundaryNeutral = U_BOUNDARY_NEUTRAL
+};
+
+enum DecompositionType {
+ DecompositionNone = U_DT_NONE,
+ DecompositionCanonical = U_DT_CANONICAL,
+ DecompositionCompat = U_DT_COMPAT,
+ DecompositionCircle = U_DT_CIRCLE,
+ DecompositionFinal = U_DT_FINAL,
+ DecompositionFont = U_DT_FONT,
+ DecompositionFraction = U_DT_FRACTION,
+ DecompositionInitial = U_DT_INITIAL,
+ DecompositionIsolated = U_DT_ISOLATED,
+ DecompositionMedial = U_DT_MEDIAL,
+ DecompositionNarrow = U_DT_NARROW,
+ DecompositionNoBreak = U_DT_NOBREAK,
+ DecompositionSmall = U_DT_SMALL,
+ DecompositionSquare = U_DT_SQUARE,
+ DecompositionSub = U_DT_SUB,
+ DecompositionSuper = U_DT_SUPER,
+ DecompositionVertical = U_DT_VERTICAL,
+ DecompositionWide = U_DT_WIDE,
+};
+
+enum CharCategory {
+ NoCategory = 0,
+ Other_NotAssigned = U_MASK(U_GENERAL_OTHER_TYPES),
+ Letter_Uppercase = U_MASK(U_UPPERCASE_LETTER),
+ Letter_Lowercase = U_MASK(U_LOWERCASE_LETTER),
+ Letter_Titlecase = U_MASK(U_TITLECASE_LETTER),
+ Letter_Modifier = U_MASK(U_MODIFIER_LETTER),
+ Letter_Other = U_MASK(U_OTHER_LETTER),
+
+ Mark_NonSpacing = U_MASK(U_NON_SPACING_MARK),
+ Mark_Enclosing = U_MASK(U_ENCLOSING_MARK),
+ Mark_SpacingCombining = U_MASK(U_COMBINING_SPACING_MARK),
+
+ Number_DecimalDigit = U_MASK(U_DECIMAL_DIGIT_NUMBER),
+ Number_Letter = U_MASK(U_LETTER_NUMBER),
+ Number_Other = U_MASK(U_OTHER_NUMBER),
+
+ Separator_Space = U_MASK(U_SPACE_SEPARATOR),
+ Separator_Line = U_MASK(U_LINE_SEPARATOR),
+ Separator_Paragraph = U_MASK(U_PARAGRAPH_SEPARATOR),
+
+ Other_Control = U_MASK(U_CONTROL_CHAR),
+ Other_Format = U_MASK(U_FORMAT_CHAR),
+ Other_PrivateUse = U_MASK(U_PRIVATE_USE_CHAR),
+ Other_Surrogate = U_MASK(U_SURROGATE),
+
+ Punctuation_Dash = U_MASK(U_DASH_PUNCTUATION),
+ Punctuation_Open = U_MASK(U_START_PUNCTUATION),
+ Punctuation_Close = U_MASK(U_END_PUNCTUATION),
+ Punctuation_Connector = U_MASK(U_CONNECTOR_PUNCTUATION),
+ Punctuation_Other = U_MASK(U_OTHER_PUNCTUATION),
+
+ Symbol_Math = U_MASK(U_MATH_SYMBOL),
+ Symbol_Currency = U_MASK(U_CURRENCY_SYMBOL),
+ Symbol_Modifier = U_MASK(U_MODIFIER_SYMBOL),
+ Symbol_Other = U_MASK(U_OTHER_SYMBOL),
+
+ Punctuation_InitialQuote = U_MASK(U_INITIAL_PUNCTUATION),
+ Punctuation_FinalQuote = U_MASK(U_FINAL_PUNCTUATION)
+};
+
+inline UChar32 foldCase(UChar32 c)
+{
+ return u_foldCase(c, U_FOLD_CASE_DEFAULT);
+}
+
+inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ UErrorCode status = U_ZERO_ERROR;
+ int realLength = u_strFoldCase(result, resultLength, src, srcLength, U_FOLD_CASE_DEFAULT, &status);
+ *error = !U_SUCCESS(status);
+ return realLength;
+}
+
+inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ UErrorCode status = U_ZERO_ERROR;
+ int realLength = u_strToLower(result, resultLength, src, srcLength, "", &status);
+ *error = !!U_FAILURE(status);
+ return realLength;
+}
+
+inline UChar32 toLower(UChar32 c)
+{
+ return u_tolower(c);
+}
+
+inline UChar32 toUpper(UChar32 c)
+{
+ return u_toupper(c);
+}
+
+inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ UErrorCode status = U_ZERO_ERROR;
+ int realLength = u_strToUpper(result, resultLength, src, srcLength, "", &status);
+ *error = !!U_FAILURE(status);
+ return realLength;
+}
+
+inline UChar32 toTitleCase(UChar32 c)
+{
+ return u_totitle(c);
+}
+
+inline bool isArabicChar(UChar32 c)
+{
+ return ublock_getCode(c) == UBLOCK_ARABIC;
+}
+
+inline bool isSeparatorSpace(UChar32 c)
+{
+ return u_charType(c) == U_SPACE_SEPARATOR;
+}
+
+inline bool isPrintableChar(UChar32 c)
+{
+ return !!u_isprint(c);
+}
+
+inline bool isPunct(UChar32 c)
+{
+ return !!u_ispunct(c);
+}
+
+inline UChar32 mirroredChar(UChar32 c)
+{
+ return u_charMirror(c);
+}
+
+inline CharCategory category(UChar32 c)
+{
+ return static_cast<CharCategory>(U_GET_GC_MASK(c));
+}
+
+inline Direction direction(UChar32 c)
+{
+ return static_cast<Direction>(u_charDirection(c));
+}
+
+inline bool isLower(UChar32 c)
+{
+ return !!u_islower(c);
+}
+
+inline uint8_t combiningClass(UChar32 c)
+{
+ return u_getCombiningClass(c);
+}
+
+inline DecompositionType decompositionType(UChar32 c)
+{
+ return static_cast<DecompositionType>(u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE));
+}
+
+inline int umemcasecmp(const UChar* a, const UChar* b, int len)
+{
+ return u_memcasecmp(a, b, len, U_FOLD_CASE_DEFAULT);
+}
+
+} }
+
+#endif
diff --git a/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h b/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h
new file mode 100644
index 0000000..d285a8f
--- /dev/null
+++ b/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright (C) 2006 George Staikos <staikos@kde.org>
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef KJS_UNICODE_QT4_H
+#define KJS_UNICODE_QT4_H
+
+#include <QChar>
+#include <QString>
+
+#include <config.h>
+
+#include <stdint.h>
+
+#if QT_VERSION >= 0x040300
+QT_BEGIN_NAMESPACE
+namespace QUnicodeTables {
+ struct Properties {
+ ushort category : 8;
+ ushort line_break_class : 8;
+ ushort direction : 8;
+ ushort combiningClass :8;
+ ushort joining : 2;
+ signed short digitValue : 6; /* 5 needed */
+ ushort unicodeVersion : 4;
+ ushort lowerCaseSpecial : 1;
+ ushort upperCaseSpecial : 1;
+ ushort titleCaseSpecial : 1;
+ ushort caseFoldSpecial : 1; /* currently unused */
+ signed short mirrorDiff : 16;
+ signed short lowerCaseDiff : 16;
+ signed short upperCaseDiff : 16;
+ signed short titleCaseDiff : 16;
+ signed short caseFoldDiff : 16;
+ };
+ Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);
+ Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);
+}
+QT_END_NAMESPACE
+#endif
+
+// ugly hack to make UChar compatible with JSChar in API/JSStringRef.h
+#if defined(Q_OS_WIN)
+typedef wchar_t UChar;
+#else
+typedef uint16_t UChar;
+#endif
+typedef uint32_t UChar32;
+
+// some defines from ICU
+
+#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
+#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+#define U16_GET_SUPPLEMENTARY(lead, trail) \
+ (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
+
+#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
+#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
+
+#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
+#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
+#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
+#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+#define U16_NEXT(s, i, length, c) { \
+ (c)=(s)[(i)++]; \
+ if(U16_IS_LEAD(c)) { \
+ uint16_t __c2; \
+ if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } \
+}
+
+#define U_MASK(x) ((uint32_t)1<<(x))
+
+namespace WTF {
+namespace Unicode {
+
+enum Direction {
+ LeftToRight = QChar::DirL,
+ RightToLeft = QChar::DirR,
+ EuropeanNumber = QChar::DirEN,
+ EuropeanNumberSeparator = QChar::DirES,
+ EuropeanNumberTerminator = QChar::DirET,
+ ArabicNumber = QChar::DirAN,
+ CommonNumberSeparator = QChar::DirCS,
+ BlockSeparator = QChar::DirB,
+ SegmentSeparator = QChar::DirS,
+ WhiteSpaceNeutral = QChar::DirWS,
+ OtherNeutral = QChar::DirON,
+ LeftToRightEmbedding = QChar::DirLRE,
+ LeftToRightOverride = QChar::DirLRO,
+ RightToLeftArabic = QChar::DirAL,
+ RightToLeftEmbedding = QChar::DirRLE,
+ RightToLeftOverride = QChar::DirRLO,
+ PopDirectionalFormat = QChar::DirPDF,
+ NonSpacingMark = QChar::DirNSM,
+ BoundaryNeutral = QChar::DirBN
+};
+
+enum DecompositionType {
+ DecompositionNone = QChar::NoDecomposition,
+ DecompositionCanonical = QChar::Canonical,
+ DecompositionCompat = QChar::Compat,
+ DecompositionCircle = QChar::Circle,
+ DecompositionFinal = QChar::Final,
+ DecompositionFont = QChar::Font,
+ DecompositionFraction = QChar::Fraction,
+ DecompositionInitial = QChar::Initial,
+ DecompositionIsolated = QChar::Isolated,
+ DecompositionMedial = QChar::Medial,
+ DecompositionNarrow = QChar::Narrow,
+ DecompositionNoBreak = QChar::NoBreak,
+ DecompositionSmall = QChar::Small,
+ DecompositionSquare = QChar::Square,
+ DecompositionSub = QChar::Sub,
+ DecompositionSuper = QChar::Super,
+ DecompositionVertical = QChar::Vertical,
+ DecompositionWide = QChar::Wide
+};
+
+enum CharCategory {
+ NoCategory = 0,
+ Mark_NonSpacing = U_MASK(QChar::Mark_NonSpacing),
+ Mark_SpacingCombining = U_MASK(QChar::Mark_SpacingCombining),
+ Mark_Enclosing = U_MASK(QChar::Mark_Enclosing),
+ Number_DecimalDigit = U_MASK(QChar::Number_DecimalDigit),
+ Number_Letter = U_MASK(QChar::Number_Letter),
+ Number_Other = U_MASK(QChar::Number_Other),
+ Separator_Space = U_MASK(QChar::Separator_Space),
+ Separator_Line = U_MASK(QChar::Separator_Line),
+ Separator_Paragraph = U_MASK(QChar::Separator_Paragraph),
+ Other_Control = U_MASK(QChar::Other_Control),
+ Other_Format = U_MASK(QChar::Other_Format),
+ Other_Surrogate = U_MASK(QChar::Other_Surrogate),
+ Other_PrivateUse = U_MASK(QChar::Other_PrivateUse),
+ Other_NotAssigned = U_MASK(QChar::Other_NotAssigned),
+ Letter_Uppercase = U_MASK(QChar::Letter_Uppercase),
+ Letter_Lowercase = U_MASK(QChar::Letter_Lowercase),
+ Letter_Titlecase = U_MASK(QChar::Letter_Titlecase),
+ Letter_Modifier = U_MASK(QChar::Letter_Modifier),
+ Letter_Other = U_MASK(QChar::Letter_Other),
+ Punctuation_Connector = U_MASK(QChar::Punctuation_Connector),
+ Punctuation_Dash = U_MASK(QChar::Punctuation_Dash),
+ Punctuation_Open = U_MASK(QChar::Punctuation_Open),
+ Punctuation_Close = U_MASK(QChar::Punctuation_Close),
+ Punctuation_InitialQuote = U_MASK(QChar::Punctuation_InitialQuote),
+ Punctuation_FinalQuote = U_MASK(QChar::Punctuation_FinalQuote),
+ Punctuation_Other = U_MASK(QChar::Punctuation_Other),
+ Symbol_Math = U_MASK(QChar::Symbol_Math),
+ Symbol_Currency = U_MASK(QChar::Symbol_Currency),
+ Symbol_Modifier = U_MASK(QChar::Symbol_Modifier),
+ Symbol_Other = U_MASK(QChar::Symbol_Other),
+};
+
+
+#if QT_VERSION >= 0x040300
+
+// FIXME: handle surrogates correctly in all methods
+
+inline UChar32 toLower(UChar32 ch)
+{
+ return QChar::toLower(ch);
+}
+
+inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ const UChar *e = src + srcLength;
+ const UChar *s = src;
+ UChar *r = result;
+ uint rindex = 0;
+
+ // this avoids one out of bounds check in the loop
+ if (s < e && QChar(*s).isLowSurrogate()) {
+ if (r)
+ r[rindex] = *s++;
+ ++rindex;
+ }
+
+ int needed = 0;
+ while (s < e && (rindex < uint(resultLength) || !r)) {
+ uint c = *s;
+ if (QChar(c).isLowSurrogate() && QChar(*(s - 1)).isHighSurrogate())
+ c = QChar::surrogateToUcs4(*(s - 1), c);
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(c);
+ if (prop->lowerCaseSpecial) {
+ QString qstring;
+ if (c < 0x10000) {
+ qstring += QChar(c);
+ } else {
+ qstring += QChar(*(s-1));
+ qstring += QChar(*s);
+ }
+ qstring = qstring.toLower();
+ for (int i = 0; i < qstring.length(); ++i) {
+ if (rindex >= uint(resultLength)) {
+ needed += qstring.length() - i;
+ break;
+ }
+ if (r)
+ r[rindex] = qstring.at(i).unicode();
+ ++rindex;
+ }
+ } else {
+ if (r)
+ r[rindex] = *s + prop->lowerCaseDiff;
+ ++rindex;
+ }
+ ++s;
+ }
+ if (s < e)
+ needed += e - s;
+ *error = (needed != 0);
+ if (rindex < uint(resultLength))
+ r[rindex] = 0;
+ return rindex + needed;
+}
+
+inline UChar32 toUpper(UChar32 ch)
+{
+ return QChar::toUpper(ch);
+}
+
+inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ const UChar *e = src + srcLength;
+ const UChar *s = src;
+ UChar *r = result;
+ int rindex = 0;
+
+ // this avoids one out of bounds check in the loop
+ if (s < e && QChar(*s).isLowSurrogate()) {
+ if (r)
+ r[rindex] = *s++;
+ ++rindex;
+ }
+
+ int needed = 0;
+ while (s < e && (rindex < resultLength || !r)) {
+ uint c = *s;
+ if (QChar(c).isLowSurrogate() && QChar(*(s - 1)).isHighSurrogate())
+ c = QChar::surrogateToUcs4(*(s - 1), c);
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(c);
+ if (prop->upperCaseSpecial) {
+ QString qstring;
+ if (c < 0x10000) {
+ qstring += QChar(c);
+ } else {
+ qstring += QChar(*(s-1));
+ qstring += QChar(*s);
+ }
+ qstring = qstring.toUpper();
+ for (int i = 0; i < qstring.length(); ++i) {
+ if (rindex >= resultLength) {
+ needed += qstring.length() - i;
+ break;
+ }
+ if (r)
+ r[rindex] = qstring.at(i).unicode();
+ ++rindex;
+ }
+ } else {
+ if (r)
+ r[rindex] = *s + prop->upperCaseDiff;
+ ++rindex;
+ }
+ ++s;
+ }
+ if (s < e)
+ needed += e - s;
+ *error = (needed != 0);
+ if (rindex < resultLength)
+ r[rindex] = 0;
+ return rindex + needed;
+}
+
+inline int toTitleCase(UChar32 c)
+{
+ return QChar::toTitleCase(c);
+}
+
+inline UChar32 foldCase(UChar32 c)
+{
+ return QChar::toCaseFolded(c);
+}
+
+inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ // FIXME: handle special casing. Easiest with some low level API in Qt
+ *error = false;
+ if (resultLength < srcLength) {
+ *error = true;
+ return srcLength;
+ }
+ for (int i = 0; i < srcLength; ++i)
+ result[i] = QChar::toCaseFolded(ushort(src[i]));
+ return srcLength;
+}
+
+inline bool isArabicChar(UChar32 c)
+{
+ return c >= 0x0600 && c <= 0x06FF;
+}
+
+inline bool isPrintableChar(UChar32 c)
+{
+ const uint test = U_MASK(QChar::Other_Control) |
+ U_MASK(QChar::Other_NotAssigned);
+ return !(U_MASK(QChar::category(c)) & test);
+}
+
+inline bool isSeparatorSpace(UChar32 c)
+{
+ return QChar::category(c) == QChar::Separator_Space;
+}
+
+inline bool isPunct(UChar32 c)
+{
+ const uint test = U_MASK(QChar::Punctuation_Connector) |
+ U_MASK(QChar::Punctuation_Dash) |
+ U_MASK(QChar::Punctuation_Open) |
+ U_MASK(QChar::Punctuation_Close) |
+ U_MASK(QChar::Punctuation_InitialQuote) |
+ U_MASK(QChar::Punctuation_FinalQuote) |
+ U_MASK(QChar::Punctuation_Other);
+ return U_MASK(QChar::category(c)) & test;
+}
+
+inline bool isLower(UChar32 c)
+{
+ return QChar::category(c) == QChar::Letter_Lowercase;
+}
+
+inline UChar32 mirroredChar(UChar32 c)
+{
+ return QChar::mirroredChar(c);
+}
+
+inline uint8_t combiningClass(UChar32 c)
+{
+ return QChar::combiningClass(c);
+}
+
+inline DecompositionType decompositionType(UChar32 c)
+{
+ return (DecompositionType)QChar::decompositionTag(c);
+}
+
+inline int umemcasecmp(const UChar* a, const UChar* b, int len)
+{
+ // handle surrogates correctly
+ for (int i = 0; i < len; ++i) {
+ uint c1 = QChar::toCaseFolded(ushort(a[i]));
+ uint c2 = QChar::toCaseFolded(ushort(b[i]));
+ if (c1 != c2)
+ return c1 - c2;
+ }
+ return 0;
+}
+
+inline Direction direction(UChar32 c)
+{
+ return (Direction)QChar::direction(c);
+}
+
+inline CharCategory category(UChar32 c)
+{
+ return (CharCategory) U_MASK(QChar::category(c));
+}
+
+#else
+
+inline UChar32 toLower(UChar32 ch)
+{
+ if (ch > 0xffff)
+ return ch;
+ return QChar((unsigned short)ch).toLower().unicode();
+}
+
+inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ *error = false;
+ if (resultLength < srcLength) {
+ *error = true;
+ return srcLength;
+ }
+ for (int i = 0; i < srcLength; ++i)
+ result[i] = QChar(src[i]).toLower().unicode();
+ return srcLength;
+}
+
+inline UChar32 toUpper(UChar32 ch)
+{
+ if (ch > 0xffff)
+ return ch;
+ return QChar((unsigned short)ch).toUpper().unicode();
+}
+
+inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ *error = false;
+ if (resultLength < srcLength) {
+ *error = true;
+ return srcLength;
+ }
+ for (int i = 0; i < srcLength; ++i)
+ result[i] = QChar(src[i]).toUpper().unicode();
+ return srcLength;
+}
+
+inline int toTitleCase(UChar32 c)
+{
+ if (c > 0xffff)
+ return c;
+ return QChar((unsigned short)c).toUpper().unicode();
+}
+
+inline UChar32 foldCase(UChar32 c)
+{
+ if (c > 0xffff)
+ return c;
+ return QChar((unsigned short)c).toLower().unicode();
+}
+
+inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
+{
+ return toLower(result, resultLength, src, srcLength, error);
+}
+
+inline bool isPrintableChar(UChar32 c)
+{
+ return (c & 0xffff0000) == 0 && QChar((unsigned short)c).isPrint();
+}
+
+inline bool isArabicChar(UChar32 c)
+{
+ return c >= 0x0600 && c <= 0x06FF;
+}
+
+inline bool isSeparatorSpace(UChar32 c)
+{
+ return (c & 0xffff0000) == 0 && QChar((unsigned short)c).category() == QChar::Separator_Space;
+}
+
+inline bool isPunct(UChar32 c)
+{
+ return (c & 0xffff0000) == 0 && QChar((unsigned short)c).isPunct();
+}
+
+inline bool isLower(UChar32 c)
+{
+ return (c & 0xffff0000) == 0 && QChar((unsigned short)c).category() == QChar::Letter_Lowercase;
+}
+
+inline UChar32 mirroredChar(UChar32 c)
+{
+ if (c > 0xffff)
+ return c;
+ return QChar(c).mirroredChar().unicode();
+}
+
+inline uint8_t combiningClass(UChar32 c)
+{
+ if (c > 0xffff)
+ return 0;
+ return QChar((unsigned short)c).combiningClass();
+}
+
+inline DecompositionType decompositionType(UChar32 c)
+{
+ if (c > 0xffff)
+ return DecompositionNone;
+ return (DecompositionType)QChar(c).decompositionTag();
+}
+
+inline int umemcasecmp(const UChar* a, const UChar* b, int len)
+{
+ for (int i = 0; i < len; ++i) {
+ QChar c1 = QChar(a[i]).toLower();
+ QChar c2 = QChar(b[i]).toLower();
+ if (c1 != c2)
+ return c1.unicode() - c2.unicode();
+ }
+ return 0;
+}
+
+inline Direction direction(UChar32 c)
+{
+ if (c > 0xffff)
+ return LeftToRight;
+ return (Direction)QChar(c).direction();
+}
+
+inline CharCategory category(UChar32 c)
+{
+ if (c > 0xffff)
+ return NoCategory;
+ return (CharCategory) U_MASK(QChar(c).category());
+}
+
+#endif
+
+} }
+
+#endif