diff options
author | Upstream <upstream-import@none> | 1970-01-12 13:46:40 +0000 |
---|---|---|
committer | Upstream <upstream-import@none> | 1970-01-12 13:46:40 +0000 |
commit | d8543bb6618c17b12da906afa77d216f58cf4058 (patch) | |
tree | c58dc05ed86825bd0ef8d305d58c8205106b540f /JavaScriptCore/wtf/unicode | |
download | external_webkit-d8543bb6618c17b12da906afa77d216f58cf4058.zip external_webkit-d8543bb6618c17b12da906afa77d216f58cf4058.tar.gz external_webkit-d8543bb6618c17b12da906afa77d216f58cf4058.tar.bz2 |
external/webkit r30707
Diffstat (limited to 'JavaScriptCore/wtf/unicode')
-rw-r--r-- | JavaScriptCore/wtf/unicode/UTF8.cpp | 303 | ||||
-rw-r--r-- | JavaScriptCore/wtf/unicode/UTF8.h | 75 | ||||
-rw-r--r-- | JavaScriptCore/wtf/unicode/Unicode.h | 37 | ||||
-rw-r--r-- | JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h | 239 | ||||
-rw-r--r-- | JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h | 547 |
5 files changed, 1201 insertions, 0 deletions
diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp new file mode 100644 index 0000000..9e713fe --- /dev/null +++ b/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "UTF8.h" + +namespace WTF { +namespace Unicode { + +inline int inlineUTF8SequenceLengthNonASCII(char b0) +{ + if ((b0 & 0xC0) != 0xC0) + return 0; + if ((b0 & 0xE0) == 0xC0) + return 2; + if ((b0 & 0xF0) == 0xE0) + return 3; + if ((b0 & 0xF8) == 0xF0) + return 4; + return 0; +} + +inline int inlineUTF8SequenceLength(char b0) +{ + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int UTF8SequenceLength(char b0) +{ + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int decodeUTF8Sequence(const char* sequence) +{ + // Handle 0-byte sequences (never valid). + const unsigned char b0 = sequence[0]; + const int length = inlineUTF8SequenceLength(b0); + if (length == 0) + return -1; + + // Handle 1-byte sequences (plain ASCII). + const unsigned char b1 = sequence[1]; + if (length == 1) { + if (b1) + return -1; + return b0; + } + + // Handle 2-byte sequences. + if ((b1 & 0xC0) != 0x80) + return -1; + const unsigned char b2 = sequence[2]; + if (length == 2) { + if (b2) + return -1; + const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); + if (c < 0x80) + return -1; + return c; + } + + // Handle 3-byte sequences. + if ((b2 & 0xC0) != 0x80) + return -1; + const unsigned char b3 = sequence[3]; + if (length == 3) { + if (b3) + return -1; + const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + if (c < 0x800) + return -1; + // UTF-16 surrogates should never appear in UTF-8 data. + if (c >= 0xD800 && c <= 0xDFFF) + return -1; + return c; + } + + // Handle 4-byte sequences. + if ((b3 & 0xC0) != 0x80) + return -1; + const unsigned char b4 = sequence[4]; + if (length == 4) { + if (b4) + return -1; + const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); + if (c < 0x10000 || c > 0x10FFFF) + return -1; + return c; + } + + return -1; +} + +// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed +// into the first byte, depending on how many bytes follow. There are +// as many entries in this table as there are UTF-8 sequence types. +// (I.e., one byte sequence, two byte... etc.). Remember that sequencs +// for *legal* UTF-8 will be 4 or fewer bytes total. +static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +ConversionResult convertUTF16ToUTF8( + const UChar** sourceStart, const UChar* sourceEnd, + char** targetStart, char* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const UChar* source = *sourceStart; + char* target = *targetStart; + while (source < sourceEnd) { + UChar32 ch; + unsigned short bytesToWrite = 0; + const UChar32 byteMask = 0xBF; + const UChar32 byteMark = 0x80; + const UChar* oldSource = source; // In case we have to back up because of target overflow. + ch = static_cast<unsigned short>(*source++); + // If we have a surrogate pair, convert to UChar32 first. + if (ch >= 0xD800 && ch <= 0xDBFF) { + // If the 16 bits following the high surrogate are in the source buffer... + if (source < sourceEnd) { + UChar32 ch2 = static_cast<unsigned short>(*source); + // If it's a low surrogate, convert to UChar32. + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; + ++source; + } else if (strict) { // it's an unpaired high surrogate + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } else { // We don't have the 16 bits following the high surrogate. + --source; // return to the high surrogate + result = sourceExhausted; + break; + } + } else if (strict) { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= 0xDC00 && ch <= 0xDFFF) { + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } + // Figure out how many bytes the result will require + if (ch < (UChar32)0x80) { + bytesToWrite = 1; + } else if (ch < (UChar32)0x800) { + bytesToWrite = 2; + } else if (ch < (UChar32)0x10000) { + bytesToWrite = 3; + } else if (ch < (UChar32)0x110000) { + bytesToWrite = 4; + } else { + bytesToWrite = 3; + ch = 0xFFFD; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; // Back up source pointer! + target -= bytesToWrite; + result = targetExhausted; + break; + } + switch (bytesToWrite) { // note: everything falls through. + case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +// This must be called with the length pre-determined by the first byte. +// If presented with a length > 4, this returns false. The Unicode +// definition of UTF-8 goes up to 4-byte sequences. +static bool isLegalUTF8(const unsigned char* source, int length) +{ + unsigned char a; + const unsigned char* srcptr = source + length; + switch (length) { + default: return false; + // Everything else falls through when "true"... + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + // no fall-through in this inner switch + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) + return false; + return true; +} + +// Magic values subtracted from a buffer value during UTF8 conversion. +// This table contains as many values as there might be trailing bytes +// in a UTF-8 sequence. +static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +ConversionResult convertUTF8ToUTF16( + const char** sourceStart, const char* sourceEnd, + UChar** targetStart, UChar* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const char* source = *sourceStart; + UChar* target = *targetStart; + while (source < sourceEnd) { + UChar32 ch = 0; + int extraBytesToRead = UTF8SequenceLength(*source) - 1; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; + break; + } + // Do this check whether lenient or strict + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { + result = sourceIllegal; + break; + } + // The cases all fall through. + switch (extraBytesToRead) { + case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 + case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 + case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; + case 0: ch += static_cast<unsigned char>(*source++); + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead + 1); // Back up source pointer! + result = targetExhausted; break; + } + if (ch <= 0xFFFF) { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= 0xD800 && ch <= 0xDFFF) { + if (strict) { + source -= (extraBytesToRead + 1); // return to the illegal value itself + result = sourceIllegal; + break; + } else + *target++ = 0xFFFD; + } else + *target++ = (UChar)ch; // normal case + } else if (ch > 0x10FFFF) { + if (strict) { + result = sourceIllegal; + source -= (extraBytesToRead + 1); // return to the start + break; // Bail out; shouldn't continue + } else + *target++ = 0xFFFD; + } else { + // target is a character in range 0xFFFF - 0x10FFFF + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead + 1); // Back up source pointer! + result = targetExhausted; + break; + } + ch -= 0x0010000UL; + *target++ = (UChar)((ch >> 10) + 0xD800); + *target++ = (UChar)((ch & 0x03FF) + 0xDC00); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +} +} diff --git a/JavaScriptCore/wtf/unicode/UTF8.h b/JavaScriptCore/wtf/unicode/UTF8.h new file mode 100644 index 0000000..a5ed93e --- /dev/null +++ b/JavaScriptCore/wtf/unicode/UTF8.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef WTF_UTF8_h +#define WTF_UTF8_h + +#include "Unicode.h" + +namespace WTF { + namespace Unicode { + + // Given a first byte, gives the length of the UTF-8 sequence it begins. + // Returns 0 for bytes that are not legal starts of UTF-8 sequences. + // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). + int UTF8SequenceLength(char); + + // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. + // Only allows Unicode characters (U-00000000 to U-0010FFFF). + // Returns -1 if the sequence is not valid (including presence of extra bytes). + int decodeUTF8Sequence(const char*); + + typedef enum { + conversionOK, // conversion successful + sourceExhausted, // partial character in source, but hit end + targetExhausted, // insuff. room in target for conversion + sourceIllegal // source sequence is illegal/malformed + } ConversionResult; + + // These conversion functions take a "strict" argument. When this + // flag is set to strict, both irregular sequences and isolated surrogates + // will cause an error. When the flag is set to lenient, both irregular + // sequences and isolated surrogates are converted. + // + // Whether the flag is strict or lenient, all illegal sequences will cause + // an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, + // or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + // must check for illegal sequences. + // + // When the flag is set to lenient, characters over 0x10FFFF are converted + // to the replacement character; otherwise (when the flag is set to strict) + // they constitute an error. + + ConversionResult convertUTF8ToUTF16( + const char** sourceStart, const char* sourceEnd, + UChar** targetStart, UChar* targetEnd, bool strict = true); + + ConversionResult convertUTF16ToUTF8( + const UChar** sourceStart, const UChar* sourceEnd, + char** targetStart, char* targetEnd, bool strict = true); + } +} + +#endif // WTF_UTF8_h diff --git a/JavaScriptCore/wtf/unicode/Unicode.h b/JavaScriptCore/wtf/unicode/Unicode.h new file mode 100644 index 0000000..f890afc --- /dev/null +++ b/JavaScriptCore/wtf/unicode/Unicode.h @@ -0,0 +1,37 @@ +// -*- c-basic-offset: 2 -*- +/* + * This file is part of the KDE libraries + * Copyright (C) 2006 George Staikos <staikos@kde.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef KJS_UNICODE_H +#define KJS_UNICODE_H + +#include <wtf/Platform.h> + +#if USE(QT4_UNICODE) +#include "qt4/UnicodeQt4.h" +#elif USE(ICU_UNICODE) +#include <wtf/unicode/icu/UnicodeIcu.h> +#else +#error "Unknown Unicode implementation" +#endif + +#endif +// vim: ts=2 sw=2 et diff --git a/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h b/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h new file mode 100644 index 0000000..cc9ab8c --- /dev/null +++ b/JavaScriptCore/wtf/unicode/icu/UnicodeIcu.h @@ -0,0 +1,239 @@ +// -*- c-basic-offset: 2 -*- +/* + * This file is part of the KDE libraries + * Copyright (C) 2006 George Staikos <staikos@kde.org> + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * Copyright (C) 2007 Apple Computer, Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef KJS_UNICODE_ICU_H +#define KJS_UNICODE_ICU_H + +#include <unicode/uchar.h> +#include <unicode/ustring.h> +#include <unicode/utf16.h> + +#include <stdlib.h> + +namespace WTF { + namespace Unicode { + + enum Direction { + LeftToRight = U_LEFT_TO_RIGHT, + RightToLeft = U_RIGHT_TO_LEFT, + EuropeanNumber = U_EUROPEAN_NUMBER, + EuropeanNumberSeparator = U_EUROPEAN_NUMBER_SEPARATOR, + EuropeanNumberTerminator = U_EUROPEAN_NUMBER_TERMINATOR, + ArabicNumber = U_ARABIC_NUMBER, + CommonNumberSeparator = U_COMMON_NUMBER_SEPARATOR, + BlockSeparator = U_BLOCK_SEPARATOR, + SegmentSeparator = U_SEGMENT_SEPARATOR, + WhiteSpaceNeutral = U_WHITE_SPACE_NEUTRAL, + OtherNeutral = U_OTHER_NEUTRAL, + LeftToRightEmbedding = U_LEFT_TO_RIGHT_EMBEDDING, + LeftToRightOverride = U_LEFT_TO_RIGHT_OVERRIDE, + RightToLeftArabic = U_RIGHT_TO_LEFT_ARABIC, + RightToLeftEmbedding = U_RIGHT_TO_LEFT_EMBEDDING, + RightToLeftOverride = U_RIGHT_TO_LEFT_OVERRIDE, + PopDirectionalFormat = U_POP_DIRECTIONAL_FORMAT, + NonSpacingMark = U_DIR_NON_SPACING_MARK, + BoundaryNeutral = U_BOUNDARY_NEUTRAL + }; + + enum DecompositionType { + DecompositionNone = U_DT_NONE, + DecompositionCanonical = U_DT_CANONICAL, + DecompositionCompat = U_DT_COMPAT, + DecompositionCircle = U_DT_CIRCLE, + DecompositionFinal = U_DT_FINAL, + DecompositionFont = U_DT_FONT, + DecompositionFraction = U_DT_FRACTION, + DecompositionInitial = U_DT_INITIAL, + DecompositionIsolated = U_DT_ISOLATED, + DecompositionMedial = U_DT_MEDIAL, + DecompositionNarrow = U_DT_NARROW, + DecompositionNoBreak = U_DT_NOBREAK, + DecompositionSmall = U_DT_SMALL, + DecompositionSquare = U_DT_SQUARE, + DecompositionSub = U_DT_SUB, + DecompositionSuper = U_DT_SUPER, + DecompositionVertical = U_DT_VERTICAL, + DecompositionWide = U_DT_WIDE, + }; + + enum CharCategory { + NoCategory = 0, + Other_NotAssigned = U_MASK(U_GENERAL_OTHER_TYPES), + Letter_Uppercase = U_MASK(U_UPPERCASE_LETTER), + Letter_Lowercase = U_MASK(U_LOWERCASE_LETTER), + Letter_Titlecase = U_MASK(U_TITLECASE_LETTER), + Letter_Modifier = U_MASK(U_MODIFIER_LETTER), + Letter_Other = U_MASK(U_OTHER_LETTER), + + Mark_NonSpacing = U_MASK(U_NON_SPACING_MARK), + Mark_Enclosing = U_MASK(U_ENCLOSING_MARK), + Mark_SpacingCombining = U_MASK(U_COMBINING_SPACING_MARK), + + Number_DecimalDigit = U_MASK(U_DECIMAL_DIGIT_NUMBER), + Number_Letter = U_MASK(U_LETTER_NUMBER), + Number_Other = U_MASK(U_OTHER_NUMBER), + + Separator_Space = U_MASK(U_SPACE_SEPARATOR), + Separator_Line = U_MASK(U_LINE_SEPARATOR), + Separator_Paragraph = U_MASK(U_PARAGRAPH_SEPARATOR), + + Other_Control = U_MASK(U_CONTROL_CHAR), + Other_Format = U_MASK(U_FORMAT_CHAR), + Other_PrivateUse = U_MASK(U_PRIVATE_USE_CHAR), + Other_Surrogate = U_MASK(U_SURROGATE), + + Punctuation_Dash = U_MASK(U_DASH_PUNCTUATION), + Punctuation_Open = U_MASK(U_START_PUNCTUATION), + Punctuation_Close = U_MASK(U_END_PUNCTUATION), + Punctuation_Connector = U_MASK(U_CONNECTOR_PUNCTUATION), + Punctuation_Other = U_MASK(U_OTHER_PUNCTUATION), + + Symbol_Math = U_MASK(U_MATH_SYMBOL), + Symbol_Currency = U_MASK(U_CURRENCY_SYMBOL), + Symbol_Modifier = U_MASK(U_MODIFIER_SYMBOL), + Symbol_Other = U_MASK(U_OTHER_SYMBOL), + + Punctuation_InitialQuote = U_MASK(U_INITIAL_PUNCTUATION), + Punctuation_FinalQuote = U_MASK(U_FINAL_PUNCTUATION) + }; + + inline UChar32 foldCase(UChar32 c) + { + return u_foldCase(c, U_FOLD_CASE_DEFAULT); + } + + inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + UErrorCode status = U_ZERO_ERROR; + int realLength = u_strFoldCase(result, resultLength, src, srcLength, U_FOLD_CASE_DEFAULT, &status); + *error = !U_SUCCESS(status); + return realLength; + } + + inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + UErrorCode status = U_ZERO_ERROR; + int realLength = u_strToLower(result, resultLength, src, srcLength, "", &status); + *error = !!U_FAILURE(status); + return realLength; + } + + inline UChar32 toLower(UChar32 c) + { + return u_tolower(c); + } + + inline UChar32 toUpper(UChar32 c) + { + return u_toupper(c); + } + + inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + UErrorCode status = U_ZERO_ERROR; + int realLength = u_strToUpper(result, resultLength, src, srcLength, "", &status); + *error = !!U_FAILURE(status); + return realLength; + } + + inline UChar32 toTitleCase(UChar32 c) + { + return u_totitle(c); + } + + inline bool isArabicChar(UChar32 c) + { + return ublock_getCode(c) == UBLOCK_ARABIC; + } + + inline bool isFormatChar(UChar32 c) + { + return u_charType(c) == U_FORMAT_CHAR; + } + + inline bool isSeparatorSpace(UChar32 c) + { + return u_charType(c) == U_SPACE_SEPARATOR; + } + + inline bool isPrintableChar(UChar32 c) + { + return !!u_isprint(c); + } + + inline bool isDigit(UChar32 c) + { + return !!u_isdigit(c); + } + + inline bool isPunct(UChar32 c) + { + return !!u_ispunct(c); + } + + inline UChar32 mirroredChar(UChar32 c) + { + return u_charMirror(c); + } + + inline CharCategory category(UChar32 c) + { + return static_cast<CharCategory>(U_GET_GC_MASK(c)); + } + + inline Direction direction(UChar32 c) + { + return static_cast<Direction>(u_charDirection(c)); + } + + inline bool isLower(UChar32 c) + { + return !!u_islower(c); + } + + inline int digitValue(UChar32 c) + { + return u_charDigitValue(c); + } + + inline uint8_t combiningClass(UChar32 c) + { + return u_getCombiningClass(c); + } + + inline DecompositionType decompositionType(UChar32 c) + { + return static_cast<DecompositionType>(u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE)); + } + + inline int umemcasecmp(const UChar* a, const UChar* b, int len) + { + return u_memcasecmp(a, b, len, U_FOLD_CASE_DEFAULT); + } + + } +} + +#endif +// vim: ts=2 sw=2 et diff --git a/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h b/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h new file mode 100644 index 0000000..0fbd869 --- /dev/null +++ b/JavaScriptCore/wtf/unicode/qt4/UnicodeQt4.h @@ -0,0 +1,547 @@ +/* + * This file is part of the KDE libraries + * Copyright (C) 2006 George Staikos <staikos@kde.org> + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef KJS_UNICODE_QT4_H +#define KJS_UNICODE_QT4_H + +#include <QChar> +#include <QString> + +#include <config.h> + +#include <stdint.h> + +#if QT_VERSION >= 0x040300 +namespace QUnicodeTables { + struct Properties { + ushort category : 8; + ushort line_break_class : 8; + ushort direction : 8; + ushort combiningClass :8; + ushort joining : 2; + signed short digitValue : 6; /* 5 needed */ + ushort unicodeVersion : 4; + ushort lowerCaseSpecial : 1; + ushort upperCaseSpecial : 1; + ushort titleCaseSpecial : 1; + ushort caseFoldSpecial : 1; /* currently unused */ + signed short mirrorDiff : 16; + signed short lowerCaseDiff : 16; + signed short upperCaseDiff : 16; + signed short titleCaseDiff : 16; + signed short caseFoldDiff : 16; + }; + Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4); + Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2); +} +#endif + +// ugly hack to make UChar compatible with JSChar in API/JSStringRef.h +#if defined(Q_OS_WIN) +typedef wchar_t UChar; +#else +typedef uint16_t UChar; +#endif +typedef uint32_t UChar32; + +// some defines from ICU + +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) +#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) +#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) +#define U16_GET_SUPPLEMENTARY(lead, trail) \ + (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) + +#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) +#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) +#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) +#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) +#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +#define U16_NEXT(s, i, length, c) { \ + (c)=(s)[(i)++]; \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} + +#define U_MASK(x) ((uint32_t)1<<(x)) + +namespace WTF { + namespace Unicode { + + enum Direction { + LeftToRight = QChar::DirL, + RightToLeft = QChar::DirR, + EuropeanNumber = QChar::DirEN, + EuropeanNumberSeparator = QChar::DirES, + EuropeanNumberTerminator = QChar::DirET, + ArabicNumber = QChar::DirAN, + CommonNumberSeparator = QChar::DirCS, + BlockSeparator = QChar::DirB, + SegmentSeparator = QChar::DirS, + WhiteSpaceNeutral = QChar::DirWS, + OtherNeutral = QChar::DirON, + LeftToRightEmbedding = QChar::DirLRE, + LeftToRightOverride = QChar::DirLRO, + RightToLeftArabic = QChar::DirAL, + RightToLeftEmbedding = QChar::DirRLE, + RightToLeftOverride = QChar::DirRLO, + PopDirectionalFormat = QChar::DirPDF, + NonSpacingMark = QChar::DirNSM, + BoundaryNeutral = QChar::DirBN + }; + + enum DecompositionType { + DecompositionNone = QChar::NoDecomposition, + DecompositionCanonical = QChar::Canonical, + DecompositionCompat = QChar::Compat, + DecompositionCircle = QChar::Circle, + DecompositionFinal = QChar::Final, + DecompositionFont = QChar::Font, + DecompositionFraction = QChar::Fraction, + DecompositionInitial = QChar::Initial, + DecompositionIsolated = QChar::Isolated, + DecompositionMedial = QChar::Medial, + DecompositionNarrow = QChar::Narrow, + DecompositionNoBreak = QChar::NoBreak, + DecompositionSmall = QChar::Small, + DecompositionSquare = QChar::Square, + DecompositionSub = QChar::Sub, + DecompositionSuper = QChar::Super, + DecompositionVertical = QChar::Vertical, + DecompositionWide = QChar::Wide + }; + + enum CharCategory { + NoCategory = 0, + Mark_NonSpacing = U_MASK(QChar::Mark_NonSpacing), + Mark_SpacingCombining = U_MASK(QChar::Mark_SpacingCombining), + Mark_Enclosing = U_MASK(QChar::Mark_Enclosing), + Number_DecimalDigit = U_MASK(QChar::Number_DecimalDigit), + Number_Letter = U_MASK(QChar::Number_Letter), + Number_Other = U_MASK(QChar::Number_Other), + Separator_Space = U_MASK(QChar::Separator_Space), + Separator_Line = U_MASK(QChar::Separator_Line), + Separator_Paragraph = U_MASK(QChar::Separator_Paragraph), + Other_Control = U_MASK(QChar::Other_Control), + Other_Format = U_MASK(QChar::Other_Format), + Other_Surrogate = U_MASK(QChar::Other_Surrogate), + Other_PrivateUse = U_MASK(QChar::Other_PrivateUse), + Other_NotAssigned = U_MASK(QChar::Other_NotAssigned), + Letter_Uppercase = U_MASK(QChar::Letter_Uppercase), + Letter_Lowercase = U_MASK(QChar::Letter_Lowercase), + Letter_Titlecase = U_MASK(QChar::Letter_Titlecase), + Letter_Modifier = U_MASK(QChar::Letter_Modifier), + Letter_Other = U_MASK(QChar::Letter_Other), + Punctuation_Connector = U_MASK(QChar::Punctuation_Connector), + Punctuation_Dash = U_MASK(QChar::Punctuation_Dash), + Punctuation_Open = U_MASK(QChar::Punctuation_Open), + Punctuation_Close = U_MASK(QChar::Punctuation_Close), + Punctuation_InitialQuote = U_MASK(QChar::Punctuation_InitialQuote), + Punctuation_FinalQuote = U_MASK(QChar::Punctuation_FinalQuote), + Punctuation_Other = U_MASK(QChar::Punctuation_Other), + Symbol_Math = U_MASK(QChar::Symbol_Math), + Symbol_Currency = U_MASK(QChar::Symbol_Currency), + Symbol_Modifier = U_MASK(QChar::Symbol_Modifier), + Symbol_Other = U_MASK(QChar::Symbol_Other), + }; + + +#if QT_VERSION >= 0x040300 + // FIXME: handle surrogates correctly in all methods + + inline UChar32 toLower(UChar32 ch) + { + return QChar::toLower(ch); + } + + inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + const UChar *e = src + srcLength; + const UChar *s = src; + UChar *r = result; + UChar *re = result + resultLength; + + // this avoids one out of bounds check in the loop + if (QChar(*s).isLowSurrogate()) + *r++ = *s++; + + int needed = 0; + while (s < e && r < re) { + uint c = *s; + if (QChar(c).isLowSurrogate() && QChar(*(s - 1)).isHighSurrogate()) + c = QChar::surrogateToUcs4(*(s - 1), c); + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(c); + if (prop->lowerCaseSpecial) { + QString qstring; + if (c < 0x10000) { + qstring += QChar(c); + } else { + qstring += QChar(*(s-1)); + qstring += QChar(*s); + } + qstring = qstring.toLower(); + for (int i = 0; i < qstring.length(); ++i) { + if (r == re) { + needed += qstring.length() - i; + break; + } + *r = qstring.at(i).unicode(); + ++r; + } + } else { + *r = *s + prop->lowerCaseDiff; + ++r; + } + ++s; + } + if (s < e) + needed += e - s; + *error = (needed != 0); + if (r < re) + *r = 0; + return (r - result) + needed; + } + + inline UChar32 toUpper(UChar32 ch) + { + return QChar::toUpper(ch); + } + + inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + const UChar *e = src + srcLength; + const UChar *s = src; + UChar *r = result; + UChar *re = result + resultLength; + + // this avoids one out of bounds check in the loop + if (QChar(*s).isLowSurrogate()) + *r++ = *s++; + + int needed = 0; + while (s < e && r < re) { + uint c = *s; + if (QChar(c).isLowSurrogate() && QChar(*(s - 1)).isHighSurrogate()) + c = QChar::surrogateToUcs4(*(s - 1), c); + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(c); + if (prop->upperCaseSpecial) { + QString qstring; + if (c < 0x10000) { + qstring += QChar(c); + } else { + qstring += QChar(*(s-1)); + qstring += QChar(*s); + } + qstring = qstring.toUpper(); + for (int i = 0; i < qstring.length(); ++i) { + if (r == re) { + needed += qstring.length() - i; + break; + } + *r = qstring.at(i).unicode(); + ++r; + } + } else { + *r = *s + prop->upperCaseDiff; + ++r; + } + ++s; + } + if (s < e) + needed += e - s; + *error = (needed != 0); + if (r < re) + *r = 0; + return (r - result) + needed; + } + + inline int toTitleCase(UChar32 c) + { + return QChar::toTitleCase(c); + } + + inline UChar32 foldCase(UChar32 c) + { + return QChar::toCaseFolded(c); + } + + inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + // FIXME: handle special casing. Easiest with some low level API in Qt + *error = false; + if (resultLength < srcLength) { + *error = true; + return srcLength; + } + for (int i = 0; i < srcLength; ++i) + result[i] = QChar::toCaseFolded(src[i]); + return srcLength; + } + + inline bool isFormatChar(UChar32 c) + { + return QChar::category(c) == QChar::Other_Format; + } + + inline bool isArabicChar(UChar32 c) + { + return c >= 0x0600 && c <= 0x06FF; + } + + inline bool isPrintableChar(UChar32 c) + { + const uint test = U_MASK(QChar::Other_Control) | + U_MASK(QChar::Other_NotAssigned); + return !(U_MASK(QChar::category(c)) & test); + } + + inline bool isSeparatorSpace(UChar32 c) + { + return QChar::category(c) == QChar::Separator_Space; + } + + inline bool isPunct(UChar32 c) + { + const uint test = U_MASK(QChar::Punctuation_Connector) | + U_MASK(QChar::Punctuation_Dash) | + U_MASK(QChar::Punctuation_Open) | + U_MASK(QChar::Punctuation_Close) | + U_MASK(QChar::Punctuation_InitialQuote) | + U_MASK(QChar::Punctuation_FinalQuote) | + U_MASK(QChar::Punctuation_Other); + return U_MASK(QChar::category(c)) & test; + } + + inline bool isDigit(UChar32 c) + { + return QChar::category(c) == QChar::Number_DecimalDigit; + } + + inline bool isLower(UChar32 c) + { + return QChar::category(c) == QChar::Letter_Lowercase; + } + + inline int digitValue(UChar32 c) + { + return QChar::digitValue(c); + } + + inline UChar32 mirroredChar(UChar32 c) + { + return QChar::mirroredChar(c); + } + + inline uint8_t combiningClass(UChar32 c) + { + return QChar::combiningClass(c); + } + + inline DecompositionType decompositionType(UChar32 c) + { + return (DecompositionType)QChar::decompositionTag(c); + } + + inline int umemcasecmp(const UChar* a, const UChar* b, int len) + { + // handle surrogates correctly + for (int i = 0; i < len; ++i) { + uint c1 = QChar::toCaseFolded(a[i]); + uint c2 = QChar::toCaseFolded(b[i]); + if (c1 != c2) + return c1 - c2; + } + return 0; + } + + inline Direction direction(UChar32 c) + { + return (Direction)QChar::direction(c); + } + + inline CharCategory category(UChar32 c) + { + return (CharCategory) U_MASK(QChar::category(c)); + } + +#else + + inline UChar32 toLower(UChar32 ch) + { + if (ch > 0xffff) + return ch; + return QChar((unsigned short)ch).toLower().unicode(); + } + + inline int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + *error = false; + if (resultLength < srcLength) { + *error = true; + return srcLength; + } + for (int i = 0; i < srcLength; ++i) + result[i] = QChar(src[i]).toLower().unicode(); + return srcLength; + } + + inline UChar32 toUpper(UChar32 ch) + { + if (ch > 0xffff) + return ch; + return QChar((unsigned short)ch).toUpper().unicode(); + } + + inline int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + *error = false; + if (resultLength < srcLength) { + *error = true; + return srcLength; + } + for (int i = 0; i < srcLength; ++i) + result[i] = QChar(src[i]).toUpper().unicode(); + return srcLength; + } + + inline int toTitleCase(UChar32 c) + { + if (c > 0xffff) + return c; + return QChar((unsigned short)c).toUpper().unicode(); + } + + inline UChar32 foldCase(UChar32 c) + { + if (c > 0xffff) + return c; + return QChar((unsigned short)c).toLower().unicode(); + } + + inline int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) + { + return toLower(result, resultLength, src, srcLength, error); + } + + inline bool isFormatChar(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).category() == QChar::Other_Format; + } + + inline bool isPrintableChar(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).isPrint(); + } + + inline bool isArabicChar(UChar32 c) + { + return c >= 0x0600 && c <= 0x06FF; + } + + inline bool isSeparatorSpace(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).category() == QChar::Separator_Space; + } + + inline bool isPunct(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).isPunct(); + } + + inline bool isDigit(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).isDigit(); + } + + inline bool isLower(UChar32 c) + { + return (c & 0xffff0000) == 0 && QChar((unsigned short)c).category() == QChar::Letter_Lowercase; + } + + inline int digitValue(UChar32 c) + { + if (c > 0xffff) + return 0; + return QChar(c).digitValue(); + } + + inline UChar32 mirroredChar(UChar32 c) + { + if (c > 0xffff) + return c; + return QChar(c).mirroredChar().unicode(); + } + + inline uint8_t combiningClass(UChar32 c) + { + if (c > 0xffff) + return 0; + return QChar((unsigned short)c).combiningClass(); + } + + inline DecompositionType decompositionType(UChar32 c) + { + if (c > 0xffff) + return DecompositionNone; + return (DecompositionType)QChar(c).decompositionTag(); + } + + inline int umemcasecmp(const UChar* a, const UChar* b, int len) + { + for (int i = 0; i < len; ++i) { + QChar c1 = QChar(a[i]).toLower(); + QChar c2 = QChar(b[i]).toLower(); + if (c1 != c2) + return c1.unicode() - c2.unicode(); + } + return 0; + } + + inline Direction direction(UChar32 c) + { + if (c > 0xffff) + return LeftToRight; + return (Direction)QChar(c).direction(); + } + + inline CharCategory category(UChar32 c) + { + if (c > 0xffff) + return NoCategory; + return (CharCategory) U_MASK(QChar(c).category()); + } + +#endif + + } +} + +#endif +// vim: ts=2 sw=2 et |