diff options
Diffstat (limited to 'Source/JavaScriptCore/wtf/unicode')
-rw-r--r-- | Source/JavaScriptCore/wtf/unicode/CharacterNames.h | 142 | ||||
-rw-r--r-- | Source/JavaScriptCore/wtf/unicode/UTF8.cpp | 33 | ||||
-rw-r--r-- | Source/JavaScriptCore/wtf/unicode/UTF8.h | 1 | ||||
-rw-r--r-- | Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h | 40 |
4 files changed, 188 insertions, 28 deletions
diff --git a/Source/JavaScriptCore/wtf/unicode/CharacterNames.h b/Source/JavaScriptCore/wtf/unicode/CharacterNames.h new file mode 100644 index 0000000..3d093a6 --- /dev/null +++ b/Source/JavaScriptCore/wtf/unicode/CharacterNames.h @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2007, 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CharacterNames_h +#define CharacterNames_h + +#include "Unicode.h" + +namespace WTF { +namespace Unicode { + +// Names here are taken from the Unicode standard. + +// Most of these are UChar constants, not UChar32, which makes them +// more convenient for WebCore code that mostly uses UTF-16. + +const UChar32 aegeanWordSeparatorLine = 0x10100; +const UChar32 aegeanWordSeparatorDot = 0x10101; +const UChar blackCircle = 0x25CF; +const UChar blackSquare = 0x25A0; +const UChar blackUpPointingTriangle = 0x25B2; +const UChar bullet = 0x2022; +const UChar bullseye = 0x25CE; +const UChar carriageReturn = 0x000D; +const UChar ethiopicPrefaceColon = 0x1366; +const UChar ethiopicWordspace = 0x1361; +const UChar fisheye = 0x25C9; +const UChar hebrewPunctuationGeresh = 0x05F3; +const UChar hebrewPunctuationGershayim = 0x05F4; +const UChar horizontalEllipsis = 0x2026; +const UChar hyphen = 0x2010; +const UChar hyphenMinus = 0x002D; +const UChar ideographicComma = 0x3001; +const UChar ideographicFullStop = 0x3002; +const UChar ideographicSpace = 0x3000; +const UChar leftDoubleQuotationMark = 0x201C; +const UChar leftSingleQuotationMark = 0x2018; +const UChar leftToRightEmbed = 0x202A; +const UChar leftToRightMark = 0x200E; +const UChar leftToRightOverride = 0x202D; +const UChar minusSign = 0x2212; +const UChar newlineCharacter = 0x000A; +const UChar noBreakSpace = 0x00A0; +const UChar objectReplacementCharacter = 0xFFFC; +const UChar popDirectionalFormatting = 0x202C; +const UChar replacementCharacter = 0xFFFD; +const UChar rightDoubleQuotationMark = 0x201D; +const UChar rightSingleQuotationMark = 0x2019; +const UChar rightToLeftEmbed = 0x202B; +const UChar rightToLeftMark = 0x200F; +const UChar rightToLeftOverride = 0x202E; +const UChar sesameDot = 0xFE45; +const UChar softHyphen = 0x00AD; +const UChar space = 0x0020; +const UChar tibetanMarkIntersyllabicTsheg = 0x0F0B; +const UChar tibetanMarkDelimiterTshegBstar = 0x0F0C; +const UChar32 ugariticWordDivider = 0x1039F; +const UChar whiteBullet = 0x25E6; +const UChar whiteCircle = 0x25CB; +const UChar whiteSesameDot = 0xFE46; +const UChar whiteUpPointingTriangle = 0x25B3; +const UChar yenSign = 0x00A5; +const UChar zeroWidthJoiner = 0x200D; +const UChar zeroWidthNonJoiner = 0x200C; +const UChar zeroWidthSpace = 0x200B; + +} // namespace Unicode +} // namespace WTF + +using WTF::Unicode::aegeanWordSeparatorLine; +using WTF::Unicode::aegeanWordSeparatorDot; +using WTF::Unicode::blackCircle; +using WTF::Unicode::blackSquare; +using WTF::Unicode::blackUpPointingTriangle; +using WTF::Unicode::bullet; +using WTF::Unicode::bullseye; +using WTF::Unicode::carriageReturn; +using WTF::Unicode::ethiopicPrefaceColon; +using WTF::Unicode::ethiopicWordspace; +using WTF::Unicode::fisheye; +using WTF::Unicode::hebrewPunctuationGeresh; +using WTF::Unicode::hebrewPunctuationGershayim; +using WTF::Unicode::horizontalEllipsis; +using WTF::Unicode::hyphen; +using WTF::Unicode::hyphenMinus; +using WTF::Unicode::ideographicComma; +using WTF::Unicode::ideographicFullStop; +using WTF::Unicode::ideographicSpace; +using WTF::Unicode::leftDoubleQuotationMark; +using WTF::Unicode::leftSingleQuotationMark; +using WTF::Unicode::leftToRightEmbed; +using WTF::Unicode::leftToRightMark; +using WTF::Unicode::leftToRightOverride; +using WTF::Unicode::minusSign; +using WTF::Unicode::newlineCharacter; +using WTF::Unicode::noBreakSpace; +using WTF::Unicode::objectReplacementCharacter; +using WTF::Unicode::popDirectionalFormatting; +using WTF::Unicode::replacementCharacter; +using WTF::Unicode::rightDoubleQuotationMark; +using WTF::Unicode::rightSingleQuotationMark; +using WTF::Unicode::rightToLeftEmbed; +using WTF::Unicode::rightToLeftMark; +using WTF::Unicode::rightToLeftOverride; +using WTF::Unicode::sesameDot; +using WTF::Unicode::softHyphen; +using WTF::Unicode::space; +using WTF::Unicode::tibetanMarkIntersyllabicTsheg; +using WTF::Unicode::tibetanMarkDelimiterTshegBstar; +using WTF::Unicode::ugariticWordDivider; +using WTF::Unicode::whiteBullet; +using WTF::Unicode::whiteCircle; +using WTF::Unicode::whiteSesameDot; +using WTF::Unicode::whiteUpPointingTriangle; +using WTF::Unicode::yenSign; +using WTF::Unicode::zeroWidthJoiner; +using WTF::Unicode::zeroWidthNonJoiner; +using WTF::Unicode::zeroWidthSpace; + +#endif // CharacterNames_h diff --git a/Source/JavaScriptCore/wtf/unicode/UTF8.cpp b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp index dc24ed5..4c3738b 100644 --- a/Source/JavaScriptCore/wtf/unicode/UTF8.cpp +++ b/Source/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -26,16 +26,14 @@ #include "config.h" #include "UTF8.h" -#include <wtf/StringHasher.h> #include "ASCIICType.h" +#include <wtf/StringHasher.h> +#include <wtf/unicode/CharacterNames.h> namespace WTF { namespace Unicode { -// FIXME: Use definition from CharacterNames.h. -static const UChar replacementCharacter = 0xFFFD; - inline int inlineUTF8SequenceLengthNonASCII(char b0) { if ((b0 & 0xC0) != 0xC0) @@ -316,25 +314,33 @@ ConversionResult convertUTF8ToUTF16( return result; } -unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) +static inline unsigned calculateStringHashAndLengthFromUTF8Internal(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) { if (!data) return 0; WTF::StringHasher stringHasher; + dataLength = 0; utf16Length = 0; - while (data < dataEnd) { + while (data < dataEnd || (!dataEnd && *data)) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); + dataLength++; utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); + dataLength += utf8SequenceLength; - if (dataEnd - data < utf8SequenceLength) - return false; + if (!dataEnd) { + for (int i = 1; i < utf8SequenceLength; ++i) { + if (!data[i]) + return 0; + } + } else if (dataEnd - data < utf8SequenceLength) + return 0; if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; @@ -359,6 +365,17 @@ unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsi return stringHasher.hash(); } +unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) +{ + unsigned dataLength; + return calculateStringHashAndLengthFromUTF8Internal(data, dataEnd, dataLength, utf16Length); +} + +unsigned calculateStringHashAndLengthFromUTF8(const char* data, unsigned& dataLength, unsigned& utf16Length) +{ + return calculateStringHashAndLengthFromUTF8Internal(data, 0, dataLength, utf16Length); +} + bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) { while (b < bEnd) { diff --git a/Source/JavaScriptCore/wtf/unicode/UTF8.h b/Source/JavaScriptCore/wtf/unicode/UTF8.h index 1f4baca..bbfaa84 100644 --- a/Source/JavaScriptCore/wtf/unicode/UTF8.h +++ b/Source/JavaScriptCore/wtf/unicode/UTF8.h @@ -71,6 +71,7 @@ namespace Unicode { char** targetStart, char* targetEnd, bool strict = true); unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length); + unsigned calculateStringHashAndLengthFromUTF8(const char* data, unsigned& dataLength, unsigned& utf16Length); bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd); diff --git a/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h b/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h index 8959912..09a7036 100644 --- a/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h +++ b/Source/JavaScriptCore/wtf/unicode/UnicodeMacrosFromICU.h @@ -1,25 +1,5 @@ /* * Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved. - * Copyright (C) 2006 George Staikos <staikos@kde.org> - * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> - * Copyright (C) 2007 Apple Computer, Inc. All rights reserved. - * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> - * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public License - * along with this library; see the file COPYING.LIB. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. * */ @@ -97,4 +77,24 @@ #define U_MASK(x) ((uint32_t)1<<(x)) +#define U8_MAX_LENGTH 4 + +#define U8_APPEND_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + if((uint32_t)(c)<=0x7ff) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + } else { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } \ +} #endif |