diff options
author | Steve Block <steveblock@google.com> | 2011-05-13 06:44:40 -0700 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2011-05-13 06:44:40 -0700 |
commit | 08014c20784f3db5df3a89b73cce46037b77eb59 (patch) | |
tree | 47749210d31e19e6e2f64036fa8fae2ad693476f /Source/WebCore/platform/text/UnicodeRange.cpp | |
parent | 860220379e56aeb66424861ad602b07ee22b4055 (diff) | |
parent | 4c3661f7918f8b3f139f824efb7855bedccb4c94 (diff) | |
download | external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.zip external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.tar.gz external_webkit-08014c20784f3db5df3a89b73cce46037b77eb59.tar.bz2 |
Merge changes Ide388898,Ic49f367c,I1158a808,Iacb6ca5d,I2100dd3a,I5c1abe54,Ib0ef9902,I31dbc523,I570314b3
* changes:
Merge WebKit at r75315: Update WebKit version
Merge WebKit at r75315: Add FrameLoaderClient PageCache stubs
Merge WebKit at r75315: Stub out AXObjectCache::remove()
Merge WebKit at r75315: Fix ImageBuffer
Merge WebKit at r75315: Fix PluginData::initPlugins()
Merge WebKit at r75315: Fix conflicts
Merge WebKit at r75315: Fix Makefiles
Merge WebKit at r75315: Move Android-specific WebCore files to Source
Merge WebKit at r75315: Initial merge by git.
Diffstat (limited to 'Source/WebCore/platform/text/UnicodeRange.cpp')
-rw-r--r-- | Source/WebCore/platform/text/UnicodeRange.cpp | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/Source/WebCore/platform/text/UnicodeRange.cpp b/Source/WebCore/platform/text/UnicodeRange.cpp new file mode 100644 index 0000000..0373441 --- /dev/null +++ b/Source/WebCore/platform/text/UnicodeRange.cpp @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2007 Apple Computer, Inc. + * + * Portions are Copyright (C) 1998 Netscape Communications Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Alternatively, the contents of this file may be used under the terms + * of either the Mozilla Public License Version 1.1, found at + * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public + * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html + * (the "GPL"), in which case the provisions of the MPL or the GPL are + * applicable instead of those above. If you wish to allow use of your + * version of this file only under the terms of one of those two + * licenses (the MPL or the GPL) and not to allow others to use your + * version of this file under the LGPL, indicate your decision by + * deletingthe provisions above and replace them with the notice and + * other provisions required by the MPL or the GPL, as the case may be. + * If you do not delete the provisions above, a recipient may use your + * version of this file under any of the LGPL, the MPL or the GPL. + */ + +#include "config.h" +#include "UnicodeRange.h" + +namespace WebCore { + +// This table depends on unicode range definitions. +// Each item's index must correspond to a unicode range value +// eg. x-cyrillic = LangGroupTable[cRangeCyrillic] +static const char* gUnicodeRangeToLangGroupTable[] = +{ + "x-cyrillic", + "el", + "tr", + "he", + "ar", + "x-baltic", + "th", + "ko", + "ja", + "zh-CN", + "zh-TW", + "x-devanagari", + "x-tamil", + "x-armn", + "x-beng", + "x-cans", + "x-ethi", + "x-geor", + "x-gujr", + "x-guru", + "x-khmr", + "x-mlym" +}; + +/********************************************************************** + * Unicode subranges as defined in unicode 3.0 + * x-western, x-central-euro, tr, x-baltic -> latin + * 0000 - 036f + * 1e00 - 1eff + * 2000 - 206f (general punctuation) + * 20a0 - 20cf (currency symbols) + * 2100 - 214f (letterlike symbols) + * 2150 - 218f (Number Forms) + * el -> greek + * 0370 - 03ff + * 1f00 - 1fff + * x-cyrillic -> cyrillic + * 0400 - 04ff + * he -> hebrew + * 0590 - 05ff + * ar -> arabic + * 0600 - 06ff + * fb50 - fdff (arabic presentation forms) + * fe70 - feff (arabic presentation forms b) + * th - thai + * 0e00 - 0e7f + * ko -> korean + * ac00 - d7af (hangul Syllables) + * 1100 - 11ff (jamo) + * 3130 - 318f (hangul compatibility jamo) + * ja + * 3040 - 309f (hiragana) + * 30a0 - 30ff (katakana) + * zh-CN + * zh-TW + * + * CJK + * 3100 - 312f (bopomofo) + * 31a0 - 31bf (bopomofo extended) + * 3000 - 303f (CJK Symbols and Punctuation) + * 2e80 - 2eff (CJK radicals supplement) + * 2f00 - 2fdf (Kangxi Radicals) + * 2ff0 - 2fff (Ideographic Description Characters) + * 3190 - 319f (kanbun) + * 3200 - 32ff (Enclosed CJK letters and Months) + * 3300 - 33ff (CJK compatibility) + * 3400 - 4dbf (CJK Unified Ideographs Extension A) + * 4e00 - 9faf (CJK Unified Ideographs) + * f900 - fa5f (CJK Compatibility Ideographs) + * fe30 - fe4f (CJK compatibility Forms) + * ff00 - ffef (halfwidth and fullwidth forms) + * + * Armenian + * 0530 - 058f + * Sriac + * 0700 - 074f + * Thaana + * 0780 - 07bf + * Devanagari + * 0900 - 097f + * Bengali + * 0980 - 09ff + * Gurmukhi + * 0a00 - 0a7f + * Gujarati + * 0a80 - 0aff + * Oriya + * 0b00 - 0b7f + * Tamil + * 0b80 - 0bff + * Telugu + * 0c00 - 0c7f + * Kannada + * 0c80 - 0cff + * Malayalam + * 0d00 - 0d7f + * Sinhala + * 0d80 - 0def + * Lao + * 0e80 - 0eff + * Tibetan + * 0f00 - 0fbf + * Myanmar + * 1000 - 109f + * Georgian + * 10a0 - 10ff + * Ethiopic + * 1200 - 137f + * Cherokee + * 13a0 - 13ff + * Canadian Aboriginal Syllabics + * 1400 - 167f + * Ogham + * 1680 - 169f + * Runic + * 16a0 - 16ff + * Khmer + * 1780 - 17ff + * Mongolian + * 1800 - 18af + * Misc - superscripts and subscripts + * 2070 - 209f + * Misc - Combining Diacritical Marks for Symbols + * 20d0 - 20ff + * Misc - Arrows + * 2190 - 21ff + * Misc - Mathematical Operators + * 2200 - 22ff + * Misc - Miscellaneous Technical + * 2300 - 23ff + * Misc - Control picture + * 2400 - 243f + * Misc - Optical character recognition + * 2440 - 2450 + * Misc - Enclose Alphanumerics + * 2460 - 24ff + * Misc - Box Drawing + * 2500 - 257f + * Misc - Block Elements + * 2580 - 259f + * Misc - Geometric Shapes + * 25a0 - 25ff + * Misc - Miscellaneous Symbols + * 2600 - 267f + * Misc - Dingbats + * 2700 - 27bf + * Misc - Braille Patterns + * 2800 - 28ff + * Yi Syllables + * a000 - a48f + * Yi radicals + * a490 - a4cf + * Alphabetic Presentation Forms + * fb00 - fb4f + * Misc - Combining half Marks + * fe20 - fe2f + * Misc - small form variants + * fe50 - fe6f + * Misc - Specials + * fff0 - ffff + *********************************************************************/ + +static const unsigned cNumSubTables = 9; +static const unsigned cSubTableSize = 16; + +static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] = +{ + { // table for X--- + cRangeTableBase+1, //u0xxx + cRangeTableBase+2, //u1xxx + cRangeTableBase+3, //u2xxx + cRangeSetCJK, //u3xxx + cRangeSetCJK, //u4xxx + cRangeSetCJK, //u5xxx + cRangeSetCJK, //u6xxx + cRangeSetCJK, //u7xxx + cRangeSetCJK, //u8xxx + cRangeSetCJK, //u9xxx + cRangeTableBase+4, //uaxxx + cRangeKorean, //ubxxx + cRangeKorean, //ucxxx + cRangeTableBase+5, //udxxx + cRangePrivate, //uexxx + cRangeTableBase+6 //ufxxx + }, + { //table for 0X-- + cRangeSetLatin, //u00xx + cRangeSetLatin, //u01xx + cRangeSetLatin, //u02xx + cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks + cRangeCyrillic, //u04xx + cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian + cRangeArabic, //u06xx + cRangeTertiaryTable, //u07xx + cRangeUnassigned, //u08xx + cRangeTertiaryTable, //u09xx + cRangeTertiaryTable, //u0axx + cRangeTertiaryTable, //u0bxx + cRangeTertiaryTable, //u0cxx + cRangeTertiaryTable, //u0dxx + cRangeTertiaryTable, //u0exx + cRangeTibetan, //u0fxx + }, + { //table for 1x-- + cRangeTertiaryTable, //u10xx + cRangeKorean, //u11xx + cRangeEthiopic, //u12xx + cRangeTertiaryTable, //u13xx + cRangeCanadian, //u14xx + cRangeCanadian, //u15xx + cRangeTertiaryTable, //u16xx + cRangeKhmer, //u17xx + cRangeMongolian, //u18xx + cRangeUnassigned, //u19xx + cRangeUnassigned, //u1axx + cRangeUnassigned, //u1bxx + cRangeUnassigned, //u1cxx + cRangeUnassigned, //u1dxx + cRangeSetLatin, //u1exx + cRangeGreek, //u1fxx + }, + { //table for 2x-- + cRangeSetLatin, //u20xx + cRangeSetLatin, //u21xx + cRangeMathOperators, //u22xx + cRangeMiscTechnical, //u23xx + cRangeControlOpticalEnclose, //u24xx + cRangeBoxBlockGeometrics, //u25xx + cRangeMiscSymbols, //u26xx + cRangeDingbats, //u27xx + cRangeBraillePattern, //u28xx + cRangeUnassigned, //u29xx + cRangeUnassigned, //u2axx + cRangeUnassigned, //u2bxx + cRangeUnassigned, //u2cxx + cRangeUnassigned, //u2dxx + cRangeSetCJK, //u2exx + cRangeSetCJK, //u2fxx + }, + { //table for ax-- + cRangeYi, //ua0xx + cRangeYi, //ua1xx + cRangeYi, //ua2xx + cRangeYi, //ua3xx + cRangeYi, //ua4xx + cRangeUnassigned, //ua5xx + cRangeUnassigned, //ua6xx + cRangeUnassigned, //ua7xx + cRangeUnassigned, //ua8xx + cRangeUnassigned, //ua9xx + cRangeUnassigned, //uaaxx + cRangeUnassigned, //uabxx + cRangeKorean, //uacxx + cRangeKorean, //uadxx + cRangeKorean, //uaexx + cRangeKorean, //uafxx + }, + { //table for dx-- + cRangeKorean, //ud0xx + cRangeKorean, //ud1xx + cRangeKorean, //ud2xx + cRangeKorean, //ud3xx + cRangeKorean, //ud4xx + cRangeKorean, //ud5xx + cRangeKorean, //ud6xx + cRangeKorean, //ud7xx + cRangeSurrogate, //ud8xx + cRangeSurrogate, //ud9xx + cRangeSurrogate, //udaxx + cRangeSurrogate, //udbxx + cRangeSurrogate, //udcxx + cRangeSurrogate, //uddxx + cRangeSurrogate, //udexx + cRangeSurrogate, //udfxx + }, + { // table for fx-- + cRangePrivate, //uf0xx + cRangePrivate, //uf1xx + cRangePrivate, //uf2xx + cRangePrivate, //uf3xx + cRangePrivate, //uf4xx + cRangePrivate, //uf5xx + cRangePrivate, //uf6xx + cRangePrivate, //uf7xx + cRangePrivate, //uf8xx + cRangeSetCJK, //uf9xx + cRangeSetCJK, //ufaxx + cRangeArabic, //ufbxx, includes alphabic presentation form + cRangeArabic, //ufcxx + cRangeArabic, //ufdxx + cRangeArabic, //ufexx, includes Combining half marks, + // CJK compatibility forms, + // CJK compatibility forms, + // small form variants + cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials + }, + { //table for 0x0500 - 0x05ff + cRangeCyrillic, //u050x + cRangeCyrillic, //u051x + cRangeCyrillic, //u052x + cRangeArmenian, //u053x + cRangeArmenian, //u054x + cRangeArmenian, //u055x + cRangeArmenian, //u056x + cRangeArmenian, //u057x + cRangeArmenian, //u058x + cRangeHebrew, //u059x + cRangeHebrew, //u05ax + cRangeHebrew, //u05bx + cRangeHebrew, //u05cx + cRangeHebrew, //u05dx + cRangeHebrew, //u05ex + cRangeHebrew, //u05fx + }, + { //table for 0xff00 - 0xffff + cRangeSetCJK, //uff0x, fullwidth latin + cRangeSetCJK, //uff1x, fullwidth latin + cRangeSetCJK, //uff2x, fullwidth latin + cRangeSetCJK, //uff3x, fullwidth latin + cRangeSetCJK, //uff4x, fullwidth latin + cRangeSetCJK, //uff5x, fullwidth latin + cRangeSetCJK, //uff6x, halfwidth katakana + cRangeSetCJK, //uff7x, halfwidth katakana + cRangeSetCJK, //uff8x, halfwidth katakana + cRangeSetCJK, //uff9x, halfwidth katakana + cRangeSetCJK, //uffax, halfwidth hangul jamo + cRangeSetCJK, //uffbx, halfwidth hangul jamo + cRangeSetCJK, //uffcx, halfwidth hangul jamo + cRangeSetCJK, //uffdx, halfwidth hangul jamo + cRangeSetCJK, //uffex, fullwidth symbols + cRangeSpecials, //ufffx, Specials + }, +}; + +// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) +// code points so that the number of entries in the tertiary range +// table for that range is obtained by dividing (0x1700 - 0x0700) by 128. +// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal +// syllabaries take multiple chunks and Ogham and Runic share a single chunk. +static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80); + +static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] = +{ //table for 0x0700 - 0x1600 + cRangeSyriac, //u070x + cRangeThaana, //u078x + cRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) + cRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) + cRangeDevanagari, //u090x + cRangeBengali, //u098x + cRangeGurmukhi, //u0a0x + cRangeGujarati, //u0a8x + cRangeOriya, //u0b0x + cRangeTamil, //u0b8x + cRangeTelugu, //u0c0x + cRangeKannada, //u0c8x + cRangeMalayalam, //u0d0x + cRangeSinhala, //u0d8x + cRangeThai, //u0e0x + cRangeLao, //u0e8x + cRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) + cRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) + cRangeMyanmar, //u100x + cRangeGeorgian, //u108x + cRangeKorean, //u110x place holder(resolved in the 2ndary tab.) + cRangeKorean, //u118x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) + cRangeEthiopic, //u130x + cRangeCherokee, //u138x + cRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) + cRangeCanadian, //u160x + cRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic +}; + +// A two level index is almost enough for locating a range, with the +// exception of u03xx and u05xx. Since we don't really care about range for +// combining diacritical marks in our font application, they are +// not discriminated further. Future adoption of this method for other use +// should be aware of this limitation. The implementation can be extended if +// there is such a need. +// For Indic, Southeast Asian scripts and some other scripts between +// U+0700 and U+16FF, it's extended to the third level. +unsigned int findCharUnicodeRange(UChar32 ch) +{ + if (ch >= 0xFFFF) + return 0; + + unsigned int range; + + //search the first table + range = gUnicodeSubrangeTable[0][ch >> 12]; + + if (range < cRangeTableBase) + // we try to get a specific range + return range; + + // otherwise, we have one more table to look at + range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8]; + if (range < cRangeTableBase) + return range; + if (range < cRangeTertiaryTable) + return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4]; + + // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks + return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; +} + +const char* langGroupFromUnicodeRange(unsigned char unicodeRange) +{ + if (cRangeSpecificItemNum > unicodeRange) + return gUnicodeRangeToLangGroupTable[unicodeRange]; + return 0; +} + +} |