diff options
Diffstat (limited to 'WebCore/platform/text/UnicodeRange.cpp')
-rw-r--r-- | WebCore/platform/text/UnicodeRange.cpp | 462 |
1 files changed, 0 insertions, 462 deletions
diff --git a/WebCore/platform/text/UnicodeRange.cpp b/WebCore/platform/text/UnicodeRange.cpp deleted file mode 100644 index 0373441..0000000 --- a/WebCore/platform/text/UnicodeRange.cpp +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Copyright (C) 2007 Apple Computer, Inc. - * - * Portions are Copyright (C) 1998 Netscape Communications Corporation. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * Alternatively, the contents of this file may be used under the terms - * of either the Mozilla Public License Version 1.1, found at - * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public - * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html - * (the "GPL"), in which case the provisions of the MPL or the GPL are - * applicable instead of those above. If you wish to allow use of your - * version of this file only under the terms of one of those two - * licenses (the MPL or the GPL) and not to allow others to use your - * version of this file under the LGPL, indicate your decision by - * deletingthe provisions above and replace them with the notice and - * other provisions required by the MPL or the GPL, as the case may be. - * If you do not delete the provisions above, a recipient may use your - * version of this file under any of the LGPL, the MPL or the GPL. - */ - -#include "config.h" -#include "UnicodeRange.h" - -namespace WebCore { - -// This table depends on unicode range definitions. -// Each item's index must correspond to a unicode range value -// eg. x-cyrillic = LangGroupTable[cRangeCyrillic] -static const char* gUnicodeRangeToLangGroupTable[] = -{ - "x-cyrillic", - "el", - "tr", - "he", - "ar", - "x-baltic", - "th", - "ko", - "ja", - "zh-CN", - "zh-TW", - "x-devanagari", - "x-tamil", - "x-armn", - "x-beng", - "x-cans", - "x-ethi", - "x-geor", - "x-gujr", - "x-guru", - "x-khmr", - "x-mlym" -}; - -/********************************************************************** - * Unicode subranges as defined in unicode 3.0 - * x-western, x-central-euro, tr, x-baltic -> latin - * 0000 - 036f - * 1e00 - 1eff - * 2000 - 206f (general punctuation) - * 20a0 - 20cf (currency symbols) - * 2100 - 214f (letterlike symbols) - * 2150 - 218f (Number Forms) - * el -> greek - * 0370 - 03ff - * 1f00 - 1fff - * x-cyrillic -> cyrillic - * 0400 - 04ff - * he -> hebrew - * 0590 - 05ff - * ar -> arabic - * 0600 - 06ff - * fb50 - fdff (arabic presentation forms) - * fe70 - feff (arabic presentation forms b) - * th - thai - * 0e00 - 0e7f - * ko -> korean - * ac00 - d7af (hangul Syllables) - * 1100 - 11ff (jamo) - * 3130 - 318f (hangul compatibility jamo) - * ja - * 3040 - 309f (hiragana) - * 30a0 - 30ff (katakana) - * zh-CN - * zh-TW - * - * CJK - * 3100 - 312f (bopomofo) - * 31a0 - 31bf (bopomofo extended) - * 3000 - 303f (CJK Symbols and Punctuation) - * 2e80 - 2eff (CJK radicals supplement) - * 2f00 - 2fdf (Kangxi Radicals) - * 2ff0 - 2fff (Ideographic Description Characters) - * 3190 - 319f (kanbun) - * 3200 - 32ff (Enclosed CJK letters and Months) - * 3300 - 33ff (CJK compatibility) - * 3400 - 4dbf (CJK Unified Ideographs Extension A) - * 4e00 - 9faf (CJK Unified Ideographs) - * f900 - fa5f (CJK Compatibility Ideographs) - * fe30 - fe4f (CJK compatibility Forms) - * ff00 - ffef (halfwidth and fullwidth forms) - * - * Armenian - * 0530 - 058f - * Sriac - * 0700 - 074f - * Thaana - * 0780 - 07bf - * Devanagari - * 0900 - 097f - * Bengali - * 0980 - 09ff - * Gurmukhi - * 0a00 - 0a7f - * Gujarati - * 0a80 - 0aff - * Oriya - * 0b00 - 0b7f - * Tamil - * 0b80 - 0bff - * Telugu - * 0c00 - 0c7f - * Kannada - * 0c80 - 0cff - * Malayalam - * 0d00 - 0d7f - * Sinhala - * 0d80 - 0def - * Lao - * 0e80 - 0eff - * Tibetan - * 0f00 - 0fbf - * Myanmar - * 1000 - 109f - * Georgian - * 10a0 - 10ff - * Ethiopic - * 1200 - 137f - * Cherokee - * 13a0 - 13ff - * Canadian Aboriginal Syllabics - * 1400 - 167f - * Ogham - * 1680 - 169f - * Runic - * 16a0 - 16ff - * Khmer - * 1780 - 17ff - * Mongolian - * 1800 - 18af - * Misc - superscripts and subscripts - * 2070 - 209f - * Misc - Combining Diacritical Marks for Symbols - * 20d0 - 20ff - * Misc - Arrows - * 2190 - 21ff - * Misc - Mathematical Operators - * 2200 - 22ff - * Misc - Miscellaneous Technical - * 2300 - 23ff - * Misc - Control picture - * 2400 - 243f - * Misc - Optical character recognition - * 2440 - 2450 - * Misc - Enclose Alphanumerics - * 2460 - 24ff - * Misc - Box Drawing - * 2500 - 257f - * Misc - Block Elements - * 2580 - 259f - * Misc - Geometric Shapes - * 25a0 - 25ff - * Misc - Miscellaneous Symbols - * 2600 - 267f - * Misc - Dingbats - * 2700 - 27bf - * Misc - Braille Patterns - * 2800 - 28ff - * Yi Syllables - * a000 - a48f - * Yi radicals - * a490 - a4cf - * Alphabetic Presentation Forms - * fb00 - fb4f - * Misc - Combining half Marks - * fe20 - fe2f - * Misc - small form variants - * fe50 - fe6f - * Misc - Specials - * fff0 - ffff - *********************************************************************/ - -static const unsigned cNumSubTables = 9; -static const unsigned cSubTableSize = 16; - -static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] = -{ - { // table for X--- - cRangeTableBase+1, //u0xxx - cRangeTableBase+2, //u1xxx - cRangeTableBase+3, //u2xxx - cRangeSetCJK, //u3xxx - cRangeSetCJK, //u4xxx - cRangeSetCJK, //u5xxx - cRangeSetCJK, //u6xxx - cRangeSetCJK, //u7xxx - cRangeSetCJK, //u8xxx - cRangeSetCJK, //u9xxx - cRangeTableBase+4, //uaxxx - cRangeKorean, //ubxxx - cRangeKorean, //ucxxx - cRangeTableBase+5, //udxxx - cRangePrivate, //uexxx - cRangeTableBase+6 //ufxxx - }, - { //table for 0X-- - cRangeSetLatin, //u00xx - cRangeSetLatin, //u01xx - cRangeSetLatin, //u02xx - cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks - cRangeCyrillic, //u04xx - cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian - cRangeArabic, //u06xx - cRangeTertiaryTable, //u07xx - cRangeUnassigned, //u08xx - cRangeTertiaryTable, //u09xx - cRangeTertiaryTable, //u0axx - cRangeTertiaryTable, //u0bxx - cRangeTertiaryTable, //u0cxx - cRangeTertiaryTable, //u0dxx - cRangeTertiaryTable, //u0exx - cRangeTibetan, //u0fxx - }, - { //table for 1x-- - cRangeTertiaryTable, //u10xx - cRangeKorean, //u11xx - cRangeEthiopic, //u12xx - cRangeTertiaryTable, //u13xx - cRangeCanadian, //u14xx - cRangeCanadian, //u15xx - cRangeTertiaryTable, //u16xx - cRangeKhmer, //u17xx - cRangeMongolian, //u18xx - cRangeUnassigned, //u19xx - cRangeUnassigned, //u1axx - cRangeUnassigned, //u1bxx - cRangeUnassigned, //u1cxx - cRangeUnassigned, //u1dxx - cRangeSetLatin, //u1exx - cRangeGreek, //u1fxx - }, - { //table for 2x-- - cRangeSetLatin, //u20xx - cRangeSetLatin, //u21xx - cRangeMathOperators, //u22xx - cRangeMiscTechnical, //u23xx - cRangeControlOpticalEnclose, //u24xx - cRangeBoxBlockGeometrics, //u25xx - cRangeMiscSymbols, //u26xx - cRangeDingbats, //u27xx - cRangeBraillePattern, //u28xx - cRangeUnassigned, //u29xx - cRangeUnassigned, //u2axx - cRangeUnassigned, //u2bxx - cRangeUnassigned, //u2cxx - cRangeUnassigned, //u2dxx - cRangeSetCJK, //u2exx - cRangeSetCJK, //u2fxx - }, - { //table for ax-- - cRangeYi, //ua0xx - cRangeYi, //ua1xx - cRangeYi, //ua2xx - cRangeYi, //ua3xx - cRangeYi, //ua4xx - cRangeUnassigned, //ua5xx - cRangeUnassigned, //ua6xx - cRangeUnassigned, //ua7xx - cRangeUnassigned, //ua8xx - cRangeUnassigned, //ua9xx - cRangeUnassigned, //uaaxx - cRangeUnassigned, //uabxx - cRangeKorean, //uacxx - cRangeKorean, //uadxx - cRangeKorean, //uaexx - cRangeKorean, //uafxx - }, - { //table for dx-- - cRangeKorean, //ud0xx - cRangeKorean, //ud1xx - cRangeKorean, //ud2xx - cRangeKorean, //ud3xx - cRangeKorean, //ud4xx - cRangeKorean, //ud5xx - cRangeKorean, //ud6xx - cRangeKorean, //ud7xx - cRangeSurrogate, //ud8xx - cRangeSurrogate, //ud9xx - cRangeSurrogate, //udaxx - cRangeSurrogate, //udbxx - cRangeSurrogate, //udcxx - cRangeSurrogate, //uddxx - cRangeSurrogate, //udexx - cRangeSurrogate, //udfxx - }, - { // table for fx-- - cRangePrivate, //uf0xx - cRangePrivate, //uf1xx - cRangePrivate, //uf2xx - cRangePrivate, //uf3xx - cRangePrivate, //uf4xx - cRangePrivate, //uf5xx - cRangePrivate, //uf6xx - cRangePrivate, //uf7xx - cRangePrivate, //uf8xx - cRangeSetCJK, //uf9xx - cRangeSetCJK, //ufaxx - cRangeArabic, //ufbxx, includes alphabic presentation form - cRangeArabic, //ufcxx - cRangeArabic, //ufdxx - cRangeArabic, //ufexx, includes Combining half marks, - // CJK compatibility forms, - // CJK compatibility forms, - // small form variants - cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials - }, - { //table for 0x0500 - 0x05ff - cRangeCyrillic, //u050x - cRangeCyrillic, //u051x - cRangeCyrillic, //u052x - cRangeArmenian, //u053x - cRangeArmenian, //u054x - cRangeArmenian, //u055x - cRangeArmenian, //u056x - cRangeArmenian, //u057x - cRangeArmenian, //u058x - cRangeHebrew, //u059x - cRangeHebrew, //u05ax - cRangeHebrew, //u05bx - cRangeHebrew, //u05cx - cRangeHebrew, //u05dx - cRangeHebrew, //u05ex - cRangeHebrew, //u05fx - }, - { //table for 0xff00 - 0xffff - cRangeSetCJK, //uff0x, fullwidth latin - cRangeSetCJK, //uff1x, fullwidth latin - cRangeSetCJK, //uff2x, fullwidth latin - cRangeSetCJK, //uff3x, fullwidth latin - cRangeSetCJK, //uff4x, fullwidth latin - cRangeSetCJK, //uff5x, fullwidth latin - cRangeSetCJK, //uff6x, halfwidth katakana - cRangeSetCJK, //uff7x, halfwidth katakana - cRangeSetCJK, //uff8x, halfwidth katakana - cRangeSetCJK, //uff9x, halfwidth katakana - cRangeSetCJK, //uffax, halfwidth hangul jamo - cRangeSetCJK, //uffbx, halfwidth hangul jamo - cRangeSetCJK, //uffcx, halfwidth hangul jamo - cRangeSetCJK, //uffdx, halfwidth hangul jamo - cRangeSetCJK, //uffex, fullwidth symbols - cRangeSpecials, //ufffx, Specials - }, -}; - -// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) -// code points so that the number of entries in the tertiary range -// table for that range is obtained by dividing (0x1700 - 0x0700) by 128. -// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal -// syllabaries take multiple chunks and Ogham and Runic share a single chunk. -static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80); - -static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] = -{ //table for 0x0700 - 0x1600 - cRangeSyriac, //u070x - cRangeThaana, //u078x - cRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) - cRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) - cRangeDevanagari, //u090x - cRangeBengali, //u098x - cRangeGurmukhi, //u0a0x - cRangeGujarati, //u0a8x - cRangeOriya, //u0b0x - cRangeTamil, //u0b8x - cRangeTelugu, //u0c0x - cRangeKannada, //u0c8x - cRangeMalayalam, //u0d0x - cRangeSinhala, //u0d8x - cRangeThai, //u0e0x - cRangeLao, //u0e8x - cRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) - cRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) - cRangeMyanmar, //u100x - cRangeGeorgian, //u108x - cRangeKorean, //u110x place holder(resolved in the 2ndary tab.) - cRangeKorean, //u118x place holder(resolved in the 2ndary tab.) - cRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) - cRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) - cRangeEthiopic, //u130x - cRangeCherokee, //u138x - cRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) - cRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) - cRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) - cRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) - cRangeCanadian, //u160x - cRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic -}; - -// A two level index is almost enough for locating a range, with the -// exception of u03xx and u05xx. Since we don't really care about range for -// combining diacritical marks in our font application, they are -// not discriminated further. Future adoption of this method for other use -// should be aware of this limitation. The implementation can be extended if -// there is such a need. -// For Indic, Southeast Asian scripts and some other scripts between -// U+0700 and U+16FF, it's extended to the third level. -unsigned int findCharUnicodeRange(UChar32 ch) -{ - if (ch >= 0xFFFF) - return 0; - - unsigned int range; - - //search the first table - range = gUnicodeSubrangeTable[0][ch >> 12]; - - if (range < cRangeTableBase) - // we try to get a specific range - return range; - - // otherwise, we have one more table to look at - range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8]; - if (range < cRangeTableBase) - return range; - if (range < cRangeTertiaryTable) - return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4]; - - // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks - return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; -} - -const char* langGroupFromUnicodeRange(unsigned char unicodeRange) -{ - if (cRangeSpecificItemNum > unicodeRange) - return gUnicodeRangeToLangGroupTable[unicodeRange]; - return 0; -} - -} |