diff options
Diffstat (limited to 'WebCore/platform/text/mac')
-rw-r--r-- | WebCore/platform/text/mac/CharsetData.h | 37 | ||||
-rw-r--r-- | WebCore/platform/text/mac/ShapeArabic.c | 555 | ||||
-rw-r--r-- | WebCore/platform/text/mac/ShapeArabic.h | 44 | ||||
-rw-r--r-- | WebCore/platform/text/mac/StringImplMac.mm | 31 | ||||
-rw-r--r-- | WebCore/platform/text/mac/StringMac.mm | 41 | ||||
-rw-r--r-- | WebCore/platform/text/mac/TextBoundaries.mm | 54 | ||||
-rw-r--r-- | WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm | 72 | ||||
-rw-r--r-- | WebCore/platform/text/mac/TextCodecMac.cpp | 321 | ||||
-rw-r--r-- | WebCore/platform/text/mac/TextCodecMac.h | 65 | ||||
-rw-r--r-- | WebCore/platform/text/mac/character-sets.txt | 1868 | ||||
-rw-r--r-- | WebCore/platform/text/mac/mac-encodings.txt | 45 | ||||
-rwxr-xr-x | WebCore/platform/text/mac/make-charset-table.pl | 225 |
12 files changed, 3358 insertions, 0 deletions
diff --git a/WebCore/platform/text/mac/CharsetData.h b/WebCore/platform/text/mac/CharsetData.h new file mode 100644 index 0000000..458cecb --- /dev/null +++ b/WebCore/platform/text/mac/CharsetData.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2003, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +namespace WebCore { + + #define kTextEncodingISOLatinThai kCFStringEncodingISOLatinThai + + struct CharsetEntry { + const char* name; + ::TextEncoding encoding; + }; + + extern const CharsetEntry CharsetTable[]; + +} diff --git a/WebCore/platform/text/mac/ShapeArabic.c b/WebCore/platform/text/mac/ShapeArabic.c new file mode 100644 index 0000000..6dbc008 --- /dev/null +++ b/WebCore/platform/text/mac/ShapeArabic.c @@ -0,0 +1,555 @@ +/* +****************************************************************************** +* +* Copyright (C) 2000-2004, International Business Machines +* Corporation and others. All Rights Reserved. +* Copyright (C) 2007 Apple Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy of this +* software and associated documentation files (the "Software"), to deal in the Software +* without restriction, including without limitation the rights to use, copy, modify, +* merge, publish, distribute, and/or sell copies of the Software, and to permit persons +* to whom the Software is furnished to do so, provided that the above copyright notice(s) +* and this permission notice appear in all copies of the Software and that both the above +* copyright notice(s) and this permission notice appear in supporting documentation. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER +* OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR +* CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR +* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+* +* Except as contained in this notice, the name of a copyright holder shall not be used in +* advertising or otherwise to promote the sale, use or other dealings in this Software +* without prior written authorization of the copyright holder. +* +****************************************************************************** +* +* Arabic letter shaping implemented by Ayman Roshdy +*/ + +#include "config.h" + +#if USE(ATSUI) + +#include "ShapeArabic.h" + +#include <unicode/utypes.h> +#include <unicode/uchar.h> +#include <unicode/ustring.h> +#include <unicode/ushape.h> +#include <wtf/Assertions.h> + +/* + * ### TODO in general for letter shaping: + * - the letter shaping code is UTF-16-unaware; needs update + * + especially invertBuffer()?! + * - needs to handle the "Arabic Tail" that is used in some legacy codepages + * as a glyph fragment of wide-glyph letters + * + IBM Unicode conversion tables map it to U+200B (ZWSP) + * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms + */ + +/* definitions for Arabic letter shaping ------------------------------------ */ + +#define IRRELEVANT 4 +#define LAMTYPE 16 +#define ALEFTYPE 32 +#define LINKR 1 +#define LINKL 2 + +static const UChar IrrelevantPos[] = { + 0x0, 0x2, 0x4, 0x6, + 0x8, 0xA, 0xC, 0xE, +}; + +static const UChar araLink[178]= +{ + 1 + 32 + 256 * 0x11,/*0x0622*/ + 1 + 32 + 256 * 0x13,/*0x0623*/ + 1 + 256 * 0x15,/*0x0624*/ + 1 + 32 + 256 * 0x17,/*0x0625*/ + 1 + 2 + 256 * 0x19,/*0x0626*/ + 1 + 32 + 256 * 0x1D,/*0x0627*/ + 1 + 2 + 256 * 0x1F,/*0x0628*/ + 1 + 256 * 0x23,/*0x0629*/ + 1 + 2 + 256 * 0x25,/*0x062A*/ + 1 + 2 + 256 * 0x29,/*0x062B*/ + 1 + 2 + 256 * 0x2D,/*0x062C*/ + 1 + 2 + 256 * 0x31,/*0x062D*/ + 1 + 2 + 256 * 0x35,/*0x062E*/ + 1 + 256 * 0x39,/*0x062F*/ + 1 + 256 * 0x3B,/*0x0630*/ + 1 + 256 * 0x3D,/*0x0631*/ + 1 + 256 * 0x3F,/*0x0632*/ + 1 + 2 + 256 * 0x41,/*0x0633*/ + 1 + 2 + 256 * 0x45,/*0x0634*/ + 1 + 2 + 256 * 0x49,/*0x0635*/ + 1 + 2 + 256 * 0x4D,/*0x0636*/ + 1 
+ 2 + 256 * 0x51,/*0x0637*/ + 1 + 2 + 256 * 0x55,/*0x0638*/ + 1 + 2 + 256 * 0x59,/*0x0639*/ + 1 + 2 + 256 * 0x5D,/*0x063A*/ + 0, 0, 0, 0, 0, /*0x063B-0x063F*/ + 1 + 2, /*0x0640*/ + 1 + 2 + 256 * 0x61,/*0x0641*/ + 1 + 2 + 256 * 0x65,/*0x0642*/ + 1 + 2 + 256 * 0x69,/*0x0643*/ + 1 + 2 + 16 + 256 * 0x6D,/*0x0644*/ + 1 + 2 + 256 * 0x71,/*0x0645*/ + 1 + 2 + 256 * 0x75,/*0x0646*/ + 1 + 2 + 256 * 0x79,/*0x0647*/ + 1 + 256 * 0x7D,/*0x0648*/ + 1 + 256 * 0x7F,/*0x0649*/ + 1 + 2 + 256 * 0x81,/*0x064A*/ + 4, 4, 4, 4, /*0x064B-0x064E*/ + 4, 4, 4, 4, /*0x064F-0x0652*/ + 4, 4, 4, 0, 0, /*0x0653-0x0657*/ + 0, 0, 0, 0, /*0x0658-0x065B*/ + 1 + 256 * 0x85,/*0x065C*/ + 1 + 256 * 0x87,/*0x065D*/ + 1 + 256 * 0x89,/*0x065E*/ + 1 + 256 * 0x8B,/*0x065F*/ + 0, 0, 0, 0, 0, /*0x0660-0x0664*/ + 0, 0, 0, 0, 0, /*0x0665-0x0669*/ + 0, 0, 0, 0, 0, 0, /*0x066A-0x066F*/ + 4, /*0x0670*/ + 0, /*0x0671*/ + 1 + 32, /*0x0672*/ + 1 + 32, /*0x0673*/ + 0, /*0x0674*/ + 1 + 32, /*0x0675*/ + 1, 1, /*0x0676-0x0677*/ + 1+2, /*0x0678*/ + 1+2 + 256 * 0x16,/*0x0679*/ + 1+2 + 256 * 0x0E,/*0x067A*/ + 1+2 + 256 * 0x02,/*0x067B*/ + 1+2, 1+2, /*0x067C-0x067D*/ + 1+2 + 256 * 0x06,/*0x067E*/ + 1+2 + 256 * 0x12,/*0x067F*/ + 1+2 + 256 * 0x0A,/*0x0680*/ + 1+2, 1+2, /*0x0681-0x0682*/ + 1+2 + 256 * 0x26,/*0x0683*/ + 1+2 + 256 * 0x22,/*0x0684*/ + 1+2, /*0x0685*/ + 1+2 + 256 * 0x2A,/*0x0686*/ + 1+2 + 256 * 0x2E,/*0x0687*/ + 1 + 256 * 0x38,/*0x0688*/ + 1, 1, 1, /*0x0689-0x068B*/ + 1 + 256 * 0x34,/*0x068C*/ + 1 + 256 * 0x32,/*0x068D*/ + 1 + 256 * 0x36,/*0x068E*/ + 1, 1, /*0x068F-0x0690*/ + 1 + 256 * 0x3C,/*0x0691*/ + 1, 1, 1, 1, 1, 1, /*0x0692-0x0697*/ + 1 + 256 * 0x3A,/*0x0698*/ + 1, /*0x0699*/ + 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x069A-0x069F*/ + 1+2, 1+2, 1+2, 1+2, /*0x06A0-0x06A3*/ + 1+2 + 256 * 0x2E,/*0x06A4*/ + 1+2, /*0x06A5*/ + 1+2 + 256 * 0x1E,/*0x06A6*/ + 1+2, 1+2, /*0x06A7-0x06A8*/ + 1+2 + 256 * 0x3E,/*0x06A9*/ + 1+2, 1+2, 1+2, /*0x06AA-0x06AC*/ + 1+2 + 256 * 0x83,/*0x06AD*/ + 1+2, /*0x06AE*/ + 1+2 + 256 * 0x42,/*0x06AF*/ + 
1+2, /*0x06B0*/ + 1+2 + 256 * 0x4A,/*0x06B1*/ + 1+2, /*0x06B2*/ + 1+2 + 256 * 0x46,/*0x06B3*/ + 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06B4-0x06B9*/ + 1+2, /*0x06BA*/ // FIXME: Seems to have a final form + 1+2 + 256 * 0x50,/*0x06BB*/ + 1+2, 1+2, /*0x06BC-0x06BD*/ + 1+2 + 256 * 0x5A,/*0x06BE*/ + 1+2, /*0x06BF*/ + 1, /*0x06C0*/ + 1+2 + 256 * 0x56,/*0x06C1*/ + 1+2, /*0x06C2*/ + 1, 1, /*0x06C3-0x06C4*/ + 1 + 256 * 0x90,/*0x06C5*/ + 1 + 256 * 0x89,/*0x06C6*/ + 1 + 256 * 0x87,/*0x06C7*/ + 1 + 256 * 0x8B,/*0x06C8*/ + 1 + 256 * 0x92,/*0x06C9*/ + 1, /*0x06CA*/ + 1 + 256 * 0x8E,/*0x06CB*/ + 1+2 + 256 * 0xAC,/*0x06CC*/ + 1, /*0x06CD*/ + 1+2, /*0x06CE*/ + 1, /*0x06CF*/ + 1+2 + 256 * 0x94,/*0x06D0*/ + 1+2, /*0x06D1*/ + 1 + 256 * 0x5E,/*0x06D2*/ + 1 + 256 * 0x60 /*0x06D3*/ +}; + +static const UChar presLink[141]= +{ + 1 + 2, /*0xFE70*/ + 1 + 2, /*0xFE71*/ + 1 + 2, 0, 1+ 2, 0, 1+ 2, /*0xFE72-0xFE76*/ + 1 + 2, /*0xFE77*/ + 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE78-0xFE81*/ + 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE82-0xFE85*/ + 0, 0 + 32, 1 + 32, 0 + 32, /*0xFE86-0xFE89*/ + 1 + 32, 0, 1, 0 + 32, /*0xFE8A-0xFE8D*/ + 1 + 32, 0, 2, 1 + 2, /*0xFE8E-0xFE91*/ + 1, 0 + 32, 1 + 32, 0, /*0xFE92-0xFE95*/ + 2, 1 + 2, 1, 0, /*0xFE96-0xFE99*/ + 1, 0, 2, 1 + 2, /*0xFE9A-0xFE9D*/ + 1, 0, 2, 1 + 2, /*0xFE9E-0xFEA1*/ + 1, 0, 2, 1 + 2, /*0xFEA2-0xFEA5*/ + 1, 0, 2, 1 + 2, /*0xFEA6-0xFEA9*/ + 1, 0, 2, 1 + 2, /*0xFEAA-0xFEAD*/ + 1, 0, 1, 0, /*0xFEAE-0xFEB1*/ + 1, 0, 1, 0, /*0xFEB2-0xFEB5*/ + 1, 0, 2, 1+2, /*0xFEB6-0xFEB9*/ + 1, 0, 2, 1+2, /*0xFEBA-0xFEBD*/ + 1, 0, 2, 1+2, /*0xFEBE-0xFEC1*/ + 1, 0, 2, 1+2, /*0xFEC2-0xFEC5*/ + 1, 0, 2, 1+2, /*0xFEC6-0xFEC9*/ + 1, 0, 2, 1+2, /*0xFECA-0xFECD*/ + 1, 0, 2, 1+2, /*0xFECE-0xFED1*/ + 1, 0, 2, 1+2, /*0xFED2-0xFED5*/ + 1, 0, 2, 1+2, /*0xFED6-0xFED9*/ + 1, 0, 2, 1+2, /*0xFEDA-0xFEDD*/ + 1, 0, 2, 1+2, /*0xFEDE-0xFEE1*/ + 1, 0 + 16, 2 + 16, 1 + 2 +16, /*0xFEE2-0xFEE5*/ + 1 + 16, 0, 2, 1+2, /*0xFEE6-0xFEE9*/ + 1, 0, 2, 1+2, /*0xFEEA-0xFEED*/ + 1, 0, 2, 1+2, /*0xFEEE-0xFEF1*/ + 1, 0, 1, 
0, /*0xFEF2-0xFEF5*/ + 1, 0, 2, 1+2, /*0xFEF6-0xFEF9*/ + 1, 0, 1, 0, /*0xFEFA-0xFEFD*/ + 1, 0, 1, 0, + 1 +}; + +static const UChar convertFEto06[] = +{ +/***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/ +/*FE7*/ 0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F, 0x650, 0x650, 0x651, 0x651, 0x652, 0x652, +/*FE8*/ 0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626, 0x626, 0x626, 0x626, 0x627, 0x627, 0x628, +/*FE9*/ 0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B, 0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C, +/*FEA*/ 0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F, 0x62F, 0x630, 0x630, 0x631, 0x631, 0x632, +/*FEB*/ 0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635, 0x635, 0x635, 0x635, 0x636, 0x636, 0x636, +/*FEC*/ 0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639, 0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A, +/*FED*/ 0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643, 0x643, 0x643, 0x643, 0x644, 0x644, 0x644, +/*FEE*/ 0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647, 0x647, 0x647, 0x647, 0x648, 0x648, 0x649, +/*FEF*/ 0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E, 0x65E, 0x65F, 0x65F +}; + +static const UChar shapeTable[4][4][4]= +{ + { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,1} }, + { {0,0,2,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} }, + { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,3} }, + { {0,0,1,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} } +}; + +/* + *Name : changeLamAlef + *Function : Converts the Alef characters into an equivalent + * LamAlef location in the 0x06xx Range, this is an + * intermediate stage in the operation of the program + * later it'll be converted into the 0xFExx LamAlefs + * in the shaping function. 
+ */ +static UChar +changeLamAlef(UChar ch) { + + switch(ch) { + case 0x0622 : + return(0x065C); + break; + case 0x0623 : + return(0x065D); + break; + case 0x0625 : + return(0x065E); + break; + case 0x0627 : + return(0x065F); + break; + default : + return(0); + break; + } +} + +/* + *Name : specialChar + *Function : Special Arabic characters need special handling in the shapeUnicode + * function, this function returns 1 or 2 for these special characters + */ +static int32_t +specialChar(UChar ch) { + + if( (ch>0x0621 && ch<0x0626)||(ch==0x0627)||(ch>0x062e && ch<0x0633)|| + (ch>0x0647 && ch<0x064a)||(ch==0x0629) ) { + return (1); + } + else + if( ch>=0x064B && ch<= 0x0652 ) + return (2); + else + if( (ch>=0x0653 && ch<= 0x0655) || ch == 0x0670 || + (ch>=0xFE70 && ch<= 0xFE7F) ) + return (3); + else + return (0); +} + +/* + *Name : getLink + *Function : Resolves the link between the characters as + * Arabic characters have four forms : + * Isolated, Initial, Middle and Final Form + */ +static UChar +getLink(UChar ch) { + + if(ch >= 0x0622 && ch <= 0x06D3) { + return(araLink[ch-0x0622]); + } else if(ch == 0x200D) { + return(3); + } else if(ch >= 0x206D && ch <= 0x206F) { + return(4); + } else if(ch >= 0xFE70 && ch <= 0xFEFC) { + return(presLink[ch-0xFE70]); + } else { + return(0); + } +} + +/* + *Name : isTashkeelChar + *Function : Returns 1 for Tashkeel characters else return 0 + */ +static int32_t +isTashkeelChar(UChar ch) { + + if( ch>=0x064B && ch<= 0x0652 ) + return (1); + else + return (0); +} + +/* + *Name : shapeUnicode + *Function : Converts an Arabic Unicode buffer in 06xx Range into a shaped + * arabic Unicode buffer in FExx Range + */ +static int32_t +shapeUnicode(UChar *dest, int32_t sourceLength, + int32_t destSize,uint32_t options, + UErrorCode *pErrorCode, + int tashkeelFlag) { + + int32_t i, iend; + int32_t prevPos, lastPos,Nx, Nw; + unsigned int Shape; + int32_t flag; + int32_t lamalef_found = 0; + UChar prevLink = 0, lastLink = 0, currLink, 
nextLink = 0; + UChar wLamalef; + + /* + * Converts the input buffer from FExx Range into 06xx Range + * to make sure that all characters are in the 06xx range + * even the lamalef is converted to the special region in + * the 06xx range + */ + for (i = 0; i < sourceLength; i++) { + UChar inputChar = dest[i]; + if ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) { + dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ; + } + } + + /* sets the index to the end of the buffer, together with the step point to -1 */ + i = 0; + iend = sourceLength; + + /* + * This function resolves the link between the characters . + * Arabic characters have four forms : + * Isolated Form, Initial Form, Middle Form and Final Form + */ + currLink = getLink(dest[i]); + + prevPos = i; + lastPos = i; + Nx = sourceLength + 2, Nw = 0; + + while (i != iend) { + /* If high byte of currLink > 0 then more than one shape */ + if ((currLink & 0xFF00) > 0 || isTashkeelChar(dest[i])) { + Nw = i + 1; + while (Nx >= sourceLength) { /* we need to know about next char */ + if(Nw == iend) { + nextLink = 0; + Nx = -1; + } else { + nextLink = getLink(dest[Nw]); + if((nextLink & IRRELEVANT) == 0) { + Nx = Nw; + } else { + Nw = Nw + 1; + } + } + } + + if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) { + lamalef_found = 1; + wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */ + if ( wLamalef != 0) { + dest[i] = ' '; /* The default case is to drop the Alef and replace */ + dest[lastPos] =wLamalef; /* it by a space. */ + i=lastPos; + } + lastLink = prevLink; + currLink = getLink(wLamalef); + } + /* + * get the proper shape according to link ability of neighbors + * and of character; depends on the order of the shapes + * (isolated, initial, middle, final) in the compatibility area + */ + flag = specialChar(dest[i]); + + Shape = shapeTable[nextLink & (LINKR + LINKL)] + [lastLink & (LINKR + LINKL)] + [currLink & (LINKR + LINKL)]; + + if (flag == 1) { + Shape = (Shape == 1 || Shape == 3) ? 
1 : 0; + } + else + if(flag == 2) { + if( (lastLink & LINKL) && (nextLink & LINKR) && (tashkeelFlag == 1) && + dest[i] != 0x064C && dest[i] != 0x064D ) { + Shape = 1; + if( (nextLink&ALEFTYPE) == ALEFTYPE && (lastLink&LAMTYPE) == LAMTYPE ) + Shape = 0; + } + else { + Shape = 0; + } + } + + if(flag == 2) { + dest[i] = 0xFE70 + IrrelevantPos[(dest[i] - 0x064B)] + Shape; + } + else + dest[i] = (UChar)((dest[i] < 0x0670 ? 0xFE70 : 0xFB50) + (currLink >> 8) + Shape); + } + + /* move one notch forward */ + if ((currLink & IRRELEVANT) == 0) { + prevLink = lastLink; + lastLink = currLink; + prevPos = lastPos; + lastPos = i; + } + + i++; + if (i == Nx) { + currLink = nextLink; + Nx = sourceLength + 2; + } + else if(i != iend) { + currLink = getLink(dest[i]); + } + } + + destSize = sourceLength; + + return destSize; +} + +int32_t shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destCapacity, uint32_t options, UErrorCode *pErrorCode) { + int32_t destLength; + + /* usual error checking */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */ + if( source==NULL || sourceLength<-1 || + (dest==NULL && destCapacity!=0) || destCapacity<0 || + options>=U_SHAPE_DIGIT_TYPE_RESERVED || + (options&U_SHAPE_DIGITS_MASK)>=U_SHAPE_DIGITS_RESERVED + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* determine the source length */ + if(sourceLength==-1) { + sourceLength=u_strlen(source); + } + if(sourceLength==0) { + return 0; + } + + /* check that source and destination do not overlap */ + if( dest!=NULL && + ((source<=dest && dest<source+sourceLength) || + (dest<=source && source<dest+destCapacity)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if((options&U_SHAPE_LETTERS_MASK)!=U_SHAPE_LETTERS_NOOP) { + int32_t outputSize = sourceLength; + + /* calculate destination size */ + /* TODO: do we ever need to do this pure 
preflighting? */ + ASSERT((options&U_SHAPE_LENGTH_MASK) != U_SHAPE_LENGTH_GROW_SHRINK); + + if(outputSize>destCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return outputSize; + } + + /* Start of Arabic letter shaping part */ + memcpy(dest, source, sourceLength*U_SIZEOF_UCHAR); + + ASSERT((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL); + + switch(options&U_SHAPE_LETTERS_MASK) { + case U_SHAPE_LETTERS_SHAPE : + /* Call the shaping function with tashkeel flag == 1 */ + destLength = shapeUnicode(dest,sourceLength,destCapacity,options,pErrorCode,1); + break; + case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED : + /* Call the shaping function with tashkeel flag == 0 */ + destLength = shapeUnicode(dest,sourceLength,destCapacity,options,pErrorCode,0); + break; + case U_SHAPE_LETTERS_UNSHAPE : + ASSERT_NOT_REACHED(); + break; + default : + /* will never occur because of validity checks above */ + destLength = 0; + break; + } + + /* End of Arabic letter shaping part */ + } else + ASSERT_NOT_REACHED(); + + ASSERT((options & U_SHAPE_DIGITS_MASK) == U_SHAPE_DIGITS_NOOP); + + return sourceLength; +} + +#endif // USE(ATSUI) diff --git a/WebCore/platform/text/mac/ShapeArabic.h b/WebCore/platform/text/mac/ShapeArabic.h new file mode 100644 index 0000000..8aa577d --- /dev/null +++ b/WebCore/platform/text/mac/ShapeArabic.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. 
``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ShapeArabic_h +#define ShapeArabic_h + +#if USE(ATSUI) + +#include <unicode/ushape.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destCapacity, uint32_t options, UErrorCode *pErrorCode); + +#ifdef __cplusplus +} +#endif + +#endif // USE(ATSUI) +#endif // ShapeArabic_h diff --git a/WebCore/platform/text/mac/StringImplMac.mm b/WebCore/platform/text/mac/StringImplMac.mm new file mode 100644 index 0000000..2180b94 --- /dev/null +++ b/WebCore/platform/text/mac/StringImplMac.mm @@ -0,0 +1,31 @@ +/** + * Copyright (C) 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include "config.h" +#include "StringImpl.h" + +namespace WebCore { + +StringImpl::operator NSString *() +{ + return [NSString stringWithCharacters:m_data length:m_length]; +} + +} diff --git a/WebCore/platform/text/mac/StringMac.mm b/WebCore/platform/text/mac/StringMac.mm new file mode 100644 index 0000000..77942ea --- /dev/null +++ b/WebCore/platform/text/mac/StringMac.mm @@ -0,0 +1,41 @@ +/** + * Copyright (C) 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ * + */ + +#include "config.h" +#include "PlatformString.h" + +namespace WebCore { + +String::String(NSString* str) +{ + if (!str) + return; + + CFIndex size = CFStringGetLength(reinterpret_cast<CFStringRef>(str)); + if (size == 0) + m_impl = StringImpl::empty(); + else { + Vector<UChar, 1024> buffer(size); + CFStringGetCharacters(reinterpret_cast<CFStringRef>(str), CFRangeMake(0, size), buffer.data()); + m_impl = StringImpl::create(buffer.data(), size); + } +} + +} diff --git a/WebCore/platform/text/mac/TextBoundaries.mm b/WebCore/platform/text/mac/TextBoundaries.mm new file mode 100644 index 0000000..ff1dfd2 --- /dev/null +++ b/WebCore/platform/text/mac/TextBoundaries.mm @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. 
OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#import "config.h"
#import "TextBoundaries.h"

namespace WebCore {

// Finds the word around `position` in the `len` characters at `chars`, using
// -[NSAttributedString doubleClickAtIndex:]. The range is reported through
// *start (first index) and *end (one past the last index).
//
// `position` is clamped into [0, len - 1]. For an empty buffer there is no
// valid index to ask about, so *start and *end are both set to 0.
void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
{
    if (len <= 0) {
        // Previously this path passed index len - 1 == -1 to doubleClickAtIndex:.
        *start = 0;
        *end = 0;
        return;
    }

    int index = position;
    if (index >= len)
        index = len - 1;
    if (index < 0)
        index = 0;

    // The string only borrows `chars` (NoCopy, freeWhenDone:NO), so both
    // objects are released before returning.
    NSString* string = [[NSString alloc] initWithCharactersNoCopy:const_cast<unichar*>(chars)
                                                            length:len freeWhenDone:NO];
    NSAttributedString* attr = [[NSAttributedString alloc] initWithString:string];
    NSRange range = [attr doubleClickAtIndex:index];
    [attr release];
    [string release];

    *start = range.location;
    *end = range.location + range.length;
}

// Returns the result of -[NSAttributedString nextWordFromIndex:forward:] for
// `position` in the `len` characters at `chars`. `position` is passed through
// unchanged.
int findNextWordFromIndex(const UChar* chars, int len, int position, bool forward)
{
    NSString* string = [[NSString alloc] initWithCharactersNoCopy:const_cast<unichar*>(chars)
                                                            length:len freeWhenDone:NO];
    NSAttributedString* attr = [[NSAttributedString alloc] initWithString:string];
    int result = [attr nextWordFromIndex:position forward:forward];
    [attr release];
    [string release];
    return result;
}

}
diff --git a/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm b/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm new file mode 100644 index 0000000..92983eb --- /dev/null +++ b/WebCore/platform/text/mac/TextBreakIteratorInternalICUMac.mm @@ -0,0 +1,72 @@
/*
 * Copyright (C) 2007 Apple Inc. All rights reserved.
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include "config.h" +#include "TextBreakIteratorInternalICU.h" + +namespace WebCore { + +static const int maxLocaleStringLength = 32; + +// This code was swiped from the CarbonCore UnicodeUtilities. One change from that is to use the empty +// string instead of the "old locale model" as the ultimate fallback. This change is per the UnicodeUtilities +// engineer. +static void getTextBreakLocale(char localeStringBuffer[maxLocaleStringLength]) +{ + // Empty string means "root locale", which is what we use if we can't use a pref. + + // We get the parts string from AppleTextBreakLocale pref. + // If that fails then look for the first language in the AppleLanguages pref. + CFStringRef prefLocaleStr = (CFStringRef)CFPreferencesCopyValue(CFSTR("AppleTextBreakLocale"), + kCFPreferencesAnyApplication, kCFPreferencesCurrentUser, kCFPreferencesAnyHost); + if (!prefLocaleStr) { + CFArrayRef appleLangArr = (CFArrayRef)CFPreferencesCopyValue(CFSTR("AppleLanguages"), + kCFPreferencesAnyApplication, kCFPreferencesCurrentUser, kCFPreferencesAnyHost); + if (appleLangArr) { + // Take the topmost language. Retain so that we can blindly release later. 
+ prefLocaleStr = (CFStringRef)CFArrayGetValueAtIndex(appleLangArr, 0); + if (prefLocaleStr) + CFRetain(prefLocaleStr); + CFRelease(appleLangArr); + } + } + if (prefLocaleStr) { + // Canonicalize pref string in case it is not in the canonical format. + CFStringRef canonLocaleCFStr = CFLocaleCreateCanonicalLanguageIdentifierFromString(kCFAllocatorDefault, prefLocaleStr); + if (canonLocaleCFStr) { + CFStringGetCString(canonLocaleCFStr, localeStringBuffer, maxLocaleStringLength, kCFStringEncodingASCII); + CFRelease(canonLocaleCFStr); + } + CFRelease(prefLocaleStr); + } +} + +const char* currentTextBreakLocaleID() +{ + static char localeStringBuffer[maxLocaleStringLength]; + static bool gotTextBreakLocale = false; + if (!gotTextBreakLocale) { + getTextBreakLocale(localeStringBuffer); + gotTextBreakLocale = true; + } + return localeStringBuffer; +} + +} diff --git a/WebCore/platform/text/mac/TextCodecMac.cpp b/WebCore/platform/text/mac/TextCodecMac.cpp new file mode 100644 index 0000000..ac1f0fb --- /dev/null +++ b/WebCore/platform/text/mac/TextCodecMac.cpp @@ -0,0 +1,321 @@ +/* + * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "TextCodecMac.h" + +#include "CString.h" +#include "CharacterNames.h" +#include "CharsetData.h" +#include "PlatformString.h" +#include <wtf/Assertions.h> + +using std::auto_ptr; +using std::min; + +namespace WebCore { + +// We need to keep this because ICU doesn't support some of the encodings that we need: +// <http://bugs.webkit.org/show_bug.cgi?id=4195>. + +const size_t ConversionBufferSize = 16384; + +static TECObjectRef cachedConverterTEC; +static TECTextEncodingID cachedConverterEncoding = invalidEncoding; + +void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar) +{ + TECTextEncodingID lastEncoding = invalidEncoding; + const char* lastName = 0; + + for (size_t i = 0; CharsetTable[i].name; ++i) { + if (CharsetTable[i].encoding != lastEncoding) { + lastEncoding = CharsetTable[i].encoding; + lastName = CharsetTable[i].name; + } + registrar(CharsetTable[i].name, lastName); + } +} + +static auto_ptr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData) +{ + return auto_ptr<TextCodec>(new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData))); +} + +void TextCodecMac::registerCodecs(TextCodecRegistrar registrar) +{ + TECTextEncodingID lastEncoding = invalidEncoding; + + for (size_t i = 0; CharsetTable[i].name; ++i) + if (CharsetTable[i].encoding != lastEncoding) { + registrar(CharsetTable[i].name, newTextCodecMac, 
&CharsetTable[i].encoding); + lastEncoding = CharsetTable[i].encoding; + } +} + +TextCodecMac::TextCodecMac(TECTextEncodingID encoding) + : m_encoding(encoding) + // encode() replaces every backslash with m_backslashAsCurrencySymbol, so it must never be left + // indeterminate. Defaulting to a backslash makes that replacement a no-op. + // NOTE(review): encodings whose 0x5C is a currency sign presumably need that sign here instead - confirm. + , m_backslashAsCurrencySymbol('\\') + , m_numBufferedBytes(0) + , m_converterTEC(0) +{ +} + +TextCodecMac::~TextCodecMac() +{ + releaseTECConverter(); +} + +void TextCodecMac::releaseTECConverter() const +{ + if (m_converterTEC) { + if (cachedConverterTEC != 0) + TECDisposeConverter(cachedConverterTEC); + cachedConverterTEC = m_converterTEC; + cachedConverterEncoding = m_encoding; + m_converterTEC = 0; + } +} + +OSStatus TextCodecMac::createTECConverter() const +{ + bool cachedEncodingEqual = cachedConverterEncoding == m_encoding; + cachedConverterEncoding = invalidEncoding; + + if (cachedEncodingEqual && cachedConverterTEC) { + m_converterTEC = cachedConverterTEC; + cachedConverterTEC = 0; + TECClearConverterContextInfo(m_converterTEC); + } else { + OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding, + CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat)); + if (status) + return status; + + TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask); + } + + return noErr; +} + +OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, + void *outputBuffer, int outputBufferLength, int& outputLength) +{ + OSStatus status; + unsigned long bytesRead = 0; + unsigned long bytesWritten = 0; + + if (m_numBufferedBytes != 0) { + // Finish converting a partial character that's in our buffer. + + // First, fill the partial character buffer with as many bytes as are available. + ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes)); + const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes; + const int bytesToPutInBuffer = MIN(spaceInBuffer, inputBufferLength); + ASSERT(bytesToPutInBuffer != 0); + memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer); + + // Now, do a conversion on the buffer. 
+ status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead, + reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); + ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer); + + if (status == kTECPartialCharErr && bytesRead == 0) { + // Handle the case where the partial character was not converted. + if (bytesToPutInBuffer >= spaceInBuffer) { + LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes)); + m_numBufferedBytes = 0; + status = kTECUnmappableElementErr; // should never happen, but use this error code + } else { + // Tell the caller we read all the source bytes and keep them in the buffer. + m_numBufferedBytes += bytesToPutInBuffer; + bytesRead = bytesToPutInBuffer; + status = noErr; + } + } else { + // We are done with the partial character buffer. + // Also, we have read some of the bytes from the main buffer. + if (bytesRead > m_numBufferedBytes) { + bytesRead -= m_numBufferedBytes; + } else { + LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr"); + bytesRead = 0; + } + m_numBufferedBytes = 0; + if (status == kTECPartialCharErr) { + // While there may be a partial character problem in the small buffer, + // we have to try again and not get confused and think there is a partial + // character problem in the large buffer. + status = noErr; + } + } + } else { + status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead, + static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); + ASSERT(static_cast<int>(bytesRead) <= inputBufferLength); + } + + // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus. 
+ if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) + status = kTECOutputBufferFullStatus; + + inputLength = bytesRead; + outputLength = bytesWritten; + return status; +} + +String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) +{ + // Get a converter for the passed-in encoding. + if (!m_converterTEC && createTECConverter() != noErr) + return String(); + + Vector<UChar> result; + + const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes); + int sourceLength = length; + bool bufferWasFull = false; + UniChar buffer[ConversionBufferSize]; + + while ((sourceLength || bufferWasFull) && !sawError) { + int bytesRead = 0; + int bytesWritten = 0; + OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten); + ASSERT(bytesRead <= sourceLength); + sourcePointer += bytesRead; + sourceLength -= bytesRead; + + switch (status) { + case noErr: + case kTECOutputBufferFullStatus: + break; + case kTextMalformedInputErr: + case kTextUndefinedElementErr: + // FIXME: Put FFFD character into the output string in this case? + TECClearConverterContextInfo(m_converterTEC); + if (stopOnError) { + sawError = true; + break; + } + if (sourceLength) { + sourcePointer += 1; + sourceLength -= 1; + } + break; + case kTECPartialCharErr: { + // Put the partial character into the buffer. 
+ ASSERT(m_numBufferedBytes == 0); + // Compare against the capacity of the staging array (m_bufferedBytes), not the size of its fill counter. + const int bufferSize = sizeof(m_bufferedBytes); + if (sourceLength < bufferSize) { + memcpy(m_bufferedBytes, sourcePointer, sourceLength); + m_numBufferedBytes = sourceLength; + } else { + LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength); + } + sourceLength = 0; + break; + } + default: + sawError = true; + return String(); + } + + ASSERT(!(bytesWritten % sizeof(UChar))); + result.append(buffer, bytesWritten / sizeof(UChar)); + + bufferWasFull = status == kTECOutputBufferFullStatus; + } + + if (flush) { + unsigned long bytesWritten = 0; + TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten); + ASSERT(!(bytesWritten % sizeof(UChar))); + result.append(buffer, bytesWritten / sizeof(UChar)); + } + + String resultString = String::adopt(result); + + // <rdar://problem/3225472> + // Simplified Chinese pages use the code A3A0 to mean "full-width space". + // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice. + // To work around, just change all occurrences of U+E5E5 to U+3000 (ideographic space). + if (m_encoding == kCFStringEncodingGB_18030_2000) + resultString.replace(0xE5E5, ideographicSpace); + + return resultString; +} + +CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling) +{ + // FIXME: We should really use TEC here instead of CFString for consistency with the other direction. + + // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign. + // Encoding will change the yen sign back into a backslash. + String copy(characters, length); + copy.replace('\\', m_backslashAsCurrencySymbol); + CFStringRef cfs = copy.createCFString(); + + CFIndex startPos = 0; + CFIndex charactersLeft = CFStringGetLength(cfs); + Vector<char> result; + size_t size = 0; + UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' 
: 0; + while (charactersLeft > 0) { + CFRange range = CFRangeMake(startPos, charactersLeft); + CFIndex bufferLength; + CFStringGetBytes(cfs, range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength); + + result.grow(size + bufferLength); + unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size); + CFIndex charactersConverted = CFStringGetBytes(cfs, range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength); + size += bufferLength; + + if (charactersConverted != charactersLeft) { + unsigned badChar = CFStringGetCharacterAtIndex(cfs, startPos + charactersConverted); + ++charactersConverted; + if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate + UniChar low = CFStringGetCharacterAtIndex(cfs, startPos + charactersConverted); + if ((low & 0xFC00) == 0xDC00) { // is low surrogate + badChar <<= 10; + badChar += low; + badChar += 0x10000 - (0xD800 << 10) - 0xDC00; + ++charactersConverted; + } + } + UnencodableReplacementArray entity; + int entityLength = getUnencodableReplacement(badChar, handling, entity); + result.grow(size + entityLength); + memcpy(result.data() + size, entity, entityLength); + size += entityLength; + } + + startPos += charactersConverted; + charactersLeft -= charactersConverted; + } + CFRelease(cfs); + return CString(result.data(), size); +} + +} // namespace WebCore diff --git a/WebCore/platform/text/mac/TextCodecMac.h b/WebCore/platform/text/mac/TextCodecMac.h new file mode 100644 index 0000000..aee4a97 --- /dev/null +++ b/WebCore/platform/text/mac/TextCodecMac.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. + * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TextCodecMac_h +#define TextCodecMac_h + +#include "TextCodec.h" +#include <CoreServices/CoreServices.h> + +namespace WebCore { + + typedef ::TextEncoding TECTextEncodingID; + const TECTextEncodingID invalidEncoding = kCFStringEncodingInvalidId; + + class TextCodecMac : public TextCodec { + public: + static void registerEncodingNames(EncodingNameRegistrar); + static void registerCodecs(TextCodecRegistrar); + + explicit TextCodecMac(TECTextEncodingID); + virtual ~TextCodecMac(); + + virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); + virtual CString encode(const UChar*, size_t length, UnencodableHandling); + + private: + OSStatus decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, + void* outputBuffer, int outputBufferLength, int& outputLength); + + OSStatus createTECConverter() const; + void releaseTECConverter() const; + + TECTextEncodingID m_encoding; + UChar m_backslashAsCurrencySymbol; + unsigned m_numBufferedBytes; + unsigned char m_bufferedBytes[16]; // bigger than any single multi-byte character + mutable TECObjectRef m_converterTEC; + }; + +} // namespace WebCore + +#endif // TextCodecMac_h diff --git a/WebCore/platform/text/mac/character-sets.txt b/WebCore/platform/text/mac/character-sets.txt new file mode 100644 index 0000000..475e78e --- /dev/null +++ b/WebCore/platform/text/mac/character-sets.txt @@ -0,0 +1,1868 @@ + +=================================================================== +CHARACTER SETS + +(last updated 28 January 2005) + +These are the official names for character sets that may be used in +the Internet and may be referred to in Internet documentation. These +names are expressed in ANSI_X3.4-1968 which is commonly called +US-ASCII or simply ASCII. The character set most commonly use in the +Internet and used especially in protocol standards is US-ASCII, this +is strongly encouraged. The use of the name US-ASCII is also +encouraged. 
+ +The character set names may be up to 40 characters taken from the +printable characters of US-ASCII. However, no distinction is made +between use of upper and lower case letters. + +The MIBenum value is a unique value for use in MIBs to identify coded +character sets. + +The value space for MIBenum values has been divided into three +regions. The first region (3-999) consists of coded character sets +that have been standardized by some standard setting organization. +This region is intended for standards that do not have subset +implementations. The second region (1000-1999) is for the Unicode and +ISO/IEC 10646 coded character sets together with a specification of a +(set of) sub-repertoires that may occur. The third region (>1999) is +intended for vendor specific coded character sets. + + Assigned MIB enum Numbers + ------------------------- + 0-2 Reserved + 3-999 Set By Standards Organizations + 1000-1999 Unicode / 10646 + 2000-2999 Vendor + +The aliases that start with "cs" have been added for use with the +IANA-CHARSET-MIB as originally defined in RFC3808, and as currently +maintained by IANA at http://www.iana.org/assignments/ianacharset-mib. +Note that the ianacharset-mib needs to be kept in sync with this +registry. These aliases that start with "cs" contain the standard +numbers along with suggestive names in order to facilitate applications +that want to display the names in user interfaces. The "cs" stands +for character set and is provided for applications that need a lower +case first letter but want to use mixed case thereafter that cannot +contain any special characters, such as underbar ("_") and dash ("-"). + +If the character set is from an ISO standard, its cs alias is the ISO +standard number or name. 
If the character set is not from an ISO +standard, but is registered with ISO (IPSJ/ITSCJ is the current ISO +Registration Authority), the ISO Registry number is specified as +ISOnnn followed by letters suggestive of the name or standards number +of the code set. When a national or international standard is +revised, the year of revision is added to the cs alias of the new +character set entry in the IANA Registry in order to distinguish the +revised character set from the original character set. + + +Character Set Reference +------------- --------- + +Name: ANSI_X3.4-1968 [RFC1345,KXS2] +MIBenum: 3 +Source: ECMA registry +Alias: iso-ir-6 +Alias: ANSI_X3.4-1986 +Alias: ISO_646.irv:1991 +Alias: ASCII +Alias: ISO646-US +Alias: US-ASCII (preferred MIME name) +Alias: us +Alias: IBM367 +Alias: cp367 +Alias: csASCII + +Name: ISO-10646-UTF-1 +MIBenum: 27 +Source: Universal Transfer Format (1), this is the multibyte + encoding, that subsets ASCII-7. It does not have byte + ordering issues. +Alias: csISO10646UTF1 + +Name: ISO_646.basic:1983 [RFC1345,KXS2] +MIBenum: 28 +Source: ECMA registry +Alias: ref +Alias: csISO646basic1983 + +Name: INVARIANT [RFC1345,KXS2] +MIBenum: 29 +Alias: csINVARIANT + +Name: ISO_646.irv:1983 [RFC1345,KXS2] +MIBenum: 30 +Source: ECMA registry +Alias: iso-ir-2 +Alias: irv +Alias: csISO2IntlRefVersion + +Name: BS_4730 [RFC1345,KXS2] +MIBenum: 20 +Source: ECMA registry +Alias: iso-ir-4 +Alias: ISO646-GB +Alias: gb +Alias: uk +Alias: csISO4UnitedKingdom + +Name: NATS-SEFI [RFC1345,KXS2] +MIBenum: 31 +Source: ECMA registry +Alias: iso-ir-8-1 +Alias: csNATSSEFI + +Name: NATS-SEFI-ADD [RFC1345,KXS2] +MIBenum: 32 +Source: ECMA registry +Alias: iso-ir-8-2 +Alias: csNATSSEFIADD + +Name: NATS-DANO [RFC1345,KXS2] +MIBenum: 33 +Source: ECMA registry +Alias: iso-ir-9-1 +Alias: csNATSDANO + +Name: NATS-DANO-ADD [RFC1345,KXS2] +MIBenum: 34 +Source: ECMA registry +Alias: iso-ir-9-2 +Alias: csNATSDANOADD + +Name: SEN_850200_B [RFC1345,KXS2] +MIBenum: 35 +Source: 
ECMA registry +Alias: iso-ir-10 +Alias: FI +Alias: ISO646-FI +Alias: ISO646-SE +Alias: se +Alias: csISO10Swedish + +Name: SEN_850200_C [RFC1345,KXS2] +MIBenum: 21 +Source: ECMA registry +Alias: iso-ir-11 +Alias: ISO646-SE2 +Alias: se2 +Alias: csISO11SwedishForNames + +Name: KS_C_5601-1987 [RFC1345,KXS2] +MIBenum: 36 +Source: ECMA registry +Alias: iso-ir-149 +Alias: KS_C_5601-1989 +Alias: KSC_5601 +Alias: korean +Alias: csKSC56011987 + +Name: ISO-2022-KR (preferred MIME name) [RFC1557,Choi] +MIBenum: 37 +Source: RFC-1557 (see also KS_C_5601-1987) +Alias: csISO2022KR + +Name: EUC-KR (preferred MIME name) [RFC1557,Choi] +MIBenum: 38 +Source: RFC-1557 (see also KS_C_5861-1992) +Alias: csEUCKR + +Name: ISO-2022-JP (preferred MIME name) [RFC1468,Murai] +MIBenum: 39 +Source: RFC-1468 (see also RFC-2237) +Alias: csISO2022JP + +Name: ISO-2022-JP-2 (preferred MIME name) [RFC1554,Ohta] +MIBenum: 40 +Source: RFC-1554 +Alias: csISO2022JP2 + +Name: ISO-2022-CN [RFC1922] +MIBenum: 104 +Source: RFC-1922 + +Name: ISO-2022-CN-EXT [RFC1922] +MIBenum: 105 +Source: RFC-1922 + +Name: JIS_C6220-1969-jp [RFC1345,KXS2] +MIBenum: 41 +Source: ECMA registry +Alias: JIS_C6220-1969 +Alias: iso-ir-13 +Alias: katakana +Alias: x0201-7 +Alias: csISO13JISC6220jp + +Name: JIS_C6220-1969-ro [RFC1345,KXS2] +MIBenum: 42 +Source: ECMA registry +Alias: iso-ir-14 +Alias: jp +Alias: ISO646-JP +Alias: csISO14JISC6220ro + +Name: IT [RFC1345,KXS2] +MIBenum: 22 +Source: ECMA registry +Alias: iso-ir-15 +Alias: ISO646-IT +Alias: csISO15Italian + +Name: PT [RFC1345,KXS2] +MIBenum: 43 +Source: ECMA registry +Alias: iso-ir-16 +Alias: ISO646-PT +Alias: csISO16Portuguese + +Name: ES [RFC1345,KXS2] +MIBenum: 23 +Source: ECMA registry +Alias: iso-ir-17 +Alias: ISO646-ES +Alias: csISO17Spanish + +Name: greek7-old [RFC1345,KXS2] +MIBenum: 44 +Source: ECMA registry +Alias: iso-ir-18 +Alias: csISO18Greek7Old + +Name: latin-greek [RFC1345,KXS2] +MIBenum: 45 +Source: ECMA registry +Alias: iso-ir-19 +Alias: csISO19LatinGreek + 
+Name: DIN_66003 [RFC1345,KXS2] +MIBenum: 24 +Source: ECMA registry +Alias: iso-ir-21 +Alias: de +Alias: ISO646-DE +Alias: csISO21German + +Name: NF_Z_62-010_(1973) [RFC1345,KXS2] +MIBenum: 46 +Source: ECMA registry +Alias: iso-ir-25 +Alias: ISO646-FR1 +Alias: csISO25French + +Name: Latin-greek-1 [RFC1345,KXS2] +MIBenum: 47 +Source: ECMA registry +Alias: iso-ir-27 +Alias: csISO27LatinGreek1 + +Name: ISO_5427 [RFC1345,KXS2] +MIBenum: 48 +Source: ECMA registry +Alias: iso-ir-37 +Alias: csISO5427Cyrillic + +Name: JIS_C6226-1978 [RFC1345,KXS2] +MIBenum: 49 +Source: ECMA registry +Alias: iso-ir-42 +Alias: csISO42JISC62261978 + +Name: BS_viewdata [RFC1345,KXS2] +MIBenum: 50 +Source: ECMA registry +Alias: iso-ir-47 +Alias: csISO47BSViewdata + +Name: INIS [RFC1345,KXS2] +MIBenum: 51 +Source: ECMA registry +Alias: iso-ir-49 +Alias: csISO49INIS + +Name: INIS-8 [RFC1345,KXS2] +MIBenum: 52 +Source: ECMA registry +Alias: iso-ir-50 +Alias: csISO50INIS8 + +Name: INIS-cyrillic [RFC1345,KXS2] +MIBenum: 53 +Source: ECMA registry +Alias: iso-ir-51 +Alias: csISO51INISCyrillic + +Name: ISO_5427:1981 [RFC1345,KXS2] +MIBenum: 54 +Source: ECMA registry +Alias: iso-ir-54 +Alias: ISO5427Cyrillic1981 + +Name: ISO_5428:1980 [RFC1345,KXS2] +MIBenum: 55 +Source: ECMA registry +Alias: iso-ir-55 +Alias: csISO5428Greek + +Name: GB_1988-80 [RFC1345,KXS2] +MIBenum: 56 +Source: ECMA registry +Alias: iso-ir-57 +Alias: cn +Alias: ISO646-CN +Alias: csISO57GB1988 + +Name: GB_2312-80 [RFC1345,KXS2] +MIBenum: 57 +Source: ECMA registry +Alias: iso-ir-58 +Alias: chinese +Alias: csISO58GB231280 + +Name: NS_4551-1 [RFC1345,KXS2] +MIBenum: 25 +Source: ECMA registry +Alias: iso-ir-60 +Alias: ISO646-NO +Alias: no +Alias: csISO60DanishNorwegian +Alias: csISO60Norwegian1 + +Name: NS_4551-2 [RFC1345,KXS2] +MIBenum: 58 +Source: ECMA registry +Alias: ISO646-NO2 +Alias: iso-ir-61 +Alias: no2 +Alias: csISO61Norwegian2 + +Name: NF_Z_62-010 [RFC1345,KXS2] +MIBenum: 26 +Source: ECMA registry +Alias: iso-ir-69 +Alias: 
ISO646-FR +Alias: fr +Alias: csISO69French + +Name: videotex-suppl [RFC1345,KXS2] +MIBenum: 59 +Source: ECMA registry +Alias: iso-ir-70 +Alias: csISO70VideotexSupp1 + +Name: PT2 [RFC1345,KXS2] +MIBenum: 60 +Source: ECMA registry +Alias: iso-ir-84 +Alias: ISO646-PT2 +Alias: csISO84Portuguese2 + +Name: ES2 [RFC1345,KXS2] +MIBenum: 61 +Source: ECMA registry +Alias: iso-ir-85 +Alias: ISO646-ES2 +Alias: csISO85Spanish2 + +Name: MSZ_7795.3 [RFC1345,KXS2] +MIBenum: 62 +Source: ECMA registry +Alias: iso-ir-86 +Alias: ISO646-HU +Alias: hu +Alias: csISO86Hungarian + +Name: JIS_C6226-1983 [RFC1345,KXS2] +MIBenum: 63 +Source: ECMA registry +Alias: iso-ir-87 +Alias: x0208 +Alias: JIS_X0208-1983 +Alias: csISO87JISX0208 + +Name: greek7 [RFC1345,KXS2] +MIBenum: 64 +Source: ECMA registry +Alias: iso-ir-88 +Alias: csISO88Greek7 + +Name: ASMO_449 [RFC1345,KXS2] +MIBenum: 65 +Source: ECMA registry +Alias: ISO_9036 +Alias: arabic7 +Alias: iso-ir-89 +Alias: csISO89ASMO449 + +Name: iso-ir-90 [RFC1345,KXS2] +MIBenum: 66 +Source: ECMA registry +Alias: csISO90 + +Name: JIS_C6229-1984-a [RFC1345,KXS2] +MIBenum: 67 +Source: ECMA registry +Alias: iso-ir-91 +Alias: jp-ocr-a +Alias: csISO91JISC62291984a + +Name: JIS_C6229-1984-b [RFC1345,KXS2] +MIBenum: 68 +Source: ECMA registry +Alias: iso-ir-92 +Alias: ISO646-JP-OCR-B +Alias: jp-ocr-b +Alias: csISO92JISC62991984b + +Name: JIS_C6229-1984-b-add [RFC1345,KXS2] +MIBenum: 69 +Source: ECMA registry +Alias: iso-ir-93 +Alias: jp-ocr-b-add +Alias: csISO93JIS62291984badd + +Name: JIS_C6229-1984-hand [RFC1345,KXS2] +MIBenum: 70 +Source: ECMA registry +Alias: iso-ir-94 +Alias: jp-ocr-hand +Alias: csISO94JIS62291984hand + +Name: JIS_C6229-1984-hand-add [RFC1345,KXS2] +MIBenum: 71 +Source: ECMA registry +Alias: iso-ir-95 +Alias: jp-ocr-hand-add +Alias: csISO95JIS62291984handadd + +Name: JIS_C6229-1984-kana [RFC1345,KXS2] +MIBenum: 72 +Source: ECMA registry +Alias: iso-ir-96 +Alias: csISO96JISC62291984kana + +Name: ISO_2033-1983 [RFC1345,KXS2] +MIBenum: 73 
+Source: ECMA registry +Alias: iso-ir-98 +Alias: e13b +Alias: csISO2033 + +Name: ANSI_X3.110-1983 [RFC1345,KXS2] +MIBenum: 74 +Source: ECMA registry +Alias: iso-ir-99 +Alias: CSA_T500-1983 +Alias: NAPLPS +Alias: csISO99NAPLPS + +Name: ISO_8859-1:1987 [RFC1345,KXS2] +MIBenum: 4 +Source: ECMA registry +Alias: iso-ir-100 +Alias: ISO_8859-1 +Alias: ISO-8859-1 (preferred MIME name) +Alias: latin1 +Alias: l1 +Alias: IBM819 +Alias: CP819 +Alias: csISOLatin1 + +Name: ISO_8859-2:1987 [RFC1345,KXS2] +MIBenum: 5 +Source: ECMA registry +Alias: iso-ir-101 +Alias: ISO_8859-2 +Alias: ISO-8859-2 (preferred MIME name) +Alias: latin2 +Alias: l2 +Alias: csISOLatin2 + +Name: T.61-7bit [RFC1345,KXS2] +MIBenum: 75 +Source: ECMA registry +Alias: iso-ir-102 +Alias: csISO102T617bit + +Name: T.61-8bit [RFC1345,KXS2] +MIBenum: 76 +Alias: T.61 +Source: ECMA registry +Alias: iso-ir-103 +Alias: csISO103T618bit + +Name: ISO_8859-3:1988 [RFC1345,KXS2] +MIBenum: 6 +Source: ECMA registry +Alias: iso-ir-109 +Alias: ISO_8859-3 +Alias: ISO-8859-3 (preferred MIME name) +Alias: latin3 +Alias: l3 +Alias: csISOLatin3 + +Name: ISO_8859-4:1988 [RFC1345,KXS2] +MIBenum: 7 +Source: ECMA registry +Alias: iso-ir-110 +Alias: ISO_8859-4 +Alias: ISO-8859-4 (preferred MIME name) +Alias: latin4 +Alias: l4 +Alias: csISOLatin4 + +Name: ECMA-cyrillic +MIBenum: 77 +Source: ISO registry (formerly ECMA registry) + http://www.itscj.ipsj.jp/ISO-IR/111.pdf +Alias: iso-ir-111 +Alias: KOI8-E +Alias: csISO111ECMACyrillic + +Name: CSA_Z243.4-1985-1 [RFC1345,KXS2] +MIBenum: 78 +Source: ECMA registry +Alias: iso-ir-121 +Alias: ISO646-CA +Alias: csa7-1 +Alias: ca +Alias: csISO121Canadian1 + +Name: CSA_Z243.4-1985-2 [RFC1345,KXS2] +MIBenum: 79 +Source: ECMA registry +Alias: iso-ir-122 +Alias: ISO646-CA2 +Alias: csa7-2 +Alias: csISO122Canadian2 + +Name: CSA_Z243.4-1985-gr [RFC1345,KXS2] +MIBenum: 80 +Source: ECMA registry +Alias: iso-ir-123 +Alias: csISO123CSAZ24341985gr + +Name: ISO_8859-6:1987 [RFC1345,KXS2] +MIBenum: 9 +Source: 
ECMA registry +Alias: iso-ir-127 +Alias: ISO_8859-6 +Alias: ISO-8859-6 (preferred MIME name) +Alias: ECMA-114 +Alias: ASMO-708 +Alias: arabic +Alias: csISOLatinArabic + +Name: ISO_8859-6-E [RFC1556,IANA] +MIBenum: 81 +Source: RFC1556 +Alias: csISO88596E +Alias: ISO-8859-6-E (preferred MIME name) + +Name: ISO_8859-6-I [RFC1556,IANA] +MIBenum: 82 +Source: RFC1556 +Alias: csISO88596I +Alias: ISO-8859-6-I (preferred MIME name) + +Name: ISO_8859-7:1987 [RFC1947,RFC1345,KXS2] +MIBenum: 10 +Source: ECMA registry +Alias: iso-ir-126 +Alias: ISO_8859-7 +Alias: ISO-8859-7 (preferred MIME name) +Alias: ELOT_928 +Alias: ECMA-118 +Alias: greek +Alias: greek8 +Alias: csISOLatinGreek + +Name: T.101-G2 [RFC1345,KXS2] +MIBenum: 83 +Source: ECMA registry +Alias: iso-ir-128 +Alias: csISO128T101G2 + +Name: ISO_8859-8:1988 [RFC1345,KXS2] +MIBenum: 11 +Source: ECMA registry +Alias: iso-ir-138 +Alias: ISO_8859-8 +Alias: ISO-8859-8 (preferred MIME name) +Alias: hebrew +Alias: csISOLatinHebrew + +Name: ISO_8859-8-E [RFC1556,Nussbacher] +MIBenum: 84 +Source: RFC1556 +Alias: csISO88598E +Alias: ISO-8859-8-E (preferred MIME name) + +Name: ISO_8859-8-I [RFC1556,Nussbacher] +MIBenum: 85 +Source: RFC1556 +Alias: csISO88598I +Alias: ISO-8859-8-I (preferred MIME name) + +Name: CSN_369103 [RFC1345,KXS2] +MIBenum: 86 +Source: ECMA registry +Alias: iso-ir-139 +Alias: csISO139CSN369103 + +Name: JUS_I.B1.002 [RFC1345,KXS2] +MIBenum: 87 +Source: ECMA registry +Alias: iso-ir-141 +Alias: ISO646-YU +Alias: js +Alias: yu +Alias: csISO141JUSIB1002 + +Name: ISO_6937-2-add [RFC1345,KXS2] +MIBenum: 14 +Source: ECMA registry and ISO 6937-2:1983 +Alias: iso-ir-142 +Alias: csISOTextComm + +Name: IEC_P27-1 [RFC1345,KXS2] +MIBenum: 88 +Source: ECMA registry +Alias: iso-ir-143 +Alias: csISO143IECP271 + +Name: ISO_8859-5:1988 [RFC1345,KXS2] +MIBenum: 8 +Source: ECMA registry +Alias: iso-ir-144 +Alias: ISO_8859-5 +Alias: ISO-8859-5 (preferred MIME name) +Alias: cyrillic +Alias: csISOLatinCyrillic + +Name: 
JUS_I.B1.003-serb [RFC1345,KXS2] +MIBenum: 89 +Source: ECMA registry +Alias: iso-ir-146 +Alias: serbian +Alias: csISO146Serbian + +Name: JUS_I.B1.003-mac [RFC1345,KXS2] +MIBenum: 90 +Source: ECMA registry +Alias: macedonian +Alias: iso-ir-147 +Alias: csISO147Macedonian + +Name: ISO_8859-9:1989 [RFC1345,KXS2] +MIBenum: 12 +Source: ECMA registry +Alias: iso-ir-148 +Alias: ISO_8859-9 +Alias: ISO-8859-9 (preferred MIME name) +Alias: latin5 +Alias: l5 +Alias: csISOLatin5 + +Name: greek-ccitt [RFC1345,KXS2] +MIBenum: 91 +Source: ECMA registry +Alias: iso-ir-150 +Alias: csISO150 +Alias: csISO150GreekCCITT + +Name: NC_NC00-10:81 [RFC1345,KXS2] +MIBenum: 92 +Source: ECMA registry +Alias: cuba +Alias: iso-ir-151 +Alias: ISO646-CU +Alias: csISO151Cuba + +Name: ISO_6937-2-25 [RFC1345,KXS2] +MIBenum: 93 +Source: ECMA registry +Alias: iso-ir-152 +Alias: csISO6937Add + +Name: GOST_19768-74 [RFC1345,KXS2] +MIBenum: 94 +Source: ECMA registry +Alias: ST_SEV_358-88 +Alias: iso-ir-153 +Alias: csISO153GOST1976874 + +Name: ISO_8859-supp [RFC1345,KXS2] +MIBenum: 95 +Source: ECMA registry +Alias: iso-ir-154 +Alias: latin1-2-5 +Alias: csISO8859Supp + +Name: ISO_10367-box [RFC1345,KXS2] +MIBenum: 96 +Source: ECMA registry +Alias: iso-ir-155 +Alias: csISO10367Box + +Name: ISO-8859-10 (preferred MIME name) [RFC1345,KXS2] +MIBenum: 13 +Source: ECMA registry +Alias: iso-ir-157 +Alias: l6 +Alias: ISO_8859-10:1992 +Alias: csISOLatin6 +Alias: latin6 + +Name: latin-lap [RFC1345,KXS2] +MIBenum: 97 +Source: ECMA registry +Alias: lap +Alias: iso-ir-158 +Alias: csISO158Lap + +Name: JIS_X0212-1990 [RFC1345,KXS2] +MIBenum: 98 +Source: ECMA registry +Alias: x0212 +Alias: iso-ir-159 +Alias: csISO159JISX02121990 + +Name: DS_2089 [RFC1345,KXS2] +MIBenum: 99 +Source: Danish Standard, DS 2089, February 1974 +Alias: DS2089 +Alias: ISO646-DK +Alias: dk +Alias: csISO646Danish + +Name: us-dk [RFC1345,KXS2] +MIBenum: 100 +Alias: csUSDK + +Name: dk-us [RFC1345,KXS2] +MIBenum: 101 +Alias: csDKUS + +Name: JIS_X0201 
[RFC1345,KXS2] +MIBenum: 15 +Source: JIS X 0201-1976. One byte only, this is equivalent to + JIS/Roman (similar to ASCII) plus eight-bit half-width + Katakana +Alias: X0201 +Alias: csHalfWidthKatakana + +Name: KSC5636 [RFC1345,KXS2] +MIBenum: 102 +Alias: ISO646-KR +Alias: csKSC5636 + +Name: ISO-10646-UCS-2 +MIBenum: 1000 +Source: the 2-octet Basic Multilingual Plane, aka Unicode + this needs to specify network byte order: the standard + does not specify (it is a 16-bit integer space) +Alias: csUnicode + +Name: ISO-10646-UCS-4 +MIBenum: 1001 +Source: the full code space. (same comment about byte order, + these are 31-bit numbers. +Alias: csUCS4 + +Name: DEC-MCS [RFC1345,KXS2] +MIBenum: 2008 +Source: VAX/VMS User's Manual, + Order Number: AI-Y517A-TE, April 1986. +Alias: dec +Alias: csDECMCS + +Name: hp-roman8 [HP-PCL5,RFC1345,KXS2] +MIBenum: 2004 +Source: LaserJet IIP Printer User's Manual, + HP part no 33471-90901, Hewlet-Packard, June 1989. +Alias: roman8 +Alias: r8 +Alias: csHPRoman8 + +Name: macintosh [RFC1345,KXS2] +MIBenum: 2027 +Source: The Unicode Standard ver1.0, ISBN 0-201-56788-1, Oct 1991 +Alias: mac +Alias: csMacintosh + +Name: IBM037 [RFC1345,KXS2] +MIBenum: 2028 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp037 +Alias: ebcdic-cp-us +Alias: ebcdic-cp-ca +Alias: ebcdic-cp-wt +Alias: ebcdic-cp-nl +Alias: csIBM037 + +Name: IBM038 [RFC1345,KXS2] +MIBenum: 2029 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-INT +Alias: cp038 +Alias: csIBM038 + +Name: IBM273 [RFC1345,KXS2] +MIBenum: 2030 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP273 +Alias: csIBM273 + +Name: IBM274 [RFC1345,KXS2] +MIBenum: 2031 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-BE +Alias: CP274 +Alias: csIBM274 + +Name: IBM275 [RFC1345,KXS2] +MIBenum: 2032 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: EBCDIC-BR +Alias: cp275 +Alias: csIBM275 + +Name: IBM277 [RFC1345,KXS2] +MIBenum: 2033 
+Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: EBCDIC-CP-DK +Alias: EBCDIC-CP-NO +Alias: csIBM277 + +Name: IBM278 [RFC1345,KXS2] +MIBenum: 2034 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP278 +Alias: ebcdic-cp-fi +Alias: ebcdic-cp-se +Alias: csIBM278 + +Name: IBM280 [RFC1345,KXS2] +MIBenum: 2035 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP280 +Alias: ebcdic-cp-it +Alias: csIBM280 + +Name: IBM281 [RFC1345,KXS2] +MIBenum: 2036 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: EBCDIC-JP-E +Alias: cp281 +Alias: csIBM281 + +Name: IBM284 [RFC1345,KXS2] +MIBenum: 2037 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP284 +Alias: ebcdic-cp-es +Alias: csIBM284 + +Name: IBM285 [RFC1345,KXS2] +MIBenum: 2038 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP285 +Alias: ebcdic-cp-gb +Alias: csIBM285 + +Name: IBM290 [RFC1345,KXS2] +MIBenum: 2039 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: cp290 +Alias: EBCDIC-JP-kana +Alias: csIBM290 + +Name: IBM297 [RFC1345,KXS2] +MIBenum: 2040 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp297 +Alias: ebcdic-cp-fr +Alias: csIBM297 + +Name: IBM420 [RFC1345,KXS2] +MIBenum: 2041 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990, + IBM NLS RM p 11-11 +Alias: cp420 +Alias: ebcdic-cp-ar1 +Alias: csIBM420 + +Name: IBM423 [RFC1345,KXS2] +MIBenum: 2042 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp423 +Alias: ebcdic-cp-gr +Alias: csIBM423 + +Name: IBM424 [RFC1345,KXS2] +MIBenum: 2043 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp424 +Alias: ebcdic-cp-he +Alias: csIBM424 + +Name: IBM437 [RFC1345,KXS2] +MIBenum: 2011 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp437 +Alias: 437 +Alias: csPC8CodePage437 + +Name: IBM500 [RFC1345,KXS2] +MIBenum: 2044 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP500 +Alias: ebcdic-cp-be +Alias: ebcdic-cp-ch +Alias: csIBM500 + +Name: IBM775 [HP-PCL5] 
+MIBenum: 2087 +Source: HP PCL 5 Comparison Guide (P/N 5021-0329) pp B-13, 1996 +Alias: cp775 +Alias: csPC775Baltic + +Name: IBM850 [RFC1345,KXS2] +MIBenum: 2009 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp850 +Alias: 850 +Alias: csPC850Multilingual + +Name: IBM851 [RFC1345,KXS2] +MIBenum: 2045 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp851 +Alias: 851 +Alias: csIBM851 + +Name: IBM852 [RFC1345,KXS2] +MIBenum: 2010 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp852 +Alias: 852 +Alias: csPCp852 + +Name: IBM855 [RFC1345,KXS2] +MIBenum: 2046 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp855 +Alias: 855 +Alias: csIBM855 + +Name: IBM857 [RFC1345,KXS2] +MIBenum: 2047 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp857 +Alias: 857 +Alias: csIBM857 + +Name: IBM860 [RFC1345,KXS2] +MIBenum: 2048 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp860 +Alias: 860 +Alias: csIBM860 + +Name: IBM861 [RFC1345,KXS2] +MIBenum: 2049 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp861 +Alias: 861 +Alias: cp-is +Alias: csIBM861 + +Name: IBM862 [RFC1345,KXS2] +MIBenum: 2013 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp862 +Alias: 862 +Alias: csPC862LatinHebrew + +Name: IBM863 [RFC1345,KXS2] +MIBenum: 2050 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp863 +Alias: 863 +Alias: csIBM863 + +Name: IBM864 [RFC1345,KXS2] +MIBenum: 2051 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp864 +Alias: csIBM864 + +Name: IBM865 [RFC1345,KXS2] +MIBenum: 2052 +Source: IBM DOS 3.3 Ref (Abridged), 94X9575 (Feb 1987) +Alias: cp865 +Alias: 865 +Alias: csIBM865 + +Name: IBM866 [Pond] +MIBenum: 2086 +Source: IBM NLDG Volume 2 (SE09-8002-03) August 1994 +Alias: cp866 +Alias: 866 +Alias: csIBM866 + +Name: IBM868 [RFC1345,KXS2] +MIBenum: 2053 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP868 +Alias: cp-ar +Alias: csIBM868 + +Name: IBM869 
[RFC1345,KXS2] +MIBenum: 2054 +Source: IBM Keyboard layouts and code pages, PN 07G4586 June 1991 +Alias: cp869 +Alias: 869 +Alias: cp-gr +Alias: csIBM869 + +Name: IBM870 [RFC1345,KXS2] +MIBenum: 2055 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP870 +Alias: ebcdic-cp-roece +Alias: ebcdic-cp-yu +Alias: csIBM870 + +Name: IBM871 [RFC1345,KXS2] +MIBenum: 2056 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP871 +Alias: ebcdic-cp-is +Alias: csIBM871 + +Name: IBM880 [RFC1345,KXS2] +MIBenum: 2057 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp880 +Alias: EBCDIC-Cyrillic +Alias: csIBM880 + +Name: IBM891 [RFC1345,KXS2] +MIBenum: 2058 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp891 +Alias: csIBM891 + +Name: IBM903 [RFC1345,KXS2] +MIBenum: 2059 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp903 +Alias: csIBM903 + +Name: IBM904 [RFC1345,KXS2] +MIBenum: 2060 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: cp904 +Alias: 904 +Alias: csIBBM904 + +Name: IBM905 [RFC1345,KXS2] +MIBenum: 2061 +Source: IBM 3174 Character Set Ref, GA27-3831-02, March 1990 +Alias: CP905 +Alias: ebcdic-cp-tr +Alias: csIBM905 + +Name: IBM918 [RFC1345,KXS2] +MIBenum: 2062 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP918 +Alias: ebcdic-cp-ar2 +Alias: csIBM918 + +Name: IBM1026 [RFC1345,KXS2] +MIBenum: 2063 +Source: IBM NLS RM Vol2 SE09-8002-01, March 1990 +Alias: CP1026 +Alias: csIBM1026 + +Name: EBCDIC-AT-DE [RFC1345,KXS2] +MIBenum: 2064 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csIBMEBCDICATDE + +Name: EBCDIC-AT-DE-A [RFC1345,KXS2] +MIBenum: 2065 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICATDEA + +Name: EBCDIC-CA-FR [RFC1345,KXS2] +MIBenum: 2066 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICCAFR + +Name: EBCDIC-DK-NO [RFC1345,KXS2] +MIBenum: 2067 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: 
csEBCDICDKNO + +Name: EBCDIC-DK-NO-A [RFC1345,KXS2] +MIBenum: 2068 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICDKNOA + +Name: EBCDIC-FI-SE [RFC1345,KXS2] +MIBenum: 2069 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFISE + +Name: EBCDIC-FI-SE-A [RFC1345,KXS2] +MIBenum: 2070 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFISEA + +Name: EBCDIC-FR [RFC1345,KXS2] +MIBenum: 2071 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICFR + +Name: EBCDIC-IT [RFC1345,KXS2] +MIBenum: 2072 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICIT + +Name: EBCDIC-PT [RFC1345,KXS2] +MIBenum: 2073 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICPT + +Name: EBCDIC-ES [RFC1345,KXS2] +MIBenum: 2074 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICES + +Name: EBCDIC-ES-A [RFC1345,KXS2] +MIBenum: 2075 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICESA + +Name: EBCDIC-ES-S [RFC1345,KXS2] +MIBenum: 2076 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICESS + +Name: EBCDIC-UK [RFC1345,KXS2] +MIBenum: 2077 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICUK + +Name: EBCDIC-US [RFC1345,KXS2] +MIBenum: 2078 +Source: IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 +Alias: csEBCDICUS + +Name: UNKNOWN-8BIT [RFC1428] +MIBenum: 2079 +Alias: csUnknown8BiT + +Name: MNEMONIC [RFC1345,KXS2] +MIBenum: 2080 +Source: RFC 1345, also known as "mnemonic+ascii+38" +Alias: csMnemonic + +Name: MNEM [RFC1345,KXS2] +MIBenum: 2081 +Source: RFC 1345, also known as "mnemonic+ascii+8200" +Alias: csMnem + +Name: VISCII [RFC1456] +MIBenum: 2082 +Source: RFC 1456 +Alias: csVISCII + +Name: VIQR [RFC1456] +MIBenum: 2083 +Source: RFC 1456 +Alias: csVIQR + +Name: KOI8-R (preferred MIME name) [RFC1489] +MIBenum: 2084 +Source: RFC 
1489, based on GOST-19768-74, ISO-6937/8, + INIS-Cyrillic, ISO-5427. +Alias: csKOI8R + +Name: KOI8-U [RFC2319] +MIBenum: 2088 +Source: RFC 2319 + +Name: IBM00858 +MIBenum: 2089 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM00858) [Mahdi] +Alias: CCSID00858 +Alias: CP00858 +Alias: PC-Multilingual-850+euro + +Name: IBM00924 +MIBenum: 2090 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM00924) [Mahdi] +Alias: CCSID00924 +Alias: CP00924 +Alias: ebcdic-Latin9--euro + +Name: IBM01140 +MIBenum: 2091 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01140) [Mahdi] +Alias: CCSID01140 +Alias: CP01140 +Alias: ebcdic-us-37+euro + +Name: IBM01141 +MIBenum: 2092 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01141) [Mahdi] +Alias: CCSID01141 +Alias: CP01141 +Alias: ebcdic-de-273+euro + +Name: IBM01142 +MIBenum: 2093 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01142) [Mahdi] +Alias: CCSID01142 +Alias: CP01142 +Alias: ebcdic-dk-277+euro +Alias: ebcdic-no-277+euro + +Name: IBM01143 +MIBenum: 2094 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01143) [Mahdi] +Alias: CCSID01143 +Alias: CP01143 +Alias: ebcdic-fi-278+euro +Alias: ebcdic-se-278+euro + +Name: IBM01144 +MIBenum: 2095 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01144) [Mahdi] +Alias: CCSID01144 +Alias: CP01144 +Alias: ebcdic-it-280+euro + +Name: IBM01145 +MIBenum: 2096 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01145) [Mahdi] +Alias: CCSID01145 +Alias: CP01145 +Alias: ebcdic-es-284+euro + +Name: IBM01146 +MIBenum: 2097 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01146) [Mahdi] +Alias: CCSID01146 +Alias: CP01146 +Alias: ebcdic-gb-285+euro + +Name: IBM01147 +MIBenum: 2098 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01147) [Mahdi] +Alias: CCSID01147 +Alias: CP01147 +Alias: ebcdic-fr-297+euro + +Name: IBM01148 +MIBenum: 2099 +Source: IBM 
See (http://www.iana.org/assignments/charset-reg/IBM01148) [Mahdi] +Alias: CCSID01148 +Alias: CP01148 +Alias: ebcdic-international-500+euro + +Name: IBM01149 +MIBenum: 2100 +Source: IBM See (http://www.iana.org/assignments/charset-reg/IBM01149) [Mahdi] +Alias: CCSID01149 +Alias: CP01149 +Alias: ebcdic-is-871+euro + +Name: Big5-HKSCS [Yick] +MIBenum: 2101 +Source: See (http://www.iana.org/assignments/charset-reg/Big5-HKSCS) +Alias: None + +Name: IBM1047 [Robrigado] +MIBenum: 2102 +Source: IBM1047 (EBCDIC Latin 1/Open Systems) +http://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf +Alias: IBM-1047 + +Name: PTCP154 [Uskov] +MIBenum: 2103 +Source: See (http://www.iana.org/assignments/charset-reg/PTCP154) +Alias: csPTCP154 +Alias: PT154 +Alias: CP154 +Alias: Cyrillic-Asian + +Name: Amiga-1251 +MIBenum: 2104 +Source: See (http://www.amiga.ultranet.ru/Amiga-1251.html) +Alias: Ami1251 +Alias: Amiga1251 +Alias: Ami-1251 +(Aliases are provided for historical reasons and should not be used) + [Malyshev] + +Name: KOI7-switched +MIBenum: 2105 +Source: See <http://www.iana.org/assignments/charset-reg/KOI7-switched> +Aliases: None + +Name: UNICODE-1-1 [RFC1641] +MIBenum: 1010 +Source: RFC 1641 +Alias: csUnicode11 + +Name: SCSU +MIBenum: 1011 +Source: SCSU See (http://www.iana.org/assignments/charset-reg/SCSU) [Scherer] +Alias: None + +Name: UTF-7 [RFC2152] +MIBenum: 1012 +Source: RFC 2152 +Alias: None + +Name: UTF-16BE [RFC2781] +MIBenum: 1013 +Source: RFC 2781 +Alias: None + +Name: UTF-16LE [RFC2781] +MIBenum: 1014 +Source: RFC 2781 +Alias: None + +Name: UTF-16 [RFC2781] +MIBenum: 1015 +Source: RFC 2781 +Alias: None + +Name: CESU-8 [Phipps] +MIBenum: 1016 +Source: <http://www.unicode.org/unicode/reports/tr26> +Alias: csCESU-8 + +Name: UTF-32 [Davis] +MIBenum: 1017 +Source: <http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: UTF-32BE [Davis] +MIBenum: 1018 +Source: <http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: 
UTF-32LE [Davis] +MIBenum: 1019 +Source: <http://www.unicode.org/unicode/reports/tr19/> +Alias: None + +Name: BOCU-1 [Scherer] +MIBenum: 1020 +Source: http://www.unicode.org/notes/tn6/ +Alias: csBOCU-1 + +Name: UNICODE-1-1-UTF-7 [RFC1642] +MIBenum: 103 +Source: RFC 1642 +Alias: csUnicode11UTF7 + +Name: UTF-8 [RFC3629] +MIBenum: 106 +Source: RFC 3629 +Alias: None + +Name: ISO-8859-13 +MIBenum: 109 +Source: ISO See (http://www.iana.org/assignments/charset-reg/iso-8859-13)[Tumasonis] +Alias: None + +Name: ISO-8859-14 +MIBenum: 110 +Source: ISO See (http://www.iana.org/assignments/charset-reg/iso-8859-14) [Simonsen] +Alias: iso-ir-199 +Alias: ISO_8859-14:1998 +Alias: ISO_8859-14 +Alias: latin8 +Alias: iso-celtic +Alias: l8 + +Name: ISO-8859-15 +MIBenum: 111 +Source: ISO + Please see: <http://www.iana.org/assignments/charset-reg/ISO-8859-15> +Alias: ISO_8859-15 +Alias: Latin-9 + +Name: ISO-8859-16 +MIBenum: 112 +Source: ISO +Alias: iso-ir-226 +Alias: ISO_8859-16:2001 +Alias: ISO_8859-16 +Alias: latin10 +Alias: l10 + +Name: GBK +MIBenum: 113 +Source: Chinese IT Standardization Technical Committee + Please see: <http://www.iana.org/assignments/charset-reg/GBK> +Alias: CP936 +Alias: MS936 +Alias: windows-936 + +Name: GB18030 +MIBenum: 114 +Source: Chinese IT Standardization Technical Committee + Please see: <http://www.iana.org/assignments/charset-reg/GB18030> +Alias: None + +Name: OSD_EBCDIC_DF04_15 +MIBenum: 115 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15> +Alias: None + +Name: OSD_EBCDIC_DF03_IRV +MIBenum: 116 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV> +Alias: None + +Name: OSD_EBCDIC_DF04_1 +MIBenum: 117 +Source: Fujitsu-Siemens standard mainframe EBCDIC encoding + Please see: <http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1> +Alias: None + +Name: JIS_Encoding 
+MIBenum: 16 +Source: JIS X 0202-1991. Uses ISO 2022 escape sequences to + shift code sets as documented in JIS X 0202-1991. +Alias: csJISEncoding + +Name: Shift_JIS (preferred MIME name) +MIBenum: 17 +Source: This charset is an extension of csHalfWidthKatakana by + adding graphic characters in JIS X 0208. The CCS's are + JIS X0201:1997 and JIS X0208:1997. The + complete definition is shown in Appendix 1 of JIS + X0208:1997. + This charset can be used for the top-level media type "text". +Alias: MS_Kanji +Alias: csShiftJIS + +Name: Extended_UNIX_Code_Packed_Format_for_Japanese +MIBenum: 18 +Source: Standardized by OSF, UNIX International, and UNIX Systems + Laboratories Pacific. Uses ISO 2022 rules to select + code set 0: US-ASCII (a single 7-bit byte set) + code set 1: JIS X0208-1990 (a double 8-bit byte set) + restricted to A0-FF in both bytes + code set 2: Half Width Katakana (a single 7-bit byte set) + requiring SS2 as the character prefix + code set 3: JIS X0212-1990 (a double 7-bit byte set) + restricted to A0-FF in both bytes + requiring SS3 as the character prefix +Alias: csEUCPkdFmtJapanese +Alias: EUC-JP (preferred MIME name) + +Name: Extended_UNIX_Code_Fixed_Width_for_Japanese +MIBenum: 19 +Source: Used in Japan. Each character is 2 octets. + code set 0: US-ASCII (a single 7-bit byte set) + 1st byte = 00 + 2nd byte = 20-7E + code set 1: JIS X0208-1990 (a double 7-bit byte set) + restricted to A0-FF in both bytes + code set 2: Half Width Katakana (a single 7-bit byte set) + 1st byte = 00 + 2nd byte = A0-FF + code set 3: JIS X0212-1990 (a double 7-bit byte set) + restricted to A0-FF in + the first byte + and 21-7E in the second byte +Alias: csEUCFixWidJapanese + +Name: ISO-10646-UCS-Basic +MIBenum: 1002 +Source: ASCII subset of Unicode. Basic Latin = collection 1 + See ISO 10646, Appendix A +Alias: csUnicodeASCII + +Name: ISO-10646-Unicode-Latin1 +MIBenum: 1003 +Source: ISO Latin-1 subset of Unicode. 
Basic Latin and Latin-1 + Supplement = collections 1 and 2. See ISO 10646, + Appendix A. See RFC 1815. +Alias: csUnicodeLatin1 +Alias: ISO-10646 + +Name: ISO-10646-J-1 +Source: ISO 10646 Japanese, see RFC 1815. + +Name: ISO-Unicode-IBM-1261 +MIBenum: 1005 +Source: IBM Latin-2, -3, -5, Extended Presentation Set, GCSGID: 1261 +Alias: csUnicodeIBM1261 + +Name: ISO-Unicode-IBM-1268 +MIBenum: 1006 +Source: IBM Latin-4 Extended Presentation Set, GCSGID: 1268 +Alias: csUnicodeIBM1268 + +Name: ISO-Unicode-IBM-1276 +MIBenum: 1007 +Source: IBM Cyrillic Greek Extended Presentation Set, GCSGID: 1276 +Alias: csUnicodeIBM1276 + +Name: ISO-Unicode-IBM-1264 +MIBenum: 1008 +Source: IBM Arabic Presentation Set, GCSGID: 1264 +Alias: csUnicodeIBM1264 + +Name: ISO-Unicode-IBM-1265 +MIBenum: 1009 +Source: IBM Hebrew Presentation Set, GCSGID: 1265 +Alias: csUnicodeIBM1265 + +Name: ISO-8859-1-Windows-3.0-Latin-1 [HP-PCL5] +MIBenum: 2000 +Source: Extended ISO 8859-1 Latin-1 for Windows 3.0. + PCL Symbol Set id: 9U +Alias: csWindows30Latin1 + +Name: ISO-8859-1-Windows-3.1-Latin-1 [HP-PCL5] +MIBenum: 2001 +Source: Extended ISO 8859-1 Latin-1 for Windows 3.1. + PCL Symbol Set id: 19U +Alias: csWindows31Latin1 + +Name: ISO-8859-2-Windows-Latin-2 [HP-PCL5] +MIBenum: 2002 +Source: Extended ISO 8859-2. Latin-2 for Windows 3.1. + PCL Symbol Set id: 9E +Alias: csWindows31Latin2 + +Name: ISO-8859-9-Windows-Latin-5 [HP-PCL5] +MIBenum: 2003 +Source: Extended ISO 8859-9. Latin-5 for Windows 3.1 + PCL Symbol Set id: 5T +Alias: csWindows31Latin5 + +Name: Adobe-Standard-Encoding [Adobe] +MIBenum: 2005 +Source: PostScript Language Reference Manual + PCL Symbol Set id: 10J +Alias: csAdobeStandardEncoding + +Name: Ventura-US [HP-PCL5] +MIBenum: 2006 +Source: Ventura US. ASCII plus characters typically used in + publishing, like pilcrow, copyright, registered, trade mark, + section, dagger, and double dagger in the range A0 (hex) + to FF (hex). 
+ PCL Symbol Set id: 14J +Alias: csVenturaUS + +Name: Ventura-International [HP-PCL5] +MIBenum: 2007 +Source: Ventura International. ASCII plus coded characters similar + to Roman8. + PCL Symbol Set id: 13J +Alias: csVenturaInternational + +Name: PC8-Danish-Norwegian [HP-PCL5] +MIBenum: 2012 +Source: PC Danish Norwegian + 8-bit PC set for Danish Norwegian + PCL Symbol Set id: 11U +Alias: csPC8DanishNorwegian + +Name: PC8-Turkish [HP-PCL5] +MIBenum: 2014 +Source: PC Latin Turkish. PCL Symbol Set id: 9T +Alias: csPC8Turkish + +Name: IBM-Symbols [IBM-CIDT] +MIBenum: 2015 +Source: Presentation Set, CPGID: 259 +Alias: csIBMSymbols + +Name: IBM-Thai [IBM-CIDT] +MIBenum: 2016 +Source: Presentation Set, CPGID: 838 +Alias: csIBMThai + +Name: HP-Legal [HP-PCL5] +MIBenum: 2017 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 1U +Alias: csHPLegal + +Name: HP-Pi-font [HP-PCL5] +MIBenum: 2018 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 15U +Alias: csHPPiFont + +Name: HP-Math8 [HP-PCL5] +MIBenum: 2019 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 8M +Alias: csHPMath8 + +Name: Adobe-Symbol-Encoding [Adobe] +MIBenum: 2020 +Source: PostScript Language Reference Manual + PCL Symbol Set id: 5M +Alias: csHPPSMath + +Name: HP-DeskTop [HP-PCL5] +MIBenum: 2021 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 7J +Alias: csHPDesktop + +Name: Ventura-Math [HP-PCL5] +MIBenum: 2022 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 6M +Alias: csVenturaMath + +Name: Microsoft-Publishing [HP-PCL5] +MIBenum: 2023 +Source: PCL 5 Comparison Guide, Hewlett-Packard, + HP part number 5961-0510, October 1992 + PCL Symbol Set id: 6J +Alias: csMicrosoftPublishing + +Name: Windows-31J 
+MIBenum: 2024 +Source: Windows Japanese. A further extension of Shift_JIS + to include NEC special characters (Row 13), NEC + selection of IBM extensions (Rows 89 to 92), and IBM + extensions (Rows 115 to 119). The CCS's are + JIS X0201:1997, JIS X0208:1997, and these extensions. + This charset can be used for the top-level media type "text", + but it is of limited or specialized use (see RFC2278). + PCL Symbol Set id: 19K +Alias: csWindows31J + +Name: GB2312 (preferred MIME name) +MIBenum: 2025 +Source: Chinese for People's Republic of China (PRC) mixed one byte, + two byte set: + 20-7E = one byte ASCII + A1-FE = two byte PRC Kanji + See GB 2312-80 + PCL Symbol Set Id: 18C +Alias: csGB2312 + +Name: Big5 (preferred MIME name) +MIBenum: 2026 +Source: Chinese for Taiwan Multi-byte set. + PCL Symbol Set Id: 18T +Alias: csBig5 + +Name: windows-1250 +MIBenum: 2250 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1250) [Lazhintseva] +Alias: None + +Name: windows-1251 +MIBenum: 2251 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1251) [Lazhintseva] +Alias: None + +Name: windows-1252 +MIBenum: 2252 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1252) [Wendt] +Alias: None + +Name: windows-1253 +MIBenum: 2253 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1253) [Lazhintseva] +Alias: None + +Name: windows-1254 +MIBenum: 2254 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1254) [Lazhintseva] +Alias: None + +Name: windows-1255 +MIBenum: 2255 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1255) [Lazhintseva] +Alias: None + +Name: windows-1256 +MIBenum: 2256 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1256) [Lazhintseva] +Alias: None + +Name: windows-1257 +MIBenum: 2257 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1257) [Lazhintseva] +Alias: None + +Name: windows-1258 +MIBenum: 
2258 +Source: Microsoft (http://www.iana.org/assignments/charset-reg/windows-1258) [Lazhintseva] +Alias: None + +Name: TIS-620 +MIBenum: 2259 +Source: Thai Industrial Standards Institute (TISI) [Tantsetthi] + +Name: HZ-GB-2312 +MIBenum: 2085 +Source: RFC 1842, RFC 1843 [RFC1842, RFC1843] + + +REFERENCES +---------- + +[RFC1345] Simonsen, K., "Character Mnemonics & Character Sets", + RFC 1345, Rationel Almen Planlaegning, Rationel Almen + Planlaegning, June 1992. + +[RFC1428] Vaudreuil, G., "Transition of Internet Mail from + Just-Send-8 to 8bit-SMTP/MIME", RFC1428, CNRI, February + 1993. + +[RFC1456] Vietnamese Standardization Working Group, "Conventions for + Encoding the Vietnamese Language VISCII: VIetnamese + Standard Code for Information Interchange VIQR: VIetnamese + Quoted-Readable Specification Revision 1.1", RFC 1456, May + 1993. + +[RFC1468] Murai, J., Crispin, M., and E. van der Poel, "Japanese + Character Encoding for Internet Messages", RFC 1468, + Keio University, Panda Programming, June 1993. + +[RFC1489] Chernov, A., "Registration of a Cyrillic Character Set", + RFC1489, RELCOM Development Team, July 1993. + +[RFC1554] Ohta, M., and K. Handa, "ISO-2022-JP-2: Multilingual + Extension of ISO-2022-JP", RFC1554, Tokyo Institute of + Technology, ETL, December 1993. + +[RFC1556] Nussbacher, H., "Handling of Bi-directional Texts in MIME", + RFC1556, Israeli Inter-University, December 1993. + +[RFC1557] Choi, U., Chon, K., and H. Park, "Korean Character Encoding + for Internet Messages", KAIST, Solvit Chosun Media, + December 1993. + +[RFC1641] Goldsmith, D., and M. Davis, "Using Unicode with MIME", + RFC1641, Taligent, Inc., July 1994. + +[RFC1642] Goldsmith, D., and M. Davis, "UTF-7", RFC1642, Taligent, + Inc., July 1994. + +[RFC1815] Ohta, M., "Character Sets ISO-10646 and ISO-10646-J-1", + RFC 1815, Tokyo Institute of Technology, July 1995. 
+ + +[Adobe] Adobe Systems Incorporated, PostScript Language Reference + Manual, second edition, Addison-Wesley Publishing Company, + Inc., 1990. + +[ECMA Registry] ISO-IR: International Register of Escape Sequences + http://www.itscj.ipsj.or.jp/ISO-IR/ Note: The current + registration authority is IPSJ/ITSCJ, Japan. + +[HP-PCL5] Hewlett-Packard Company, "HP PCL 5 Comparison Guide", + (P/N 5021-0329) pp B-13, 1996. + +[IBM-CIDT] IBM Corporation, "ABOUT TYPE: IBM's Technical Reference + for Core Interchange Digitized Type", Publication number + S544-3708-01 + +[RFC1842] Wei, Y., J. Li, and Y. Jiang, "ASCII Printable + Characters-Based Chinese Character Encoding for Internet + Messages", RFC 1842, Harvard University, Rice University, + University of Maryland, August 1995. + +[RFC1843] Lee, F., "HZ - A Data Format for Exchanging Files of + Arbitrarily Mixed Chinese and ASCII Characters", RFC 1843, + Stanford University, August 1995. + +[RFC2152] Goldsmith, D., M. Davis, "UTF-7: A Mail-Safe Transformation + Format of Unicode", RFC 2152, Apple Computer, Inc., + Taligent Inc., May 1997. + +[RFC2279] Yergeau, F., "UTF-8, A Transformation Format of ISO 10646", + RFC 2279, Alis Technologies, January, 1998. + +[RFC2781] Hoffman, P., Yergeau, F., "UTF-16, an encoding of ISO 10646", + RFC 2781, February 2000. + +[RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO 10646", + RFC3629, November 2003. + +PEOPLE +------ + +[KXS2] Keld Simonsen <Keld.Simonsen@dkuug.dk> + +[Choi] Woohyong Choi <whchoi@cosmos.kaist.ac.kr> + +[Davis] Mark Davis, <mark@unicode.org>, April 2002. + +[Lazhintseva] Katya Lazhintseva, <katyal@MICROSOFT.com>, May 1996. + +[Mahdi] Tamer Mahdi, <tamer@ca.ibm.com>, August 2000. + +[Malyshev] Michael Malyshev, <michael_malyshev@mail.ru>, January 2004 + +[Murai] Jun Murai <jun@wide.ad.jp> + +[Nussbacher] Hank Nussbacher, <hank@vm.tau.ac.il> + +[Ohta] Masataka Ohta, <mohta@cc.titech.ac.jp>, July 1995. 
+ +[Phipps] Toby Phipps, <tphipps@peoplesoft.com>, March 2002. + +[Pond] Rick Pond, <rickpond@vnet.ibm.com>, March 1997. + +[Robrigado] Reuel Robrigado, <reuelr@ca.ibm.com>, September 2002. + +[Scherer] Markus Scherer, <markus.scherer@jtcsv.com>, August 2000, + September 2002. + +[Simonsen] Keld Simonsen, <Keld.Simonsen@rap.dk>, August 2000. + +[Tantsetthi] Trin Tantsetthi, <trin@mozart.inet.co.th>, September 1998. + +[Tumasonis] Vladas Tumasonis, <vladas.tumasonis@maf.vu.lt>, August 2000. + +[Uskov] Alexander Uskov, <auskov@idc.kz>, September 2002. + +[Wendt] Chris Wendt, <christw@microsoft.com>, December 1999. + +[Yick] Nicky Yick, <cliac@itsd.gcn.gov.hk>, October 2000. + +[] + + + + + + + diff --git a/WebCore/platform/text/mac/mac-encodings.txt b/WebCore/platform/text/mac/mac-encodings.txt new file mode 100644 index 0000000..bb45e22 --- /dev/null +++ b/WebCore/platform/text/mac/mac-encodings.txt @@ -0,0 +1,45 @@ +# We'd like to eliminate this file. +# It would be nice to get rid of dependence on the TextEncodingConverter entirely. +# Perhaps we can prove these are not used on the web and remove them. +# Or perhaps we can get them added to ICU. + +# The items on the left are names of TEC TextEncoding values (without the leading kTextEncoding). +# The items on the right are IANA character set names. Names listed in character-sets.txt are not +# repeated here; mentioning any one character set from a group in there pulls in all the aliases in +# that group. 
+ +DOSChineseTrad: cp950 +DOSGreek: cp737, ibm737 +EUC_TW: EUC-TW +ISOLatin10: ISO-8859-16 +ISOLatin6: ISO-8859-10 +ISOLatin8: ISO-8859-14 +ISOLatinThai: ISO-8859-11 +ISO_2022_JP_3: ISO-2022-JP-3 +JIS_C6226_78: JIS_C6226-1978 +JIS_X0208_83: JIS_X0208-1983 +JIS_X0208_90: JIS_X0208-1990 +JIS_X0212_90: JIS_X0212-1990 +KOI8_U: KOI8-U +MacArabic: x-mac-arabic +MacChineseSimp: x-mac-chinesesimp, xmacsimpchinese +MacChineseTrad: x-mac-chinesetrad, xmactradchinese +MacCroatian: x-mac-croatian +MacDevanagari: x-mac-devanagari +MacDingbats: x-mac-dingbats +MacFarsi: x-mac-farsi +MacGujarati: x-mac-gujarati +MacGurmukhi: x-mac-gurmukhi +MacHebrew: x-mac-hebrew +MacIcelandic: x-mac-icelandic +MacJapanese: x-mac-japanese +MacKorean: x-mac-korean +MacRomanLatin1: x-mac-roman-latin1 +MacRomanian: x-mac-romanian +MacSymbol: x-mac-symbol +MacThai: x-mac-thai +MacTibetan: x-mac-tibetan +MacVT100: x-mac-vt100 +NextStepLatin: x-nextstep +ShiftJIS_X0213_00: Shift_JIS_X0213-2000 +WindowsKoreanJohab: johab diff --git a/WebCore/platform/text/mac/make-charset-table.pl b/WebCore/platform/text/mac/make-charset-table.pl new file mode 100755 index 0000000..16fd25a --- /dev/null +++ b/WebCore/platform/text/mac/make-charset-table.pl @@ -0,0 +1,225 @@ +#!/usr/bin/perl -w + +# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of Apple Computer, Inc. 
#    ("Apple") nor the names of
#    its contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Generates the C++ source for CharsetTable from two inputs:
#   $ARGV[0]  the IANA registry text (character-sets.txt)
#   $ARGV[1]  a platform encodings file ("PlatformName[, flags]: iana-name, alias, ...")
#   $ARGV[2]  the prefix placed in front of every platform encoding name
# The generated source is printed to STDOUT. Problems found in the inputs are
# reported on STDERR and make the script exit with status 1 without printing.

use strict;

# Canonical name (lowercase, alphanumeric only) -> reference to the sorted list
# of every canonical name in the same character-sets.txt entry.
my %aliasesFromCharsetsFile;

# Names already emitted into the table; used to detect duplicates.
my %namesWritten;

# Accumulated table rows, spliced into the generated file at the end.
my $output = "";

# Set to 1 by error(); checked before any output is printed.
my $error = 0;

# Prints a diagnostic to STDERR and remembers that the run has failed.
# Processing continues so that all problems are reported in a single run.
sub error ($)
{
    print STDERR @_, "\n";
    $error = 1;
}

# Appends one table row mapping $name to the encoding "$prefix$encoding".
# Reports an error if $name has already been emitted.
# $flags is accepted for the callers' benefit but is not written to the row.
sub emit_line
{
    my ($name, $prefix, $encoding, $flags) = @_;

    error "$name shows up twice in output" if $namesWritten{$name};
    $namesWritten{$name} = 1;

    $output .= " { \"$name\", $prefix$encoding },\n";
}

# Reads the platform encodings file $filename and emits table rows for every
# entry: one row for the first IANA name as written, plus one row per
# canonical alias (both those listed in the file and those pulled in from
# character-sets.txt). $PlatformPrefix is prepended to each platform name.
# Must be called after process_iana_charsets has filled
# %aliasesFromCharsetsFile.
sub process_platform_encodings
{
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;

    my %seenPlatformNames;
    my %seenIANANames;

    open my $platformEncodings, '<', $filename or die "can't open $filename: $!\n";

    while (<$platformEncodings>) {
        chomp;
        s/\#.*$//;   # strip comments
        s/\s+$//;    # strip trailing whitespace
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
            my %aliases;

            my $PlatformNameWithFlags = $PlatformName;
            if ($flags) {
                $PlatformNameWithFlags .= ", " . $flags;
            } else {
                $flags = "NoEncodingFlags";
            }
            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
            $seenPlatformNames{$PlatformNameWithFlags} = 1;

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
            my @IANANames = split ", ", $IANANames;
            my $firstName = "";
            my $canonicalFirstName = "";
            my $prevName = "";
            for my $name (@IANANames) {
                if ($firstName eq "") {
                    # The first name is emitted verbatim, so mixed case, '-' and '_' are allowed.
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it";
                        next;
                    }
                    $firstName = $name;
                } else {
                    # Every later name must already be in canonical form and in sorted order.
                    if ($name !~ /^[a-z0-9]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
                        next;
                    }
                    if ($name le $prevName) {
                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
                    }
                    $prevName = $name;
                }

                my $canonicalName = lc $name;
                $canonicalName =~ tr/-_//d;

                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
                $seenIANANames{$canonicalName} = 1;

                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                    $aliases{$alias} = 1;
                }
                for my $otherName (@IANANames) {
                    # %aliasesFromCharsetsFile is keyed by canonical names, so the
                    # other name has to be canonicalized before it is looked up;
                    # otherwise a first name such as "ISO-8859-16" is never found.
                    my $canonicalOtherName = lc $otherName;
                    $canonicalOtherName =~ tr/-_//d;

                    next if $canonicalName eq $canonicalOtherName;

                    # Both names map to the very same list reference when they come
                    # from one character-sets.txt entry. The ordering test makes
                    # each offending pair get reported only once.
                    if ($aliasesFromCharsetsFile{$canonicalOtherName}
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$canonicalOtherName}
                        && $canonicalName lt $canonicalOtherName) {
                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
                    }
                }
            }

            # write out
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
            for my $alias (sort keys %aliases) {
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
            }
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
            # A platform name with no IANA names: nothing is emitted, it is only
            # recorded so that a repeated mention can be detected.
            my $PlatformName = $1;

            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
        } elsif (/./) {
            error "syntax error in $baseFilename, line $.";
        }
    }

    close $platformEncodings;
}

# Records one character-sets.txt entry: every name in the entry (canonical
# name plus aliases) is mapped to the same sorted list of all of them.
# Does nothing when $canonical_name is empty or undefined, which is the case
# for the call made before the first "Name:" line has been seen.
sub process_iana_charset
{
    my ($canonical_name, @aliases) = @_;

    return if !$canonical_name;

    my @names = sort $canonical_name, @aliases;

    for my $name (@names) {
        $aliasesFromCharsetsFile{$name} = \@names;
    }
}

# Parses the IANA registry file $filename. Each "Name:" line starts a new
# entry and each "Alias:" line adds to the current one; names are lowercased
# and stripped of everything but a-z and 0-9. Completed entries are handed to
# process_iana_charset.
sub process_iana_charsets
{
    my ($filename) = @_;

    open my $charsets, '<', $filename or die "can't open $filename: $!\n";

    # Canonical name or alias -> canonical name of the entry it was seen in.
    my %seen;

    my $canonical_name;
    my @aliases;

    # Aliases that may appear under more than one entry without being reported.
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );

    while (<$charsets>) {
        chomp;
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
            $new_canonical_name = lc $new_canonical_name;
            $new_canonical_name =~ tr/a-z0-9//cd;

            error "saw $new_canonical_name twice in character-sets.txt" if $seen{$new_canonical_name};
            $seen{$new_canonical_name} = $new_canonical_name;

            # A new entry begins, so flush the one collected so far.
            process_iana_charset $canonical_name, @aliases;

            $canonical_name = $new_canonical_name;
            @aliases = ();
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
            $new_alias = lc $new_alias;
            $new_alias =~ tr/a-z0-9//cd;

            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name" if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;
        }
    }

    # Flush the final entry, which has no following "Name:" line to trigger it.
    process_iana_charset $canonical_name, @aliases;

    close $charsets;
}

# Program body

die "usage: $0 character-sets.txt platform-encodings.txt [platform-prefix]\n" unless @ARGV >= 2;

my ($charsetsFilename, $platformFilename, $platformPrefix) = @ARGV;
$platformPrefix = "" unless defined $platformPrefix;

process_iana_charsets($charsetsFilename);
process_platform_encodings($platformFilename, $platformPrefix);

# Emit nothing if any input problem was reported, so a bad table is never built.
exit 1 if $error;

print <<EOF;
// File generated by make-charset-table.pl. Do not edit!

#include "config.h"
#include "CharsetData.h"

namespace WebCore {

 const CharsetEntry CharsetTable[] = {
$output
 { 0, 0 }
 };

}
EOF