diff options
Diffstat (limited to 'WebCore/html/HTMLEntityParser.cpp')
-rw-r--r-- | WebCore/html/HTMLEntityParser.cpp | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/WebCore/html/HTMLEntityParser.cpp b/WebCore/html/HTMLEntityParser.cpp new file mode 100644 index 0000000..3d8d48d --- /dev/null +++ b/WebCore/html/HTMLEntityParser.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLEntityParser.h" + +#include <wtf/Vector.h> + +// Use __GNUC__ instead of PLATFORM(GCC) to stay consistent with the gperf generated c file +#ifdef __GNUC__ +// The main parser includes this too so we are getting two copies of the data. However, this way the code gets inlined. +#include "HTMLEntityNames.cpp" +#else +// Not inlined for non-GCC compilers +struct Entity { + const char* name; + int code; +}; +const struct Entity* findEntity(register const char* str, register unsigned int len); +#endif + +using namespace WTF; + +namespace WebCore { + +namespace { + +static const UChar windowsLatin1ExtensionArray[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F +}; + +inline UChar adjustEntity(unsigned value) +{ + if ((value & ~0x1F) != 0x0080) + return value; + return windowsLatin1ExtensionArray[value - 0x80]; +} + +inline unsigned legalEntityFor(unsigned value) +{ + // FIXME: A number of specific entity values generate parse errors. + if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) + return 0xFFFD; + if (value < 0xFFFF) + return adjustEntity(value); + return value; +} + +inline bool isHexDigit(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); +} + +inline bool isAlphaNumeric(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); +} + +void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters) +{ + if (consumedCharacters.size() == 1) + source.push(consumedCharacters[0]); + else if (consumedCharacters.size() == 2) { + source.push(consumedCharacters[0]); + source.push(consumedCharacters[1]); + } else + source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size()))); +} + +} + +unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, UChar additionalAllowedCharacter) +{ + ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); + ASSERT(!notEnoughCharacters); + + enum EntityState { + Initial, + NumberType, + MaybeHexLowerCaseX, + MaybeHexUpperCaseX, + Hex, + Decimal, + Named + }; + EntityState entityState = Initial; + unsigned result = 0; + Vector<UChar, 10> consumedCharacters; + Vector<char, 10> entityName; + + while (!source.isEmpty()) { + UChar cc = *source; + switch (entityState) { + case Initial: { + if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') + return 0; + if (additionalAllowedCharacter && cc == additionalAllowedCharacter) + return 0; + if (cc == '#') { + entityState = NumberType; + break; + } + if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { + entityState = Named; + continue; + } + return 0; + } + case NumberType: { + if (cc == 'x') { + entityState = MaybeHexLowerCaseX; + break; + } + if (cc == 'X') { + entityState = MaybeHexUpperCaseX; + break; + } + if (cc >= '0' && cc <= '9') { + entityState = Decimal; + continue; + } + source.push('#'); + return 0; + } + case MaybeHexLowerCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('x'); + return 0; + } + case MaybeHexUpperCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('X'); + return 0; + } + case Hex: { + if (cc >= '0' && cc <= '9') + result = result * 16 + cc - '0'; + else if (cc >= 'a' && cc <= 'f') + result = result * 16 + 10 + cc - 'a'; + else if (cc >= 'A' && cc <= 'F') + result = result * 16 + 10 + cc - 'A'; + else if (cc == ';') { + source.advancePastNonNewline(); + return legalEntityFor(result); + } else + return legalEntityFor(result); + break; + } + case Decimal: { + if (cc >= '0' && cc <= '9') + result = result * 10 + cc - '0'; + else if (cc == ';') { + source.advancePastNonNewline(); + return legalEntityFor(result); + } else + return legalEntityFor(result); + break; + } + case Named: { + // FIXME: This code is wrong. We need to find the longest matching entity. + // The examples from the spec are: + // I'm ¬it; I tell you + // I'm ∉ I tell you + // In the first case, "¬" is the entity. In the second + // case, "∉" is the entity. + // FIXME: Our list of HTML entities is incomplete. + // FIXME: The number 8 below is bogus. + while (!source.isEmpty() && entityName.size() <= 8) { + cc = *source; + if (cc == ';') { + const Entity* entity = findEntity(entityName.data(), entityName.size()); + if (entity) { + source.advanceAndASSERT(';'); + return entity->code; + } + break; + } + if (!isAlphaNumeric(cc)) { + const Entity* entity = findEntity(entityName.data(), entityName.size()); + if (entity) { + // HTML5 tells us to ignore this entity, for historical reasons, + // if the lookhead character is '='. + if (additionalAllowedCharacter && cc == '=') + break; + return entity->code; + } + break; + } + entityName.append(cc); + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + notEnoughCharacters = source.isEmpty(); + unconsumeCharacters(source, consumedCharacters); + return 0; + } + } + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + ASSERT(source.isEmpty()); + notEnoughCharacters = true; + unconsumeCharacters(source, consumedCharacters); + return 0; +} + +} // namespace WebCore |