diff options
author | Kristian Monsen <kristianm@google.com> | 2010-09-08 12:18:00 +0100 |
---|---|---|
committer | Kristian Monsen <kristianm@google.com> | 2010-09-11 12:08:58 +0100 |
commit | 5ddde30071f639962dd557c453f2ad01f8f0fd00 (patch) | |
tree | 775803c4ab35af50aa5f5472cd1fb95fe9d5152d /WebCore/html/parser/HTMLEntityParser.cpp | |
parent | 3e63d9b33b753ca86d0765d1b3d711114ba9e34f (diff) | |
download | external_webkit-5ddde30071f639962dd557c453f2ad01f8f0fd00.zip external_webkit-5ddde30071f639962dd557c453f2ad01f8f0fd00.tar.gz external_webkit-5ddde30071f639962dd557c453f2ad01f8f0fd00.tar.bz2 |
Merge WebKit at r66666 : Initial merge by git.
Change-Id: I57dedeb49859adc9c539e760f0e749768c66626f
Diffstat (limited to 'WebCore/html/parser/HTMLEntityParser.cpp')
-rw-r--r-- | WebCore/html/parser/HTMLEntityParser.cpp | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/WebCore/html/parser/HTMLEntityParser.cpp b/WebCore/html/parser/HTMLEntityParser.cpp new file mode 100644 index 0000000..6a422b8 --- /dev/null +++ b/WebCore/html/parser/HTMLEntityParser.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "HTMLEntityParser.h" + +#include "HTMLEntitySearch.h" +#include "HTMLEntityTable.h" +#include <wtf/Vector.h> + +using namespace WTF; + +namespace WebCore { + +namespace { + +static const UChar windowsLatin1ExtensionArray[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F +}; + +inline UChar adjustEntity(UChar32 value) +{ + if ((value & ~0x1F) != 0x0080) + return value; + return windowsLatin1ExtensionArray[value - 0x80]; +} + +inline UChar32 legalEntityFor(UChar32 value) +{ + // FIXME: A number of specific entity values generate parse errors. + if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) + return 0xFFFD; + if (U_IS_BMP(value)) + return adjustEntity(value); + return value; +} + +inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity) +{ + if (U_IS_BMP(value)) { + UChar character = static_cast<UChar>(value); + ASSERT(character == value); + decodedEntity.append(character); + return true; + } + decodedEntity.append(U16_LEAD(value)); + decodedEntity.append(U16_TRAIL(value)); + return true; +} + +inline bool isHexDigit(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); +} + +inline bool isAlphaNumeric(UChar cc) +{ + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); +} + +void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters) +{ + if (consumedCharacters.size() == 1) + source.push(consumedCharacters[0]); + else if (consumedCharacters.size() == 2) { + source.push(consumedCharacters[0]); + source.push(consumedCharacters[1]); + } else + source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size()))); +} + +} + +bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) +{ + ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); + ASSERT(!notEnoughCharacters); + ASSERT(decodedEntity.isEmpty()); + + enum EntityState { + Initial, + Number, + MaybeHexLowerCaseX, + MaybeHexUpperCaseX, + Hex, + Decimal, + Named + }; + EntityState entityState = Initial; + UChar32 result = 0; + Vector<UChar, 10> consumedCharacters; + + while (!source.isEmpty()) { + UChar cc = *source; + switch (entityState) { + case Initial: { + if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') + return false; + if (additionalAllowedCharacter && cc == additionalAllowedCharacter) + return false; + if (cc == '#') { + entityState = Number; + break; + } + if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { + entityState = Named; + continue; + } + return false; + } + case Number: { + if (cc == 'x') { + entityState = MaybeHexLowerCaseX; + break; + } + if (cc == 'X') { + entityState = MaybeHexUpperCaseX; + break; + } + if (cc >= '0' && cc <= '9') { + entityState = Decimal; + continue; + } + source.push('#'); + return false; + } + case MaybeHexLowerCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('x'); + return false; + } + case MaybeHexUpperCaseX: { + if (isHexDigit(cc)) { + entityState = Hex; + continue; + } + source.push('#'); + source.push('X'); + return false; + } + case Hex: { + if (cc >= '0' && cc <= '9') + result = result * 16 + cc - '0'; + else if (cc >= 'a' && cc <= 'f') + result = result * 16 + 10 + cc - 'a'; + else if (cc >= 'A' && cc <= 'F') + result = result * 16 + 10 + cc - 'A'; + else { + if (cc == ';') + source.advanceAndASSERT(cc); + return convertToUTF16(legalEntityFor(result), decodedEntity); + } + break; + } + case Decimal: { + if (cc >= '0' && cc <= '9') + result = result * 10 + cc - '0'; + else { + if (cc == ';') + source.advanceAndASSERT(cc); + return convertToUTF16(legalEntityFor(result), decodedEntity); + } + break; + } + case Named: { + HTMLEntitySearch entitySearch; + while (!source.isEmpty()) { + cc = *source; + entitySearch.advance(cc); + if (!entitySearch.isEntityPrefix()) + break; + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + notEnoughCharacters = source.isEmpty(); + if (notEnoughCharacters) { + // We can't an entity because there might be a longer entity + // that we could match if we had more data. + unconsumeCharacters(source, consumedCharacters); + return false; + } + if (!entitySearch.mostRecentMatch()) { + ASSERT(!entitySearch.currentValue()); + unconsumeCharacters(source, consumedCharacters); + return false; + } + if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { + // We've consumed too many characters. We need to walk the + // source back to the point at which we had consumed an + // actual entity. + unconsumeCharacters(source, consumedCharacters); + consumedCharacters.clear(); + const int length = entitySearch.mostRecentMatch()->length; + const UChar* reference = entitySearch.mostRecentMatch()->entity; + for (int i = 0; i < length; ++i) { + cc = *source; + ASSERT_UNUSED(reference, cc == *reference++); + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + ASSERT(!source.isEmpty()); + } + cc = *source; + } + if (entitySearch.mostRecentMatch()->lastCharacter() == ';' + || !additionalAllowedCharacter + || !(isAlphaNumeric(cc) || cc == '=')) { + return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); + } + unconsumeCharacters(source, consumedCharacters); + return false; + } + } + consumedCharacters.append(cc); + source.advanceAndASSERT(cc); + } + ASSERT(source.isEmpty()); + notEnoughCharacters = true; + unconsumeCharacters(source, consumedCharacters); + return false; +} + +UChar decodeNamedEntity(const char* name) +{ + HTMLEntitySearch search; + while (*name) { + search.advance(*name++); + if (!search.isEntityPrefix()) + return 0; + } + search.advance(';'); + UChar32 entityValue = search.currentValue(); + if (U16_LENGTH(entityValue) != 1) { + // Callers need to move off this API if the entity table has values + // which do no fit in a 16 bit UChar! + ASSERT_NOT_REACHED(); + return 0; + } + return static_cast<UChar>(entityValue); +} + +} // namespace WebCore |