/* * Copyright (C) 2011 Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #undef WEBKIT_IMPLEMENTATION #undef LOG #include "base/utf_string_conversions.h" #include "net/base/escape.h" #include "PhoneEmailDetector.h" #include "Settings.h" #include "WebString.h" #define LOG_TAG "PhoneNumberDetector" #include #define PHONE_PATTERN "(200) /-.\\ 100 -. 0000" static const char kTelSchemaPrefix[] = "tel:"; static const char kEmailSchemaPrefix[] = "mailto:"; void FindReset(FindState* state); void FindResetNumber(FindState* state); FoundState FindPartialNumber(const UChar* chars, unsigned length, FindState* s); struct FindState; static FoundState FindPartialEMail(const UChar* , unsigned length, FindState* ); static bool IsDomainChar(UChar ch); static bool IsMailboxChar(UChar ch); PhoneEmailDetector::PhoneEmailDetector() : m_foundResult(FOUND_NONE) { } bool PhoneEmailDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) { WebCore::Settings* settings = GetSettings(hit_test); if (!settings) return false; m_isPhoneDetectionEnabled = settings->formatDetectionTelephone(); m_isEmailDetectionEnabled = settings->formatDetectionEmail(); return m_isEmailDetectionEnabled || m_isPhoneDetectionEnabled; } bool PhoneEmailDetector::FindContent(const string16::const_iterator& begin, const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { FindReset(&m_findState); m_foundResult = FOUND_NONE; if (m_isPhoneDetectionEnabled) m_foundResult = FindPartialNumber(begin, end - begin, &m_findState); if (m_foundResult == FOUND_COMPLETE) m_prefix = kTelSchemaPrefix; else { FindReset(&m_findState); if (m_isEmailDetectionEnabled) m_foundResult = FindPartialEMail(begin, end - begin, &m_findState); m_prefix = kEmailSchemaPrefix; } *start_pos = m_findState.mStartResult; *end_pos = m_findState.mEndResult; return m_foundResult == FOUND_COMPLETE; } std::string PhoneEmailDetector::GetContentText(const WebKit::WebRange& range) { if (m_foundResult == FOUND_COMPLETE) { if (m_prefix == kTelSchemaPrefix) return UTF16ToUTF8(m_findState.mStore); else return UTF16ToUTF8(range.toPlainText()); } return std::string(); } GURL PhoneEmailDetector::GetIntentURL(const std::string& content_text) { return GURL(m_prefix + EscapeQueryParamValue(content_text, true)); } void FindReset(FindState* state) { memset(state, 0, sizeof(FindState)); state->mCurrent = ' '; FindResetNumber(state); } void FindResetNumber(FindState* state) { state->mOpenParen = false; state->mPattern = (char*) PHONE_PATTERN; state->mStorePtr = state->mStore; } FoundState FindPartialNumber(const UChar* chars, unsigned length, FindState* s) { char* pattern = s->mPattern; UChar* store = s->mStorePtr; const UChar* start = chars; const UChar* end = chars + length; const UChar* lastDigit = 0; string16 search16(chars, length); std::string searchSpace = UTF16ToUTF8(search16); do { bool initialized = s->mInitialized; while (chars < end) { if (initialized == false) { s->mBackTwo = s->mBackOne; s->mBackOne = s->mCurrent; } UChar ch = s->mCurrent = *chars; do { char patternChar = *pattern; switch (patternChar) { case '2': if (initialized == false) { s->mStartResult = chars - start; initialized = true; } case '0': case '1': if (ch < patternChar || ch > '9') goto resetPattern; *store++ = ch; pattern++; lastDigit = chars; goto nextChar; case '\0': if (WTF::isASCIIDigit(ch) == false) { *store = '\0'; goto checkMatch; } goto resetPattern; case ' ': if (ch == patternChar) goto nextChar; break; case '(': if (ch == patternChar) { s->mStartResult = chars - start; initialized = true; s->mOpenParen = true; } goto commonPunctuation; case ')': if ((ch == patternChar) ^ s->mOpenParen) goto resetPattern; default: commonPunctuation: if (ch == patternChar) { pattern++; goto nextChar; } } } while (++pattern); // never false nextChar: chars++; } break; resetPattern: if (s->mContinuationNode) return FOUND_NONE; FindResetNumber(s); pattern = s->mPattern; store = s->mStorePtr; } while (++chars < end); checkMatch: if (WTF::isASCIIDigit(s->mBackOne != '1' ? s->mBackOne : s->mBackTwo)) { return FOUND_NONE; } *store = '\0'; s->mStorePtr = store; s->mPattern = pattern; s->mEndResult = lastDigit - start + 1; char pState = pattern[0]; return pState == '\0' ? FOUND_COMPLETE : pState == '(' || (WTF::isASCIIDigit(pState) && WTF::isASCIIDigit(pattern[-1])) ? FOUND_NONE : FOUND_PARTIAL; } FoundState FindPartialEMail(const UChar* chars, unsigned length, FindState* s) { // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp // hand-edit at your own risk static const int domainTwoLetter[] = { 0x02df797c, // a followed by: [cdefgilmnoqrstuwxz] 0x036e73fb, // b followed by: [abdefghijmnorstvwyz] 0x03b67ded, // c followed by: [acdfghiklmnorsuvxyz] 0x02005610, // d followed by: [ejkmoz] 0x001e00d4, // e followed by: [ceghrstu] 0x00025700, // f followed by: [ijkmor] 0x015fb9fb, // g followed by: [abdefghilmnpqrstuwy] 0x001a3400, // h followed by: [kmnrtu] 0x000f7818, // i followed by: [delmnoqrst] 0x0000d010, // j followed by: [emop] 0x0342b1d0, // k followed by: [eghimnprwyz] 0x013e0507, // l followed by: [abcikrstuvy] 0x03fffccd, // m followed by: [acdghklmnopqrstuvwxyz] 0x0212c975, // n followed by: [acefgilopruz] 0x00001000, // o followed by: [m] 0x014e3cf1, // p followed by: [aefghklmnrstwy] 0x00000001, // q followed by: [a] 0x00504010, // r followed by: [eouw] 0x032a7fdf, // s followed by: [abcdeghijklmnortvyz] 0x026afeec, // t followed by: [cdfghjklmnoprtvwz] 0x03041441, // u followed by: [agkmsyz] 0x00102155, // v followed by: [aceginu] 0x00040020, // w followed by: [fs] 0x00000000, // x 0x00180010, // y followed by: [etu] 0x00401001, // z followed by: [amw] }; static char const* const longDomainNames[] = { "\x03" "ero" "\x03" "rpa", // aero, arpa "\x02" "iz", // biz "\x02" "at" "\x02" "om" "\x03" "oop", // cat, com, coop NULL, // d "\x02" "du", // edu NULL, // f "\x02" "ov", // gov NULL, // h "\x03" "nfo" "\x02" "nt", // info, int "\x03" "obs", // jobs NULL, // k NULL, // l "\x02" "il" "\x03" "obi" "\x05" "useum", // mil, mobi, museum "\x03" "ame" "\x02" "et", // name, net "\x02" "rg", // , org "\x02" "ro", // pro NULL, // q NULL, // r NULL, // s "\x05" "ravel", // travel NULL, // u NULL, // v NULL, // w NULL, // x NULL, // y NULL, // z }; const UChar* start = chars; const UChar* end = chars + length; while (chars < end) { UChar ch = *chars++; if (ch != '@') continue; const UChar* atLocation = chars - 1; // search for domain ch = *chars++ | 0x20; // convert uppercase to lower if (ch < 'a' || ch > 'z') continue; while (chars < end) { ch = *chars++; if (IsDomainChar(ch) == false) goto nextAt; if (ch != '.') continue; UChar firstLetter = *chars++ | 0x20; // first letter of the domain if (chars >= end) return FOUND_NONE; // only one letter; must be at least two firstLetter -= 'a'; if (firstLetter > 'z' - 'a') continue; // non-letter followed '.' int secondLetterMask = domainTwoLetter[firstLetter]; ch = *chars | 0x20; // second letter of the domain ch -= 'a'; if (ch >= 'z' - 'a') continue; bool secondMatch = (secondLetterMask & 1 << ch) != 0; const char* wordMatch = longDomainNames[firstLetter]; int wordIndex = 0; while (wordMatch != NULL) { int len = *wordMatch++; char match; do { match = wordMatch[wordIndex]; if (match < 0x20) goto foundDomainStart; if (chars[wordIndex] != match) break; wordIndex++; } while (true); wordMatch += len; if (*wordMatch == '\0') break; wordIndex = 0; } if (secondMatch) { wordIndex = 1; foundDomainStart: chars += wordIndex; if (chars < end) { ch = *chars; if (ch != '.') { if (IsDomainChar(ch)) goto nextDot; } else if (chars + 1 < end && IsDomainChar(chars[1])) goto nextDot; } // found domain. Search backwards from '@' for beginning of email address s->mEndResult = chars - start; chars = atLocation; if (chars <= start) goto nextAt; ch = *--chars; if (ch == '.') goto nextAt; // mailbox can't end in period do { if (IsMailboxChar(ch) == false) { chars++; break; } if (chars == start) break; ch = *--chars; } while (true); UChar firstChar = *chars; if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty goto nextAt; s->mStartResult = chars - start; return FOUND_COMPLETE; } nextDot: ; } nextAt: chars = atLocation + 1; } return FOUND_NONE; } bool IsDomainChar(UChar ch) { static const unsigned body[] = {0x03ff6000, 0x07fffffe, 0x07fffffe}; // 0-9 . - A-Z a-z ch -= 0x20; if (ch > 'z' - 0x20) return false; return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0; } bool IsMailboxChar(UChar ch) { // According to http://en.wikipedia.org/wiki/Email_address // ! # $ % & ' * + - . / 0-9 = ? // A-Z ^ _ // ` a-z { | } ~ static const unsigned body[] = {0xa3ffecfa, 0xc7fffffe, 0x7fffffff}; ch -= 0x20; if (ch > '~' - 0x20) return false; return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0; }