diff options
Diffstat (limited to 'WebCore/editing/TextIterator.cpp')
-rw-r--r-- | WebCore/editing/TextIterator.cpp | 218 |
1 files changed, 199 insertions, 19 deletions
diff --git a/WebCore/editing/TextIterator.cpp b/WebCore/editing/TextIterator.cpp index a96268d..a3edd38 100644 --- a/WebCore/editing/TextIterator.cpp +++ b/WebCore/editing/TextIterator.cpp @@ -38,6 +38,8 @@ #include "RenderTableRow.h" #include "RenderTextControl.h" #include "RenderTextFragment.h" +#include "TextBoundaries.h" +#include "TextBreakIterator.h" #include "VisiblePosition.h" #include "visible_units.h" @@ -56,14 +58,18 @@ using namespace HTMLNames; // Buffer that knows how to compare with a search target. // Keeps enough of the previous text to be able to search in the future, but no more. // Non-breaking spaces are always equal to normal spaces. -// Case folding is also done if <isCaseSensitive> is false. +// Case folding is also done if the CaseInsensitive option is specified. +// Matches are further filtered if the AtWordStarts option is specified, although some +// matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well. class SearchBuffer : public Noncopyable { public: - SearchBuffer(const String& target, bool isCaseSensitive); + SearchBuffer(const String& target, FindOptions); ~SearchBuffer(); // Returns number of characters appended; guaranteed to be in the range [1, length]. size_t append(const UChar*, size_t length); + bool needsMoreContext() const; + void prependContext(const UChar*, size_t length); void reachedBreak(); // Result is the size in characters of what was found. @@ -75,11 +81,16 @@ public: private: bool isBadMatch(const UChar*, size_t length) const; + bool isWordStartMatch(size_t start, size_t length) const; String m_target; + FindOptions m_options; + Vector<UChar> m_buffer; size_t m_overlap; + size_t m_prefixLength; bool m_atBreak; + bool m_needsMoreContext; bool m_targetRequiresKanaWorkaround; Vector<UChar> m_normalizedTarget; @@ -92,7 +103,7 @@ private: size_t length() const; String m_target; - bool m_isCaseSensitive; + FindOptions m_options; Vector<UChar> m_buffer; Vector<bool> m_isCharacterStartBuffer; @@ -473,7 +484,7 @@ bool TextIterator::handleTextNode() handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); if (m_firstLetterText) { String firstLetter = m_firstLetterText->text(); - emitText(m_node, m_firstLetterText, m_offset, firstLetter.length()); + emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length()); m_firstLetterText = 0; m_textBox = 0; return false; @@ -1828,9 +1839,46 @@ static void normalizeCharacters(const UChar* characters, unsigned length, Vector ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); } -inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) +static bool isNonLatin1Separator(UChar32 character) +{ + ASSERT_ARG(character, character >= 256); + + return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); +} + +static inline bool isSeparator(UChar32 character) +{ + static const bool latin1SeparatorTable[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + if (character < 256) + return latin1SeparatorTable[character]; + + return isNonLatin1Separator(character); +} + +inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) : m_target(target) + , m_options(options) + , m_prefixLength(0) , m_atBreak(true) + , m_needsMoreContext(options & AtWordStarts) , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) { ASSERT(!m_target.isEmpty()); @@ -1844,6 +1892,17 @@ inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize)); m_overlap = m_buffer.capacity() / 4; + if ((m_options & AtWordStarts) && targetLength) { + UChar32 targetFirstCharacter; + U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter); + // Characters in the separator category never really occur at the beginning of a word, + // so if the target begins with such a character, we just ignore the AtWordStart option. + if (isSeparator(targetFirstCharacter)) { + m_options &= ~AtWordStarts; + m_needsMoreContext = false; + } + } + // Grab the single global searcher. // If we ever have a reason to do more than once search buffer at once, we'll have // to move to multiple searchers. @@ -1852,7 +1911,7 @@ inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) UStringSearch* searcher = WebCore::searcher(); UCollator* collator = usearch_getCollator(searcher); - UCollationStrength strength = isCaseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY; + UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY; if (ucol_getStrength(collator) != strength) { ucol_setStrength(collator, strength); usearch_reset(searcher); @@ -1878,9 +1937,11 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length) if (m_atBreak) { m_buffer.shrink(0); + m_prefixLength = 0; m_atBreak = false; } else if (m_buffer.size() == m_buffer.capacity()) { memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); + m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap); m_buffer.shrink(m_overlap); } @@ -1892,6 +1953,35 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length) return usableLength; } +inline bool SearchBuffer::needsMoreContext() const +{ + return m_needsMoreContext; +} + +inline void SearchBuffer::prependContext(const UChar* characters, size_t length) +{ + ASSERT(m_needsMoreContext); + ASSERT(m_prefixLength == m_buffer.size()); + + if (!length) + return; + + m_atBreak = false; + + size_t wordBoundaryContextStart = length; + if (wordBoundaryContextStart) { + U16_BACK_1(characters, 0, wordBoundaryContextStart); + wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart); + } + + size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart); + m_buffer.prepend(characters + length - usableLength, usableLength); + m_prefixLength += usableLength; + + if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) + m_needsMoreContext = false; +} + inline bool SearchBuffer::atBreak() const { return m_atBreak; @@ -1962,6 +2052,55 @@ inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) con } } +inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const +{ + ASSERT(m_options & AtWordStarts); + + if (!start) + return true; + + if (m_options & TreatMedialCapitalAsWordStart) { + int size = m_buffer.size(); + int offset = start; + UChar32 firstCharacter; + U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); + UChar32 previousCharacter; + U16_PREV(m_buffer.data(), 0, offset, previousCharacter); + + if (isSeparator(firstCharacter)) { + // The start of a separator run is a word start (".org" in "webkit.org"). + if (!isSeparator(previousCharacter)) + return true; + } else if (isASCIIUpper(firstCharacter)) { + // The start of an uppercase run is a word start ("Kit" in "WebKit"). + if (!isASCIIUpper(previousCharacter)) + return true; + // The last character of an uppercase run followed by a non-separator, non-digit + // is a word start ("Request" in "XMLHTTPRequest"). + offset = start; + U16_FWD_1(m_buffer.data(), offset, size); + UChar32 nextCharacter = 0; + if (offset < size) + U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); + if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) + return true; + } else if (isASCIIDigit(firstCharacter)) { + // The start of a digit run is a word start ("2" in "WebKit2"). + if (!isASCIIDigit(previousCharacter)) + return true; + } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { + // The start of a non-separator, non-uppercase, non-digit run is a word start, + // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). + return true; + } + } + + size_t wordBreakSearchStart = start + length; + while (wordBreakSearchStart > start) + wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); + return wordBreakSearchStart == start; +} + inline size_t SearchBuffer::search(size_t& start) { size_t size = m_buffer.size(); @@ -1979,7 +2118,10 @@ inline size_t SearchBuffer::search(size_t& start) usearch_setText(searcher, m_buffer.data(), size, &status); ASSERT(status == U_ZERO_ERROR); - int matchStart = usearch_first(searcher, &status); + usearch_setOffset(searcher, m_prefixLength, &status); + ASSERT(status == U_ZERO_ERROR); + + int matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); nextMatch: @@ -1992,8 +2134,18 @@ nextMatch: // The same match may appear later, matching more characters, // possibly including a combining character that's not yet in the buffer. if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { - memcpy(m_buffer.data(), m_buffer.data() + size - m_overlap, m_overlap * sizeof(UChar)); - m_buffer.shrink(m_overlap); + size_t overlap = m_overlap; + if (m_options & AtWordStarts) { + // Ensure that there is sufficient context before matchStart the next time around for + // determining if it is at a word boundary. + int wordBoundaryContextStart = matchStart; + U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); + wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); + overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart)); + } + memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); + m_prefixLength -= min(m_prefixLength, size - overlap); + m_buffer.shrink(overlap); return 0; } @@ -2001,7 +2153,7 @@ nextMatch: ASSERT(matchStart + matchedLength <= size); // If this match is "bad", move on to the next match. - if (isBadMatch(m_buffer.data() + matchStart, matchedLength)) { + if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); goto nextMatch; @@ -2009,6 +2161,7 @@ nextMatch: size_t newSize = size - (matchStart + 1); memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); + m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1); m_buffer.shrink(newSize); start = size - matchStart; @@ -2017,9 +2170,9 @@ nextMatch: #else // !ICU_UNICODE -inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) - : m_target(isCaseSensitive ? target : target.foldCase()) - , m_isCaseSensitive(isCaseSensitive) +inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) + : m_target(options & CaseInsensitive ? target.foldCase() : target) + , m_options(options) , m_buffer(m_target.length()) , m_isCharacterStartBuffer(m_target.length()) , m_isBufferFull(false) @@ -2058,7 +2211,7 @@ inline void SearchBuffer::append(UChar c, bool isStart) inline size_t SearchBuffer::append(const UChar* characters, size_t length) { ASSERT(length); - if (m_isCaseSensitive) { + if (!(m_options & CaseInsensitive)) { append(characters[0], true); return 1; } @@ -2078,6 +2231,16 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length) return 1; } +inline bool SearchBuffer::needsMoreContext() const +{ + return false; +} + +void SearchBuffer::prependContext(const UChar*, size_t) +{ + ASSERT_NOT_REACHED(); +} + inline size_t SearchBuffer::search(size_t& start) { if (!m_isBufferFull) @@ -2332,12 +2495,24 @@ static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward) return result.release(); } -static size_t findPlainText(CharacterIterator& it, const String& target, bool forward, bool caseSensitive, size_t& matchStart) +static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart) { matchStart = 0; size_t matchLength = 0; - SearchBuffer buffer(target, caseSensitive); + SearchBuffer buffer(target, options); + + if (buffer.needsMoreContext()) { + RefPtr<Range> startRange = it.range(); + RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange(); + ExceptionCode ec = 0; + beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec); + for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) { + buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length()); + if (!buffer.needsMoreContext()) + break; + } + } while (!it.atEnd()) { it.advance(buffer.append(it.characters(), it.length())); @@ -2351,7 +2526,7 @@ tryAgain: matchLength = newMatchLength; // If searching forward, stop on the first match. // If searching backward, don't stop, so we end up with the last match. - if (forward) + if (!(options & Backwards)) break; goto tryAgain; } @@ -2366,14 +2541,19 @@ tryAgain: PassRefPtr<Range> findPlainText(const Range* range, const String& target, bool forward, bool caseSensitive) { + return findPlainText(range, target, (forward ? 0 : Backwards) | (caseSensitive ? 0 : CaseInsensitive)); +} + +PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options) +{ // First, find the text. size_t matchStart; size_t matchLength; { CharacterIterator findIterator(range, TextIteratorEntersTextControls); - matchLength = findPlainText(findIterator, target, forward, caseSensitive, matchStart); + matchLength = findPlainText(findIterator, target, options, matchStart); if (!matchLength) - return collapsedToBoundary(range, forward); + return collapsedToBoundary(range, !(options & Backwards)); } // Then, find the document position of the start and the end of the text. |