summaryrefslogtreecommitdiffstats
path: root/WebCore/editing/TextIterator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebCore/editing/TextIterator.cpp')
-rw-r--r--WebCore/editing/TextIterator.cpp218
1 files changed, 199 insertions, 19 deletions
diff --git a/WebCore/editing/TextIterator.cpp b/WebCore/editing/TextIterator.cpp
index a96268d..a3edd38 100644
--- a/WebCore/editing/TextIterator.cpp
+++ b/WebCore/editing/TextIterator.cpp
@@ -38,6 +38,8 @@
#include "RenderTableRow.h"
#include "RenderTextControl.h"
#include "RenderTextFragment.h"
+#include "TextBoundaries.h"
+#include "TextBreakIterator.h"
#include "VisiblePosition.h"
#include "visible_units.h"
@@ -56,14 +58,18 @@ using namespace HTMLNames;
// Buffer that knows how to compare with a search target.
// Keeps enough of the previous text to be able to search in the future, but no more.
// Non-breaking spaces are always equal to normal spaces.
-// Case folding is also done if <isCaseSensitive> is false.
+// Case folding is also done if the CaseInsensitive option is specified.
+// Matches are further filtered if the AtWordStarts option is specified, although some
+// matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
class SearchBuffer : public Noncopyable {
public:
- SearchBuffer(const String& target, bool isCaseSensitive);
+ SearchBuffer(const String& target, FindOptions);
~SearchBuffer();
// Returns number of characters appended; guaranteed to be in the range [1, length].
size_t append(const UChar*, size_t length);
+ bool needsMoreContext() const;
+ void prependContext(const UChar*, size_t length);
void reachedBreak();
// Result is the size in characters of what was found.
@@ -75,11 +81,16 @@ public:
private:
bool isBadMatch(const UChar*, size_t length) const;
+ bool isWordStartMatch(size_t start, size_t length) const;
String m_target;
+ FindOptions m_options;
+
Vector<UChar> m_buffer;
size_t m_overlap;
+ size_t m_prefixLength;
bool m_atBreak;
+ bool m_needsMoreContext;
bool m_targetRequiresKanaWorkaround;
Vector<UChar> m_normalizedTarget;
@@ -92,7 +103,7 @@ private:
size_t length() const;
String m_target;
- bool m_isCaseSensitive;
+ FindOptions m_options;
Vector<UChar> m_buffer;
Vector<bool> m_isCharacterStartBuffer;
@@ -473,7 +484,7 @@ bool TextIterator::handleTextNode()
handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
if (m_firstLetterText) {
String firstLetter = m_firstLetterText->text();
- emitText(m_node, m_firstLetterText, m_offset, firstLetter.length());
+ emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
m_firstLetterText = 0;
m_textBox = 0;
return false;
@@ -1828,9 +1839,46 @@ static void normalizeCharacters(const UChar* characters, unsigned length, Vector
ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
}
-inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
+static bool isNonLatin1Separator(UChar32 character)
+{
+ ASSERT_ARG(character, character >= 256);
+
+ return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
+}
+
+static inline bool isSeparator(UChar32 character)
+{
+ static const bool latin1SeparatorTable[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
+ if (character < 256)
+ return latin1SeparatorTable[character];
+
+ return isNonLatin1Separator(character);
+}
+
+inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
: m_target(target)
+ , m_options(options)
+ , m_prefixLength(0)
, m_atBreak(true)
+ , m_needsMoreContext(options & AtWordStarts)
, m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
{
ASSERT(!m_target.isEmpty());
@@ -1844,6 +1892,17 @@ inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
m_overlap = m_buffer.capacity() / 4;
+ if ((m_options & AtWordStarts) && targetLength) {
+ UChar32 targetFirstCharacter;
+ U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
+ // Characters in the separator category never really occur at the beginning of a word,
+ // so if the target begins with such a character, we just ignore the AtWordStart option.
+ if (isSeparator(targetFirstCharacter)) {
+ m_options &= ~AtWordStarts;
+ m_needsMoreContext = false;
+ }
+ }
+
// Grab the single global searcher.
// If we ever have a reason to do more than once search buffer at once, we'll have
// to move to multiple searchers.
@@ -1852,7 +1911,7 @@ inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
UStringSearch* searcher = WebCore::searcher();
UCollator* collator = usearch_getCollator(searcher);
- UCollationStrength strength = isCaseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY;
+ UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
if (ucol_getStrength(collator) != strength) {
ucol_setStrength(collator, strength);
usearch_reset(searcher);
@@ -1878,9 +1937,11 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length)
if (m_atBreak) {
m_buffer.shrink(0);
+ m_prefixLength = 0;
m_atBreak = false;
} else if (m_buffer.size() == m_buffer.capacity()) {
memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
+ m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
m_buffer.shrink(m_overlap);
}
@@ -1892,6 +1953,35 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length)
return usableLength;
}
+inline bool SearchBuffer::needsMoreContext() const
+{
+ return m_needsMoreContext;
+}
+
+inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
+{
+ ASSERT(m_needsMoreContext);
+ ASSERT(m_prefixLength == m_buffer.size());
+
+ if (!length)
+ return;
+
+ m_atBreak = false;
+
+ size_t wordBoundaryContextStart = length;
+ if (wordBoundaryContextStart) {
+ U16_BACK_1(characters, 0, wordBoundaryContextStart);
+ wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
+ }
+
+ size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
+ m_buffer.prepend(characters + length - usableLength, usableLength);
+ m_prefixLength += usableLength;
+
+ if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
+ m_needsMoreContext = false;
+}
+
inline bool SearchBuffer::atBreak() const
{
return m_atBreak;
@@ -1962,6 +2052,55 @@ inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) con
}
}
+inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
+{
+ ASSERT(m_options & AtWordStarts);
+
+ if (!start)
+ return true;
+
+ if (m_options & TreatMedialCapitalAsWordStart) {
+ int size = m_buffer.size();
+ int offset = start;
+ UChar32 firstCharacter;
+ U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
+ UChar32 previousCharacter;
+ U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
+
+ if (isSeparator(firstCharacter)) {
+ // The start of a separator run is a word start (".org" in "webkit.org").
+ if (!isSeparator(previousCharacter))
+ return true;
+ } else if (isASCIIUpper(firstCharacter)) {
+ // The start of an uppercase run is a word start ("Kit" in "WebKit").
+ if (!isASCIIUpper(previousCharacter))
+ return true;
+ // The last character of an uppercase run followed by a non-separator, non-digit
+ // is a word start ("Request" in "XMLHTTPRequest").
+ offset = start;
+ U16_FWD_1(m_buffer.data(), offset, size);
+ UChar32 nextCharacter = 0;
+ if (offset < size)
+ U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
+ if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
+ return true;
+ } else if (isASCIIDigit(firstCharacter)) {
+ // The start of a digit run is a word start ("2" in "WebKit2").
+ if (!isASCIIDigit(previousCharacter))
+ return true;
+ } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
+ // The start of a non-separator, non-uppercase, non-digit run is a word start,
+ // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
+ return true;
+ }
+ }
+
+ size_t wordBreakSearchStart = start + length;
+ while (wordBreakSearchStart > start)
+ wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
+ return wordBreakSearchStart == start;
+}
+
inline size_t SearchBuffer::search(size_t& start)
{
size_t size = m_buffer.size();
@@ -1979,7 +2118,10 @@ inline size_t SearchBuffer::search(size_t& start)
usearch_setText(searcher, m_buffer.data(), size, &status);
ASSERT(status == U_ZERO_ERROR);
- int matchStart = usearch_first(searcher, &status);
+ usearch_setOffset(searcher, m_prefixLength, &status);
+ ASSERT(status == U_ZERO_ERROR);
+
+ int matchStart = usearch_next(searcher, &status);
ASSERT(status == U_ZERO_ERROR);
nextMatch:
@@ -1992,8 +2134,18 @@ nextMatch:
// The same match may appear later, matching more characters,
// possibly including a combining character that's not yet in the buffer.
if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
- memcpy(m_buffer.data(), m_buffer.data() + size - m_overlap, m_overlap * sizeof(UChar));
- m_buffer.shrink(m_overlap);
+ size_t overlap = m_overlap;
+ if (m_options & AtWordStarts) {
+ // Ensure that there is sufficient context before matchStart the next time around for
+ // determining if it is at a word boundary.
+ int wordBoundaryContextStart = matchStart;
+ U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
+ wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
+ overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
+ }
+ memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
+ m_prefixLength -= min(m_prefixLength, size - overlap);
+ m_buffer.shrink(overlap);
return 0;
}
@@ -2001,7 +2153,7 @@ nextMatch:
ASSERT(matchStart + matchedLength <= size);
// If this match is "bad", move on to the next match.
- if (isBadMatch(m_buffer.data() + matchStart, matchedLength)) {
+ if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
matchStart = usearch_next(searcher, &status);
ASSERT(status == U_ZERO_ERROR);
goto nextMatch;
@@ -2009,6 +2161,7 @@ nextMatch:
size_t newSize = size - (matchStart + 1);
memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
+ m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
m_buffer.shrink(newSize);
start = size - matchStart;
@@ -2017,9 +2170,9 @@ nextMatch:
#else // !ICU_UNICODE
-inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
- : m_target(isCaseSensitive ? target : target.foldCase())
- , m_isCaseSensitive(isCaseSensitive)
+inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
+ : m_target(options & CaseInsensitive ? target.foldCase() : target)
+ , m_options(options)
, m_buffer(m_target.length())
, m_isCharacterStartBuffer(m_target.length())
, m_isBufferFull(false)
@@ -2058,7 +2211,7 @@ inline void SearchBuffer::append(UChar c, bool isStart)
inline size_t SearchBuffer::append(const UChar* characters, size_t length)
{
ASSERT(length);
- if (m_isCaseSensitive) {
+ if (!(m_options & CaseInsensitive)) {
append(characters[0], true);
return 1;
}
@@ -2078,6 +2231,16 @@ inline size_t SearchBuffer::append(const UChar* characters, size_t length)
return 1;
}
+inline bool SearchBuffer::needsMoreContext() const
+{
+ return false;
+}
+
+void SearchBuffer::prependContext(const UChar*, size_t)
+{
+ ASSERT_NOT_REACHED();
+}
+
inline size_t SearchBuffer::search(size_t& start)
{
if (!m_isBufferFull)
@@ -2332,12 +2495,24 @@ static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
return result.release();
}
-static size_t findPlainText(CharacterIterator& it, const String& target, bool forward, bool caseSensitive, size_t& matchStart)
+static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
{
matchStart = 0;
size_t matchLength = 0;
- SearchBuffer buffer(target, caseSensitive);
+ SearchBuffer buffer(target, options);
+
+ if (buffer.needsMoreContext()) {
+ RefPtr<Range> startRange = it.range();
+ RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
+ ExceptionCode ec = 0;
+ beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
+ for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
+ buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
+ if (!buffer.needsMoreContext())
+ break;
+ }
+ }
while (!it.atEnd()) {
it.advance(buffer.append(it.characters(), it.length()));
@@ -2351,7 +2526,7 @@ tryAgain:
matchLength = newMatchLength;
// If searching forward, stop on the first match.
// If searching backward, don't stop, so we end up with the last match.
- if (forward)
+ if (!(options & Backwards))
break;
goto tryAgain;
}
@@ -2366,14 +2541,19 @@ tryAgain:
PassRefPtr<Range> findPlainText(const Range* range, const String& target, bool forward, bool caseSensitive)
{
+ return findPlainText(range, target, (forward ? 0 : Backwards) | (caseSensitive ? 0 : CaseInsensitive));
+}
+
+PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
+{
// First, find the text.
size_t matchStart;
size_t matchLength;
{
CharacterIterator findIterator(range, TextIteratorEntersTextControls);
- matchLength = findPlainText(findIterator, target, forward, caseSensitive, matchStart);
+ matchLength = findPlainText(findIterator, target, options, matchStart);
if (!matchLength)
- return collapsedToBoundary(range, forward);
+ return collapsedToBoundary(range, !(options & Backwards));
}
// Then, find the document position of the start and the end of the text.