diff options
author | Benjamin Kramer <benny.kra@googlemail.com> | 2011-10-15 10:08:31 +0000 |
---|---|---|
committer | Benjamin Kramer <benny.kra@googlemail.com> | 2011-10-15 10:08:31 +0000 |
commit | 6e6a558ebce556476d8fd659b419a2760f2ab154 (patch) | |
tree | 0331aaeaa342b5fd9f0adf98895d3d91e2182f4f | |
parent | e9b58d0aac4e89b53a4be0e6f289b66649e1512b (diff) | |
download | external_llvm-6e6a558ebce556476d8fd659b419a2760f2ab154.zip external_llvm-6e6a558ebce556476d8fd659b419a2760f2ab154.tar.gz external_llvm-6e6a558ebce556476d8fd659b419a2760f2ab154.tar.bz2 |
Add a bad char heuristic to StringRef::find.
Based on Horspool's simplified version of Boyer-Moore. We use a constant-sized table of
uint8_ts to keep cache thrashing low, needles bigger than 255 bytes are uncommon anyways.
The worst case is still O(n*m) but we do a lot better on the average case now.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@142061 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Support/StringRef.cpp | 29 | ||||
-rw-r--r-- | unittests/ADT/StringRefTest.cpp | 6 |
2 files changed, 32 insertions, 3 deletions
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index b5b4f94..a862ed2 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -144,9 +144,32 @@ size_t StringRef::find(StringRef Str, size_t From) const { size_t N = Str.size(); if (N > Length) return npos; - for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) - if (substr(i, N).equals(Str)) - return i; + + // For short haystacks or unsupported needles fall back to the naive algorithm + if (Length < 16 || N > 255 || N == 0) { + for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) + if (substr(i, N).equals(Str)) + return i; + return npos; + } + + // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. + uint8_t BadCharSkip[256]; + std::memset(BadCharSkip, N, 256); + for (unsigned i = 0; i != N-1; ++i) + BadCharSkip[(uint8_t)Str[i]] = N-1-i; + + unsigned Len = Length, Pos = min(From, Length); + while (Len >= N) { + if (substr(Pos, N).equals(Str)) // See if this is the correct substring. + return Pos; + + // Otherwise skip the appropriate number of bytes. + uint8_t Skip = BadCharSkip[(uint8_t)Data[Pos+N-1]]; + Len -= Skip; + Pos += Skip; + } + return npos; } diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp index 8364eac..d910843 100644 --- a/unittests/ADT/StringRefTest.cpp +++ b/unittests/ADT/StringRefTest.cpp @@ -245,6 +245,12 @@ TEST(StringRefTest, Find) { EXPECT_EQ(StringRef::npos, Str.find("zz")); EXPECT_EQ(2U, Str.find("ll", 2)); EXPECT_EQ(StringRef::npos, Str.find("ll", 3)); + EXPECT_EQ(0U, Str.find("")); + StringRef LongStr("hellx xello hell ello world foo bar hello"); + EXPECT_EQ(36U, LongStr.find("hello")); + EXPECT_EQ(28U, LongStr.find("foo")); + EXPECT_EQ(12U, LongStr.find("hell", 2)); + EXPECT_EQ(0U, LongStr.find("")); EXPECT_EQ(3U, Str.rfind('l')); EXPECT_EQ(StringRef::npos, Str.rfind('z')); |