From d2a5c0d8562407f9acab97451a785b513edd4c9b Mon Sep 17 00:00:00 2001 From: Daniel Dunbar Date: Wed, 17 Feb 2010 20:08:42 +0000 Subject: Add Regex::sub, for doing regular expression substitution with backreferences. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@96503 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/Regex.h | 13 +++++++ lib/Support/Regex.cpp | 76 +++++++++++++++++++++++++++++++++++++++++ unittests/Support/RegexTest.cpp | 29 ++++++++++++++++ 3 files changed, 118 insertions(+) diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h index c954c0d..591af00 100644 --- a/include/llvm/Support/Regex.h +++ b/include/llvm/Support/Regex.h @@ -56,6 +56,19 @@ namespace llvm { /// /// This returns true on a successful match. bool match(const StringRef &String, SmallVectorImpl *Matches=0); + + /// sub - Return the result of replacing the first match of the regex in + /// \arg String with the \arg Repl string. Backreferences like "\0" in the + /// replacement string are replaced with the appropriate match substring. + /// + /// Note that the replacement string has backslash escaping performed on + /// it. Invalid backreferences are ignored (replaced by empty strings). + /// + /// \param Error If non-null, any errors in the substitution (invalid + /// backreferences, trailing backslashes) will be recorded as a non-empty + /// string. + std::string sub(StringRef Repl, StringRef String, std::string *Error = 0); + private: struct llvm_regex *preg; int error; diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp index 618ca05..a7631de 100644 --- a/lib/Support/Regex.cpp +++ b/lib/Support/Regex.cpp @@ -90,3 +90,79 @@ bool Regex::match(const StringRef &String, SmallVectorImpl *Matches){ return true; } + +std::string Regex::sub(StringRef Repl, StringRef String, + std::string *Error) { + SmallVector Matches; + + // Reset error, if given. + if (Error && !Error->empty()) *Error = ""; + + // Return the input if there was no match. 
+ if (!match(String, &Matches)) + return String; + + // Otherwise splice in the replacement string, starting with the prefix before + // the match. + std::string Res(String.begin(), Matches[0].begin()); + + // Then the replacement string, honoring possible substitutions. + while (!Repl.empty()) { + // Skip to the next escape. + std::pair Split = Repl.split('\\'); + + // Add the skipped substring. + Res += Split.first; + + // Check for termination and trailing backslash. + if (Split.second.empty()) { + if (Repl.size() != Split.first.size() && + Error && Error->empty()) + *Error = "replacement string contained trailing backslash"; + break; + } + + // Otherwise update the replacement string and interpret escapes. + Repl = Split.second; + + // FIXME: We should have a StringExtras function for mapping C99 escapes. + switch (Repl[0]) { + // Treat all unrecognized characters as self-quoting. + default: + Res += Repl[0]; + Repl = Repl.substr(1); + break; + + // Single character escapes. + case 't': + Res += '\t'; + Repl = Repl.substr(1); + break; + case 'n': + Res += '\n'; + Repl = Repl.substr(1); + break; + + // Decimal escapes are backreferences. + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + // Extract the backreference number. + StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); + Repl = Repl.substr(Ref.size()); + + unsigned RefValue; + if (!Ref.getAsInteger(10, RefValue) && + RefValue < Matches.size()) + Res += Matches[RefValue]; + else if (Error && Error->empty()) + *Error = "invalid backreference string '" + Ref.str() + "'"; + break; + } + } + } + + // And finally the suffix. 
+ Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); + + return Res; +} diff --git a/unittests/Support/RegexTest.cpp b/unittests/Support/RegexTest.cpp index 44c7e55..65b66c3 100644 --- a/unittests/Support/RegexTest.cpp +++ b/unittests/Support/RegexTest.cpp @@ -62,4 +62,33 @@ TEST_F(RegexTest, Basics) { EXPECT_TRUE(r5.match(String)); } +TEST_F(RegexTest, Substitution) { + std::string Error; + + EXPECT_EQ("aNUMber", Regex("[0-9]+").sub("NUM", "a1234ber")); + + // Standard Escapes + EXPECT_EQ("a\\ber", Regex("[0-9]+").sub("\\\\", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + EXPECT_EQ("a\nber", Regex("[0-9]+").sub("\\n", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + EXPECT_EQ("a\tber", Regex("[0-9]+").sub("\\t", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + EXPECT_EQ("ajber", Regex("[0-9]+").sub("\\j", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + + EXPECT_EQ("aber", Regex("[0-9]+").sub("\\", "a1234ber", &Error)); + EXPECT_EQ(Error, "replacement string contained trailing backslash"); + + // Backreferences + EXPECT_EQ("aa1234bber", Regex("a[0-9]+b").sub("a\\0b", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + + EXPECT_EQ("a1234ber", Regex("a([0-9]+)b").sub("a\\1b", "a1234ber", &Error)); + EXPECT_EQ(Error, ""); + + EXPECT_EQ("aber", Regex("a[0-9]+b").sub("a\\100b", "a1234ber", &Error)); + EXPECT_EQ(Error, "invalid backreference string '100'"); +} + } -- cgit v1.1