diff options
author | Rui Ueyama <ruiu@google.com> | 2013-07-30 19:03:20 +0000 |
---|---|---|
committer | Rui Ueyama <ruiu@google.com> | 2013-07-30 19:03:20 +0000 |
commit | 264e92d6db46083f9f46484ec39e99f18d35d370 (patch) | |
tree | 3b838deb6312015d9ecd346e5140fe4cdafe9bb9 | |
parent | f6de55f5d40c498d3bc2d74cf169e5ad135f9215 (diff) | |
download | external_llvm-264e92d6db46083f9f46484ec39e99f18d35d370.zip external_llvm-264e92d6db46083f9f46484ec39e99f18d35d370.tar.gz external_llvm-264e92d6db46083f9f46484ec39e99f18d35d370.tar.bz2 |
Implement TokenizeWindowsCommandLine.
This is a follow up patch for r187390 to implement the parser for the
Windows-style command line. This should follow the rule as described
at http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
Differential Revision: http://llvm-reviews.chandlerc.com/D1235
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187430 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Support/CommandLine.cpp | 102 | ||||
-rw-r--r-- | unittests/Support/CommandLineTest.cpp | 37 |
2 files changed, 129 insertions, 10 deletions
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 1975a01..a47af27 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -498,9 +498,109 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(Saver.SaveString(Token.c_str())); } +/// Backslashes are interpreted in a rather complicated way in the Windows-style +/// command line, because backslashes are used both to separate path and to +/// escape double quote. This method consumes runs of backslashes as well as the +/// following double quote if it's escaped. +/// +/// * If an even number of backslashes is followed by a double quote, one +/// backslash is output for every pair of backslashes, and the last double +/// quote remains unconsumed. The double quote will later be interpreted as +/// the start or end of a quoted string in the main loop outside of this +/// function. +/// +/// * If an odd number of backslashes is followed by a double quote, one +/// backslash is output for every pair of backslashes, and a double quote is +/// output for the last pair of backslash-double quote. The double quote is +/// consumed in this case. +/// +/// * Otherwise, backslashes are interpreted literally. +static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) { + size_t E = Src.size(); + int BackslashCount = 0; + // Skip the backslashes. + do { + ++I; + ++BackslashCount; + } while (I != E && Src[I] == '\\'); + + bool FollowedByDoubleQuote = (I != E && Src[I] == '"'); + if (FollowedByDoubleQuote) { + Token.append(BackslashCount / 2, '\\'); + if (BackslashCount % 2 == 0) + return I - 1; + Token.push_back('"'); + return I; + } + Token.append(BackslashCount, '\\'); + return I - 1; +} + void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, SmallVectorImpl<const char *> &NewArgv) { - llvm_unreachable("FIXME not implemented"); + SmallString<128> Token; + + // This is a small state machine to consume characters until it reaches the + // end of the source string. + enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I != E; ++I) { + // INIT state indicates that the current input index is at the start of + // the string or between tokens. + if (State == INIT) { + if (isWhitespace(Src[I])) + continue; + if (Src[I] == '"') { + State = QUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + State = UNQUOTED; + continue; + } + Token.push_back(Src[I]); + State = UNQUOTED; + continue; + } + + // UNQUOTED state means that it's reading a token not quoted by double + // quotes. + if (State == UNQUOTED) { + // Whitespace means the end of the token. + if (isWhitespace(Src[I])) { + NewArgv.push_back(Saver.SaveString(Token.c_str())); + Token.clear(); + State = INIT; + continue; + } + if (Src[I] == '"') { + State = QUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + continue; + } + Token.push_back(Src[I]); + continue; + } + + // QUOTED state means that it's reading a token quoted by double quotes. + if (State == QUOTED) { + if (Src[I] == '"') { + State = UNQUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + continue; + } + Token.push_back(Src[I]); + } + } + // Append the last token after hitting EOF with no whitespace. + if (!Token.empty()) + NewArgv.push_back(Saver.SaveString(Token.c_str())); } static bool ExpandResponseFile(const char *FName, StringSaver &Saver, diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp index 7a1c382..c54e1b9 100644 --- a/unittests/Support/CommandLineTest.cpp +++ b/unittests/Support/CommandLineTest.cpp @@ -125,21 +125,40 @@ class StrDupSaver : public cl::StringSaver { } }; -TEST(CommandLineTest, TokenizeGNUCommandLine) { - const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' " - "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\""; - const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar", - "foobarbaz", "C:\\src\\foo.cpp", - "C:\\src\\foo.cpp" }; +typedef void ParserFunction(StringRef Source, llvm::cl::StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv); + + +void testCommandLineTokenizer(ParserFunction *parse, const char *Input, + const char *const Output[], size_t OutputSize) { SmallVector<const char *, 0> Actual; StrDupSaver Saver; - cl::TokenizeGNUCommandLine(Input, Saver, Actual); - EXPECT_EQ(array_lengthof(Output), Actual.size()); + parse(Input, Saver, Actual); + EXPECT_EQ(OutputSize, Actual.size()); for (unsigned I = 0, E = Actual.size(); I != E; ++I) { - if (I < array_lengthof(Output)) + if (I < OutputSize) EXPECT_STREQ(Output[I], Actual[I]); free(const_cast<char *>(Actual[I])); } } +TEST(CommandLineTest, TokenizeGNUCommandLine) { + const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' " + "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\""; + const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar", + "foobarbaz", "C:\\src\\foo.cpp", + "C:\\src\\foo.cpp" }; + testCommandLineTokenizer(cl::TokenizeGNUCommandLine, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeWindowsCommandLine) { + const char *Input = "a\\b c\\\\d e\\\\\"f g\" h\\\"i j\\\\\\\"k \"lmn\" o pqr " + "\"st \\\"u\" \\v"; + const char *const Output[] = { "a\\b", "c\\\\d", "e\\f g", "h\"i", "j\\\"k", + "lmn", "o", "pqr", "st \"u", "\\v" }; + testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input, Output, + array_lengthof(Output)); +} + } // anonymous namespace |