diff options
Diffstat (limited to 'src/google/protobuf/io/tokenizer_unittest.cc')
-rw-r--r-- | src/google/protobuf/io/tokenizer_unittest.cc | 358 |
1 files changed, 51 insertions, 307 deletions
diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc index de096fb..358ec56 100644 --- a/src/google/protobuf/io/tokenizer_unittest.cc +++ b/src/google/protobuf/io/tokenizer_unittest.cc @@ -1,6 +1,6 @@ // Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. -// https://developers.google.com/protocol-buffers/ +// http://code.google.com/p/protobuf/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -32,10 +32,9 @@ // Based on original Protocol Buffers design by // Sanjay Ghemawat, Jeff Dean, and others. -#include <limits.h> -#include <math.h> - #include <vector> +#include <math.h> +#include <limits.h> #include <google/protobuf/io/tokenizer.h> #include <google/protobuf/io/zero_copy_stream_impl.h> @@ -258,7 +257,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) { EXPECT_EQ("", tokenizer.current().text); EXPECT_EQ(0, tokenizer.current().line); EXPECT_EQ(0, tokenizer.current().column); - EXPECT_EQ(0, tokenizer.current().end_column); // Parse the token. ASSERT_TRUE(tokenizer.Next()); @@ -270,8 +268,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) { // Check that it is located at the beginning of the input EXPECT_EQ(0, tokenizer.current().line); EXPECT_EQ(0, tokenizer.current().column); - EXPECT_EQ(kSimpleTokenCases_case.input.size(), - tokenizer.current().end_column); // There should be no more input. EXPECT_FALSE(tokenizer.Next()); @@ -281,8 +277,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) { EXPECT_EQ("", tokenizer.current().text); EXPECT_EQ(0, tokenizer.current().line); EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column); - EXPECT_EQ(kSimpleTokenCases_case.input.size(), - tokenizer.current().end_column); // There should be no errors. EXPECT_TRUE(error_collector.text_.empty()); @@ -345,77 +339,76 @@ MultiTokenCase kMultiTokenCases[] = { // Test all token types at the same time. { "foo 1 1.2 + 'bar'", { - { Tokenizer::TYPE_IDENTIFIER, "foo" , 0, 0, 3 }, - { Tokenizer::TYPE_INTEGER , "1" , 0, 4, 5 }, - { Tokenizer::TYPE_FLOAT , "1.2" , 0, 6, 9 }, - { Tokenizer::TYPE_SYMBOL , "+" , 0, 10, 11 }, - { Tokenizer::TYPE_STRING , "'bar'", 0, 12, 17 }, - { Tokenizer::TYPE_END , "" , 0, 17, 17 }, + { Tokenizer::TYPE_IDENTIFIER, "foo" , 0, 0 }, + { Tokenizer::TYPE_INTEGER , "1" , 0, 4 }, + { Tokenizer::TYPE_FLOAT , "1.2" , 0, 6 }, + { Tokenizer::TYPE_SYMBOL , "+" , 0, 10 }, + { Tokenizer::TYPE_STRING , "'bar'", 0, 12 }, + { Tokenizer::TYPE_END , "" , 0, 17 }, }}, // Test that consecutive symbols are parsed as separate tokens. { "!@+%", { - { Tokenizer::TYPE_SYMBOL , "!" , 0, 0, 1 }, - { Tokenizer::TYPE_SYMBOL , "@" , 0, 1, 2 }, - { Tokenizer::TYPE_SYMBOL , "+" , 0, 2, 3 }, - { Tokenizer::TYPE_SYMBOL , "%" , 0, 3, 4 }, - { Tokenizer::TYPE_END , "" , 0, 4, 4 }, + { Tokenizer::TYPE_SYMBOL , "!" , 0, 0 }, + { Tokenizer::TYPE_SYMBOL , "@" , 0, 1 }, + { Tokenizer::TYPE_SYMBOL , "+" , 0, 2 }, + { Tokenizer::TYPE_SYMBOL , "%" , 0, 3 }, + { Tokenizer::TYPE_END , "" , 0, 4 }, }}, // Test that newlines affect line numbers correctly. { "foo bar\nrab oof", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 }, - { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 }, - { Tokenizer::TYPE_END , "" , 1, 7, 7 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4 }, + { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4 }, + { Tokenizer::TYPE_END , "" , 1, 7 }, }}, // Test that tabs affect column numbers correctly. { "foo\tbar \tbaz", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11 }, - { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 }, - { Tokenizer::TYPE_END , "" , 0, 19, 19 }, - }}, - - // Test that tabs in string literals affect column numbers correctly. - { "\"foo\tbar\" baz", { - { Tokenizer::TYPE_STRING , "\"foo\tbar\"", 0, 0, 12 }, - { Tokenizer::TYPE_IDENTIFIER, "baz" , 0, 13, 16 }, - { Tokenizer::TYPE_END , "" , 0, 16, 16 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8 }, + { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 }, + { Tokenizer::TYPE_END , "" , 0, 19 }, }}, // Test that line comments are ignored. { "foo // This is a comment\n" "bar // This is another comment", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3 }, - { Tokenizer::TYPE_END , "" , 1, 30, 30 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0 }, + { Tokenizer::TYPE_END , "" , 1, 30 }, }}, // Test that block comments are ignored. { "foo /* This is a block comment */ bar", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 }, - { Tokenizer::TYPE_END , "" , 0, 37, 37 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 }, + { Tokenizer::TYPE_END , "" , 0, 37 }, }}, // Test that sh-style comments are not ignored by default. { "foo # bar\n" "baz", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_SYMBOL , "#" , 0, 4, 5 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 }, - { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 }, - { Tokenizer::TYPE_END , "" , 1, 3, 3 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_SYMBOL , "#" , 0, 4 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6 }, + { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0 }, + { Tokenizer::TYPE_END , "" , 1, 3 }, + }}, + + // Bytes with the high-order bit set should not be seen as control characters. + { "\300", { + { Tokenizer::TYPE_SYMBOL, "\300", 0, 0 }, + { Tokenizer::TYPE_END , "" , 0, 1 }, }}, // Test all whitespace chars { "foo\n\t\r\v\fbar", { - { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 }, - { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 }, - { Tokenizer::TYPE_END , "" , 1, 14, 14 }, + { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 }, + { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11 }, + { Tokenizer::TYPE_END , "" , 1, 14 }, }}, }; @@ -432,7 +425,6 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) { EXPECT_EQ("", tokenizer.current().text); EXPECT_EQ(0, tokenizer.current().line); EXPECT_EQ(0, tokenizer.current().column); - EXPECT_EQ(0, tokenizer.current().end_column); // Loop through all expected tokens. int i = 0; @@ -442,8 +434,6 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) { SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text); - Tokenizer::Token previous = tokenizer.current(); - // Next() should only return false when it hits the end token. if (token.type != Tokenizer::TYPE_END) { ASSERT_TRUE(tokenizer.Next()); @@ -451,19 +441,11 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) { ASSERT_FALSE(tokenizer.Next()); } - // Check that the previous token is set correctly. - EXPECT_EQ(previous.type, tokenizer.previous().type); - EXPECT_EQ(previous.text, tokenizer.previous().text); - EXPECT_EQ(previous.line, tokenizer.previous().line); - EXPECT_EQ(previous.column, tokenizer.previous().column); - EXPECT_EQ(previous.end_column, tokenizer.previous().end_column); - // Check that the token matches the expected one. EXPECT_EQ(token.type, tokenizer.current().type); EXPECT_EQ(token.text, tokenizer.current().text); EXPECT_EQ(token.line, tokenizer.current().line); EXPECT_EQ(token.column, tokenizer.current().column); - EXPECT_EQ(token.end_column, tokenizer.current().end_column); } while (token.type != Tokenizer::TYPE_END); @@ -509,217 +491,6 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) { // ------------------------------------------------------------------- -// In each case, the input is expected to have two tokens named "prev" and -// "next" with comments in between. -struct DocCommentCase { - string input; - - const char* prev_trailing_comments; - const char* detached_comments[10]; - const char* next_leading_comments; -}; - -inline ostream& operator<<(ostream& out, - const DocCommentCase& test_case) { - return out << CEscape(test_case.input); -} - -DocCommentCase kDocCommentCases[] = { - { - "prev next", - - "", - {}, - "" - }, - - { - "prev /* ignored */ next", - - "", - {}, - "" - }, - - { - "prev // trailing comment\n" - "next", - - " trailing comment\n", - {}, - "" - }, - - { - "prev\n" - "// leading comment\n" - "// line 2\n" - "next", - - "", - {}, - " leading comment\n" - " line 2\n" - }, - - { - "prev\n" - "// trailing comment\n" - "// line 2\n" - "\n" - "next", - - " trailing comment\n" - " line 2\n", - {}, - "" - }, - - { - "prev // trailing comment\n" - "// leading comment\n" - "// line 2\n" - "next", - - " trailing comment\n", - {}, - " leading comment\n" - " line 2\n" - }, - - { - "prev /* trailing block comment */\n" - "/* leading block comment\n" - " * line 2\n" - " * line 3 */" - "next", - - " trailing block comment ", - {}, - " leading block comment\n" - " line 2\n" - " line 3 " - }, - - { - "prev\n" - "/* trailing block comment\n" - " * line 2\n" - " * line 3\n" - " */\n" - "/* leading block comment\n" - " * line 2\n" - " * line 3 */" - "next", - - " trailing block comment\n" - " line 2\n" - " line 3\n", - {}, - " leading block comment\n" - " line 2\n" - " line 3 " - }, - - { - "prev\n" - "// trailing comment\n" - "\n" - "// detached comment\n" - "// line 2\n" - "\n" - "// second detached comment\n" - "/* third detached comment\n" - " * line 2 */\n" - "// leading comment\n" - "next", - - " trailing comment\n", - { - " detached comment\n" - " line 2\n", - " second detached comment\n", - " third detached comment\n" - " line 2 " - }, - " leading comment\n" - }, - - { - "prev /**/\n" - "\n" - "// detached comment\n" - "\n" - "// leading comment\n" - "next", - - "", - { - " detached comment\n" - }, - " leading comment\n" - }, - - { - "prev /**/\n" - "// leading comment\n" - "next", - - "", - {}, - " leading comment\n" - }, - }; - -TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) { - // Set up the tokenizer. - TestInputStream input(kDocCommentCases_case.input.data(), - kDocCommentCases_case.input.size(), - kBlockSizes_case); - TestErrorCollector error_collector; - Tokenizer tokenizer(&input, &error_collector); - - // Set up a second tokenizer where we'll pass all NULLs to NextWithComments(). - TestInputStream input2(kDocCommentCases_case.input.data(), - kDocCommentCases_case.input.size(), - kBlockSizes_case); - Tokenizer tokenizer2(&input2, &error_collector); - - tokenizer.Next(); - tokenizer2.Next(); - - EXPECT_EQ("prev", tokenizer.current().text); - EXPECT_EQ("prev", tokenizer2.current().text); - - string prev_trailing_comments; - vector<string> detached_comments; - string next_leading_comments; - tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments, - &next_leading_comments); - tokenizer2.NextWithComments(NULL, NULL, NULL); - EXPECT_EQ("next", tokenizer.current().text); - EXPECT_EQ("next", tokenizer2.current().text); - - EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments, - prev_trailing_comments); - - for (int i = 0; i < detached_comments.size(); i++) { - ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases)); - ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL); - EXPECT_EQ(kDocCommentCases_case.detached_comments[i], - detached_comments[i]); - } - - // Verify that we matched all the detached comments. - EXPECT_EQ(NULL, - kDocCommentCases_case.detached_comments[detached_comments.size()]); - - EXPECT_EQ(kDocCommentCases_case.next_leading_comments, - next_leading_comments); -} - -// ------------------------------------------------------------------- - // Test parse helpers. It's not really worth setting up a full data-driven // test here. TEST_F(TokenizerTest, ParseInteger) { @@ -735,7 +506,7 @@ TEST_F(TokenizerTest, ParseInteger) { EXPECT_EQ(0, ParseInteger("0x")); uint64 i; -#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet +#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet // Test invalid integers that will never be tokenized as integers. EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i), "passed text that could not have been tokenized as an integer"); @@ -747,7 +518,7 @@ TEST_F(TokenizerTest, ParseInteger) { "passed text that could not have been tokenized as an integer"); EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i), "passed text that could not have been tokenized as an integer"); -#endif // PROTOBUF_HAS_DEATH_TEST +#endif // GTEST_HAS_DEATH_TEST // Test overflows. EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i)); @@ -790,7 +561,7 @@ TEST_F(TokenizerTest, ParseFloat) { EXPECT_EQ( 0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999")); EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999")); -#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet +#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet // Test invalid integers that will never be tokenized as integers. EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"), "passed text that could not have been tokenized as a float"); @@ -798,7 +569,7 @@ TEST_F(TokenizerTest, ParseFloat) { "passed text that could not have been tokenized as a float"); EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"), "passed text that could not have been tokenized as a float"); -#endif // PROTOBUF_HAS_DEATH_TEST +#endif // GTEST_HAS_DEATH_TEST } TEST_F(TokenizerTest, ParseString) { @@ -820,27 +591,11 @@ TEST_F(TokenizerTest, ParseString) { Tokenizer::ParseString("'\\", &output); EXPECT_EQ("\\", output); - // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode - // characters. - Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output); - EXPECT_EQ("$¢€𤭢XX", output); - // Same thing encoded using UTF16. - Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output); - EXPECT_EQ("$¢€𤭢XX", output); - // Here's some broken UTF16; there's a head surrogate with no tail surrogate. - // We just output this as if it were UTF8; it's not a defined code point, but - // it has a defined encoding. - Tokenizer::ParseString("'\\ud852XX'", &output); - EXPECT_EQ("\xed\xa1\x92XX", output); - // Malformed escape: Demons may fly out of the nose. - Tokenizer::ParseString("\\u0", &output); - EXPECT_EQ("u0", output); - // Test invalid strings that will never be tokenized as strings. -#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet +#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output), "passed text that could not have been tokenized as a string"); -#endif // PROTOBUF_HAS_DEATH_TEST +#endif // GTEST_HAS_DEATH_TEST } TEST_F(TokenizerTest, ParseStringAppend) { @@ -877,15 +632,9 @@ ErrorCase kErrorCases[] = { { "'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n" }, { "'foo", false, - "0:4: Unexpected end of string.\n" }, + "0:4: String literals cannot cross line boundaries.\n" }, { "'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n" }, - { "'\\u01' foo", true, - "0:5: Expected four hex digits for \\u escape sequence.\n" }, - { "'\\u01' foo", true, - "0:5: Expected four hex digits for \\u escape sequence.\n" }, - { "'\\uXYZ' foo", true, - "0:3: Expected four hex digits for \\u escape sequence.\n" }, // Integer errors. { "123foo", true, @@ -945,10 +694,6 @@ ErrorCase kErrorCases[] = { "0:0: Invalid control characters encountered in text.\n" }, { string("\0\0foo", 5), true, "0:0: Invalid control characters encountered in text.\n" }, - - // Check error from high order bits set - { "\300foo", true, - "0:0: Interpreting non ascii codepoint 192.\n" }, }; TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) { @@ -966,7 +711,7 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) { } // Check that the errors match what was expected. - EXPECT_EQ(kErrorCases_case.errors, error_collector.text_); + EXPECT_EQ(error_collector.text_, kErrorCases_case.errors); // If the error was recoverable, make sure we saw "foo" after it. if (kErrorCases_case.recoverable) { @@ -992,7 +737,6 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) { EXPECT_EQ(strlen("foo"), input.ByteCount()); } - } // namespace } // namespace io } // namespace protobuf |