Diffstat (limited to 'src/google/protobuf/io/tokenizer_unittest.cc')
-rw-r--r--  src/google/protobuf/io/tokenizer_unittest.cc  730
1 file changed, 730 insertions, 0 deletions
diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc
new file mode 100644
index 0000000..eac1455
--- /dev/null
+++ b/src/google/protobuf/io/tokenizer_unittest.cc
@@ -0,0 +1,730 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// http://code.google.com/p/protobuf/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+// Based on original Protocol Buffers design by
+// Sanjay Ghemawat, Jeff Dean, and others.
+
+#include <vector>
+#include <math.h>
+#include <limits.h>
+
+#include <google/protobuf/io/tokenizer.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/stubs/strutil.h>
+#include <google/protobuf/stubs/substitute.h>
+#include <google/protobuf/testing/googletest.h>
+#include <gtest/gtest.h>
+
+namespace google {
+namespace protobuf {
+namespace io {
+namespace {
+
+// ===================================================================
+// Data-Driven Test Infrastructure
+
+// TODO(kenton): This is copied from coded_stream_unittest. This is
+// temporary until these features are integrated into gTest itself.
+
+// TEST_1D and TEST_2D are macros I'd eventually like to see added to
+// gTest. These macros can be used to declare tests which should be
+// run multiple times, once for each item in some input array. TEST_1D
+// tests all cases in a single input array. TEST_2D tests all
+// combinations of cases from two arrays. The arrays must be statically
+// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
+//
+//   int kCases[] = {1, 2, 3, 4};
+// TEST_1D(MyFixture, MyTest, kCases) {
+// EXPECT_GT(kCases_case, 0);
+// }
+//
+// This test iterates through the numbers 1, 2, 3, and 4 and tests that
+// they are all greater than zero.  In case of failure, the exact case
+// which failed will be printed. The case type must be printable using
+// ostream::operator<<.
+
+#define TEST_1D(FIXTURE, NAME, CASES) \
+ class FIXTURE##_##NAME##_DD : public FIXTURE { \
+ protected: \
+ template <typename CaseType> \
+ void DoSingleCase(const CaseType& CASES##_case); \
+ }; \
+ \
+ TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
+ for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) { \
+ SCOPED_TRACE(testing::Message() \
+ << #CASES " case #" << i << ": " << CASES[i]); \
+ DoSingleCase(CASES[i]); \
+ } \
+ } \
+ \
+ template <typename CaseType> \
+ void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
+
+#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
+ class FIXTURE##_##NAME##_DD : public FIXTURE { \
+ protected: \
+ template <typename CaseType1, typename CaseType2> \
+ void DoSingleCase(const CaseType1& CASES1##_case, \
+ const CaseType2& CASES2##_case); \
+ }; \
+ \
+ TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
+ for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) { \
+ for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) { \
+ SCOPED_TRACE(testing::Message() \
+ << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
+ << #CASES2 " case #" << j << ": " << CASES2[j]); \
+ DoSingleCase(CASES1[i], CASES2[j]); \
+ } \
+ } \
+ } \
+ \
+ template <typename CaseType1, typename CaseType2> \
+ void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
+ const CaseType2& CASES2##_case)
+
+// -------------------------------------------------------------------
+
+// An input stream that is basically like an ArrayInputStream but sometimes
+// returns empty buffers, just to throw us off.
+class TestInputStream : public ZeroCopyInputStream {
+ public:
+ TestInputStream(const void* data, int size, int block_size)
+ : array_stream_(data, size, block_size), counter_(0) {}
+ ~TestInputStream() {}
+
+ // implements ZeroCopyInputStream ----------------------------------
+ bool Next(const void** data, int* size) {
+    // Return an empty buffer whenever the zero-based call count is a
+    // multiple of 3 or 5 (including the very first call).
+ if (counter_ % 3 == 0 || counter_ % 5 == 0) {
+ *data = NULL;
+ *size = 0;
+ ++counter_;
+ return true;
+ } else {
+ ++counter_;
+ return array_stream_.Next(data, size);
+ }
+ }
+
+ void BackUp(int count) { return array_stream_.BackUp(count); }
+ bool Skip(int count) { return array_stream_.Skip(count); }
+ int64 ByteCount() const { return array_stream_.ByteCount(); }
+
+ private:
+ ArrayInputStream array_stream_;
+ int counter_;
+};
+
+// -------------------------------------------------------------------
+
+// An error collector which simply concatenates all its errors into a big
+// block of text which can be checked.
+class TestErrorCollector : public ErrorCollector {
+ public:
+ TestErrorCollector() {}
+ ~TestErrorCollector() {}
+
+ string text_;
+
+ // implements ErrorCollector ---------------------------------------
+ void AddError(int line, int column, const string& message) {
+ strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
+ line, column, message);
+ }
+};
+
+// -------------------------------------------------------------------
+
+// We test each operation over a variety of block sizes to ensure that
+// we test cases where reads cross buffer boundaries as well as cases
+// where they don't. This is sort of a brute-force approach to this,
+// but it's easy to write and easy to understand.
+const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
+
+class TokenizerTest : public testing::Test {
+ protected:
+ // For easy testing.
+ uint64 ParseInteger(const string& text) {
+ uint64 result;
+ EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
+ return result;
+ }
+};
+
+// ===================================================================
+
+// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
+// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
+#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
+
+// In each test case, the entire input text should parse as a single token
+// of the given type.
+struct SimpleTokenCase {
+ string input;
+ Tokenizer::TokenType type;
+};
+
+inline ostream& operator<<(ostream& out,
+ const SimpleTokenCase& test_case) {
+ return out << CEscape(test_case.input);
+}
+
+SimpleTokenCase kSimpleTokenCases[] = {
+ // Test identifiers.
+ { "hello", Tokenizer::TYPE_IDENTIFIER },
+
+ // Test integers.
+ { "123", Tokenizer::TYPE_INTEGER },
+ { "0xab6", Tokenizer::TYPE_INTEGER },
+ { "0XAB6", Tokenizer::TYPE_INTEGER },
+ { "0X1234567", Tokenizer::TYPE_INTEGER },
+ { "0x89abcdef", Tokenizer::TYPE_INTEGER },
+ { "0x89ABCDEF", Tokenizer::TYPE_INTEGER },
+ { "01234567", Tokenizer::TYPE_INTEGER },
+
+ // Test floats.
+ { "123.45", Tokenizer::TYPE_FLOAT },
+ { "1.", Tokenizer::TYPE_FLOAT },
+ { "1e3", Tokenizer::TYPE_FLOAT },
+ { "1E3", Tokenizer::TYPE_FLOAT },
+ { "1e-3", Tokenizer::TYPE_FLOAT },
+ { "1e+3", Tokenizer::TYPE_FLOAT },
+ { "1.e3", Tokenizer::TYPE_FLOAT },
+ { "1.2e3", Tokenizer::TYPE_FLOAT },
+ { ".1", Tokenizer::TYPE_FLOAT },
+ { ".1e3", Tokenizer::TYPE_FLOAT },
+ { ".1e-3", Tokenizer::TYPE_FLOAT },
+ { ".1e+3", Tokenizer::TYPE_FLOAT },
+
+ // Test strings.
+ { "'hello'", Tokenizer::TYPE_STRING },
+ { "\"foo\"", Tokenizer::TYPE_STRING },
+ { "'a\"b'", Tokenizer::TYPE_STRING },
+ { "\"a'b\"", Tokenizer::TYPE_STRING },
+ { "'a\\'b'", Tokenizer::TYPE_STRING },
+ { "\"a\\\"b\"", Tokenizer::TYPE_STRING },
+ { "'\\xf'", Tokenizer::TYPE_STRING },
+ { "'\\0'", Tokenizer::TYPE_STRING },
+
+ // Test symbols.
+ { "+", Tokenizer::TYPE_SYMBOL },
+ { ".", Tokenizer::TYPE_SYMBOL },
+};
+
+TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
+ // Set up the tokenizer.
+ TestInputStream input(kSimpleTokenCases_case.input.data(),
+ kSimpleTokenCases_case.input.size(),
+ kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+
+ // Before Next() is called, the initial token should always be TYPE_START.
+ EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
+ EXPECT_EQ("", tokenizer.current().text);
+ EXPECT_EQ(0, tokenizer.current().line);
+ EXPECT_EQ(0, tokenizer.current().column);
+
+ // Parse the token.
+ ASSERT_TRUE(tokenizer.Next());
+
+ // Check that it has the right type.
+ EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
+ // Check that it contains the complete input text.
+ EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
+  // Check that it is located at the beginning of the input.
+ EXPECT_EQ(0, tokenizer.current().line);
+ EXPECT_EQ(0, tokenizer.current().column);
+
+ // There should be no more input.
+ EXPECT_FALSE(tokenizer.Next());
+
+ // After Next() returns false, the token should have type TYPE_END.
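+  // Since each of these inputs fits on a single line, the end token's
+  // column is expected to equal the input length.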
+ EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
+ EXPECT_EQ("", tokenizer.current().text);
+ EXPECT_EQ(0, tokenizer.current().line);
+ EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
+
+ // There should be no errors.
+ EXPECT_TRUE(error_collector.text_.empty());
+}
+
+TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
+ // Test the "allow_f_after_float" option.
+
+ // Set up the tokenizer.
+ const char* text = "1f 2.5f 6e3f 7F";
+ TestInputStream input(text, strlen(text), kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+ tokenizer.set_allow_f_after_float(true);
+
+ // Advance through tokens and check that they are parsed as expected.
+ ASSERT_TRUE(tokenizer.Next());
+ EXPECT_EQ(tokenizer.current().text, "1f");
+ EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
+ ASSERT_TRUE(tokenizer.Next());
+ EXPECT_EQ(tokenizer.current().text, "2.5f");
+ EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
+ ASSERT_TRUE(tokenizer.Next());
+ EXPECT_EQ(tokenizer.current().text, "6e3f");
+ EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
+ ASSERT_TRUE(tokenizer.Next());
+ EXPECT_EQ(tokenizer.current().text, "7F");
+ EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
+
+ // There should be no more input.
+ EXPECT_FALSE(tokenizer.Next());
+ // There should be no errors.
+ EXPECT_TRUE(error_collector.text_.empty());
+}
+
+#endif
+
+// -------------------------------------------------------------------
+
+// In each case, the input is parsed to produce a list of tokens. The
+// last token in "output" must have type TYPE_END.
+struct MultiTokenCase {
+ string input;
+ Tokenizer::Token output[10]; // The compiler wants a constant array
+ // size for initialization to work. There
+ // is no reason this can't be increased if
+ // needed.
+};
+
+inline ostream& operator<<(ostream& out,
+ const MultiTokenCase& test_case) {
+ return out << CEscape(test_case.input);
+}
+
+MultiTokenCase kMultiTokenCases[] = {
+ // Test empty input.
+ { "", {
+ { Tokenizer::TYPE_END , "" , 0, 0 },
+ }},
+
+ // Test all token types at the same time.
+ { "foo 1 1.2 + 'bar'", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo" , 0, 0 },
+ { Tokenizer::TYPE_INTEGER , "1" , 0, 4 },
+ { Tokenizer::TYPE_FLOAT , "1.2" , 0, 6 },
+ { Tokenizer::TYPE_SYMBOL , "+" , 0, 10 },
+ { Tokenizer::TYPE_STRING , "'bar'", 0, 12 },
+ { Tokenizer::TYPE_END , "" , 0, 17 },
+ }},
+
+ // Test that consecutive symbols are parsed as separate tokens.
+ { "!@+%", {
+ { Tokenizer::TYPE_SYMBOL , "!" , 0, 0 },
+ { Tokenizer::TYPE_SYMBOL , "@" , 0, 1 },
+ { Tokenizer::TYPE_SYMBOL , "+" , 0, 2 },
+ { Tokenizer::TYPE_SYMBOL , "%" , 0, 3 },
+ { Tokenizer::TYPE_END , "" , 0, 4 },
+ }},
+
+ // Test that newlines affect line numbers correctly.
+ { "foo bar\nrab oof", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
+ { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4 },
+ { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0 },
+ { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4 },
+ { Tokenizer::TYPE_END , "" , 1, 7 },
+ }},
+
+ // Test that tabs affect column numbers correctly.
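+  // (The expected columns assume that a tab advances the column to the
+  // next multiple of eight.)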
+ { "foo\tbar \tbaz", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
+ { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8 },
+ { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
+ { Tokenizer::TYPE_END , "" , 0, 19 },
+ }},
+
+ // Test that line comments are ignored.
+ { "foo // This is a comment\n"
+ "bar // This is another comment", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
+ { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0 },
+ { Tokenizer::TYPE_END , "" , 1, 30 },
+ }},
+
+ // Test that block comments are ignored.
+ { "foo /* This is a block comment */ bar", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
+ { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
+ { Tokenizer::TYPE_END , "" , 0, 37 },
+ }},
+
+ // Test that sh-style comments are not ignored by default.
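+  // With the default comment style, '#' should come through as an
+  // ordinary symbol token.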
+ { "foo # bar\n"
+ "baz", {
+ { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
+ { Tokenizer::TYPE_SYMBOL , "#" , 0, 4 },
+ { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6 },
+ { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0 },
+ { Tokenizer::TYPE_END , "" , 1, 3 },
+ }},
+};
+
+TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
+ // Set up the tokenizer.
+ TestInputStream input(kMultiTokenCases_case.input.data(),
+ kMultiTokenCases_case.input.size(),
+ kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+
+ // Before Next() is called, the initial token should always be TYPE_START.
+ EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
+ EXPECT_EQ("", tokenizer.current().text);
+ EXPECT_EQ(0, tokenizer.current().line);
+ EXPECT_EQ(0, tokenizer.current().column);
+
+ // Loop through all expected tokens.
+ int i = 0;
+ Tokenizer::Token token;
+ do {
+ token = kMultiTokenCases_case.output[i++];
+
+ SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
+
+ // Next() should only return false when it hits the end token.
+ if (token.type != Tokenizer::TYPE_END) {
+ ASSERT_TRUE(tokenizer.Next());
+ } else {
+ ASSERT_FALSE(tokenizer.Next());
+ }
+
+ // Check that the token matches the expected one.
+ EXPECT_EQ(token.type, tokenizer.current().type);
+ EXPECT_EQ(token.text, tokenizer.current().text);
+ EXPECT_EQ(token.line, tokenizer.current().line);
+ EXPECT_EQ(token.column, tokenizer.current().column);
+
+ } while (token.type != Tokenizer::TYPE_END);
+
+ // There should be no errors.
+ EXPECT_TRUE(error_collector.text_.empty());
+}
+
+// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
+// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
+#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
+
+TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
+ // Test the "comment_style" option.
+
+ const char* text = "foo # bar\n"
+ "baz // qux\n"
+ "corge /* grault */\n"
+ "garply";
+ const char* const kTokens[] = {"foo", // "# bar" is ignored
+ "baz", "/", "/", "qux",
+ "corge", "/", "*", "grault", "*", "/",
+ "garply"};
+
+ // Set up the tokenizer.
+ TestInputStream input(text, strlen(text), kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+ tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
+
+ // Advance through tokens and check that they are parsed as expected.
+ for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
+ EXPECT_TRUE(tokenizer.Next());
+ EXPECT_EQ(tokenizer.current().text, kTokens[i]);
+ }
+
+ // There should be no more input.
+ EXPECT_FALSE(tokenizer.Next());
+ // There should be no errors.
+ EXPECT_TRUE(error_collector.text_.empty());
+}
+
+#endif
+
+// -------------------------------------------------------------------
+
+// Test parse helpers. It's not really worth setting up a full data-driven
+// test here.
+TEST_F(TokenizerTest, ParseInteger) {
+ EXPECT_EQ(0, ParseInteger("0"));
+ EXPECT_EQ(123, ParseInteger("123"));
+ EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
+ EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
+ EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
+ EXPECT_EQ(01234567, ParseInteger("01234567"));
+ EXPECT_EQ(0X123, ParseInteger("0X123"));
+
+ // Test invalid integers that may still be tokenized as integers.
+ EXPECT_EQ(0, ParseInteger("0x"));
+
+ uint64 i;
+#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
+ // Test invalid integers that will never be tokenized as integers.
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
+ "passed text that could not have been tokenized as an integer");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
+ "passed text that could not have been tokenized as an integer");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
+ "passed text that could not have been tokenized as an integer");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
+ "passed text that could not have been tokenized as an integer");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
+ "passed text that could not have been tokenized as an integer");
+#endif // GTEST_HAS_DEATH_TEST
+
+ // Test overflows.
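+  // The second argument is the maximum value allowed; anything larger
+  // should be rejected.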
+ EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
+ EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
+ EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
+ EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
+ EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
+ EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
+ EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
+}
+
+TEST_F(TokenizerTest, ParseFloat) {
+ EXPECT_DOUBLE_EQ(1 , Tokenizer::ParseFloat("1."));
+ EXPECT_DOUBLE_EQ(1e3 , Tokenizer::ParseFloat("1e3"));
+ EXPECT_DOUBLE_EQ(1e3 , Tokenizer::ParseFloat("1E3"));
+ EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
+ EXPECT_DOUBLE_EQ(.1 , Tokenizer::ParseFloat(".1"));
+ EXPECT_DOUBLE_EQ(.25 , Tokenizer::ParseFloat(".25"));
+ EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
+ EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
+ EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
+ EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
+ EXPECT_DOUBLE_EQ(5 , Tokenizer::ParseFloat("5"));
+ EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
+ EXPECT_DOUBLE_EQ(1.2 , Tokenizer::ParseFloat("1.2"));
+ EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));
+
+  // Test invalid floats that may still be tokenized as floats.
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));
+
+ // Test 'f' suffix.
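+  // The trailing 'f' or 'F' is expected to simply be ignored.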
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
+ EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));
+
+ // These should parse successfully even though they are out of range.
+ // Overflows become infinity and underflows become zero.
+ EXPECT_EQ( 0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
+ EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
+
+#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
+  // Test invalid floats that will never be tokenized as floats.
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
+ "passed text that could not have been tokenized as a float");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
+ "passed text that could not have been tokenized as a float");
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
+ "passed text that could not have been tokenized as a float");
+#endif // GTEST_HAS_DEATH_TEST
+}
+
+TEST_F(TokenizerTest, ParseString) {
+ string output;
+ Tokenizer::ParseString("'hello'", &output);
+ EXPECT_EQ("hello", output);
+ Tokenizer::ParseString("\"blah\\nblah2\"", &output);
+ EXPECT_EQ("blah\nblah2", output);
+ Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
+ EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
+ Tokenizer::ParseString("'\\x20\\x4'", &output);
+ EXPECT_EQ("\x20\x4", output);
+
+ // Test invalid strings that may still be tokenized as strings.
+ Tokenizer::ParseString("\"\\a\\l\\v\\t", &output); // \l is invalid
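+  // (The unrecognized escape is expected to come through as '?'.)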
+ EXPECT_EQ("\a?\v\t", output);
+ Tokenizer::ParseString("'", &output);
+ EXPECT_EQ("", output);
+ Tokenizer::ParseString("'\\", &output);
+ EXPECT_EQ("\\", output);
+
+ // Test invalid strings that will never be tokenized as strings.
+#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
+ EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
+ "passed text that could not have been tokenized as a string");
+#endif // GTEST_HAS_DEATH_TEST
+}
+
+TEST_F(TokenizerTest, ParseStringAppend) {
+ // Check that ParseString and ParseStringAppend differ.
+ string output("stuff+");
+ Tokenizer::ParseStringAppend("'hello'", &output);
+ EXPECT_EQ("stuff+hello", output);
+ Tokenizer::ParseString("'hello'", &output);
+ EXPECT_EQ("hello", output);
+}
+
+// -------------------------------------------------------------------
+
+// Each case parses some input text, ignoring the tokens produced, and
+// checks that the error output matches what is expected.
+struct ErrorCase {
+ string input;
+ bool recoverable; // True if the tokenizer should be able to recover and
+ // parse more tokens after seeing this error. Cases
+ // for which this is true must end with "foo" as
+ // the last token, which the test will check for.
+ const char* errors;
+};
+
+inline ostream& operator<<(ostream& out,
+ const ErrorCase& test_case) {
+ return out << CEscape(test_case.input);
+}
+
+ErrorCase kErrorCases[] = {
+ // String errors.
+ { "'\\l' foo", true,
+ "0:2: Invalid escape sequence in string literal.\n" },
+ { "'\\x' foo", true,
+ "0:3: Expected hex digits for escape sequence.\n" },
+ { "'foo", false,
+ "0:4: String literals cannot cross line boundaries.\n" },
+ { "'bar\nfoo", true,
+ "0:4: String literals cannot cross line boundaries.\n" },
+
+ // Integer errors.
+ { "123foo", true,
+ "0:3: Need space between number and identifier.\n" },
+
+ // Hex/octal errors.
+ { "0x foo", true,
+ "0:2: \"0x\" must be followed by hex digits.\n" },
+ { "0541823 foo", true,
+ "0:4: Numbers starting with leading zero must be in octal.\n" },
+ { "0x123z foo", true,
+ "0:5: Need space between number and identifier.\n" },
+ { "0x123.4 foo", true,
+ "0:5: Hex and octal numbers must be integers.\n" },
+ { "0123.4 foo", true,
+ "0:4: Hex and octal numbers must be integers.\n" },
+
+ // Float errors.
+ { "1e foo", true,
+ "0:2: \"e\" must be followed by exponent.\n" },
+ { "1e- foo", true,
+ "0:3: \"e\" must be followed by exponent.\n" },
+ { "1.2.3 foo", true,
+ "0:3: Already saw decimal point or exponent; can't have another one.\n" },
+ { "1e2.3 foo", true,
+ "0:3: Already saw decimal point or exponent; can't have another one.\n" },
+ { "a.1 foo", true,
+ "0:1: Need space between identifier and decimal point.\n" },
+ // allow_f_after_float not enabled, so this should be an error.
+ { "1.0f foo", true,
+ "0:3: Need space between number and identifier.\n" },
+
+ // Block comment errors.
+ { "/*", false,
+ "0:2: End-of-file inside block comment.\n"
+ "0:0: Comment started here.\n"},
+ { "/*/*/ foo", true,
+ "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},
+
+ // Control characters. Multiple consecutive control characters should only
+ // produce one error.
+ { "\b foo", true,
+ "0:0: Invalid control characters encountered in text.\n" },
+ { "\b\b foo", true,
+ "0:0: Invalid control characters encountered in text.\n" },
+
+ // Check that control characters at end of input don't result in an
+ // infinite loop.
+ { "\b", false,
+ "0:0: Invalid control characters encountered in text.\n" },
+
+ // Check recovery from '\0'. We have to explicitly specify the length of
+ // these strings because otherwise the string constructor will just call
+ // strlen() which will see the first '\0' and think that is the end of the
+ // string.
+ { string("\0foo", 4), true,
+ "0:0: Invalid control characters encountered in text.\n" },
+ { string("\0\0foo", 5), true,
+ "0:0: Invalid control characters encountered in text.\n" },
+};
+
+TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
+ // Set up the tokenizer.
+ TestInputStream input(kErrorCases_case.input.data(),
+ kErrorCases_case.input.size(),
+ kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+
+ // Ignore all input, except remember if the last token was "foo".
+ bool last_was_foo = false;
+ while (tokenizer.Next()) {
+ last_was_foo = tokenizer.current().text == "foo";
+ }
+
+ // Check that the errors match what was expected.
+ EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);
+
+ // If the error was recoverable, make sure we saw "foo" after it.
+ if (kErrorCases_case.recoverable) {
+ EXPECT_TRUE(last_was_foo);
+ }
+}
+
+// -------------------------------------------------------------------
+
+TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
+ string text = "foo bar";
+ TestInputStream input(text.data(), text.size(), kBlockSizes_case);
+
+ // Create a tokenizer, read one token, then destroy it.
+ {
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+
+ tokenizer.Next();
+ }
+
+ // Only "foo" should have been read.
+ EXPECT_EQ(strlen("foo"), input.ByteCount());
+}
+
+} // namespace
+} // namespace io
+} // namespace protobuf
+} // namespace google