1 files changed, 51 insertions, 307 deletions
diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc
index de096fb..358ec56 100644
--- a/src/google/protobuf/io/tokenizer_unittest.cc
+++ b/src/google/protobuf/io/tokenizer_unittest.cc
@@ -1,6 +1,6 @@
 // Protocol Buffers - Google's data interchange format
 // Copyright 2008 Google Inc.  All rights reserved.
-// https://developers.google.com/protocol-buffers/
+// http://code.google.com/p/protobuf/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
@@ -32,10 +32,9 @@
 //  Based on original Protocol Buffers design by
 //  Sanjay Ghemawat, Jeff Dean, and others.
 
-#include <limits.h>
-#include <math.h>
-
 #include <vector>
+#include <math.h>
+#include <limits.h>
 
 #include <google/protobuf/io/tokenizer.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
@@ -258,7 +257,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
   EXPECT_EQ("", tokenizer.current().text);
   EXPECT_EQ(0, tokenizer.current().line);
   EXPECT_EQ(0, tokenizer.current().column);
-  EXPECT_EQ(0, tokenizer.current().end_column);
 
   // Parse the token.
   ASSERT_TRUE(tokenizer.Next());
@@ -270,8 +268,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
   // Check that it is located at the beginning of the input
   EXPECT_EQ(0, tokenizer.current().line);
   EXPECT_EQ(0, tokenizer.current().column);
-  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
-            tokenizer.current().end_column);
 
   // There should be no more input.
   EXPECT_FALSE(tokenizer.Next());
@@ -281,8 +277,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
   EXPECT_EQ("", tokenizer.current().text);
   EXPECT_EQ(0, tokenizer.current().line);
   EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
-  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
-            tokenizer.current().end_column);
 
   // There should be no errors.
   EXPECT_TRUE(error_collector.text_.empty());
@@ -345,77 +339,76 @@ MultiTokenCase kMultiTokenCases[] = {
 
   // Test all token types at the same time.
   { "foo 1 1.2 + 'bar'", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
-    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
-    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
-    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
-    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
-    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
+    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
+    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
+    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
+    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
+    { Tokenizer::TYPE_END       , ""     , 0, 17 },
   }},
 
   // Test that consecutive symbols are parsed as separate tokens.
   { "!@+%", {
-    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
-    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
-    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
-    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
-    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
+    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
+    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
+    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
+    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
+    { Tokenizer::TYPE_END       , ""     , 0, 4 },
   }},
 
   // Test that newlines affect line numbers correctly.
   { "foo bar\nrab oof", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
-    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
-    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
-    { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4 },
+    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4 },
+    { Tokenizer::TYPE_END       , ""   , 1,  7 },
   }},
 
   // Test that tabs affect column numbers correctly.
   { "foo\tbar  \tbaz", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
-    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
-    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
-  }},
-
-  // Test that tabs in string literals affect column numbers correctly.
-  { "\"foo\tbar\" baz", {
-    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
-    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
-    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
+    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
+    { Tokenizer::TYPE_END       , ""   , 0, 19 },
   }},
 
   // Test that line comments are ignored.
   { "foo // This is a comment\n"
     "bar // This is another comment", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
-    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
+    { Tokenizer::TYPE_END       , ""   , 1, 30 },
   }},
 
   // Test that block comments are ignored.
   { "foo /* This is a block comment */ bar", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
-    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
+    { Tokenizer::TYPE_END       , ""   , 0, 37 },
   }},
 
   // Test that sh-style comments are not ignored by default.
   { "foo # bar\n"
     "baz", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
-    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
-    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
-    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_SYMBOL    , "#"  , 0,  4 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  6 },
+    { Tokenizer::TYPE_IDENTIFIER, "baz", 1,  0 },
+    { Tokenizer::TYPE_END       , ""   , 1, 3 },
+  }},
+
+  // Bytes with the high-order bit set should not be seen as control characters.
+  { "\300", {
+    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0 },
+    { Tokenizer::TYPE_END   , ""    , 0, 1 },
   }},
 
   // Test all whitespace chars
   { "foo\n\t\r\v\fbar", {
-    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
-    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
-    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
+    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
+    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11 },
+    { Tokenizer::TYPE_END       , ""   , 1, 14 },
   }},
 };
 
@@ -432,7 +425,6 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
   EXPECT_EQ("", tokenizer.current().text);
   EXPECT_EQ(0, tokenizer.current().line);
   EXPECT_EQ(0, tokenizer.current().column);
-  EXPECT_EQ(0, tokenizer.current().end_column);
 
   // Loop through all expected tokens.
   int i = 0;
@@ -442,8 +434,6 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
 
     SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
 
-    Tokenizer::Token previous = tokenizer.current();
-
     // Next() should only return false when it hits the end token.
     if (token.type != Tokenizer::TYPE_END) {
       ASSERT_TRUE(tokenizer.Next());
@@ -451,19 +441,11 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
       ASSERT_FALSE(tokenizer.Next());
     }
 
-    // Check that the previous token is set correctly.
-    EXPECT_EQ(previous.type, tokenizer.previous().type);
-    EXPECT_EQ(previous.text, tokenizer.previous().text);
-    EXPECT_EQ(previous.line, tokenizer.previous().line);
-    EXPECT_EQ(previous.column, tokenizer.previous().column);
-    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
-
     // Check that the token matches the expected one.
     EXPECT_EQ(token.type, tokenizer.current().type);
     EXPECT_EQ(token.text, tokenizer.current().text);
     EXPECT_EQ(token.line, tokenizer.current().line);
     EXPECT_EQ(token.column, tokenizer.current().column);
-    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
 
   } while (token.type != Tokenizer::TYPE_END);
 
@@ -509,217 +491,6 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
 
 // -------------------------------------------------------------------
 
-// In each case, the input is expected to have two tokens named "prev" and
-// "next" with comments in between.
-struct DocCommentCase {
-  string input;
-
-  const char* prev_trailing_comments;
-  const char* detached_comments[10];
-  const char* next_leading_comments;
-};
-
-inline ostream& operator<<(ostream& out,
-                           const DocCommentCase& test_case) {
-  return out << CEscape(test_case.input);
-}
-
-DocCommentCase kDocCommentCases[] = {
-  {
-    "prev next",
-
-    "",
-    {},
-    ""
-      },
-
-        {
-      "prev /* ignored */ next",
-
-      "",
-      {},
-      ""
-        },
-
-          {
-        "prev // trailing comment\n"
-            "next",
-
-            " trailing comment\n",
-            {},
-            ""
-          },
-
-            {
-          "prev\n"
-              "// leading comment\n"
-              "// line 2\n"
-              "next",
-
-              "",
-              {},
-              " leading comment\n"
-              " line 2\n"
-            },
-
-              {
-            "prev\n"
-                "// trailing comment\n"
-                "// line 2\n"
-                "\n"
-                "next",
-
-                " trailing comment\n"
-                " line 2\n",
-                {},
-                ""
-              },
-
-                {
-              "prev // trailing comment\n"
-                  "// leading comment\n"
-                  "// line 2\n"
-                  "next",
-
-                  " trailing comment\n",
-                  {},
-                  " leading comment\n"
-                  " line 2\n"
-                },
-
-                  {
-                "prev /* trailing block comment */\n"
-                    "/* leading block comment\n"
-                    " * line 2\n"
-                    " * line 3 */"
-                    "next",
-
-                    " trailing block comment ",
-                    {},
-                    " leading block comment\n"
-                    " line 2\n"
-                    " line 3 "
-                  },
-
-                    {
-                  "prev\n"
-                      "/* trailing block comment\n"
-                      " * line 2\n"
-                      " * line 3\n"
-                      " */\n"
-                      "/* leading block comment\n"
-                      " * line 2\n"
-                      " * line 3 */"
-                      "next",
-
-                      " trailing block comment\n"
-                      " line 2\n"
-                      " line 3\n",
-                      {},
-                      " leading block comment\n"
-                      " line 2\n"
-                      " line 3 "
-                    },
-
-                      {
-                    "prev\n"
-                        "// trailing comment\n"
-                        "\n"
-                        "// detached comment\n"
-                        "// line 2\n"
-                        "\n"
-                        "// second detached comment\n"
-                        "/* third detached comment\n"
-                        " * line 2 */\n"
-                        "// leading comment\n"
-                        "next",
-
-                        " trailing comment\n",
-                        {
-                      " detached comment\n"
-                          " line 2\n",
-                          " second detached comment\n",
-                          " third detached comment\n"
-                          " line 2 "
-                        },
-                          " leading comment\n"
-                        },
-
-                          {
-                        "prev /**/\n"
-                            "\n"
-                            "// detached comment\n"
-                            "\n"
-                            "// leading comment\n"
-                            "next",
-
-                            "",
-                            {
-                          " detached comment\n"
-                            },
-                              " leading comment\n"
-                            },
-
-                              {
-                            "prev /**/\n"
-                                "// leading comment\n"
-                                "next",
-
-                                "",
-                                {},
-                                " leading comment\n"
-                              },
-                              };
-
-TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
-  // Set up the tokenizer.
-  TestInputStream input(kDocCommentCases_case.input.data(),
-                        kDocCommentCases_case.input.size(),
-                        kBlockSizes_case);
-  TestErrorCollector error_collector;
-  Tokenizer tokenizer(&input, &error_collector);
-
-  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
-  TestInputStream input2(kDocCommentCases_case.input.data(),
-                        kDocCommentCases_case.input.size(),
-                        kBlockSizes_case);
-  Tokenizer tokenizer2(&input2, &error_collector);
-
-  tokenizer.Next();
-  tokenizer2.Next();
-
-  EXPECT_EQ("prev", tokenizer.current().text);
-  EXPECT_EQ("prev", tokenizer2.current().text);
-
-  string prev_trailing_comments;
-  vector<string> detached_comments;
-  string next_leading_comments;
-  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
-                             &next_leading_comments);
-  tokenizer2.NextWithComments(NULL, NULL, NULL);
-  EXPECT_EQ("next", tokenizer.current().text);
-  EXPECT_EQ("next", tokenizer2.current().text);
-
-  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
-            prev_trailing_comments);
-
-  for (int i = 0; i < detached_comments.size(); i++) {
-    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
-    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
-    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
-              detached_comments[i]);
-  }
-
-  // Verify that we matched all the detached comments.
-  EXPECT_EQ(NULL,
-      kDocCommentCases_case.detached_comments[detached_comments.size()]);
-
-  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
-            next_leading_comments);
-}
-
-// -------------------------------------------------------------------
-
 // Test parse helpers.  It's not really worth setting up a full data-driven
 // test here.
 TEST_F(TokenizerTest, ParseInteger) {
@@ -735,7 +506,7 @@ TEST_F(TokenizerTest, ParseInteger) {
   EXPECT_EQ(0, ParseInteger("0x"));
 
   uint64 i;
-#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
+#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   // Test invalid integers that will never be tokenized as integers.
   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
     "passed text that could not have been tokenized as an integer");
@@ -747,7 +518,7 @@ TEST_F(TokenizerTest, ParseInteger) {
     "passed text that could not have been tokenized as an integer");
   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
     "passed text that could not have been tokenized as an integer");
-#endif  // PROTOBUF_HAS_DEATH_TEST
+#endif  // GTEST_HAS_DEATH_TEST
 
   // Test overflows.
   EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
@@ -790,7 +561,7 @@ TEST_F(TokenizerTest, ParseFloat) {
   EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
   EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
 
-#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
+#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   // Test invalid integers that will never be tokenized as integers.
   EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
     "passed text that could not have been tokenized as a float");
@@ -798,7 +569,7 @@ TEST_F(TokenizerTest, ParseFloat) {
     "passed text that could not have been tokenized as a float");
   EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
     "passed text that could not have been tokenized as a float");
-#endif  // PROTOBUF_HAS_DEATH_TEST
+#endif  // GTEST_HAS_DEATH_TEST
 }
 
 TEST_F(TokenizerTest, ParseString) {
@@ -820,27 +591,11 @@ TEST_F(TokenizerTest, ParseString) {
   Tokenizer::ParseString("'\\", &output);
   EXPECT_EQ("\\", output);
 
-  // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode
-  // characters.
-  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
-  EXPECT_EQ("$¢€𤭢XX", output);
-  // Same thing encoded using UTF16.
-  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
-  EXPECT_EQ("$¢€𤭢XX", output);
-  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
-  // We just output this as if it were UTF8; it's not a defined code point, but
-  // it has a defined encoding.
-  Tokenizer::ParseString("'\\ud852XX'", &output);
-  EXPECT_EQ("\xed\xa1\x92XX", output);
-  // Malformed escape: Demons may fly out of the nose.
-  Tokenizer::ParseString("\\u0", &output);
-  EXPECT_EQ("u0", output);
-
   // Test invalid strings that will never be tokenized as strings.
-#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
+#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
     "passed text that could not have been tokenized as a string");
-#endif  // PROTOBUF_HAS_DEATH_TEST
+#endif  // GTEST_HAS_DEATH_TEST
 }
 
 TEST_F(TokenizerTest, ParseStringAppend) {
@@ -877,15 +632,9 @@ ErrorCase kErrorCases[] = {
   { "'\\x' foo", true,
     "0:3: Expected hex digits for escape sequence.\n" },
   { "'foo", false,
-    "0:4: Unexpected end of string.\n" },
+    "0:4: String literals cannot cross line boundaries.\n" },
   { "'bar\nfoo", true,
     "0:4: String literals cannot cross line boundaries.\n" },
-  { "'\\u01' foo", true,
-    "0:5: Expected four hex digits for \\u escape sequence.\n" },
-  { "'\\u01' foo", true,
-    "0:5: Expected four hex digits for \\u escape sequence.\n" },
-  { "'\\uXYZ' foo", true,
-    "0:3: Expected four hex digits for \\u escape sequence.\n" },
 
   // Integer errors.
   { "123foo", true,
@@ -945,10 +694,6 @@ ErrorCase kErrorCases[] = {
     "0:0: Invalid control characters encountered in text.\n" },
   { string("\0\0foo", 5), true,
     "0:0: Invalid control characters encountered in text.\n" },
-
-  // Check error from high order bits set
-  { "\300foo", true,
-    "0:0: Interpreting non ascii codepoint 192.\n" },
 };
 
 TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
@@ -966,7 +711,7 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
   }
 
   // Check that the errors match what was expected.
-  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
+  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);
 
   // If the error was recoverable, make sure we saw "foo" after it.
   if (kErrorCases_case.recoverable) {
@@ -992,7 +737,6 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
   EXPECT_EQ(strlen("foo"), input.ByteCount());
 }
 
-
 }  // namespace
 }  // namespace io
 }  // namespace protobuf