Error on invalid UTF-8 sequences

Invalid UTF-8 was being best-effort consumed, which given the right sequence of brokenness, could end up with diagnostic locations referring to bytes beyond the end of a line.

Improve the UTF-8 decoding so that it can detect when multi-byte codepoints are missing the high-bit being set.
Actually detect this in a lexer, and parser and produce errors.

Bug: tint:1437
Bug: chromium:1305648
Change-Id: I459f0df840b4ce8c4f5f82363f93602bf8326984
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/83540
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/src/tint/reader/wgsl/lexer.cc b/src/tint/reader/wgsl/lexer.cc
index 6961b05..a68afa2 100644
--- a/src/tint/reader/wgsl/lexer.cc
+++ b/src/tint/reader/wgsl/lexer.cc
@@ -739,6 +739,10 @@
     auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
     auto [code_point, n] =
         text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (n == 0) {
+      pos_++;  // Skip the bad byte.
+      return {Token::Type::kError, source, "invalid UTF-8"};
+    }
     if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
       return {};
     }
@@ -752,6 +756,10 @@
     auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
     auto [code_point, n] =
         text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (n == 0) {
+      pos_++;  // Skip the bad byte.
+      return {Token::Type::kError, source, "invalid UTF-8"};
+    }
     if (!code_point.IsXIDContinue()) {
       break;
     }
diff --git a/src/tint/reader/wgsl/lexer_test.cc b/src/tint/reader/wgsl/lexer_test.cc
index 3df5b4e..b05c0a6 100644
--- a/src/tint/reader/wgsl/lexer_test.cc
+++ b/src/tint/reader/wgsl/lexer_test.cc
@@ -344,11 +344,11 @@
 
 struct UnicodeCase {
   const char* utf8;
-  size_t code_units;
+  size_t count;
 };
 
-using UnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
-TEST_P(UnicodeIdentifierTest, Parse) {
+using ValidUnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
+TEST_P(ValidUnicodeIdentifierTest, Parse) {
   Source::File file("", GetParam().utf8);
   Lexer l(&file);
 
@@ -357,12 +357,12 @@
   EXPECT_EQ(t.source().range.begin.line, 1u);
   EXPECT_EQ(t.source().range.begin.column, 1u);
   EXPECT_EQ(t.source().range.end.line, 1u);
-  EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units);
+  EXPECT_EQ(t.source().range.end.column, 1u + GetParam().count);
   EXPECT_EQ(t.to_str(), GetParam().utf8);
 }
 INSTANTIATE_TEST_SUITE_P(
     LexerTest,
-    UnicodeIdentifierTest,
+    ValidUnicodeIdentifierTest,
     testing::ValuesIn({
         UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
                     "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
@@ -393,6 +393,56 @@
             43},
     }));
 
+using InvalidUnicodeIdentifierTest = testing::TestWithParam<const char*>;
+TEST_P(InvalidUnicodeIdentifierTest, Parse) {
+  Source::File file("", GetParam());
+  Lexer l(&file);
+
+  auto t = l.next();
+  EXPECT_TRUE(t.IsError());
+  EXPECT_EQ(t.source().range.begin.line, 1u);
+  EXPECT_EQ(t.source().range.begin.column, 1u);
+  EXPECT_EQ(t.source().range.end.line, 1u);
+  EXPECT_EQ(t.source().range.end.column, 1u);
+  EXPECT_EQ(t.to_str(), "invalid UTF-8");
+}
+INSTANTIATE_TEST_SUITE_P(
+    LexerTest,
+    InvalidUnicodeIdentifierTest,
+    testing::ValuesIn({
+        "\x80\x80\x80\x80",  // 10000000
+        "\x81\x80\x80\x80",  // 10000001
+        "\x8f\x80\x80\x80",  // 10001111
+        "\x90\x80\x80\x80",  // 10010000
+        "\x91\x80\x80\x80",  // 10010001
+        "\x9f\x80\x80\x80",  // 10011111
+        "\xa0\x80\x80\x80",  // 10100000
+        "\xa1\x80\x80\x80",  // 10100001
+        "\xaf\x80\x80\x80",  // 10101111
+        "\xb0\x80\x80\x80",  // 10110000
+        "\xb1\x80\x80\x80",  // 10110001
+        "\xbf\x80\x80\x80",  // 10111111
+        "\xc0\x80\x80\x80",  // 11000000
+        "\xc1\x80\x80\x80",  // 11000001
+        "\xf5\x80\x80\x80",  // 11110101
+        "\xf6\x80\x80\x80",  // 11110110
+        "\xf7\x80\x80\x80",  // 11110111
+        "\xf8\x80\x80\x80",  // 11111000
+        "\xfe\x80\x80\x80",  // 11111110
+        "\xff\x80\x80\x80",  // 11111111
+
+        "\xd0",          // 2-bytes, missing second byte
+        "\xe8\x8f",      // 3-bytes, missing third byte
+        "\xf4\x8f\x8f",  // 4-bytes, missing fourth byte
+
+        "\xd0\x7f",          // 2-bytes, second byte MSB unset
+        "\xe8\x7f\x8f",      // 3-bytes, second byte MSB unset
+        "\xe8\x8f\x7f",      // 3-bytes, third byte MSB unset
+        "\xf4\x7f\x8f\x8f",  // 4-bytes, second byte MSB unset
+        "\xf4\x8f\x7f\x8f",  // 4-bytes, third byte MSB unset
+        "\xf4\x8f\x8f\x7f",  // 4-bytes, fourth byte MSB unset
+    }));
+
 TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
   Source::File file("", "_");
   Lexer l(&file);
diff --git a/src/tint/reader/wgsl/parser_impl.cc b/src/tint/reader/wgsl/parser_impl.cc
index fa43f1f..54c53dd 100644
--- a/src/tint/reader/wgsl/parser_impl.cc
+++ b/src/tint/reader/wgsl/parser_impl.cc
@@ -303,8 +303,9 @@
 }
 
 Token ParserImpl::peek(size_t idx) {
-  while (token_queue_.size() < (idx + 1))
+  while (token_queue_.size() < (idx + 1)) {
     token_queue_.push_back(lexer_->next());
+  }
   return token_queue_[idx];
 }
 
@@ -448,9 +449,8 @@
   }
 
   // The token might itself be an error.
-  if (t.IsError()) {
-    next();  // Consume it.
-    return add_error(t.source(), t.to_str());
+  if (handle_error(t)) {
+    return Failure::kErrored;
   }
 
   // Exhausted all attempts to make sense of where we're at.
@@ -2746,9 +2746,6 @@
 //   | FALSE
 Maybe<const ast::LiteralExpression*> ParserImpl::const_literal() {
   auto t = peek();
-  if (t.IsError()) {
-    return add_error(t.source(), t.to_str());
-  }
   if (match(Token::Type::kTrue)) {
     return create<ast::BoolLiteralExpression>(t.source(), true);
   }
@@ -2764,6 +2761,9 @@
   if (match(Token::Type::kFloatLiteral)) {
     return create<ast::FloatLiteralExpression>(t.source(), t.to_f32());
   }
+  if (handle_error(t)) {
+    return Failure::kErrored;
+  }
   return Failure::kNoMatch;
 }
 
@@ -3165,13 +3165,18 @@
     return true;
   }
 
+  // Error cases
+  synchronized_ = false;
+  if (handle_error(t)) {
+    return false;
+  }
+
   std::stringstream err;
   err << "expected '" << Token::TypeToName(tok) << "'";
   if (!use.empty()) {
     err << " for " << use;
   }
   add_error(t, err.str());
-  synchronized_ = false;
   return false;
 }
 
@@ -3220,6 +3225,9 @@
 
     return {t.to_str(), t.source()};
   }
+  if (handle_error(t)) {
+    return Failure::kErrored;
+  }
   synchronized_ = false;
   return add_error(t.source(), "expected identifier", use);
 }
@@ -3342,6 +3350,16 @@
   return false;
 }
 
+bool ParserImpl::handle_error(const Token& t) {
+  // The token might itself be an error.
+  if (t.IsError()) {
+    synchronized_ = false;
+    add_error(t.source(), t.to_str());
+    return true;
+  }
+  return false;
+}
+
 template <typename F, typename T>
 T ParserImpl::without_error(F&& body) {
   silence_errors_++;
diff --git a/src/tint/reader/wgsl/parser_impl.h b/src/tint/reader/wgsl/parser_impl.h
index 74eb18f..3ae3739 100644
--- a/src/tint/reader/wgsl/parser_impl.h
+++ b/src/tint/reader/wgsl/parser_impl.h
@@ -828,6 +828,12 @@
   /// @see sync().
   bool is_sync_token(const Token& t) const;
 
+  /// If `t` is an error token, then `synchronized_` is set to false and the
+  /// token's error is appended to the builder's diagnostics. If `t` is not an
+  /// error token, then this function does nothing and false is returned.
+  /// @returns true if `t` is an error, otherwise false.
+  bool handle_error(const Token& t);
+
   /// @returns true if #synchronized_ is true and the number of reported errors
   /// is less than #max_errors_.
   bool continue_parsing() {
diff --git a/src/tint/reader/wgsl/parser_impl_error_msg_test.cc b/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
index 08c48a9..3cde554 100644
--- a/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
+++ b/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
@@ -1689,6 +1689,12 @@
 )");
 }
 
+TEST_F(ParserImplErrorTest, InvalidUTF8) {
+  EXPECT("fn fu\xd0nc() {}",
+         "test.wgsl:1:4 error: invalid UTF-8\n"
+         "fn fu\xD0nc() {}\n");
+}
+
 }  // namespace
 }  // namespace wgsl
 }  // namespace reader
diff --git a/src/tint/text/unicode.cc b/src/tint/text/unicode.cc
index 56e8292..b48f295 100644
--- a/src/tint/text/unicode.cc
+++ b/src/tint/text/unicode.cc
@@ -468,27 +468,35 @@
 
   CodePoint c;
 
+  uint8_t valid = 0x80;
   switch (n) {
     // Note: n=0 (invalid) is correctly handled without a case.
     case 1:
       c = CodePoint{ptr[0]};
       break;
     case 2:
+      valid &= ptr[1];
       c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00011111) << 6) |
                     (static_cast<uint32_t>(ptr[1] & 0b00111111))};
       break;
     case 3:
+      valid &= ptr[1] & ptr[2];
       c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00001111) << 12) |
                     (static_cast<uint32_t>(ptr[1] & 0b00111111) << 6) |
                     (static_cast<uint32_t>(ptr[2] & 0b00111111))};
       break;
     case 4:
+      valid &= ptr[1] & ptr[2] & ptr[3];
       c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00000111) << 18) |
                     (static_cast<uint32_t>(ptr[1] & 0b00111111) << 12) |
                     (static_cast<uint32_t>(ptr[2] & 0b00111111) << 6) |
                     (static_cast<uint32_t>(ptr[3] & 0b00111111))};
       break;
   }
+  if (!valid) {
+    n = 0;
+    c = 0;
+  }
   return {c, n};
 }
 
diff --git a/src/tint/text/unicode_test.cc b/src/tint/text/unicode_test.cc
index deffae9..38221a4 100644
--- a/src/tint/text/unicode_test.cc
+++ b/src/tint/text/unicode_test.cc
@@ -455,30 +455,42 @@
   EXPECT_EQ(width, 0u);
 }
 
-INSTANTIATE_TEST_SUITE_P(Invalid,
-                         DecodeUTF8InvalidTest,
-                         ::testing::ValuesIn({
-                             "\x80\x80\x80\x80",  // 10000000
-                             "\x81\x80\x80\x80",  // 10000001
-                             "\x8f\x80\x80\x80",  // 10001111
-                             "\x90\x80\x80\x80",  // 10010000
-                             "\x91\x80\x80\x80",  // 10010001
-                             "\x9f\x80\x80\x80",  // 10011111
-                             "\xa0\x80\x80\x80",  // 10100000
-                             "\xa1\x80\x80\x80",  // 10100001
-                             "\xaf\x80\x80\x80",  // 10101111
-                             "\xb0\x80\x80\x80",  // 10110000
-                             "\xb1\x80\x80\x80",  // 10110001
-                             "\xbf\x80\x80\x80",  // 10111111
-                             "\xc0\x80\x80\x80",  // 11000000
-                             "\xc1\x80\x80\x80",  // 11000001
-                             "\xf5\x80\x80\x80",  // 11110101
-                             "\xf6\x80\x80\x80",  // 11110110
-                             "\xf7\x80\x80\x80",  // 11110111
-                             "\xf8\x80\x80\x80",  // 11111000
-                             "\xfe\x80\x80\x80",  // 11111110
-                             "\xff\x80\x80\x80",  // 11111111
-                         }));
+INSTANTIATE_TEST_SUITE_P(
+    Invalid,
+    DecodeUTF8InvalidTest,
+    ::testing::ValuesIn({
+        "\x80\x80\x80\x80",  // 10000000
+        "\x81\x80\x80\x80",  // 10000001
+        "\x8f\x80\x80\x80",  // 10001111
+        "\x90\x80\x80\x80",  // 10010000
+        "\x91\x80\x80\x80",  // 10010001
+        "\x9f\x80\x80\x80",  // 10011111
+        "\xa0\x80\x80\x80",  // 10100000
+        "\xa1\x80\x80\x80",  // 10100001
+        "\xaf\x80\x80\x80",  // 10101111
+        "\xb0\x80\x80\x80",  // 10110000
+        "\xb1\x80\x80\x80",  // 10110001
+        "\xbf\x80\x80\x80",  // 10111111
+        "\xc0\x80\x80\x80",  // 11000000
+        "\xc1\x80\x80\x80",  // 11000001
+        "\xf5\x80\x80\x80",  // 11110101
+        "\xf6\x80\x80\x80",  // 11110110
+        "\xf7\x80\x80\x80",  // 11110111
+        "\xf8\x80\x80\x80",  // 11111000
+        "\xfe\x80\x80\x80",  // 11111110
+        "\xff\x80\x80\x80",  // 11111111
+
+        "\xd0",          // 2-bytes, missing second byte
+        "\xe8\x8f",      // 3-bytes, missing third byte
+        "\xf4\x8f\x8f",  // 4-bytes, missing fourth byte
+
+        "\xd0\x7f",          // 2-bytes, second byte MSB unset
+        "\xe8\x7f\x8f",      // 3-bytes, second byte MSB unset
+        "\xe8\x8f\x7f",      // 3-bytes, third byte MSB unset
+        "\xf4\x7f\x8f\x8f",  // 4-bytes, second byte MSB unset
+        "\xf4\x8f\x7f\x8f",  // 4-bytes, third byte MSB unset
+        "\xf4\x8f\x8f\x7f",  // 4-bytes, fourth byte MSB unset
+    }));
 
 }  // namespace