Error on invalid UTF-8 sequences
Invalid UTF-8 was being best-effort consumed, which given the right sequence of brokenness, could end up with diagnostic locations referring to bytes beyond the end of a line.
Improve the UTF-8 decoding so that it can detect when multi-byte codepoints are missing the high-bit being set.
Actually detect this in a lexer, and parser and produce errors.
Bug: tint:1437
Bug: chromium:1305648
Change-Id: I459f0df840b4ce8c4f5f82363f93602bf8326984
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/83540
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/src/tint/reader/wgsl/lexer.cc b/src/tint/reader/wgsl/lexer.cc
index 6961b05..a68afa2 100644
--- a/src/tint/reader/wgsl/lexer.cc
+++ b/src/tint/reader/wgsl/lexer.cc
@@ -739,6 +739,10 @@
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
auto [code_point, n] =
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+ if (n == 0) {
+ pos_++; // Skip the bad byte.
+ return {Token::Type::kError, source, "invalid UTF-8"};
+ }
if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
return {};
}
@@ -752,6 +756,10 @@
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
auto [code_point, n] =
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+ if (n == 0) {
+ pos_++; // Skip the bad byte.
+ return {Token::Type::kError, source, "invalid UTF-8"};
+ }
if (!code_point.IsXIDContinue()) {
break;
}
diff --git a/src/tint/reader/wgsl/lexer_test.cc b/src/tint/reader/wgsl/lexer_test.cc
index 3df5b4e..b05c0a6 100644
--- a/src/tint/reader/wgsl/lexer_test.cc
+++ b/src/tint/reader/wgsl/lexer_test.cc
@@ -344,11 +344,11 @@
struct UnicodeCase {
const char* utf8;
- size_t code_units;
+ size_t count;
};
-using UnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
-TEST_P(UnicodeIdentifierTest, Parse) {
+using ValidUnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
+TEST_P(ValidUnicodeIdentifierTest, Parse) {
Source::File file("", GetParam().utf8);
Lexer l(&file);
@@ -357,12 +357,12 @@
EXPECT_EQ(t.source().range.begin.line, 1u);
EXPECT_EQ(t.source().range.begin.column, 1u);
EXPECT_EQ(t.source().range.end.line, 1u);
- EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units);
+ EXPECT_EQ(t.source().range.end.column, 1u + GetParam().count);
EXPECT_EQ(t.to_str(), GetParam().utf8);
}
INSTANTIATE_TEST_SUITE_P(
LexerTest,
- UnicodeIdentifierTest,
+ ValidUnicodeIdentifierTest,
testing::ValuesIn({
UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
"\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
@@ -393,6 +393,56 @@
43},
}));
+using InvalidUnicodeIdentifierTest = testing::TestWithParam<const char*>;
+TEST_P(InvalidUnicodeIdentifierTest, Parse) {
+ Source::File file("", GetParam());
+ Lexer l(&file);
+
+ auto t = l.next();
+ EXPECT_TRUE(t.IsError());
+ EXPECT_EQ(t.source().range.begin.line, 1u);
+ EXPECT_EQ(t.source().range.begin.column, 1u);
+ EXPECT_EQ(t.source().range.end.line, 1u);
+ EXPECT_EQ(t.source().range.end.column, 1u);
+ EXPECT_EQ(t.to_str(), "invalid UTF-8");
+}
+INSTANTIATE_TEST_SUITE_P(
+ LexerTest,
+ InvalidUnicodeIdentifierTest,
+ testing::ValuesIn({
+ "\x80\x80\x80\x80", // 10000000
+ "\x81\x80\x80\x80", // 10000001
+ "\x8f\x80\x80\x80", // 10001111
+ "\x90\x80\x80\x80", // 10010000
+ "\x91\x80\x80\x80", // 10010001
+ "\x9f\x80\x80\x80", // 10011111
+ "\xa0\x80\x80\x80", // 10100000
+ "\xa1\x80\x80\x80", // 10100001
+ "\xaf\x80\x80\x80", // 10101111
+ "\xb0\x80\x80\x80", // 10110000
+ "\xb1\x80\x80\x80", // 10110001
+ "\xbf\x80\x80\x80", // 10111111
+ "\xc0\x80\x80\x80", // 11000000
+ "\xc1\x80\x80\x80", // 11000001
+ "\xf5\x80\x80\x80", // 11110101
+ "\xf6\x80\x80\x80", // 11110110
+ "\xf7\x80\x80\x80", // 11110111
+ "\xf8\x80\x80\x80", // 11111000
+ "\xfe\x80\x80\x80", // 11111110
+ "\xff\x80\x80\x80", // 11111111
+
+ "\xd0", // 2-bytes, missing second byte
+ "\xe8\x8f", // 3-bytes, missing third byte
+ "\xf4\x8f\x8f", // 4-bytes, missing fourth byte
+
+ "\xd0\x7f", // 2-bytes, second byte MSB unset
+ "\xe8\x7f\x8f", // 3-bytes, second byte MSB unset
+ "\xe8\x8f\x7f", // 3-bytes, third byte MSB unset
+ "\xf4\x7f\x8f\x8f", // 4-bytes, second byte MSB unset
+ "\xf4\x8f\x7f\x8f", // 4-bytes, third byte MSB unset
+ "\xf4\x8f\x8f\x7f", // 4-bytes, fourth byte MSB unset
+ }));
+
TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
Source::File file("", "_");
Lexer l(&file);
diff --git a/src/tint/reader/wgsl/parser_impl.cc b/src/tint/reader/wgsl/parser_impl.cc
index fa43f1f..54c53dd 100644
--- a/src/tint/reader/wgsl/parser_impl.cc
+++ b/src/tint/reader/wgsl/parser_impl.cc
@@ -303,8 +303,9 @@
}
Token ParserImpl::peek(size_t idx) {
- while (token_queue_.size() < (idx + 1))
+ while (token_queue_.size() < (idx + 1)) {
token_queue_.push_back(lexer_->next());
+ }
return token_queue_[idx];
}
@@ -448,9 +449,8 @@
}
// The token might itself be an error.
- if (t.IsError()) {
- next(); // Consume it.
- return add_error(t.source(), t.to_str());
+ if (handle_error(t)) {
+ return Failure::kErrored;
}
// Exhausted all attempts to make sense of where we're at.
@@ -2746,9 +2746,6 @@
// | FALSE
Maybe<const ast::LiteralExpression*> ParserImpl::const_literal() {
auto t = peek();
- if (t.IsError()) {
- return add_error(t.source(), t.to_str());
- }
if (match(Token::Type::kTrue)) {
return create<ast::BoolLiteralExpression>(t.source(), true);
}
@@ -2764,6 +2761,9 @@
if (match(Token::Type::kFloatLiteral)) {
return create<ast::FloatLiteralExpression>(t.source(), t.to_f32());
}
+ if (handle_error(t)) {
+ return Failure::kErrored;
+ }
return Failure::kNoMatch;
}
@@ -3165,13 +3165,18 @@
return true;
}
+ // Error cases
+ synchronized_ = false;
+ if (handle_error(t)) {
+ return false;
+ }
+
std::stringstream err;
err << "expected '" << Token::TypeToName(tok) << "'";
if (!use.empty()) {
err << " for " << use;
}
add_error(t, err.str());
- synchronized_ = false;
return false;
}
@@ -3220,6 +3225,9 @@
return {t.to_str(), t.source()};
}
+ if (handle_error(t)) {
+ return Failure::kErrored;
+ }
synchronized_ = false;
return add_error(t.source(), "expected identifier", use);
}
@@ -3342,6 +3350,16 @@
return false;
}
+bool ParserImpl::handle_error(const Token& t) {
+ // The token might itself be an error.
+ if (t.IsError()) {
+ synchronized_ = false;
+ add_error(t.source(), t.to_str());
+ return true;
+ }
+ return false;
+}
+
template <typename F, typename T>
T ParserImpl::without_error(F&& body) {
silence_errors_++;
diff --git a/src/tint/reader/wgsl/parser_impl.h b/src/tint/reader/wgsl/parser_impl.h
index 74eb18f..3ae3739 100644
--- a/src/tint/reader/wgsl/parser_impl.h
+++ b/src/tint/reader/wgsl/parser_impl.h
@@ -828,6 +828,12 @@
/// @see sync().
bool is_sync_token(const Token& t) const;
+ /// If `t` is an error token, then `synchronized_` is set to false and the
+ /// token's error is appended to the builder's diagnostics. If `t` is not an
+ /// error token, then this function does nothing and false is returned.
+ /// @returns true if `t` is an error, otherwise false.
+ bool handle_error(const Token& t);
+
/// @returns true if #synchronized_ is true and the number of reported errors
/// is less than #max_errors_.
bool continue_parsing() {
diff --git a/src/tint/reader/wgsl/parser_impl_error_msg_test.cc b/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
index 08c48a9..3cde554 100644
--- a/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
+++ b/src/tint/reader/wgsl/parser_impl_error_msg_test.cc
@@ -1689,6 +1689,12 @@
)");
}
+TEST_F(ParserImplErrorTest, InvalidUTF8) {
+ EXPECT("fn fu\xd0nc() {}",
+ "test.wgsl:1:4 error: invalid UTF-8\n"
+ "fn fu\xD0nc() {}\n");
+}
+
} // namespace
} // namespace wgsl
} // namespace reader
diff --git a/src/tint/text/unicode.cc b/src/tint/text/unicode.cc
index 56e8292..b48f295 100644
--- a/src/tint/text/unicode.cc
+++ b/src/tint/text/unicode.cc
@@ -468,27 +468,35 @@
CodePoint c;
+ uint8_t valid = 0x80;
switch (n) {
// Note: n=0 (invalid) is correctly handled without a case.
case 1:
c = CodePoint{ptr[0]};
break;
case 2:
+ valid &= ptr[1];
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00011111) << 6) |
(static_cast<uint32_t>(ptr[1] & 0b00111111))};
break;
case 3:
+ valid &= ptr[1] & ptr[2];
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00001111) << 12) |
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 6) |
(static_cast<uint32_t>(ptr[2] & 0b00111111))};
break;
case 4:
+ valid &= ptr[1] & ptr[2] & ptr[3];
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00000111) << 18) |
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 12) |
(static_cast<uint32_t>(ptr[2] & 0b00111111) << 6) |
(static_cast<uint32_t>(ptr[3] & 0b00111111))};
break;
}
+ if (!valid) {
+ n = 0;
+ c = 0;
+ }
return {c, n};
}
diff --git a/src/tint/text/unicode_test.cc b/src/tint/text/unicode_test.cc
index deffae9..38221a4 100644
--- a/src/tint/text/unicode_test.cc
+++ b/src/tint/text/unicode_test.cc
@@ -455,30 +455,42 @@
EXPECT_EQ(width, 0u);
}
-INSTANTIATE_TEST_SUITE_P(Invalid,
- DecodeUTF8InvalidTest,
- ::testing::ValuesIn({
- "\x80\x80\x80\x80", // 10000000
- "\x81\x80\x80\x80", // 10000001
- "\x8f\x80\x80\x80", // 10001111
- "\x90\x80\x80\x80", // 10010000
- "\x91\x80\x80\x80", // 10010001
- "\x9f\x80\x80\x80", // 10011111
- "\xa0\x80\x80\x80", // 10100000
- "\xa1\x80\x80\x80", // 10100001
- "\xaf\x80\x80\x80", // 10101111
- "\xb0\x80\x80\x80", // 10110000
- "\xb1\x80\x80\x80", // 10110001
- "\xbf\x80\x80\x80", // 10111111
- "\xc0\x80\x80\x80", // 11000000
- "\xc1\x80\x80\x80", // 11000001
- "\xf5\x80\x80\x80", // 11110101
- "\xf6\x80\x80\x80", // 11110110
- "\xf7\x80\x80\x80", // 11110111
- "\xf8\x80\x80\x80", // 11111000
- "\xfe\x80\x80\x80", // 11111110
- "\xff\x80\x80\x80", // 11111111
- }));
+INSTANTIATE_TEST_SUITE_P(
+ Invalid,
+ DecodeUTF8InvalidTest,
+ ::testing::ValuesIn({
+ "\x80\x80\x80\x80", // 10000000
+ "\x81\x80\x80\x80", // 10000001
+ "\x8f\x80\x80\x80", // 10001111
+ "\x90\x80\x80\x80", // 10010000
+ "\x91\x80\x80\x80", // 10010001
+ "\x9f\x80\x80\x80", // 10011111
+ "\xa0\x80\x80\x80", // 10100000
+ "\xa1\x80\x80\x80", // 10100001
+ "\xaf\x80\x80\x80", // 10101111
+ "\xb0\x80\x80\x80", // 10110000
+ "\xb1\x80\x80\x80", // 10110001
+ "\xbf\x80\x80\x80", // 10111111
+ "\xc0\x80\x80\x80", // 11000000
+ "\xc1\x80\x80\x80", // 11000001
+ "\xf5\x80\x80\x80", // 11110101
+ "\xf6\x80\x80\x80", // 11110110
+ "\xf7\x80\x80\x80", // 11110111
+ "\xf8\x80\x80\x80", // 11111000
+ "\xfe\x80\x80\x80", // 11111110
+ "\xff\x80\x80\x80", // 11111111
+
+ "\xd0", // 2-bytes, missing second byte
+ "\xe8\x8f", // 3-bytes, missing third byte
+ "\xf4\x8f\x8f", // 4-bytes, missing fourth byte
+
+ "\xd0\x7f", // 2-bytes, second byte MSB unset
+ "\xe8\x7f\x8f", // 3-bytes, second byte MSB unset
+ "\xe8\x8f\x7f", // 3-bytes, third byte MSB unset
+ "\xf4\x7f\x8f\x8f", // 4-bytes, second byte MSB unset
+ "\xf4\x8f\x7f\x8f", // 4-bytes, third byte MSB unset
+ "\xf4\x8f\x8f\x7f", // 4-bytes, fourth byte MSB unset
+ }));
} // namespace