reader/wgsl: Support unicode identifiers

Bug: tint:1437
Change-Id: Ie00ccb3e93d207111e55117dfc989f79b76164bf
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/80844
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: David Neto <dneto@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/docs/origin-trial-changes.md b/docs/origin-trial-changes.md
index 9e19b62..d93bce4 100644
--- a/docs/origin-trial-changes.md
+++ b/docs/origin-trial-changes.md
@@ -12,6 +12,7 @@
 
 * Module-scope declarations can now be declared in any order. [tint:1266](crbug.com/tint/1266)
 * The `override` keyword and `@id()` attribute for pipeline-overridable constants are now supported, replacing the `@override` attribute. [tint:1403](crbug.com/tint/1403)
+* Tint now supports unicode identifiers. [tint:1437](crbug.com/tint/1437)
 
 ## Changes for M99
 
diff --git a/samples/main.cc b/samples/main.cc
index d1d0ec2..8a8484a 100644
--- a/samples/main.cc
+++ b/samples/main.cc
@@ -1119,7 +1119,8 @@
     case Format::kMsl: {
 #if TINT_BUILD_MSL_WRITER
       transform_inputs.Add<tint::transform::Renamer::Config>(
-          tint::transform::Renamer::Target::kMslKeywords);
+          tint::transform::Renamer::Target::kMslKeywords,
+          /* preserve_unicode */ false);
       transform_manager.Add<tint::transform::Renamer>();
 #endif  // TINT_BUILD_MSL_WRITER
       break;
@@ -1132,7 +1133,8 @@
     case Format::kHlsl: {
 #if TINT_BUILD_HLSL_WRITER
       transform_inputs.Add<tint::transform::Renamer::Config>(
-          tint::transform::Renamer::Target::kHlslKeywords);
+          tint::transform::Renamer::Target::kHlslKeywords,
+          /* preserve_unicode */ false);
       transform_manager.Add<tint::transform::Renamer>();
 #endif  // TINT_BUILD_HLSL_WRITER
       break;
diff --git a/src/reader/wgsl/lexer.cc b/src/reader/wgsl/lexer.cc
index 56e56ad..8b0f7ac 100644
--- a/src/reader/wgsl/lexer.cc
+++ b/src/reader/wgsl/lexer.cc
@@ -21,6 +21,7 @@
 #include <utility>
 
 #include "src/debug.h"
+#include "src/text/unicode.h"
 
 namespace tint {
 namespace reader {
@@ -113,18 +114,10 @@
   return (pos_ < len_) && (file_->content.data[pos_] == 0);
 }
 
-bool Lexer::is_alpha(char ch) const {
-  return std::isalpha(ch);
-}
-
 bool Lexer::is_digit(char ch) const {
   return std::isdigit(ch);
 }
 
-bool Lexer::is_alphanum_underscore(char ch) const {
-  return is_alpha(ch) || is_digit(ch) || ch == '_';
-}
-
 bool Lexer::is_hex(char ch) const {
   return std::isxdigit(ch);
 }
@@ -733,31 +726,52 @@
 }
 
 Token Lexer::try_ident() {
-  // Must begin with an a-zA-Z_
-  if (!(is_alpha(file_->content.data[pos_]) ||
-        file_->content.data[pos_] == '_')) {
-    return {};
-  }
-
   auto source = begin_source();
+  auto start = pos_;
 
-  auto s = pos_;
-  while (!is_eof() && is_alphanum_underscore(file_->content.data[pos_])) {
-    pos_++;
-    location_.column++;
+  // The code below assumes a single std::string element is 1 byte in size.
+  static_assert(sizeof(file_->content.data[0]) == sizeof(uint8_t),
+                "tint::reader::wgsl requires the size of a std::string element "
+                "to be a single byte");
+
+  // Must begin with an XID_Start unicode character, or underscore
+  {
+    auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
+    auto [code_point, n] =
+        text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
+      return {};
+    }
+    // Consume start codepoint
+    pos_ += n;
+    location_.column += n;
   }
 
-  if (file_->content.data[s] == '_') {
+  while (!is_eof()) {
+    // Must continue with an XID_Continue unicode character
+    auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
+    auto [code_point, n] =
+        text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (!code_point.IsXIDContinue()) {
+      break;
+    }
+
+    // Consume continuing codepoint
+    pos_ += n;
+    location_.column += n;
+  }
+
+  if (file_->content.data[start] == '_') {
     // Check for an underscore on its own (special token), or a
     // double-underscore (not allowed).
-    if ((pos_ == s + 1) || (file_->content.data[s + 1] == '_')) {
-      location_.column -= (pos_ - s);
-      pos_ = s;
+    if ((pos_ == start + 1) || (file_->content.data[start + 1] == '_')) {
+      location_.column -= (pos_ - start);
+      pos_ = start;
       return {};
     }
   }
 
-  auto str = file_->content.data_view.substr(s, pos_ - s);
+  auto str = file_->content.data_view.substr(start, pos_ - start);
   end_source(source);
 
   auto t = check_keyword(source, str);
diff --git a/src/reader/wgsl/lexer.h b/src/reader/wgsl/lexer.h
index 43d9ea5..5bdb20f 100644
--- a/src/reader/wgsl/lexer.h
+++ b/src/reader/wgsl/lexer.h
@@ -76,25 +76,18 @@
   /// it is not null.
   bool is_null() const;
   /// @param ch a character
-  /// @returns true if 'ch' is an alphabetic character
-  bool is_alpha(char ch) const;
-  /// @param ch a character
   /// @returns true if 'ch' is a decimal digit
   bool is_digit(char ch) const;
   /// @param ch a character
   /// @returns true if 'ch' is a hexadecimal digit
   bool is_hex(char ch) const;
-  /// @param ch a character
-  /// @returns true if 'ch' is a digit, an alphabetic character,
-  /// or an underscore.
-  bool is_alphanum_underscore(char ch) const;
   bool matches(size_t pos, std::string_view substr);
 
   /// The source file content
   Source::File const* const file_;
   /// The length of the input
   uint32_t len_ = 0;
-  /// The current position within the input
+  /// The current position in utf-8 code units (bytes) within the input
   uint32_t pos_ = 0;
   /// The current location within the input
   Source::Location location_;
diff --git a/src/reader/wgsl/lexer_test.cc b/src/reader/wgsl/lexer_test.cc
index ff6c9ca..2e79906 100644
--- a/src/reader/wgsl/lexer_test.cc
+++ b/src/reader/wgsl/lexer_test.cc
@@ -315,8 +315,8 @@
                     "2.5E+ 123",
                     "2.5E- 123"));
 
-using IdentifierTest = testing::TestWithParam<const char*>;
-TEST_P(IdentifierTest, Parse) {
+using AsciiIdentifierTest = testing::TestWithParam<const char*>;
+TEST_P(AsciiIdentifierTest, Parse) {
   Source::File file("", GetParam());
   Lexer l(&file);
 
@@ -329,7 +329,7 @@
   EXPECT_EQ(t.to_str(), GetParam());
 }
 INSTANTIATE_TEST_SUITE_P(LexerTest,
-                         IdentifierTest,
+                         AsciiIdentifierTest,
                          testing::Values("a",
                                          "test",
                                          "test01",
@@ -342,6 +342,57 @@
                                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                                          "alldigits_0123456789"));
 
+struct UnicodeCase {
+  const char* utf8;
+  size_t code_units;
+};
+
+using UnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
+TEST_P(UnicodeIdentifierTest, Parse) {
+  Source::File file("", GetParam().utf8);
+  Lexer l(&file);
+
+  auto t = l.next();
+  EXPECT_TRUE(t.IsIdentifier());
+  EXPECT_EQ(t.source().range.begin.line, 1u);
+  EXPECT_EQ(t.source().range.begin.column, 1u);
+  EXPECT_EQ(t.source().range.end.line, 1u);
+  EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units);
+  EXPECT_EQ(t.to_str(), GetParam().utf8);
+}
+INSTANTIATE_TEST_SUITE_P(
+    LexerTest,
+    UnicodeIdentifierTest,
+    testing::ValuesIn({
+        UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
+                    "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
+                    "\x90\xa7\xf0\x9d\x90\xad\xf0\x9d\x90\xa2\xf0\x9d\x90\x9f"
+                    "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9e\xf0\x9d\x90\xab",
+                    40},
+        UnicodeCase{// "𝑖𝑑𝑒𝑛𝑡𝑖𝑓𝑖𝑒𝑟"
+                    "\xf0\x9d\x91\x96\xf0\x9d\x91\x91\xf0\x9d\x91\x92\xf0\x9d"
+                    "\x91\x9b\xf0\x9d\x91\xa1\xf0\x9d\x91\x96\xf0\x9d\x91\x93"
+                    "\xf0\x9d\x91\x96\xf0\x9d\x91\x92\xf0\x9d\x91\x9f",
+                    40},
+        UnicodeCase{
+            // "ｉｄｅｎｔｉｆｉｅｒ"
+            "\xef\xbd\x89\xef\xbd\x84\xef\xbd\x85\xef\xbd\x8e\xef\xbd\x94\xef"
+            "\xbd\x89\xef\xbd\x86\xef\xbd\x89\xef\xbd\x85\xef\xbd\x92",
+            30},
+        UnicodeCase{// "𝕚𝕕𝕖𝕟𝕥𝕚𝕗𝕚𝕖𝕣𝟙𝟚𝟛"
+                    "\xf0\x9d\x95\x9a\xf0\x9d\x95\x95\xf0\x9d\x95\x96\xf0\x9d"
+                    "\x95\x9f\xf0\x9d\x95\xa5\xf0\x9d\x95\x9a\xf0\x9d\x95\x97"
+                    "\xf0\x9d\x95\x9a\xf0\x9d\x95\x96\xf0\x9d\x95\xa3\xf0\x9d"
+                    "\x9f\x99\xf0\x9d\x9f\x9a\xf0\x9d\x9f\x9b",
+                    52},
+        UnicodeCase{
+            // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+            "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+            "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+            "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33",
+            43},
+    }));
+
 TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
   Source::File file("", "_");
   Lexer l(&file);
diff --git a/src/reader/wgsl/parser_impl_function_decl_test.cc b/src/reader/wgsl/parser_impl_function_decl_test.cc
index f9bb2bd..56f024e 100644
--- a/src/reader/wgsl/parser_impl_function_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_function_decl_test.cc
@@ -14,6 +14,7 @@
 
 #include "src/ast/workgroup_attribute.h"
 #include "src/reader/wgsl/parser_impl_test_helper.h"
+#include "src/utils/string.h"
 
 namespace tint {
 namespace reader {
@@ -48,6 +49,51 @@
   EXPECT_TRUE(body->statements[0]->Is<ast::ReturnStatement>());
 }
 
+TEST_F(ParserImplTest, FunctionDecl_Unicode) {
+  const std::string function_ident =  // "𝗳𝘂𝗻𝗰𝘁𝗶𝗼𝗻"
+      "\xf0\x9d\x97\xb3\xf0\x9d\x98\x82\xf0\x9d\x97\xbb\xf0\x9d\x97\xb0\xf0\x9d"
+      "\x98\x81\xf0\x9d\x97\xb6\xf0\x9d\x97\xbc\xf0\x9d\x97\xbb";
+
+  const std::string param_a_ident =  // "𝓹𝓪𝓻𝓪𝓶_𝓪"
+      "\xf0\x9d\x93\xb9\xf0\x9d\x93\xaa\xf0\x9d\x93\xbb\xf0\x9d\x93\xaa\xf0\x9d"
+      "\x93\xb6\x5f\xf0\x9d\x93\xaa";
+
+  const std::string param_b_ident =  // "𝕡𝕒𝕣𝕒𝕞_𝕓"
+      "\xf0\x9d\x95\xa1\xf0\x9d\x95\x92\xf0\x9d\x95\xa3\xf0\x9d\x95\x92\xf0\x9d"
+      "\x95\x9e\x5f\xf0\x9d\x95\x93";
+
+  std::string src = "fn $function($param_a : i32, $param_b : f32) { return; }";
+  src = utils::ReplaceAll(src, "$function", function_ident);
+  src = utils::ReplaceAll(src, "$param_a", param_a_ident);
+  src = utils::ReplaceAll(src, "$param_b", param_b_ident);
+
+  auto p = parser(src);
+  auto attrs = p->attribute_list();
+  EXPECT_FALSE(p->has_error()) << p->error();
+  ASSERT_FALSE(attrs.errored);
+  EXPECT_FALSE(attrs.matched);
+  auto f = p->function_decl(attrs.value);
+  EXPECT_FALSE(p->has_error()) << p->error();
+  EXPECT_FALSE(f.errored);
+  EXPECT_TRUE(f.matched);
+  ASSERT_NE(f.value, nullptr);
+
+  EXPECT_EQ(f->symbol, p->builder().Symbols().Get(function_ident));
+  ASSERT_NE(f->return_type, nullptr);
+  EXPECT_TRUE(f->return_type->Is<ast::Void>());
+
+  ASSERT_EQ(f->params.size(), 2u);
+  EXPECT_EQ(f->params[0]->symbol, p->builder().Symbols().Get(param_a_ident));
+  EXPECT_EQ(f->params[1]->symbol, p->builder().Symbols().Get(param_b_ident));
+
+  ASSERT_NE(f->return_type, nullptr);
+  EXPECT_TRUE(f->return_type->Is<ast::Void>());
+
+  auto* body = f->body;
+  ASSERT_EQ(body->statements.size(), 1u);
+  EXPECT_TRUE(body->statements[0]->Is<ast::ReturnStatement>());
+}
+
 TEST_F(ParserImplTest, FunctionDecl_AttributeList) {
   auto p = parser("@workgroup_size(2, 3, 4) fn main() { return; }");
   auto attrs = p->attribute_list();
diff --git a/src/reader/wgsl/parser_impl_struct_decl_test.cc b/src/reader/wgsl/parser_impl_struct_decl_test.cc
index 5df3bd0..40d6cc7 100644
--- a/src/reader/wgsl/parser_impl_struct_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_struct_decl_test.cc
@@ -14,6 +14,7 @@
 
 #include "src/ast/struct_block_attribute.h"
 #include "src/reader/wgsl/parser_impl_test_helper.h"
+#include "src/utils/string.h"
 
 namespace tint {
 namespace reader {
@@ -42,6 +43,46 @@
   EXPECT_EQ(s->members[1]->symbol, p->builder().Symbols().Register("b"));
 }
 
+TEST_F(ParserImplTest, StructDecl_Unicode_Parses) {
+  const std::string struct_ident =  // "𝓼𝓽𝓻𝓾𝓬𝓽𝓾𝓻𝓮"
+      "\xf0\x9d\x93\xbc\xf0\x9d\x93\xbd\xf0\x9d\x93\xbb\xf0\x9d\x93\xbe\xf0\x9d"
+      "\x93\xac\xf0\x9d\x93\xbd\xf0\x9d\x93\xbe\xf0\x9d\x93\xbb\xf0\x9d\x93"
+      "\xae";
+  const std::string member_a_ident =  // "𝕞𝕖𝕞𝕓𝕖𝕣_𝕒"
+      "\xf0\x9d\x95\x9e\xf0\x9d\x95\x96\xf0\x9d\x95\x9e\xf0\x9d\x95\x93\xf0\x9d"
+      "\x95\x96\xf0\x9d\x95\xa3\x5f\xf0\x9d\x95\x92";
+  const std::string member_b_ident =  // "𝔪𝔢𝔪𝔟𝔢𝔯_𝔟"
+      "\xf0\x9d\x94\xaa\xf0\x9d\x94\xa2\xf0\x9d\x94\xaa\xf0\x9d\x94\x9f\xf0\x9d"
+      "\x94\xa2\xf0\x9d\x94\xaf\x5f\xf0\x9d\x94\x9f";
+
+  std::string src = R"(
+struct $struct {
+  $member_a : i32;
+  $member_b : f32;
+})";
+  src = utils::ReplaceAll(src, "$struct", struct_ident);
+  src = utils::ReplaceAll(src, "$member_a", member_a_ident);
+  src = utils::ReplaceAll(src, "$member_b", member_b_ident);
+
+  auto p = parser(src);
+  auto attrs = p->attribute_list();
+  EXPECT_FALSE(attrs.errored);
+  EXPECT_FALSE(attrs.matched);
+  ASSERT_EQ(attrs.value.size(), 0u);
+
+  auto s = p->struct_decl(attrs.value);
+  EXPECT_FALSE(p->has_error());
+  EXPECT_FALSE(s.errored);
+  EXPECT_TRUE(s.matched);
+  ASSERT_NE(s.value, nullptr);
+  ASSERT_EQ(s->name, p->builder().Symbols().Register(struct_ident));
+  ASSERT_EQ(s->members.size(), 2u);
+  EXPECT_EQ(s->members[0]->symbol,
+            p->builder().Symbols().Register(member_a_ident));
+  EXPECT_EQ(s->members[1]->symbol,
+            p->builder().Symbols().Register(member_b_ident));
+}
+
 TEST_F(ParserImplTest, StructDecl_ParsesWithAttribute) {
   auto p = parser(R"(
 [[block]] struct B {
diff --git a/src/reader/wgsl/parser_impl_type_alias_test.cc b/src/reader/wgsl/parser_impl_type_alias_test.cc
index 45e2e32..301db6f 100644
--- a/src/reader/wgsl/parser_impl_type_alias_test.cc
+++ b/src/reader/wgsl/parser_impl_type_alias_test.cc
@@ -34,7 +34,7 @@
   EXPECT_EQ(t.value->source.range, (Source::Range{{1u, 1u}, {1u, 13u}}));
 }
 
-TEST_F(ParserImplTest, TypeDecl_ParsesStruct_Ident) {
+TEST_F(ParserImplTest, TypeDecl_Parses_Ident) {
   auto p = parser("type a = B");
 
   auto t = p->type_alias();
@@ -49,6 +49,25 @@
   EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 11u}}));
 }
 
+TEST_F(ParserImplTest, TypeDecl_Unicode_Parses_Ident) {
+  const std::string ident =  // "𝓶𝔂_𝓽𝔂𝓹𝓮"
+      "\xf0\x9d\x93\xb6\xf0\x9d\x94\x82\x5f\xf0\x9d\x93\xbd\xf0\x9d\x94\x82\xf0"
+      "\x9d\x93\xb9\xf0\x9d\x93\xae";
+
+  auto p = parser("type " + ident + " = i32");
+
+  auto t = p->type_alias();
+  EXPECT_FALSE(p->has_error());
+  EXPECT_FALSE(t.errored);
+  EXPECT_TRUE(t.matched);
+  ASSERT_NE(t.value, nullptr);
+  ASSERT_TRUE(t.value->Is<ast::Alias>());
+  auto* alias = t.value->As<ast::Alias>();
+  EXPECT_EQ(p->builder().Symbols().NameFor(alias->name), ident);
+  EXPECT_TRUE(alias->type->Is<ast::I32>());
+  EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 37u}}));
+}
+
 TEST_F(ParserImplTest, TypeDecl_MissingIdent) {
   auto p = parser("type = i32");
   auto t = p->type_alias();
diff --git a/src/reader/wgsl/parser_impl_variable_decl_test.cc b/src/reader/wgsl/parser_impl_variable_decl_test.cc
index 97bffb1..88a5873 100644
--- a/src/reader/wgsl/parser_impl_variable_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_variable_decl_test.cc
@@ -18,7 +18,6 @@
 namespace reader {
 namespace wgsl {
 namespace {
-
 TEST_F(ParserImplTest, VariableDecl_Parses) {
   auto p = parser("var my_var : f32");
   auto v = p->variable_decl();
@@ -33,6 +32,25 @@
   EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 14u}, {1u, 17u}}));
 }
 
+TEST_F(ParserImplTest, VariableDecl_Unicode_Parses) {
+  const std::string ident =  // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+      "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+      "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+      "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33";
+
+  auto p = parser("var " + ident + " : f32");
+  auto v = p->variable_decl();
+  EXPECT_FALSE(p->has_error());
+  EXPECT_TRUE(v.matched);
+  EXPECT_FALSE(v.errored);
+  EXPECT_EQ(v->name, ident);
+  EXPECT_NE(v->type, nullptr);
+  EXPECT_TRUE(v->type->Is<ast::F32>());
+
+  EXPECT_EQ(v->source.range, (Source::Range{{1u, 5u}, {1u, 48u}}));
+  EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 51u}, {1u, 54u}}));
+}
+
 TEST_F(ParserImplTest, VariableDecl_Inferred_Parses) {
   auto p = parser("var my_var = 1.0");
   auto v = p->variable_decl(/*allow_inferred = */ true);
diff --git a/src/source.h b/src/source.h
index 4802d06..b40c994 100644
--- a/src/source.h
+++ b/src/source.h
@@ -27,7 +27,7 @@
 /// Source describes a range of characters within a source file.
 class Source {
  public:
-  /// FileContent describes the content of a source file.
+  /// FileContent describes the content of a source file encoded using utf-8.
   class FileContent {
    public:
     /// Constructs the FileContent with the given file content.
@@ -78,7 +78,8 @@
    public:
     /// the 1-based line number. 0 represents no line information.
     size_t line = 0;
-    /// the 1-based column number. 0 represents no column information.
+    /// the 1-based column number in utf-8 code units (bytes).
+    /// 0 represents no column information.
     size_t column = 0;
 
     /// Returns true of `this` location is lexicographically less than `rhs`
diff --git a/src/text/unicode.cc b/src/text/unicode.cc
index 12767bf..826eb5b 100644
--- a/src/text/unicode.cc
+++ b/src/text/unicode.cc
@@ -427,7 +427,9 @@
   return out << "'U+" << std::hex << code_point.value << "'";
 }
 
-std::pair<CodePoint, size_t> utf8::Decode(const uint8_t* ptr, size_t len) {
+namespace utf8 {
+
+std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len) {
   if (len < 1) {
     return {};
   }
@@ -490,4 +492,15 @@
   return {c, n};
 }
 
+bool IsASCII(std::string_view str) {
+  for (auto c : str) {
+    if (c & 0x80) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace utf8
+
 }  // namespace tint::text
diff --git a/src/text/unicode.h b/src/text/unicode.h
index c1ef915..3c74221 100644
--- a/src/text/unicode.h
+++ b/src/text/unicode.h
@@ -69,6 +69,10 @@
 ///          If the next code point cannot be decoded then returns [0,0].
 std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
 
+/// @returns true if all the utf-8 code points in the string are ASCII
+/// (code-points 0x00..0x7f).
+bool IsASCII(std::string_view);
+
 }  // namespace utf8
 
 }  // namespace tint::text
diff --git a/src/transform/glsl.cc b/src/transform/glsl.cc
index 46cc848..7744370 100644
--- a/src/transform/glsl.cc
+++ b/src/transform/glsl.cc
@@ -56,7 +56,8 @@
     data.Add<SingleEntryPoint::Config>(cfg->entry_point);
   }
   manager.Add<Renamer>();
-  data.Add<Renamer::Config>(Renamer::Target::kGlslKeywords);
+  data.Add<Renamer::Config>(Renamer::Target::kGlslKeywords,
+                            /* preserve_unicode */ false);
   manager.Add<Unshadow>();
 
   // Attempt to convert `loop`s into for-loops. This is to try and massage the
diff --git a/src/transform/renamer.cc b/src/transform/renamer.cc
index 0b49e6e..6863c3d 100644
--- a/src/transform/renamer.cc
+++ b/src/transform/renamer.cc
@@ -21,13 +21,13 @@
 #include "src/program_builder.h"
 #include "src/sem/call.h"
 #include "src/sem/member_accessor_expression.h"
+#include "src/text/unicode.h"
 
 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer);
 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Data);
 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Config);
 
-namespace tint {
-namespace transform {
+namespace tint::transform {
 
 namespace {
 
@@ -1245,7 +1245,7 @@
 Renamer::Data::Data(const Data&) = default;
 Renamer::Data::~Data() = default;
 
-Renamer::Config::Config(Target t) : target(t) {}
+Renamer::Config::Config(Target t, bool pu) : target(t), preserve_unicode(pu) {}
 Renamer::Config::Config(const Config&) = default;
 Renamer::Config::~Config() = default;
 
@@ -1293,48 +1293,52 @@
   Data::Remappings remappings;
 
   Target target = Target::kAll;
+  bool preserve_unicode = false;
 
   if (auto* cfg = inputs.Get<Config>()) {
     target = cfg->target;
+    preserve_unicode = cfg->preserve_unicode;
   }
 
   ctx.ReplaceAll([&](Symbol sym_in) {
     auto name_in = ctx.src->Symbols().NameFor(sym_in);
-    switch (target) {
-      case Target::kAll:
-        // Always rename.
-        break;
-      case Target::kGlslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsGLSL,
-                kReservedKeywordsGLSL +
-                    sizeof(kReservedKeywordsGLSL) / sizeof(const char*),
-                name_in) &&
-            name_in.compare(0, 3, "gl_")) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
-      case Target::kHlslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsHLSL,
-                kReservedKeywordsHLSL +
-                    sizeof(kReservedKeywordsHLSL) / sizeof(const char*),
-                name_in)) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
-      case Target::kMslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsMSL,
-                kReservedKeywordsMSL +
-                    sizeof(kReservedKeywordsMSL) / sizeof(const char*),
-                name_in)) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
+    if (preserve_unicode || text::utf8::IsASCII(name_in)) {
+      switch (target) {
+        case Target::kAll:
+          // Always rename.
+          break;
+        case Target::kGlslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsGLSL,
+                  kReservedKeywordsGLSL +
+                      sizeof(kReservedKeywordsGLSL) / sizeof(const char*),
+                  name_in) &&
+              name_in.compare(0, 3, "gl_")) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+        case Target::kHlslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsHLSL,
+                  kReservedKeywordsHLSL +
+                      sizeof(kReservedKeywordsHLSL) / sizeof(const char*),
+                  name_in)) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+        case Target::kMslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsMSL,
+                  kReservedKeywordsMSL +
+                      sizeof(kReservedKeywordsMSL) / sizeof(const char*),
+                  name_in)) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+      }
     }
 
     auto sym_out = ctx.dst->Sym();
@@ -1359,5 +1363,4 @@
                 std::make_unique<Data>(std::move(remappings)));
 }
 
-}  // namespace transform
-}  // namespace tint
+}  // namespace tint::transform
diff --git a/src/transform/renamer.h b/src/transform/renamer.h
index 4bec367..9b448e8 100644
--- a/src/transform/renamer.h
+++ b/src/transform/renamer.h
@@ -20,8 +20,7 @@
 
 #include "src/transform/transform.h"
 
-namespace tint {
-namespace transform {
+namespace tint::transform {
 
 /// Renamer is a Transform that renames all the symbols in a program.
 class Renamer : public Castable<Renamer, Transform> {
@@ -63,7 +62,9 @@
   struct Config : public Castable<Config, transform::Data> {
     /// Constructor
     /// @param tgt the targets to rename
-    explicit Config(Target tgt);
+    /// @param keep_unicode if false, symbols with non-ascii code-points are
+    /// renamed
+    explicit Config(Target tgt, bool keep_unicode = false);
 
     /// Copy constructor
     Config(const Config&);
@@ -73,6 +74,9 @@
 
     /// The targets to rename
     Target const target = Target::kAll;
+
+    /// If false, symbols with non-ascii code-points are renamed.
+    bool preserve_unicode = false;
   };
 
   /// Constructor using a the configuration provided in the input Data
@@ -88,7 +92,6 @@
   Output Run(const Program* program, const DataMap& data = {}) const override;
 };
 
-}  // namespace transform
-}  // namespace tint
+}  // namespace tint::transform
 
 #endif  // SRC_TRANSFORM_RENAMER_H_
diff --git a/src/transform/renamer_test.cc b/src/transform/renamer_test.cc
index 0cdbdb4..56c887c 100644
--- a/src/transform/renamer_test.cc
+++ b/src/transform/renamer_test.cc
@@ -23,6 +23,11 @@
 namespace transform {
 namespace {
 
+constexpr const char kUnicodeIdentifier[] =  // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+    "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+    "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+    "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33";
+
 using ::testing::ContainerEq;
 
 using RenamerTest = TransformTest;
@@ -184,6 +189,25 @@
   EXPECT_THAT(data->remappings, ContainerEq(expected_remappings));
 }
 
+TEST_F(RenamerTest, PreserveUnicode) {
+  auto src = R"(
+@stage(fragment)
+fn frag_main() {
+  var )" + std::string(kUnicodeIdentifier) +
+             R"( : i32;
+}
+)";
+
+  auto expect = src;
+
+  DataMap inputs;
+  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords,
+                              /* preserve_unicode */ true);
+  auto got = Run<Renamer>(src, inputs);
+
+  EXPECT_EQ(expect, str(got));
+}
+
 TEST_F(RenamerTest, AttemptSymbolCollision) {
   auto* src = R"(
 @stage(vertex)
@@ -244,7 +268,8 @@
 )";
 
   DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kGlslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kGlslKeywords,
+                              /* preserve_unicode */ false);
   auto got = Run<Renamer>(src, inputs);
 
   EXPECT_EQ(expect, str(got));
@@ -269,7 +294,8 @@
 )";
 
   DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kHlslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kHlslKeywords,
+                              /* preserve_unicode */ false);
   auto got = Run<Renamer>(src, inputs);
 
   EXPECT_EQ(expect, str(got));
@@ -294,7 +320,8 @@
 )";
 
   DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords,
+                              /* preserve_unicode */ false);
   auto got = Run<Renamer>(src, inputs);
 
   EXPECT_EQ(expect, str(got));
@@ -528,7 +555,8 @@
                                          //    "void",       // WGSL keyword
                                          "volatile",
                                          //    "while",      // WGSL keyword
-                                         "writeonly"));
+                                         "writeonly",
+                                         kUnicodeIdentifier));
 
 INSTANTIATE_TEST_SUITE_P(RenamerTestHlsl,
                          RenamerTestHlsl,
@@ -1142,8 +1170,9 @@
                                          "vertexshader",
                                          "virtual",
                                          // "void",  // WGSL keyword
-                                         "volatile"));
-//                                          "while"  // WGSL reserved keyword
+                                         "volatile",
+                                         // "while"  // WGSL reserved keyword
+                                         kUnicodeIdentifier));
 
 INSTANTIATE_TEST_SUITE_P(
     RenamerTestMsl,
@@ -1425,7 +1454,9 @@
         "M_2_PI_H",
         "M_2_SQRTPI_H",
         "M_SQRT2_H",
-        "M_SQRT1_2_H"));
+        "M_SQRT1_2_H",
+        // "while"  // WGSL reserved keyword
+        kUnicodeIdentifier));
 
 }  // namespace
 }  // namespace transform
diff --git a/src/writer/hlsl/test_helper.h b/src/writer/hlsl/test_helper.h
index e55b9c1..337a1ac 100644
--- a/src/writer/hlsl/test_helper.h
+++ b/src/writer/hlsl/test_helper.h
@@ -89,7 +89,8 @@
     transform::Manager transform_manager;
     transform::DataMap transform_data;
     transform_data.Add<transform::Renamer::Config>(
-        transform::Renamer::Target::kHlslKeywords);
+        transform::Renamer::Target::kHlslKeywords,
+        /* preserve_unicode */ true);
     transform_manager.Add<tint::transform::Renamer>();
     auto result =
         transform_manager.Run(&sanitized_result.program, transform_data);
diff --git a/test/unicode/comments.wgsl b/test/unicode/comments.wgsl
new file mode 100644
index 0000000..0e42055
--- /dev/null
+++ b/test/unicode/comments.wgsl
@@ -0,0 +1,11 @@
+// Øโ“‘๊šซ์นวด๐Œ’ๅฒพ๐Ÿฅโดตใจๅˆแฎ—
+
+/*
+/* ๐Ÿ‘‹๐ŸŒŽ */
+เคจเคฎเคธเฅเคคเฅ‡ เคฆเฅเคจเคฟเคฏเคพ
+*/
+
+@stage(fragment)
+fn /* ใ“ใ‚“ใซใกใฏไธ–็•Œ */ main( /* ไฝ ๅฅฝไธ–็•Œ */ ) {
+    // ์•ˆ๋…•ํ•˜์„ธ์š” ์„ธ๊ณ„
+}
diff --git a/test/unicode/comments.wgsl.expected.glsl b/test/unicode/comments.wgsl.expected.glsl
new file mode 100644
index 0000000..bb42c59
--- /dev/null
+++ b/test/unicode/comments.wgsl.expected.glsl
@@ -0,0 +1,10 @@
+#version 310 es
+precision mediump float;
+
+void tint_symbol() {
+}
+
+void main() {
+  tint_symbol();
+  return;
+}
diff --git a/test/unicode/comments.wgsl.expected.hlsl b/test/unicode/comments.wgsl.expected.hlsl
new file mode 100644
index 0000000..61d6d57
--- /dev/null
+++ b/test/unicode/comments.wgsl.expected.hlsl
@@ -0,0 +1,3 @@
+void main() {
+  return;
+}
diff --git a/test/unicode/comments.wgsl.expected.msl b/test/unicode/comments.wgsl.expected.msl
new file mode 100644
index 0000000..ac4b77b
--- /dev/null
+++ b/test/unicode/comments.wgsl.expected.msl
@@ -0,0 +1,7 @@
+#include <metal_stdlib>
+
+using namespace metal;
+fragment void tint_symbol() {
+  return;
+}
+
diff --git a/test/unicode/comments.wgsl.expected.spvasm b/test/unicode/comments.wgsl.expected.spvasm
new file mode 100644
index 0000000..123699a
--- /dev/null
+++ b/test/unicode/comments.wgsl.expected.spvasm
@@ -0,0 +1,16 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google Tint Compiler; 0
+; Bound: 5
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %main "main"
+               OpExecutionMode %main OriginUpperLeft
+               OpName %main "main"
+       %void = OpTypeVoid
+          %1 = OpTypeFunction %void
+       %main = OpFunction %void None %1
+          %4 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/test/unicode/comments.wgsl.expected.wgsl b/test/unicode/comments.wgsl.expected.wgsl
new file mode 100644
index 0000000..3662a35
--- /dev/null
+++ b/test/unicode/comments.wgsl.expected.wgsl
@@ -0,0 +1,3 @@
+@stage(fragment)
+fn main() {
+}
diff --git a/test/unicode/indentifiers.wgsl b/test/unicode/indentifiers.wgsl
new file mode 100644
index 0000000..84c904a
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl
@@ -0,0 +1,12 @@
+type ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ = i32;
+type ๐ญ๐ฒ๐ฉ๐ž_๐› = f32;
+
+fn ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(แต–แตƒสณแตƒแต : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ) -> ๐ญ๐ฒ๐ฉ๐ž_๐› {
+    return ๐ญ๐ฒ๐ฉ๐ž_๐›(แต–แตƒสณแตƒแต);
+}
+
+@stage(fragment)
+fn ๐•–๐•Ÿ๐•ฅ๐•ฃ๐•ช๐•ก๐• ๐•š๐•Ÿ๐•ฅ() {
+    var ๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ;
+    let ๐–—๐–Š๐–˜๐–š๐–‘๐–™ = ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š);
+}
diff --git a/test/unicode/indentifiers.wgsl.expected.glsl b/test/unicode/indentifiers.wgsl.expected.glsl
new file mode 100644
index 0000000..d5d5790
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl.expected.glsl
@@ -0,0 +1,16 @@
+#version 310 es
+precision mediump float;
+
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  float tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+}
+
+void main() {
+  tint_symbol_4();
+  return;
+}
diff --git a/test/unicode/indentifiers.wgsl.expected.hlsl b/test/unicode/indentifiers.wgsl.expected.hlsl
new file mode 100644
index 0000000..d82de3a
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl.expected.hlsl
@@ -0,0 +1,9 @@
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  const float tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+  return;
+}
diff --git a/test/unicode/indentifiers.wgsl.expected.msl b/test/unicode/indentifiers.wgsl.expected.msl
new file mode 100644
index 0000000..ea0afa5
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl.expected.msl
@@ -0,0 +1,13 @@
+#include <metal_stdlib>
+
+using namespace metal;
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+fragment void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  float const tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+  return;
+}
+
diff --git a/test/unicode/indentifiers.wgsl.expected.spvasm b/test/unicode/indentifiers.wgsl.expected.spvasm
new file mode 100644
index 0000000..93b3e82
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl.expected.spvasm
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google Tint Compiler; 0
+; Bound: 17
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %________________________________________ "𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥"
+               OpExecutionMode %________________________________________ OriginUpperLeft
+               OpName %________________________________ "𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷"
+               OpName %______________ "ᵖᵃʳᵃᵐ"
+               OpName %________________________________________ "𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥"
+               OpName %_________________________________0 "𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚"
+      %float = OpTypeFloat 32
+        %int = OpTypeInt 32 1
+          %1 = OpTypeFunction %float %int
+       %void = OpTypeVoid
+          %8 = OpTypeFunction %void
+%_ptr_Function_int = OpTypePointer Function %int
+         %14 = OpConstantNull %int
+%________________________________ = OpFunction %float None %1
+%______________ = OpFunctionParameter %int
+          %6 = OpLabel
+          %7 = OpConvertSToF %float %______________
+               OpReturnValue %7
+               OpFunctionEnd
+%________________________________________ = OpFunction %void None %8
+         %11 = OpLabel
+%_________________________________0 = OpVariable %_ptr_Function_int Function %14
+         %16 = OpLoad %int %_________________________________0
+         %15 = OpFunctionCall %float %________________________________ %16
+               OpReturn
+               OpFunctionEnd
diff --git a/test/unicode/indentifiers.wgsl.expected.wgsl b/test/unicode/indentifiers.wgsl.expected.wgsl
new file mode 100644
index 0000000..d78ad34
--- /dev/null
+++ b/test/unicode/indentifiers.wgsl.expected.wgsl
@@ -0,0 +1,13 @@
+type 𝓉𝓎𝓅ℯ_𝒶 = i32;
+
+type 𝐭𝐲𝐩𝐞_𝐛 = f32;
+
+fn 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(ᵖᵃʳᵃᵐ : 𝓉𝓎𝓅ℯ_𝒶) -> 𝐭𝐲𝐩𝐞_𝐛 {
+  return 𝐭𝐲𝐩𝐞_𝐛(ᵖᵃʳᵃᵐ);
+}
+
+@stage(fragment)
+fn 𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥() {
+  var 𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚 : 𝓉𝓎𝓅ℯ_𝒶;
+  let 𝖗𝖊𝖘𝖚𝖑𝖙 = 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚);
+}