Add src/text

Implements a basic UTF-8 decoder, and code point tables for XID_Start and XID_Continue.

These are the first steps towards supporting Unicode identifiers.

Bug: tint:1437
Change-Id: I7371d1e63d664f3a3c34929221c7c6d043beb45d
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/80842
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: David Neto <dneto@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/src/BUILD.gn b/src/BUILD.gn
index ca35959..a6f20c4 100644
--- a/src/BUILD.gn
+++ b/src/BUILD.gn
@@ -425,6 +425,8 @@
     "symbol.h",
     "symbol_table.cc",
     "symbol_table.h",
+    "text/unicode.cc",
+    "text/unicode.h",
     "traits.h",
     "transform/add_empty_entry_point.cc",
     "transform/add_empty_entry_point.h",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6c83d79..8a4f2eb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -296,6 +296,8 @@
   symbol_table.h
   symbol.cc
   symbol.h
+  text/unicode.cc
+  text/unicode.h
   traits.h
   transform/add_empty_entry_point.cc
   transform/add_empty_entry_point.h
@@ -773,6 +775,7 @@
     symbol_table_test.cc
     symbol_test.cc
     test_main.cc
+    text/unicode_test.cc
     traits_test.cc
     transform/transform_test.cc
     utils/crc32_test.cc
diff --git a/src/text/unicode.cc b/src/text/unicode.cc
new file mode 100644
index 0000000..12767bf
--- /dev/null
+++ b/src/text/unicode.cc
@@ -0,0 +1,493 @@
+// Copyright 2022 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/text/unicode.h"
+
+#include <algorithm>
+
+namespace tint::text {
+namespace {
+
+struct CodePointRange {
+  uint32_t first;  // Lowest code point in the range
+  uint32_t last;   // Highest code point in the range (inclusive)
+};
+// Orderings that let std::binary_search compare a code point with a range.
+inline bool operator<(CodePoint code_point, CodePointRange range) {
+  return code_point.value < range.first;  // code_point precedes the range
+}
+inline bool operator<(CodePointRange range, CodePoint code_point) {
+  return code_point.value > range.last;  // code_point follows the range
+}
+
+// Interval ranges of all code points in the Unicode 14 XID_Start set.
+// The array must stay in ascending order, as it is binary searched.
+constexpr CodePointRange kXIDStartRanges[] = {
+    {0x00041, 0x0005a}, {0x00061, 0x0007a}, {0x000aa, 0x000aa},
+    {0x000b5, 0x000b5}, {0x000ba, 0x000ba}, {0x000c0, 0x000d6},
+    {0x000d8, 0x000f6}, {0x000f8, 0x002c1}, {0x002c6, 0x002d1},
+    {0x002e0, 0x002e4}, {0x002ec, 0x002ec}, {0x002ee, 0x002ee},
+    {0x00370, 0x00374}, {0x00376, 0x00377}, {0x0037b, 0x0037d},
+    {0x0037f, 0x0037f}, {0x00386, 0x00386}, {0x00388, 0x0038a},
+    {0x0038c, 0x0038c}, {0x0038e, 0x003a1}, {0x003a3, 0x003f5},
+    {0x003f7, 0x00481}, {0x0048a, 0x0052f}, {0x00531, 0x00556},
+    {0x00559, 0x00559}, {0x00560, 0x00588}, {0x005d0, 0x005ea},
+    {0x005ef, 0x005f2}, {0x00620, 0x0064a}, {0x0066e, 0x0066f},
+    {0x00671, 0x006d3}, {0x006d5, 0x006d5}, {0x006e5, 0x006e6},
+    {0x006ee, 0x006ef}, {0x006fa, 0x006fc}, {0x006ff, 0x006ff},
+    {0x00710, 0x00710}, {0x00712, 0x0072f}, {0x0074d, 0x007a5},
+    {0x007b1, 0x007b1}, {0x007ca, 0x007ea}, {0x007f4, 0x007f5},
+    {0x007fa, 0x007fa}, {0x00800, 0x00815}, {0x0081a, 0x0081a},
+    {0x00824, 0x00824}, {0x00828, 0x00828}, {0x00840, 0x00858},
+    {0x00860, 0x0086a}, {0x00870, 0x00887}, {0x00889, 0x0088e},
+    {0x008a0, 0x008c9}, {0x00904, 0x00939}, {0x0093d, 0x0093d},
+    {0x00950, 0x00950}, {0x00958, 0x00961}, {0x00971, 0x00980},
+    {0x00985, 0x0098c}, {0x0098f, 0x00990}, {0x00993, 0x009a8},
+    {0x009aa, 0x009b0}, {0x009b2, 0x009b2}, {0x009b6, 0x009b9},
+    {0x009bd, 0x009bd}, {0x009ce, 0x009ce}, {0x009dc, 0x009dd},
+    {0x009df, 0x009e1}, {0x009f0, 0x009f1}, {0x009fc, 0x009fc},
+    {0x00a05, 0x00a0a}, {0x00a0f, 0x00a10}, {0x00a13, 0x00a28},
+    {0x00a2a, 0x00a30}, {0x00a32, 0x00a33}, {0x00a35, 0x00a36},
+    {0x00a38, 0x00a39}, {0x00a59, 0x00a5c}, {0x00a5e, 0x00a5e},
+    {0x00a72, 0x00a74}, {0x00a85, 0x00a8d}, {0x00a8f, 0x00a91},
+    {0x00a93, 0x00aa8}, {0x00aaa, 0x00ab0}, {0x00ab2, 0x00ab3},
+    {0x00ab5, 0x00ab9}, {0x00abd, 0x00abd}, {0x00ad0, 0x00ad0},
+    {0x00ae0, 0x00ae1}, {0x00af9, 0x00af9}, {0x00b05, 0x00b0c},
+    {0x00b0f, 0x00b10}, {0x00b13, 0x00b28}, {0x00b2a, 0x00b30},
+    {0x00b32, 0x00b33}, {0x00b35, 0x00b39}, {0x00b3d, 0x00b3d},
+    {0x00b5c, 0x00b5d}, {0x00b5f, 0x00b61}, {0x00b71, 0x00b71},
+    {0x00b83, 0x00b83}, {0x00b85, 0x00b8a}, {0x00b8e, 0x00b90},
+    {0x00b92, 0x00b95}, {0x00b99, 0x00b9a}, {0x00b9c, 0x00b9c},
+    {0x00b9e, 0x00b9f}, {0x00ba3, 0x00ba4}, {0x00ba8, 0x00baa},
+    {0x00bae, 0x00bb9}, {0x00bd0, 0x00bd0}, {0x00c05, 0x00c0c},
+    {0x00c0e, 0x00c10}, {0x00c12, 0x00c28}, {0x00c2a, 0x00c39},
+    {0x00c3d, 0x00c3d}, {0x00c58, 0x00c5a}, {0x00c5d, 0x00c5d},
+    {0x00c60, 0x00c61}, {0x00c80, 0x00c80}, {0x00c85, 0x00c8c},
+    {0x00c8e, 0x00c90}, {0x00c92, 0x00ca8}, {0x00caa, 0x00cb3},
+    {0x00cb5, 0x00cb9}, {0x00cbd, 0x00cbd}, {0x00cdd, 0x00cde},
+    {0x00ce0, 0x00ce1}, {0x00cf1, 0x00cf2}, {0x00d04, 0x00d0c},
+    {0x00d0e, 0x00d10}, {0x00d12, 0x00d3a}, {0x00d3d, 0x00d3d},
+    {0x00d4e, 0x00d4e}, {0x00d54, 0x00d56}, {0x00d5f, 0x00d61},
+    {0x00d7a, 0x00d7f}, {0x00d85, 0x00d96}, {0x00d9a, 0x00db1},
+    {0x00db3, 0x00dbb}, {0x00dbd, 0x00dbd}, {0x00dc0, 0x00dc6},
+    {0x00e01, 0x00e30}, {0x00e32, 0x00e32}, {0x00e40, 0x00e46},
+    {0x00e81, 0x00e82}, {0x00e84, 0x00e84}, {0x00e86, 0x00e8a},
+    {0x00e8c, 0x00ea3}, {0x00ea5, 0x00ea5}, {0x00ea7, 0x00eb0},
+    {0x00eb2, 0x00eb2}, {0x00ebd, 0x00ebd}, {0x00ec0, 0x00ec4},
+    {0x00ec6, 0x00ec6}, {0x00edc, 0x00edf}, {0x00f00, 0x00f00},
+    {0x00f40, 0x00f47}, {0x00f49, 0x00f6c}, {0x00f88, 0x00f8c},
+    {0x01000, 0x0102a}, {0x0103f, 0x0103f}, {0x01050, 0x01055},
+    {0x0105a, 0x0105d}, {0x01061, 0x01061}, {0x01065, 0x01066},
+    {0x0106e, 0x01070}, {0x01075, 0x01081}, {0x0108e, 0x0108e},
+    {0x010a0, 0x010c5}, {0x010c7, 0x010c7}, {0x010cd, 0x010cd},
+    {0x010d0, 0x010fa}, {0x010fc, 0x01248}, {0x0124a, 0x0124d},
+    {0x01250, 0x01256}, {0x01258, 0x01258}, {0x0125a, 0x0125d},
+    {0x01260, 0x01288}, {0x0128a, 0x0128d}, {0x01290, 0x012b0},
+    {0x012b2, 0x012b5}, {0x012b8, 0x012be}, {0x012c0, 0x012c0},
+    {0x012c2, 0x012c5}, {0x012c8, 0x012d6}, {0x012d8, 0x01310},
+    {0x01312, 0x01315}, {0x01318, 0x0135a}, {0x01380, 0x0138f},
+    {0x013a0, 0x013f5}, {0x013f8, 0x013fd}, {0x01401, 0x0166c},
+    {0x0166f, 0x0167f}, {0x01681, 0x0169a}, {0x016a0, 0x016ea},
+    {0x016ee, 0x016f8}, {0x01700, 0x01711}, {0x0171f, 0x01731},
+    {0x01740, 0x01751}, {0x01760, 0x0176c}, {0x0176e, 0x01770},
+    {0x01780, 0x017b3}, {0x017d7, 0x017d7}, {0x017dc, 0x017dc},
+    {0x01820, 0x01878}, {0x01880, 0x018a8}, {0x018aa, 0x018aa},
+    {0x018b0, 0x018f5}, {0x01900, 0x0191e}, {0x01950, 0x0196d},
+    {0x01970, 0x01974}, {0x01980, 0x019ab}, {0x019b0, 0x019c9},
+    {0x01a00, 0x01a16}, {0x01a20, 0x01a54}, {0x01aa7, 0x01aa7},
+    {0x01b05, 0x01b33}, {0x01b45, 0x01b4c}, {0x01b83, 0x01ba0},
+    {0x01bae, 0x01baf}, {0x01bba, 0x01be5}, {0x01c00, 0x01c23},
+    {0x01c4d, 0x01c4f}, {0x01c5a, 0x01c7d}, {0x01c80, 0x01c88},
+    {0x01c90, 0x01cba}, {0x01cbd, 0x01cbf}, {0x01ce9, 0x01cec},
+    {0x01cee, 0x01cf3}, {0x01cf5, 0x01cf6}, {0x01cfa, 0x01cfa},
+    {0x01d00, 0x01dbf}, {0x01e00, 0x01f15}, {0x01f18, 0x01f1d},
+    {0x01f20, 0x01f45}, {0x01f48, 0x01f4d}, {0x01f50, 0x01f57},
+    {0x01f59, 0x01f59}, {0x01f5b, 0x01f5b}, {0x01f5d, 0x01f5d},
+    {0x01f5f, 0x01f7d}, {0x01f80, 0x01fb4}, {0x01fb6, 0x01fbc},
+    {0x01fbe, 0x01fbe}, {0x01fc2, 0x01fc4}, {0x01fc6, 0x01fcc},
+    {0x01fd0, 0x01fd3}, {0x01fd6, 0x01fdb}, {0x01fe0, 0x01fec},
+    {0x01ff2, 0x01ff4}, {0x01ff6, 0x01ffc}, {0x02071, 0x02071},
+    {0x0207f, 0x0207f}, {0x02090, 0x0209c}, {0x02102, 0x02102},
+    {0x02107, 0x02107}, {0x0210a, 0x02113}, {0x02115, 0x02115},
+    {0x02118, 0x0211d}, {0x02124, 0x02124}, {0x02126, 0x02126},
+    {0x02128, 0x02128}, {0x0212a, 0x02139}, {0x0213c, 0x0213f},
+    {0x02145, 0x02149}, {0x0214e, 0x0214e}, {0x02160, 0x02188},
+    {0x02c00, 0x02ce4}, {0x02ceb, 0x02cee}, {0x02cf2, 0x02cf3},
+    {0x02d00, 0x02d25}, {0x02d27, 0x02d27}, {0x02d2d, 0x02d2d},
+    {0x02d30, 0x02d67}, {0x02d6f, 0x02d6f}, {0x02d80, 0x02d96},
+    {0x02da0, 0x02da6}, {0x02da8, 0x02dae}, {0x02db0, 0x02db6},
+    {0x02db8, 0x02dbe}, {0x02dc0, 0x02dc6}, {0x02dc8, 0x02dce},
+    {0x02dd0, 0x02dd6}, {0x02dd8, 0x02dde}, {0x03005, 0x03007},
+    {0x03021, 0x03029}, {0x03031, 0x03035}, {0x03038, 0x0303c},
+    {0x03041, 0x03096}, {0x0309d, 0x0309f}, {0x030a1, 0x030fa},
+    {0x030fc, 0x030ff}, {0x03105, 0x0312f}, {0x03131, 0x0318e},
+    {0x031a0, 0x031bf}, {0x031f0, 0x031ff}, {0x03400, 0x04dbf},
+    {0x04e00, 0x0a48c}, {0x0a4d0, 0x0a4fd}, {0x0a500, 0x0a60c},
+    {0x0a610, 0x0a61f}, {0x0a62a, 0x0a62b}, {0x0a640, 0x0a66e},
+    {0x0a67f, 0x0a69d}, {0x0a6a0, 0x0a6ef}, {0x0a717, 0x0a71f},
+    {0x0a722, 0x0a788}, {0x0a78b, 0x0a7ca}, {0x0a7d0, 0x0a7d1},
+    {0x0a7d3, 0x0a7d3}, {0x0a7d5, 0x0a7d9}, {0x0a7f2, 0x0a801},
+    {0x0a803, 0x0a805}, {0x0a807, 0x0a80a}, {0x0a80c, 0x0a822},
+    {0x0a840, 0x0a873}, {0x0a882, 0x0a8b3}, {0x0a8f2, 0x0a8f7},
+    {0x0a8fb, 0x0a8fb}, {0x0a8fd, 0x0a8fe}, {0x0a90a, 0x0a925},
+    {0x0a930, 0x0a946}, {0x0a960, 0x0a97c}, {0x0a984, 0x0a9b2},
+    {0x0a9cf, 0x0a9cf}, {0x0a9e0, 0x0a9e4}, {0x0a9e6, 0x0a9ef},
+    {0x0a9fa, 0x0a9fe}, {0x0aa00, 0x0aa28}, {0x0aa40, 0x0aa42},
+    {0x0aa44, 0x0aa4b}, {0x0aa60, 0x0aa76}, {0x0aa7a, 0x0aa7a},
+    {0x0aa7e, 0x0aaaf}, {0x0aab1, 0x0aab1}, {0x0aab5, 0x0aab6},
+    {0x0aab9, 0x0aabd}, {0x0aac0, 0x0aac0}, {0x0aac2, 0x0aac2},
+    {0x0aadb, 0x0aadd}, {0x0aae0, 0x0aaea}, {0x0aaf2, 0x0aaf4},
+    {0x0ab01, 0x0ab06}, {0x0ab09, 0x0ab0e}, {0x0ab11, 0x0ab16},
+    {0x0ab20, 0x0ab26}, {0x0ab28, 0x0ab2e}, {0x0ab30, 0x0ab5a},
+    {0x0ab5c, 0x0ab69}, {0x0ab70, 0x0abe2}, {0x0ac00, 0x0d7a3},
+    {0x0d7b0, 0x0d7c6}, {0x0d7cb, 0x0d7fb}, {0x0f900, 0x0fa6d},
+    {0x0fa70, 0x0fad9}, {0x0fb00, 0x0fb06}, {0x0fb13, 0x0fb17},
+    {0x0fb1d, 0x0fb1d}, {0x0fb1f, 0x0fb28}, {0x0fb2a, 0x0fb36},
+    {0x0fb38, 0x0fb3c}, {0x0fb3e, 0x0fb3e}, {0x0fb40, 0x0fb41},
+    {0x0fb43, 0x0fb44}, {0x0fb46, 0x0fbb1}, {0x0fbd3, 0x0fc5d},
+    {0x0fc64, 0x0fd3d}, {0x0fd50, 0x0fd8f}, {0x0fd92, 0x0fdc7},
+    {0x0fdf0, 0x0fdf9}, {0x0fe71, 0x0fe71}, {0x0fe73, 0x0fe73},
+    {0x0fe77, 0x0fe77}, {0x0fe79, 0x0fe79}, {0x0fe7b, 0x0fe7b},
+    {0x0fe7d, 0x0fe7d}, {0x0fe7f, 0x0fefc}, {0x0ff21, 0x0ff3a},
+    {0x0ff41, 0x0ff5a}, {0x0ff66, 0x0ff9d}, {0x0ffa0, 0x0ffbe},
+    {0x0ffc2, 0x0ffc7}, {0x0ffca, 0x0ffcf}, {0x0ffd2, 0x0ffd7},
+    {0x0ffda, 0x0ffdc}, {0x10000, 0x1000b}, {0x1000d, 0x10026},
+    {0x10028, 0x1003a}, {0x1003c, 0x1003d}, {0x1003f, 0x1004d},
+    {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10140, 0x10174},
+    {0x10280, 0x1029c}, {0x102a0, 0x102d0}, {0x10300, 0x1031f},
+    {0x1032d, 0x1034a}, {0x10350, 0x10375}, {0x10380, 0x1039d},
+    {0x103a0, 0x103c3}, {0x103c8, 0x103cf}, {0x103d1, 0x103d5},
+    {0x10400, 0x1049d}, {0x104b0, 0x104d3}, {0x104d8, 0x104fb},
+    {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10570, 0x1057a},
+    {0x1057c, 0x1058a}, {0x1058c, 0x10592}, {0x10594, 0x10595},
+    {0x10597, 0x105a1}, {0x105a3, 0x105b1}, {0x105b3, 0x105b9},
+    {0x105bb, 0x105bc}, {0x10600, 0x10736}, {0x10740, 0x10755},
+    {0x10760, 0x10767}, {0x10780, 0x10785}, {0x10787, 0x107b0},
+    {0x107b2, 0x107ba}, {0x10800, 0x10805}, {0x10808, 0x10808},
+    {0x1080a, 0x10835}, {0x10837, 0x10838}, {0x1083c, 0x1083c},
+    {0x1083f, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089e},
+    {0x108e0, 0x108f2}, {0x108f4, 0x108f5}, {0x10900, 0x10915},
+    {0x10920, 0x10939}, {0x10980, 0x109b7}, {0x109be, 0x109bf},
+    {0x10a00, 0x10a00}, {0x10a10, 0x10a13}, {0x10a15, 0x10a17},
+    {0x10a19, 0x10a35}, {0x10a60, 0x10a7c}, {0x10a80, 0x10a9c},
+    {0x10ac0, 0x10ac7}, {0x10ac9, 0x10ae4}, {0x10b00, 0x10b35},
+    {0x10b40, 0x10b55}, {0x10b60, 0x10b72}, {0x10b80, 0x10b91},
+    {0x10c00, 0x10c48}, {0x10c80, 0x10cb2}, {0x10cc0, 0x10cf2},
+    {0x10d00, 0x10d23}, {0x10e80, 0x10ea9}, {0x10eb0, 0x10eb1},
+    {0x10f00, 0x10f1c}, {0x10f27, 0x10f27}, {0x10f30, 0x10f45},
+    {0x10f70, 0x10f81}, {0x10fb0, 0x10fc4}, {0x10fe0, 0x10ff6},
+    {0x11003, 0x11037}, {0x11071, 0x11072}, {0x11075, 0x11075},
+    {0x11083, 0x110af}, {0x110d0, 0x110e8}, {0x11103, 0x11126},
+    {0x11144, 0x11144}, {0x11147, 0x11147}, {0x11150, 0x11172},
+    {0x11176, 0x11176}, {0x11183, 0x111b2}, {0x111c1, 0x111c4},
+    {0x111da, 0x111da}, {0x111dc, 0x111dc}, {0x11200, 0x11211},
+    {0x11213, 0x1122b}, {0x11280, 0x11286}, {0x11288, 0x11288},
+    {0x1128a, 0x1128d}, {0x1128f, 0x1129d}, {0x1129f, 0x112a8},
+    {0x112b0, 0x112de}, {0x11305, 0x1130c}, {0x1130f, 0x11310},
+    {0x11313, 0x11328}, {0x1132a, 0x11330}, {0x11332, 0x11333},
+    {0x11335, 0x11339}, {0x1133d, 0x1133d}, {0x11350, 0x11350},
+    {0x1135d, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144a},
+    {0x1145f, 0x11461}, {0x11480, 0x114af}, {0x114c4, 0x114c5},
+    {0x114c7, 0x114c7}, {0x11580, 0x115ae}, {0x115d8, 0x115db},
+    {0x11600, 0x1162f}, {0x11644, 0x11644}, {0x11680, 0x116aa},
+    {0x116b8, 0x116b8}, {0x11700, 0x1171a}, {0x11740, 0x11746},
+    {0x11800, 0x1182b}, {0x118a0, 0x118df}, {0x118ff, 0x11906},
+    {0x11909, 0x11909}, {0x1190c, 0x11913}, {0x11915, 0x11916},
+    {0x11918, 0x1192f}, {0x1193f, 0x1193f}, {0x11941, 0x11941},
+    {0x119a0, 0x119a7}, {0x119aa, 0x119d0}, {0x119e1, 0x119e1},
+    {0x119e3, 0x119e3}, {0x11a00, 0x11a00}, {0x11a0b, 0x11a32},
+    {0x11a3a, 0x11a3a}, {0x11a50, 0x11a50}, {0x11a5c, 0x11a89},
+    {0x11a9d, 0x11a9d}, {0x11ab0, 0x11af8}, {0x11c00, 0x11c08},
+    {0x11c0a, 0x11c2e}, {0x11c40, 0x11c40}, {0x11c72, 0x11c8f},
+    {0x11d00, 0x11d06}, {0x11d08, 0x11d09}, {0x11d0b, 0x11d30},
+    {0x11d46, 0x11d46}, {0x11d60, 0x11d65}, {0x11d67, 0x11d68},
+    {0x11d6a, 0x11d89}, {0x11d98, 0x11d98}, {0x11ee0, 0x11ef2},
+    {0x11fb0, 0x11fb0}, {0x12000, 0x12399}, {0x12400, 0x1246e},
+    {0x12480, 0x12543}, {0x12f90, 0x12ff0}, {0x13000, 0x1342e},
+    {0x14400, 0x14646}, {0x16800, 0x16a38}, {0x16a40, 0x16a5e},
+    {0x16a70, 0x16abe}, {0x16ad0, 0x16aed}, {0x16b00, 0x16b2f},
+    {0x16b40, 0x16b43}, {0x16b63, 0x16b77}, {0x16b7d, 0x16b8f},
+    {0x16e40, 0x16e7f}, {0x16f00, 0x16f4a}, {0x16f50, 0x16f50},
+    {0x16f93, 0x16f9f}, {0x16fe0, 0x16fe1}, {0x16fe3, 0x16fe3},
+    {0x17000, 0x187f7}, {0x18800, 0x18cd5}, {0x18d00, 0x18d08},
+    {0x1aff0, 0x1aff3}, {0x1aff5, 0x1affb}, {0x1affd, 0x1affe},
+    {0x1b000, 0x1b122}, {0x1b150, 0x1b152}, {0x1b164, 0x1b167},
+    {0x1b170, 0x1b2fb}, {0x1bc00, 0x1bc6a}, {0x1bc70, 0x1bc7c},
+    {0x1bc80, 0x1bc88}, {0x1bc90, 0x1bc99}, {0x1d400, 0x1d454},
+    {0x1d456, 0x1d49c}, {0x1d49e, 0x1d49f}, {0x1d4a2, 0x1d4a2},
+    {0x1d4a5, 0x1d4a6}, {0x1d4a9, 0x1d4ac}, {0x1d4ae, 0x1d4b9},
+    {0x1d4bb, 0x1d4bb}, {0x1d4bd, 0x1d4c3}, {0x1d4c5, 0x1d505},
+    {0x1d507, 0x1d50a}, {0x1d50d, 0x1d514}, {0x1d516, 0x1d51c},
+    {0x1d51e, 0x1d539}, {0x1d53b, 0x1d53e}, {0x1d540, 0x1d544},
+    {0x1d546, 0x1d546}, {0x1d54a, 0x1d550}, {0x1d552, 0x1d6a5},
+    {0x1d6a8, 0x1d6c0}, {0x1d6c2, 0x1d6da}, {0x1d6dc, 0x1d6fa},
+    {0x1d6fc, 0x1d714}, {0x1d716, 0x1d734}, {0x1d736, 0x1d74e},
+    {0x1d750, 0x1d76e}, {0x1d770, 0x1d788}, {0x1d78a, 0x1d7a8},
+    {0x1d7aa, 0x1d7c2}, {0x1d7c4, 0x1d7cb}, {0x1df00, 0x1df1e},
+    {0x1e100, 0x1e12c}, {0x1e137, 0x1e13d}, {0x1e14e, 0x1e14e},
+    {0x1e290, 0x1e2ad}, {0x1e2c0, 0x1e2eb}, {0x1e7e0, 0x1e7e6},
+    {0x1e7e8, 0x1e7eb}, {0x1e7ed, 0x1e7ee}, {0x1e7f0, 0x1e7fe},
+    {0x1e800, 0x1e8c4}, {0x1e900, 0x1e943}, {0x1e94b, 0x1e94b},
+    {0x1ee00, 0x1ee03}, {0x1ee05, 0x1ee1f}, {0x1ee21, 0x1ee22},
+    {0x1ee24, 0x1ee24}, {0x1ee27, 0x1ee27}, {0x1ee29, 0x1ee32},
+    {0x1ee34, 0x1ee37}, {0x1ee39, 0x1ee39}, {0x1ee3b, 0x1ee3b},
+    {0x1ee42, 0x1ee42}, {0x1ee47, 0x1ee47}, {0x1ee49, 0x1ee49},
+    {0x1ee4b, 0x1ee4b}, {0x1ee4d, 0x1ee4f}, {0x1ee51, 0x1ee52},
+    {0x1ee54, 0x1ee54}, {0x1ee57, 0x1ee57}, {0x1ee59, 0x1ee59},
+    {0x1ee5b, 0x1ee5b}, {0x1ee5d, 0x1ee5d}, {0x1ee5f, 0x1ee5f},
+    {0x1ee61, 0x1ee62}, {0x1ee64, 0x1ee64}, {0x1ee67, 0x1ee6a},
+    {0x1ee6c, 0x1ee72}, {0x1ee74, 0x1ee77}, {0x1ee79, 0x1ee7c},
+    {0x1ee7e, 0x1ee7e}, {0x1ee80, 0x1ee89}, {0x1ee8b, 0x1ee9b},
+    {0x1eea1, 0x1eea3}, {0x1eea5, 0x1eea9}, {0x1eeab, 0x1eebb},
+    {0x20000, 0x2a6df}, {0x2a700, 0x2b738}, {0x2b740, 0x2b81d},
+    {0x2b820, 0x2cea1}, {0x2ceb0, 0x2ebe0}, {0x2f800, 0x2fa1d},
+    {0x30000, 0x3134a},
+};
+
+// Number of ranges in kXIDStartRanges
+constexpr size_t kNumXIDStartRanges =
+    sizeof(kXIDStartRanges) / sizeof(kXIDStartRanges[0]);
+
+// The additional code point interval ranges for the Unicode 14 XID_Continue
+// set. XID_Continue is the union of these ranges and kXIDStartRanges.
+// The array must stay in ascending order, as it is binary searched.
+constexpr CodePointRange kXIDContinueRanges[] = {
+    {0x00030, 0x00039}, {0x0005f, 0x0005f}, {0x000b7, 0x000b7},
+    {0x00300, 0x0036f}, {0x00387, 0x00387}, {0x00483, 0x00487},
+    {0x00591, 0x005bd}, {0x005bf, 0x005bf}, {0x005c1, 0x005c2},
+    {0x005c4, 0x005c5}, {0x005c7, 0x005c7}, {0x00610, 0x0061a},
+    {0x0064b, 0x00669}, {0x00670, 0x00670}, {0x006d6, 0x006dc},
+    {0x006df, 0x006e4}, {0x006e7, 0x006e8}, {0x006ea, 0x006ed},
+    {0x006f0, 0x006f9}, {0x00711, 0x00711}, {0x00730, 0x0074a},
+    {0x007a6, 0x007b0}, {0x007c0, 0x007c9}, {0x007eb, 0x007f3},
+    {0x007fd, 0x007fd}, {0x00816, 0x00819}, {0x0081b, 0x00823},
+    {0x00825, 0x00827}, {0x00829, 0x0082d}, {0x00859, 0x0085b},
+    {0x00898, 0x0089f}, {0x008ca, 0x008e1}, {0x008e3, 0x00903},
+    {0x0093a, 0x0093c}, {0x0093e, 0x0094f}, {0x00951, 0x00957},
+    {0x00962, 0x00963}, {0x00966, 0x0096f}, {0x00981, 0x00983},
+    {0x009bc, 0x009bc}, {0x009be, 0x009c4}, {0x009c7, 0x009c8},
+    {0x009cb, 0x009cd}, {0x009d7, 0x009d7}, {0x009e2, 0x009e3},
+    {0x009e6, 0x009ef}, {0x009fe, 0x009fe}, {0x00a01, 0x00a03},
+    {0x00a3c, 0x00a3c}, {0x00a3e, 0x00a42}, {0x00a47, 0x00a48},
+    {0x00a4b, 0x00a4d}, {0x00a51, 0x00a51}, {0x00a66, 0x00a71},
+    {0x00a75, 0x00a75}, {0x00a81, 0x00a83}, {0x00abc, 0x00abc},
+    {0x00abe, 0x00ac5}, {0x00ac7, 0x00ac9}, {0x00acb, 0x00acd},
+    {0x00ae2, 0x00ae3}, {0x00ae6, 0x00aef}, {0x00afa, 0x00aff},
+    {0x00b01, 0x00b03}, {0x00b3c, 0x00b3c}, {0x00b3e, 0x00b44},
+    {0x00b47, 0x00b48}, {0x00b4b, 0x00b4d}, {0x00b55, 0x00b57},
+    {0x00b62, 0x00b63}, {0x00b66, 0x00b6f}, {0x00b82, 0x00b82},
+    {0x00bbe, 0x00bc2}, {0x00bc6, 0x00bc8}, {0x00bca, 0x00bcd},
+    {0x00bd7, 0x00bd7}, {0x00be6, 0x00bef}, {0x00c00, 0x00c04},
+    {0x00c3c, 0x00c3c}, {0x00c3e, 0x00c44}, {0x00c46, 0x00c48},
+    {0x00c4a, 0x00c4d}, {0x00c55, 0x00c56}, {0x00c62, 0x00c63},
+    {0x00c66, 0x00c6f}, {0x00c81, 0x00c83}, {0x00cbc, 0x00cbc},
+    {0x00cbe, 0x00cc4}, {0x00cc6, 0x00cc8}, {0x00cca, 0x00ccd},
+    {0x00cd5, 0x00cd6}, {0x00ce2, 0x00ce3}, {0x00ce6, 0x00cef},
+    {0x00d00, 0x00d03}, {0x00d3b, 0x00d3c}, {0x00d3e, 0x00d44},
+    {0x00d46, 0x00d48}, {0x00d4a, 0x00d4d}, {0x00d57, 0x00d57},
+    {0x00d62, 0x00d63}, {0x00d66, 0x00d6f}, {0x00d81, 0x00d83},
+    {0x00dca, 0x00dca}, {0x00dcf, 0x00dd4}, {0x00dd6, 0x00dd6},
+    {0x00dd8, 0x00ddf}, {0x00de6, 0x00def}, {0x00df2, 0x00df3},
+    {0x00e31, 0x00e31}, {0x00e33, 0x00e3a}, {0x00e47, 0x00e4e},
+    {0x00e50, 0x00e59}, {0x00eb1, 0x00eb1}, {0x00eb3, 0x00ebc},
+    {0x00ec8, 0x00ecd}, {0x00ed0, 0x00ed9}, {0x00f18, 0x00f19},
+    {0x00f20, 0x00f29}, {0x00f35, 0x00f35}, {0x00f37, 0x00f37},
+    {0x00f39, 0x00f39}, {0x00f3e, 0x00f3f}, {0x00f71, 0x00f84},
+    {0x00f86, 0x00f87}, {0x00f8d, 0x00f97}, {0x00f99, 0x00fbc},
+    {0x00fc6, 0x00fc6}, {0x0102b, 0x0103e}, {0x01040, 0x01049},
+    {0x01056, 0x01059}, {0x0105e, 0x01060}, {0x01062, 0x01064},
+    {0x01067, 0x0106d}, {0x01071, 0x01074}, {0x01082, 0x0108d},
+    {0x0108f, 0x0109d}, {0x0135d, 0x0135f}, {0x01369, 0x01371},
+    {0x01712, 0x01715}, {0x01732, 0x01734}, {0x01752, 0x01753},
+    {0x01772, 0x01773}, {0x017b4, 0x017d3}, {0x017dd, 0x017dd},
+    {0x017e0, 0x017e9}, {0x0180b, 0x0180d}, {0x0180f, 0x01819},
+    {0x018a9, 0x018a9}, {0x01920, 0x0192b}, {0x01930, 0x0193b},
+    {0x01946, 0x0194f}, {0x019d0, 0x019da}, {0x01a17, 0x01a1b},
+    {0x01a55, 0x01a5e}, {0x01a60, 0x01a7c}, {0x01a7f, 0x01a89},
+    {0x01a90, 0x01a99}, {0x01ab0, 0x01abd}, {0x01abf, 0x01ace},
+    {0x01b00, 0x01b04}, {0x01b34, 0x01b44}, {0x01b50, 0x01b59},
+    {0x01b6b, 0x01b73}, {0x01b80, 0x01b82}, {0x01ba1, 0x01bad},
+    {0x01bb0, 0x01bb9}, {0x01be6, 0x01bf3}, {0x01c24, 0x01c37},
+    {0x01c40, 0x01c49}, {0x01c50, 0x01c59}, {0x01cd0, 0x01cd2},
+    {0x01cd4, 0x01ce8}, {0x01ced, 0x01ced}, {0x01cf4, 0x01cf4},
+    {0x01cf7, 0x01cf9}, {0x01dc0, 0x01dff}, {0x0203f, 0x02040},
+    {0x02054, 0x02054}, {0x020d0, 0x020dc}, {0x020e1, 0x020e1},
+    {0x020e5, 0x020f0}, {0x02cef, 0x02cf1}, {0x02d7f, 0x02d7f},
+    {0x02de0, 0x02dff}, {0x0302a, 0x0302f}, {0x03099, 0x0309a},
+    {0x0a620, 0x0a629}, {0x0a66f, 0x0a66f}, {0x0a674, 0x0a67d},
+    {0x0a69e, 0x0a69f}, {0x0a6f0, 0x0a6f1}, {0x0a802, 0x0a802},
+    {0x0a806, 0x0a806}, {0x0a80b, 0x0a80b}, {0x0a823, 0x0a827},
+    {0x0a82c, 0x0a82c}, {0x0a880, 0x0a881}, {0x0a8b4, 0x0a8c5},
+    {0x0a8d0, 0x0a8d9}, {0x0a8e0, 0x0a8f1}, {0x0a8ff, 0x0a909},
+    {0x0a926, 0x0a92d}, {0x0a947, 0x0a953}, {0x0a980, 0x0a983},
+    {0x0a9b3, 0x0a9c0}, {0x0a9d0, 0x0a9d9}, {0x0a9e5, 0x0a9e5},
+    {0x0a9f0, 0x0a9f9}, {0x0aa29, 0x0aa36}, {0x0aa43, 0x0aa43},
+    {0x0aa4c, 0x0aa4d}, {0x0aa50, 0x0aa59}, {0x0aa7b, 0x0aa7d},
+    {0x0aab0, 0x0aab0}, {0x0aab2, 0x0aab4}, {0x0aab7, 0x0aab8},
+    {0x0aabe, 0x0aabf}, {0x0aac1, 0x0aac1}, {0x0aaeb, 0x0aaef},
+    {0x0aaf5, 0x0aaf6}, {0x0abe3, 0x0abea}, {0x0abec, 0x0abed},
+    {0x0abf0, 0x0abf9}, {0x0fb1e, 0x0fb1e}, {0x0fe00, 0x0fe0f},
+    {0x0fe20, 0x0fe2f}, {0x0fe33, 0x0fe34}, {0x0fe4d, 0x0fe4f},
+    {0x0ff10, 0x0ff19}, {0x0ff3f, 0x0ff3f}, {0x0ff9e, 0x0ff9f},
+    {0x101fd, 0x101fd}, {0x102e0, 0x102e0}, {0x10376, 0x1037a},
+    {0x104a0, 0x104a9}, {0x10a01, 0x10a03}, {0x10a05, 0x10a06},
+    {0x10a0c, 0x10a0f}, {0x10a38, 0x10a3a}, {0x10a3f, 0x10a3f},
+    {0x10ae5, 0x10ae6}, {0x10d24, 0x10d27}, {0x10d30, 0x10d39},
+    {0x10eab, 0x10eac}, {0x10f46, 0x10f50}, {0x10f82, 0x10f85},
+    {0x11000, 0x11002}, {0x11038, 0x11046}, {0x11066, 0x11070},
+    {0x11073, 0x11074}, {0x1107f, 0x11082}, {0x110b0, 0x110ba},
+    {0x110c2, 0x110c2}, {0x110f0, 0x110f9}, {0x11100, 0x11102},
+    {0x11127, 0x11134}, {0x11136, 0x1113f}, {0x11145, 0x11146},
+    {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111b3, 0x111c0},
+    {0x111c9, 0x111cc}, {0x111ce, 0x111d9}, {0x1122c, 0x11237},
+    {0x1123e, 0x1123e}, {0x112df, 0x112ea}, {0x112f0, 0x112f9},
+    {0x11300, 0x11303}, {0x1133b, 0x1133c}, {0x1133e, 0x11344},
+    {0x11347, 0x11348}, {0x1134b, 0x1134d}, {0x11357, 0x11357},
+    {0x11362, 0x11363}, {0x11366, 0x1136c}, {0x11370, 0x11374},
+    {0x11435, 0x11446}, {0x11450, 0x11459}, {0x1145e, 0x1145e},
+    {0x114b0, 0x114c3}, {0x114d0, 0x114d9}, {0x115af, 0x115b5},
+    {0x115b8, 0x115c0}, {0x115dc, 0x115dd}, {0x11630, 0x11640},
+    {0x11650, 0x11659}, {0x116ab, 0x116b7}, {0x116c0, 0x116c9},
+    {0x1171d, 0x1172b}, {0x11730, 0x11739}, {0x1182c, 0x1183a},
+    {0x118e0, 0x118e9}, {0x11930, 0x11935}, {0x11937, 0x11938},
+    {0x1193b, 0x1193e}, {0x11940, 0x11940}, {0x11942, 0x11943},
+    {0x11950, 0x11959}, {0x119d1, 0x119d7}, {0x119da, 0x119e0},
+    {0x119e4, 0x119e4}, {0x11a01, 0x11a0a}, {0x11a33, 0x11a39},
+    {0x11a3b, 0x11a3e}, {0x11a47, 0x11a47}, {0x11a51, 0x11a5b},
+    {0x11a8a, 0x11a99}, {0x11c2f, 0x11c36}, {0x11c38, 0x11c3f},
+    {0x11c50, 0x11c59}, {0x11c92, 0x11ca7}, {0x11ca9, 0x11cb6},
+    {0x11d31, 0x11d36}, {0x11d3a, 0x11d3a}, {0x11d3c, 0x11d3d},
+    {0x11d3f, 0x11d45}, {0x11d47, 0x11d47}, {0x11d50, 0x11d59},
+    {0x11d8a, 0x11d8e}, {0x11d90, 0x11d91}, {0x11d93, 0x11d97},
+    {0x11da0, 0x11da9}, {0x11ef3, 0x11ef6}, {0x16a60, 0x16a69},
+    {0x16ac0, 0x16ac9}, {0x16af0, 0x16af4}, {0x16b30, 0x16b36},
+    {0x16b50, 0x16b59}, {0x16f4f, 0x16f4f}, {0x16f51, 0x16f87},
+    {0x16f8f, 0x16f92}, {0x16fe4, 0x16fe4}, {0x16ff0, 0x16ff1},
+    {0x1bc9d, 0x1bc9e}, {0x1cf00, 0x1cf2d}, {0x1cf30, 0x1cf46},
+    {0x1d165, 0x1d169}, {0x1d16d, 0x1d172}, {0x1d17b, 0x1d182},
+    {0x1d185, 0x1d18b}, {0x1d1aa, 0x1d1ad}, {0x1d242, 0x1d244},
+    {0x1d7ce, 0x1d7ff}, {0x1da00, 0x1da36}, {0x1da3b, 0x1da6c},
+    {0x1da75, 0x1da75}, {0x1da84, 0x1da84}, {0x1da9b, 0x1da9f},
+    {0x1daa1, 0x1daaf}, {0x1e000, 0x1e006}, {0x1e008, 0x1e018},
+    {0x1e01b, 0x1e021}, {0x1e023, 0x1e024}, {0x1e026, 0x1e02a},
+    {0x1e130, 0x1e136}, {0x1e140, 0x1e149}, {0x1e2ae, 0x1e2ae},
+    {0x1e2ec, 0x1e2f9}, {0x1e8d0, 0x1e8d6}, {0x1e944, 0x1e94a},
+    {0x1e950, 0x1e959}, {0x1fbf0, 0x1fbf9}, {0xe0100, 0xe01ef},
+};
+
+// Number of ranges in kXIDContinueRanges
+constexpr size_t kNumXIDContinueRanges =
+    sizeof(kXIDContinueRanges) / sizeof(kXIDContinueRanges[0]);
+
+}  // namespace
+
+bool CodePoint::IsXIDStart() const {
+  const auto* const end = kXIDStartRanges + kNumXIDStartRanges;
+  return std::binary_search(kXIDStartRanges, end, *this);
+}
+
+bool CodePoint::IsXIDContinue() const {
+  // XID_Continue is a superset of XID_Start; only the extras are tabled here.
+  const auto* const end = kXIDContinueRanges + kNumXIDContinueRanges;
+  return IsXIDStart() || std::binary_search(kXIDContinueRanges, end, *this);
+}
+
+std::ostream& operator<<(std::ostream& out, CodePoint code_point) {
+  if (code_point < 0x7f) {
+    // ASCII is written as a quoted character, using the simple escape
+    // sequences from https://en.cppreference.com/w/cpp/language/escape
+    static constexpr char kEscapes[] = "\a\b\f\n\r\t\v";
+    static constexpr const char* kQuoted[] = {
+        R"('\a')", R"('\b')", R"('\f')", R"('\n')",
+        R"('\r')", R"('\t')", R"('\v')",
+    };
+    for (size_t i = 0; i < sizeof(kQuoted) / sizeof(kQuoted[0]); i++) {
+      if (code_point.value == static_cast<uint32_t>(kEscapes[i])) {
+        return out << kQuoted[i];
+      }
+    }
+    return out << "'" << static_cast<char>(code_point) << "'";
+  }
+  // Everything else is written as 'U+<hex>'. std::hex is sticky, so put the
+  // stream's formatting flags back afterwards rather than leaking hex mode.
+  const std::ios_base::fmtflags saved_flags = out.flags();
+  out << "'U+" << std::hex << code_point.value << "'";
+  out.flags(saved_flags);
+  return out;
+}
+
+std::pair<CodePoint, size_t> utf8::Decode(const uint8_t* ptr, size_t len) {
+  if (len < 1) {
+    return {};
+  }
+
+  // Lookup table for the first byte of a UTF-8 sequence.
+  // 0 indicates an invalid length.
+  // Note that bit encodings that can fit in a smaller number of bytes are
+  // invalid (e.g. 0xc0). Code points that exceed the unicode maximum of
+  // 0x10FFFF are also invalid (0xf5+).
+  // See: https://en.wikipedia.org/wiki/UTF-8#Encoding and
+  //      https://datatracker.ietf.org/doc/html/rfc3629#section-3
+  static constexpr uint8_t kSequenceLength[256] = {
+      //         0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
+      /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      /* 0xc0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      /* 0xf0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+
+  const uint8_t n = kSequenceLength[ptr[0]];
+  if (n == 0 || n > len) {
+    return {};  // Invalid lead byte, or the sequence is truncated.
+  }
+
+  // Payload bits held by the lead byte, and the smallest code point that
+  // genuinely requires a sequence of that length. Both are indexed by n.
+  static constexpr uint8_t kLeadMask[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  static constexpr uint32_t kMinValue[] = {0, 0, 0x80, 0x800, 0x10000};
+
+  uint32_t value = static_cast<uint32_t>(ptr[0] & kLeadMask[n]);
+  for (uint8_t i = 1; i < n; i++) {
+    // Every trailing byte must be a continuation byte of the form 10xxxxxx.
+    if ((ptr[i] & 0b11000000) != 0b10000000) {
+      return {};
+    }
+    value = (value << 6) | static_cast<uint32_t>(ptr[i] & 0b00111111);
+  }
+
+  // Reject what the lead byte table cannot catch: overlong 3 and 4 byte
+  // encodings, UTF-16 surrogates, and values beyond the Unicode maximum.
+  // See: https://datatracker.ietf.org/doc/html/rfc3629#section-3
+  const bool is_overlong = value < kMinValue[n];
+  const bool is_surrogate = value >= 0xd800 && value <= 0xdfff;
+  if (is_overlong || is_surrogate || value > 0x10ffff) {
+    return {};
+  }
+
+  return {CodePoint{value}, n};
+}
+
+}  // namespace tint::text
diff --git a/src/text/unicode.h b/src/text/unicode.h
new file mode 100644
index 0000000..c1ef915
--- /dev/null
+++ b/src/text/unicode.h
@@ -0,0 +1,76 @@
+// Copyright 2022 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SRC_TEXT_UNICODE_H_
+#define SRC_TEXT_UNICODE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <utility>
+
+namespace tint::text {
+
+/// CodePoint holds the value of a single unicode code point.
+struct CodePoint {
+  /// Default constructor. The value is zero-initialized.
+  CodePoint() = default;
+
+  /// Constructs a CodePoint with the given value.
+  /// @param v the code point value
+  explicit CodePoint(uint32_t v) : value{v} {}
+
+  /// @returns the code point value, allowing use as a plain integer
+  operator uint32_t() const { return value; }
+
+  /// Replaces the code point value.
+  /// @param v the new value for the code point
+  /// @returns this CodePoint
+  CodePoint& operator=(uint32_t v) {
+    value = v;
+    return *this;
+  }
+
+  /// @returns true if this CodePoint is in the XID_Start set.
+  /// @see https://unicode.org/reports/tr31/
+  bool IsXIDStart() const;
+
+  /// @returns true if this CodePoint is in the XID_Continue set.
+  /// @see https://unicode.org/reports/tr31/
+  bool IsXIDContinue() const;
+
+  /// The code point value
+  uint32_t value = 0;
+};
+
+/// Writes the CodePoint to the std::ostream.
+/// @param out the std::ostream to write to
+/// @param code_point the CodePoint to write
+/// @returns out so calls can be chained
+std::ostream& operator<<(std::ostream& out, CodePoint code_point);
+
+namespace utf8 {
+
+/// Decodes the first code point of a UTF-8 encoded byte sequence.
+/// @param ptr the pointer to the first byte of the UTF-8 sequence
+/// @param len the maximum number of bytes that may be read from ptr
+/// @returns a pair of the decoded CodePoint and its width in code units
+///          (bytes), or [0,0] if the next code point cannot be decoded.
+std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
+
+}  // namespace utf8
+
+}  // namespace tint::text
+
+#endif  // SRC_TEXT_UNICODE_H_
diff --git a/src/text/unicode_test.cc b/src/text/unicode_test.cc
new file mode 100644
index 0000000..75212ba
--- /dev/null
+++ b/src/text/unicode_test.cc
@@ -0,0 +1,485 @@
+// Copyright 2022 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/text/unicode.h"
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+
+/// Helper for constructing a CodePoint
+#define C(x) CodePoint(x)
+
+namespace tint::text {
+
+////////////////////////////////////////////////////////////////////////////////
+// CodePoint character set tests
+////////////////////////////////////////////////////////////////////////////////
+namespace {
+
+struct CodePointCase {
+  CodePoint code_point;
+  bool is_xid_start;
+  bool is_xid_continue;
+};
+
+std::ostream& operator<<(std::ostream& out, CodePointCase c) {
+  return out << c.code_point;
+}
+
+class CodePointTest : public testing::TestWithParam<CodePointCase> {};
+
+TEST_P(CodePointTest, CharacterSets) {
+  const CodePointCase& tc = GetParam();  // code point + expected membership
+  EXPECT_EQ(tc.code_point.IsXIDStart(), tc.is_xid_start);
+  EXPECT_EQ(tc.code_point.IsXIDContinue(), tc.is_xid_continue);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    CodePointTest,
+    CodePointTest,
+    ::testing::ValuesIn({
+        CodePointCase{C(' '), /* start */ false, /* continue */ false},
+        CodePointCase{C('\t'), /* start */ false, /* continue */ false},
+        CodePointCase{C('\n'), /* start */ false, /* continue */ false},
+        CodePointCase{C('\r'), /* start */ false, /* continue */ false},
+        CodePointCase{C('!'), /* start */ false, /* continue */ false},
+        CodePointCase{C('"'), /* start */ false, /* continue */ false},
+        CodePointCase{C('#'), /* start */ false, /* continue */ false},
+        CodePointCase{C('$'), /* start */ false, /* continue */ false},
+        CodePointCase{C('%'), /* start */ false, /* continue */ false},
+        CodePointCase{C('&'), /* start */ false, /* continue */ false},
+        CodePointCase{C('\\'), /* start */ false, /* continue */ false},
+        CodePointCase{C('/'), /* start */ false, /* continue */ false},
+        CodePointCase{C('('), /* start */ false, /* continue */ false},
+        CodePointCase{C(')'), /* start */ false, /* continue */ false},
+        CodePointCase{C('*'), /* start */ false, /* continue */ false},
+        CodePointCase{C(','), /* start */ false, /* continue */ false},
+        CodePointCase{C('-'), /* start */ false, /* continue */ false},
+        CodePointCase{C('.'), /* start */ false, /* continue */ false},
+        CodePointCase{C('`'), /* start */ false, /* continue */ false},
+        CodePointCase{C('@'), /* start */ false, /* continue */ false},
+        CodePointCase{C('^'), /* start */ false, /* continue */ false},
+        CodePointCase{C('['), /* start */ false, /* continue */ false},
+        CodePointCase{C(']'), /* start */ false, /* continue */ false},
+        CodePointCase{C('|'), /* start */ false, /* continue */ false},
+        CodePointCase{C('{'), /* start */ false, /* continue */ false},
+        CodePointCase{C(';'), /* start */ false, /* continue */ false},
+        CodePointCase{C('}'), /* start */ false, /* continue */ false},
+        CodePointCase{C('a'), /* start */ true, /* continue */ true},
+        CodePointCase{C('b'), /* start */ true, /* continue */ true},
+        CodePointCase{C('c'), /* start */ true, /* continue */ true},
+        CodePointCase{C('x'), /* start */ true, /* continue */ true},
+        CodePointCase{C('y'), /* start */ true, /* continue */ true},
+        CodePointCase{C('z'), /* start */ true, /* continue */ true},
+        CodePointCase{C('A'), /* start */ true, /* continue */ true},
+        CodePointCase{C('B'), /* start */ true, /* continue */ true},
+        CodePointCase{C('C'), /* start */ true, /* continue */ true},
+        CodePointCase{C('X'), /* start */ true, /* continue */ true},
+        CodePointCase{C('Y'), /* start */ true, /* continue */ true},
+        CodePointCase{C('Z'), /* start */ true, /* continue */ true},
+        CodePointCase{C('_'), /* start */ false, /* continue */ true},
+        CodePointCase{C('0'), /* start */ false, /* continue */ true},
+        CodePointCase{C('1'), /* start */ false, /* continue */ true},
+        CodePointCase{C('2'), /* start */ false, /* continue */ true},
+        CodePointCase{C('8'), /* start */ false, /* continue */ true},
+        CodePointCase{C('9'), /* start */ false, /* continue */ true},
+        CodePointCase{C('5'), /* start */ false, /* continue */ true},
+
+        // First in XID_Start
+        CodePointCase{C(0x00041), /* start */ true, /* continue */ true},
+        // Last in XID_Start
+        CodePointCase{C(0x3134a), /* start */ true, /* continue */ true},
+
+        // Random selection from XID_Start, using the interval's first
+        CodePointCase{C(0x002ee), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x005ef), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x009f0), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00d3d), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00d54), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00e86), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00edc), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x01c00), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x01c80), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x02071), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x02dd0), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x0a4d0), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x0aac0), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x0ab5c), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x0ffda), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x11313), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x1ee49), /* start */ true, /* continue */ true},
+
+        // Random selection from XID_Start, using the interval's last
+        CodePointCase{C(0x00710), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00b83), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00b9a), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x00ec4), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x01081), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x012be), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x02107), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x03029), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x03035), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x0aadd), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x10805), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x11075), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x1d4a2), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x1e7fe), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x1ee27), /* start */ true, /* continue */ true},
+        CodePointCase{C(0x2b738), /* start */ true, /* continue */ true},
+
+        // Random selection from XID_Continue, using the interval's first
+        CodePointCase{C(0x16ac0), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00dca), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x16f4f), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0fe00), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00ec8), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x009be), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x11d47), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x11d50), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0a926), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0aac1), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00f18), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x11145), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x017dd), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0aaeb), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x11173), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00a51), /* start */ false, /* continue */ true},
+
+        // Random selection from XID_Continue, using the interval's last
+        CodePointCase{C(0x00f84), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x10a3a), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x1e018), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0a827), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x01abd), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x009d7), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00b6f), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0096f), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x11146), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x10eac), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00f39), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x1e136), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00def), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0fe34), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x009c8), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00fbc), /* start */ false, /* continue */ true},
+
+        // Random code points one less than the first of an XID_Start interval
+        CodePointCase{C(0x003f6), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x005ee), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x009ef), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00d3c), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x00d53), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00e85), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00edb), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x01bff), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x02070), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x02dcf), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x0a4cf), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x0aabf), /* start */ false, /* continue */ true},
+        CodePointCase{C(0x0ab5b), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x0ffd9), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x11312), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x1ee48), /* start */ false, /* continue */ false},
+
+        // Random code points one more than the last of an XID_Continue interval
+        CodePointCase{C(0x00060), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00a4e), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00a84), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00cce), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00eda), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x00f85), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x01b74), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x01c38), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x0fe30), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x11174), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x112eb), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x115de), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x1172c), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x11a3f), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x11c37), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x11d92), /* start */ false, /* continue */ false},
+        CodePointCase{C(0x1e2af), /* start */ false, /* continue */ false},
+    }));
+
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// DecodeUTF8 valid tests
+////////////////////////////////////////////////////////////////////////////////
+namespace {
+
+struct CodePointAndWidth {
+  CodePoint code_point;
+  size_t width;
+};
+
+bool operator==(const CodePointAndWidth& a, const CodePointAndWidth& b) {
+  return a.code_point == b.code_point && a.width == b.width;
+}
+
+std::ostream& operator<<(std::ostream& out, CodePointAndWidth cpw) {
+  return out << "code_point: " << cpw.code_point << ", width: " << cpw.width;
+}
+
+struct DecodeUTF8Case {
+  std::string string;
+  std::vector<CodePointAndWidth> expected;
+};
+
+std::ostream& operator<<(std::ostream& out, const DecodeUTF8Case& c) {
+  return out << "'" << c.string << "'";
+}
+
+class DecodeUTF8Test : public testing::TestWithParam<DecodeUTF8Case> {};
+
+TEST_P(DecodeUTF8Test, Valid) {
+  const DecodeUTF8Case& test_case = GetParam();
+  const std::string& utf8_text = test_case.string;
+  auto* bytes = reinterpret_cast<const uint8_t*>(utf8_text.data());
+
+  // Decode one code point at a time until every byte has been consumed
+  std::vector<CodePointAndWidth> decoded;
+  for (size_t pos = 0; pos < utf8_text.size();) {
+    const size_t remaining = utf8_text.size() - pos;
+    auto [code_point, width] = utf8::Decode(bytes + pos, remaining);
+    if (width == 0) {
+      FAIL() << "Decode() failed at byte offset " << pos;
+    }
+    decoded.push_back(CodePointAndWidth{code_point, width});
+    pos += width;
+  }
+
+  EXPECT_THAT(decoded, ::testing::ElementsAreArray(test_case.expected));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AsciiLetters,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({
+        DecodeUTF8Case{"a", {{C('a'), 1}}},
+        DecodeUTF8Case{"abc", {{C('a'), 1}, {C('b'), 1}, {C('c'), 1}}},
+        DecodeUTF8Case{"def", {{C('d'), 1}, {C('e'), 1}, {C('f'), 1}}},
+        DecodeUTF8Case{"gh", {{C('g'), 1}, {C('h'), 1}}},
+        DecodeUTF8Case{"ij", {{C('i'), 1}, {C('j'), 1}}},
+        DecodeUTF8Case{"klm", {{C('k'), 1}, {C('l'), 1}, {C('m'), 1}}},
+        DecodeUTF8Case{"nop", {{C('n'), 1}, {C('o'), 1}, {C('p'), 1}}},
+        DecodeUTF8Case{"qr", {{C('q'), 1}, {C('r'), 1}}},
+        DecodeUTF8Case{"stu", {{C('s'), 1}, {C('t'), 1}, {C('u'), 1}}},
+        DecodeUTF8Case{"vw", {{C('v'), 1}, {C('w'), 1}}},
+        DecodeUTF8Case{"xyz", {{C('x'), 1}, {C('y'), 1}, {C('z'), 1}}},
+        DecodeUTF8Case{"A", {{C('A'), 1}}},
+        DecodeUTF8Case{"ABC", {{C('A'), 1}, {C('B'), 1}, {C('C'), 1}}},
+        DecodeUTF8Case{"DEF", {{C('D'), 1}, {C('E'), 1}, {C('F'), 1}}},
+        DecodeUTF8Case{"GH", {{C('G'), 1}, {C('H'), 1}}},
+        DecodeUTF8Case{"IJ", {{C('I'), 1}, {C('J'), 1}}},
+        DecodeUTF8Case{"KLM", {{C('K'), 1}, {C('L'), 1}, {C('M'), 1}}},
+        DecodeUTF8Case{"NOP", {{C('N'), 1}, {C('O'), 1}, {C('P'), 1}}},
+        DecodeUTF8Case{"QR", {{C('Q'), 1}, {C('R'), 1}}},
+        DecodeUTF8Case{"STU", {{C('S'), 1}, {C('T'), 1}, {C('U'), 1}}},
+        DecodeUTF8Case{"VW", {{C('V'), 1}, {C('W'), 1}}},
+        DecodeUTF8Case{"XYZ", {{C('X'), 1}, {C('Y'), 1}, {C('Z'), 1}}},
+    }));
+
+INSTANTIATE_TEST_SUITE_P(
+    AsciiNumbers,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({
+        DecodeUTF8Case{"012", {{C('0'), 1}, {C('1'), 1}, {C('2'), 1}}},
+        DecodeUTF8Case{"345", {{C('3'), 1}, {C('4'), 1}, {C('5'), 1}}},
+        DecodeUTF8Case{"678", {{C('6'), 1}, {C('7'), 1}, {C('8'), 1}}},
+        DecodeUTF8Case{"9", {{C('9'), 1}}},
+    }));
+
+INSTANTIATE_TEST_SUITE_P(
+    AsciiSymbols,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({
+        DecodeUTF8Case{"!\"#", {{C('!'), 1}, {C('"'), 1}, {C('#'), 1}}},
+        DecodeUTF8Case{"$%&", {{C('$'), 1}, {C('%'), 1}, {C('&'), 1}}},
+        DecodeUTF8Case{"'()", {{C('\''), 1}, {C('('), 1}, {C(')'), 1}}},
+        DecodeUTF8Case{"*,-", {{C('*'), 1}, {C(','), 1}, {C('-'), 1}}},
+        DecodeUTF8Case{"/`@", {{C('/'), 1}, {C('`'), 1}, {C('@'), 1}}},
+        DecodeUTF8Case{"^\\[", {{C('^'), 1}, {C('\\'), 1}, {C('['), 1}}},
+        DecodeUTF8Case{"]_|", {{C(']'), 1}, {C('_'), 1}, {C('|'), 1}}},
+        DecodeUTF8Case{"{}", {{C('{'), 1}, {C('}'), 1}}},
+    }));
+
+INSTANTIATE_TEST_SUITE_P(
+    AsciiSpecial,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({
+        DecodeUTF8Case{"", {}},
+        DecodeUTF8Case{" \t\n", {{C(' '), 1}, {C('\t'), 1}, {C('\n'), 1}}},
+        DecodeUTF8Case{"\a\b\f", {{C('\a'), 1}, {C('\b'), 1}, {C('\f'), 1}}},
+        DecodeUTF8Case{"\n\r\t", {{C('\n'), 1}, {C('\r'), 1}, {C('\t'), 1}}},
+        DecodeUTF8Case{"\v", {{C('\v'), 1}}},
+    }));
+
+INSTANTIATE_TEST_SUITE_P(
+    Hindi,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({DecodeUTF8Case{
+        // नमस्ते दुनिया
+        "\xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5"
+        "\x87\x20\xe0\xa4\xa6\xe0\xa5\x81\xe0\xa4\xa8\xe0\xa4\xbf\xe0\xa4\xaf"
+        "\xe0\xa4\xbe",
+        {
+            {C(0x0928), 3},  // न
+            {C(0x092e), 3},  // म
+            {C(0x0938), 3},  // स
+            {C(0x094d), 3},  // ्
+            {C(0x0924), 3},  // त
+            {C(0x0947), 3},  // े
+            {C(' '), 1},
+            {C(0x0926), 3},  // द
+            {C(0x0941), 3},  // ु
+            {C(0x0928), 3},  // न
+            {C(0x093f), 3},  // ि
+            {C(0x092f), 3},  // य
+            {C(0x093e), 3},  // ा
+        },
+    }}));
+
+INSTANTIATE_TEST_SUITE_P(Mandarin,
+                         DecodeUTF8Test,
+                         ::testing::ValuesIn({DecodeUTF8Case{
+                             // 你好世界
+                             "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c",
+                             {
+                                 {C(0x4f60), 3},  // 你
+                                 {C(0x597d), 3},  // 好
+                                 {C(0x4e16), 3},  // 世
+                                 {C(0x754c), 3},  // 界
+                             },
+                         }}));
+
+INSTANTIATE_TEST_SUITE_P(Japanese,
+                         DecodeUTF8Test,
+                         ::testing::ValuesIn({DecodeUTF8Case{
+                             // こんにちは世界
+                             "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1"
+                             "\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c",
+                             {
+                                 {C(0x3053), 3},  // こ
+                                 {C(0x3093), 3},  // ん
+                                 {C(0x306B), 3},  // に
+                                 {C(0x3061), 3},  // ち
+                                 {C(0x306F), 3},  // は
+                                 {C(0x4E16), 3},  // 世
+                                 {C(0x754C), 3},  // 界
+                             },
+                         }}));
+
+INSTANTIATE_TEST_SUITE_P(Korean,
+                         DecodeUTF8Test,
+                         ::testing::ValuesIn({DecodeUTF8Case{
+                             // 안녕하세요 세계
+                             "\xec\x95\x88\xeb\x85\x95\xed\x95\x98\xec\x84\xb8"
+                             "\xec\x9a\x94\x20\xec\x84\xb8\xea\xb3\x84",
+                             {
+                                 {C(0xc548), 3},  // 안
+                                 {C(0xb155), 3},  // 녕
+                                 {C(0xd558), 3},  // 하
+                                 {C(0xc138), 3},  // 세
+                                 {C(0xc694), 3},  // 요
+                                 {C(' '), 1},     // (space)
+                                 {C(0xc138), 3},  // 세
+                                 {C(0xacc4), 3},  // 계
+                             },
+                         }}));
+
+INSTANTIATE_TEST_SUITE_P(Emoji,
+                         DecodeUTF8Test,
+                         ::testing::ValuesIn({DecodeUTF8Case{
+                             // 👋🌎
+                             "\xf0\x9f\x91\x8b\xf0\x9f\x8c\x8e",
+                             {
+                                 {C(0x1f44b), 4},  // 👋
+                                 {C(0x1f30e), 4},  // 🌎
+                             },
+                         }}));
+
+INSTANTIATE_TEST_SUITE_P(
+    Random,
+    DecodeUTF8Test,
+    ::testing::ValuesIn({DecodeUTF8Case{
+        // Øⓑꚫ쁹Ǵ𐌒岾🥍ⴵ㍨又ᮗ
+        "\xc3\x98\xe2\x93\x91\xea\x9a\xab\xec\x81\xb9\xc7\xb4\xf0\x90\x8c\x92"
+        "\xe5\xb2\xbe\xf0\x9f\xa5\x8d\xe2\xb4\xb5\xe3\x8d\xa8\xe5\x8f\x88\xe1"
+        "\xae\x97",
+        {
+            {C(0x000d8), 2},  // Ø
+            {C(0x024d1), 3},  // ⓑ
+            {C(0x0a6ab), 3},  // ꚫ
+            {C(0x0c079), 3},  // 쁹
+            {C(0x001f4), 2},  // Ǵ
+            {C(0x10312), 4},  // 𐌒
+            {C(0x05cbe), 3},  // 岾
+            {C(0x1f94d), 4},  // 🥍
+            {C(0x02d35), 3},  // ⴵ
+            {C(0x03368), 3},  // ㍨
+            {C(0x053c8), 3},  // 又
+            {C(0x01b97), 3},  // ᮗ
+        },
+    }}));
+
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// DecodeUTF8 invalid tests
+////////////////////////////////////////////////////////////////////////////////
+namespace {
+class DecodeUTF8InvalidTest : public testing::TestWithParam<const char*> {};
+
+TEST_P(DecodeUTF8InvalidTest, Invalid) {
+  // The parameter is a null-terminated byte sequence that is not valid utf8
+  const std::string bytes(GetParam());
+  auto* data = reinterpret_cast<const uint8_t*>(bytes.data());
+
+  // Decode() signals failure by returning a zero code point and zero width
+  auto [code_point, width] = utf8::Decode(data, bytes.size());
+  EXPECT_EQ(code_point, CodePoint(0));
+  EXPECT_EQ(width, 0u);
+}
+
+INSTANTIATE_TEST_SUITE_P(Invalid,
+                         DecodeUTF8InvalidTest,
+                         ::testing::ValuesIn({
+                             "\x80\x80\x80\x80",  // 10000000 (continuation)
+                             "\x81\x80\x80\x80",  // 10000001 (continuation)
+                             "\x8f\x80\x80\x80",  // 10001111 (continuation)
+                             "\x90\x80\x80\x80",  // 10010000 (continuation)
+                             "\x91\x80\x80\x80",  // 10010001 (continuation)
+                             "\x9f\x80\x80\x80",  // 10011111 (continuation)
+                             "\xa0\x80\x80\x80",  // 10100000 (continuation)
+                             "\xa1\x80\x80\x80",  // 10100001 (continuation)
+                             "\xaf\x80\x80\x80",  // 10101111 (continuation)
+                             "\xb0\x80\x80\x80",  // 10110000 (continuation)
+                             "\xb1\x80\x80\x80",  // 10110001 (continuation)
+                             "\xbf\x80\x80\x80",  // 10111111 (continuation)
+                             "\xc0\x80\x80\x80",  // 11000000 (overlong)
+                             "\xc1\x80\x80\x80",  // 11000001 (overlong)
+                             "\xf5\x80\x80\x80",  // 11110101 (> U+10FFFF)
+                             "\xf6\x80\x80\x80",  // 11110110 (> U+10FFFF)
+                             "\xf7\x80\x80\x80",  // 11110111 (> U+10FFFF)
+                             "\xf8\x80\x80\x80",  // 11111000 (invalid lead)
+                             "\xfe\x80\x80\x80",  // 11111110 (invalid lead)
+                             "\xff\x80\x80\x80",  // 11111111 (invalid lead)
+                         }));
+
+}  // namespace
+
+}  // namespace tint::text
diff --git a/test/BUILD.gn b/test/BUILD.gn
index a1a99f6..638a767 100644
--- a/test/BUILD.gn
+++ b/test/BUILD.gn
@@ -301,6 +301,12 @@
   ]
 }
 
+tint_unittests_source_set("tint_unittests_text_src") {
+  sources = [
+    "../src/text/unicode_test.cc",
+  ]
+}
+
 tint_unittests_source_set("tint_unittests_transform_src") {
   sources = [
     "../src/transform/add_empty_entry_point_test.cc",
@@ -736,6 +742,7 @@
     ":tint_unittests_inspector_src",
     ":tint_unittests_resolver_src",
     ":tint_unittests_sem_src",
+    ":tint_unittests_text_src",
     ":tint_unittests_transform_src",
     ":tint_unittests_utils_src",
     ":tint_unittests_writer_src",