Validate that UTF-8 conversion is in range.
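When decoding UTF-8 characters, we checked the individual bytes but did
not validate that the resulting value was within the range of code points
that the number of bytes decoded is allowed to represent. For example, the
3-byte sequence 0xE0 0x9D 0x81 decodes to U+0741, which is below the
smallest code point (U+0800) that may be encoded in 3 bytes.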
This CL adds a check to the UTF-8 decoder to validate that the resulting
`uint32_t` value is within range for the number of source bytes.
Bug: 388039766
Change-Id: Ie7ed6a19e84f04356e3fdeba578281cf966145f6
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/221375
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Commit-Queue: dan sinclair <dsinclair@chromium.org>
Reviewed-by: Ryan Harrison <rharrison@chromium.org>
Auto-Submit: dan sinclair <dsinclair@chromium.org>
diff --git a/src/tint/utils/text/unicode.cc b/src/tint/utils/text/unicode.cc
index 1c10135..638165f 100644
--- a/src/tint/utils/text/unicode.cc
+++ b/src/tint/utils/text/unicode.cc
@@ -419,6 +419,17 @@
n = 0;
c = 0;
}
+
+ // Validate code point range. After decoding, a character encoded in N bytes must fall within
+ // the range of code points that requires exactly N bytes in UTF-8.
+ uint32_t v = c;
+ if ((n == 1 && (v > 0x0000'007f)) || //
+ (n == 2 && (v < 0x0000'0080 || v > 0x0000'07ff)) || //
+ (n == 3 && (v < 0x0000'0800 || v > 0x0000'ffff)) || //
+ (n == 4 && (v < 0x0001'0000 || v > 0x0010'ffff))) {
+ return {};
+ }
+
return {c, n};
}
diff --git a/src/tint/utils/text/unicode_test.cc b/src/tint/utils/text/unicode_test.cc
index 204bc76..48eb9fb 100644
--- a/src/tint/utils/text/unicode_test.cc
+++ b/src/tint/utils/text/unicode_test.cc
@@ -558,6 +558,8 @@
{0xf4, 0xff, 0x8f, 0x8f}, // 4-bytes, second byte's second-MSB set
{0xf4, 0x8f, 0xff, 0x8f}, // 4-bytes, third byte's second-MSB set
{0xf4, 0x8f, 0x8f, 0xff}, // 4-bytes, fourth byte's second-MSB set
+
+ {0xe0, 0x9d, 0x81}, // Value out of range for 3-byte
}));
} // namespace utf8_tests