[tint][utils] Expand unicode support
* Add utf8::Encode()
* Add utf16::Decode() and utf16::Encode()
Change-Id: I1d73c59bb49f19a3c168b814ba1ae234723c4057
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/183000
Reviewed-by: James Price <jrprice@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
diff --git a/src/tint/utils/text/unicode.cc b/src/tint/utils/text/unicode.cc
index 5e4a4b7..157f5da 100644
--- a/src/tint/utils/text/unicode.cc
+++ b/src/tint/utils/text/unicode.cc
@@ -424,6 +424,40 @@
return Decode(reinterpret_cast<const uint8_t*>(utf8_string.data()), utf8_string.size());
}
+size_t Encode(CodePoint code_point, uint8_t* ptr) {
+ if (code_point <= 0x7f) {
+ if (ptr) {
+ ptr[0] = static_cast<uint8_t>(code_point);
+ }
+ return 1;
+ }
+ if (code_point <= 0x7ff) {
+ if (ptr) {
+ ptr[0] = static_cast<uint8_t>(code_point >> 6) | 0b11000000;
+ ptr[1] = static_cast<uint8_t>(code_point & 0b00111111) | 0b10000000;
+ }
+ return 2;
+ }
+ if (code_point <= 0xffff) {
+ if (ptr) {
+ ptr[0] = static_cast<uint8_t>(code_point >> 12) | 0b11100000;
+ ptr[1] = static_cast<uint8_t>((code_point >> 6) & 0b00111111) | 0b10000000;
+ ptr[2] = static_cast<uint8_t>(code_point & 0b00111111) | 0b10000000;
+ }
+ return 3;
+ }
+ if (code_point <= 0x10ffff) {
+ if (ptr) {
+ ptr[0] = static_cast<uint8_t>(code_point >> 18) | 0b11110000;
+ ptr[1] = static_cast<uint8_t>((code_point >> 12) & 0b00111111) | 0b10000000;
+ ptr[2] = static_cast<uint8_t>((code_point >> 6) & 0b00111111) | 0b10000000;
+ ptr[3] = static_cast<uint8_t>(code_point & 0b00111111) | 0b10000000;
+ }
+ return 4;
+ }
+ return 0; // invalid code point
+}
+
bool IsASCII(std::string_view str) {
for (auto c : str) {
if (c & 0x80) {
@@ -435,4 +469,49 @@
} // namespace utf8
+namespace utf16 {
+
+std::pair<CodePoint, size_t> Decode(const uint16_t* ptr, size_t len) {
+ if (len < 1) {
+ return {};
+ }
+ uint16_t a = ptr[0];
+ if (a <= 0xd7ff || a >= 0xe000) {
+ return {CodePoint{static_cast<uint32_t>(a)}, 1};
+ }
+ if (len < 2) {
+ return {};
+ }
+ uint32_t b = ptr[1];
+ if (b <= 0xd7ff || b >= 0xe000) {
+ return {};
+ }
+ uint32_t high = a - 0xd800;
+ uint32_t low = b - 0xdc00;
+ return {CodePoint{0x10000 + ((high << 10) | low)}, 2};
+}
+
+std::pair<CodePoint, size_t> Decode(std::string_view utf16_string) {
+ return Decode(reinterpret_cast<const uint16_t*>(utf16_string.data()), utf16_string.size() / 2);
+}
+
+size_t Encode(CodePoint code_point, uint16_t* ptr) {
+ if (code_point <= 0xd7ff || (code_point >= 0xe000 && code_point <= 0xffff)) {
+ if (ptr) {
+ ptr[0] = static_cast<uint16_t>(code_point);
+ }
+ return 1;
+ }
+ if (code_point >= 0x10000 && code_point <= 0x10ffff) {
+ if (ptr) {
+ auto biased = code_point - 0x10000;
+ ptr[0] = static_cast<uint16_t>((biased >> 10) + 0xd800);
+ ptr[1] = static_cast<uint16_t>((biased & 0b1111111111) + 0xdc00);
+ }
+ return 2;
+ }
+ return 0; // invalid code point
+}
+
+} // namespace utf16
} // namespace tint
diff --git a/src/tint/utils/text/unicode.h b/src/tint/utils/text/unicode.h
index c993996..76e9b2a 100644
--- a/src/tint/utils/text/unicode.h
+++ b/src/tint/utils/text/unicode.h
@@ -71,23 +71,56 @@
/// Decodes the first code point in the utf8 string.
/// @param ptr the pointer to the first byte of the utf8 sequence
-/// @param len the maximum number of bytes to read
-/// @returns a pair of CodePoint and width in code units (bytes).
+/// @param len the maximum number of uint8_t to read
+/// @returns a pair of CodePoint and width in code units (uint8_t).
/// If the next code point cannot be decoded then returns [0,0].
std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
/// Decodes the first code point in the utf8 string.
/// @param utf8_string the string view that contains the utf8 sequence
-/// @returns a pair of CodePoint and width in code units (bytes).
-/// If the next code point cannot be decoded then returns [0,0].
+/// @returns a pair of CodePoint and width in code units (uint8_t).
+/// If the next code point cannot be decoded, then returns [0,0].
std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
+/// Encodes a code point to the utf8 string buffer or queries the number of code units used to
+/// encode the code point.
+/// @param code_point the code point to encode.
+/// @param ptr the pointer to the utf8 string buffer, or nullptr to query the number of code units
+/// that would be written if @p ptr is not nullptr.
+/// @returns the number of code units written / would be written (at most 4).
+size_t Encode(CodePoint code_point, uint8_t* ptr);
+
/// @returns true if all the utf-8 code points in the string are ASCII
/// (code-points 0x00..0x7f).
bool IsASCII(std::string_view);
} // namespace utf8
+namespace utf16 {
+
+/// Decodes the first code point in the utf16 string.
+/// @param ptr the pointer to the first byte of the utf16 sequence
+/// @param len the maximum number of code units to read
+/// @returns a pair of CodePoint and width in code units (16-bit integers).
+/// If the next code point cannot be decoded then returns [0,0].
+std::pair<CodePoint, size_t> Decode(const uint16_t* ptr, size_t len);
+
+/// Decodes the first code point in the utf16 string.
+/// @param utf16_string the string view that contains the utf16 sequence
+/// @returns a pair of CodePoint and width in code units (16-bit integers).
+/// If the next code point cannot be decoded then returns [0,0].
+std::pair<CodePoint, size_t> Decode(std::string_view utf16_string);
+
+/// Encodes a code point to the utf16 string buffer or queries the number of code units used to
+/// encode the code point.
+/// @param code_point the code point to encode.
+/// @param ptr the pointer to the utf16 string buffer, or nullptr to query the number of code units
+/// that would be written if @p ptr is not nullptr.
+/// @returns the number of code units written / would be written (at most 2).
+size_t Encode(CodePoint code_point, uint16_t* ptr);
+
+} // namespace utf16
+
} // namespace tint
#endif // SRC_TINT_UTILS_TEXT_UNICODE_H_
diff --git a/src/tint/utils/text/unicode_test.cc b/src/tint/utils/text/unicode_test.cc
index c1a8c35..1887fd7 100644
--- a/src/tint/utils/text/unicode_test.cc
+++ b/src/tint/utils/text/unicode_test.cc
@@ -27,19 +27,19 @@
#include "src/tint/utils/text/unicode.h"
+#include <cstdint>
+#include <ios>
#include <string>
+#include <string_view>
#include <vector>
#include "gmock/gmock.h"
+#include "src/tint/utils/text/string.h"
/// Helper for constructing a CodePoint
#define C(x) CodePoint(x)
namespace tint {
-
-////////////////////////////////////////////////////////////////////////////////
-// CodePoint character set tests
-////////////////////////////////////////////////////////////////////////////////
namespace {
struct CodePointCase {
@@ -48,10 +48,30 @@
bool is_xid_continue;
};
-std::ostream& operator<<(std::ostream& out, CodePointCase c) {
+static std::ostream& operator<<(std::ostream& out, CodePointCase c) {
return out << c.code_point;
}
+struct CodePointAndWidth {
+ CodePoint code_point;
+ size_t width;
+};
+
+bool operator==(const CodePointAndWidth& a, const CodePointAndWidth& b) {
+ return a.code_point == b.code_point && a.width == b.width;
+}
+
+static std::ostream& operator<<(std::ostream& out, CodePointAndWidth cpw) {
+ return out << "code_point: " << cpw.code_point << ", width: " << cpw.width;
+}
+
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// CodePoint character set tests
+////////////////////////////////////////////////////////////////////////////////
+namespace {
+
class CodePointTest : public testing::TestWithParam<CodePointCase> {};
TEST_P(CodePointTest, CharacterSets) {
@@ -232,33 +252,26 @@
////////////////////////////////////////////////////////////////////////////////
// DecodeUTF8 valid tests
////////////////////////////////////////////////////////////////////////////////
-namespace {
+namespace utf8_tests {
-struct CodePointAndWidth {
- CodePoint code_point;
- size_t width;
+struct UTF8Case {
+ std::vector<uint8_t> string;
+ std::vector<CodePointAndWidth> code_points;
};
-bool operator==(const CodePointAndWidth& a, const CodePointAndWidth& b) {
- return a.code_point == b.code_point && a.width == b.width;
+static std::ostream& operator<<(std::ostream& out, UTF8Case c) {
+ for (size_t i = 0; i < c.string.size(); i++) {
+ if (i > 0) {
+ out << ", ";
+ }
+ out << "0x" << std::hex << std::setfill('0') << std::setw(2) << c.string[i];
+ }
+ return out;
}
-std::ostream& operator<<(std::ostream& out, CodePointAndWidth cpw) {
- return out << "code_point: " << cpw.code_point << ", width: " << cpw.width;
-}
+class UTF8Test : public testing::TestWithParam<UTF8Case> {};
-struct DecodeUTF8Case {
- std::string string;
- std::vector<CodePointAndWidth> expected;
-};
-
-std::ostream& operator<<(std::ostream& out, DecodeUTF8Case c) {
- return out << "'" << c.string << "'";
-}
-
-class DecodeUTF8Test : public testing::TestWithParam<DecodeUTF8Case> {};
-
-TEST_P(DecodeUTF8Test, Valid) {
+TEST_P(UTF8Test, Decode) {
auto param = GetParam();
const uint8_t* data = reinterpret_cast<const uint8_t*>(param.string.data());
@@ -275,75 +288,96 @@
got.emplace_back(CodePointAndWidth{code_point, width});
}
- EXPECT_THAT(got, ::testing::ElementsAreArray(param.expected));
+ EXPECT_THAT(got, ::testing::ElementsAreArray(param.code_points));
+}
+
+TEST_P(UTF8Test, Encode) {
+ auto param = GetParam();
+
+ Slice<const uint8_t> str{reinterpret_cast<const uint8_t*>(param.string.data()),
+ param.string.size()};
+ for (auto codepoint : param.code_points) {
+ EXPECT_EQ(utf8::Encode(codepoint.code_point, nullptr), codepoint.width);
+
+ uint8_t encoded[4];
+ size_t len = utf8::Encode(codepoint.code_point, encoded);
+ ASSERT_EQ(len, codepoint.width);
+ EXPECT_THAT(Slice<const uint8_t>(encoded, len),
+ ::testing::ElementsAreArray(str.Truncate(len)));
+ str = str.Offset(len);
+ }
}
INSTANTIATE_TEST_SUITE_P(AsciiLetters,
- DecodeUTF8Test,
+ UTF8Test,
::testing::ValuesIn({
- DecodeUTF8Case{"a", {{C('a'), 1}}},
- DecodeUTF8Case{"abc", {{C('a'), 1}, {C('b'), 1}, {C('c'), 1}}},
- DecodeUTF8Case{"def", {{C('d'), 1}, {C('e'), 1}, {C('f'), 1}}},
- DecodeUTF8Case{"gh", {{C('g'), 1}, {C('h'), 1}}},
- DecodeUTF8Case{"ij", {{C('i'), 1}, {C('j'), 1}}},
- DecodeUTF8Case{"klm", {{C('k'), 1}, {C('l'), 1}, {C('m'), 1}}},
- DecodeUTF8Case{"nop", {{C('n'), 1}, {C('o'), 1}, {C('p'), 1}}},
- DecodeUTF8Case{"qr", {{C('q'), 1}, {C('r'), 1}}},
- DecodeUTF8Case{"stu", {{C('s'), 1}, {C('t'), 1}, {C('u'), 1}}},
- DecodeUTF8Case{"vw", {{C('v'), 1}, {C('w'), 1}}},
- DecodeUTF8Case{"xyz", {{C('x'), 1}, {C('y'), 1}, {C('z'), 1}}},
- DecodeUTF8Case{"A", {{C('A'), 1}}},
- DecodeUTF8Case{"ABC", {{C('A'), 1}, {C('B'), 1}, {C('C'), 1}}},
- DecodeUTF8Case{"DEF", {{C('D'), 1}, {C('E'), 1}, {C('F'), 1}}},
- DecodeUTF8Case{"GH", {{C('G'), 1}, {C('H'), 1}}},
- DecodeUTF8Case{"IJ", {{C('I'), 1}, {C('J'), 1}}},
- DecodeUTF8Case{"KLM", {{C('K'), 1}, {C('L'), 1}, {C('M'), 1}}},
- DecodeUTF8Case{"NOP", {{C('N'), 1}, {C('O'), 1}, {C('P'), 1}}},
- DecodeUTF8Case{"QR", {{C('Q'), 1}, {C('R'), 1}}},
- DecodeUTF8Case{"STU", {{C('S'), 1}, {C('T'), 1}, {C('U'), 1}}},
- DecodeUTF8Case{"VW", {{C('V'), 1}, {C('W'), 1}}},
- DecodeUTF8Case{"XYZ", {{C('X'), 1}, {C('Y'), 1}, {C('Z'), 1}}},
+ UTF8Case{{'a'}, {{C('a'), 1}}},
+ UTF8Case{{'a', 'b', 'c'}, {{C('a'), 1}, {C('b'), 1}, {C('c'), 1}}},
+ UTF8Case{{'d', 'e', 'f'}, {{C('d'), 1}, {C('e'), 1}, {C('f'), 1}}},
+ UTF8Case{{'g', 'h'}, {{C('g'), 1}, {C('h'), 1}}},
+ UTF8Case{{'i', 'j'}, {{C('i'), 1}, {C('j'), 1}}},
+ UTF8Case{{'k', 'l', 'm'}, {{C('k'), 1}, {C('l'), 1}, {C('m'), 1}}},
+ UTF8Case{{'n', 'o', 'p'}, {{C('n'), 1}, {C('o'), 1}, {C('p'), 1}}},
+ UTF8Case{{'q', 'r'}, {{C('q'), 1}, {C('r'), 1}}},
+ UTF8Case{{'s', 't', 'u'}, {{C('s'), 1}, {C('t'), 1}, {C('u'), 1}}},
+ UTF8Case{{'v', 'w'}, {{C('v'), 1}, {C('w'), 1}}},
+ UTF8Case{{'x', 'y', 'z'}, {{C('x'), 1}, {C('y'), 1}, {C('z'), 1}}},
+ UTF8Case{{'A'}, {{C('A'), 1}}},
+ UTF8Case{{'A', 'B', 'C'}, {{C('A'), 1}, {C('B'), 1}, {C('C'), 1}}},
+ UTF8Case{{'D', 'E', 'F'}, {{C('D'), 1}, {C('E'), 1}, {C('F'), 1}}},
+ UTF8Case{{'G', 'H'}, {{C('G'), 1}, {C('H'), 1}}},
+ UTF8Case{{'I', 'J'}, {{C('I'), 1}, {C('J'), 1}}},
+ UTF8Case{{'K', 'L', 'M'}, {{C('K'), 1}, {C('L'), 1}, {C('M'), 1}}},
+ UTF8Case{{'N', 'O', 'P'}, {{C('N'), 1}, {C('O'), 1}, {C('P'), 1}}},
+ UTF8Case{{'Q', 'R'}, {{C('Q'), 1}, {C('R'), 1}}},
+ UTF8Case{{'S', 'T', 'U'}, {{C('S'), 1}, {C('T'), 1}, {C('U'), 1}}},
+ UTF8Case{{'V', 'W'}, {{C('V'), 1}, {C('W'), 1}}},
+ UTF8Case{{'X', 'Y', 'Z'}, {{C('X'), 1}, {C('Y'), 1}, {C('Z'), 1}}},
}));
INSTANTIATE_TEST_SUITE_P(AsciiNumbers,
- DecodeUTF8Test,
+ UTF8Test,
::testing::ValuesIn({
- DecodeUTF8Case{"012", {{C('0'), 1}, {C('1'), 1}, {C('2'), 1}}},
- DecodeUTF8Case{"345", {{C('3'), 1}, {C('4'), 1}, {C('5'), 1}}},
- DecodeUTF8Case{"678", {{C('6'), 1}, {C('7'), 1}, {C('8'), 1}}},
- DecodeUTF8Case{"9", {{C('9'), 1}}},
+ UTF8Case{{'0', '1', '2'}, {{C('0'), 1}, {C('1'), 1}, {C('2'), 1}}},
+ UTF8Case{{'3', '4', '5'}, {{C('3'), 1}, {C('4'), 1}, {C('5'), 1}}},
+ UTF8Case{{'6', '7', '8'}, {{C('6'), 1}, {C('7'), 1}, {C('8'), 1}}},
+ UTF8Case{{'9'}, {{C('9'), 1}}},
}));
INSTANTIATE_TEST_SUITE_P(AsciiSymbols,
- DecodeUTF8Test,
+ UTF8Test,
::testing::ValuesIn({
- DecodeUTF8Case{"!\"#", {{C('!'), 1}, {C('"'), 1}, {C('#'), 1}}},
- DecodeUTF8Case{"$%&", {{C('$'), 1}, {C('%'), 1}, {C('&'), 1}}},
- DecodeUTF8Case{"'()", {{C('\''), 1}, {C('('), 1}, {C(')'), 1}}},
- DecodeUTF8Case{"*,-", {{C('*'), 1}, {C(','), 1}, {C('-'), 1}}},
- DecodeUTF8Case{"/`@", {{C('/'), 1}, {C('`'), 1}, {C('@'), 1}}},
- DecodeUTF8Case{"^\\[", {{C('^'), 1}, {C('\\'), 1}, {C('['), 1}}},
- DecodeUTF8Case{"]_|", {{C(']'), 1}, {C('_'), 1}, {C('|'), 1}}},
- DecodeUTF8Case{"{}", {{C('{'), 1}, {C('}'), 1}}},
+ UTF8Case{{'!', '"', '#'}, {{C('!'), 1}, {C('"'), 1}, {C('#'), 1}}},
+ UTF8Case{{'$', '%', '&'}, {{C('$'), 1}, {C('%'), 1}, {C('&'), 1}}},
+ UTF8Case{{'\'', '(', ')'}, {{C('\''), 1}, {C('('), 1}, {C(')'), 1}}},
+ UTF8Case{{'*', ',', '-'}, {{C('*'), 1}, {C(','), 1}, {C('-'), 1}}},
+ UTF8Case{{'/', '`', '@'}, {{C('/'), 1}, {C('`'), 1}, {C('@'), 1}}},
+ UTF8Case{{'^', '\\', '['}, {{C('^'), 1}, {C('\\'), 1}, {C('['), 1}}},
+ UTF8Case{{']', '_', '|'}, {{C(']'), 1}, {C('_'), 1}, {C('|'), 1}}},
+ UTF8Case{{'{', '}'}, {{C('{'), 1}, {C('}'), 1}}},
}));
-INSTANTIATE_TEST_SUITE_P(AsciiSpecial,
- DecodeUTF8Test,
- ::testing::ValuesIn({
- DecodeUTF8Case{"", {}},
- DecodeUTF8Case{" \t\n", {{C(' '), 1}, {C('\t'), 1}, {C('\n'), 1}}},
- DecodeUTF8Case{"\a\b\f", {{C('\a'), 1}, {C('\b'), 1}, {C('\f'), 1}}},
- DecodeUTF8Case{"\n\r\t", {{C('\n'), 1}, {C('\r'), 1}, {C('\t'), 1}}},
- DecodeUTF8Case{"\v", {{C('\v'), 1}}},
- }));
+INSTANTIATE_TEST_SUITE_P(
+ AsciiSpecial,
+ UTF8Test,
+ ::testing::ValuesIn({
+ UTF8Case{{}, {}},
+ UTF8Case{{' ', '\t', '\n'}, {{C(' '), 1}, {C('\t'), 1}, {C('\n'), 1}}},
+ UTF8Case{{'\a', '\b', '\f'}, {{C('\a'), 1}, {C('\b'), 1}, {C('\f'), 1}}},
+ UTF8Case{{'\n', '\r', '\t'}, {{C('\n'), 1}, {C('\r'), 1}, {C('\t'), 1}}},
+ UTF8Case{{'\v'}, {{C('\v'), 1}}},
+ }));
INSTANTIATE_TEST_SUITE_P(Hindi,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// नमस्ते दुनिया
- "\xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5"
- "\x87\x20\xe0\xa4\xa6\xe0\xa5\x81\xe0\xa4\xa8\xe0\xa4\xbf\xe0\xa4\xaf"
- "\xe0\xa4\xbe",
+ {
+ 0xe0, 0xa4, 0xa8, 0xe0, 0xa4, 0xae, 0xe0, 0xa4, 0xb8, 0xe0,
+ 0xa5, 0x8d, 0xe0, 0xa4, 0xa4, 0xe0, 0xa5, 0x87, 0x20, 0xe0,
+ 0xa4, 0xa6, 0xe0, 0xa5, 0x81, 0xe0, 0xa4, 0xa8, 0xe0, 0xa4,
+ 0xbf, 0xe0, 0xa4, 0xaf, 0xe0, 0xa4, 0xbe,
+ },
{
{C(0x0928), 3}, // न
{C(0x092e), 3}, // म
@@ -362,10 +396,23 @@
}}));
INSTANTIATE_TEST_SUITE_P(Mandarin,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// 你好世界
- "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c",
+ {
+ 0xe4,
+ 0xbd,
+ 0xa0,
+ 0xe5,
+ 0xa5,
+ 0xbd,
+ 0xe4,
+ 0xb8,
+ 0x96,
+ 0xe7,
+ 0x95,
+ 0x8c,
+ },
{
{C(0x4f60), 3}, // 你
{C(0x597d), 3}, // 好
@@ -375,11 +422,13 @@
}}));
INSTANTIATE_TEST_SUITE_P(Japanese,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// こんにちは世界
- "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1"
- "\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c",
+ {
+ 0xe3, 0x81, 0x93, 0xe3, 0x82, 0x93, 0xe3, 0x81, 0xab, 0xe3, 0x81,
+ 0xa1, 0xe3, 0x81, 0xaf, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c,
+ },
{
{C(0x3053), 3}, // こ
{C(0x3093), 3}, // ん
@@ -392,11 +441,13 @@
}}));
INSTANTIATE_TEST_SUITE_P(Korean,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// 안녕하세요 세계
- "\xec\x95\x88\xeb\x85\x95\xed\x95\x98\xec\x84\xb8"
- "\xec\x9a\x94\x20\xec\x84\xb8\xea\xb3\x84",
+ {
+ 0xec, 0x95, 0x88, 0xeb, 0x85, 0x95, 0xed, 0x95, 0x98, 0xec, 0x84,
+ 0xb8, 0xec, 0x9a, 0x94, 0x20, 0xec, 0x84, 0xb8, 0xea, 0xb3, 0x84,
+ },
{
{C(0xc548), 3}, // 안
{C(0xb155), 3}, // 녕
@@ -410,10 +461,19 @@
}}));
INSTANTIATE_TEST_SUITE_P(Emoji,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// 👋🌎
- "\xf0\x9f\x91\x8b\xf0\x9f\x8c\x8e",
+ {
+ 0xf0,
+ 0x9f,
+ 0x91,
+ 0x8b,
+ 0xf0,
+ 0x9f,
+ 0x8c,
+ 0x8e,
+ },
{
{C(0x1f44b), 4}, // 👋
{C(0x1f30e), 4}, // 🌎
@@ -421,12 +481,15 @@
}}));
INSTANTIATE_TEST_SUITE_P(Random,
- DecodeUTF8Test,
- ::testing::ValuesIn({DecodeUTF8Case{
+ UTF8Test,
+ ::testing::ValuesIn({UTF8Case{
// Øⓑꚫ쁹Ǵ𐌒岾🥍ⴵ㍨又ᮗ
- "\xc3\x98\xe2\x93\x91\xea\x9a\xab\xec\x81\xb9\xc7\xb4\xf0\x90\x8c\x92"
- "\xe5\xb2\xbe\xf0\x9f\xa5\x8d\xe2\xb4\xb5\xe3\x8d\xa8\xe5\x8f\x88\xe1"
- "\xae\x97",
+ {
+ 0xc3, 0x98, 0xe2, 0x93, 0x91, 0xea, 0x9a, 0xab, 0xec,
+ 0x81, 0xb9, 0xc7, 0xb4, 0xf0, 0x90, 0x8c, 0x92, 0xe5,
+ 0xb2, 0xbe, 0xf0, 0x9f, 0xa5, 0x8d, 0xe2, 0xb4, 0xb5,
+ 0xe3, 0x8d, 0xa8, 0xe5, 0x8f, 0x88, 0xe1, 0xae, 0x97,
+ },
{
{C(0x000d8), 2}, // Ø
{C(0x024d1), 3}, // ⓑ
@@ -443,61 +506,336 @@
},
}}));
-} // namespace
-
////////////////////////////////////////////////////////////////////////////////
// DecodeUTF8 invalid tests
////////////////////////////////////////////////////////////////////////////////
-namespace {
-class DecodeUTF8InvalidTest : public testing::TestWithParam<const char*> {};
+class DecodeUTF8InvalidTest : public testing::TestWithParam<std::vector<uint8_t>> {};
TEST_P(DecodeUTF8InvalidTest, Invalid) {
- auto* param = GetParam();
-
- const uint8_t* data = reinterpret_cast<const uint8_t*>(param);
- const size_t len = std::string(param).size();
-
- auto [code_point, width] = utf8::Decode(data, len);
+ auto [code_point, width] = utf8::Decode(GetParam().data(), GetParam().size());
EXPECT_EQ(code_point, CodePoint(0));
EXPECT_EQ(width, 0u);
}
INSTANTIATE_TEST_SUITE_P(Invalid,
DecodeUTF8InvalidTest,
- ::testing::ValuesIn({
- "\x80\x80\x80\x80", // 10000000
- "\x81\x80\x80\x80", // 10000001
- "\x8f\x80\x80\x80", // 10001111
- "\x90\x80\x80\x80", // 10010000
- "\x91\x80\x80\x80", // 10010001
- "\x9f\x80\x80\x80", // 10011111
- "\xa0\x80\x80\x80", // 10100000
- "\xa1\x80\x80\x80", // 10100001
- "\xaf\x80\x80\x80", // 10101111
- "\xb0\x80\x80\x80", // 10110000
- "\xb1\x80\x80\x80", // 10110001
- "\xbf\x80\x80\x80", // 10111111
- "\xc0\x80\x80\x80", // 11000000
- "\xc1\x80\x80\x80", // 11000001
- "\xf5\x80\x80\x80", // 11110101
- "\xf6\x80\x80\x80", // 11110110
- "\xf7\x80\x80\x80", // 11110111
- "\xf8\x80\x80\x80", // 11111000
- "\xfe\x80\x80\x80", // 11111110
- "\xff\x80\x80\x80", // 11111111
+ ::testing::ValuesIn(std::vector<std::vector<uint8_t>>{
+ {0x80, 0x80, 0x80, 0x80}, // 10000000
+ {0x81, 0x80, 0x80, 0x80}, // 10000001
+ {0x8f, 0x80, 0x80, 0x80}, // 10001111
+ {0x90, 0x80, 0x80, 0x80}, // 10010000
+ {0x91, 0x80, 0x80, 0x80}, // 10010001
+ {0x9f, 0x80, 0x80, 0x80}, // 10011111
+ {0xa0, 0x80, 0x80, 0x80}, // 10100000
+ {0xa1, 0x80, 0x80, 0x80}, // 10100001
+ {0xaf, 0x80, 0x80, 0x80}, // 10101111
+ {0xb0, 0x80, 0x80, 0x80}, // 10110000
+ {0xb1, 0x80, 0x80, 0x80}, // 10110001
+ {0xbf, 0x80, 0x80, 0x80}, // 10111111
+ {0xc0, 0x80, 0x80, 0x80}, // 11000000
+ {0xc1, 0x80, 0x80, 0x80}, // 11000001
+ {0xf5, 0x80, 0x80, 0x80}, // 11110101
+ {0xf6, 0x80, 0x80, 0x80}, // 11110110
+ {0xf7, 0x80, 0x80, 0x80}, // 11110111
+ {0xf8, 0x80, 0x80, 0x80}, // 11111000
+ {0xfe, 0x80, 0x80, 0x80}, // 11111110
+ {0xff, 0x80, 0x80, 0x80}, // 11111111
- "\xd0", // 2-bytes, missing second byte
- "\xe8\x8f", // 3-bytes, missing third byte
- "\xf4\x8f\x8f", // 4-bytes, missing fourth byte
+ {0xd0}, // 2-bytes, missing second byte
+ {0xe8, 0x8f}, // 3-bytes, missing third byte
+ {0xf4, 0x8f, 0x8f}, // 4-bytes, missing fourth byte
- "\xd0\x7f", // 2-bytes, second byte MSB unset
- "\xe8\x7f\x8f", // 3-bytes, second byte MSB unset
- "\xe8\x8f\x7f", // 3-bytes, third byte MSB unset
- "\xf4\x7f\x8f\x8f", // 4-bytes, second byte MSB unset
- "\xf4\x8f\x7f\x8f", // 4-bytes, third byte MSB unset
- "\xf4\x8f\x8f\x7f", // 4-bytes, fourth byte MSB unset
+ {0xd0, 0x7f}, // 2-bytes, second byte MSB unset
+ {0xe8, 0x7f, 0x8f}, // 3-bytes, second byte MSB unset
+ {0xe8, 0x8f, 0x7f}, // 3-bytes, third byte MSB unset
+ {0xf4, 0x7f, 0x8f, 0x8f}, // 4-bytes, second byte MSB unset
+ {0xf4, 0x8f, 0x7f, 0x8f}, // 4-bytes, third byte MSB unset
+ {0xf4, 0x8f, 0x8f, 0x7f}, // 4-bytes, fourth byte MSB unset
}));
-} // namespace
+} // namespace utf8_tests
+////////////////////////////////////////////////////////////////////////////////
+// DecodeUTF16 valid tests
+////////////////////////////////////////////////////////////////////////////////
+namespace utf16_tests {
+
+struct UTF16Case {
+ std::vector<uint16_t> string;
+ std::vector<CodePointAndWidth> code_points;
+};
+
+static std::ostream& operator<<(std::ostream& out, UTF16Case c) {
+ for (size_t i = 0; i < c.string.size(); i++) {
+ if (i > 0) {
+ out << ", ";
+ }
+ out << "0x" << std::hex << std::setfill('0') << std::setw(4) << c.string[i];
+ }
+ return out;
+}
+
+class UTF16Test : public testing::TestWithParam<UTF16Case> {};
+
+TEST_P(UTF16Test, Decode) {
+ auto param = GetParam();
+
+ const uint16_t* data = reinterpret_cast<const uint16_t*>(param.string.data());
+ const size_t len = param.string.size();
+
+ std::vector<CodePointAndWidth> got;
+ size_t offset = 0;
+ while (offset < len) {
+ auto [code_point, width] = utf16::Decode(data + offset, len - offset);
+ if (width == 0) {
+ FAIL() << "Decode() failed at byte offset " << offset;
+ }
+ offset += width;
+ got.emplace_back(CodePointAndWidth{code_point, width});
+ }
+
+ EXPECT_THAT(got, ::testing::ElementsAreArray(param.code_points));
+}
+
+TEST_P(UTF16Test, Encode) {
+ auto param = GetParam();
+
+ Slice<const uint16_t> str{reinterpret_cast<const uint16_t*>(param.string.data()),
+ param.string.size()};
+ for (auto codepoint : param.code_points) {
+ EXPECT_EQ(utf16::Encode(codepoint.code_point, nullptr), codepoint.width);
+
+ uint16_t encoded[2];
+ size_t len = utf16::Encode(codepoint.code_point, encoded);
+ ASSERT_EQ(len, codepoint.width);
+ EXPECT_THAT(Slice<const uint16_t>(encoded, len),
+ ::testing::ElementsAreArray(str.Truncate(len)));
+ str = str.Offset(len);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(AsciiLetters,
+ UTF16Test,
+ ::testing::ValuesIn({
+ UTF16Case{{'a'}, {{C('a'), 1}}},
+ UTF16Case{{'a', 'b', 'c'}, {{C('a'), 1}, {C('b'), 1}, {C('c'), 1}}},
+ UTF16Case{{'d', 'e', 'f'}, {{C('d'), 1}, {C('e'), 1}, {C('f'), 1}}},
+ UTF16Case{{'g', 'h'}, {{C('g'), 1}, {C('h'), 1}}},
+ UTF16Case{{'i', 'j'}, {{C('i'), 1}, {C('j'), 1}}},
+ UTF16Case{{'k', 'l', 'm'}, {{C('k'), 1}, {C('l'), 1}, {C('m'), 1}}},
+ UTF16Case{{'n', 'o', 'p'}, {{C('n'), 1}, {C('o'), 1}, {C('p'), 1}}},
+ UTF16Case{{'q', 'r'}, {{C('q'), 1}, {C('r'), 1}}},
+ UTF16Case{{'s', 't', 'u'}, {{C('s'), 1}, {C('t'), 1}, {C('u'), 1}}},
+ UTF16Case{{'v', 'w'}, {{C('v'), 1}, {C('w'), 1}}},
+ UTF16Case{{'x', 'y', 'z'}, {{C('x'), 1}, {C('y'), 1}, {C('z'), 1}}},
+ UTF16Case{{'A'}, {{C('A'), 1}}},
+ UTF16Case{{'A', 'B', 'C'}, {{C('A'), 1}, {C('B'), 1}, {C('C'), 1}}},
+ UTF16Case{{'D', 'E', 'F'}, {{C('D'), 1}, {C('E'), 1}, {C('F'), 1}}},
+ UTF16Case{{'G', 'H'}, {{C('G'), 1}, {C('H'), 1}}},
+ UTF16Case{{'I', 'J'}, {{C('I'), 1}, {C('J'), 1}}},
+ UTF16Case{{'K', 'L', 'M'}, {{C('K'), 1}, {C('L'), 1}, {C('M'), 1}}},
+ UTF16Case{{'N', 'O', 'P'}, {{C('N'), 1}, {C('O'), 1}, {C('P'), 1}}},
+ UTF16Case{{'Q', 'R'}, {{C('Q'), 1}, {C('R'), 1}}},
+ UTF16Case{{'S', 'T', 'U'}, {{C('S'), 1}, {C('T'), 1}, {C('U'), 1}}},
+ UTF16Case{{'V', 'W'}, {{C('V'), 1}, {C('W'), 1}}},
+ UTF16Case{{'X', 'Y', 'Z'}, {{C('X'), 1}, {C('Y'), 1}, {C('Z'), 1}}},
+ }));
+
+INSTANTIATE_TEST_SUITE_P(AsciiNumbers,
+ UTF16Test,
+ ::testing::ValuesIn({
+ UTF16Case{{'0', '1', '2'}, {{C('0'), 1}, {C('1'), 1}, {C('2'), 1}}},
+ UTF16Case{{'3', '4', '5'}, {{C('3'), 1}, {C('4'), 1}, {C('5'), 1}}},
+ UTF16Case{{'6', '7', '8'}, {{C('6'), 1}, {C('7'), 1}, {C('8'), 1}}},
+ UTF16Case{{'9'}, {{C('9'), 1}}},
+ }));
+
+INSTANTIATE_TEST_SUITE_P(AsciiSymbols,
+ UTF16Test,
+ ::testing::ValuesIn({
+ UTF16Case{{'!', '"', '#'}, {{C('!'), 1}, {C('"'), 1}, {C('#'), 1}}},
+ UTF16Case{{'$', '%', '&'}, {{C('$'), 1}, {C('%'), 1}, {C('&'), 1}}},
+ UTF16Case{{'\'', '(', ')'}, {{C('\''), 1}, {C('('), 1}, {C(')'), 1}}},
+ UTF16Case{{'*', ',', '-'}, {{C('*'), 1}, {C(','), 1}, {C('-'), 1}}},
+ UTF16Case{{'/', '`', '@'}, {{C('/'), 1}, {C('`'), 1}, {C('@'), 1}}},
+ UTF16Case{{'^', '\\', '['}, {{C('^'), 1}, {C('\\'), 1}, {C('['), 1}}},
+ UTF16Case{{']', '_', '|'}, {{C(']'), 1}, {C('_'), 1}, {C('|'), 1}}},
+ UTF16Case{{'{', '}'}, {{C('{'), 1}, {C('}'), 1}}},
+ }));
+
+INSTANTIATE_TEST_SUITE_P(
+ AsciiSpecial,
+ UTF16Test,
+ ::testing::ValuesIn({
+ UTF16Case{{}, {}},
+ UTF16Case{{' ', '\t', '\n'}, {{C(' '), 1}, {C('\t'), 1}, {C('\n'), 1}}},
+ UTF16Case{{'\a', '\b', '\f'}, {{C('\a'), 1}, {C('\b'), 1}, {C('\f'), 1}}},
+ UTF16Case{{'\n', '\r', '\t'}, {{C('\n'), 1}, {C('\r'), 1}, {C('\t'), 1}}},
+ UTF16Case{{'\v'}, {{C('\v'), 1}}},
+ }));
+
+INSTANTIATE_TEST_SUITE_P(Hindi,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // नमस्ते दुनिया
+ {
+ 0x0928,
+ 0x092e,
+ 0x0938,
+ 0x094d,
+ 0x0924,
+ 0x0947,
+ 0x0020,
+ 0x0926,
+ 0x0941,
+ 0x0928,
+ 0x093f,
+ 0x092f,
+ 0x093e,
+ },
+ {
+ {C(0x0928), 1}, // न
+ {C(0x092e), 1}, // म
+ {C(0x0938), 1}, // स
+ {C(0x094d), 1}, // ् //
+ {C(0x0924), 1}, // त
+ {C(0x0947), 1}, // े //
+ {C(' '), 1},
+ {C(0x0926), 1}, // द
+ {C(0x0941), 1}, // ु //
+ {C(0x0928), 1}, // न
+ {C(0x093f), 1}, // ि //
+ {C(0x092f), 1}, // य
+ {C(0x093e), 1}, // ा //
+ },
+ }}));
+
+INSTANTIATE_TEST_SUITE_P(Mandarin,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // 你好世界
+ {0x4f60, 0x597d, 0x4e16, 0x754c},
+ {
+ {C(0x4f60), 1}, // 你
+ {C(0x597d), 1}, // 好
+ {C(0x4e16), 1}, // 世
+ {C(0x754c), 1}, // 界
+ },
+ }}));
+
+INSTANTIATE_TEST_SUITE_P(Japanese,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // こんにちは世界
+ {
+ 0x3053,
+ 0x3093,
+ 0x306b,
+ 0x3061,
+ 0x306f,
+ 0x4e16,
+ 0x754c,
+ },
+ {
+ {C(0x3053), 1}, // こ
+ {C(0x3093), 1}, // ん
+ {C(0x306B), 1}, // に
+ {C(0x3061), 1}, // ち
+ {C(0x306F), 1}, // は
+ {C(0x4E16), 1}, // 世
+ {C(0x754C), 1}, // 界
+ },
+ }}));
+
+INSTANTIATE_TEST_SUITE_P(Korean,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // 안녕하세요 세계
+ {
+ 0xc548,
+ 0xb155,
+ 0xd558,
+ 0xc138,
+ 0xc694,
+ 0x0020,
+ 0xc138,
+ 0xacc4,
+ },
+ {
+ {C(0xc548), 1}, // 안
+ {C(0xb155), 1}, // 녕
+ {C(0xd558), 1}, // 하
+ {C(0xc138), 1}, // 세
+ {C(0xc694), 1}, // 요
+ {C(' '), 1}, //
+ {C(0xc138), 1}, // 세
+ {C(0xacc4), 1}, // 계
+ },
+ }}));
+
+INSTANTIATE_TEST_SUITE_P(Emoji,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // 👋🌎
+ {0xd83d, 0xdc4b, 0xd83c, 0xdf0e},
+ {
+ {C(0x1f44b), 2}, // 👋
+ {C(0x1f30e), 2}, // 🌎
+ },
+ }}));
+
+INSTANTIATE_TEST_SUITE_P(Random,
+ UTF16Test,
+ ::testing::ValuesIn({UTF16Case{
+ // Øⓑꚫ쁹Ǵ𐌒岾🥍ⴵ㍨又ᮗ
+ {
+ 0x00d8,
+ 0x24d1,
+ 0xa6ab,
+ 0xc079,
+ 0x01f4,
+ 0xd800,
+ 0xdf12,
+ 0x5cbe,
+ 0xd83e,
+ 0xdd4d,
+ 0x2d35,
+ 0x3368,
+ 0x53c8,
+ 0x1b97,
+ },
+ {
+ {C(0x000d8), 1}, // Ø
+ {C(0x024d1), 1}, // ⓑ
+ {C(0x0a6ab), 1}, // ꚫ
+ {C(0x0c079), 1}, // 쁹
+ {C(0x001f4), 1}, // Ǵ
+ {C(0x10312), 2}, // 𐌒
+ {C(0x05cbe), 1}, // 岾
+ {C(0x1f94d), 2}, // 🥍
+ {C(0x02d35), 1}, // ⴵ
+ {C(0x03368), 1}, // ㍨
+ {C(0x053c8), 1}, // 又
+ {C(0x01b97), 1}, // ᮗ
+ },
+ }}));
+
+////////////////////////////////////////////////////////////////////////////////
+// DecodeUTF16 invalid tests
+////////////////////////////////////////////////////////////////////////////////
+class DecodeUTF16InvalidTest : public testing::TestWithParam<std::vector<uint16_t>> {};
+
+TEST_P(DecodeUTF16InvalidTest, Invalid) {
+ auto [code_point, width] = utf16::Decode(GetParam().data(), GetParam().size());
+ EXPECT_EQ(code_point, CodePoint(0));
+ EXPECT_EQ(width, 0u);
+}
+INSTANTIATE_TEST_SUITE_P(Invalid,
+ DecodeUTF16InvalidTest,
+ ::testing::ValuesIn(std::vector<std::vector<uint16_t>>{
+ {0xdc00}, // surrogate, end-of-stream
+ {0xdc00, 0x0040}, // surrogate, non-surrogate
+ }));
+
+} // namespace utf16_tests
} // namespace tint