Count the line pos, offset and size of compilation message in UTF-16

This patch computes the line position, offset and length of each compilation
message in UTF-16 code units and stores them in WGPUCompilationMessage to
align with the latest WebGPU spec.

Bug: dawn:1357
Change-Id: If8f4026bd5b4a64a078e100762b6d1f61da50053
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/115640
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/dawn.json b/dawn.json
index fbf2247..a04ce3f 100644
--- a/dawn.json
+++ b/dawn.json
@@ -749,7 +749,10 @@
             {"name": "line num", "type": "uint64_t"},
             {"name": "line pos", "type": "uint64_t"},
             {"name": "offset", "type": "uint64_t"},
-            {"name": "length", "type": "uint64_t"}
+            {"name": "length", "type": "uint64_t"},
+            {"name": "utf16 line pos", "type": "uint64_t"},
+            {"name": "utf16 offset", "type": "uint64_t"},
+            {"name": "utf16 length", "type": "uint64_t"}
         ]
     },
     "compilation message type": {
diff --git a/include/tint/tint.h b/include/tint/tint.h
index 02ac423..cdda85c 100644
--- a/include/tint/tint.h
+++ b/include/tint/tint.h
@@ -25,6 +25,7 @@
 #include "src/tint/diagnostic/printer.h"
 #include "src/tint/inspector/inspector.h"
 #include "src/tint/reader/reader.h"
+#include "src/tint/text/unicode.h"
 #include "src/tint/transform/binding_remapper.h"
 #include "src/tint/transform/clamp_frag_depth.h"
 #include "src/tint/transform/first_index_offset.h"
diff --git a/src/dawn/native/CompilationMessages.cpp b/src/dawn/native/CompilationMessages.cpp
index dd4fd41..f12318f 100644
--- a/src/dawn/native/CompilationMessages.cpp
+++ b/src/dawn/native/CompilationMessages.cpp
@@ -36,6 +36,39 @@
 
 }  // anonymous namespace
 
+// Returns the number of UTF-16 code units needed to encode |utf8String|, or an internal error if
+// the string cannot be decoded or holds a code point that UTF-16 cannot represent.
+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String) {
+    if (tint::text::utf8::IsASCII(utf8String)) {
+        return utf8String.size();
+    }
+
+    uint64_t numberOfUTF16CodeUnits = 0;
+    std::string_view remaining = utf8String;
+    while (!remaining.empty()) {
+        auto [codePoint, utf8CharacterByteLength] = tint::text::utf8::Decode(remaining);
+        // Decode() reports a byte length of 0 when the next code point cannot be decoded.
+        if (utf8CharacterByteLength == 0) {
+            return DAWN_INTERNAL_ERROR("Fail to decode the unicode string");
+        }
+        remaining = remaining.substr(utf8CharacterByteLength);
+
+        // See https://en.wikipedia.org/wiki/UTF-16 for the encoding rules applied below.
+        if (codePoint.value <= 0xD7FF || (codePoint.value >= 0xE000 && codePoint.value <= 0xFFFF)) {
+            // U+0000..U+D7FF and U+E000..U+FFFF are encoded as a single 16-bit code unit.
+            ++numberOfUTF16CodeUnits;
+        } else if (codePoint.value >= 0x10000 && codePoint.value <= 0x10FFFF) {
+            // U+10000..U+10FFFF are encoded as two 16-bit code units (a surrogate pair).
+            numberOfUTF16CodeUnits += 2;
+        } else {
+            // Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no UTF-16 encoding.
+            return DAWN_INTERNAL_ERROR("The unicode string contains illegal unicode code point.");
+        }
+    }
+
+    return numberOfUTF16CodeUnits;
+}
+
 OwnedCompilationMessages::OwnedCompilationMessages() {
     mCompilationInfo.nextInChain = 0;
     mCompilationInfo.messageCount = 0;
@@ -53,23 +86,29 @@
     // Cannot add messages after GetCompilationInfo has been called.
     ASSERT(mCompilationInfo.messages == nullptr);
 
+    // Message can only contain ascii characters.
+    ASSERT(tint::text::utf8::IsASCII(message));
+
     mMessageStrings.push_back(message);
     mMessages.push_back({nullptr, nullptr, static_cast<WGPUCompilationMessageType>(type), lineNum,
-                         linePos, offset, length});
+                         linePos, offset, length, linePos, offset, length});
 }
 
-void OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
+MaybeError OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
     // Cannot add messages after GetCompilationInfo has been called.
     ASSERT(mCompilationInfo.messages == nullptr);
 
     // Tint line and column values are 1-based.
     uint64_t lineNum = diagnostic.source.range.begin.line;
-    uint64_t lineCol = diagnostic.source.range.begin.column;
+    uint64_t linePosInBytes = diagnostic.source.range.begin.column;
     // The offset is 0-based.
-    uint64_t offset = 0;
-    uint64_t length = 0;
+    uint64_t offsetInBytes = 0;
+    uint64_t lengthInBytes = 0;
+    uint64_t linePosInUTF16 = 0;
+    uint64_t offsetInUTF16 = 0;
+    uint64_t lengthInUTF16 = 0;
 
-    if (lineNum && lineCol && diagnostic.source.file) {
+    if (lineNum && linePosInBytes && diagnostic.source.file) {
         const tint::Source::FileContent& content = diagnostic.source.file->content;
 
         // Tint stores line as std::string_view in a complete source std::string that's in the
@@ -78,23 +117,38 @@
         // range starts at 1 while the array of lines start at 0 (hence the -1).
         const char* fileStart = content.data.data();
         const char* lineStart = content.lines[lineNum - 1].data();
-        offset = static_cast<uint64_t>(lineStart - fileStart) + lineCol - 1;
+        offsetInBytes = static_cast<uint64_t>(lineStart - fileStart) + linePosInBytes - 1;
+
+        // The linePosInBytes is 1-based.
+        uint64_t linePosOffsetInUTF16 = 0;
+        DAWN_TRY_ASSIGN(linePosOffsetInUTF16, CountUTF16CodeUnitsFromUTF8String(
+                                                  std::string_view(lineStart, linePosInBytes - 1)));
+        linePosInUTF16 = linePosOffsetInUTF16 + 1;
+
+        // The offset is 0-based.
+        uint64_t lineStartToFileStartOffsetInUTF16 = 0;
+        DAWN_TRY_ASSIGN(lineStartToFileStartOffsetInUTF16,
+                        CountUTF16CodeUnitsFromUTF8String(std::string_view(
+                            fileStart, static_cast<uint64_t>(lineStart - fileStart))));
+        offsetInUTF16 = lineStartToFileStartOffsetInUTF16 + linePosInUTF16 - 1;
 
         // If the range has a valid start but the end is not specified, clamp it to the start.
         uint64_t endLineNum = diagnostic.source.range.end.line;
         uint64_t endLineCol = diagnostic.source.range.end.column;
         if (endLineNum == 0 || endLineCol == 0) {
             endLineNum = lineNum;
-            endLineCol = lineCol;
+            endLineCol = linePosInBytes;
         }
 
         const char* endLineStart = content.lines[endLineNum - 1].data();
-        uint64_t endOffset = static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
-
+        uint64_t endOffsetInBytes =
+            static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
         // The length of the message is the difference between the starting offset and the
-        // ending offset. Negative ranges aren't allowed
-        ASSERT(endOffset >= offset);
-        length = endOffset - offset;
+        // ending offset. Negative ranges aren't allowed.
+        ASSERT(endOffsetInBytes >= offsetInBytes);
+        lengthInBytes = endOffsetInBytes - offsetInBytes;
+        DAWN_TRY_ASSIGN(lengthInUTF16, CountUTF16CodeUnitsFromUTF8String(std::string_view(
+                                           fileStart + offsetInBytes, lengthInBytes)));
     }
 
     if (diagnostic.code) {
@@ -104,18 +158,23 @@
     }
 
     mMessages.push_back({nullptr, nullptr, tintSeverityToMessageType(diagnostic.severity), lineNum,
-                         lineCol, offset, length});
+                         linePosInBytes, offsetInBytes, lengthInBytes, linePosInUTF16,
+                         offsetInUTF16, lengthInUTF16});
+
+    return {};
 }
 
-void OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
+MaybeError OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
     // Cannot add messages after GetCompilationInfo has been called.
     ASSERT(mCompilationInfo.messages == nullptr);
 
     for (const auto& diag : diagnostics) {
-        AddMessage(diag);
+        DAWN_TRY(AddMessage(diag));
     }
 
     AddFormattedTintMessages(diagnostics);
+
+    return {};
 }
 
 void OwnedCompilationMessages::ClearMessages() {
diff --git a/src/dawn/native/CompilationMessages.h b/src/dawn/native/CompilationMessages.h
index 13d30b2..9adf8f3 100644
--- a/src/dawn/native/CompilationMessages.h
+++ b/src/dawn/native/CompilationMessages.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 
+#include "dawn/native/Error.h"
 #include "dawn/native/dawn_platform.h"
 
 #include "dawn/common/NonCopyable.h"
@@ -29,6 +30,8 @@
 
 namespace dawn::native {
 
+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String);
+
 class OwnedCompilationMessages : public NonCopyable {
   public:
     OwnedCompilationMessages();
@@ -41,14 +44,14 @@
         uint64_t linePos = 0,
         uint64_t offset = 0,
         uint64_t length = 0);
-    void AddMessages(const tint::diag::List& diagnostics);
+    MaybeError AddMessages(const tint::diag::List& diagnostics);
     void ClearMessages();
 
     const WGPUCompilationInfo* GetCompilationInfo();
     const std::vector<std::string>& GetFormattedTintMessages();
 
   private:
-    void AddMessage(const tint::diag::Diagnostic& diagnostic);
+    MaybeError AddMessage(const tint::diag::Diagnostic& diagnostic);
     void AddFormattedTintMessages(const tint::diag::List& diagnostics);
 
     WGPUCompilationInfo mCompilationInfo;
diff --git a/src/dawn/native/ShaderModule.cpp b/src/dawn/native/ShaderModule.cpp
index 8a6c4e1..6c04065 100644
--- a/src/dawn/native/ShaderModule.cpp
+++ b/src/dawn/native/ShaderModule.cpp
@@ -299,7 +299,7 @@
 #if TINT_BUILD_WGSL_READER
     tint::Program program = tint::reader::wgsl::Parse(file);
     if (outMessages != nullptr) {
-        outMessages->AddMessages(program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
     }
     if (!program.IsValid()) {
         return DAWN_VALIDATION_ERROR("Tint WGSL reader failure: %s\n", program.Diagnostics().str());
@@ -316,7 +316,7 @@
 #if TINT_BUILD_SPV_READER
     tint::Program program = tint::reader::spirv::Parse(spirv);
     if (outMessages != nullptr) {
-        outMessages->AddMessages(program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
     }
     if (!program.IsValid()) {
         return DAWN_VALIDATION_ERROR("Tint SPIR-V reader failure:\nParser: %s\n",
@@ -789,7 +789,7 @@
 
     if (hasDisallowedExtension) {
         if (outMessages != nullptr) {
-            outMessages->AddMessages(messages);
+            DAWN_TRY(outMessages->AddMessages(messages));
         }
         return DAWN_MAKE_ERROR(InternalErrorType::Validation,
                                "Shader module uses extension(s) not enabled for its device.");
@@ -983,7 +983,7 @@
                                            OwnedCompilationMessages* outMessages) {
     tint::transform::Output output = transform->Run(program, inputs);
     if (outMessages != nullptr) {
-        outMessages->AddMessages(output.program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(output.program.Diagnostics()));
     }
     DAWN_INVALID_IF(!output.program.IsValid(), "Tint program failure: %s\n",
                     output.program.Diagnostics().str());
diff --git a/src/dawn/tests/BUILD.gn b/src/dawn/tests/BUILD.gn
index b555260..7614367 100644
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -301,6 +301,7 @@
     "unittests/SystemUtilsTests.cpp",
     "unittests/ToBackendTests.cpp",
     "unittests/TypedIntegerTests.cpp",
+    "unittests/UnicodeTests.cpp",
     "unittests/native/BlobTests.cpp",
     "unittests/native/CacheRequestTests.cpp",
     "unittests/native/CommandBufferEncodingTests.cpp",
diff --git a/src/dawn/tests/unittests/UnicodeTests.cpp b/src/dawn/tests/unittests/UnicodeTests.cpp
new file mode 100644
index 0000000..f450652
--- /dev/null
+++ b/src/dawn/tests/unittests/UnicodeTests.cpp
@@ -0,0 +1,73 @@
+// Copyright 2022 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/native/ShaderModule.h"
+#include "dawn/tests/unittests/validation/ValidationTest.h"
+
+class CountUTF16CodeUnitsFromUTF8StringTest : public ValidationTest {};  // Empty fixture.
+
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, ValidUnicodeString) {
+    struct TestCase {
+        const char* u8String;
+        uint64_t lengthInUTF16;
+    };
+
+    // Referenced from src/tint/text/unicode_test.cc; values are expected UTF-16 code unit counts.
+    constexpr std::array<TestCase, 12> kTestCases = {{
+        {"", 0},
+        {"abc", 3},
+        {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c", 4},
+        {"def\xf0\x9f\x91\x8b\xf0\x9f\x8c\x8e", 7},
+        {"\xed\x9f\xbf", 1},      // CodePoint == 0xD7FF
+        {"\xed\x9f\xbe", 1},      // CodePoint == 0xD7FF - 1
+        {"\xee\x80\x80", 1},      // CodePoint == 0xE000
+        {"\xee\x80\x81", 1},      // CodePoint == 0xE000 + 1
+        {"\xef\xbf\xbf", 1},      // CodePoint == 0xFFFF
+        {"\xef\xbf\xbe", 1},      // CodePoint == 0xFFFF - 1
+        {"\xf0\x90\x80\x80", 2},  // CodePoint == 0x10000
+        {"\xf0\x90\x80\x81", 2},  // CodePoint == 0x10000 + 1
+    }};
+
+    for (const TestCase& testCase : kTestCases) {
+        dawn::native::ResultOrError<uint64_t> resultOrError =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(std::string_view(testCase.u8String));
+        ASSERT_TRUE(resultOrError.IsSuccess());
+        ASSERT_EQ(testCase.lengthInUTF16, resultOrError.AcquireSuccess());
+    }
+}
+
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, InvalidUnicodeString) {
+    // Referenced from src/tint/text/unicode_test.cc: surrogate code points and malformed UTF-8.
+    constexpr std::array<const char*, 12> kTestCases = {{
+        "\xed\xa0\x80",  // CodePoint == 0xD7FF + 1
+        "\xed\xbf\xbf",  // CodePoint == 0xE000 - 1
+        "ab\xed\xa0\x80",
+        "\xd0",              // 2-bytes, missing second byte
+        "\xe8\x8f",          // 3-bytes, missing third byte
+        "\xf4\x8f\x8f",      // 4-bytes, missing fourth byte
+        "\xd0\x7f",          // 2-bytes, second byte MSB unset
+        "\xe8\x7f\x8f",      // 3-bytes, second byte MSB unset
+        "\xe8\x8f\x7f",      // 3-bytes, third byte MSB unset
+        "\xf4\x7f\x8f\x8f",  // 4-bytes, second byte MSB unset
+        "\xf4\x8f\x7f\x8f",  // 4-bytes, third byte MSB unset
+        "\xf4\x8f\x8f\x7f",  // 4-bytes, fourth byte MSB unset
+    }};
+
+    for (const char* testCase : kTestCases) {
+        dawn::native::ResultOrError<uint64_t> resultOrError =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(std::string_view(testCase));
+        ASSERT_TRUE(resultOrError.IsError());
+        std::ignore = resultOrError.AcquireError();  // The error itself is not inspected.
+    }
+}
diff --git a/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp b/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
index 67d258b..479b080 100644
--- a/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
+++ b/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
@@ -93,7 +93,7 @@
     wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);
 
     WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
     WGPUCompilationInfo compilationInfo;
     compilationInfo.nextInChain = nullptr;
     compilationInfo.messageCount = 1;
@@ -133,7 +133,7 @@
     wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);
 
     WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
     WGPUCompilationInfo compilationInfo;
     compilationInfo.nextInChain = nullptr;
     compilationInfo.messageCount = 1;
@@ -193,7 +193,7 @@
                                        &testData);
 
     WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
     WGPUCompilationInfo compilationInfo;
     compilationInfo.nextInChain = nullptr;
     compilationInfo.messageCount = 1;
@@ -220,7 +220,7 @@
                                        &testData);
 
     WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
     WGPUCompilationInfo compilationInfo;
     compilationInfo.nextInChain = nullptr;
     compilationInfo.messageCount = 1;
diff --git a/src/tint/text/unicode.cc b/src/tint/text/unicode.cc
index e23f3dd..cc9a9d1 100644
--- a/src/tint/text/unicode.cc
+++ b/src/tint/text/unicode.cc
@@ -427,6 +427,10 @@
     return {c, n};
 }
 
+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string) {  // Wraps Decode(ptr, len).
+    return Decode(reinterpret_cast<const uint8_t*>(utf8_string.data()), utf8_string.size());
+}
+
 bool IsASCII(std::string_view str) {
     for (auto c : str) {
         if (c & 0x80) {
diff --git a/src/tint/text/unicode.h b/src/tint/text/unicode.h
index f0aa272..0594d31 100644
--- a/src/tint/text/unicode.h
+++ b/src/tint/text/unicode.h
@@ -69,6 +69,12 @@
 ///          If the next code point cannot be decoded then returns [0,0].
 std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
 
+/// Decodes the first code point in the utf8 string.
+/// @param utf8_string the string view that contains the utf8 sequence
+/// @returns a pair of CodePoint and width in code units (bytes).
+///          If the next code point cannot be decoded then returns [0,0].
+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
+
 /// @returns true if all the utf-8 code points in the string are ASCII
 /// (code-points 0x00..0x7f).
 bool IsASCII(std::string_view);