Count the line pos, offset and size of compilation message in UTF-16
This patch counts the line position, offset and length of each compilation
message in UTF-16 code units and saves them to WGPUCompilationMessage to
align with the latest WebGPU specification.
Bug: dawn:1357
Change-Id: If8f4026bd5b4a64a078e100762b6d1f61da50053
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/115640
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Jiawei Shao <jiawei.shao@intel.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/dawn.json b/dawn.json
index fbf2247..a04ce3f 100644
--- a/dawn.json
+++ b/dawn.json
@@ -749,7 +749,10 @@
{"name": "line num", "type": "uint64_t"},
{"name": "line pos", "type": "uint64_t"},
{"name": "offset", "type": "uint64_t"},
- {"name": "length", "type": "uint64_t"}
+ {"name": "length", "type": "uint64_t"},
+ {"name": "utf16 line pos", "type": "uint64_t"},
+ {"name": "utf16 offset", "type": "uint64_t"},
+ {"name": "utf16 length", "type": "uint64_t"}
]
},
"compilation message type": {
diff --git a/include/tint/tint.h b/include/tint/tint.h
index 02ac423..cdda85c 100644
--- a/include/tint/tint.h
+++ b/include/tint/tint.h
@@ -25,6 +25,7 @@
#include "src/tint/diagnostic/printer.h"
#include "src/tint/inspector/inspector.h"
#include "src/tint/reader/reader.h"
+#include "src/tint/text/unicode.h"
#include "src/tint/transform/binding_remapper.h"
#include "src/tint/transform/clamp_frag_depth.h"
#include "src/tint/transform/first_index_offset.h"
diff --git a/src/dawn/native/CompilationMessages.cpp b/src/dawn/native/CompilationMessages.cpp
index dd4fd41..f12318f 100644
--- a/src/dawn/native/CompilationMessages.cpp
+++ b/src/dawn/native/CompilationMessages.cpp
@@ -36,6 +36,39 @@
} // anonymous namespace
+// Counts how many UTF-16 code units are needed to represent the UTF-8 encoded |utf8String|.
+// Returns an internal error if the string cannot be decoded as UTF-8, or if it contains a code
+// point that has no UTF-16 encoding.
+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String) {
+    // Fast path: in an all-ASCII string every byte is one character, and each ASCII character
+    // takes a single UTF-16 code unit, so the byte count is the answer.
+    if (tint::text::utf8::IsASCII(utf8String)) {
+        return utf8String.size();
+    }
+
+    uint64_t numberOfUTF16CodeUnits = 0;
+    std::string_view remaining = utf8String;
+    while (!remaining.empty()) {
+        auto [codePoint, utf8CharacterByteLength] = tint::text::utf8::Decode(remaining);
+        // A zero byte length means the next code point could not be decoded, so bail out.
+        if (utf8CharacterByteLength == 0) {
+            return DAWN_INTERNAL_ERROR("Fail to decode the unicode string");
+        }
+        remaining = remaining.substr(utf8CharacterByteLength);
+
+        // Count the number of code units in UTF-16. See https://en.wikipedia.org/wiki/UTF-16 for
+        // more details.
+        if (codePoint.value <= 0xD7FF || (codePoint.value >= 0xE000 && codePoint.value <= 0xFFFF)) {
+            // Code points from U+0000 to U+D7FF and U+E000 to U+FFFF are encoded as single 16-bit
+            // code units.
+            ++numberOfUTF16CodeUnits;
+        } else if (codePoint.value >= 0x10000 && codePoint.value <= 0x10FFFF) {
+            // Code points from U+010000 to U+10FFFF are encoded as two 16-bit code units.
+            numberOfUTF16CodeUnits += 2;
+        } else {
+            // UTF-16 cannot encode the code points from U+D800 to U+DFFF, nor any value above
+            // U+10FFFF.
+            // NOTE(review): the decoder may already reject values above U+10FFFF; the explicit
+            // upper bound keeps this function correct either way - confirm against Decode().
+            return DAWN_INTERNAL_ERROR("The unicode string contains illegal unicode code point.");
+        }
+    }
+
+    return numberOfUTF16CodeUnits;
+}
+
OwnedCompilationMessages::OwnedCompilationMessages() {
mCompilationInfo.nextInChain = 0;
mCompilationInfo.messageCount = 0;
@@ -53,23 +86,29 @@
// Cannot add messages after GetCompilationInfo has been called.
ASSERT(mCompilationInfo.messages == nullptr);
+ // Message can only contain ascii characters.
+ ASSERT(tint::text::utf8::IsASCII(message));
+
mMessageStrings.push_back(message);
mMessages.push_back({nullptr, nullptr, static_cast<WGPUCompilationMessageType>(type), lineNum,
- linePos, offset, length});
+ linePos, offset, length, linePos, offset, length});
}
-void OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
+MaybeError OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
// Cannot add messages after GetCompilationInfo has been called.
ASSERT(mCompilationInfo.messages == nullptr);
// Tint line and column values are 1-based.
uint64_t lineNum = diagnostic.source.range.begin.line;
- uint64_t lineCol = diagnostic.source.range.begin.column;
+ uint64_t linePosInBytes = diagnostic.source.range.begin.column;
// The offset is 0-based.
- uint64_t offset = 0;
- uint64_t length = 0;
+ uint64_t offsetInBytes = 0;
+ uint64_t lengthInBytes = 0;
+ uint64_t linePosInUTF16 = 0;
+ uint64_t offsetInUTF16 = 0;
+ uint64_t lengthInUTF16 = 0;
- if (lineNum && lineCol && diagnostic.source.file) {
+ if (lineNum && linePosInBytes && diagnostic.source.file) {
const tint::Source::FileContent& content = diagnostic.source.file->content;
// Tint stores line as std::string_view in a complete source std::string that's in the
@@ -78,23 +117,38 @@
// range starts at 1 while the array of lines start at 0 (hence the -1).
const char* fileStart = content.data.data();
const char* lineStart = content.lines[lineNum - 1].data();
- offset = static_cast<uint64_t>(lineStart - fileStart) + lineCol - 1;
+ offsetInBytes = static_cast<uint64_t>(lineStart - fileStart) + linePosInBytes - 1;
+
+ // The linePosInBytes is 1-based.
+ uint64_t linePosOffsetInUTF16 = 0;
+ DAWN_TRY_ASSIGN(linePosOffsetInUTF16, CountUTF16CodeUnitsFromUTF8String(
+ std::string_view(lineStart, linePosInBytes - 1)));
+ linePosInUTF16 = linePosOffsetInUTF16 + 1;
+
+ // The offset is 0-based.
+ uint64_t lineStartToFileStartOffsetInUTF16 = 0;
+ DAWN_TRY_ASSIGN(lineStartToFileStartOffsetInUTF16,
+ CountUTF16CodeUnitsFromUTF8String(std::string_view(
+ fileStart, static_cast<uint64_t>(lineStart - fileStart))));
+ offsetInUTF16 = lineStartToFileStartOffsetInUTF16 + linePosInUTF16 - 1;
// If the range has a valid start but the end is not specified, clamp it to the start.
uint64_t endLineNum = diagnostic.source.range.end.line;
uint64_t endLineCol = diagnostic.source.range.end.column;
if (endLineNum == 0 || endLineCol == 0) {
endLineNum = lineNum;
- endLineCol = lineCol;
+ endLineCol = linePosInBytes;
}
const char* endLineStart = content.lines[endLineNum - 1].data();
- uint64_t endOffset = static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
-
+ uint64_t endOffsetInBytes =
+ static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
// The length of the message is the difference between the starting offset and the
- // ending offset. Negative ranges aren't allowed
- ASSERT(endOffset >= offset);
- length = endOffset - offset;
+ // ending offset. Negative ranges aren't allowed.
+ ASSERT(endOffsetInBytes >= offsetInBytes);
+ lengthInBytes = endOffsetInBytes - offsetInBytes;
+ DAWN_TRY_ASSIGN(lengthInUTF16, CountUTF16CodeUnitsFromUTF8String(std::string_view(
+ fileStart + offsetInBytes, lengthInBytes)));
}
if (diagnostic.code) {
@@ -104,18 +158,23 @@
}
mMessages.push_back({nullptr, nullptr, tintSeverityToMessageType(diagnostic.severity), lineNum,
- lineCol, offset, length});
+ linePosInBytes, offsetInBytes, lengthInBytes, linePosInUTF16,
+ offsetInUTF16, lengthInUTF16});
+
+ return {};
}
-void OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
+MaybeError OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
// Cannot add messages after GetCompilationInfo has been called.
ASSERT(mCompilationInfo.messages == nullptr);
for (const auto& diag : diagnostics) {
- AddMessage(diag);
+ DAWN_TRY(AddMessage(diag));
}
AddFormattedTintMessages(diagnostics);
+
+ return {};
}
void OwnedCompilationMessages::ClearMessages() {
diff --git a/src/dawn/native/CompilationMessages.h b/src/dawn/native/CompilationMessages.h
index 13d30b2..9adf8f3 100644
--- a/src/dawn/native/CompilationMessages.h
+++ b/src/dawn/native/CompilationMessages.h
@@ -18,6 +18,7 @@
#include <string>
#include <vector>
+#include "dawn/native/Error.h"
#include "dawn/native/dawn_platform.h"
#include "dawn/common/NonCopyable.h"
@@ -29,6 +30,8 @@
namespace dawn::native {
+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String);
+
class OwnedCompilationMessages : public NonCopyable {
public:
OwnedCompilationMessages();
@@ -41,14 +44,14 @@
uint64_t linePos = 0,
uint64_t offset = 0,
uint64_t length = 0);
- void AddMessages(const tint::diag::List& diagnostics);
+ MaybeError AddMessages(const tint::diag::List& diagnostics);
void ClearMessages();
const WGPUCompilationInfo* GetCompilationInfo();
const std::vector<std::string>& GetFormattedTintMessages();
private:
- void AddMessage(const tint::diag::Diagnostic& diagnostic);
+ MaybeError AddMessage(const tint::diag::Diagnostic& diagnostic);
void AddFormattedTintMessages(const tint::diag::List& diagnostics);
WGPUCompilationInfo mCompilationInfo;
diff --git a/src/dawn/native/ShaderModule.cpp b/src/dawn/native/ShaderModule.cpp
index 8a6c4e1..6c04065 100644
--- a/src/dawn/native/ShaderModule.cpp
+++ b/src/dawn/native/ShaderModule.cpp
@@ -299,7 +299,7 @@
#if TINT_BUILD_WGSL_READER
tint::Program program = tint::reader::wgsl::Parse(file);
if (outMessages != nullptr) {
- outMessages->AddMessages(program.Diagnostics());
+ DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
}
if (!program.IsValid()) {
return DAWN_VALIDATION_ERROR("Tint WGSL reader failure: %s\n", program.Diagnostics().str());
@@ -316,7 +316,7 @@
#if TINT_BUILD_SPV_READER
tint::Program program = tint::reader::spirv::Parse(spirv);
if (outMessages != nullptr) {
- outMessages->AddMessages(program.Diagnostics());
+ DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
}
if (!program.IsValid()) {
return DAWN_VALIDATION_ERROR("Tint SPIR-V reader failure:\nParser: %s\n",
@@ -789,7 +789,7 @@
if (hasDisallowedExtension) {
if (outMessages != nullptr) {
- outMessages->AddMessages(messages);
+ DAWN_TRY(outMessages->AddMessages(messages));
}
return DAWN_MAKE_ERROR(InternalErrorType::Validation,
"Shader module uses extension(s) not enabled for its device.");
@@ -983,7 +983,7 @@
OwnedCompilationMessages* outMessages) {
tint::transform::Output output = transform->Run(program, inputs);
if (outMessages != nullptr) {
- outMessages->AddMessages(output.program.Diagnostics());
+ DAWN_TRY(outMessages->AddMessages(output.program.Diagnostics()));
}
DAWN_INVALID_IF(!output.program.IsValid(), "Tint program failure: %s\n",
output.program.Diagnostics().str());
diff --git a/src/dawn/tests/BUILD.gn b/src/dawn/tests/BUILD.gn
index b555260..7614367 100644
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -301,6 +301,7 @@
"unittests/SystemUtilsTests.cpp",
"unittests/ToBackendTests.cpp",
"unittests/TypedIntegerTests.cpp",
+ "unittests/UnicodeTests.cpp",
"unittests/native/BlobTests.cpp",
"unittests/native/CacheRequestTests.cpp",
"unittests/native/CommandBufferEncodingTests.cpp",
diff --git a/src/dawn/tests/unittests/UnicodeTests.cpp b/src/dawn/tests/unittests/UnicodeTests.cpp
new file mode 100644
index 0000000..f450652
--- /dev/null
+++ b/src/dawn/tests/unittests/UnicodeTests.cpp
@@ -0,0 +1,73 @@
+// Copyright 2022 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/native/ShaderModule.h"
+#include "dawn/tests/unittests/validation/ValidationTest.h"
+
+class CountUTF16CodeUnitsFromUTF8StringTest : public ValidationTest {};
+
+// Checks that well-formed UTF-8 strings yield the expected number of UTF-16 code units, including
+// the boundaries of every UTF-8 sequence length and of the single/double code unit ranges.
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, ValidUnicodeString) {
+    struct TestCase {
+        const char* u8String;
+        uint64_t lengthInUTF16;
+    };
+
+    // Referenced from src/tint/text/unicode_test.cc
+    constexpr std::array<TestCase, 15> kTestCases = {{
+        {"", 0},
+        {"abc", 3},
+        {"\xc2\x80", 1},  // CodePoint == 0x80, smallest 2-byte sequence
+        {"\xdf\xbf", 1},  // CodePoint == 0x7FF, largest 2-byte sequence
+        {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c", 4},
+        {"def\xf0\x9f\x91\x8b\xf0\x9f\x8c\x8e", 7},
+        {"\xed\x9f\xbf", 1},      // CodePoint == 0xD7FF
+        {"\xed\x9f\xbe", 1},      // CodePoint == 0xD7FF - 1
+        {"\xee\x80\x80", 1},      // CodePoint == 0xE000
+        {"\xee\x80\x81", 1},      // CodePoint == 0xE000 + 1
+        {"\xef\xbf\xbf", 1},      // CodePoint == 0xFFFF
+        {"\xef\xbf\xbe", 1},      // CodePoint == 0xFFFF - 1
+        {"\xf0\x90\x80\x80", 2},  // CodePoint == 0x10000
+        {"\xf0\x90\x80\x81", 2},  // CodePoint == 0x10000 + 1
+        {"\xf4\x8f\xbf\xbf", 2},  // CodePoint == 0x10FFFF, largest valid code point
+    }};
+
+    for (const TestCase& testCase : kTestCases) {
+        dawn::native::ResultOrError<uint64_t> resultOrError =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(std::string_view(testCase.u8String));
+        ASSERT_TRUE(resultOrError.IsSuccess());
+        ASSERT_EQ(testCase.lengthInUTF16, resultOrError.AcquireSuccess());
+    }
+}
+
+// Checks that strings which cannot be converted are reported as errors: code points in the
+// U+D800..U+DFFF range (which UTF-16 cannot encode) and malformed UTF-8 sequences.
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, InvalidUnicodeString) {
+    // Referenced from src/tint/text/unicode_test.cc
+    constexpr std::array<const char*, 12> kInvalidStrings = {{
+        "\xed\xa0\x80",  // CodePoint == 0xD7FF + 1
+        "\xed\xbf\xbf",  // CodePoint == 0xE000 - 1
+        "ab\xed\xa0\x80",
+        "\xd0",              // 2-bytes, missing second byte
+        "\xe8\x8f",          // 3-bytes, missing third byte
+        "\xf4\x8f\x8f",      // 4-bytes, missing fourth byte
+        "\xd0\x7f",          // 2-bytes, second byte MSB unset
+        "\xe8\x7f\x8f",      // 3-bytes, second byte MSB unset
+        "\xe8\x8f\x7f",      // 3-bytes, third byte MSB unset
+        "\xf4\x7f\x8f\x8f",  // 4-bytes, second byte MSB unset
+        "\xf4\x8f\x7f\x8f",  // 4-bytes, third byte MSB unset
+        "\xf4\x8f\x8f\x7f",  // 4-bytes, fourth byte MSB unset
+    }};
+
+    for (const char* invalidString : kInvalidStrings) {
+        const std::string_view input(invalidString);
+        dawn::native::ResultOrError<uint64_t> result =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(input);
+        ASSERT_TRUE(result.IsError());
+        // Take the error out of the result and discard it.
+        std::ignore = result.AcquireError();
+    }
+}
diff --git a/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp b/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
index 67d258b..479b080 100644
--- a/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
+++ b/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
@@ -93,7 +93,7 @@
wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);
WGPUCompilationMessage message = {
- nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+ nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
WGPUCompilationInfo compilationInfo;
compilationInfo.nextInChain = nullptr;
compilationInfo.messageCount = 1;
@@ -133,7 +133,7 @@
wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);
WGPUCompilationMessage message = {
- nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+ nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
WGPUCompilationInfo compilationInfo;
compilationInfo.nextInChain = nullptr;
compilationInfo.messageCount = 1;
@@ -193,7 +193,7 @@
&testData);
WGPUCompilationMessage message = {
- nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+ nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
WGPUCompilationInfo compilationInfo;
compilationInfo.nextInChain = nullptr;
compilationInfo.messageCount = 1;
@@ -220,7 +220,7 @@
&testData);
WGPUCompilationMessage message = {
- nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+ nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
WGPUCompilationInfo compilationInfo;
compilationInfo.nextInChain = nullptr;
compilationInfo.messageCount = 1;
diff --git a/src/tint/text/unicode.cc b/src/tint/text/unicode.cc
index e23f3dd..cc9a9d1 100644
--- a/src/tint/text/unicode.cc
+++ b/src/tint/text/unicode.cc
@@ -427,6 +427,10 @@
return {c, n};
}
+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string) {
+    // Convenience overload: view the characters as bytes and forward to the pointer/length
+    // overload, which does the actual decoding.
+    const auto* bytes = reinterpret_cast<const uint8_t*>(utf8_string.data());
+    const size_t byte_count = utf8_string.size();
+    return Decode(bytes, byte_count);
+}
+
bool IsASCII(std::string_view str) {
for (auto c : str) {
if (c & 0x80) {
diff --git a/src/tint/text/unicode.h b/src/tint/text/unicode.h
index f0aa272..0594d31 100644
--- a/src/tint/text/unicode.h
+++ b/src/tint/text/unicode.h
@@ -69,6 +69,12 @@
/// If the next code point cannot be decoded then returns [0,0].
std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
+/// Decodes the first code point in the utf8 string.
+/// @param utf8_string the string view that contains the utf8 sequence
+/// @returns a pair of CodePoint and width in code units (bytes).
+/// If the next code point cannot be decoded then returns [0,0].
+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
+
/// @returns true if all the utf-8 code points in the string are ASCII
/// (code-points 0x00..0x7f).
bool IsASCII(std::string_view);