Update inter stage variable subsetting validation and add tests

Sync up with the current WebGPU spec to allow the FS inputs to be a
subset of the VS outputs instead of requiring a strict match.
This patch changes the validation, adds tests, and uses the
TruncateInterstageVariables transform in the HLSL generator to
work around the extra limit of the D3D12 backend.

Bug: dawn:1493
Change-Id: I2d4ba7f43dbe57f17ecd5c5d659f4ca93bb682a3
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/109460
Commit-Queue: Shrek Shao <shrekshao@google.com>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Auto-Submit: Shrek Shao <shrekshao@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
diff --git a/src/dawn/native/RenderPipeline.cpp b/src/dawn/native/RenderPipeline.cpp
index 033231b..baa6c85 100644
--- a/src/dawn/native/RenderPipeline.cpp
+++ b/src/dawn/native/RenderPipeline.cpp
@@ -382,12 +382,24 @@
     const EntryPointMetadata& fragmentMetadata =
         fragmentState.module->GetEntryPoint(fragmentState.entryPoint);
 
-    // TODO(dawn:563): Can this message give more details?
-    DAWN_INVALID_IF(
-        vertexMetadata.usedInterStageVariables != fragmentMetadata.usedInterStageVariables,
-        "One or more fragment inputs and vertex outputs are not one-to-one matching");
+    if (DAWN_UNLIKELY(
+            (vertexMetadata.usedInterStageVariables | fragmentMetadata.usedInterStageVariables) !=
+            vertexMetadata.usedInterStageVariables)) {
+        for (size_t i : IterateBitSet(fragmentMetadata.usedInterStageVariables)) {
+            if (!vertexMetadata.usedInterStageVariables.test(i)) {
+                return DAWN_VALIDATION_ERROR(
+                    "The fragment input at location %u doesn't have a corresponding vertex output.",
+                    i);
+            }
+        }
+        UNREACHABLE();  // The subset check failed, so the loop above must have returned.
+    }
 
     for (size_t i : IterateBitSet(vertexMetadata.usedInterStageVariables)) {
+        if (!fragmentMetadata.usedInterStageVariables.test(i)) {
+            // It is valid for the fragment inputs to be a subset of the vertex outputs.
+            continue;
+        }
         const auto& vertexOutputInfo = vertexMetadata.interStageVariables[i];
         const auto& fragmentInputInfo = fragmentMetadata.interStageVariables[i];
         DAWN_INVALID_IF(
diff --git a/src/dawn/native/d3d12/RenderPipelineD3D12.cpp b/src/dawn/native/d3d12/RenderPipelineD3D12.cpp
index 80bc6fc..6c8a6fb 100644
--- a/src/dawn/native/d3d12/RenderPipelineD3D12.cpp
+++ b/src/dawn/native/d3d12/RenderPipelineD3D12.cpp
@@ -357,11 +357,20 @@
 
     PerStage<CompiledShader> compiledShader;
 
+    const std::bitset<kMaxInterStageShaderVariables>* usedInterstageVariables = nullptr;
+    if (GetStageMask() & wgpu::ShaderStage::Fragment) {
+        // Only fragment shaders take interstage inputs. Bind by reference: a copy would dangle.
+        const ProgrammableStage& fragmentStage = GetStage(SingleShaderStage::Fragment);
+        const auto& entryPoint = fragmentStage.module->GetEntryPoint(fragmentStage.entryPoint);
+        usedInterstageVariables = &entryPoint.usedInterStageVariables;
+    }
+
     for (auto stage : IterateStages(GetStageMask())) {
         const ProgrammableStage& programmableStage = GetStage(stage);
-        DAWN_TRY_ASSIGN(compiledShader[stage], ToBackend(programmableStage.module)
-                                                   ->Compile(programmableStage, stage,
-                                                             ToBackend(GetLayout()), compileFlags));
+        DAWN_TRY_ASSIGN(compiledShader[stage],
+                        ToBackend(programmableStage.module)
+                            ->Compile(programmableStage, stage, ToBackend(GetLayout()),
+                                      compileFlags, usedInterstageVariables));
         *shaders[stage] = compiledShader[stage].GetD3D12ShaderBytecode();
     }
 
diff --git a/src/dawn/native/d3d12/ShaderModuleD3D12.cpp b/src/dawn/native/d3d12/ShaderModuleD3D12.cpp
index 18e50eb..4aa4d3c 100644
--- a/src/dawn/native/d3d12/ShaderModuleD3D12.cpp
+++ b/src/dawn/native/d3d12/ShaderModuleD3D12.cpp
@@ -90,6 +90,7 @@
     X(tint::transform::BindingRemapper::BindingPoints, remappedBindingPoints)               \
     X(tint::transform::BindingRemapper::AccessControls, remappedAccessControls)             \
     X(std::optional<tint::transform::SubstituteOverride::Config>, substituteOverrideConfig) \
+    X(std::bitset<kMaxInterStageShaderVariables>, interstageLocations)                      \
     X(LimitsForCompilationRequest, limits)                                                  \
     X(bool, disableSymbolRenaming)                                                          \
     X(bool, isRobustnessEnabled)                                                            \
@@ -392,6 +393,14 @@
     // them as well. This would allow us to only upload root constants that are actually
     // read by the shader.
     options.array_length_from_uniform = r.arrayLengthFromUniform;
+
+    if (r.stage == SingleShaderStage::Vertex) {
+        // Only the vertex stage produces interstage outputs, so only it needs the set of
+        // interstage locations that are actually used; tint may use it to truncate the unused
+        // outputs.
+        options.interstage_locations = r.interstageLocations;
+    }
+
     TRACE_EVENT0(tracePlatform.UnsafeGetValue(), General, "tint::writer::hlsl::Generate");
     auto result = tint::writer::hlsl::Generate(&transformedProgram, options);
     DAWN_INVALID_IF(!result.success, "An error occured while generating HLSL: %s", result.error);
@@ -456,10 +465,12 @@
     return InitializeBase(parseResult, compilationMessages);
 }
 
-ResultOrError<CompiledShader> ShaderModule::Compile(const ProgrammableStage& programmableStage,
-                                                    SingleShaderStage stage,
-                                                    const PipelineLayout* layout,
-                                                    uint32_t compileFlags) {
+ResultOrError<CompiledShader> ShaderModule::Compile(
+    const ProgrammableStage& programmableStage,
+    SingleShaderStage stage,
+    const PipelineLayout* layout,
+    uint32_t compileFlags,
+    const std::bitset<kMaxInterStageShaderVariables>* usedInterstageVariables) {
     Device* device = ToBackend(GetDevice());
     TRACE_EVENT0(device->GetPlatform(), General, "ShaderModuleD3D12::Compile");
     ASSERT(!IsError());
@@ -475,6 +486,10 @@
     req.hlsl.disableWorkgroupInit = device->IsToggleEnabled(Toggle::DisableWorkgroupInit);
     req.hlsl.dumpShaders = device->IsToggleEnabled(Toggle::DumpShaders);
 
+    if (usedInterstageVariables) {
+        req.hlsl.interstageLocations = *usedInterstageVariables;
+    }
+
     req.bytecode.hasShaderF16Feature = device->HasFeature(Feature::ShaderF16);
     req.bytecode.compileFlags = compileFlags;
 
@@ -596,7 +611,6 @@
         std::ostringstream dumpedMsg;
         dumpedMsg << "/* Dumped generated HLSL */" << std::endl
                   << compiledShader->hlslSource << std::endl;
-        device->EmitLog(WGPULoggingType_Info, dumpedMsg.str().c_str());
 
         if (device->IsToggleEnabled(Toggle::UseDXC)) {
             dumpedMsg << "/* Dumped disassembled DXIL */" << std::endl;
diff --git a/src/dawn/native/d3d12/ShaderModuleD3D12.h b/src/dawn/native/d3d12/ShaderModuleD3D12.h
index 8c70bb4..f646117 100644
--- a/src/dawn/native/d3d12/ShaderModuleD3D12.h
+++ b/src/dawn/native/d3d12/ShaderModuleD3D12.h
@@ -52,10 +52,12 @@
                                                    ShaderModuleParseResult* parseResult,
                                                    OwnedCompilationMessages* compilationMessages);
 
-    ResultOrError<CompiledShader> Compile(const ProgrammableStage& programmableStage,
-                                          SingleShaderStage stage,
-                                          const PipelineLayout* layout,
-                                          uint32_t compileFlags);
+    ResultOrError<CompiledShader> Compile(
+        const ProgrammableStage& programmableStage,
+        SingleShaderStage stage,
+        const PipelineLayout* layout,
+        uint32_t compileFlags,
+        const std::bitset<kMaxInterStageShaderVariables>* usedInterstageVariables = nullptr);
 
   private:
     ShaderModule(Device* device, const ShaderModuleDescriptor* descriptor);
diff --git a/src/dawn/tests/end2end/ShaderTests.cpp b/src/dawn/tests/end2end/ShaderTests.cpp
index 8909e6e..57f10b3 100644
--- a/src/dawn/tests/end2end/ShaderTests.cpp
+++ b/src/dawn/tests/end2end/ShaderTests.cpp
@@ -318,6 +318,237 @@
     wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
 }
 
+// Tests that sparse input output locations should work properly.
+// This test is not in dawn_unittests/RenderPipelineValidationTests because we want to test the
+// compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesSparse) {
+    std::string shader = R"(
+struct ShaderIO {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : vec4<f32>,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> ShaderIO {
+    var output : ShaderIO;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain(input : ShaderIO) -> @location(0) vec4<f32> {
+    return input.attribute1;
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
+// Tests that mismatched usage of interstage built-in inputs and outputs doesn't mess up the
+// input-output locations.
+// This test is not in dawn_unittests/RenderPipelineValidationTests because we want to test the
+// compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesBuiltinsMismatched) {
+    std::string shader = R"(
+struct VertexOut {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : f32,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+struct FragmentIn {
+    @location(3) attribute3 : vec4<f32>,
+    @builtin(front_facing) front_facing : bool,
+    @location(1) attribute1 : f32,
+    @builtin(position) position : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> VertexOut {
+    var output : VertexOut;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = 1.0;
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain(input : FragmentIn) -> @location(0) vec4<f32> {
+    _ = input.front_facing;
+    _ = input.position.x;
+    return input.attribute3;
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
+// Tests that interstage inputs could be a prefix subset of the outputs.
+// This test is not in dawn_unittests/RenderPipelineValidationTests because we want to test the
+// compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesPrefixSubset) {
+    std::string shader = R"(
+struct VertexOut {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : f32,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+struct FragmentIn {
+    @location(1) attribute1 : f32,
+    @builtin(position) position : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> VertexOut {
+    var output : VertexOut;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = 1.0;
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain(input : FragmentIn) -> @location(0) vec4<f32> {
+    _ = input.position.x;
+    return vec4<f32>(input.attribute1, 0.0, 0.0, 1.0);
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
+// Tests that interstage inputs could be a sparse non-prefix subset of the outputs.
+// This test is not in dawn_unittests/RenderPipelineValidationTests because we want to test the
+// compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesSparseSubset) {
+    std::string shader = R"(
+struct VertexOut {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : f32,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+struct FragmentIn {
+    @location(3) attribute3 : vec4<f32>,
+    @builtin(position) position : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> VertexOut {
+    var output : VertexOut;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = 1.0;
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain(input : FragmentIn) -> @location(0) vec4<f32> {
+    _ = input.position.x;
+    return input.attribute3;
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
+// Tests that interstage inputs could be a sparse non-prefix subset of the outputs, and that
+// fragment inputs are unused. This test is not in dawn_unittests/RenderPipelineValidationTests
+// because we want to test the compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesSparseSubsetUnused) {
+    std::string shader = R"(
+struct VertexOut {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : f32,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+struct FragmentIn {
+    @location(3) attribute3 : vec4<f32>,
+    @builtin(position) position : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> VertexOut {
+    var output : VertexOut;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = 1.0;
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain(input : FragmentIn) -> @location(0) vec4<f32> {
+    return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
+// Tests that interstage inputs could be empty when outputs are not.
+// This test is not in dawn_unittests/RenderPipelineValidationTests because we want to test the
+// compilation of the pipeline in D3D12 backend.
+TEST_P(ShaderTests, WGSLInterstageVariablesEmptySubset) {
+    std::string shader = R"(
+struct VertexOut {
+    @builtin(position) position : vec4<f32>,
+    @location(1) attribute1 : f32,
+    @location(3) attribute3 : vec4<f32>,
+}
+
+@vertex
+fn vertexMain() -> VertexOut {
+    var output : VertexOut;
+    output.position = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    output.attribute1 = 1.0;
+    output.attribute3 = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+    return output;
+}
+
+@fragment
+fn fragmentMain() -> @location(0) vec4<f32> {
+    return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+})";
+    wgpu::ShaderModule shaderModule = utils::CreateShaderModule(device, shader.c_str());
+
+    utils::ComboRenderPipelineDescriptor rpDesc;
+    rpDesc.vertex.module = shaderModule;
+    rpDesc.vertex.entryPoint = "vertexMain";
+    rpDesc.cFragment.module = shaderModule;
+    rpDesc.cFragment.entryPoint = "fragmentMain";
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&rpDesc);
+}
+
 // This is a regression test for an issue caused by the FirstIndexOffset transfrom being done before
 // the BindingRemapper, causing an intermediate AST to be invalid (and fail the overall
 // compilation).
@@ -949,8 +1180,195 @@
     EXPECT_BUFFER_U32_EQ(2, buf, 0);
 }
 
+// Test that when fragment input is a subset of the vertex output, the render pipeline should be
+// valid.
+TEST_P(ShaderTests, FragmentInputIsSubsetOfVertexOutput) {
+    wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @location(1) var1: f32,
+    @location(3) @interpolate(flat) var3: u32,
+    @location(5) @interpolate(flat) var5: i32,
+    @location(7) var7: f32,
+    @location(9) @interpolate(flat) var9: u32,
+    @builtin(position) pos: vec4<f32>,
+}
+
+@vertex fn main(@builtin(vertex_index) VertexIndex : u32)
+     -> ShaderIO {
+  var pos = array<vec2<f32>, 3>(
+      vec2<f32>(-1.0, 3.0),
+      vec2<f32>(-1.0, -3.0),
+      vec2<f32>(3.0, 0.0));
+
+  var shaderIO: ShaderIO;
+  shaderIO.var1 = 0.0;
+  shaderIO.var3 = 1u;
+  shaderIO.var5 = -9;
+  shaderIO.var7 = 1.0;
+  shaderIO.var9 = 0u;
+  shaderIO.pos = vec4<f32>(pos[VertexIndex], 0.0, 1.0);
+
+  return shaderIO;
+})");
+
+    wgpu::ShaderModule fsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @location(3) @interpolate(flat) var3: u32,
+    @location(7) var7: f32,
+}
+
+@fragment fn main(io: ShaderIO)
+    -> @location(0) vec4<f32> {
+    return vec4<f32>(f32(io.var3), io.var7, 1.0, 1.0);
+})");
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
+
+    utils::ComboRenderPipelineDescriptor descriptor;
+    descriptor.vertex.module = vsModule;
+    descriptor.cFragment.module = fsModule;
+    descriptor.primitive.topology = wgpu::PrimitiveTopology::TriangleList;
+    descriptor.cTargets[0].format = renderPass.colorFormat;
+
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&descriptor);
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.SetPipeline(pipeline);
+    pass.Draw(3);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8(255, 255, 255, 255), renderPass.color, 0, 0);
+}
+
+// Test that when fragment input is a subset of the vertex output and the order of them is
+// different, the render pipeline should be valid.
+TEST_P(ShaderTests, FragmentInputIsSubsetOfVertexOutputWithDifferentOrder) {
+    wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @location(5) @align(16) var5: f32,
+    @location(1) var1: f32,
+    @location(2) var2: f32,
+    @location(3) @align(8) var3: f32,
+    @location(4) var4: vec4<f32>,
+    @builtin(position) pos: vec4<f32>,
+}
+
+@vertex fn main(@builtin(vertex_index) VertexIndex : u32)
+     -> ShaderIO {
+  var pos = array<vec2<f32>, 3>(
+      vec2<f32>(-1.0, 3.0),
+      vec2<f32>(-1.0, -3.0),
+      vec2<f32>(3.0, 0.0));
+
+  var shaderIO: ShaderIO;
+  shaderIO.var1 = 0.0;
+  shaderIO.var2 = 0.0;
+  shaderIO.var3 = 1.0;
+  shaderIO.var4 = vec4<f32>(0.4, 0.4, 0.4, 0.4);
+  shaderIO.var5 = 1.0;
+  shaderIO.pos = vec4<f32>(pos[VertexIndex], 0.0, 1.0);
+
+  return shaderIO;
+})");
+
+    wgpu::ShaderModule fsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @location(4) var4: vec4<f32>,
+    @location(1) var1: f32,
+    @location(5) @align(16) var5: f32,
+}
+
+@fragment fn main(io: ShaderIO)
+    -> @location(0) vec4<f32> {
+    return vec4<f32>(io.var1, io.var5, io.var4.x, 1.0);
+})");
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
+
+    utils::ComboRenderPipelineDescriptor descriptor;
+    descriptor.vertex.module = vsModule;
+    descriptor.cFragment.module = fsModule;
+    descriptor.primitive.topology = wgpu::PrimitiveTopology::TriangleList;
+    descriptor.cTargets[0].format = renderPass.colorFormat;
+
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&descriptor);
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.SetPipeline(pipeline);
+    pass.Draw(3);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8(0, 255, 102, 255), renderPass.color, 0, 0);
+}
+
+// Test that when fragment input is a subset of the vertex output and the builtin interstage
+// variables are declared in a different order, the render pipeline should be valid.
+TEST_P(ShaderTests, FragmentInputIsSubsetOfVertexOutputBuiltinOrder) {
+    wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @location(1) var1: f32,
+    @builtin(position) pos: vec4<f32>,
+    @location(8) var8: vec3<f32>,
+    @location(7) var7: f32,
+}
+
+@vertex fn main(@builtin(vertex_index) VertexIndex : u32)
+     -> ShaderIO {
+  var pos = array<vec2<f32>, 3>(
+      vec2<f32>(-1.0, 3.0),
+      vec2<f32>(-1.0, -3.0),
+      vec2<f32>(3.0, 0.0));
+
+  var shaderIO: ShaderIO;
+  shaderIO.var1 = 0.0;
+  shaderIO.var7 = 1.0;
+  shaderIO.var8 = vec3<f32>(1.0, 0.4, 0.0);
+  shaderIO.pos = vec4<f32>(pos[VertexIndex], 0.0, 1.0);
+
+  return shaderIO;
+})");
+
+    wgpu::ShaderModule fsModule = utils::CreateShaderModule(device, R"(
+struct ShaderIO {
+    @builtin(position) pos: vec4<f32>,
+    @location(7) var7: f32,
+}
+
+@fragment fn main(io: ShaderIO)
+    -> @location(0) vec4<f32> {
+    return vec4<f32>(0.0, io.var7, 0.4, 1.0);
+})");
+
+    utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, 1, 1);
+
+    utils::ComboRenderPipelineDescriptor descriptor;
+    descriptor.vertex.module = vsModule;
+    descriptor.cFragment.module = fsModule;
+    descriptor.primitive.topology = wgpu::PrimitiveTopology::TriangleList;
+    descriptor.cTargets[0].format = renderPass.colorFormat;
+
+    wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&descriptor);
+
+    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+    wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
+    pass.SetPipeline(pipeline);
+    pass.Draw(3);
+    pass.End();
+    wgpu::CommandBuffer commands = encoder.Finish();
+    queue.Submit(1, &commands);
+
+    EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8(0, 255, 102, 255), renderPass.color, 0, 0);
+}
+
 DAWN_INSTANTIATE_TEST(ShaderTests,
                       D3D12Backend(),
+                      D3D12Backend({"use_dxc"}),
                       MetalBackend(),
                       OpenGLBackend(),
                       OpenGLESBackend(),
diff --git a/src/dawn/tests/unittests/validation/RenderPipelineValidationTests.cpp b/src/dawn/tests/unittests/validation/RenderPipelineValidationTests.cpp
index 5271228..675b1d9 100644
--- a/src/dawn/tests/unittests/validation/RenderPipelineValidationTests.cpp
+++ b/src/dawn/tests/unittests/validation/RenderPipelineValidationTests.cpp
@@ -1390,8 +1390,7 @@
     }
 };
 
-// Tests that creating render pipeline should fail when there is a vertex output that doesn't have
-// its corresponding fragment input at the same location, and there is a fragment input that
+// Tests that creating render pipeline should fail when there is a fragment input that
 // doesn't have its corresponding vertex output at the same location.
 TEST_F(InterStageVariableMatchingValidationTest, MissingDeclarationAtSameLocation) {
     wgpu::ShaderModule vertexModuleOutputAtLocation0 = utils::CreateShaderModule(device, R"(
@@ -1430,7 +1429,10 @@
             })");
 
     {
-        CheckCreatingRenderPipeline(vertexModuleOutputAtLocation0, fsModule, false);
+        // It is okay if the fragment input is a subset of the vertex output.
+        CheckCreatingRenderPipeline(vertexModuleOutputAtLocation0, fsModule, true);
+    }
+    {
         CheckCreatingRenderPipeline(vsModule, fragmentModuleAtLocation0, false);
         CheckCreatingRenderPipeline(vertexModuleOutputAtLocation0, fragmentModuleInputAtLocation1,
                                     false);
diff --git a/src/dawn/tests/unittests/validation/ShaderModuleValidationTests.cpp b/src/dawn/tests/unittests/validation/ShaderModuleValidationTests.cpp
index 63aafac..4d788e3 100644
--- a/src/dawn/tests/unittests/validation/ShaderModuleValidationTests.cpp
+++ b/src/dawn/tests/unittests/validation/ShaderModuleValidationTests.cpp
@@ -278,10 +278,13 @@
         }
 
         if (success) {
-            ASSERT_DEVICE_ERROR(
-                device.CreateRenderPipeline(&pDesc),
-                testing::HasSubstr(
-                    "One or more fragment inputs and vertex outputs are not one-to-one matching"));
+            if (failingShaderStage == wgpu::ShaderStage::Vertex) {
+                // It is allowed that fragment inputs are a subset of the vertex output variables.
+                device.CreateRenderPipeline(&pDesc);
+            } else {
+                ASSERT_DEVICE_ERROR(device.CreateRenderPipeline(&pDesc),
+                                    testing::HasSubstr("The fragment input at location"));
+            }
         } else {
             ASSERT_DEVICE_ERROR(device.CreateRenderPipeline(&pDesc),
                                 testing::HasSubstr(errorMatcher));
@@ -401,10 +404,13 @@
         }
 
         if (success) {
-            ASSERT_DEVICE_ERROR(
-                device.CreateRenderPipeline(&pDesc),
-                testing::HasSubstr(
-                    "One or more fragment inputs and vertex outputs are not one-to-one matching"));
+            if (failingShaderStage == wgpu::ShaderStage::Vertex) {
+                // It is allowed that fragment inputs are a subset of the vertex output variables.
+                device.CreateRenderPipeline(&pDesc);
+            } else {
+                ASSERT_DEVICE_ERROR(device.CreateRenderPipeline(&pDesc),
+                                    testing::HasSubstr("The fragment input at location"));
+            }
         } else {
             ASSERT_DEVICE_ERROR(device.CreateRenderPipeline(&pDesc),
                                 testing::HasSubstr(errorMatcher));
diff --git a/src/tint/writer/hlsl/generator.h b/src/tint/writer/hlsl/generator.h
index 9974cde..c624943 100644
--- a/src/tint/writer/hlsl/generator.h
+++ b/src/tint/writer/hlsl/generator.h
@@ -15,6 +15,7 @@
 #ifndef SRC_TINT_WRITER_HLSL_GENERATOR_H_
 #define SRC_TINT_WRITER_HLSL_GENERATOR_H_
 
+#include <bitset>
 #include <memory>
 #include <optional>
 #include <string>
@@ -25,6 +26,7 @@
 #include "src/tint/ast/pipeline_stage.h"
 #include "src/tint/reflection.h"
 #include "src/tint/sem/binding_point.h"
+#include "src/tint/utils/bitset.h"
 #include "src/tint/writer/array_length_from_uniform_options.h"
 #include "src/tint/writer/text.h"
 
@@ -56,6 +58,9 @@
     /// Options used to specify a mapping of binding points to indices into a UBO
     /// from which to load buffer sizes.
     ArrayLengthFromUniformOptions array_length_from_uniform = {};
+    /// Interstage locations actually used as inputs in the next stage of the pipeline.
+    /// This is potentially used for truncating unused interstage outputs at current shader stage.
+    std::bitset<16> interstage_locations;
 
     /// Reflect the fields of this class so that it can be used by tint::ForeachField()
     TINT_REFLECT(root_constant_binding_point,
diff --git a/src/tint/writer/hlsl/generator_impl.cc b/src/tint/writer/hlsl/generator_impl.cc
index 9cfe5d7..e3abc51 100644
--- a/src/tint/writer/hlsl/generator_impl.cc
+++ b/src/tint/writer/hlsl/generator_impl.cc
@@ -64,6 +64,7 @@
 #include "src/tint/transform/remove_continue_in_switch.h"
 #include "src/tint/transform/remove_phonies.h"
 #include "src/tint/transform/simplify_pointers.h"
+#include "src/tint/transform/truncate_interstage_variables.h"
 #include "src/tint/transform/unshadow.h"
 #include "src/tint/transform/vectorize_scalar_matrix_initializers.h"
 #include "src/tint/transform/zero_init_workgroup_memory.h"
@@ -210,6 +211,27 @@
         manager.Add<transform::ZeroInitWorkgroupMemory>();
     }
     manager.Add<transform::CanonicalizeEntryPointIO>();
+
+    if (options.interstage_locations.any()) {
+        // When interstage_locations is empty, no user-defined interstage variables are used in
+        // the next stage. This is treated as a special case.
+        // The TruncateInterstageVariables transform works around an HLSL compiler register
+        // mismatch issue, so it is not needed if no register is assigned to any interstage
+        // variable. As a result the transform is only added when at least one interstage
+        // location is being used.
+
+        // TruncateInterstageVariables itself will skip when interstage_locations matches exactly
+        // with the current stage output.
+
+        // Build the config for the internal TruncateInterstageVariables transform.
+        transform::TruncateInterstageVariables::Config truncate_interstage_variables_cfg;
+        // A bitset copy is trivial, so copy it directly; std::move would add nothing here.
+        truncate_interstage_variables_cfg.interstage_locations = options.interstage_locations;
+        manager.Add<transform::TruncateInterstageVariables>();
+        data.Add<transform::TruncateInterstageVariables::Config>(
+            std::move(truncate_interstage_variables_cfg));
+    }
+
     // NumWorkgroupsFromUniform must come after CanonicalizeEntryPointIO, as it
     // assumes that num_workgroups builtins only appear as struct members and are
     // only accessed directly via member accessors.