perf_tests: Explain block matrix multiply better

Add comments, and replace magic numbers with their semantic
computations.

Parameterize the matrix component type.

Change-Id: I6fae7b36d3ca6dc85cde1f5c26b28c50e355a23f
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/178260
Kokoro: Kokoro <noreply+kokoro@google.com>
Auto-Submit: David Neto <dneto@google.com>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/tests/perf_tests/ShaderRobustnessPerf.cpp b/src/dawn/tests/perf_tests/ShaderRobustnessPerf.cpp
index cdecf41..2b897a2 100644
--- a/src/dawn/tests/perf_tests/ShaderRobustnessPerf.cpp
+++ b/src/dawn/tests/perf_tests/ShaderRobustnessPerf.cpp
@@ -25,6 +25,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -35,15 +36,27 @@
 namespace {
 
 constexpr uint32_t kTileSize = 32u;
+constexpr uint32_t kWorkgroupSizeX = 8u;
+constexpr uint32_t kWorkgroupSizeY = 8u;
+static_assert(kTileSize % kWorkgroupSizeX == 0,
+              "workgroup width must evenly divide tile dimension");
+static_assert(kTileSize % kWorkgroupSizeY == 0,
+              "workgroup height must evenly divide tile dimension");
 
-const std::string& kMatMulFloatHeader = R"(
+std::string GenMatMulFloatHeader() {
+    std::stringstream ss;
+    ss << "const kTileSize = " << kTileSize << ";  // 32\n";
+    ss << "const kWorkgroupSizeX = " << kWorkgroupSizeX << "u; // 8;\n";
+    ss << "const kWorkgroupSizeY = " << kWorkgroupSizeY << "u; // 8;\n";
+    ss << R"(
+        alias ElemT = f32;
         struct Uniforms {
             dimAOuter : u32,
             dimInner : u32,
             dimBOuter : u32,
         }
         struct Matrix {
-            numbers: array<f32>
+            numbers: array<ElemT>
         }
 
         @group(0) @binding(0) var<storage, read> firstMatrix : Matrix;
@@ -51,25 +64,25 @@
         @group(0) @binding(2) var<storage, read_write> resultMatrix : Matrix;
         @group(0) @binding(3) var<uniform> uniforms : Uniforms;
 
-        fn mm_readA(row : u32, col : u32) -> f32  {
+        fn mm_readA(row : u32, col : u32) -> ElemT  {
             if (row < uniforms.dimAOuter && col < uniforms.dimInner)
             {
-                let result : f32 = firstMatrix.numbers[row * uniforms.dimInner + col];
+                let result : ElemT = firstMatrix.numbers[row * uniforms.dimInner + col];
                 return result;
             }
             return 0.;
         }
 
-        fn mm_readB(row : u32, col : u32) -> f32 {
+        fn mm_readB(row : u32, col : u32) -> ElemT {
             if (row < uniforms.dimInner && col < uniforms.dimBOuter)
             {
-                let result : f32 = secondMatrix.numbers[row * uniforms.dimBOuter + col];
+                let result : ElemT = secondMatrix.numbers[row * uniforms.dimBOuter + col];
                 return result;
             }
             return 0.;
         }
 
-        fn mm_write(row : u32, col : u32, value : f32) {
+        fn mm_write(row : u32, col : u32, value : ElemT) {
             if (row < uniforms.dimAOuter && col < uniforms.dimBOuter)
             {
                 let index : u32 = col + row * uniforms.dimBOuter;
@@ -77,44 +90,55 @@
             }
         }
 
-        const RowPerThread : u32 = 4u;
-        const ColPerThread : u32 = 4u;
-        const TileAOuter : u32 = 32u;
-        const TileBOuter : u32 = 32u;
-        const TileInner : u32 = 32u;)";
+        const RowPerThread : u32 = kTileSize / kWorkgroupSizeY; // 4
+        const ColPerThread : u32 = kTileSize / kWorkgroupSizeX; // 4
+        const TileBOuter : u32 = kTileSize;
+        const TileInner : u32 = kTileSize;
+        )";
+    return ss.str();
+}
 
 const std::string& kMatMulFloatSharedArray1D = R"(
-        var<workgroup> mm_Asub : array<f32, 1024>;
-        var<workgroup> mm_Bsub : array<f32, 1024>;)";
+        var<workgroup> mm_Asub : array<ElemT, kTileSize * kTileSize>;
+        var<workgroup> mm_Bsub : array<ElemT, kTileSize * kTileSize>;)";
 const std::string& kMatMulFloatSharedArray2D = R"(
-        var<workgroup> mm_Asub : array<array<f32, 32>, 32>;
-        var<workgroup> mm_Bsub : array<array<f32, 32>, 32>;)";
+        var<workgroup> mm_Asub : array<array<ElemT, kTileSize>, kTileSize>;
+        var<workgroup> mm_Bsub : array<array<ElemT, kTileSize>, kTileSize>;)";
 const std::string& kMatMulFloatBodyPart1 = R"(
-        @compute @workgroup_size(8, 8, 1)
+
+        @compute @workgroup_size(kWorkgroupSizeX, kWorkgroupSizeY, 1)
         fn main(@builtin(local_invocation_id) local_id : vec3u,
                 @builtin(global_invocation_id) global_id  : vec3u) {
+            // This invocation is responsible the region in the current tile with
+            //   rows in [ tileRow, tileRow + RowPerThread -1 ]
+            //   cols in [ tileCol, tileCol + ColPerThread -1 ]
             let tileRow : u32 = local_id.y * RowPerThread;
             let tileCol : u32 = local_id.x * ColPerThread;
 
+            // This invocation is responsible the region in the output matrix with
+            //   rows in [ globalRow, globalRow + RowPerThread -1 ]
+            //   cols in [ globalCol, globalCol + ColPerThread -1 ]
             let globalRow : u32 = global_id.y * RowPerThread;
             let globalCol : u32 = global_id.x * ColPerThread;
 
             let numTiles : u32 = (uniforms.dimInner - 1u) / TileInner + 1u;
 
-            var acc: array<f32, 16>;
-            var ACached : f32;
-            var BCached : array<f32, 4>;
+            var acc: array<ElemT, RowPerThread * ColPerThread>;
+            var ACached : ElemT;
+            var BCached : array<ElemT, ColPerThread>;
 
-            let ColPerThreadA : u32 = TileInner / 8u;
+            // Define the region within the current tile that this thread
+            // is responsible for loading into the cache.
+            let ColPerThreadA : u32 = TileInner / kWorkgroupSizeX;
             let tileColA : u32 = local_id.x * ColPerThreadA;
-            let RowPerThreadB : u32 = TileInner / 8u;
+            let RowPerThreadB : u32 = TileInner / kWorkgroupSizeY;
             let tileRowB : u32 = local_id.y * RowPerThreadB;
 
             // Loop over shared dimension.
-            for (var t : u32 = 0u; t < numTiles; t = t + 1u) {
+            for (var t : u32 = 0u; t < numTiles; t++) {
                 // Load one tile of A into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
-                for (var innerCol : u32 = 0u; innerCol < ColPerThreadA; innerCol = innerCol + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
+                for (var innerCol : u32 = 0u; innerCol < ColPerThreadA; innerCol++) {
                     let inputRow : u32 = tileRow + innerRow;
                     let inputCol : u32 = tileColA + innerCol;)";
 const std::string& kMatMulFloatBodyPart2Array1D = R"(
@@ -123,8 +147,8 @@
                 }
                 }
                 // Load one tile of B into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
-                for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow++) {
+                for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol++) {
                     let inputRow : u32 = tileRowB + innerRow;
                     let inputCol : u32 = tileCol + innerCol;
                     let index : u32 = inputRow * TileBOuter + inputCol;
@@ -136,20 +160,20 @@
                 workgroupBarrier();
 
                 // Compute acc values for a single thread.
-                for (var k : u32 = 0u; k < TileInner; k = k + 1u) {
-                    for (var inner : u32 = 0u; inner < ColPerThread; inner = inner + 1u) {
+                for (var k : u32 = 0u; k < TileInner; k++) {
+                    for (var inner : u32 = 0u; inner < ColPerThread; inner++) {
                         BCached[inner] = mm_Bsub[k * TileBOuter + tileCol + inner];
                     }
 
-                    for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
+                    for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
                         ACached = mm_Asub[(tileRow + innerRow) * TileInner + k];)";
 const std::string& kMatMulFloatBodyPart2Array2D = R"(
                     mm_Asub[inputRow][inputCol] = mm_readA(globalRow + innerRow, t * TileInner + inputCol);
                 }
                 }
                 // Load one tile of B into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
-                for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow++) {
+                for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol++) {
                     let inputRow : u32 = tileRowB + innerRow;
                     let inputCol : u32 = tileCol + innerCol;
 
@@ -160,15 +184,15 @@
                 workgroupBarrier();
 
                 // Compute acc values for a single thread.
-                for (var k : u32 = 0u; k < TileInner; k = k + 1u) {
-                    for (var inner : u32 = 0u; inner < ColPerThread; inner = inner + 1u) {
+                for (var k : u32 = 0u; k < TileInner; k++) {
+                    for (var inner : u32 = 0u; inner < ColPerThread; inner++) {
                         BCached[inner] = mm_Bsub[k][tileCol + inner];
                     }
 
-                    for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
+                    for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
                         ACached = mm_Asub[tileRow + innerRow][k];)";
 const std::string& kMatMulFloatBodyPart3 = R"(
-                        for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
+                        for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol++) {
                             let index : u32 = innerRow * ColPerThread + innerCol;
                             acc[index] = acc[index] + ACached * BCached[innerCol];
                         }
@@ -178,8 +202,8 @@
                 workgroupBarrier();
             }
 
-            for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
-            for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol = innerCol + 1u) {
+            for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
+            for (var innerCol : u32 = 0u; innerCol < ColPerThread; innerCol++) {
                 let index : u32 = innerRow * ColPerThread + innerCol;
                 mm_write(globalRow + innerRow,
                          globalCol + innerCol,
@@ -187,23 +211,33 @@
             }
             }
         })";
-const std::string& kMatMulFloatOneDimensionalSharedArray =
-    kMatMulFloatHeader + kMatMulFloatSharedArray1D + kMatMulFloatBodyPart1 +
-    kMatMulFloatBodyPart2Array1D + kMatMulFloatBodyPart3;
+std::string GenMatMulFloatOneDimensionalSharedArray() {
+    return GenMatMulFloatHeader() + kMatMulFloatSharedArray1D + kMatMulFloatBodyPart1 +
+           kMatMulFloatBodyPart2Array1D + kMatMulFloatBodyPart3;
+}
 
-const std::string& kMatMulFloatTwoDimensionalSharedArray =
-    kMatMulFloatHeader + kMatMulFloatSharedArray2D + kMatMulFloatBodyPart1 +
-    kMatMulFloatBodyPart2Array2D + kMatMulFloatBodyPart3;
+std::string GenMatMulFloatTwoDimensionalSharedArray() {
+    return GenMatMulFloatHeader() + kMatMulFloatSharedArray2D + kMatMulFloatBodyPart1 +
+           kMatMulFloatBodyPart2Array2D + kMatMulFloatBodyPart3;
+}
 
 // The vec4 version requires that dimInner and dimBOuter are divisible by 4.
-const std::string& kMatMulVec4Header = R"(
+std::string GenMatMulVec4Header() {
+    std::stringstream ss;
+    ss << "const kTileSize = " << kTileSize << ";  // 32\n";
+    ss << "const kWorkgroupSizeX = " << kWorkgroupSizeX << "u; // 8;\n";
+    ss << "const kWorkgroupSizeY = " << kWorkgroupSizeY << "u; // 8;\n";
+    ss << R"(
+        alias ElemT = f32;
+        alias VecT = vec4<ElemT>;
+        const VecLen = 4;
         struct Uniforms {
             dimAOuter : u32,
             dimInner : u32,
             dimBOuter : u32,
         }
         struct Matrix {
-            numbers: array<vec4f>
+            numbers: array<VecT>
         }
 
         @group(0) @binding(0) var<storage, read> firstMatrix : Matrix;
@@ -211,25 +245,25 @@
         @group(0) @binding(2) var<storage, read_write> resultMatrix : Matrix;
         @group(0) @binding(3) var<uniform> uniforms : Uniforms;
 
-        fn mm_readA(row : u32, col : u32) -> vec4f  {
+        fn mm_readA(row : u32, col : u32) -> VecT  {
             if (row < uniforms.dimAOuter && col < uniforms.dimInner)
             {
-                let result : vec4f = firstMatrix.numbers[row * uniforms.dimInner / 4u + col];
+                let result : VecT = firstMatrix.numbers[row * uniforms.dimInner / 4u + col];
                 return result;
             }
-            return vec4f(0.0, 0.0, 0.0, 0.0);
+            return VecT(0.0, 0.0, 0.0, 0.0);
         }
 
-        fn mm_readB(row : u32, col : u32) -> vec4f {
+        fn mm_readB(row : u32, col : u32) -> VecT {
             if (row < uniforms.dimInner && col < uniforms.dimBOuter)
             {
-                let result : vec4f = secondMatrix.numbers[row * uniforms.dimBOuter / 4u + col];
+                let result : VecT = secondMatrix.numbers[row * uniforms.dimBOuter / 4u + col];
                 return result;
             }
-            return vec4f(0.0, 0.0, 0.0, 0.0);
+            return VecT(0.0, 0.0, 0.0, 0.0);
         }
 
-        fn mm_write(row : u32, col : u32, value : vec4f) {
+        fn mm_write(row : u32, col : u32, value : VecT) {
             if (row < uniforms.dimAOuter && col < uniforms.dimBOuter)
             {
                 let index : u32 = col + row * uniforms.dimBOuter / 4u;
@@ -237,16 +271,23 @@
             }
         }
 
-        const RowPerThread : u32 = 4u;
-        const ColPerThread : u32 = 4u;
-        const TileOuter : u32 = 32u;
-        const TileInner : u32 = 32u;)";
+        const RowPerThread : u32 = kTileSize / kWorkgroupSizeY; // 4
+        const ColPerThread : u32 = kTileSize / kWorkgroupSizeX; // 4
+        const TileOuter : u32 = kTileSize;
+        const TileInner : u32 = kTileSize;
+
+        // The code below uses an unrolled loop to fill BCached.
+        // If this count changes, then you also have to modify that code.
+        const_assert ColPerThread == 4;
+        )";
+    return ss.str();
+}
 const std::string& kMatMulVec4SharedArray1D = R"(
-        var<workgroup> mm_Asub : array<vec4f, 256>;
-        var<workgroup> mm_Bsub : array<vec4f, 256>;)";
+        var<workgroup> mm_Asub : array<VecT, 256>;
+        var<workgroup> mm_Bsub : array<VecT, 256>;)";
 const std::string& kMatMulVec4SharedArray2D = R"(
-        var<workgroup> mm_Asub : array<array<vec4f, 8>, 32>;
-        var<workgroup> mm_Bsub : array<array<vec4f, 8>, 32>;)";
+        var<workgroup> mm_Asub : array<array<VecT, 8>, kTileSize>;
+        var<workgroup> mm_Bsub : array<array<VecT, 8>, kTileSize>;)";
 const std::string& kMatMulVec4BodyPart1 = R"(
         @compute @workgroup_size(8, 8, 1)
         fn main(@builtin(local_invocation_id) local_id : vec3u,
@@ -259,18 +300,18 @@
 
             let numTiles : u32 = (uniforms.dimInner - 1u) / TileInner + 1u;
 
-            var acc: array<vec4f, 4>;
-            var ACached : vec4f;
-            var BCached : array<vec4f, 4>;
+            var acc: array<VecT, ColPerThread * RowPerThread / VecLen >;
+            var ACached : VecT;
+            var BCached : array<VecT, ColPerThread>;
 
             var globalColA : u32 = tileCol;
-            let RowPerThreadB : u32 = TileInner / 8u;
+            let RowPerThreadB : u32 = TileInner / kWorkgroupSizeY;
             let tileRowB : u32 = local_id.y * RowPerThreadB;
 
             // Loop over shared dimension.
-            for (var t : u32 = 0u; t < numTiles; t = t + 1u) {
+            for (var t : u32 = 0u; t < numTiles; t++) {
                 // Load one tile of A into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
                     let inputRow : u32 = tileRow + innerRow;
                     let inputCol : u32 = tileCol;)";
 const std::string& kMatMulVec4BodyPart2Array1D = R"(
@@ -280,7 +321,7 @@
                 globalColA = globalColA + TileInner / ColPerThread;
 
                 // Load one tile of B into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow++) {
                     let inputRow : u32 = tileRowB + innerRow;
                     let inputCol : u32 = tileCol;
                     let index : u32 = inputRow * TileOuter / ColPerThread + inputCol;
@@ -290,7 +331,7 @@
                 workgroupBarrier();
 
                 // Compute acc values for a single thread.
-                for (var k : u32 = 0u; k < TileInner / ColPerThread; k = k + 1u) {
+                for (var k : u32 = 0u; k < TileInner / ColPerThread; k++) {
                     BCached[0] = mm_Bsub[(k * ColPerThread) * (TileOuter / ColPerThread) + tileCol];
                     BCached[1] = mm_Bsub[(k * ColPerThread + 1u) * (TileOuter / ColPerThread) + tileCol];
                     BCached[2] = mm_Bsub[(k * ColPerThread + 2u) * (TileOuter / ColPerThread) + tileCol];
@@ -304,7 +345,7 @@
                 globalColA = globalColA + TileInner / ColPerThread;
 
                 // Load one tile of B into local memory.
-                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow = innerRow + 1u) {
+                for (var innerRow : u32 = 0u; innerRow < RowPerThreadB; innerRow++) {
                     let inputRow : u32 = tileRowB + innerRow;
                     let inputCol : u32 = tileCol;
                     mm_Bsub[inputRow][inputCol] = mm_readB(t * TileInner + inputRow, globalCol);;
@@ -313,7 +354,7 @@
                 workgroupBarrier();
 
                 // Compute acc values for a single thread.
-                for (var k : u32 = 0u; k < TileInner / ColPerThread; k = k + 1u) {
+                for (var k : u32 = 0u; k < TileInner / ColPerThread; k++) {
                     BCached[0] = mm_Bsub[k * ColPerThread][tileCol];
                     BCached[1] = mm_Bsub[k * ColPerThread + 1u][tileCol];
                     BCached[2] = mm_Bsub[k * ColPerThread + 2u][tileCol];
@@ -332,20 +373,22 @@
                 workgroupBarrier();
             }
 
-            for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow = innerRow + 1u) {
+            for (var innerRow : u32 = 0u; innerRow < RowPerThread; innerRow++) {
                 mm_write(globalRow + innerRow,
                          globalCol,
                          acc[innerRow]);
             }
         })";
 
-const std::string& kMatMulVec4OneDimensionalSharedArray =
-    kMatMulVec4Header + kMatMulVec4SharedArray1D + kMatMulVec4BodyPart1 +
-    kMatMulVec4BodyPart2Array1D + kMatMulVec4BodyPart3;
+std::string GenMatMulVec4OneDimensionalSharedArray() {
+    return GenMatMulVec4Header() + kMatMulVec4SharedArray1D + kMatMulVec4BodyPart1 +
+           kMatMulVec4BodyPart2Array1D + kMatMulVec4BodyPart3;
+}
 
-const std::string& kMatMulVec4TwoDimensionalSharedArray =
-    kMatMulVec4Header + kMatMulVec4SharedArray2D + kMatMulVec4BodyPart1 +
-    kMatMulVec4BodyPart2Array2D + kMatMulVec4BodyPart3;
+std::string GenMatMulVec4TwoDimensionalSharedArray() {
+    return GenMatMulVec4Header() + kMatMulVec4SharedArray2D + kMatMulVec4BodyPart1 +
+           kMatMulVec4BodyPart2Array2D + kMatMulVec4BodyPart3;
+}
 
 constexpr unsigned int kNumIterations = 50;
 
@@ -428,32 +471,29 @@
     wgpu::Buffer uniformBuffer = utils::CreateBufferFromData(
         device, uniformData, sizeof(uniformData), wgpu::BufferUsage::Uniform);
 
-    wgpu::ShaderModule module;
+    std::string shader;
     switch (GetParam().mMatMulMethod) {
         case MatMulMethod::MatMulFloatOneDimSharedArray: {
-            module =
-                utils::CreateShaderModule(device, kMatMulFloatOneDimensionalSharedArray.c_str());
+            shader = GenMatMulFloatOneDimensionalSharedArray();
             break;
         }
 
         case MatMulMethod::MatMulFloatTwoDimSharedArray: {
-            module =
-                utils::CreateShaderModule(device, kMatMulFloatTwoDimensionalSharedArray.c_str());
+            shader = GenMatMulFloatTwoDimensionalSharedArray();
             break;
         }
 
         case MatMulMethod::MatMulVec4OneDimSharedArray: {
-            module =
-                utils::CreateShaderModule(device, kMatMulVec4OneDimensionalSharedArray.c_str());
+            shader = GenMatMulVec4OneDimensionalSharedArray();
             break;
         }
 
         case MatMulMethod::MatMulVec4TwoDimSharedArray: {
-            module =
-                utils::CreateShaderModule(device, kMatMulVec4TwoDimensionalSharedArray.c_str());
+            shader = GenMatMulVec4TwoDimensionalSharedArray();
             break;
         }
     }
+    wgpu::ShaderModule module = utils::CreateShaderModule(device, shader.c_str());
 
     wgpu::ComputePipelineDescriptor csDesc;
     csDesc.compute.module = module;