dawn_native/metal: Round workgroup memory size up to a multiple of 16 bytes

The documentation for setThreadgroupMemoryLength states:
  length - The size of the threadgroup memory, in bytes. Must be a multiple of 16 bytes.

Passing a length that is not a multiple of 16 bytes is the likely cause of a number of CTS flakes for 'webgpu:shader,execution,zero_init:compute,zero_init:storageClass="workgroup";*'

Bug: dawn:1277
Change-Id: Ib8c271fccacac6e68cdf0ddb5c6b5a41756173b5
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/78801
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Commit-Queue: Ben Clayton <bclayton@chromium.org>
diff --git a/src/dawn_native/metal/ComputePipelineMTL.mm b/src/dawn_native/metal/ComputePipelineMTL.mm
index 9f5255f..9d0f563 100644
--- a/src/dawn_native/metal/ComputePipelineMTL.mm
+++ b/src/dawn_native/metal/ComputePipelineMTL.mm
@@ -14,6 +14,7 @@
 
 #include "dawn_native/metal/ComputePipelineMTL.h"
 
+#include "common/Math.h"
 #include "dawn_native/CreatePipelineAsyncTask.h"
 #include "dawn_native/metal/DeviceMTL.h"
 #include "dawn_native/metal/ShaderModuleMTL.h"
@@ -62,7 +63,9 @@
             if (mWorkgroupAllocations[i] == 0) {
                 continue;
             }
-            [encoder setThreadgroupMemoryLength:mWorkgroupAllocations[i] atIndex:i];
+            // Size must be a multiple of 16 bytes.
+            uint32_t rounded = Align<uint32_t>(mWorkgroupAllocations[i], 16);
+            [encoder setThreadgroupMemoryLength:rounded atIndex:i];
         }
     }