| #extension GL_AMD_gpu_shader_half_float : require |
| /* Uniform block bound at binding 0 with std140 layout; its members are not visible in this excerpt. */ layout(binding = 0, std140) uniform u_block_std140_ubo { |
| /* Converts a mat4x4_f16 value into an f16mat4. */ f16mat4 conv_mat4x4_f16(mat4x4_f16 val) { |
| /* The result is constructed from the fields col0, col1, col2, col3, in that order. */ return f16mat4(val.col0, val.col1, val.col2, val.col3); |
| /* Converts a 4-element array of mat4x4_f16 element by element; declared to return f16mat4[4] (the return statement is not visible in this excerpt). */ f16mat4[4] conv_arr4_mat4x4_f16(mat4x4_f16 val[4]) { |
| /* Start from an array of four matrices whose 16 components are all 0.0hf. */ f16mat4 arr[4] = f16mat4[4](f16mat4(0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf), f16mat4(0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf), f16mat4(0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf), f16mat4(0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf, 0.0hf)); |
| /* Visit indices 0 through 3. */ for(uint i = 0u; (i < 4u); i = (i + 1u)) { |
| /* Each slot is overwritten with the conv_mat4x4_f16 conversion of the matching input element. */ arr[i] = conv_mat4x4_f16(val[i]); |
| /* Writes to w and reads from u.inner; both w and u are declared outside this excerpt. */ void f(uint local_invocation_index) { |
| /* Loop starting at local_invocation_index and running while idx is below 4. */ for(uint idx = local_invocation_index; (idx < 4u); idx = (idx + 1u)) { |
| /* Store an all-zero matrix into w[i]. NOTE(review): no declaration of i is visible in this excerpt; presumably it is derived from idx on a line not shown here - confirm. */ w[i] = f16mat4(f16vec4(0.0hf), f16vec4(0.0hf), f16vec4(0.0hf), f16vec4(0.0hf)); |
| /* Overwrite all of w with the converted contents of u.inner. */ w = conv_arr4_mat4x4_f16(u.inner); |
| /* w[1] receives the conversion of u.inner element 2. */ w[1] = conv_mat4x4_f16(u.inner[2u]); |
| /* Column 0 of w[1] is assigned col1 of u.inner[0], with components reordered to y, w, x, z. */ w[1][0] = u.inner[0u].col1.ywxz; |
| /* The x component of w[1][0] is assigned component 0 of u.inner[0].col1. */ w[1][0].x = u.inner[0u].col1[0u]; |
| /* Input layout qualifier setting the local size to 1 in x, y and z. */ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; |
| /* Invoke f with the built-in gl_LocalInvocationIndex; the enclosing function header is not visible in this excerpt. */ f(gl_LocalInvocationIndex); |