Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_linear_gemv.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,12 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
#include "linear_int_weight_sums_load.glslh"
#include "linear_fp_bias_load.glslh"

shared Int32Accum partial_accums[WGS];
// Array-of-arrays shared memory layout: partial_accums[lid][tile_n4].
// Each element is exactly 16 bytes (ivec4). This avoids the Samsung S25
// (Adreno 830) driver bug triggered by the original Int32Accum struct layout,
// where barrier() only invalidated the first 16-byte component of each
// 32-byte struct slot, leaving subsequent components stale.
shared ivec4 partial_accums[WGS][TILE_N4];

void main() {
const int lid = int(gl_LocalInvocationID.z);
Expand Down Expand Up @@ -104,19 +109,29 @@ void main() {
}
}

partial_accums[lid] = out_accum;
[[unroll]] for (int tile_n4 = 0; tile_n4 < TILE_N4; ++tile_n4) {
partial_accums[lid][tile_n4] = out_accum.data[0][tile_n4];
}

memoryBarrierShared();
barrier();

// Only the first thread writes the result
if (lid == 0) {
for (int i = 1; i < WGS; ++i) {
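// Tree reduction over shared memory: O(log2(WGS)) barrier-separated steps.
// NOTE: the halving loop only folds in every partial sum when WGS is a power
// of two; with any other WGS some entries would never be accumulated.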
for (int i = WGS / 2; i > 0; i /= 2) {
if (lid < i) {
[[unroll]] for (int tile_n4 = 0; tile_n4 < TILE_N4; ++tile_n4) {
out_accum.data[0][tile_n4] +=
partial_accums[i].data[0][tile_n4];
partial_accums[lid][tile_n4] += partial_accums[lid + i][tile_n4];
}
}
memoryBarrierShared();
barrier();
}

// Only the first thread writes the result
if (lid == 0) {
[[unroll]] for (int tile_n4 = 0; tile_n4 < TILE_N4; ++tile_n4) {
out_accum.data[0][tile_n4] = partial_accums[0][tile_n4];
}

FPPerOutChannelParams weight_scales_tile;
load_weight_scales_tile(weight_scales_tile, n4);
Expand Down
Loading