vulkan: Update topk_moe fusion to handle gpt's late softmax (#16656)

* vulkan: Update topk_moe fusion to handle gpt's late softmax
  Based on #16649.
* Add ggml_check_edges
* Add sync logging to show fusion effects
* handle clamp added in #16655
* Update ggml/src/ggml-impl.h
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-10-29 08:44:29 -05:00
parent bcf5bda6f5
commit 10fcc41290
3 changed files with 275 additions and 141 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
@@ -11,6 +11,8 @@ layout (push_constant) uniform parameter
 {
    uint n_rows;
    uint n_expert_used;
+    float clamp_min;
+    float clamp_max;
 };

 layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
@@ -18,6 +20,7 @@ layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
 layout(constant_id = 0) const uint WARP_SIZE = 32;
 layout(constant_id = 1) const uint n_experts = 512;
 layout(constant_id = 2) const bool with_norm = true;
+layout(constant_id = 3) const bool late_softmax = false;

 const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;

@@ -25,6 +28,52 @@ layout (binding = 0, std430) readonly buffer Logits {float logits[];};
 layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
 layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};

+const float INFINITY = 1.0 / 0.0;
+
+// In-place softmax over values spread across a subgroup: slot i of lane `lane` has logical index lane + i * WARP_SIZE. With use_limit set, only indices < limit take part and every other slot is overwritten with 0. main() calls it on the logits before top-k (use_limit = false) and, when late_softmax is set, on the selected weights after top-k (use_limit = true, limit = n_expert_used).
+void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit, const uint lane, const bool use_limit) {
+    float max_val = -INFINITY; // identity element for max(), so a lane with no active slot does not influence the reduction
+
+    [[unroll]]
+    for (int i = 0; i < experts_per_thread; i++) { // pass 1: per-lane maximum over the active slots
+        const uint idx       = lane + i * WARP_SIZE; // logical index of slot i for this lane
+        const bool is_active = !use_limit || (idx < limit); // without use_limit every slot participates
+        if (is_active) {
+            max_val = max(max_val, vals[i]);
+        }
+    }
+
+    max_val = subgroupMax(max_val); // combine the per-lane maxima into one maximum shared by the subgroup
+
+    float sum = 0.f; // per-lane partial sum of the exponentials
+
+    [[unroll]]
+    for (int i = 0; i < experts_per_thread; i++) { // pass 2: exponentiate in place and accumulate the partial sum
+        const uint idx       = lane + i * WARP_SIZE;
+        const bool is_active = !use_limit || (idx < limit);
+        if (is_active) {
+            const float val = exp(vals[i] - max_val); // shifted by the max, so the largest term is exp(0) = 1
+            vals[i]         = val;
+            sum += val;
+        } else {
+            vals[i] = 0.f; // slots at or beyond limit are zeroed rather than left holding their input
+        }
+    }
+
+    sum = subgroupAdd(sum); // total of the exponentials across the subgroup
+
+    const float inv_sum = 1.0f / sum; // computed once so the final pass multiplies instead of dividing
+
+    [[unroll]]
+    for (int i = 0; i < experts_per_thread; i++) { // pass 3: normalize the active slots; inactive slots stay 0
+        const uint idx       = lane + i * WARP_SIZE;
+        const bool is_active = !use_limit || (idx < limit);
+        if (is_active) {
+            vals[i] *= inv_sum;
+        }
+    }
+}
+
 void main() {
    const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
    if (row >= n_rows) {
@@ -35,43 +84,16 @@ void main() {
    const uint weights_offset = n_expert_used * row;
    const uint ids_offset = n_experts * row;

-    float logits_r[experts_per_thread];
-
-    const float INFINITY = 1.0 / 0.0;
+    float wt[experts_per_thread];

    [[unroll]]
    for (uint i = 0; i < n_experts; i += WARP_SIZE) {
-        const uint expert        = i + gl_LocalInvocationID.x;
-        logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[logits_offset + expert] : -INFINITY;
+        const uint expert = i + gl_LocalInvocationID.x;
+        wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
    }

-    float max_val = logits_r[0];
-
-    [[unroll]]
-    for (int i = 1; i < experts_per_thread; i++) {
-        const float val = logits_r[i];
-        max_val         = max(val, max_val);
-    }
-
-    max_val = subgroupMax(max_val);
-
-    float wt[experts_per_thread];
-    float tmp = 0.f;
-
-    [[unroll]]
-    for (int i = 0; i < experts_per_thread; i++) {
-        const float val = logits_r[i];
-        wt[i]           = exp(val - max_val);
-        tmp += wt[i];
-    }
-
-    tmp = subgroupAdd(tmp);
-
-    const float inv_sum = 1.0f / tmp;
-
-    [[unroll]]
-    for (int i = 0; i < experts_per_thread; i++) {
-        wt[i] = wt[i] * inv_sum;
+    if (!late_softmax) {
+        softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
    }

    // at this point, each thread holds a portion of softmax,
@@ -82,6 +104,11 @@ void main() {

    float output_weights[experts_per_thread];

+    [[unroll]]
+    for (int i = 0; i < experts_per_thread; i++) {
+        output_weights[i] = 0.f;
+    }
+
    for (int k = 0; k < n_expert_used; k++) {
        float max_val    = wt[0];
        uint   max_expert = gl_LocalInvocationID.x;
@@ -121,6 +148,7 @@ void main() {

    if (with_norm) {
        wt_sum              = subgroupAdd(wt_sum);
+        wt_sum              = clamp(wt_sum, clamp_min, clamp_max);
        const float inv_sum = 1.0f / wt_sum;

        [[unroll]]
@@ -129,6 +157,10 @@ void main() {
        }
    }

+    if (late_softmax) {
+        softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
+    }
+
    [[unroll]]
    for (uint i = 0; i < experts_per_thread; ++i) {
        uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;