enginex-ascend-910-llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp

#version 450

#include "generic_binary_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable

#define BLOCK_SIZE 128

layout (constant_id = 1) const bool do_multiply = false;

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 3, std430) readonly buffer PartialsBuf {float partial_sums[];};

shared FLOAT_TYPE sumsh[BLOCK_SIZE];

void main() {
    const uint ncols     = p.ne00;
    const uint nrows     = gl_NumWorkGroups.x;
    const uint nchannels = gl_NumWorkGroups.y;

    const uint row       = 0;
    const uint channel   = gl_WorkGroupID.y;
    const uint samp      = gl_WorkGroupID.z;
    // The work is split across multiple workgroups in the x dimension. Each invocation
    // processes one element
    const uint tid       = gl_GlobalInvocationID.x;

    const uint stride_row       = p.nb01;
    const uint stride_channel   = p.nb02;
    const uint stride_sample    = p.nb03;

    uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
    uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();
    uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();

    FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp

    uint32_t num_partials = p.param3;
    for (uint32_t i = gl_SubgroupInvocationID; i < num_partials; i += gl_SubgroupSize) {
        sum += partial_sums[i];
    }
    sum = subgroupAdd(sum);

    uint col = tid;
    if (col >= ncols) {
        return;
    }

    const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols);
    const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));

    if (do_multiply) {
        if (ncols > p.ne10) {
            data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)]));
        } else {
            data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));
        }
    } else {
        data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
    }
}
vulkan: optimize rms_norm, and allow the work to spread across multiple SMs (#15281) * vulkan: optimize rms_norm, and allow the work to spread across multiple SMs There are really two parts to this change: (1) Some optimizations similar to what we have in soft_max, to unroll with different numbers of iterations. (2) A fusion optimization where we detect add followed by rms_norm, and make the add shader atomically accumulate the values^2 into memory. Then the rms_norm shader can just load that sum. This allows the rms_norm to be parallelized across multiple workgroups, it just becomes a simple per-element multiply. The fusion optimization is currently only applied when the rms_norm is on a single vector. This previously always ran on a single SM. It could apply more broadly, but when there are other dimensions the work can already spread across SMs, and there would be some complexity to tracking multiple atomic sums. * Change add+rms_norm optimization to write out an array of partial sums rather than using atomic add, to make it deterministic. The rms_norm shader fetches a subgroup's worth in parallel and uses subgroupAdd to add them up. * complete rebase against fused adds - multi_add shader can also compute partial sums * fix validation errors * disable add_rms_fusion for Intel due to possible driver bug * resolve against #15489, sync after clearing partial sums 2025-08-23 13:16:17 -05:00			`#version 450`

			`#include "generic_binary_head.comp"`
			`#include "types.comp"`

			`#extension GL_EXT_control_flow_attributes : enable`
			`#extension GL_KHR_shader_subgroup_arithmetic : enable`
			`#extension GL_KHR_shader_subgroup_basic : enable`

			`#define BLOCK_SIZE 128`

			`layout (constant_id = 1) const bool do_multiply = false;`

			`layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;`

			`layout (binding = 3, std430) readonly buffer PartialsBuf {float partial_sums[];};`

			`shared FLOAT_TYPE sumsh[BLOCK_SIZE];`

			`void main() {`
			`const uint ncols = p.ne00;`
			`const uint nrows = gl_NumWorkGroups.x;`
			`const uint nchannels = gl_NumWorkGroups.y;`

			`const uint row = 0;`
			`const uint channel = gl_WorkGroupID.y;`
			`const uint samp = gl_WorkGroupID.z;`
			`// The work is split across multiple workgroups in the x dimension. Each invocation`
			`// processes one element`
			`const uint tid = gl_GlobalInvocationID.x;`

			`const uint stride_row = p.nb01;`
			`const uint stride_channel = p.nb02;`
			`const uint stride_sample = p.nb03;`

			`uint32_t a_offset = sampstride_sample + channelstride_channel + row*stride_row + get_aoffset();`
			`uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();`
			`uint32_t d_offset = ((sampnchannels + channel)nrows + row)*ncols + get_doffset();`

			`FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp`

			`uint32_t num_partials = p.param3;`
			`for (uint32_t i = gl_SubgroupInvocationID; i < num_partials; i += gl_SubgroupSize) {`
			`sum += partial_sums[i];`
			`}`
			`sum = subgroupAdd(sum);`

			`uint col = tid;`
			`if (col >= ncols) {`
			`return;`
			`}`

			`const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols);`
			`const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));`

			`if (do_multiply) {`
			`if (ncols > p.ne10) {`
			`data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)]));`
			`} else {`
			`data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));`
			`}`
			`} else {`
			`data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));`
			`}`
			`}`