enginex-hygon-llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c

#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
#    define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#define htp_softmax_preamble3                              \
    const uint32_t ne00 = src0->ne[0];                     \
    const uint32_t ne01 = src0->ne[1];                     \
    const uint32_t ne02 = src0->ne[2];                     \
    const uint32_t ne03 = src0->ne[3];                     \
                                                           \
    const uint32_t nb00 = src0->nb[0];                     \
    const uint32_t nb01 = src0->nb[1];                     \
    const uint32_t nb02 = src0->nb[2];                     \
    const uint32_t nb03 = src0->nb[3];                     \
                                                           \
    const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \
    const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \
    const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \
    const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \
                                                           \
    const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \
    const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \
    const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \
    const uint32_t nb13 = (src1->ne[0]) ? src1->nb[3] : 1; \
                                                           \
    const uint32_t ne0 = dst->ne[0];                       \
    const uint32_t ne1 = dst->ne[1];                       \
    const uint32_t ne2 = dst->ne[2];                       \
    const uint32_t ne3 = dst->ne[3];                       \
                                                           \
    const uint32_t nb0 = dst->nb[0];                       \
    const uint32_t nb1 = dst->nb[1];                       \
    const uint32_t nb2 = dst->nb[2];                       \
    const uint32_t nb3 = dst->nb[3];

struct softmax_th_ctx {
    bool     use_f16;
    bool     use_src1;
    uint32_t n_head;
    uint32_t n_head_log2;

    float scale;
    float max_bias;
    float m0;
    float m1;

    struct htp_ops_context * octx;
};

static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) {
    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;

    memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx));

    memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float));
    memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float));

    softmax_ctx->n_head      = src0->ne[2];
    softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));

    softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2);
    softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2);

    softmax_ctx->use_src1 = (src1->ne[0] != 0);
    softmax_ctx->use_f16  = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);

    softmax_ctx->octx = octx;
}

static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
                                      uint8_t * restrict dst,
                                      const int num_elems,
                                      float     scale,
                                      const uint8_t * restrict mask,
                                      float slope) {
    const uint8_t * restrict src_curr  = src;
    uint8_t * restrict dst_curr        = dst;
    const uint8_t * restrict mask_curr = mask;

    HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
    HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);

    int step_of_1 = num_elems >> 5;

    #pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = *(HVX_Vector *) src_curr;

        HVX_Vector v3 = *(HVX_Vector *) mask_curr;

        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);

        HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v3, slope_vec);

        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, v4);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v5);

        src_curr += VLEN;
        dst_curr += VLEN;
        mask_curr += VLEN;
    }
}

static void hvx_fast_softmax_f32(const uint8_t * restrict src,
                                 uint8_t * restrict dst,
                                 uint8_t * restrict pad,
                                 const int num_elems) {
    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
    HVX_Vector * restrict v_pad       = (HVX_Vector *) pad;
    HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;

    HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);
    HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]);
    HVX_Vector zero_v  = Q6_V_vzero();
    HVX_Vector one_v   = hvx_vec_splat_fp32(1.0);

    int step_of_1 = num_elems >> 5;

    #pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        max_vec       = Q6_Vsf_vmax_VsfVsf(max_vec, v1);
    }

    HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec);
    max_vec      = hvx_vec_repl4(v);

    #pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec);

        HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2));

        sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3);

        v_pad[i] = v3;
    }

    v       = hvx_vec_qf32_reduce_sum(sum_vec);
    sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));

    HVX_VectorPred pos_sum   = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
    HVX_Vector     v4        = hvx_vec_inverse_fp32(sum_vec);
    HVX_Vector     scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v);

    #pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_pad[i];
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);
        v_dst[i]      = Q6_Vsf_equals_Vqf32(v2);
    }
}

static float hvx_softmax_f32(const uint8_t * restrict src,
                             uint8_t * restrict dst,
                             uint8_t * restrict spad,
                             const int   num_elems,
                             const float max) {
    hvx_sub_scalar_f32(src, max, spad, num_elems);

    hvx_exp_f32(spad, dst, num_elems, false);

    float sum = hvx_self_sum_f32(dst, num_elems);

    return sum;
}

static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) {
    struct htp_ops_context * octx = softmax_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    const struct htp_tensor * dst  = &octx->dst;

    htp_softmax_preamble3;

    uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);
    uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);
    uint8_t * dst_spad_data  = octx->dst_spad.data + (ith * nb1);

    float * wp0 = (float *) src0_spad_data;
    float * wp1 = (float *) src1_spad_data;
    float * wp2 = (float *) dst_spad_data;

    for (uint32_t i03 = 0; i03 < ne03; i03++) {
        for (uint32_t i02 = 0; i02 < ne02; i02++) {
            for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {
                const uint32_t i11 = i01;
                const uint32_t i12 = i02 % ne12;
                const uint32_t i13 = i03 % ne13;

                // ALiBi
                const uint32_t h = i02;  // head

                const float slope = (softmax_ctx->max_bias > 0.0f) ?
                                        h < softmax_ctx->n_head_log2 ?
                                        powf(softmax_ctx->m0, h + 1) :
                                        powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :
                                        1.0f;

                float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);
                float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);

                // broadcast the mask across rows
                __fp16 * mp_f16 = (softmax_ctx->use_src1) ?
                                      (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
                                      NULL;
                float *  mp_f32 = (softmax_ctx->use_src1) ?
                                      (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
                                      NULL;

                if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {
                    hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
                                              (const uint8_t *) mp_f32, slope);
                } else {
                    hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale);
                    if (mp_f32) {
                        if (softmax_ctx->use_f16) {
                            for (int i = 0; i < ne00; ++i) {
                                wp0[i] += slope * (float) mp_f16[i];
                            }
                        } else {
                            for (int i = 0; i < ne00; ++i) {
                                wp0[i] += slope * mp_f32[i];
                            }
                        }
                    }
                }

                if (1 == opt_path) {
                    hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
                } else {
                    float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
                    float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
                    sum       = sum > 0.0 ? (1.0 / sum) : 1;
                    hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum);
                }
            }
        }
    }
}

static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {
    struct htp_ops_context * octx = softmax_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor *       dst  = &octx->dst;

    htp_softmax_preamble3;

    const uint32_t src0_nrows            = ne01 * ne02 * ne03;  // src0 rows
    const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
        is_aligned = 0;
        FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n");
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    softmax_htp_f32(nth, ith, softmax_ctx, opt_path);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
         softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
         ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {
    struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;
    softmax_job_f32_per_thread(p_softmax_ctx, n, i);
}

static int execute_op_softmax_f32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor *       dst  = &octx->dst;

    worker_callback_t op_func;
    const char *      op_type = NULL;

    struct softmax_th_ctx softmax_ctx;

    switch (octx->op) {
        case HTP_OP_SOFTMAX:
            op_func = softmax_job_dispatcher_f32;
            op_type = "softmax-f32";

            init_softmax_ctx(&softmax_ctx, octx);
            break;

        default:
            FARF(ERROR, "Unsupported Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const uint32_t n_threads = octx->n_threads;

    const size_t src0_row_size = src0->nb[1];
    const size_t src1_row_size = src0_row_size;
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    // N rows per thread, padded to HVX vector size
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

    if (src1->ne[0]) {
        FARF(HIGH,
             "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
             octx->dst_spad.size);
    } else {
        FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
             src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
             octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
    }

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
             spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;

    uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs             = MIN(n_threads, src0_nrows);
        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
        worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);
    }

    return err;
}

int op_softmax(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_softmax_f32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#pragma clang diagnostic ignored "-Wunused-variable"`
			`#pragma clang diagnostic ignored "-Wunused-function"`
			`#pragma clang diagnostic ignored "-Wunused-but-set-variable"`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#ifdef HTP_DEBUG`
			`# define FARF_HIGH 1`
			`#endif`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#include <HAP_farf.h>`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#include <HAP_mem.h>`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#include <HAP_perf.h>`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#include <HAP_ps.h>`
			`#include <hexagon_protos.h>`
			`#include <hexagon_types.h>`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#include <math.h>`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#include <qurt_thread.h>`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#include <string.h>`

			`#define GGML_COMMON_DECL_C`
			`#include "ggml-common.h"`
			`#include "htp-ctx.h"`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#include "htp-dma.h"`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`#include "htp-msg.h"`
			`#include "htp-ops.h"`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`#include "hvx-utils.h"`
			`#include "ops-utils.h"`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`#define htp_softmax_preamble3 \`
			`const uint32_t ne00 = src0->ne[0]; \`
			`const uint32_t ne01 = src0->ne[1]; \`
			`const uint32_t ne02 = src0->ne[2]; \`
			`const uint32_t ne03 = src0->ne[3]; \`
			`\`
			`const uint32_t nb00 = src0->nb[0]; \`
			`const uint32_t nb01 = src0->nb[1]; \`
			`const uint32_t nb02 = src0->nb[2]; \`
			`const uint32_t nb03 = src0->nb[3]; \`
			`\`
			`const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \`
			`const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \`
			`const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \`
			`const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \`
			`\`
			`const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \`
			`const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \`
			`const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \`
			`const uint32_t nb13 = (src1->ne[0]) ? src1->nb[3] : 1; \`
			`\`
			`const uint32_t ne0 = dst->ne[0]; \`
			`const uint32_t ne1 = dst->ne[1]; \`
			`const uint32_t ne2 = dst->ne[2]; \`
			`const uint32_t ne3 = dst->ne[3]; \`
			`\`
			`const uint32_t nb0 = dst->nb[0]; \`
			`const uint32_t nb1 = dst->nb[1]; \`
			`const uint32_t nb2 = dst->nb[2]; \`
			`const uint32_t nb3 = dst->nb[3];`

			`struct softmax_th_ctx {`
			`bool use_f16;`
			`bool use_src1;`
			`uint32_t n_head;`
			`uint32_t n_head_log2;`

			`float scale;`
			`float max_bias;`
			`float m0;`
			`float m1;`

			`struct htp_ops_context * octx;`
			`};`

			`static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) {`
			`const struct htp_tensor * src0 = &octx->src0;`
			`const struct htp_tensor * src1 = &octx->src1;`

			`memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx));`

			`memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float));`
			`memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float));`

			`softmax_ctx->n_head = src0->ne[2];`
			`softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));`

			`softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2);`
			`softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2);`

			`softmax_ctx->use_src1 = (src1->ne[0] != 0);`
			`softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);`

			`softmax_ctx->octx = octx;`
			`}`

			`static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,`
			`uint8_t * restrict dst,`
			`const int num_elems,`
			`float scale,`
			`const uint8_t * restrict mask,`
			`float slope) {`
			`const uint8_t * restrict src_curr = src;`
			`uint8_t * restrict dst_curr = dst;`
			`const uint8_t * restrict mask_curr = mask;`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);`
			`HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`int step_of_1 = num_elems >> 5;`

			`#pragma unroll(4)`
			`for (int i = 0; i < step_of_1; i++) {`
			`HVX_Vector v1 = (HVX_Vector ) src_curr;`

			`HVX_Vector v3 = (HVX_Vector ) mask_curr;`

			`HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);`

			`HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v3, slope_vec);`

			`HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, v4);`

			`(HVX_Vector ) dst_curr = Q6_Vsf_equals_Vqf32(v5);`

			`src_curr += VLEN;`
			`dst_curr += VLEN;`
			`mask_curr += VLEN;`
			`}`
			`}`

			`static void hvx_fast_softmax_f32(const uint8_t * restrict src,`
			`uint8_t * restrict dst,`
			`uint8_t * restrict pad,`
			`const int num_elems) {`
			`const HVX_Vector * restrict v_src = (HVX_Vector *) src;`
			`HVX_Vector * restrict v_pad = (HVX_Vector *) pad;`
			`HVX_Vector * restrict v_dst = (HVX_Vector *) dst;`

			`HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`HVX_Vector zero_v = Q6_V_vzero();`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector one_v = hvx_vec_splat_fp32(1.0);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`int step_of_1 = num_elems >> 5;`

			`#pragma unroll(4)`
			`for (int i = 0; i < step_of_1; i++) {`
			`HVX_Vector v1 = v_src[i];`
			`max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1);`
			`}`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`max_vec = hvx_vec_repl4(v);`

			`#pragma unroll(4)`
			`for (int i = 0; i < step_of_1; i++) {`
			`HVX_Vector v1 = v_src[i];`
			`HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec);`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2));`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3);`

			`v_pad[i] = v3;`
			`}`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`v = hvx_vec_qf32_reduce_sum(sum_vec);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));`

			`HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`HVX_Vector v4 = hvx_vec_inverse_fp32(sum_vec);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v);`

			`#pragma unroll(4)`
			`for (int i = 0; i < step_of_1; i++) {`
			`HVX_Vector v1 = v_pad[i];`
			`HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);`
			`v_dst[i] = Q6_Vsf_equals_Vqf32(v2);`
			`}`
			`}`

			`static float hvx_softmax_f32(const uint8_t * restrict src,`
			`uint8_t * restrict dst,`
			`uint8_t * restrict spad,`
			`const int num_elems,`
			`const float max) {`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`hvx_sub_scalar_f32(src, max, spad, num_elems);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`hvx_exp_f32(spad, dst, num_elems, false);`

退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`float sum = hvx_self_sum_f32(dst, num_elems);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`return sum;`
			`}`

			`static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) {`
			`struct htp_ops_context * octx = softmax_ctx->octx;`

			`const struct htp_tensor * src0 = &octx->src0;`
			`const struct htp_tensor * src1 = &octx->src1;`
			`const struct htp_tensor * dst = &octx->dst;`

			`htp_softmax_preamble3;`

			`uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);`
			`uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);`
			`uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1);`

			`float * wp0 = (float *) src0_spad_data;`
			`float * wp1 = (float *) src1_spad_data;`
			`float * wp2 = (float *) dst_spad_data;`

			`for (uint32_t i03 = 0; i03 < ne03; i03++) {`
			`for (uint32_t i02 = 0; i02 < ne02; i02++) {`
			`for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {`
			`const uint32_t i11 = i01;`
			`const uint32_t i12 = i02 % ne12;`
			`const uint32_t i13 = i03 % ne13;`

			`// ALiBi`
			`const uint32_t h = i02; // head`

			`const float slope = (softmax_ctx->max_bias > 0.0f) ?`
			`h < softmax_ctx->n_head_log2 ?`
			`powf(softmax_ctx->m0, h + 1) :`
			`powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :`
			`1.0f;`

			`float * sp = (float ) ((char ) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);`
			`float * dp = (float ) ((char ) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);`

			`// broadcast the mask across rows`
			`__fp16 * mp_f16 = (softmax_ctx->use_src1) ?`
			`(__fp16 ) ((char ) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :`
			`NULL;`
			`float * mp_f32 = (softmax_ctx->use_src1) ?`
			`(float ) ((char ) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :`
			`NULL;`

			`if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {`
			`hvx_fast_softmax_prep_f32((const uint8_t ) sp, (uint8_t ) wp0, ne00, softmax_ctx->scale,`
			`(const uint8_t *) mp_f32, slope);`
			`} else {`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`hvx_scale_f32((const uint8_t ) sp, (uint8_t ) wp0, ne00, softmax_ctx->scale);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`if (mp_f32) {`
			`if (softmax_ctx->use_f16) {`
			`for (int i = 0; i < ne00; ++i) {`
			`wp0[i] += slope * (float) mp_f16[i];`
			`}`
			`} else {`
			`for (int i = 0; i < ne00; ++i) {`
			`wp0[i] += slope * mp_f32[i];`
			`}`
			`}`
			`}`
			`}`

			`if (1 == opt_path) {`
			`hvx_fast_softmax_f32((const uint8_t ) wp0, (uint8_t ) dp, (uint8_t *) wp1, ne00);`
			`} else {`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`float sum = hvx_softmax_f32((const uint8_t ) wp0, (uint8_t ) wp2, (uint8_t *) wp1, ne00, max);`
			`sum = sum > 0.0 ? (1.0 / sum) : 1;`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`hvx_scale_f32((const uint8_t ) wp2, (uint8_t ) dp, ne00, sum);`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`}`
			`}`
			`}`
			`}`
			`}`

			`static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {`
			`struct htp_ops_context * octx = softmax_ctx->octx;`

			`const struct htp_tensor * src0 = &octx->src0;`
			`const struct htp_tensor * src1 = &octx->src1;`
			`struct htp_tensor * dst = &octx->dst;`

			`htp_softmax_preamble3;`

			`const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows`
			`const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;`

			`const uint32_t src0_start_row = src0_nrows_per_thread * ith;`
			`const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);`

			`// no work for this thread`
			`if (src0_start_row >= src0_end_row) {`
			`return;`
			`}`

			`uint64_t t1, t2;`
			`t1 = HAP_perf_get_qtimer_count();`

			`int is_aligned = 1;`
			`int opt_path = 0;`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`if (!htp_is_aligned((void ) src0->data, VLEN) \|\| !htp_is_aligned((void ) dst->data, VLEN)) {`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00			`is_aligned = 0;`
			`FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n");`
			`}`
			`if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {`
			`opt_path = 1;`
			`}`

			`softmax_htp_f32(nth, ith, softmax_ctx, opt_path);`

			`t2 = HAP_perf_get_qtimer_count();`

			`FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,`
			`softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,`
			`ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));`
			`}`

			`static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {`
			`struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;`
			`softmax_job_f32_per_thread(p_softmax_ctx, n, i);`
			`}`

			`static int execute_op_softmax_f32(struct htp_ops_context * octx) {`
			`int err = HTP_STATUS_OK;`

			`const struct htp_tensor * src0 = &octx->src0;`
			`const struct htp_tensor * src1 = &octx->src1;`
			`struct htp_tensor * dst = &octx->dst;`

			`worker_callback_t op_func;`
			`const char * op_type = NULL;`

			`struct softmax_th_ctx softmax_ctx;`

			`switch (octx->op) {`
			`case HTP_OP_SOFTMAX:`
			`op_func = softmax_job_dispatcher_f32;`
			`op_type = "softmax-f32";`

			`init_softmax_ctx(&softmax_ctx, octx);`
			`break;`

			`default:`
			`FARF(ERROR, "Unsupported Op %u\n", octx->op);`
			`return HTP_STATUS_NO_SUPPORT;`
			`}`

			`const uint32_t n_threads = octx->n_threads;`

			`const size_t src0_row_size = src0->nb[1];`
			`const size_t src1_row_size = src0_row_size;`
			`const size_t dst_row_size = dst->nb[1];`

			`// VTCM scratchpads for all tensors`
			`// N rows per thread, padded to HVX vector size`
退回到 b7516 版本 2026-01-16 18:12:13 +08:00			`octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;`
			`octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;`
			`octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;`
Sync with upstream `ggml-org/llama.cpp` tag `b7751` 2026-01-16 14:21:48 +08:00
			`size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;`

			`if (src1->ne[0]) {`
			`FARF(HIGH,`
			`"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",`
			`op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],`
			`src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,`
			`octx->dst_spad.size);`
			`} else {`
			`FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,`
			`src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],`
			`octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);`
			`}`

			`// Make sure the reserved vtcm size is sufficient`
			`if (octx->ctx->vtcm_size < spad_size) {`
			`FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,`
			`spad_size);`
			`return HTP_STATUS_VTCM_TOO_SMALL;`
			`}`

			`octx->src0_spad.data = octx->ctx->vtcm_base;`
			`octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;`
			`octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;`

			`uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];`

			`if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {`
			`uint32_t n_jobs = MIN(n_threads, src0_nrows);`
			`octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;`
			`worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);`
			`}`

			`return err;`
			`}`

			`int op_softmax(struct htp_ops_context * octx) {`
			`int err = HTP_STATUS_OK;`

			`switch (octx->src0.type) {`
			`case HTP_TYPE_F32:`
			`err = execute_op_softmax_f32(octx);`
			`break;`

			`default:`
			`err = HTP_STATUS_NO_SUPPORT;`
			`break;`
			`}`

			`return err;`
			`}`