remove redundant params in mla_preprocess kernel (#3530)
### What this PR does / why we need it? This pull request removes the redundant parameters `gamma1` and `beta1` (also named `gamma0`/`beta0` in some places) from the `mla_preprocess` kernel and its calling hierarchy. The changes are applied consistently across the C++ kernel code, the bindings, and the Python call sites. Because these parameters were unused in the lower-level functions, removing them is a straightforward cleanup. ### Does this PR introduce _any_ user-facing change? Yes: the Python interface of the kernel changes — the `gamma0` and `beta0` parameters are no longer accepted. ### How was this patch tested? The kernel's unit test was adapted accordingly. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: mojave2 <chenchen145@huawei.com>
This commit is contained in:
@@ -19,7 +19,7 @@
|
||||
#include "../op_host/tiling/mla_preprocess_tiling.h"
|
||||
|
||||
extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
GM_ADDR hiddenState, GM_ADDR gamma1, GM_ADDR beta1, GM_ADDR quantScale1, GM_ADDR quantOffset1, GM_ADDR wdqkv,
|
||||
GM_ADDR hiddenState, GM_ADDR quantScale1, GM_ADDR quantOffset1, GM_ADDR wdqkv,
|
||||
GM_ADDR bias1, GM_ADDR gamma2, GM_ADDR beta2, GM_ADDR quantScale2, GM_ADDR quantOffset2, GM_ADDR gamma3,
|
||||
GM_ADDR sin1, GM_ADDR cos1, GM_ADDR sin2, GM_ADDR cos2, GM_ADDR keycache, GM_ADDR slotMapping, GM_ADDR wuq,
|
||||
GM_ADDR bias2, GM_ADDR wuk, GM_ADDR descale1, GM_ADDR descale2, GM_ADDR ctkvScale, GM_ADDR qnopeScale, GM_ADDR q,
|
||||
@@ -143,7 +143,7 @@ extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
case KEY_FP16_CACHEMODE_0_QUANTMODE_0: {
|
||||
MLAPO_FP16::MLAOperation<CACHE_MODE_KVCACHE, DataFormat::NZ, DataFormat::NZ, DataFormat::ND> opFp16Cm0Qm0(
|
||||
mlaTilingData, tiling);
|
||||
opFp16Cm0Qm0.Init(hiddenState, gamma1, beta1, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
opFp16Cm0Qm0.Init(hiddenState, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
quantScale2, quantOffset2, gamma3, sin1, cos1, sin2, cos2, keycache, slotMapping, wuq,
|
||||
bias2, wuk, descale1, descale2, ctkvScale, qnopeScale, q, keycacheOut, q2, keycacheOut2,
|
||||
s1, s2, s3);
|
||||
@@ -158,7 +158,7 @@ extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
case KEY_FP16_CACHEMODE_1_QUANTMODE_0: {
|
||||
MLAPO_FP16::MLAOperation<CACHE_MODE_KROPE_CTKV, DataFormat::NZ, DataFormat::NZ, DataFormat::ND>
|
||||
opFp16Cm1Qm0(mlaTilingData, tiling);
|
||||
opFp16Cm1Qm0.Init(hiddenState, gamma1, beta1, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
opFp16Cm1Qm0.Init(hiddenState, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
quantScale2, quantOffset2, gamma3, sin1, cos1, sin2, cos2, keycache, slotMapping, wuq,
|
||||
bias2, wuk, descale1, descale2, ctkvScale, qnopeScale, q, keycacheOut, q2, keycacheOut2,
|
||||
s1, s2, s3);
|
||||
@@ -174,7 +174,7 @@ extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
MLAPO_BF16::MLAOperation<__bf16, 0, DataFormat::NZ, DataFormat::NZ, DataFormat::ND,
|
||||
QuantMode::PER_TENSOR_ASYMM_QUANT>
|
||||
opBf16Cm0Qm0(mlaTilingData, tiling);
|
||||
opBf16Cm0Qm0.Init(hiddenState, gamma1, beta1, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
opBf16Cm0Qm0.Init(hiddenState, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
quantScale2, quantOffset2, gamma3, sin1, cos1, sin2, cos2, keycache, slotMapping, wuq,
|
||||
bias2, wuk, descale1, descale2, ctkvScale, qnopeScale, q, keycacheOut, q2, keycacheOut2,
|
||||
s1, s2, s3, s4, s5);
|
||||
@@ -190,7 +190,7 @@ extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
MLAPO_BF16::MLAOperation<__bf16, 1, DataFormat::NZ, DataFormat::NZ, DataFormat::ND,
|
||||
QuantMode::PER_TENSOR_ASYMM_QUANT>
|
||||
opBf16Cm1Qm0(mlaTilingData, tiling);
|
||||
opBf16Cm1Qm0.Init(hiddenState, gamma1, beta1, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
opBf16Cm1Qm0.Init(hiddenState, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
quantScale2, quantOffset2, gamma3, sin1, cos1, sin2, cos2, keycache, slotMapping, wuq,
|
||||
bias2, wuk, descale1, descale2, ctkvScale, qnopeScale, q, keycacheOut, q2, keycacheOut2,
|
||||
s1, s2, s3, s4, s5);
|
||||
@@ -206,7 +206,7 @@ extern "C" __global__ __aicore__ void mla_preprocess(
|
||||
MLAPO_BF16::MLAOperation<__bf16, 3, DataFormat::NZ, DataFormat::NZ, DataFormat::ND,
|
||||
QuantMode::PER_TENSOR_ASYMM_QUANT>
|
||||
opBf16Cm3Qm0(mlaTilingData, tiling);
|
||||
opBf16Cm3Qm0.Init(hiddenState, gamma1, beta1, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
opBf16Cm3Qm0.Init(hiddenState, quantScale1, quantOffset1, wdqkv, bias1, gamma2, beta2,
|
||||
quantScale2, quantOffset2, gamma3, sin1, cos1, sin2, cos2, keycache, slotMapping, wuq,
|
||||
bias2, wuk, descale1, descale2, ctkvScale, qnopeScale, q, keycacheOut, q2, keycacheOut2,
|
||||
s1, s2, s3, s4, s5);
|
||||
@@ -230,8 +230,6 @@ namespace vllm_ascend {
|
||||
extern void mla_preprocess_impl(
|
||||
void* stream,
|
||||
void* hidden_state,
|
||||
void* gamma1,
|
||||
void* beta1,
|
||||
void* quant_scale1,
|
||||
void* quant_offset1,
|
||||
void* wdqkv,
|
||||
@@ -264,8 +262,6 @@ extern void mla_preprocess_impl(
|
||||
{
|
||||
mla_preprocess<<<block_dim, nullptr, stream>>>(
|
||||
hidden_state,
|
||||
gamma1,
|
||||
beta1,
|
||||
quant_scale1,
|
||||
quant_offset1,
|
||||
wdqkv,
|
||||
|
||||
@@ -2388,7 +2388,7 @@ public:
|
||||
this->mlaParams = mlaParams_;
|
||||
}
|
||||
|
||||
__aicore__ inline void Init(GM_ADDR hiddenStateGm, GM_ADDR gamma1Gm, GM_ADDR beta1Gm, GM_ADDR quantScale1Gm,
|
||||
__aicore__ inline void Init(GM_ADDR hiddenStateGm, GM_ADDR quantScale1Gm,
|
||||
GM_ADDR quantOffset1Gm, GM_ADDR wdqkvGm, GM_ADDR bias1Gm, GM_ADDR gamma2Gm,
|
||||
GM_ADDR beta2Gm, GM_ADDR quantScale2Gm, GM_ADDR quantOffset2Gm, GM_ADDR gamma3Gm,
|
||||
GM_ADDR sin1Gm, GM_ADDR cos1Gm, GM_ADDR sin2Gm, GM_ADDR cos2Gm, GM_ADDR keycacheGm,
|
||||
@@ -2426,7 +2426,6 @@ public:
|
||||
#endif
|
||||
|
||||
hiddenStateGmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(hiddenStateGm));
|
||||
gamma1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(gamma1Gm));
|
||||
quantScale1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(quantScale1Gm));
|
||||
quantOffset1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int8_t *>(quantOffset1Gm));
|
||||
wdqkvGmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int8_t *>(wdqkvGm));
|
||||
@@ -2444,7 +2443,6 @@ public:
|
||||
qGmTensor2.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(qGm2));
|
||||
bias1gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(bias1Gm));
|
||||
bias2gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(bias2Gm));
|
||||
beta1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(beta1Gm));
|
||||
beta2GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(beta2Gm));
|
||||
|
||||
#ifdef __DAV_C220_VEC__
|
||||
@@ -2711,7 +2709,6 @@ private:
|
||||
|
||||
AscendC::GlobalTensor<InDtype> hiddenStateGmTensor;
|
||||
|
||||
AscendC::GlobalTensor<InDtype> gamma1GmTensor;
|
||||
AscendC::GlobalTensor<InDtype> quantScale1GmTensor;
|
||||
AscendC::GlobalTensor<int8_t> quantOffset1GmTensor;
|
||||
|
||||
@@ -2741,7 +2738,6 @@ private:
|
||||
AscendC::GlobalTensor<float> s5GmTensor;
|
||||
AscendC::GlobalTensor<float> descale1gmTensor;
|
||||
AscendC::GlobalTensor<float> descale2gmTensor;
|
||||
AscendC::GlobalTensor<InDtype> beta1GmTensor;
|
||||
AscendC::GlobalTensor<InDtype> beta2GmTensor;
|
||||
|
||||
AscendC::GlobalTensor<int32_t> bias1gmTensor;
|
||||
|
||||
@@ -294,8 +294,7 @@ class Quant
|
||||
public:
|
||||
__aicore__ inline Quant() {}
|
||||
|
||||
__aicore__ inline void Init(AscendC::GlobalTensor<T> gammaGmTensor, AscendC::GlobalTensor<T> betaGmTensor,
|
||||
AscendC::GlobalTensor<T> quantScaleGmTensor,
|
||||
__aicore__ inline void Init(AscendC::GlobalTensor<T> quantScaleGmTensor,
|
||||
AscendC::GlobalTensor<int8_t> quantOffsetGmTensor,
|
||||
AscendC::GlobalTensor<T> inputGmTensor, AscendC::GlobalTensor<int8_t> outputGmTensor,
|
||||
uint32_t stride, uint32_t num_col, float avg_factor, uint64_t gm_offset,
|
||||
@@ -2037,7 +2036,7 @@ public:
|
||||
this->mlaParams = mlaParams_;
|
||||
}
|
||||
|
||||
__aicore__ inline void Init(GM_ADDR hiddenStateGm, GM_ADDR gamma1Gm, GM_ADDR beta1Gm, GM_ADDR quantScale1Gm,
|
||||
__aicore__ inline void Init(GM_ADDR hiddenStateGm, GM_ADDR quantScale1Gm,
|
||||
GM_ADDR quantOffset1Gm, GM_ADDR wdqkvGm, GM_ADDR bias1Gm, GM_ADDR gamma2Gm,
|
||||
GM_ADDR beta2Gm, GM_ADDR quantScale2Gm, GM_ADDR quantOffset2Gm, GM_ADDR gamma3Gm,
|
||||
GM_ADDR sin1Gm, GM_ADDR cos1Gm, GM_ADDR sin2Gm, GM_ADDR cos2Gm, GM_ADDR keycacheGm,
|
||||
@@ -2057,7 +2056,6 @@ public:
|
||||
mm_w8a8_1.PreloadDoubleWeight();
|
||||
#endif
|
||||
hiddenStateGmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(hiddenStateGm));
|
||||
gamma1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(gamma1Gm));
|
||||
quantScale1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(quantScale1Gm));
|
||||
quantOffset1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int8_t *>(quantOffset1Gm));
|
||||
|
||||
@@ -2081,7 +2079,6 @@ public:
|
||||
qGmTensor2.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(qGm2));
|
||||
bias2gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(bias2Gm));
|
||||
|
||||
beta1GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(beta1Gm));
|
||||
beta2GmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(beta2Gm));
|
||||
#ifdef __DAV_C220_CUBE__
|
||||
mm_w8a8_2.Init(s1GmTensor, wuqGmTensor, bias2gmTensor, descale2gmTensor, s2GmTensor, mlaParams, 1);
|
||||
@@ -2105,7 +2102,7 @@ public:
|
||||
row_work_ = 0;
|
||||
}
|
||||
this->splitN = mlaParams.perTaskNum;
|
||||
Quant1.Init(gamma1GmTensor, beta1GmTensor, quantScale1GmTensor, quantOffset1GmTensor, hiddenStateGmTensor,
|
||||
Quant1.Init(quantScale1GmTensor, quantOffset1GmTensor, hiddenStateGmTensor,
|
||||
s1GmTensor, 0, num_col_1, 0.0001395089285,
|
||||
vectorBlockIdx * static_cast<uint64_t>(row_work) * num_col_1,
|
||||
vectorBlockIdx * static_cast<uint64_t>(row_work) * num_col_1, row_work_, mlaParams);
|
||||
@@ -2316,7 +2313,6 @@ private:
|
||||
|
||||
AscendC::GlobalTensor<half> hiddenStateGmTensor;
|
||||
|
||||
AscendC::GlobalTensor<half> gamma1GmTensor;
|
||||
AscendC::GlobalTensor<half> quantScale1GmTensor;
|
||||
AscendC::GlobalTensor<int8_t> quantOffset1GmTensor;
|
||||
|
||||
@@ -2343,7 +2339,6 @@ private:
|
||||
AscendC::GlobalTensor<half> s3GmTensor;
|
||||
AscendC::GlobalTensor<uint64_t> descale1gmTensor;
|
||||
AscendC::GlobalTensor<uint64_t> descale2gmTensor;
|
||||
AscendC::GlobalTensor<half> beta1GmTensor;
|
||||
AscendC::GlobalTensor<half> beta2GmTensor;
|
||||
|
||||
AscendC::GlobalTensor<int32_t> bias1gmTensor;
|
||||
|
||||
Reference in New Issue
Block a user