[feat] mlapo add bf16 no_quant support (#4852)
### What this PR does / why we need it?
This PR adds support for the mlapo (MLA preprocess) operation in bf16 no_quant mode.
### Does this PR introduce _any_ user-facing change?
Yes. The quant-related parameters of the mlapo operation are now optional, so callers running in bf16 no_quant mode can omit them.
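For reference, a minimal sketch of the new call shape (the entry-point name `mla_preprocess` is an assumption; the argument names are taken from the test below, not from a documented public API): in bf16 no_quant mode the quant-related arguments such as `q_nope_scale` and `quant_mode` are simply left out.

```python
# Minimal sketch, assuming an entry point like `mla_preprocess`; names are
# drawn from test_mla_preprocess_kernel below and are not the exact API.
# The quant-related arguments (q_nope_scale, quant_mode, ...) are omitted
# because no quantization is applied in bf16 no_quant mode.
mla_preprocess(
    hidden_states,                 # bf16 input, no quant scales required
    cache_mode="krope_ctkv",
    q_out0=q_nope_out,             # preallocated bf16 output buffers
    kv_cache_out0=kv_cache,
    q_out1=q_rope_out,
    kv_cache_out1=kv_cache_rope,
)
```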
### How was this patch tested?
CI passed with the newly added and existing tests.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: chenjunyi <isjunyi.chen@gmail.com>
```diff
@@ -67,6 +67,11 @@ def test_mla_preprocess_kernel():
         dtype=hidden_states.dtype,
         device=hidden_states.device,
     )
+    q_down = torch.empty(
+        (hidden_states.shape[0], 1536),
+        dtype=hidden_states.dtype,
+        device=hidden_states.device,
+    )
     q_nope_old = q_nope_out.clone()
     q_rope_old = q_rope_out.clone()
 
```
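The new `q_down` buffer allocated here is handed to the kernel as `inner_out` in the next hunk; its width of 1536 presumably matches the q down-projection (q_lora_rank) dimension of the model configuration the test uses.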
```diff
@@ -95,10 +100,12 @@ def test_mla_preprocess_kernel():
         q_nope_scale=qnope_scale,
         cache_mode="krope_ctkv",
         quant_mode="per_tensor_quant_asymm",
+        enable_inner_out=False,
         q_out0=q_nope_out,
         kv_cache_out0=kv_cache,
         q_out1=q_rope_out,
         kv_cache_out1=kv_cache_rope,
+        inner_out=q_down,
    )
    assert not torch.equal(q_nope_out, q_nope_old)
    assert not torch.equal(q_rope_out, q_rope_old)
```
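Cloning `q_nope_out` and `q_rope_out` before the kernel call and asserting inequality afterwards checks that the kernel actually writes into the output buffers on this code path; it is a liveness check rather than a numerical-accuracy check.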