[feat] mlapo add bf16 no_quant support (#4852)

### What this PR does / why we need it?
This PR adds mlapo (`mla_preprocess`) operator support for the bf16 no-quant mode.

### Does this PR introduce _any_ user-facing change?
Yes. The quantization-related parameters of the `mla_preprocess` op are now optional.

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: chenjunyi <isjunyi.chen@gmail.com>
chenjunyi
2025-12-11 11:06:56 +08:00
committed by GitHub
parent c95c271538
commit c12eb22cbe
12 changed files with 1510 additions and 81 deletions


@@ -84,11 +84,11 @@ at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_
 std::tuple<at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &> mla_preprocess(
     const at::Tensor &hiddenState,
     const at::Tensor &wdqkv,
-    const at::Tensor &descale0,
+    const c10::optional<at::Tensor> &descale0,
     const at::Tensor &gamma1,
-    const at::Tensor &beta1,
+    const c10::optional<at::Tensor> &beta1,
     const at::Tensor &wuq,
-    const at::Tensor &descale1,
+    const c10::optional<at::Tensor> &descale1,
     const at::Tensor &gamma2,
     const at::Tensor &cos,
     const at::Tensor &sin,
@@ -96,12 +96,12 @@ std::tuple<at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &>
     const at::Tensor &kv_cache,
     const at::Tensor &kv_cache_rope,
     const at::Tensor &slotmapping,
-    const at::Tensor &quant_scale0,
-    const at::Tensor &quant_offset0,
-    const at::Tensor &bias0,
-    const at::Tensor &quant_scale1,
-    const at::Tensor &quant_offset1,
-    const at::Tensor &bias1,
+    const c10::optional<at::Tensor> &quant_scale0,
+    const c10::optional<at::Tensor> &quant_offset0,
+    const c10::optional<at::Tensor> &bias0,
+    const c10::optional<at::Tensor> &quant_scale1,
+    const c10::optional<at::Tensor> &quant_offset1,
+    const c10::optional<at::Tensor> &bias1,
     const c10::optional<at::Tensor> &ctkv_scale,
     const c10::optional<at::Tensor> &q_nope_scale,
     c10::optional<c10::string_view> cache_mode,