[Bugfix] Fix DeepSeek V3.2 C8 precision via rotary tensors (#7537)

### What this PR does / why we need it?
During attention quantization (C8/INT8) for DeepSeek V3.2, the Hadamard rotation
matrices must be retrieved from the checkpoint weights so they can be applied
during the computation.
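
As background, here is a minimal sketch (not this PR's code; every name in it is illustrative) of why rotating activations by a Hadamard-style orthogonal matrix before INT8 ("C8") quantization improves precision: the rotation spreads outlier values across all dimensions, shrinking the dynamic range a single per-tensor scale must cover.

```python
# Toy demonstration only -- not vllm-ascend's kernel. All names are illustrative.
import torch

def quantize_int8(x: torch.Tensor):
    """Symmetric per-tensor INT8 quantization; returns (int8 values, scale)."""
    scale = x.abs().amax().clamp(min=1e-8) / 127.0
    q = torch.clamp((x / scale).round(), -128, 127).to(torch.int8)
    return q, scale

d = 128
x = torch.randn(16, d)
x[:, 0] *= 50.0  # one outlier channel inflates the per-tensor scale

# Any orthogonal matrix illustrates the effect; the real checkpoints ship a
# Hadamard matrix as a weight tensor.
rot, _ = torch.linalg.qr(torch.randn(d, d))

q_plain, s_plain = quantize_int8(x)
q_rot, s_rot = quantize_int8(x @ rot)

err_plain = (q_plain.float() * s_plain - x).norm().item()
err_rot = ((q_rot.float() * s_rot) @ rot.T - x).norm().item()  # undo the rotation
print(f"reconstruction error: plain={err_plain:.2f} rotated={err_rot:.2f}")
```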

### Does this PR introduce _any_ user-facing change?
No. However, the quantized weights will contain two new tensors
(`indexer.q_rot` and `indexer.k_rot`).
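
Concretely, the two new entries are the indexer's rotation matrices, shaped `(index_head_dim, index_head_dim)` in float32. A sketch of what the quantized checkpoint gains (the value 128 is an assumption; the real dimension is read from `hf_config.index_head_dim`):

```python
import torch

index_head_dim = 128  # assumed for illustration; read from hf_config.index_head_dim
new_checkpoint_entries = {
    "indexer.q_rot": torch.empty(index_head_dim, index_head_dim, dtype=torch.float32),
    "indexer.k_rot": torch.empty(index_head_dim, index_head_dim, dtype=torch.float32),
}
```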

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main: 8b6325758c

---------

Signed-off-by: mayumeng <m30059191@china.huawei.com>
Co-authored-by: mayumeng <m30059191@china.huawei.com>
Author: Yaphets24
Date: 2026-03-25 09:18:00 +08:00 (committed by GitHub)
Commit: 8977be1df3, parent: d96440924a
4 changed files with 64 additions and 10 deletions


```diff
@@ -63,3 +63,27 @@ class AscendFAQuantAttentionMethod:
         repeated_quant_kscale = fa_k_scale.repeat(self.kv_lora_rank)
         layer.quant_kscale = repeated_quant_kscale.view(1, self.kv_lora_rank)
         layer.quant_kscale = 1.0 / torch.nn.Parameter(layer.quant_kscale.to(torch.float), requires_grad=False)
+
+@register_scheme("INT8_DYNAMIC", "attention")
+class AscendSFAQuantAttentionMethod:
+
+    def __init__(self):
+        vllm_config = get_current_vllm_config()
+        config = vllm_config.model_config.hf_config
+        self.index_head_dim = config.index_head_dim
+
+    def create_weights(self, layer: torch.nn.Module) -> None:
+        extra_module_names = ["indexer"]
+        for name in extra_module_names:
+            setattr(layer, name, torch.nn.Module())
+        params_dict = {}
+        params_dict["indexer.q_rot"] = torch.empty((self.index_head_dim, self.index_head_dim), dtype=torch.float32)
+        params_dict["indexer.k_rot"] = torch.empty((self.index_head_dim, self.index_head_dim), dtype=torch.float32)
+        for name, weight in params_dict.items():
+            module_name, weight_name = name.split(".")
+            module = getattr(layer, module_name)
+            weight_param = torch.nn.Parameter(weight, requires_grad=False)
+            module.register_parameter(weight_name, weight_param)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        pass
```
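
For readers unfamiliar with the vLLM weight-loading flow, here is a minimal standalone sketch of how `create_weights` is exercised. The bare `torch.nn.Module` layer, the stand-in class, and the hard-coded dimension are assumptions for illustration; in vllm-ascend the scheme is selected via `register_scheme` and the loader fills the parameters from the quantized checkpoint.

```python
import torch

class SFASchemeSketch:
    """Stripped-down stand-in for AscendSFAQuantAttentionMethod (illustrative)."""

    def __init__(self, index_head_dim: int):
        self.index_head_dim = index_head_dim

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Attach an "indexer" submodule holding the two rotation matrices.
        layer.indexer = torch.nn.Module()
        for wname in ("q_rot", "k_rot"):
            w = torch.empty(self.index_head_dim, self.index_head_dim, dtype=torch.float32)
            layer.indexer.register_parameter(
                wname, torch.nn.Parameter(w, requires_grad=False))

layer = torch.nn.Module()
SFASchemeSketch(index_head_dim=128).create_weights(layer)

# The weight loader would then copy the checkpoint's Hadamard matrices in place:
layer.indexer.q_rot.data.copy_(torch.eye(128))  # placeholder values
layer.indexer.k_rot.data.copy_(torch.eye(128))
print([n for n, _ in layer.named_parameters()])  # ['indexer.q_rot', 'indexer.k_rot']
```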