[dev] support compressed-tensors w8a8 quantization (#75)

* [dev] support compressed-tensors w8a8 quantization

Co-authored-by: Li Wei <liwei.109@outlook.com>

* [refact] update KunlunScaleMMKernel impl

* [rebase] resolve conflicts and remove redundant code

---------

Co-authored-by: tangshiwen <tangshiwen@baidu.com>
2026-01-06 13:51:53 +08:00
parent ee0f50e68f
commit 515a4eeda9
8 changed files with 952 additions and 523 deletions
--- a/vllm_kunlun/models/qwen3_moe.py
+++ b/vllm_kunlun/models/qwen3_moe.py
@@ -173,10 +173,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
-        kunlun_linear_weights = self.gate.get_weights()
        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits,
-                                           linear_weights=kunlun_linear_weights)
+                                           router_logits=router_logits)

        if self.is_sequence_parallel:
            final_hidden_states = tensor_model_parallel_all_gather(