Add support for Qwen3 MoE+GPTQ

2025-11-15 20:14:45 +08:00
parent b296c44ae0
commit 8152e24cb2
35 changed files with 6468 additions and 574 deletions
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -122,7 +122,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        self.gate = ReplicatedLinear(config.hidden_size,
                                     config.num_experts,
                                     bias=False,
-                                     quant_config=None,
+                                     quant_config=quant_config,
                                     prefix=f"{prefix}.gate")

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -294,7 +294,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Self Attention
        if residual is None:
            residual = hidden_states
@@ -532,4 +532,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP):
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights)