From b01eeb80f8406cba569af5deb40f394293f9950d Mon Sep 17 00:00:00 2001
From: Shu Wang
Date: Tue, 5 Aug 2025 00:01:14 -0500
Subject: [PATCH] [NVIDIA] Fix local_num_experts for EP (#8779)

---
 python/sglang/srt/layers/moe/fused_moe_triton/layer.py  | 3 ++-
 python/sglang/srt/layers/quantization/modelopt_quant.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
index 74558fd9b..c88aa4d2f 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -200,7 +200,8 @@ class FusedMoE(torch.nn.Module):
         self.quant_config = quant_config
         self.quant_method.create_weights(
             layer=self,
-            num_experts=self.num_local_experts,
+            num_experts=self.num_experts,
+            num_local_experts=self.num_local_experts,
             hidden_size=hidden_size,
             # FIXME: figure out which intermediate_size to use
             intermediate_size=self.intermediate_size_per_partition,
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index fca0ee38b..64df434ae 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -752,6 +752,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         self,
         layer: torch.nn.Module,
         num_experts: int,
+        num_local_experts: int,
         hidden_size: int,
         intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
@@ -765,7 +766,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
 
         # TODO(ch-wan): check if this is needed
         layer.num_experts = num_experts
-        layer.num_local_experts = num_experts
+        layer.num_local_experts = num_local_experts
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.params_dtype = params_dtype
         layer.quant_config = self.quant_config