From b01eeb80f8406cba569af5deb40f394293f9950d Mon Sep 17 00:00:00 2001
From: Shu Wang
Date: Tue, 5 Aug 2025 00:01:14 -0500
Subject: [PATCH] [NVIDIA] Fix local_num_experts for EP (#8779)

---
 python/sglang/srt/layers/moe/fused_moe_triton/layer.py  | 3 ++-
 python/sglang/srt/layers/quantization/modelopt_quant.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
index 74558fd9b..c88aa4d2f 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -200,7 +200,8 @@ class FusedMoE(torch.nn.Module):
         self.quant_config = quant_config
         self.quant_method.create_weights(
             layer=self,
-            num_experts=self.num_local_experts,
+            num_experts=self.num_experts,
+            num_local_experts=self.num_local_experts,
             hidden_size=hidden_size,
             # FIXME: figure out which intermediate_size to use
             intermediate_size=self.intermediate_size_per_partition,
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index fca0ee38b..64df434ae 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -752,6 +752,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         self,
         layer: torch.nn.Module,
         num_experts: int,
+        num_local_experts: int,
         hidden_size: int,
         intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
@@ -765,7 +766,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
 
         # TODO(ch-wan): check if this is needed
         layer.num_experts = num_experts
-        layer.num_local_experts = num_experts
+        layer.num_local_experts = num_local_experts
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.params_dtype = params_dtype
         layer.quant_config = self.quant_config