From bdc721d35a25d0ec9174c3c52881b1656c4a3367 Mon Sep 17 00:00:00 2001 From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com> Date: Wed, 31 Dec 2025 09:19:04 +0800 Subject: [PATCH] [smoke][bugfix] moe_init_routing_v2 active_expert_range use int type (#5521) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? The float kernel of MOE_init_routing_v2 in the dispatch allgather operation does not support tensor format for active_expert_range; it only supports int. In PR5311, to unify the variables `local_num_experts` and `self.local_num_experts`, `self.local_num_experts` was used consistently, which caused the subsequent integer-typed parameter to be passed as a tensor. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? gsm8k | exact_match,strict-match: ground_truth=0.89 | measured=0.8939 | success=✅ gsm8k | exact_match,flexible-extract: ground_truth=0.85 | measured=0.856 | success=✅ ceval-valid | acc,none: ground_truth=0.84 | measured=0.8373 | success=✅ Model Parameters: {'pretrained': 'Qwen/Qwen3-30B-A3B', 'tensor_parallel_size': 2, 'dtype': 'auto', 'trust_remote_code': False, 'max_model_len': 4096, 'gpu_memory_utilization': 0.6, 'enable_expert_parallel': True} - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 Signed-off-by: shenchuxiaofugui <1311027364@qq.com> --- vllm_ascend/eplb/core/eplb_utils.py | 1 - vllm_ascend/ops/fused_moe/fused_moe.py | 2 +- vllm_ascend/ops/fused_moe/token_dispatcher.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/core/eplb_utils.py b/vllm_ascend/eplb/core/eplb_utils.py index 4920de30..aa681249 100644 --- a/vllm_ascend/eplb/core/eplb_utils.py +++ b/vllm_ascend/eplb/core/eplb_utils.py @@ -104,7 +104,6 @@ def generate_log2phy_map(global_expert_map, ep_rank): for rankid, map_per_rank in 
enumerate(global_expert_map): for idx, val in enumerate(map_per_rank): val = val.item() - # 计算value:当前值 + i * 有效元素个数 if val != -1: log2phy_map[idx].append(val + rankid * valid_count) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index daaca8b9..23f327d3 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -180,7 +180,7 @@ class AscendFusedMoE(FusedMoE): or ascend_config.expert_map_record_path) and ( self.log2phy is not None) self.local_num_experts = (torch.sum( - self._expert_map != -1) if self._expert_map is not None else + self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts) if self._expert_map is not None: logger.info_once( diff --git a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py index aeb751d0..e17b033e 100644 --- a/vllm_ascend/ops/fused_moe/token_dispatcher.py +++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py @@ -335,7 +335,9 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher): super().__init__(**kwargs) self.apply_router_weight_on_input = False self.max_num_tokens = kwargs.get("max_num_tokens") - self.num_experts_local = kwargs.get("num_local_experts", 0) + num_experts_local = kwargs.get("num_local_experts", 0) + self.num_experts_local = num_experts_local.item() if torch.is_tensor( + num_experts_local) else int(num_experts_local) self.original_shape = None self.with_quant = False