From bdc721d35a25d0ec9174c3c52881b1656c4a3367 Mon Sep 17 00:00:00 2001 From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com> Date: Wed, 31 Dec 2025 09:19:04 +0800 Subject: [PATCH] [smoke][bugfix] moe_init_routing_v2 active_expert_range use int type (#5521) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? The float kernel of MOE_init_routing_v2 in the dispatch allgather operation does not support tensor format for active_expert_range; it only supports int. In PR5311, to unify the variables `local_num_experts` and `self.local_num_experts`, `self.local_num_experts` was used consistently, which caused the subsequent integer-typed parameter to be passed as a tensor. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? gsm8k | exact_match,strict-match: ground_truth=0.89 | measured=0.8939 | success=✅ gsm8k | exact_match,flexible-extract: ground_truth=0.85 | measured=0.856 | success=✅ ceval-valid | acc,none: ground_truth=0.84 | measured=0.8373 | success=✅ Model Parameters: {'pretrained': 'Qwen/Qwen3-30B-A3B', 'tensor_parallel_size': 2, 'dtype': 'auto', 'trust_remote_code': False, 'max_model_len': 4096, 'gpu_memory_utilization': 0.6, 'enable_expert_parallel': True} - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 Signed-off-by: shenchuxiaofugui <1311027364@qq.com> --- vllm_ascend/eplb/core/eplb_utils.py | 1 - vllm_ascend/ops/fused_moe/fused_moe.py | 2 +- vllm_ascend/ops/fused_moe/token_dispatcher.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/core/eplb_utils.py b/vllm_ascend/eplb/core/eplb_utils.py index 4920de30..aa681249 100644 --- a/vllm_ascend/eplb/core/eplb_utils.py +++ b/vllm_ascend/eplb/core/eplb_utils.py @@ -104,7 +104,6 @@ def generate_log2phy_map(global_expert_map, ep_rank): for rankid, map_per_rank in 
enumerate(global_expert_map): for idx, val in enumerate(map_per_rank): val = val.item() - # 计算value:当前值 + i * 有效元素个数 if val != -1: log2phy_map[idx].append(val + rankid * valid_count) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index daaca8b9..23f327d3 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -180,7 +180,7 @@ class AscendFusedMoE(FusedMoE): or ascend_config.expert_map_record_path) and ( self.log2phy is not None) self.local_num_experts = (torch.sum( - self._expert_map != -1) if self._expert_map is not None else + self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts) if self._expert_map is not None: logger.info_once( diff --git a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py index aeb751d0..e17b033e 100644 --- a/vllm_ascend/ops/fused_moe/token_dispatcher.py +++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py @@ -335,7 +335,9 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher): super().__init__(**kwargs) self.apply_router_weight_on_input = False self.max_num_tokens = kwargs.get("max_num_tokens") - self.num_experts_local = kwargs.get("num_local_experts", 0) + num_experts_local = kwargs.get("num_local_experts", 0) + self.num_experts_local = num_experts_local.item() if torch.is_tensor( + num_experts_local) else int(num_experts_local) self.original_shape = None self.with_quant = False