From ae068a334297efcb7cb54e51bfa2d7c3a00ddbd6 Mon Sep 17 00:00:00 2001 From: weijinqian0 <1184188277@qq.com> Date: Mon, 24 Nov 2025 17:32:37 +0800 Subject: [PATCH] [Refactor] remove moe type of multicast. (#4224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main purposes of this PR are as follows: 1. Remove the multicast-related code; Reason: 1. In the scenario like a2 Dual-System Back-to-Back Networking, the performance is worse than all_gather. Before the modification, in e2e test, it was 3 tps; after the modification, it is 10 tps. 2. At the same time, we usually enable the SP feature, which is consistent with the current logic. 3. The advantage of broadcast communication lies in the fact that it does not suffer from uneven DP load and does not require the prefill ACL graph to be enabled. But we have supported the prefill ACL graph recently. So we think there is no need to maintain the multicast as one choice in moe communication. Performance benefits are as follows: When not enable_flashcomm1, TTFT remains relatively stable at around 43000ms, which is approximately 15000ms faster than before the modification. When enable_flashcomm1, there is no difference, TTFT remains relatively stable at around 29000ms. 
- vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379 --------- Signed-off-by: weijinqian_v1 Signed-off-by: weijinqian0 <1184188277@qq.com> Co-authored-by: weijinqian_v1 --- tests/ut/ops/test_prepare_finalize.py | 58 +-------- .../torchair/ops/test_torchair_fused_moe.py | 6 +- tests/ut/worker/test_model_runner_v1.py | 4 +- vllm_ascend/ascend_forward_context.py | 22 ++-- vllm_ascend/ops/fused_moe/moe_comm_method.py | 33 +---- vllm_ascend/ops/fused_moe/prepare_finalize.py | 114 +----------------- vllm_ascend/spec_decode/eagle_proposer.py | 12 +- vllm_ascend/spec_decode/mtp_proposer.py | 6 +- vllm_ascend/torchair/torchair_mtp_proposer.py | 6 +- vllm_ascend/worker/model_runner_v1.py | 18 +-- 10 files changed, 30 insertions(+), 249 deletions(-) diff --git a/tests/ut/ops/test_prepare_finalize.py b/tests/ut/ops/test_prepare_finalize.py index f0480e1c..35cb01a7 100644 --- a/tests/ut/ops/test_prepare_finalize.py +++ b/tests/ut/ops/test_prepare_finalize.py @@ -6,7 +6,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoEConfig from vllm_ascend.ops.fused_moe.prepare_finalize import ( PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, - PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast) + PrepareAndFinalizeWithMC2) class TestPrepareAndFinalize(unittest.TestCase): @@ -222,59 +222,3 @@ class TestPrepareAndFinalize(unittest.TestCase): mock_tp_all_reduce.return_value = result result_with_tp = layer.finalize(h_out, reduce_results=True) self.assertEqual(result_with_tp.shape[0], 3) - - @patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_dp_group") - @patch( - "vllm_ascend.ops.fused_moe.prepare_finalize.tensor_model_parallel_all_reduce" - ) - @patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context") - def test_naive_multicast_prepare_finalize(self, mock_get_forward_context, - mock_tp_all_reduce, - mock_get_dp_group): - # Mock forward context with DP metadata - 
mock_context = MagicMock() - mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor( - [2, 5, 7]) - mock_get_forward_context.return_value = mock_context - - # Setup DP group mock - mock_dp_group = MagicMock() - mock_dp_group.broadcast = MagicMock() - mock_dp_group.all_reduce = MagicMock() - mock_get_dp_group.return_value = mock_dp_group - - # Mock all_reduce to just return input (simulate sum) - def mock_all_reduce(tensor): - return tensor * 2 - - mock_dp_group.all_reduce.side_effect = mock_all_reduce - - # Setup config - self.moe_config.dp_size = 3 - self.moe_config.dp_rank = 1 - self.moe_config.tp_size = 1 - self.moe_config.ep_size = 1 - - layer = PrepareAndFinalizeWithNaiveMulticast(self.moe_config) - - # Local inputs - hidden_states = torch.randn(3, 8) - router_logits = torch.randn(3, 2) - - # Run prepare - h_out, r_out, _, _ = layer.prepare(hidden_states, router_logits) - - # Should be global tensor: [7, 8] and [7, 2] - self.assertEqual(h_out.shape, (7, 8)) - self.assertEqual(r_out.shape, (7, 2)) - - # Run finalize - result = layer.finalize(h_out, reduce_results=False) - - # Should slice back to local: [3, 8] - self.assertEqual(result.shape, (3, 8)) - - # Test with reduce_results=True and TP/EP > 1 - mock_tp_all_reduce.return_value = result - result_with_tp = layer.finalize(h_out, reduce_results=True) - self.assertEqual(result_with_tp.shape, (3, 8)) diff --git a/tests/ut/torchair/ops/test_torchair_fused_moe.py b/tests/ut/torchair/ops/test_torchair_fused_moe.py index 57569a28..cf306d2a 100644 --- a/tests/ut/torchair/ops/test_torchair_fused_moe.py +++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py @@ -23,7 +23,7 @@ from pytest_mock import MockerFixture from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase import vllm_ascend -from vllm_ascend.ascend_forward_context import _get_fused_moe_state +from vllm_ascend.ascend_forward_context import get_fused_moe_state from vllm_ascend.quantization.quant_config import 
AscendFusedMoEMethod from vllm_ascend.torchair.ops.torchair_fused_moe import ( TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) @@ -360,7 +360,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod: global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config( ).init_redundancy_expert is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256 - forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + forward_context = MagicMock(fused_moe_state=get_fused_moe_state( ep_size, is_prefill, is_deepseek_v3_r1)) with patch( "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", @@ -396,7 +396,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod: ep_size = others_param is_prefill = False forward_context = MagicMock( - fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) + fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True)) with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) diff --git a/tests/ut/worker/test_model_runner_v1.py b/tests/ut/worker/test_model_runner_v1.py index b4eec741..1d781490 100644 --- a/tests/ut/worker/test_model_runner_v1.py +++ b/tests/ut/worker/test_model_runner_v1.py @@ -74,7 +74,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel, # Bind the real method to the mock object method = NPUModelRunner._select_moe_comm_method( - mock_runner, num_tokens, False) + mock_runner, num_tokens) # Assert the result assert method == expected_method @@ -108,4 +108,4 @@ def test_select_moe_comm_method_unsupported_soc(): return_value=True), \ pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"): - NPUModelRunner._select_moe_comm_method(mock_runner, 100, False) + 
NPUModelRunner._select_moe_comm_method(mock_runner, 100) diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 8c477dac..11c1d3a0 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -29,16 +29,8 @@ class FusedMoEState(Enum): All2AllSeq = 5 -class MoECommType(Enum): - ALLGATHER = 0 - MC2 = 1 - ALLTOALL = 2 - NAIVE_MULTICAST = 3 - - -# TODO(zzzzwwjj): add soc_version to choose branch -def _get_fused_moe_state(ep_size: int, with_prefill: bool, - is_deepseek_v3_r1: bool): +def get_fused_moe_state(ep_size: int, with_prefill: bool, + is_deepseek_v3_r1: bool): # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep # only supports deepseek v3/r1 if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1 @@ -56,6 +48,12 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool, return FusedMoEState.MC2 +class MoECommType(Enum): + ALLGATHER = 0 + MC2 = 1 + ALLTOALL = 2 + + @contextmanager def set_ascend_forward_context( attn_metadata: Any, @@ -103,8 +101,8 @@ def set_ascend_forward_context( is_deepseek_v3_r1 = hasattr( vllm_config.model_config.hf_config, 'n_routed_experts' ) and vllm_config.model_config.hf_config.n_routed_experts == 256 - fused_moe_state = _get_fused_moe_state(ep_size, with_prefill, - is_deepseek_v3_r1) + fused_moe_state = get_fused_moe_state(ep_size, with_prefill, + is_deepseek_v3_r1) forward_context.fused_moe_state = fused_moe_state forward_context.in_profile_run = in_profile_run diff --git a/vllm_ascend/ops/fused_moe/moe_comm_method.py b/vllm_ascend/ops/fused_moe/moe_comm_method.py index c89eb1df..c48ce1a4 100644 --- a/vllm_ascend/ops/fused_moe/moe_comm_method.py +++ b/vllm_ascend/ops/fused_moe/moe_comm_method.py @@ -27,7 +27,7 @@ from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp from vllm_ascend.ops.fused_moe.prepare_finalize import ( 
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, - PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast, QuantType) + PrepareAndFinalizeWithMC2, QuantType) from vllm_ascend.ops.fused_moe.token_dispatcher import ( TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather, TokenDispatcherWithMC2, TokenDispatcherWithMoge) @@ -44,8 +44,6 @@ def setup_moe_comm_method(moe_config): _MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config) _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config) _MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config) - _MoECommMethods[MoECommType.NAIVE_MULTICAST] = NaiveMulticastCommImpl( - moe_config) class MoECommMethod(ABC): @@ -245,32 +243,3 @@ class AlltoAllCommImpl(MoECommMethod): def _get_prepare_finalize(self): return PrepareAndFinalizeWithAll2All(self.moe_config) - - -class NaiveMulticastCommImpl(MoECommMethod): - """This implementation is the same as NativeAllGatherCommImpl, - but uses NPU-specific ops for better performance. - - This implementation should be compatible with all scenarios, and - thus it is the default implementation for MoE communication methods. - It uses `torch_npu.npu_moe_init_routing_v2` for pre-processing - and `torch_npu.npu_moe_token_unpermute` for post-processing - to handle the token-to-expert mapping and communication efficiently. - - NOTE(Yizhou): TBH, it is really weird that we were supposed to use - `torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing` - or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute` - for pre-processing and post-processing, respectively. - But `npu_moe_finalize_routing` will lead to accuracy issues so we have to - use `torch_npu.npu_moe_token_unpermute` instead. - This is a workaround and should be removed after the issue is fixed. 
- """ - - def _get_token_dispatcher(self): - return TokenDispatcherWithAllGather( - top_k=self.moe_config.experts_per_token, - num_experts=self.moe_config.num_experts, - num_local_experts=self.moe_config.num_local_experts) - - def _get_prepare_finalize(self): - return PrepareAndFinalizeWithNaiveMulticast(self.moe_config) diff --git a/vllm_ascend/ops/fused_moe/prepare_finalize.py b/vllm_ascend/ops/fused_moe/prepare_finalize.py index 46640006..48350ea8 100644 --- a/vllm_ascend/ops/fused_moe/prepare_finalize.py +++ b/vllm_ascend/ops/fused_moe/prepare_finalize.py @@ -45,7 +45,7 @@ class PrepareAndFinalize(ABC): """ Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization in distributed environments. Subclasses implement specific communication strategies - (e.g., AllGather, All2All, MC2, Naive Multicast) to handle tensor padding, slicing, + (e.g., AllGather, All2All, MC2) to handle tensor padding, slicing, broadcasting, and reduction across TP/DP/EP groups. Attributes: @@ -454,115 +454,3 @@ class PrepareAndFinalizeWithAllGather(PrepareAndFinalize): hidden_states = tensor_model_parallel_all_reduce(hidden_states) return hidden_states - - -class PrepareAndFinalizeWithNaiveMulticast(PrepareAndFinalize): - """ - MoE communication strategy using Naive Multicast (point-to-point broadcast). - Will be used in prefill when using allgather in decode. Each DP rank broadcasts its slice to all others. - Uses `cu_tokens_across_dp_cpu` (cumulative tokens) to locate slice boundaries. - """ - - def _naive_multicast(self, x: torch.Tensor, - cu_tokens_across_dp_cpu: torch.Tensor): - """ - Naive multicast implementation: - 1. Create global buffer sized by total tokens across DP. - 2. Current rank copies its slice into its designated buffer region. - 3. Each rank broadcasts its slice to all others via P2P. 
- - Args: - x (torch.Tensor): Local tensor [local_tokens, hidden_size] - cu_tokens_across_dp_cpu (torch.Tensor): Cumulative token counts per DP rank - - Returns: - torch.Tensor: Global tensor [total_tokens, hidden_size] - """ - assert len(x.shape) == 2, "Input must be 2D [tokens, features]" - buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)), - device=x.device, - dtype=x.dtype) - - # Copy local slice into buffer - start = 0 if self.moe_config.dp_rank == 0 else cu_tokens_across_dp_cpu[ - self.moe_config.dp_rank - 1] - end = cu_tokens_across_dp_cpu[self.moe_config.dp_rank] - buffer[start:end, :].copy_(x) - - # Broadcast each slice to all ranks - for idx in range(self.moe_config.dp_size): - start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1] - end = cu_tokens_across_dp_cpu[idx] - get_dp_group().broadcast(buffer[start:end, :], idx) - return buffer - - def prepare( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - enable_shared_expert_dp: bool = False, - replace_allreduce: bool = False, - quant_type=QuantType.NONE - ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """ - Preparation steps: - 1. Fetch cumulative token boundaries from forward context. - 2. Multicast hidden_states and router_logits to form global tensors. 
- - Returns: - Tuple of (global_hidden_states, global_router_logits, None, None) - """ - self.enable_shared_expert_dp = enable_shared_expert_dp - - if self.moe_config.dp_size > 1: - self.cu_tokens_across_dp_cpu = get_forward_context( - ).dp_metadata.cu_tokens_across_sp(1) - hidden_states = self._naive_multicast(hidden_states, - self.cu_tokens_across_dp_cpu) - router_logits = self._naive_multicast(router_logits, - self.cu_tokens_across_dp_cpu) - - if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().all_gather( - hidden_states, - dim=0, - ) - router_logits = get_pcp_group().all_gather( - router_logits, - dim=0, - ) - - return hidden_states, router_logits, None, None - - def finalize(self, - hidden_states: torch.Tensor, - reduce_results: bool, - context_metadata: Optional[dict] = None) -> torch.Tensor: - """ - Finalization steps: - 1. If DP > 1 and not shared expert: - - All-reduce across DP - - Slice to current rank's token range using cu_tokens_across_dp_cpu - 2. If `reduce_results=True` and TP/EP > 1, apply tensor_model_parallel_all_reduce. 
- - Returns: - Tensor with shape [local_num_tokens, hidden_size] - """ - if self.moe_config.dp_size > 1 and not self.enable_shared_expert_dp: - start = 0 if self.moe_config.dp_rank == 0 else self.cu_tokens_across_dp_cpu[ - self.moe_config.dp_rank - 1] - end = self.cu_tokens_across_dp_cpu[self.moe_config.dp_rank] - hidden_states = get_dp_group().all_reduce( - hidden_states) # Sum across DP - hidden_states = hidden_states[start:end, :] - - if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().reduce_scatter(hidden_states, - dim=0) - - if reduce_results and (self.moe_config.tp_size > 1 - or self.moe_config.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) - - return hidden_states diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index d3be2ea9..4d076ac1 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -124,8 +124,7 @@ class EagleProposer(Proposer): num_tokens_across_dp: Optional[torch.Tensor] = None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None): - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) with set_ascend_forward_context(None, self.vllm_config, moe_comm_type=moe_comm_type, @@ -460,11 +459,7 @@ class EagleProposer(Proposer): else: num_input_tokens = num_tokens - with_prefill = attn_metadata.attn_state not in [ - AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding - ] - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens) # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions.to(device) @@ -504,8 +499,7 @@ class EagleProposer(Proposer): else: input_batch_size = batch_size - moe_comm_type = 
self.runner._select_moe_comm_method( - input_batch_size, False) + moe_comm_type = self.runner._select_moe_comm_method(input_batch_size) attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 7aa9b729..556a917f 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -223,8 +223,7 @@ class MtpProposer(Proposer): with_prefill, ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill) - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) if skip_attn: attn_metadata = None @@ -672,8 +671,7 @@ class MtpProposer(Proposer): with_prefill) = self.runner._sync_metadata_across_dp( num_input_tokens, self.runner.with_prefill) - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens) if scheduler_output: max_query_len = common_attn_metadata.max_query_len diff --git a/vllm_ascend/torchair/torchair_mtp_proposer.py b/vllm_ascend/torchair/torchair_mtp_proposer.py index 183e0da2..b816b8d8 100644 --- a/vllm_ascend/torchair/torchair_mtp_proposer.py +++ b/vllm_ascend/torchair/torchair_mtp_proposer.py @@ -81,8 +81,7 @@ class TorchairMtpProposer(MtpProposer): num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None) -> None: - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) if not with_prefill: skip_attn = False @@ -342,8 +341,7 @@ class TorchairMtpProposer(MtpProposer): num_tokens_across_dp = self.runner.num_tokens_across_dp with_prefill = self.runner.with_prefill - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = 
self.runner._select_moe_comm_method(num_input_tokens) batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=False) aclgraph_runtime_mode, batch_descriptor = \ diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 124102f5..8f103ddc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2192,8 +2192,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): kv_connector_output=kv_connector_output, ) - def _select_moe_comm_method(self, num_tokens: int, - with_prefill: bool) -> Optional[MoECommType]: + def _select_moe_comm_method(self, + num_tokens: int) -> Optional[MoECommType]: """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all are designed for expert parallelism. 2. If expert parallel is enabled, we need to consider the soc version and the @@ -2244,12 +2244,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: raise ValueError(f"Unsupported soc_version: {soc_version}") - if moe_comm_type == MoECommType.ALLGATHER and with_prefill: - if enable_sp(): - moe_comm_type = MoECommType.ALLGATHER - else: - moe_comm_type = MoECommType.NAIVE_MULTICAST - # PanguProMoE only supports allgather if model_type == "PanguProMoE": moe_comm_type = MoECommType.ALLGATHER @@ -2289,8 +2283,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.dynamic_eplb: self.eplb_updator.take_update_info_from_eplb_process() - moe_comm_type = self._select_moe_comm_method(num_input_tokens, - self.with_prefill) + moe_comm_type = self._select_moe_comm_method(num_input_tokens) uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( scheduler_output.total_num_scheduled_tokens @@ -2823,7 +2816,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): with_prefill) = self._sync_metadata_across_dp(num_tokens, with_prefill) - moe_comm_type = self._select_moe_comm_method(num_tokens, with_prefill) + moe_comm_type = self._select_moe_comm_method(num_tokens) # If 
cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.seperate_routine(). This means that we are using @@ -2999,8 +2992,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): # allowing vLLM to correctly estimate the maximum memory required. if self.max_num_tokens > self.mc2_tokens_capacity and \ self._select_moe_comm_method( - self.mc2_tokens_capacity, - with_prefill=True) == MoECommType.MC2: + self.mc2_tokens_capacity) == MoECommType.MC2: self._dummy_run(self.mc2_tokens_capacity, with_prefill=True) output = None