[Refactor] remove moe type of multicast. (#4224)

The main purposes of this PR are as follows: 
1. Remove the multicast-related code; 

Reason:
1. In scenarios like A2 dual-system back-to-back networking, the
performance is worse than all_gather. Before the modification, the e2e
test ran at 3 tps; after the modification, it runs at 10 tps.
2. At the same time, we usually enable the SP feature, which is consistent
with the current logic.
3. The advantage of broadcast communication is that it does not suffer
from uneven DP load and does not require the prefill ACL graph to be
enabled. However, we have recently added support for the prefill ACL graph.

So we think there is no need to keep maintaining multicast as an option
for MoE communication.

Performance benefits are as follows:
When enable_flashcomm1 is disabled, TTFT remains relatively stable at
around 43000 ms, which is approximately 15000 ms faster than before the
modification.

When enable_flashcomm1 is enabled, there is no difference: TTFT remains
relatively stable at around 29000 ms.


- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Signed-off-by: weijinqian0 <1184188277@qq.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
weijinqian0
2025-11-24 17:32:37 +08:00
committed by GitHub
parent 5508a602ed
commit ae068a3342
10 changed files with 30 additions and 249 deletions

View File

@@ -6,7 +6,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoEConfig
from vllm_ascend.ops.fused_moe.prepare_finalize import ( from vllm_ascend.ops.fused_moe.prepare_finalize import (
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather,
PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast) PrepareAndFinalizeWithMC2)
class TestPrepareAndFinalize(unittest.TestCase): class TestPrepareAndFinalize(unittest.TestCase):
@@ -222,59 +222,3 @@ class TestPrepareAndFinalize(unittest.TestCase):
mock_tp_all_reduce.return_value = result mock_tp_all_reduce.return_value = result
result_with_tp = layer.finalize(h_out, reduce_results=True) result_with_tp = layer.finalize(h_out, reduce_results=True)
self.assertEqual(result_with_tp.shape[0], 3) self.assertEqual(result_with_tp.shape[0], 3)
@patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_dp_group")
@patch(
"vllm_ascend.ops.fused_moe.prepare_finalize.tensor_model_parallel_all_reduce"
)
@patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context")
def test_naive_multicast_prepare_finalize(self, mock_get_forward_context,
mock_tp_all_reduce,
mock_get_dp_group):
# Mock forward context with DP metadata
mock_context = MagicMock()
mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor(
[2, 5, 7])
mock_get_forward_context.return_value = mock_context
# Setup DP group mock
mock_dp_group = MagicMock()
mock_dp_group.broadcast = MagicMock()
mock_dp_group.all_reduce = MagicMock()
mock_get_dp_group.return_value = mock_dp_group
# Mock all_reduce to just return input (simulate sum)
def mock_all_reduce(tensor):
return tensor * 2
mock_dp_group.all_reduce.side_effect = mock_all_reduce
# Setup config
self.moe_config.dp_size = 3
self.moe_config.dp_rank = 1
self.moe_config.tp_size = 1
self.moe_config.ep_size = 1
layer = PrepareAndFinalizeWithNaiveMulticast(self.moe_config)
# Local inputs
hidden_states = torch.randn(3, 8)
router_logits = torch.randn(3, 2)
# Run prepare
h_out, r_out, _, _ = layer.prepare(hidden_states, router_logits)
# Should be global tensor: [7, 8] and [7, 2]
self.assertEqual(h_out.shape, (7, 8))
self.assertEqual(r_out.shape, (7, 2))
# Run finalize
result = layer.finalize(h_out, reduce_results=False)
# Should slice back to local: [3, 8]
self.assertEqual(result.shape, (3, 8))
# Test with reduce_results=True and TP/EP > 1
mock_tp_all_reduce.return_value = result
result_with_tp = layer.finalize(h_out, reduce_results=True)
self.assertEqual(result_with_tp.shape, (3, 8))

View File

@@ -23,7 +23,7 @@ from pytest_mock import MockerFixture
from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
import vllm_ascend import vllm_ascend
from vllm_ascend.ascend_forward_context import _get_fused_moe_state from vllm_ascend.ascend_forward_context import get_fused_moe_state
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.torchair.ops.torchair_fused_moe import ( from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
@@ -360,7 +360,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config( global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config(
).init_redundancy_expert ).init_redundancy_expert
is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256 is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256
forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
ep_size, is_prefill, is_deepseek_v3_r1)) ep_size, is_prefill, is_deepseek_v3_r1))
with patch( with patch(
"vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
@@ -396,7 +396,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
ep_size = others_param ep_size = others_param
is_prefill = False is_prefill = False
forward_context = MagicMock( forward_context = MagicMock(
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])

View File

@@ -74,7 +74,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel,
# Bind the real method to the mock object # Bind the real method to the mock object
method = NPUModelRunner._select_moe_comm_method( method = NPUModelRunner._select_moe_comm_method(
mock_runner, num_tokens, False) mock_runner, num_tokens)
# Assert the result # Assert the result
assert method == expected_method assert method == expected_method
@@ -108,4 +108,4 @@ def test_select_moe_comm_method_unsupported_soc():
return_value=True), \ return_value=True), \
pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"): pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"):
NPUModelRunner._select_moe_comm_method(mock_runner, 100, False) NPUModelRunner._select_moe_comm_method(mock_runner, 100)

View File

@@ -29,16 +29,8 @@ class FusedMoEState(Enum):
All2AllSeq = 5 All2AllSeq = 5
class MoECommType(Enum): def get_fused_moe_state(ep_size: int, with_prefill: bool,
ALLGATHER = 0 is_deepseek_v3_r1: bool):
MC2 = 1
ALLTOALL = 2
NAIVE_MULTICAST = 3
# TODO(zzzzwwjj): add soc_version to choose branch
def _get_fused_moe_state(ep_size: int, with_prefill: bool,
is_deepseek_v3_r1: bool):
# the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
# only supports deepseek v3/r1 # only supports deepseek v3/r1
if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1 if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
@@ -56,6 +48,12 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
return FusedMoEState.MC2 return FusedMoEState.MC2
class MoECommType(Enum):
ALLGATHER = 0
MC2 = 1
ALLTOALL = 2
@contextmanager @contextmanager
def set_ascend_forward_context( def set_ascend_forward_context(
attn_metadata: Any, attn_metadata: Any,
@@ -103,8 +101,8 @@ def set_ascend_forward_context(
is_deepseek_v3_r1 = hasattr( is_deepseek_v3_r1 = hasattr(
vllm_config.model_config.hf_config, 'n_routed_experts' vllm_config.model_config.hf_config, 'n_routed_experts'
) and vllm_config.model_config.hf_config.n_routed_experts == 256 ) and vllm_config.model_config.hf_config.n_routed_experts == 256
fused_moe_state = _get_fused_moe_state(ep_size, with_prefill, fused_moe_state = get_fused_moe_state(ep_size, with_prefill,
is_deepseek_v3_r1) is_deepseek_v3_r1)
forward_context.fused_moe_state = fused_moe_state forward_context.fused_moe_state = fused_moe_state
forward_context.in_profile_run = in_profile_run forward_context.in_profile_run = in_profile_run

View File

@@ -27,7 +27,7 @@ from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.fused_moe.prepare_finalize import ( from vllm_ascend.ops.fused_moe.prepare_finalize import (
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather,
PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast, QuantType) PrepareAndFinalizeWithMC2, QuantType)
from vllm_ascend.ops.fused_moe.token_dispatcher import ( from vllm_ascend.ops.fused_moe.token_dispatcher import (
TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather, TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
TokenDispatcherWithMC2, TokenDispatcherWithMoge) TokenDispatcherWithMC2, TokenDispatcherWithMoge)
@@ -44,8 +44,6 @@ def setup_moe_comm_method(moe_config):
_MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config) _MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config)
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config) _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config)
_MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config) _MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config)
_MoECommMethods[MoECommType.NAIVE_MULTICAST] = NaiveMulticastCommImpl(
moe_config)
class MoECommMethod(ABC): class MoECommMethod(ABC):
@@ -245,32 +243,3 @@ class AlltoAllCommImpl(MoECommMethod):
def _get_prepare_finalize(self): def _get_prepare_finalize(self):
return PrepareAndFinalizeWithAll2All(self.moe_config) return PrepareAndFinalizeWithAll2All(self.moe_config)
class NaiveMulticastCommImpl(MoECommMethod):
"""This implementation is the same as NativeAllGatherCommImpl,
but uses NPU-specific ops for better performance.
This implementation should be compatible with all scenarios, and
thus it is the default implementation for MoE communication methods.
It uses `torch_npu.npu_moe_init_routing_v2` for pre-processing
and `torch_npu.npu_moe_token_unpermute` for post-processing
to handle the token-to-expert mapping and communication efficiently.
NOTE(Yizhou): TBH, it is really weird that we were supposed to use
`torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing`
or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute`
for pre-processing and post-processing, respectively.
But `npu_moe_finalize_routing` will lead to accuracy issues so we have to
use `torch_npu.npu_moe_token_unpermute` instead.
This is a workaround and should be removed after the issue is fixed.
"""
def _get_token_dispatcher(self):
return TokenDispatcherWithAllGather(
top_k=self.moe_config.experts_per_token,
num_experts=self.moe_config.num_experts,
num_local_experts=self.moe_config.num_local_experts)
def _get_prepare_finalize(self):
return PrepareAndFinalizeWithNaiveMulticast(self.moe_config)

View File

@@ -45,7 +45,7 @@ class PrepareAndFinalize(ABC):
""" """
Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization
in distributed environments. Subclasses implement specific communication strategies in distributed environments. Subclasses implement specific communication strategies
(e.g., AllGather, All2All, MC2, Naive Multicast) to handle tensor padding, slicing, (e.g., AllGather, All2All, MC2) to handle tensor padding, slicing,
broadcasting, and reduction across TP/DP/EP groups. broadcasting, and reduction across TP/DP/EP groups.
Attributes: Attributes:
@@ -454,115 +454,3 @@ class PrepareAndFinalizeWithAllGather(PrepareAndFinalize):
hidden_states = tensor_model_parallel_all_reduce(hidden_states) hidden_states = tensor_model_parallel_all_reduce(hidden_states)
return hidden_states return hidden_states
class PrepareAndFinalizeWithNaiveMulticast(PrepareAndFinalize):
"""
MoE communication strategy using Naive Multicast (point-to-point broadcast).
Will be used in prefill when using allgather in decode. Each DP rank broadcasts its slice to all others.
Uses `cu_tokens_across_dp_cpu` (cumulative tokens) to locate slice boundaries.
"""
def _naive_multicast(self, x: torch.Tensor,
cu_tokens_across_dp_cpu: torch.Tensor):
"""
Naive multicast implementation:
1. Create global buffer sized by total tokens across DP.
2. Current rank copies its slice into its designated buffer region.
3. Each rank broadcasts its slice to all others via P2P.
Args:
x (torch.Tensor): Local tensor [local_tokens, hidden_size]
cu_tokens_across_dp_cpu (torch.Tensor): Cumulative token counts per DP rank
Returns:
torch.Tensor: Global tensor [total_tokens, hidden_size]
"""
assert len(x.shape) == 2, "Input must be 2D [tokens, features]"
buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
device=x.device,
dtype=x.dtype)
# Copy local slice into buffer
start = 0 if self.moe_config.dp_rank == 0 else cu_tokens_across_dp_cpu[
self.moe_config.dp_rank - 1]
end = cu_tokens_across_dp_cpu[self.moe_config.dp_rank]
buffer[start:end, :].copy_(x)
# Broadcast each slice to all ranks
for idx in range(self.moe_config.dp_size):
start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
end = cu_tokens_across_dp_cpu[idx]
get_dp_group().broadcast(buffer[start:end, :], idx)
return buffer
def prepare(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
enable_shared_expert_dp: bool = False,
replace_allreduce: bool = False,
quant_type=QuantType.NONE
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
Optional[torch.Tensor]]:
"""
Preparation steps:
1. Fetch cumulative token boundaries from forward context.
2. Multicast hidden_states and router_logits to form global tensors.
Returns:
Tuple of (global_hidden_states, global_router_logits, None, None)
"""
self.enable_shared_expert_dp = enable_shared_expert_dp
if self.moe_config.dp_size > 1:
self.cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_sp(1)
hidden_states = self._naive_multicast(hidden_states,
self.cu_tokens_across_dp_cpu)
router_logits = self._naive_multicast(router_logits,
self.cu_tokens_across_dp_cpu)
if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().all_gather(
hidden_states,
dim=0,
)
router_logits = get_pcp_group().all_gather(
router_logits,
dim=0,
)
return hidden_states, router_logits, None, None
def finalize(self,
hidden_states: torch.Tensor,
reduce_results: bool,
context_metadata: Optional[dict] = None) -> torch.Tensor:
"""
Finalization steps:
1. If DP > 1 and not shared expert:
- All-reduce across DP
- Slice to current rank's token range using cu_tokens_across_dp_cpu
2. If `reduce_results=True` and TP/EP > 1, apply tensor_model_parallel_all_reduce.
Returns:
Tensor with shape [local_num_tokens, hidden_size]
"""
if self.moe_config.dp_size > 1 and not self.enable_shared_expert_dp:
start = 0 if self.moe_config.dp_rank == 0 else self.cu_tokens_across_dp_cpu[
self.moe_config.dp_rank - 1]
end = self.cu_tokens_across_dp_cpu[self.moe_config.dp_rank]
hidden_states = get_dp_group().all_reduce(
hidden_states) # Sum across DP
hidden_states = hidden_states[start:end, :]
if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().reduce_scatter(hidden_states,
dim=0)
if reduce_results and (self.moe_config.tp_size > 1
or self.moe_config.ep_size > 1):
hidden_states = tensor_model_parallel_all_reduce(hidden_states)
return hidden_states

View File

@@ -124,8 +124,7 @@ class EagleProposer(Proposer):
num_tokens_across_dp: Optional[torch.Tensor] = None, num_tokens_across_dp: Optional[torch.Tensor] = None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None): batch_descriptor=None):
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
with set_ascend_forward_context(None, with set_ascend_forward_context(None,
self.vllm_config, self.vllm_config,
moe_comm_type=moe_comm_type, moe_comm_type=moe_comm_type,
@@ -460,11 +459,7 @@ class EagleProposer(Proposer):
else: else:
num_input_tokens = num_tokens num_input_tokens = num_tokens
with_prefill = attn_metadata.attn_state not in [ moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
]
moe_comm_type = self.runner._select_moe_comm_method(
num_input_tokens, with_prefill)
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions.to(device) self.positions[:num_tokens] = target_positions.to(device)
@@ -504,8 +499,7 @@ class EagleProposer(Proposer):
else: else:
input_batch_size = batch_size input_batch_size = batch_size
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(input_batch_size)
input_batch_size, False)
attn_metadata.num_actual_tokens = batch_size attn_metadata.num_actual_tokens = batch_size
attn_metadata.max_query_len = 1 attn_metadata.max_query_len = 1

View File

@@ -223,8 +223,7 @@ class MtpProposer(Proposer):
with_prefill, with_prefill,
) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill) ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
if skip_attn: if skip_attn:
attn_metadata = None attn_metadata = None
@@ -672,8 +671,7 @@ class MtpProposer(Proposer):
with_prefill) = self.runner._sync_metadata_across_dp( with_prefill) = self.runner._sync_metadata_across_dp(
num_input_tokens, self.runner.with_prefill) num_input_tokens, self.runner.with_prefill)
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
num_input_tokens, with_prefill)
if scheduler_output: if scheduler_output:
max_query_len = common_attn_metadata.max_query_len max_query_len = common_attn_metadata.max_query_len

View File

@@ -81,8 +81,7 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp=None, num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None) -> None: batch_descriptor=None) -> None:
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
if not with_prefill: if not with_prefill:
skip_attn = False skip_attn = False
@@ -342,8 +341,7 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp = self.runner.num_tokens_across_dp num_tokens_across_dp = self.runner.num_tokens_across_dp
with_prefill = self.runner.with_prefill with_prefill = self.runner.with_prefill
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
num_input_tokens, with_prefill)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False) uniform_decode=False)
aclgraph_runtime_mode, batch_descriptor = \ aclgraph_runtime_mode, batch_descriptor = \

View File

@@ -2192,8 +2192,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv_connector_output=kv_connector_output, kv_connector_output=kv_connector_output,
) )
def _select_moe_comm_method(self, num_tokens: int, def _select_moe_comm_method(self,
with_prefill: bool) -> Optional[MoECommType]: num_tokens: int) -> Optional[MoECommType]:
"""1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all
are designed for expert parallelism. are designed for expert parallelism.
2. If expert parallel is enabled, we need to consider the soc version and the 2. If expert parallel is enabled, we need to consider the soc version and the
@@ -2244,12 +2244,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else: else:
raise ValueError(f"Unsupported soc_version: {soc_version}") raise ValueError(f"Unsupported soc_version: {soc_version}")
if moe_comm_type == MoECommType.ALLGATHER and with_prefill:
if enable_sp():
moe_comm_type = MoECommType.ALLGATHER
else:
moe_comm_type = MoECommType.NAIVE_MULTICAST
# PanguProMoE only supports allgather # PanguProMoE only supports allgather
if model_type == "PanguProMoE": if model_type == "PanguProMoE":
moe_comm_type = MoECommType.ALLGATHER moe_comm_type = MoECommType.ALLGATHER
@@ -2289,8 +2283,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
if self.dynamic_eplb: if self.dynamic_eplb:
self.eplb_updator.take_update_info_from_eplb_process() self.eplb_updator.take_update_info_from_eplb_process()
moe_comm_type = self._select_moe_comm_method(num_input_tokens, moe_comm_type = self._select_moe_comm_method(num_input_tokens)
self.with_prefill)
uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
scheduler_output.total_num_scheduled_tokens scheduler_output.total_num_scheduled_tokens
@@ -2823,7 +2816,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
with_prefill) = self._sync_metadata_across_dp(num_tokens, with_prefill) = self._sync_metadata_across_dp(num_tokens,
with_prefill) with_prefill)
moe_comm_type = self._select_moe_comm_method(num_tokens, with_prefill) moe_comm_type = self._select_moe_comm_method(num_tokens)
# If cudagraph_mode.decode_mode() == FULL and # If cudagraph_mode.decode_mode() == FULL and
# cudagraph_mode.seperate_routine(). This means that we are using # cudagraph_mode.seperate_routine(). This means that we are using
@@ -2999,8 +2992,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# allowing vLLM to correctly estimate the maximum memory required. # allowing vLLM to correctly estimate the maximum memory required.
if self.max_num_tokens > self.mc2_tokens_capacity and \ if self.max_num_tokens > self.mc2_tokens_capacity and \
self._select_moe_comm_method( self._select_moe_comm_method(
self.mc2_tokens_capacity, self.mc2_tokens_capacity) == MoECommType.MC2:
with_prefill=True) == MoECommType.MC2:
self._dummy_run(self.mc2_tokens_capacity, with_prefill=True) self._dummy_run(self.mc2_tokens_capacity, with_prefill=True)
output = None output = None