[Feat] Multi-stream for eplb heat collection and aggregation (#4214)
### What this PR does / why we need it?

Optimizes multi-stream usage for EPLB heat collection and aggregation: the per-expert load (`moe_load`) accumulation in `AscendFusedMoE` is moved onto a dedicated side stream, fenced against the current stream, instead of running inline on the main compute stream.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0

---------

Signed-off-by: daishixun <dsxsteven@sina.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
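For context, the change below brackets the `moe_load` accumulation with stream fences. A minimal portable sketch of the same producer/consumer pattern, written against the CUDA stream API as a stand-in for the Ascend `torch.npu` / `npu_stream_switch` calls (function and variable names here are illustrative, not from this PR):

```python
import torch

def accumulate_on_side_stream(acc: torch.Tensor, update: torch.Tensor,
                              side: torch.cuda.Stream) -> None:
    """Run `acc += update` on a side stream, fenced against the current one."""
    cur = torch.cuda.current_stream()
    # Fence 1: the side stream must not read `update` before the main
    # stream has finished producing it.
    side.wait_stream(cur)
    with torch.cuda.stream(side):
        acc += update
        # `update` was allocated on the main stream; tell the caching
        # allocator it is still in use by the side stream.
        update.record_stream(side)
    # Fence 2: the main stream must not touch `acc` (or free `update`)
    # before the side-stream accumulation completes.
    cur.wait_stream(side)
```

In the PR itself, `moe_load_async_stream()` supplies the side stream and `npu_stream_switch` plays the role of `torch.cuda.stream`.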
```diff
@@ -36,6 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
+from vllm_ascend.eplb.utils import moe_load_async_stream
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.ops.fused_moe.moe_comm_method import setup_moe_comm_method
```
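`moe_load_async_stream` is the one new import; its body is not part of this diff. A plausible sketch, assuming the helper simply caches one dedicated side stream per process (the actual code in `vllm_ascend/eplb/utils.py` may differ):

```python
# Hypothetical reconstruction, not the actual vllm_ascend implementation.
import torch
import torch_npu  # noqa: F401  (registers the torch.npu backend)

_MOE_LOAD_STREAM = None

def moe_load_async_stream():
    """Return a process-wide side stream for EPLB load accumulation."""
    global _MOE_LOAD_STREAM
    if _MOE_LOAD_STREAM is None:
        _MOE_LOAD_STREAM = torch.npu.Stream()
    return _MOE_LOAD_STREAM
```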
```diff
@@ -368,8 +369,15 @@ class AscendFusedMoE(FusedMoE):
         if isinstance(final_hidden_states, tuple):
             final_hidden_states, group_list_type, expert_tokens = final_hidden_states
             if self.dynamic_eplb:
-                self.moe_load += expert_tokens if group_list_type == 1 else \
-                    torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+                moe_load_stream = moe_load_async_stream()
+                cur_stream = torch.npu.current_stream()
+
+                moe_load_stream.wait_stream(cur_stream)
+                with npu_stream_switch(moe_load_stream):
+                    self.moe_load += expert_tokens if group_list_type == 1 else \
+                        torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+                cur_stream.wait_stream(moe_load_stream)

         final_hidden_states = forward_context.moe_comm_method.finalize(
             hidden_states=final_hidden_states,
```
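A note on the `torch.cat(...)` branch kept inside the stream switch: when `group_list_type != 1`, `expert_tokens` evidently arrives as cumulative (prefix-sum) counts, and differencing adjacent entries recovers the per-expert token loads. A standalone illustration with made-up values:

```python
import torch

# Cumulative counts: experts received 3, 2, and 5 tokens respectively.
expert_tokens = torch.tensor([3, 5, 10])

# Keep the first entry, difference the rest: this inverts the prefix sum.
per_expert = torch.cat(
    [expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])

print(per_expert)  # tensor([3, 2, 5])
```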