[Patch] Patch the v1 executor when EPLB is enabled. (#3511)
### What this PR does / why we need it? When dynamic EPLB is enabled, patch the v1 executor so that creating child processes does not fail. ### How was this patch tested? Tested with DeepSeek V3. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
This commit is contained in:
@@ -365,17 +365,18 @@ def torchair_fused_experts_with_mc2(
|
||||
) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine(
|
||||
**kwargs_mc2)
|
||||
|
||||
if dynamic_eplb:
|
||||
return (hidden_states, 1, expert_token_nums)
|
||||
|
||||
if shared_experts is None:
|
||||
if dynamic_eplb:
|
||||
return (hidden_states, 1, expert_token_nums)
|
||||
return hidden_states
|
||||
else:
|
||||
with npu_stream_switch("moe_secondary", 0):
|
||||
npu_wait_tensor(shared_act, down_out_list)
|
||||
shared_output, _ = shared_experts.down_proj(
|
||||
(shared_act, swiglu_out_scale))
|
||||
return hidden_states, shared_output
|
||||
if dynamic_eplb:
|
||||
return (hidden_states, shared_output, 1, expert_token_nums)
|
||||
return (hidden_states, shared_output)
|
||||
|
||||
|
||||
def torchair_init_routing_quant(hidden_states, top_k, topk_ids,
|
||||
|
||||
Reference in New Issue
Block a user