feat: add dp attention support for Qwen 2/3 MoE models, fixes #6088 (#6121)

Co-authored-by: King.Zevin <zevin@mail.ustc.edu.cn>
Co-authored-by: Yi Zhang <1109276519@qq.com>
This commit is contained in:
Fr4nk1in
2025-05-17 05:44:10 +08:00
committed by GitHub
parent 6fc9357503
commit 4bd2952a37
4 changed files with 451 additions and 72 deletions

View File

@@ -269,6 +269,7 @@ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
batch,
dp_size=model_runner.server_args.dp_size,
attn_tp_size=1,
moe_dense_tp_size=model_runner.server_args.moe_dense_tp_size,
tp_cpu_group=model_runner.tp_group.cpu_group,
get_idle_batch=None,
disable_cuda_graph=model_runner.server_args.disable_cuda_graph,