feat: add dp attention support for Qwen 2/3 MoE models, fixes #6088 (#6121)

Co-authored-by: King.Zevin <zevin@mail.ustc.edu.cn> Co-authored-by: Yi Zhang <1109276519@qq.com>
2025-05-17 05:44:10 +08:00
parent 6fc9357503
commit 4bd2952a37
4 changed files with 451 additions and 72 deletions
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -269,6 +269,7 @@ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
            batch,
            dp_size=model_runner.server_args.dp_size,
            attn_tp_size=1,
+            moe_dense_tp_size=model_runner.server_args.moe_dense_tp_size,
            tp_cpu_group=model_runner.tp_group.cpu_group,
            get_idle_batch=None,
            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,