Improve DP attention (#4390)

Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Lianmin Zheng committed 2025-03-13 08:23:56 -07:00 (committed by GitHub)
parent f141298a3c · commit 8e66fbecee
9 changed files with 345 additions and 226 deletions


@@ -262,14 +262,14 @@ class ServerArgs:
         # Data parallelism attention
         if self.enable_dp_attention:
-            self.dp_size = self.tp_size
-            assert self.tp_size % self.dp_size == 0
-            self.chunked_prefill_size = self.chunked_prefill_size // 2
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
+            assert (
+                self.dp_size > 1
+            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
+            assert self.tp_size % self.dp_size == 0
+            self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
-                "Data parallel size is adjusted to be the same as tensor parallel size. "
             )

         # Speculative Decoding
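
To make the change concrete: dp-size is no longer silently forced to equal tp-size, and the chunked prefill budget is now split evenly across the DP ranks instead of being halved. The following is a minimal standalone sketch of that logic; apply_dp_attention_args is a hypothetical helper for illustration, not an SGLang API.

    def apply_dp_attention_args(
        tp_size: int,
        dp_size: int,
        chunked_prefill_size: int,
        schedule_conservativeness: float,
    ) -> tuple[int, float]:
        # dp-size must now be set explicitly (it is no longer defaulted to
        # tp-size) and must evenly divide tp-size.
        assert dp_size > 1, "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size"
        assert tp_size % dp_size == 0
        # The prefill budget is split evenly across DP ranks instead of the
        # previous fixed halving; scheduling is made more conservative.
        return chunked_prefill_size // dp_size, schedule_conservativeness * 0.3

    # Example: with tp=8 and dp=4, a chunked prefill size of 8192
    # becomes 2048 per DP rank.
    print(apply_dp_attention_args(8, 4, 8192, 1.0))  # -> (2048, 0.3)

A launch along the lines of --enable-dp-attention --tp-size 8 --dp-size 4 would satisfy the new assertions; the flag spellings here are inferred from the ServerArgs field names and may differ from the actual CLI.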