Support DP MLA (#1970)

2024-11-16 17:01:43 +08:00
parent 2f2e07439c
commit 976bc302e5
12 changed files with 395 additions and 63 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -129,6 +129,7 @@ class ServerArgs:
    disable_nan_detection: bool = False
    enable_overlap_schedule: bool = False
    enable_mixed_chunk: bool = False
+    enable_dp_attention: bool = False
    enable_torch_compile: bool = False
    torch_compile_max_bs: int = 32
    cuda_graph_max_bs: int = 160
@@ -203,6 +204,16 @@ class ServerArgs:
        if self.sampling_backend is None:
            self.sampling_backend = "flashinfer"

+        if self.enable_dp_attention:
+            self.dp_size = self.tp_size
+            self.chunked_prefill_size = self.chunked_prefill_size // 2
+            self.disable_cuda_graph = True
+            self.enable_overlap_schedule = False
+            logger.warning(
+                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE workload issue. "
+                "The CUDA graph is disabled."
+            )
+
        if self.enable_overlap_schedule:
            logger.warning(
                "Overlap scheduler mode is enabled. This is an experimental feature. "
@@ -669,6 +680,11 @@ class ServerArgs:
            action="store_true",
            help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
        )
+        parser.add_argument(
+            "--enable-dp-attention",
+            action="store_true",
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
+        )
        parser.add_argument(
            "--enable-torch-compile",
            action="store_true",