Launch dp ranks in parallel (#2053)

Co-authored-by: Haotian Liu <6631389+haotian-liu@users.noreply.github.com>
This commit is contained in:
Lianmin Zheng
2024-11-16 17:13:36 -08:00
parent edad373135
commit f719d9aebc
5 changed files with 63 additions and 28 deletions

View File

@@ -159,7 +159,7 @@ class ServerArgs:
if self.tp_size >= 16:
self.mem_fraction_static = 0.79
elif self.tp_size >= 8:
self.mem_fraction_static = 0.83
self.mem_fraction_static = 0.82
elif self.tp_size >= 4:
self.mem_fraction_static = 0.85
elif self.tp_size >= 2:
@@ -211,7 +211,7 @@ class ServerArgs:
self.enable_overlap_schedule = False
logger.warning(
f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE workload issue. "
"The CUDA graph is disabled."
"The CUDA graph is disabled. Data parallel size is adjusted to be the same as tensor parallel size."
)
if self.enable_overlap_schedule: