[4/N]DP refactor: support watching mode get_load and shortest queue strategy (#10201)

This commit is contained in:
Liangsheng Yin
2025-09-15 10:06:08 +08:00
committed by GitHub
parent ca63f075b7
commit 305c9e8c2d
12 changed files with 202 additions and 44 deletions

View File

@@ -233,6 +233,7 @@ class ServerArgs:
# Data parallelism
dp_size: int = 1
load_balance_method: str = "round_robin"
# Interval of load watching, in seconds; also used as the default value of
# the --load-watch-interval command-line flag.
load_watch_interval: float = 0.1
# FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
prefill_round_robin_balance: bool = False
@@ -663,6 +664,7 @@ class ServerArgs:
if self.dp_size == 1:
self.enable_dp_attention = False
self.enable_dp_lm_head = False
# Data parallelism attention
if self.enable_dp_attention:
@@ -1488,6 +1490,12 @@ class ServerArgs:
"minimum_tokens",
],
)
# Expose the load-watching interval (in seconds) as a CLI flag. The default is
# read from the ServerArgs.load_watch_interval field rather than repeated here.
parser.add_argument(
"--load-watch-interval",
type=float,
default=ServerArgs.load_watch_interval,
help="The interval of load watching in seconds.",
)
parser.add_argument(
"--prefill-round-robin-balance",
default=ServerArgs.prefill_round_robin_balance,