From 699384cb017c4096815cb090f473c4004388e5ad Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Wed, 20 Nov 2024 12:57:18 +0800 Subject: [PATCH] Set schedule policy more conservative for DP attention (#2096) --- python/sglang/srt/server_args.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 75afecbed..de1d4ee68 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -191,10 +191,12 @@ class ServerArgs: self.dp_size = self.tp_size self.chunked_prefill_size = self.chunked_prefill_size // 2 self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96) + self.schedule_conservativeness = self.schedule_conservativeness * 0.3 self.enable_overlap_schedule = False logger.warning( f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. " f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. " + f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. " "Data parallel size is adjusted to be the same as tensor parallel size." )