Enable overlap scheduler by default for the triton attention backend (#2105)

2024-11-20 02:58:35 -08:00
parent 56a347f7d3
commit 722530fa01
6 changed files with 21 additions and 24 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -174,17 +174,17 @@ class ServerArgs:
            self.cuda_graph_max_bs = 4
            logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")

+        # Choose kernel backends
        if not is_flashinfer_available():
            self.attention_backend = "triton"
            self.sampling_backend = "pytorch"

-        # Default kernel backends
        if self.attention_backend is None:
            self.attention_backend = "flashinfer"
-
        if self.sampling_backend is None:
            self.sampling_backend = "flashinfer"

+        # Others
        if self.enable_dp_attention:
            self.dp_size = self.tp_size
            self.chunked_prefill_size = self.chunked_prefill_size // 2
@@ -205,9 +205,6 @@ class ServerArgs:
            )
            self.disable_overlap_schedule = True

-        if not self.disable_overlap_schedule:
-            self.disable_jump_forward = True
-
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        # Model and port args