Enable overlap scheduler by default for the triton attention backend (#2105)

This commit is contained in:
Lianmin Zheng
2024-11-20 02:58:35 -08:00
committed by GitHub
parent 56a347f7d3
commit 722530fa01
6 changed files with 21 additions and 24 deletions

View File

@@ -174,17 +174,17 @@ class ServerArgs:
self.cuda_graph_max_bs = 4
logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")
# Choose kernel backends
if not is_flashinfer_available():
self.attention_backend = "triton"
self.sampling_backend = "pytorch"
# Default kernel backends
if self.attention_backend is None:
self.attention_backend = "flashinfer"
if self.sampling_backend is None:
self.sampling_backend = "flashinfer"
# Others
if self.enable_dp_attention:
self.dp_size = self.tp_size
self.chunked_prefill_size = self.chunked_prefill_size // 2
@@ -205,9 +205,6 @@ class ServerArgs:
)
self.disable_overlap_schedule = True
if not self.disable_overlap_schedule:
self.disable_jump_forward = True
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
# Model and port args