Tune paged attention parameters for AMD GPU. (#3255)

This commit is contained in:
Wen-Heng (Jack) Chung
2025-02-01 19:29:45 -06:00
committed by GitHub
parent 959dca4fc7
commit d9eb9358cc
2 changed files with 13 additions and 2 deletions

View File

@@ -273,6 +273,10 @@ class ServerArgs:
) and check_gguf_file(self.model_path):
self.quantization = self.load_format = "gguf"
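# Default number of Triton attention KV splits on AMD (HIP) GPUs.
# NOTE(review): the assignment below is unconditional under is_hip(), so it
# presumably replaces any value the user supplied — confirm this is intended.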
if is_hip():
self.triton_attention_num_kv_splits = 16
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
# Model and port args