Tune paged attention parameters for AMD GPU. (#3255)

2025-02-01 19:29:45 -06:00
parent 959dca4fc7
commit d9eb9358cc
2 changed files with 13 additions and 2 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -273,6 +273,10 @@ class ServerArgs:
        ) and check_gguf_file(self.model_path):
            self.quantization = self.load_format = "gguf"

+        # On AMD GPUs (HIP), use 16 KV splits for Triton attention
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        # Model and port args