Tune paged attention parameters for AMD GPU. (#3255)
This commit is contained in:
committed by
GitHub
parent
959dca4fc7
commit
d9eb9358cc
@@ -273,6 +273,10 @@ class ServerArgs:
|
||||
) and check_gguf_file(self.model_path):
|
||||
self.quantization = self.load_format = "gguf"
|
||||
|
||||
# AMD-specific default for the number of Triton attention KV splits
|
||||
if is_hip():
|
||||
self.triton_attention_num_kv_splits = 16
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
# Model and port args
|
||||
|
||||
Reference in New Issue
Block a user