Add split tile size for Triton attention (#10425)
This commit is contained in:
@@ -362,6 +362,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
@@ -2100,6 +2101,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
||||
Reference in New Issue
Block a user