ROCm: Flex Attention Enablement with custom backends (#4178)

Co-authored-by: linsun12 <linsun12@amd.com>
This commit is contained in:
HAI
2025-03-07 04:38:53 -08:00
committed by GitHub
parent c827c671f7
commit 0beea4503f
7 changed files with 1434 additions and 35 deletions

View File

@@ -710,13 +710,23 @@ class ServerArgs:
)
# Kernel backend
-        parser.add_argument(
-            "--attention-backend",
-            type=str,
-            choices=["flashinfer", "triton", "torch_native"],
-            default=ServerArgs.attention_backend,
-            help="Choose the kernels for attention layers.",
-        )
+        if is_hip():
+            parser.add_argument(
+                "--attention-backend",
+                type=str,
+                choices=["triton", "torch_native", "aiter", "aiter_decode"],
+                default=ServerArgs.attention_backend,
+                help="Choose the kernels for attention layers.",
+            )
+        else:
+            parser.add_argument(
+                "--attention-backend",
+                type=str,
+                choices=["flashinfer", "triton", "torch_native"],
+                default=ServerArgs.attention_backend,
+                help="Choose the kernels for attention layers.",
+            )
parser.add_argument(
"--sampling-backend",
type=str,