Add intel_amx backend for Radix Attention on CPU (#6408)

Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
Co-authored-by: Thien Tran <gau.nernst@yahoo.com.sg>
This commit is contained in:
Authored by YanbingJiang on 2025-05-31 12:37:42 +08:00; committed by GitHub.
parent e39bca0756
commit 888cb175a6
8 changed files with 185 additions and 5 deletions

View File

@@ -323,6 +323,11 @@ class ServerArgs:
self.sampling_backend = "pytorch"
# Set kernel backends
if self.device == "cpu":
if self.attention_backend is None:
self.attention_backend = "intel_amx"
self.sampling_backend = "pytorch"
if self.sampling_backend is None:
self.sampling_backend = (
"flashinfer" if is_flashinfer_available() else "pytorch"
@@ -993,6 +998,7 @@ class ServerArgs:
"fa3",
"flashmla",
"cutlass_mla",
"intel_amx",
],
default=ServerArgs.attention_backend,
help="Choose the kernels for attention layers.",