Add intel_amx backend for Radix Attention for CPU (#6408)
Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
Co-authored-by: Thien Tran <gau.nernst@yahoo.com.sg>
This commit is contained in:
@@ -323,6 +323,11 @@ class ServerArgs:
             self.sampling_backend = "pytorch"

+        # Set kernel backends
+        if self.device == "cpu":
+            if self.attention_backend is None:
+                self.attention_backend = "intel_amx"
+            self.sampling_backend = "pytorch"
+
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -993,6 +998,7 @@ class ServerArgs:
                 "fa3",
                 "flashmla",
                 "cutlass_mla",
+                "intel_amx",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
Reference in New Issue
Block a user