Add intel_amx backend for Radix Attention on CPU (#6408)

Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
Co-authored-by: Thien Tran <gau.nernst@yahoo.com.sg>
This commit is contained in:
Authored by YanbingJiang on 2025-05-31 12:37:42 +08:00; committed by GitHub.
parent e39bca0756
commit 888cb175a6
8 changed files with 185 additions and 5 deletions

View File

@@ -323,6 +323,11 @@ class ServerArgs:
self.sampling_backend = "pytorch"
# Set kernel backends
if self.device == "cpu":
if self.attention_backend is None:
self.attention_backend = "intel_amx"
self.sampling_backend = "pytorch"
if self.sampling_backend is None:
self.sampling_backend = (
"flashinfer" if is_flashinfer_available() else "pytorch"
@@ -993,6 +998,7 @@ class ServerArgs:
"fa3",
"flashmla",
"cutlass_mla",
"intel_amx",
],
default=ServerArgs.attention_backend,
help="Choose the kernels for attention layers.",