Init attention backend for Intel XPU (#10656)

Co-authored-by: guangyey <guangye.yu@intel.com> Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com>
2025-10-21 11:41:28 +08:00
parent fb6cc7b000
commit b113c72e7a
18 changed files with 1210 additions and 26 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -142,6 +142,7 @@ from sglang.srt.utils import (
    monkey_patch_vllm_gguf_config,
    set_cuda_arch,
    slow_rank_detector,
+    xpu_has_xmx_support,
 )
 from sglang.srt.utils.offloader import (
    create_offloader_from_server_args,
@@ -195,6 +196,7 @@ def add_chunked_prefix_cache_attention_backend(backend_name):
 _is_hip = is_hip()
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
+_is_xpu_xmx_available = xpu_has_xmx_support()

 # Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
@@ -505,6 +507,16 @@ class ModelRunner:
            )
            server_args.attention_backend = "torch_native"

+        if (
+            server_args.attention_backend == "intel_xpu"
+            and server_args.device == "xpu"
+            and not _is_xpu_xmx_available
+        ):
+            logger.info(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            server_args.attention_backend = "triton"
+
        if server_args.prefill_attention_backend is not None and (
            server_args.prefill_attention_backend
            == server_args.decode_attention_backend