Init attention backend for Intel XPU (#10656)

Co-authored-by: guangyey <guangye.yu@intel.com>
Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com>
This commit is contained in:
Meng, Hengyu
2025-10-21 11:41:28 +08:00
committed by GitHub
parent fb6cc7b000
commit b113c72e7a
18 changed files with 1210 additions and 26 deletions

View File

@@ -142,6 +142,7 @@ from sglang.srt.utils import (
monkey_patch_vllm_gguf_config,
set_cuda_arch,
slow_rank_detector,
xpu_has_xmx_support,
)
from sglang.srt.utils.offloader import (
create_offloader_from_server_args,
@@ -195,6 +196,7 @@ def add_chunked_prefix_cache_attention_backend(backend_name):
# Platform capability flags: each helper is called once here and the result
# is kept in a private name, so later code tests the stored value instead of
# calling the helper again.
_is_hip = is_hip()
_is_npu = is_npu()
_is_cpu_amx_available = cpu_has_amx_support()
# Result of xpu_has_xmx_support(); used when validating the "intel_xpu"
# attention backend (presumably True when Intel XMX is usable — confirm
# against the helper's implementation).
_is_xpu_xmx_available = xpu_has_xmx_support()
# Use a small KV cache pool size for tests in CI
# Read from the environment; None when the variable is unset, otherwise the
# raw string value as returned by os.getenv.
SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
@@ -505,6 +507,16 @@ class ModelRunner:
)
server_args.attention_backend = "torch_native"
# The "intel_xpu" attention backend is only kept when XMX support was
# detected. If it was requested on an "xpu" device without that support,
# log the downgrade and switch the backend to "triton" instead.
if (
server_args.attention_backend == "intel_xpu"
and server_args.device == "xpu"
and not _is_xpu_xmx_available
):
logger.info(
"The current platform does not support Intel XMX, will fallback to triton backend."
)
# Overwrites the setting in place on server_args.
server_args.attention_backend = "triton"
if server_args.prefill_attention_backend is not None and (
server_args.prefill_attention_backend
== server_args.decode_attention_backend