Init attention backend for Intel XPU (#10656)
Co-authored-by: guangyey <guangye.yu@intel.com>
Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com>
@@ -142,6 +142,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_gguf_config,
     set_cuda_arch,
     slow_rank_detector,
+    xpu_has_xmx_support,
 )
 from sglang.srt.utils.offloader import (
     create_offloader_from_server_args,
@@ -195,6 +196,7 @@ def add_chunked_prefix_cache_attention_backend(backend_name):
 _is_hip = is_hip()
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
+_is_xpu_xmx_available = xpu_has_xmx_support()
 
 # Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
@@ -505,6 +507,16 @@ class ModelRunner:
             )
             server_args.attention_backend = "torch_native"
 
+        if (
+            server_args.attention_backend == "intel_xpu"
+            and server_args.device == "xpu"
+            and not _is_xpu_xmx_available
+        ):
+            logger.info(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            server_args.attention_backend = "triton"
+
         if server_args.prefill_attention_backend is not None and (
            server_args.prefill_attention_backend
            == server_args.decode_attention_backend
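
Note: xpu_has_xmx_support() is imported from sglang.srt.utils, but its body is not part of this diff. A minimal sketch of how such a capability probe could look, assuming PyTorch's torch.xpu device-properties API (the capability attribute name here is an assumption, not taken from this commit):

    # Hypothetical sketch; not the actual sglang implementation.
    import torch

    def xpu_has_xmx_support() -> bool:
        # No XPU build of PyTorch, or no visible XPU device: no XMX support.
        if not (hasattr(torch, "xpu") and torch.xpu.is_available()):
            return False
        props = torch.xpu.get_device_properties(0)
        # Assumption: recent PyTorch XPU builds expose an XMX capability flag
        # on the device properties; default to False if the attribute is absent.
        return getattr(props, "has_subgroup_matrix_multiply_accumulate", False)

With this change, requesting --attention-backend intel_xpu on an XPU device without XMX no longer fails outright: ModelRunner logs the message above and falls back to the triton backend.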