Init attention backend for Intel XPU (#10656)

Co-authored-by: guangyey <guangye.yu@intel.com> Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com>
2025-10-21 11:41:28 +08:00
parent fb6cc7b000
commit b113c72e7a
18 changed files with 1210 additions and 26 deletions
--- a/4
+++ b/4
@@ -24,7 +24,9 @@ FILES_TO_UPDATE = docker/Dockerfile.rocm \
                 docs/get_started/install.md \
                 docs/platforms/amd_gpu.md \
                 docs/platforms/ascend_npu.md \
-                 benchmark/deepseek_v3/README.md
+				 docs/platforms/cpu_server.md \
+				 docs/platforms/xpu.md \
+				 benchmark/deepseek_v3/README.md

 update: ## Update version numbers across project files. Usage: make update <new_version>
 	@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -48,7 +48,7 @@ RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
-    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
+    pip3 install torch==2.8.0+xpu torchao torchvision torchaudio pytorch-triton-xpu==3.4.0 --index-url https://download.pytorch.org/whl/xpu

 RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
@@ -59,13 +59,8 @@ RUN --mount=type=secret,id=github_token \
    cd sglang && cd python && \
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
-    echo "Cloning ${SG_LANG_KERNEL_REPO} from ${SG_LANG_KERNEL_BRANCH}" && \
-    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
-    cd sgl-kernel-xpu && \
-    pip install -v . && \
+    pip install xgrammar --no-deps && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
-    pip uninstall pytorch-triton-xpu -y && \
-    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install libsqlite=3.48.0 -y && \
    # Add environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc
--- a/docs/advanced_features/attention_backend.md
+++ b/docs/advanced_features/attention_backend.md
@@ -26,6 +26,7 @@ The support matrix is split into two parts: MHA (standard attention) and MLA (mu
 | **AITER (ROCm)**                | ✅                          | ❌               | ✅              | ✅              | ❌                 | ❌             |
 | **Wave (ROCm)**                 | ✅                          | ❌               | ❌              | ❌              | ❌                 | ❌             |
 | **Ascend (NPU)**                | ✅                          | ❌               | ❌              | ❌              | ❌                 | ❌             |
+| **Intel XPU**                   | ✅                          | ❌               | ❌              | ❌              | ✅                 | ❌             |

 ### MLA Backends

@@ -190,6 +191,13 @@ python3 -m sglang.launch_server \
  --attention-backend ascend
 ```

+- Intel XPU
+```bash
+python3 -m sglang.launch_server \
+  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+  --attention-backend intel_xpu
+```
+
 - Wave
 ```bash
 python3 -m sglang.launch_server \
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -75,6 +75,7 @@ Its core features include:
   platforms/tpu.md
   platforms/nvidia_jetson.md
   platforms/ascend_npu.md
+   platforms/xpu.md

 .. toctree::
   :maxdepth: 1
--- a/docs/platforms/xpu.md
+++ b/docs/platforms/xpu.md
@@ -0,0 +1,92 @@
+# XPU
+
+This document describes how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on Intel GPUs. For more context about Intel GPU support within the PyTorch ecosystem, see [Getting Started on Intel GPU](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html).
+
+Specifically, SGLang is optimized for [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/ark/products/series/242616/intel-arc-pro-b-series-graphics.html) and
+[Intel® Arc™ B-Series Graphics](https://www.intel.com/content/www/us/en/ark/products/series/240391/intel-arc-b-series-graphics.html).
+
+## Optimized Model List
+
+The following LLMs have been optimized on Intel GPUs, and more are on the way:
+
+| Model Name | BF16 |
+|:---:|:---:|
+| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
+| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) |
+| Qwen2.5-1.5B |   [Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B) |
+
+**Note:** The model identifiers listed in the table above
+have been verified on [Intel® Arc™ B580 Graphics](https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html).
+
+## Installation
+
+### Install From Source
+
+Currently, SGLang on XPU can only be installed from source. Please refer to ["Getting Started on Intel GPU"](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html) to install the XPU dependencies.
+
+```bash
+# Create and activate a conda environment
+conda create -n sgl-xpu python=3.12 -y
+conda activate sgl-xpu
+
+# Set PyTorch XPU as primary pip install channel to avoid installing the larger CUDA-enabled version and prevent potential runtime issues.
+pip3 install torch==2.8.0+xpu torchao torchvision torchaudio pytorch-triton-xpu==3.4.0 --index-url https://download.pytorch.org/whl/xpu
+pip3 install xgrammar --no-deps # xgrammar will introduce CUDA-enabled triton which might conflict with XPU
+
+# Clone the SGLang code
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+git checkout <YOUR-DESIRED-VERSION>
+
+# Use dedicated toml file
+cd python
+cp pyproject_xpu.toml pyproject.toml
+# Install SGLang dependent libs, and build SGLang main package
+pip install --upgrade pip setuptools
+pip install -v .
+```
+
+### Install Using Docker
+
+The Docker image for XPU is under active development. Please stay tuned.
+
+## Launch of the Serving Engine
+
+Example command to launch SGLang serving. `--tp 2` runs on multiple GPUs, `--attention-backend intel_xpu` selects the Intel-optimized XPU attention backend, and `--page-size` must be 32, 64, or 128 for that backend:
+
+```bash
+python -m sglang.launch_server       \
+    --model <MODEL_ID_OR_PATH>       \
+    --trust-remote-code              \
+    --disable-overlap-schedule       \
+    --device xpu                     \
+    --host 0.0.0.0                   \
+    --tp 2                           \
+    --attention-backend intel_xpu    \
+    --page-size 128
+```
+
+## Benchmarking with Requests
+
+You can benchmark the performance via the `bench_serving` script.
+Run the command in another terminal.
+
+```bash
+python -m sglang.bench_serving   \
+    --dataset-name random        \
+    --random-input-len 1024      \
+    --random-output-len 1024     \
+    --num-prompts 1              \
+    --request-rate inf           \
+    --random-range-ratio 1.0
+```
+
+Detailed explanations of the parameters can be displayed with the command:
+
+```bash
+python -m sglang.bench_serving -h
+```
+
+Additionally, requests can be formed with the
+[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+and sent via the command line (e.g. using `curl`) or via your own script.
--- a/python/pyproject_xpu.toml
+++ b/python/pyproject_xpu.toml
@@ -1,5 +1,3 @@
-# xpu is not enabled in public vllm and torch whl,
-# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 [build-system]
 requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
@@ -17,6 +15,10 @@ classifiers = [
 ]

 dependencies = [
+  "torch==2.8.0",
+  "torchaudio==2.8.0",
+  "torchvision",
+  "sgl-kernel @ git+https://github.com/sgl-project/sgl-kernel-xpu.git",
  "IPython",
  "aiohttp",
  "anthropic>=0.20.0",
@@ -61,7 +63,7 @@ dependencies = [
  "transformers==4.57.1",
  "uvicorn",
  "uvloop",
-  "xgrammar==0.1.25",
+  # "xgrammar==0.1.24",  # excluded: xgrammar depends on CUDA PyTorch and Triton only
  "grpcio==1.75.1", # keep it align with compile_proto.py
  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -272,7 +272,7 @@ def prepare_synthetic_inputs_for_latency_test(
 def extend(reqs, model_runner):
    # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
    dummy_tree_cache = SimpleNamespace(
-        page_size=1,
+        page_size=model_runner.server_args.page_size,
        device=model_runner.device,
        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
    )
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -50,11 +50,13 @@ from sglang.srt.utils import (
    is_hip,
    is_npu,
    is_shm_available,
+    is_xpu,
    supports_custom_op,
 )

 _is_npu = is_npu()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()
 _supports_custom_op = supports_custom_op()


@@ -694,7 +696,7 @@ class GroupCoordinator:
            )

    def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
-        if _is_npu or not _supports_custom_op:
+        if _is_npu or _is_xpu or not _supports_custom_op:
            self._all_gather_into_tensor(output, input)
        else:
            torch.ops.sglang.reg_all_gather_into_tensor(
@@ -1298,7 +1300,7 @@ def init_model_parallel_group(
        group_ranks=group_ranks,
        local_rank=local_rank,
        torch_distributed_backend=backend,
-        use_pynccl=not _is_npu,
+        use_pynccl=not (_is_npu or _is_xpu),
        use_pymscclpp=use_mscclpp_allreduce,
        use_custom_allreduce=use_custom_allreduce,
        use_torch_symm_mem=use_symm_mem_allreduce,
--- a/python/sglang/srt/layers/attention/attention_registry.py
+++ b/python/sglang/srt/layers/attention/attention_registry.py
@@ -217,3 +217,10 @@ def attn_backend_wrapper(runner: "ModelRunner", full_attn_backend: "AttentionBac
        )

    return full_attn_backend
+
+
+@register_attention_backend("intel_xpu")
+def create_intel_xpu_backend(runner):
+    from sglang.srt.layers.attention.xpu_backend import XPUAttentionBackend
+
+    return XPUAttentionBackend(runner)
--- a/python/sglang/srt/layers/attention/fla/layernorm_gated.py
+++ b/python/sglang/srt/layers/attention/fla/layernorm_gated.py
@@ -12,6 +12,8 @@ import triton
 import triton.language as tl
 from einops import rearrange

+from sglang.srt.utils import device_context
+

 def rms_norm_ref(
    x,
@@ -157,7 +159,7 @@ def _layer_norm_fwd(
    # heuristics for number of warps
    num_warps = min(max(BLOCK_N // 256, 1), 8)
    grid = (M, ngroups)
-    with torch.get_device_module(x.device).device(x.device.index):
+    with device_context(x.device):
        _layer_norm_fwd_1pass_kernel[grid](
            x,
            out,
--- a/python/sglang/srt/layers/attention/xpu_backend.py
+++ b/python/sglang/srt/layers/attention/xpu_backend.py
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -42,7 +42,7 @@ _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
 _is_xpu = is_xpu()

-if _is_cuda:
+if _is_cuda or _is_xpu:
    # if _is_flashinfer_available:
    #     from flashinfer.norm import fused_add_rmsnorm
    # else:
@@ -52,13 +52,6 @@ if _is_cuda:
        gemma_rmsnorm,
        rmsnorm,
    )
-elif _is_xpu:
-    from sgl_kernel import (
-        fused_add_rmsnorm,
-        gemma_fused_add_rmsnorm,
-        gemma_rmsnorm,
-        rmsnorm,
-    )
 if _use_aiter:
    from aiter import rmsnorm2d_fwd as rms_norm
    from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
--- a/python/sglang/srt/layers/quantization/awq.py
+++ b/python/sglang/srt/layers/quantization/awq.py
@@ -39,10 +39,11 @@ if TYPE_CHECKING:
        CombineInput,
    )

-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, is_xpu

 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_xpu = is_xpu()
 if _is_cuda:
    from sgl_kernel import (
        awq_dequantize,
@@ -58,8 +59,12 @@ elif _is_hip:
    )

    warnings.warn(f"HIP does not support fused_marlin_moe currently.")
+elif _is_xpu:
+    from sgl_kernel import awq_dequantize
+
+    warnings.warn(f"XPU does not support fused_marlin_moe currently.")
 else:
-    warnings.warn(f"Only CUDA and HIP support AWQ currently.")
+    warnings.warn(f"Only CUDA, HIP and XPU support AWQ currently.")

 logger = logging.getLogger(__name__)

--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -115,7 +115,7 @@ class RotaryEmbedding(CustomOp):
        if dtype == torch.float32 or (
            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
            and not (_is_cpu and _is_cpu_amx_available)
-            and not _is_xpu
+            and not (_is_xpu)
        ):
            from vllm._custom_ops import rotary_embedding

@@ -302,6 +302,7 @@ class RotaryEmbedding(CustomOp):
        offsets: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # TODO: make a wrapper, and XPU will implement this kernel later.
+        self.cos_sin_cache = self.cos_sin_cache.to(query.device)
        return self.forward_native(positions, query, key, offsets)


--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -142,6 +142,7 @@ from sglang.srt.utils import (
    monkey_patch_vllm_gguf_config,
    set_cuda_arch,
    slow_rank_detector,
+    xpu_has_xmx_support,
 )
 from sglang.srt.utils.offloader import (
    create_offloader_from_server_args,
@@ -195,6 +196,7 @@ def add_chunked_prefix_cache_attention_backend(backend_name):
 _is_hip = is_hip()
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
+_is_xpu_xmx_available = xpu_has_xmx_support()

 # Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
@@ -505,6 +507,16 @@ class ModelRunner:
            )
            server_args.attention_backend = "torch_native"

+        if (
+            server_args.attention_backend == "intel_xpu"
+            and server_args.device == "xpu"
+            and not _is_xpu_xmx_available
+        ):
+            logger.info(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            server_args.attention_backend = "triton"
+
        if server_args.prefill_attention_backend is not None and (
            server_args.prefill_attention_backend
            == server_args.decode_attention_backend
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -114,6 +114,7 @@ ATTENTION_BACKEND_CHOICES = [
    # Other platforms
    "intel_amx",
    "ascend",
+    "intel_xpu",
 ]

 LORA_BACKEND_CHOICES = ["triton", "csgmv"]
@@ -1098,6 +1099,12 @@ class ServerArgs:
            self.enable_mixed_chunk = False
            self.disable_radix_cache = True

+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
        if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
            raise ValueError(
                "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -163,6 +163,20 @@ def _check(cc_major):
    ) >= (12, 3)


+@contextmanager
+def device_context(device: torch.device):
+    if device.type == "cpu" and is_cpu():
+        with torch.device("cpu"):
+            yield
+    else:
+        module = torch.get_device_module(device)
+        if module is not None:
+            with module.device(device.index):
+                yield
+        else:
+            raise ValueError(f"Unknown device module: {device}")
+
+
 is_ampere_with_cuda_12_3 = lambda: _check(8)
 is_hopper_with_cuda_12_3 = lambda: _check(9)

@@ -263,6 +277,14 @@ def use_intel_amx_backend(layer):
    return getattr(layer, "use_intel_amx_backend", False)


+def xpu_has_xmx_support():
+    # TODO: update with XPU capability query
+    if is_xpu():
+        # currently only PVC/LNL/BMG supports F64, so we only support these now
+        return torch.xpu.get_device_properties().has_fp64
+    return False
+
+
 def is_flashinfer_available():
    """
    Check whether flashinfer is available.
--- a/test/srt/xpu/test_intel_xpu_backend.py
+++ b/test/srt/xpu/test_intel_xpu_backend.py
@@ -8,6 +8,7 @@ import unittest
 from functools import wraps

 from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    CustomTestCase,
    is_in_ci,
@@ -55,6 +56,10 @@ class TestIntelXPUBackend(CustomTestCase):
    def test_latency_qwen_model(self):
        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN

+    @intel_xpu_benchmark(["--attention-backend", "intel_xpu", "--page-size", "128"])
+    def test_attention_backend(self):
+        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
+

 if __name__ == "__main__":
    unittest.main()