Init attention backend for Intel XPU (#10656)
Co-authored-by: guangyey <guangye.yu@intel.com> Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com>
This commit is contained in:
4
Makefile
4
Makefile
@@ -24,7 +24,9 @@ FILES_TO_UPDATE = docker/Dockerfile.rocm \
|
|||||||
docs/get_started/install.md \
|
docs/get_started/install.md \
|
||||||
docs/platforms/amd_gpu.md \
|
docs/platforms/amd_gpu.md \
|
||||||
docs/platforms/ascend_npu.md \
|
docs/platforms/ascend_npu.md \
|
||||||
benchmark/deepseek_v3/README.md
|
docs/platforms/cpu_server.md \
|
||||||
|
docs/platforms/xpu.md \
|
||||||
|
benchmark/deepseek_v3/README.md
|
||||||
|
|
||||||
update: ## Update version numbers across project files. Usage: make update <new_version>
|
update: ## Update version numbers across project files. Usage: make update <new_version>
|
||||||
@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
|
@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ RUN --mount=type=secret,id=github_token \
|
|||||||
cd /home/sdp && \
|
cd /home/sdp && \
|
||||||
. /home/sdp/miniforge3/bin/activate && \
|
. /home/sdp/miniforge3/bin/activate && \
|
||||||
conda activate py${PYTHON_VERSION} && \
|
conda activate py${PYTHON_VERSION} && \
|
||||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
|
pip3 install torch==2.8.0+xpu torchao torchvision torchaudio pytorch-triton-xpu==3.4.0 --index-url https://download.pytorch.org/whl/xpu
|
||||||
|
|
||||||
RUN --mount=type=secret,id=github_token \
|
RUN --mount=type=secret,id=github_token \
|
||||||
cd /home/sdp && \
|
cd /home/sdp && \
|
||||||
@@ -59,13 +59,8 @@ RUN --mount=type=secret,id=github_token \
|
|||||||
cd sglang && cd python && \
|
cd sglang && cd python && \
|
||||||
cp pyproject_xpu.toml pyproject.toml && \
|
cp pyproject_xpu.toml pyproject.toml && \
|
||||||
pip install . && \
|
pip install . && \
|
||||||
echo "Cloning ${SG_LANG_KERNEL_REPO} from ${SG_LANG_KERNEL_BRANCH}" && \
|
pip install xgrammar --no-deps && \
|
||||||
git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
|
|
||||||
cd sgl-kernel-xpu && \
|
|
||||||
pip install -v . && \
|
|
||||||
pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
|
pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
|
||||||
pip uninstall pytorch-triton-xpu -y && \
|
|
||||||
pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
|
|
||||||
conda install libsqlite=3.48.0 -y && \
|
conda install libsqlite=3.48.0 -y && \
|
||||||
# Add environment setup commands to .bashrc again (in case it was overwritten)
|
# Add environment setup commands to .bashrc again (in case it was overwritten)
|
||||||
echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc
|
echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ The support matrix is split into two parts: MHA (standard attention) and MLA (mu
|
|||||||
| **AITER (ROCm)** | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
| **AITER (ROCm)** | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| **Wave (ROCm)** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| **Wave (ROCm)** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| **Ascend (NPU)** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| **Ascend (NPU)** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
|
| **Intel XPU** | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
|
|
||||||
### MLA Backends
|
### MLA Backends
|
||||||
|
|
||||||
@@ -190,6 +191,13 @@ python3 -m sglang.launch_server \
|
|||||||
--attention-backend ascend
|
--attention-backend ascend
|
||||||
```
|
```
|
||||||
|
|
||||||
|
- Intel XPU
|
||||||
|
```bash
|
||||||
|
python3 -m sglang.launch_server \
|
||||||
|
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||||
|
--attention-backend intel_xpu
|
||||||
|
```
|
||||||
|
|
||||||
- Wave
|
- Wave
|
||||||
```bash
|
```bash
|
||||||
python3 -m sglang.launch_server \
|
python3 -m sglang.launch_server \
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ Its core features include:
|
|||||||
platforms/tpu.md
|
platforms/tpu.md
|
||||||
platforms/nvidia_jetson.md
|
platforms/nvidia_jetson.md
|
||||||
platforms/ascend_npu.md
|
platforms/ascend_npu.md
|
||||||
|
platforms/xpu.md
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|||||||
92
docs/platforms/xpu.md
Normal file
92
docs/platforms/xpu.md
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
# XPU
|
||||||
|
|
||||||
|
This document describes how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on Intel GPUs. For more context, see [Intel GPU support within the PyTorch ecosystem](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html).
|
||||||
|
|
||||||
|
Specifically, SGLang is optimized for [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/ark/products/series/242616/intel-arc-pro-b-series-graphics.html) and [
|
||||||
|
Intel® Arc™ B-Series Graphics](https://www.intel.com/content/www/us/en/ark/products/series/240391/intel-arc-b-series-graphics.html).
|
||||||
|
|
||||||
|
## Optimized Model List
|
||||||
|
|
||||||
|
The following LLMs have been optimized for Intel GPUs, and more are on the way:
|
||||||
|
|
||||||
|
| Model Name | BF16 |
|
||||||
|
|:---:|:---:|
|
||||||
|
| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
|
||||||
|
| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) |
|
||||||
|
| Qwen2.5-1.5B | [Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B) |
|
||||||
|
|
||||||
|
**Note:** The model identifiers listed in the table above
|
||||||
|
have been verified on [Intel® Arc™ B580 Graphics](https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html).
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Install From Source
|
||||||
|
|
||||||
|
Currently, SGLang on XPU can only be installed from source. Please refer to ["Getting Started on Intel GPU"](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html) to install the XPU dependencies.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create and activate a conda environment
|
||||||
|
conda create -n sgl-xpu python=3.12 -y
|
||||||
|
conda activate sgl-xpu
|
||||||
|
|
||||||
|
# Set PyTorch XPU as primary pip install channel to avoid installing the larger CUDA-enabled version and prevent potential runtime issues.
|
||||||
|
pip3 install torch==2.8.0+xpu torchao torchvision torchaudio pytorch-triton-xpu==3.4.0 --index-url https://download.pytorch.org/whl/xpu
|
||||||
|
pip3 install xgrammar --no-deps # xgrammar will introduce CUDA-enabled triton which might conflict with XPU
|
||||||
|
|
||||||
|
# Clone the SGLang code
|
||||||
|
git clone https://github.com/sgl-project/sglang.git
|
||||||
|
cd sglang
|
||||||
|
git checkout <YOUR-DESIRED-VERSION>
|
||||||
|
|
||||||
|
# Use dedicated toml file
|
||||||
|
cd python
|
||||||
|
cp pyproject_xpu.toml pyproject.toml
|
||||||
|
# Install SGLang dependent libs, and build SGLang main package
|
||||||
|
pip install --upgrade pip setuptools
|
||||||
|
pip install -v .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Install Using Docker
|
||||||
|
|
||||||
|
The docker for XPU is under active development. Please stay tuned.
|
||||||
|
|
||||||
|
## Launch of the Serving Engine
|
||||||
|
|
||||||
|
Example command to launch SGLang serving. `--tp 2` uses multiple GPUs, `--attention-backend intel_xpu` selects the Intel-optimized XPU attention backend, and `--page-size` must be 32, 64, or 128 when that backend is used:

```bash
python -m sglang.launch_server \
    --model <MODEL_ID_OR_PATH> \
    --trust-remote-code \
    --disable-overlap-schedule \
    --device xpu \
    --host 0.0.0.0 \
    --tp 2 \
    --attention-backend intel_xpu \
    --page-size 128
```
|
||||||
|
|
||||||
|
## Benchmarking with Requests
|
||||||
|
|
||||||
|
You can benchmark the performance via the `bench_serving` script.
|
||||||
|
Run the command in another terminal.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m sglang.bench_serving \
|
||||||
|
--dataset-name random \
|
||||||
|
--random-input-len 1024 \
|
||||||
|
--random-output-len 1024 \
|
||||||
|
--num-prompts 1 \
|
||||||
|
--request-rate inf \
|
||||||
|
--random-range-ratio 1.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Detailed explanations of the parameters can be displayed with the command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m sglang.bench_serving -h
|
||||||
|
```
|
||||||
|
|
||||||
|
Additionally, the requests can be formed with
|
||||||
|
[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
|
||||||
|
and sent via the command line (e.g. using `curl`) or via your own script.
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
# xpu is not enabled in public vllm and torch whl,
|
|
||||||
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools>=61.0", "wheel"]
|
requires = ["setuptools>=61.0", "wheel"]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
@@ -17,6 +15,10 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"torch==2.8.0",
|
||||||
|
"torchaudio==2.8.0",
|
||||||
|
"torchvision",
|
||||||
|
"sgl-kernel @ git+https://github.com/sgl-project/sgl-kernel-xpu.git",
|
||||||
"IPython",
|
"IPython",
|
||||||
"aiohttp",
|
"aiohttp",
|
||||||
"anthropic>=0.20.0",
|
"anthropic>=0.20.0",
|
||||||
@@ -61,7 +63,7 @@ dependencies = [
|
|||||||
"transformers==4.57.1",
|
"transformers==4.57.1",
|
||||||
"uvicorn",
|
"uvicorn",
|
||||||
"uvloop",
|
"uvloop",
|
||||||
"xgrammar==0.1.25",
|
# "xgrammar==0.1.24", , xgrammar depends on CUDA PyTorch and Triton only
|
||||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||||
|
|||||||
@@ -272,7 +272,7 @@ def prepare_synthetic_inputs_for_latency_test(
|
|||||||
def extend(reqs, model_runner):
|
def extend(reqs, model_runner):
|
||||||
# Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
|
# Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
|
||||||
dummy_tree_cache = SimpleNamespace(
|
dummy_tree_cache = SimpleNamespace(
|
||||||
page_size=1,
|
page_size=model_runner.server_args.page_size,
|
||||||
device=model_runner.device,
|
device=model_runner.device,
|
||||||
token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
|
token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -50,11 +50,13 @@ from sglang.srt.utils import (
|
|||||||
is_hip,
|
is_hip,
|
||||||
is_npu,
|
is_npu,
|
||||||
is_shm_available,
|
is_shm_available,
|
||||||
|
is_xpu,
|
||||||
supports_custom_op,
|
supports_custom_op,
|
||||||
)
|
)
|
||||||
|
|
||||||
_is_npu = is_npu()
|
_is_npu = is_npu()
|
||||||
_is_cpu = is_cpu()
|
_is_cpu = is_cpu()
|
||||||
|
_is_xpu = is_xpu()
|
||||||
_supports_custom_op = supports_custom_op()
|
_supports_custom_op = supports_custom_op()
|
||||||
|
|
||||||
|
|
||||||
@@ -694,7 +696,7 @@ class GroupCoordinator:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
|
def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
|
||||||
if _is_npu or not _supports_custom_op:
|
if _is_npu or _is_xpu or not _supports_custom_op:
|
||||||
self._all_gather_into_tensor(output, input)
|
self._all_gather_into_tensor(output, input)
|
||||||
else:
|
else:
|
||||||
torch.ops.sglang.reg_all_gather_into_tensor(
|
torch.ops.sglang.reg_all_gather_into_tensor(
|
||||||
@@ -1298,7 +1300,7 @@ def init_model_parallel_group(
|
|||||||
group_ranks=group_ranks,
|
group_ranks=group_ranks,
|
||||||
local_rank=local_rank,
|
local_rank=local_rank,
|
||||||
torch_distributed_backend=backend,
|
torch_distributed_backend=backend,
|
||||||
use_pynccl=not _is_npu,
|
use_pynccl=not (_is_npu or _is_xpu),
|
||||||
use_pymscclpp=use_mscclpp_allreduce,
|
use_pymscclpp=use_mscclpp_allreduce,
|
||||||
use_custom_allreduce=use_custom_allreduce,
|
use_custom_allreduce=use_custom_allreduce,
|
||||||
use_torch_symm_mem=use_symm_mem_allreduce,
|
use_torch_symm_mem=use_symm_mem_allreduce,
|
||||||
|
|||||||
@@ -217,3 +217,10 @@ def attn_backend_wrapper(runner: "ModelRunner", full_attn_backend: "AttentionBac
|
|||||||
)
|
)
|
||||||
|
|
||||||
return full_attn_backend
|
return full_attn_backend
|
||||||
|
|
||||||
|
|
||||||
|
@register_attention_backend("intel_xpu")
def create_intel_xpu_backend(runner):
    """Build the attention backend registered under the name ``intel_xpu``.

    The backend module is imported inside the factory rather than at module
    import time, so it is only loaded when this factory is actually called.

    Args:
        runner: The model runner handed to ``XPUAttentionBackend``.

    Returns:
        A new ``XPUAttentionBackend`` constructed from ``runner``.
    """
    from sglang.srt.layers.attention import xpu_backend

    return xpu_backend.XPUAttentionBackend(runner)
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ import triton
|
|||||||
import triton.language as tl
|
import triton.language as tl
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
|
||||||
|
from sglang.srt.utils import device_context
|
||||||
|
|
||||||
|
|
||||||
def rms_norm_ref(
|
def rms_norm_ref(
|
||||||
x,
|
x,
|
||||||
@@ -157,7 +159,7 @@ def _layer_norm_fwd(
|
|||||||
# heuristics for number of warps
|
# heuristics for number of warps
|
||||||
num_warps = min(max(BLOCK_N // 256, 1), 8)
|
num_warps = min(max(BLOCK_N // 256, 1), 8)
|
||||||
grid = (M, ngroups)
|
grid = (M, ngroups)
|
||||||
with torch.get_device_module(x.device).device(x.device.index):
|
with device_context(x.device):
|
||||||
_layer_norm_fwd_1pass_kernel[grid](
|
_layer_norm_fwd_1pass_kernel[grid](
|
||||||
x,
|
x,
|
||||||
out,
|
out,
|
||||||
|
|||||||
1028
python/sglang/srt/layers/attention/xpu_backend.py
Normal file
1028
python/sglang/srt/layers/attention/xpu_backend.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -42,7 +42,7 @@ _is_cpu_amx_available = cpu_has_amx_support()
|
|||||||
_is_cpu = is_cpu()
|
_is_cpu = is_cpu()
|
||||||
_is_xpu = is_xpu()
|
_is_xpu = is_xpu()
|
||||||
|
|
||||||
if _is_cuda:
|
if _is_cuda or _is_xpu:
|
||||||
# if _is_flashinfer_available:
|
# if _is_flashinfer_available:
|
||||||
# from flashinfer.norm import fused_add_rmsnorm
|
# from flashinfer.norm import fused_add_rmsnorm
|
||||||
# else:
|
# else:
|
||||||
@@ -52,13 +52,6 @@ if _is_cuda:
|
|||||||
gemma_rmsnorm,
|
gemma_rmsnorm,
|
||||||
rmsnorm,
|
rmsnorm,
|
||||||
)
|
)
|
||||||
elif _is_xpu:
|
|
||||||
from sgl_kernel import (
|
|
||||||
fused_add_rmsnorm,
|
|
||||||
gemma_fused_add_rmsnorm,
|
|
||||||
gemma_rmsnorm,
|
|
||||||
rmsnorm,
|
|
||||||
)
|
|
||||||
if _use_aiter:
|
if _use_aiter:
|
||||||
from aiter import rmsnorm2d_fwd as rms_norm
|
from aiter import rmsnorm2d_fwd as rms_norm
|
||||||
from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
|
from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
|
||||||
|
|||||||
@@ -39,10 +39,11 @@ if TYPE_CHECKING:
|
|||||||
CombineInput,
|
CombineInput,
|
||||||
)
|
)
|
||||||
|
|
||||||
from sglang.srt.utils import is_cuda, is_hip
|
from sglang.srt.utils import is_cuda, is_hip, is_xpu
|
||||||
|
|
||||||
_is_cuda = is_cuda()
|
_is_cuda = is_cuda()
|
||||||
_is_hip = is_hip()
|
_is_hip = is_hip()
|
||||||
|
_is_xpu = is_xpu()
|
||||||
if _is_cuda:
|
if _is_cuda:
|
||||||
from sgl_kernel import (
|
from sgl_kernel import (
|
||||||
awq_dequantize,
|
awq_dequantize,
|
||||||
@@ -58,8 +59,12 @@ elif _is_hip:
|
|||||||
)
|
)
|
||||||
|
|
||||||
warnings.warn(f"HIP does not support fused_marlin_moe currently.")
|
warnings.warn(f"HIP does not support fused_marlin_moe currently.")
|
||||||
|
elif _is_xpu:
|
||||||
|
from sgl_kernel import awq_dequantize
|
||||||
|
|
||||||
|
warnings.warn(f"XPU does not support fused_marlin_moe currently.")
|
||||||
else:
|
else:
|
||||||
warnings.warn(f"Only CUDA and HIP support AWQ currently.")
|
warnings.warn(f"Only CUDA, HIP and XPU support AWQ currently.")
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ class RotaryEmbedding(CustomOp):
|
|||||||
if dtype == torch.float32 or (
|
if dtype == torch.float32 or (
|
||||||
(not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
|
(not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
|
||||||
and not (_is_cpu and _is_cpu_amx_available)
|
and not (_is_cpu and _is_cpu_amx_available)
|
||||||
and not _is_xpu
|
and not (_is_xpu)
|
||||||
):
|
):
|
||||||
from vllm._custom_ops import rotary_embedding
|
from vllm._custom_ops import rotary_embedding
|
||||||
|
|
||||||
@@ -302,6 +302,7 @@ class RotaryEmbedding(CustomOp):
|
|||||||
offsets: Optional[torch.Tensor] = None,
|
offsets: Optional[torch.Tensor] = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
# TODO: make a wrapper, and XPU will implement this kernel later.
|
# TODO: make a wrapper, and XPU will implement this kernel later.
|
||||||
|
self.cos_sin_cache = self.cos_sin_cache.to(query.device)
|
||||||
return self.forward_native(positions, query, key, offsets)
|
return self.forward_native(positions, query, key, offsets)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -142,6 +142,7 @@ from sglang.srt.utils import (
|
|||||||
monkey_patch_vllm_gguf_config,
|
monkey_patch_vllm_gguf_config,
|
||||||
set_cuda_arch,
|
set_cuda_arch,
|
||||||
slow_rank_detector,
|
slow_rank_detector,
|
||||||
|
xpu_has_xmx_support,
|
||||||
)
|
)
|
||||||
from sglang.srt.utils.offloader import (
|
from sglang.srt.utils.offloader import (
|
||||||
create_offloader_from_server_args,
|
create_offloader_from_server_args,
|
||||||
@@ -195,6 +196,7 @@ def add_chunked_prefix_cache_attention_backend(backend_name):
|
|||||||
_is_hip = is_hip()
|
_is_hip = is_hip()
|
||||||
_is_npu = is_npu()
|
_is_npu = is_npu()
|
||||||
_is_cpu_amx_available = cpu_has_amx_support()
|
_is_cpu_amx_available = cpu_has_amx_support()
|
||||||
|
_is_xpu_xmx_available = xpu_has_xmx_support()
|
||||||
|
|
||||||
# Use a small KV cache pool size for tests in CI
|
# Use a small KV cache pool size for tests in CI
|
||||||
SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
|
SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
|
||||||
@@ -505,6 +507,16 @@ class ModelRunner:
|
|||||||
)
|
)
|
||||||
server_args.attention_backend = "torch_native"
|
server_args.attention_backend = "torch_native"
|
||||||
|
|
||||||
|
if (
|
||||||
|
server_args.attention_backend == "intel_xpu"
|
||||||
|
and server_args.device == "xpu"
|
||||||
|
and not _is_xpu_xmx_available
|
||||||
|
):
|
||||||
|
logger.info(
|
||||||
|
"The current platform does not support Intel XMX, will fallback to triton backend."
|
||||||
|
)
|
||||||
|
server_args.attention_backend = "triton"
|
||||||
|
|
||||||
if server_args.prefill_attention_backend is not None and (
|
if server_args.prefill_attention_backend is not None and (
|
||||||
server_args.prefill_attention_backend
|
server_args.prefill_attention_backend
|
||||||
== server_args.decode_attention_backend
|
== server_args.decode_attention_backend
|
||||||
|
|||||||
@@ -114,6 +114,7 @@ ATTENTION_BACKEND_CHOICES = [
|
|||||||
# Other platforms
|
# Other platforms
|
||||||
"intel_amx",
|
"intel_amx",
|
||||||
"ascend",
|
"ascend",
|
||||||
|
"intel_xpu",
|
||||||
]
|
]
|
||||||
|
|
||||||
LORA_BACKEND_CHOICES = ["triton", "csgmv"]
|
LORA_BACKEND_CHOICES = ["triton", "csgmv"]
|
||||||
@@ -1098,6 +1099,12 @@ class ServerArgs:
|
|||||||
self.enable_mixed_chunk = False
|
self.enable_mixed_chunk = False
|
||||||
self.disable_radix_cache = True
|
self.disable_radix_cache = True
|
||||||
|
|
||||||
|
if self.attention_backend == "intel_xpu":
|
||||||
|
if self.page_size not in [32, 64, 128]:
|
||||||
|
logger.warning(
|
||||||
|
f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
|
||||||
|
)
|
||||||
|
self.page_size = 128
|
||||||
if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
|
if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
|
"FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
|
||||||
|
|||||||
@@ -163,6 +163,20 @@ def _check(cc_major):
|
|||||||
) >= (12, 3)
|
) >= (12, 3)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def device_context(device: torch.device):
    """Enter the device-specific context for ``device`` around a ``with`` body.

    A CPU device on a CPU build (``is_cpu()`` is true) uses
    ``torch.device("cpu")`` as the context. Every other case uses
    ``<module>.device(device.index)``, where ``<module>`` is looked up with
    ``torch.get_device_module(device)``.

    Args:
        device: The device whose context should be active inside the block.

    Raises:
        ValueError: If the device-module lookup yields ``None``.
    """
    if device.type == "cpu" and is_cpu():
        with torch.device("cpu"):
            yield
        return

    backend_module = torch.get_device_module(device)
    if backend_module is None:
        raise ValueError(f"Unknown device module: {device}")

    with backend_module.device(device.index):
        yield
|
||||||
|
|
||||||
|
|
||||||
is_ampere_with_cuda_12_3 = lambda: _check(8)
|
is_ampere_with_cuda_12_3 = lambda: _check(8)
|
||||||
is_hopper_with_cuda_12_3 = lambda: _check(9)
|
is_hopper_with_cuda_12_3 = lambda: _check(9)
|
||||||
|
|
||||||
@@ -263,6 +277,14 @@ def use_intel_amx_backend(layer):
|
|||||||
return getattr(layer, "use_intel_amx_backend", False)
|
return getattr(layer, "use_intel_amx_backend", False)
|
||||||
|
|
||||||
|
|
||||||
|
def xpu_has_xmx_support():
    """Report whether the current XPU platform is treated as XMX-capable.

    Returns:
        ``False`` when ``is_xpu()`` is false; otherwise the ``has_fp64`` flag
        of ``torch.xpu.get_device_properties()``.
    """
    # TODO: update with XPU capability query
    if not is_xpu():
        return False

    # Currently only PVC/LNL/BMG support FP64, so only those platforms are
    # supported for now; FP64 availability is used as the check.
    xpu_props = torch.xpu.get_device_properties()
    return xpu_props.has_fp64
|
||||||
|
|
||||||
|
|
||||||
def is_flashinfer_available():
|
def is_flashinfer_available():
|
||||||
"""
|
"""
|
||||||
Check whether flashinfer is available.
|
Check whether flashinfer is available.
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import unittest
|
|||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
|
||||||
from sglang.test.test_utils import (
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
|
||||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
|
||||||
CustomTestCase,
|
CustomTestCase,
|
||||||
is_in_ci,
|
is_in_ci,
|
||||||
@@ -55,6 +56,10 @@ class TestIntelXPUBackend(CustomTestCase):
|
|||||||
def test_latency_qwen_model(self):
|
def test_latency_qwen_model(self):
|
||||||
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
|
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
|
||||||
|
|
||||||
|
@intel_xpu_benchmark(["--attention-backend", "intel_xpu", "--page-size", "128"])
def test_attention_backend(self):
    # Selects the intel_xpu attention backend with a page size of 128 and
    # returns the base small test model name.
    # NOTE(review): presumably the decorator runs the benchmark for the
    # returned model with these extra CLI flags; confirm against
    # intel_xpu_benchmark.
    return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user