[CI][XPU]enable sglang CI on Intel XPU (#9493)
Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com> Co-authored-by: Ma Mingfei <mingfei.ma@intel.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
99
.github/workflows/pr-test-xpu.yml
vendored
Normal file
99
.github/workflows/pr-test-xpu.yml
vendored
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# Builds the SGLang XPU Docker image and runs the `per-commit-xpu` test suite
# inside it on the self-hosted `intel-bmg` runner.
name: PR Test (XPU)

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
    types: [synchronize, labeled]
  workflow_dispatch:

# Only one run per ref; a newer run cancels the one in progress.
concurrency:
  group: pr-test-xpu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-and-test:
    # Pull requests run only when they carry the `run-ci` label.
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: intel-bmg
    env:
      HF_HOME: /home/sdp/.cache/huggingface
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image
        # PR-controlled values are passed through the environment instead of
        # being interpolated into the script text, so a crafted branch name
        # cannot inject shell commands.
        env:
          PR_REPO: ${{ github.event.pull_request.head.repo.clone_url }}
          PR_HEAD_REF: ${{ github.head_ref }}
        run: |
          # Both values are empty outside pull_request events; in that case the
          # Dockerfile defaults for SG_LANG_REPO / SG_LANG_BRANCH are used.
          build_args=()
          if [ -n "$PR_REPO" ]; then
            build_args+=(--build-arg "SG_LANG_REPO=$PR_REPO")
          fi
          if [ -n "$PR_HEAD_REF" ]; then
            build_args+=(--build-arg "SG_LANG_BRANCH=$PR_HEAD_REF")
          fi
          docker build \
            "${build_args[@]}" \
            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .

      - name: Run container
        id: start_container
        run: |
          # NOTE(review): group 992 is hard-coded; presumably the runner's GPU
          # access group — confirm it matches the host.
          container_id=$(docker run -dt \
            --group-add 992 \
            --group-add "$(getent group video | cut -d: -f3)" \
            -v "${HF_HOME}:/root/.cache/huggingface" \
            --device /dev/dri \
            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
            xpu_sglang_main:bmg)
          echo "Started container: $container_id"
          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"

      - name: Install Dependency
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
          # Single quotes: HF_TOKEN must expand inside the container, not on the runner.
          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token "${HF_TOKEN}"'
          # Make plain `python3` resolve to the conda environment's interpreter.
          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"

      - name: Run E2E Bfloat16 tests
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          # Single quotes keep LD_LIBRARY_PATH from being expanded by the runner's
          # shell; `export` makes the value visible to the python3 child process.
          docker exec -w /home/sdp/sglang/ "$cid" \
            bash -c 'export LD_LIBRARY_PATH="/home/sdp/miniforge3/envs/py3.10/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu'

      - name: Cleanup container
        if: always()
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker rm -f "$cid" || true

  finish:
    # Always runs so the overall result is reported even when the job above fails.
    # NOTE(review): a skipped build-and-test (PR without `run-ci`) also yields a
    # non-"success" result and therefore fails this job — confirm that is intended.
    if: always()
    needs: [build-and-test]
    runs-on: ubuntu-latest
    steps:
      - name: Check job status
        run: |
          if [ "${{ needs.build-and-test.result }}" != "success" ]; then
            echo "Job failed with result: ${{ needs.build-and-test.result }}"
            exit 1
          fi
          echo "All jobs completed successfully"
          exit 0
|
||||||
78
docker/Dockerfile.xpu
Normal file
78
docker/Dockerfile.xpu
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# Docker image for running SGLang on Intel XPU.
#
# The base image below is pinned to Ubuntu 24.04 and this file declares no
# UBUNTU_VERSION build argument, so passing one has no effect.
# NOTE(review): an earlier header stated that Battlemage devices need
# UBUNTU_VERSION=24.10 — confirm whether a different base image is required.

# Usage: docker build --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .

# Use Intel deep learning essentials base image with Ubuntu 24.04
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04

# Avoid interactive prompts during package install
ENV DEBIAN_FRONTEND=noninteractive

# Define build arguments
ARG PYTHON_VERSION=3.10

ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
ARG SG_LANG_BRANCH=main

ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
ARG SG_LANG_KERNEL_BRANCH=main

RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
    chown -R sdp:sdp /home/sdp

# Switch to non-root user 'sdp'
USER sdp

# Set HOME and WORKDIR to user's home directory
ENV HOME=/home/sdp
WORKDIR /home/sdp

# Install Miniforge and create the py${PYTHON_VERSION} conda environment.
# Only `-o` is given to curl: one URL takes exactly one output option.
RUN curl -fsSL -v -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh && \
    # Initialize conda environment and install pip
    . ./miniforge3/bin/activate && \
    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
    conda activate py${PYTHON_VERSION} && \
    conda install -y pip && \
    # Append environment activation to .bashrc for interactive shells
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc

USER root
# Use apt-get consistently (the `apt` CLI is not meant for scripts) and drop
# the package lists afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y intel-ocloc && \
    rm -rf /var/lib/apt/lists/*

# Switch back to user sdp
USER sdp

# Install the XPU builds of PyTorch into the conda environment.
# The github_token secret is mounted but not read by these commands.
RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

# Clone and install sglang, then sgl-kernel-xpu (cloned inside sglang/python),
# followed by the remaining Python dependencies.
RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
    cd sglang && cd python && \
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
    cd sgl-kernel-xpu && \
    pip install -v . && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
    # Replace the installed pytorch-triton-xpu with the pre-release build
    pip uninstall pytorch-triton-xpu -y && \
    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install libsqlite=3.48.0 -y && \
    # Add environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc

# Use bash for any later shell-form instructions. `bash -c` is non-interactive
# and does not read .bashrc by itself; the CMD below sources it explicitly.
SHELL ["bash", "-c"]

# Start an interactive bash shell with all environment set up
USER sdp
CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
|
||||||
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
|
|||||||
is_cuda,
|
is_cuda,
|
||||||
is_hip,
|
is_hip,
|
||||||
is_npu,
|
is_npu,
|
||||||
|
is_xpu,
|
||||||
)
|
)
|
||||||
|
|
||||||
_is_cuda = is_cuda()
|
_is_cuda = is_cuda()
|
||||||
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
|
|||||||
_is_npu = is_npu()
|
_is_npu = is_npu()
|
||||||
_is_cpu_amx_available = cpu_has_amx_support()
|
_is_cpu_amx_available = cpu_has_amx_support()
|
||||||
_is_cpu = is_cpu()
|
_is_cpu = is_cpu()
|
||||||
|
_is_xpu = is_xpu()
|
||||||
|
|
||||||
if _is_cuda:
|
if _is_cuda:
|
||||||
from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
|
from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
|
||||||
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
|
|||||||
cache = cache.to(dtype)
|
cache = cache.to(dtype)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
|
(not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
|
||||||
) and not (_is_cpu and _is_cpu_amx_available):
|
and not (_is_cpu and _is_cpu_amx_available)
|
||||||
|
and not _is_xpu
|
||||||
|
):
|
||||||
from vllm._custom_ops import rotary_embedding
|
from vllm._custom_ops import rotary_embedding
|
||||||
|
|
||||||
self.vllm_rotary_embedding = rotary_embedding
|
self.vllm_rotary_embedding = rotary_embedding
|
||||||
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
|
|||||||
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
|
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
def forward_xpu(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run the rotary embedding on XPU by delegating to ``forward_native``.

    No XPU-specific kernel is used here; every argument is forwarded
    unchanged and the native result is returned as-is.

    Args:
        positions: Forwarded to ``forward_native``.
        query: Forwarded to ``forward_native``.
        key: Forwarded to ``forward_native``.
        offsets: Optional value forwarded to ``forward_native``.

    Returns:
        Whatever ``forward_native`` returns for the same arguments.
    """
    # TODO: make a wrapper, and XPU will implement this kernel later.
    native_result = self.forward_native(positions, query, key, offsets)
    return native_result
|
||||||
|
|
||||||
|
|
||||||
class LinearScalingRotaryEmbedding(RotaryEmbedding):
|
class LinearScalingRotaryEmbedding(RotaryEmbedding):
|
||||||
"""RotaryEmbedding extended with linear scaling.
|
"""RotaryEmbedding extended with linear scaling.
|
||||||
|
|||||||
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
|
|||||||
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
||||||
|
|
||||||
|
# INT4 models
|
||||||
|
DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
|
||||||
|
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
|
||||||
|
)
|
||||||
|
|
||||||
# EAGLE
|
# EAGLE
|
||||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
||||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
||||||
|
|||||||
@@ -316,6 +316,13 @@ suite_xeon = {
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Add Intel XPU tests
|
||||||
|
suite_xpu = {
|
||||||
|
"per-commit-xpu": [
|
||||||
|
TestFile("xpu/test_intel_xpu_backend.py"),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
# Add Ascend NPU tests
|
# Add Ascend NPU tests
|
||||||
# NOTE: please sort the test cases alphabetically by the test file name
|
# NOTE: please sort the test cases alphabetically by the test file name
|
||||||
suite_ascend = {
|
suite_ascend = {
|
||||||
@@ -341,6 +348,7 @@ suite_ascend = {
|
|||||||
suites.update(suite_amd)
|
suites.update(suite_amd)
|
||||||
suites.update(suite_xeon)
|
suites.update(suite_xeon)
|
||||||
suites.update(suite_ascend)
|
suites.update(suite_ascend)
|
||||||
|
suites.update(suite_xpu)
|
||||||
|
|
||||||
|
|
||||||
def auto_partition(files, rank, size):
|
def auto_partition(files, rank, size):
|
||||||
|
|||||||
60
test/srt/xpu/test_intel_xpu_backend.py
Normal file
60
test/srt/xpu/test_intel_xpu_backend.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""
|
||||||
|
Usage:
|
||||||
|
python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
from functools import wraps
|
||||||
|
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
|
||||||
|
CustomTestCase,
|
||||||
|
is_in_ci,
|
||||||
|
run_bench_one_batch,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def intel_xpu_benchmark(extra_args=None, min_throughput=None):
    """Create a decorator that turns a model-returning test into an XPU benchmark.

    The decorated test method is called to obtain a model, which is handed to
    ``run_bench_one_batch`` along with a fixed set of XPU command-line
    arguments followed by ``extra_args``. The three returned measurements are
    printed. When ``is_in_ci()`` is true and ``min_throughput`` is given, the
    decode throughput is asserted to be greater than ``min_throughput``.

    Args:
        extra_args: Optional list of extra arguments appended after the
            common ones.
        min_throughput: Optional exclusive lower bound for decode throughput,
            enforced only in CI.

    Returns:
        A decorator for test methods taking only ``self``.
    """

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            # Arguments shared by every XPU benchmark run.
            common_args = [
                "--disable-radix",
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.3",
                "--batch-size",
                "1",
                "--device",
                "xpu",
            ]
            full_args = common_args + (extra_args or [])

            # The wrapped test only supplies the model to benchmark.
            model = test_func(self)
            bench_result = run_bench_one_batch(model, full_args)
            prefill_latency, decode_throughput, decode_latency = bench_result

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            # The throughput floor applies only in CI and only when requested.
            if not is_in_ci() or min_throughput is None:
                return
            self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
class TestIntelXPUBackend(CustomTestCase):
    """Benchmark-style tests for the Intel XPU backend."""

    # The decorator runs the benchmark and, in CI, requires decode throughput
    # to be greater than 10; the method body only names the model to use.
    @intel_xpu_benchmark(min_throughput=10)
    def test_latency_qwen_model(self):
        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user