[CI][XPU] enable sglang CI on Intel XPU (#9493)
Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com>
Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
.github/workflows/pr-test-xpu.yml (new file, 99 lines)
@@ -0,0 +1,99 @@
name: PR Test (XPU)

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: pr-test-xpu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-and-test:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: intel-bmg
    env:
      HF_HOME: /home/sdp/.cache/huggingface
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image
        run: |
          PR_REPO=${{ github.event.pull_request.head.repo.clone_url }}
          PR_HEAD_REF=${{ github.head_ref }}
          docker build \
            ${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO} \
            ${PR_HEAD_REF:+--build-arg SG_LANG_BRANCH=$PR_HEAD_REF} \
            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .

      - name: Run container
        id: start_container
        run: |
          container_id=$(docker run -dt \
            --group-add 992 \
            --group-add $(getent group video | cut -d: -f3) \
            -v ${HF_HOME}:/root/.cache/huggingface \
            --device /dev/dri \
            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
            xpu_sglang_main:bmg)
          echo "Started container: $container_id"
          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"

      - name: Install Dependency
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token ${HF_TOKEN}'
          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"

      - name: Run E2E Bfloat16 tests
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec -w /home/sdp/sglang/ "$cid" \
            bash -c "LD_LIBRARY_PATH=/home/sdp/miniforge3/envs/py3.10/lib:$LD_LIBRARY_PATH && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu"

      - name: Cleanup container
        if: always()
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker rm -f "$cid" || true

  finish:
    if: always()
    needs: [build-and-test]
    runs-on: ubuntu-latest
    steps:
      - name: Check job status
        run: |
          if [ "${{ needs.build-and-test.result }}" != "success" ]; then
            echo "Job failed with result: ${{ needs.build-and-test.result }}"
            exit 1
          fi
          echo "All jobs completed successfully"
          exit 0
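A note on the "Build Docker image" step above: the `${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO}` expansions use bash's `${var:+word}` form, which emits `word` only when the variable is set and non-empty. On `push` and `workflow_dispatch` runs there is no PR head, so both expansions vanish and the build falls back to the repo/branch defaults baked into Dockerfile.xpu. A rough Python rendering of that fallback logic, for illustration only (not part of the commit):

    def docker_build_args(pr_repo: str | None, pr_head_ref: str | None) -> list[str]:
        """Mirror the bash ${var:+word} expansions used in the workflow."""
        args = []
        if pr_repo:  # only pull_request events populate this
            args += ["--build-arg", f"SG_LANG_REPO={pr_repo}"]
        if pr_head_ref:
            args += ["--build-arg", f"SG_LANG_BRANCH={pr_head_ref}"]
        # Empty on push: the Dockerfile defaults (sgl-project/sglang @ main) apply.
        return args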
docker/Dockerfile.xpu (new file, 78 lines)
@@ -0,0 +1,78 @@
# If the device is Battlemage, we need to set UBUNTU_VERSION to 24.10

# Usage: docker build --build-arg UBUNTU_VERSION=24.04 --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .

# Use Intel deep learning essentials base image with Ubuntu 24.04
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04

# Avoid interactive prompts during package install
ENV DEBIAN_FRONTEND=noninteractive

# Define build arguments
ARG PYTHON_VERSION=3.10

ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
ARG SG_LANG_BRANCH=main

ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
ARG SG_LANG_KERNEL_BRANCH=main

RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
    chown -R sdp:sdp /home/sdp

# Switch to non-root user 'sdp'
USER sdp

# Set HOME and WORKDIR to user's home directory
ENV HOME=/home/sdp
WORKDIR /home/sdp

RUN curl -fsSL -v -o miniforge.sh -O https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh && \
    # Initialize conda environment and install pip
    . ./miniforge3/bin/activate && \
    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
    conda activate py${PYTHON_VERSION} && \
    conda install pip && \
    # Append environment activation to .bashrc for interactive shells
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc

USER root
RUN apt-get update && apt install -y intel-ocloc

# Switch back to user sdp
USER sdp

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
    cd sglang && cd python && \
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
    cd sgl-kernel-xpu && \
    pip install -v . && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
    pip uninstall pytorch-triton-xpu -y && \
    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install libsqlite=3.48.0 -y && \
    # Add environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc

# Use bash as default shell with initialization from .bashrc
SHELL ["bash", "-c"]

# Start an interactive bash shell with all environment set up
USER sdp
CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
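Once the image is built, a quick way to confirm the XPU wheel of PyTorch actually sees the GPU inside the container is a check along these lines (a minimal sketch using the standard torch.xpu API, not part of the commit):

    # Run inside the container, e.g.:
    #   docker exec <cid> python3 -c "import torch; print(torch.xpu.is_available())"
    import torch

    # Fails if the container was started without --device /dev/dri or the
    # video group mapping used in the workflow's "Run container" step.
    assert torch.xpu.is_available(), "no XPU device visible"
    print(f"XPU devices: {torch.xpu.device_count()}")
    print(f"device 0: {torch.xpu.get_device_name(0)}")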
python/sglang/srt/layers/rotary_embedding.py
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    is_xpu,
 )
 
 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()
 
 if _is_cuda:
     from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
         cache = cache.to(dtype)
 
         if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu and _is_cpu_amx_available)
+            and not _is_xpu
+        ):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
         s += f", base={self.base}, is_neox_style={self.is_neox_style}"
         return s
 
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # TODO: make a wrapper, and XPU will implement this kernel later.
+        return self.forward_native(positions, query, key, offsets)
+
 
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with linear scaling.
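Since forward_xpu simply delegates to forward_native until a dedicated XPU kernel lands, the math that runs on XPU is the pure-PyTorch rotary application. A minimal sketch of the neox-style rotation for orientation (assuming a half-split layout; the real forward_native also handles offsets and non-neox interleaving):

    import torch

    def neox_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        # x: [num_tokens, num_heads, head_size]; cos/sin: [num_tokens, head_size // 2]
        x1, x2 = x.chunk(2, dim=-1)  # neox style: rotate across the two halves
        cos = cos.unsqueeze(-2)      # broadcast over the head dimension
        sin = sin.unsqueeze(-2)
        return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)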
python/sglang/test/test_utils.py
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
+# INT4 models
+DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+)
+
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
test/srt/run_suite.py
@@ -316,6 +316,13 @@ suite_xeon = {
     ],
 }
 
+# Add Intel XPU tests
+suite_xpu = {
+    "per-commit-xpu": [
+        TestFile("xpu/test_intel_xpu_backend.py"),
+    ],
+}
+
 # Add Ascend NPU tests
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_ascend = {
@@ -341,6 +348,7 @@ suite_ascend = {
 suites.update(suite_amd)
 suites.update(suite_xeon)
 suites.update(suite_ascend)
+suites.update(suite_xpu)
 
 
 def auto_partition(files, rank, size):
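For orientation, the new suite plugs into run_suite.py's existing dispatch: the merged suites dict maps a --suite name to its TestFile list, which auto_partition then splits across CI shards. A simplified sketch of that flow (illustrative only; the real auto_partition in run_suite.py may balance by estimated runtime rather than round-robin):

    # Illustrative sketch, not the actual run_suite.py implementation.
    files = ["xpu/test_intel_xpu_backend.py"]  # suites["per-commit-xpu"]

    def simple_partition(files, rank, size):
        # Round-robin split of test files across `size` CI shards.
        return [f for i, f in enumerate(files) if i % size == rank]

    assert simple_partition(files, 0, 1) == files  # a single XPU runner gets everything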
test/srt/xpu/test_intel_xpu_backend.py (new file, 60 lines)
@@ -0,0 +1,60 @@
"""
Usage:
python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
"""

import os
import unittest
from functools import wraps

from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    CustomTestCase,
    is_in_ci,
    run_bench_one_batch,
)


def intel_xpu_benchmark(extra_args=None, min_throughput=None):
    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            common_args = [
                "--disable-radix",
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.3",
                "--batch-size",
                "1",
                "--device",
                "xpu",
            ]
            full_args = common_args + (extra_args or [])

            model = test_func(self)
            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
                model, full_args
            )

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            if is_in_ci() and min_throughput is not None:
                self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator


class TestIntelXPUBackend(CustomTestCase):

    @intel_xpu_benchmark(min_throughput=10)
    def test_latency_qwen_model(self):
        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN


if __name__ == "__main__":
    unittest.main()
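The decorator keeps per-model tests to one line: the test body only returns a model name, and the wrapper supplies the common server arguments and the CI throughput assertion. Extending the suite with another model would look like this (a hypothetical sketch, not part of this commit; the model name and threshold are placeholders chosen to illustrate extra_args):

    class TestIntelXPUBackendMoreModels(CustomTestCase):
        # Hypothetical extension: extra_args are appended to common_args,
        # so this run also passes --dtype bfloat16 to the benchmark.
        @intel_xpu_benchmark(extra_args=["--dtype", "bfloat16"], min_throughput=5)
        def test_latency_other_model(self):
            return "meta-llama/Llama-3.2-1B-Instruct"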