[CI][XPU]enable sglang CI on Intel XPU (#9493)

Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com> Co-authored-by: Ma Mingfei <mingfei.ma@intel.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2025-10-16 08:13:19 +08:00
parent baf277a9bf
commit 4c03dbaaef
6 changed files with 266 additions and 2 deletions
--- a/.github/workflows/pr-test-xpu.yml
+++ b/.github/workflows/pr-test-xpu.yml
@@ -0,0 +1,99 @@
 # Workflow that builds the XPU Docker image and runs the XPU test suite in it.
 name: PR Test (XPU)
 on:
  # Pushes to main that touch any of the listed paths.
  push:
    branches:
      - main
    paths:
      - 'python/**'
      - 'scripts/ci/**'
      - 'test/**'
      - 'sgl-kernel/**'
      - '.github/workflows/pr-test-xpu.yml'
  # Pull requests against main; only new commits and label changes trigger a run.
  pull_request:
    branches:
      - main
    paths:
      - 'python/**'
      - 'scripts/ci/**'
      - 'test/**'
      - 'sgl-kernel/**'
      - '.github/workflows/pr-test-xpu.yml'
    types:
      - synchronize
      - labeled
  workflow_dispatch:
 # One run per ref at a time: a newer run cancels the one still in progress.
 concurrency:
  group: pr-test-xpu-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  build-and-test:
    # Pushes and manual dispatches always run; pull requests run only when
    # they carry the 'run-ci' label.
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: intel-bmg
    env:
      HF_HOME: /home/sdp/.cache/huggingface
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build Docker image
        # PR-controlled values (fork URL, branch name) are handed to the script
        # through the environment instead of being interpolated into its text,
        # so a crafted branch name cannot inject shell commands on the runner.
        env:
          PR_REPO: ${{ github.event.pull_request.head.repo.clone_url }}
          PR_HEAD_REF: ${{ github.head_ref }}
        run: |
          # Both variables are empty outside pull_request events; the
          # --build-arg flags are then omitted and the Dockerfile defaults for
          # SG_LANG_REPO / SG_LANG_BRANCH apply.
          docker build \
            ${PR_REPO:+--build-arg "SG_LANG_REPO=$PR_REPO"} \
            ${PR_HEAD_REF:+--build-arg "SG_LANG_BRANCH=$PR_HEAD_REF"} \
            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .
      - name: Run container
        id: start_container
        run: |
          # 992 is presumably the GID of the host's render group - TODO confirm.
          # NOTE(review): the image's default user is 'sdp' (HOME=/home/sdp), yet
          # the Hugging Face cache is mounted under /root - confirm the mount
          # target is the one actually used inside the container.
          container_id=$(docker run -dt \
            --group-add 992 \
            --group-add "$(getent group video | cut -d: -f3)" \
            -v "${HF_HOME}:/root/.cache/huggingface" \
            --device /dev/dri \
            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
            xpu_sglang_main:bmg)
          echo "Started container: $container_id"
          # Expose the container id to the following steps.
          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"
      - name: Install Dependency
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
          # Single quotes: HF_TOKEN is expanded by the shell inside the container.
          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token "${HF_TOKEN}"'
          # Make plain `python3` resolve to the conda environment's interpreter.
          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"
      - name: Run E2E Bfloat16 tests
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          # Single quotes keep LD_LIBRARY_PATH from being expanded by the runner's
          # shell; it is resolved and exported inside the container so python3
          # sees the conda environment's libraries first.
          docker exec -w /home/sdp/sglang/ "$cid" \
            bash -c 'export LD_LIBRARY_PATH="/home/sdp/miniforge3/envs/py3.10/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu'
      - name: Cleanup container
        # Runs even when earlier steps fail so the container is not left behind.
        if: always()
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker rm -f "$cid" || true
  finish:
    # Always runs, then fails unless build-and-test ended with "success".
    if: always()
    needs:
      - build-and-test
    runs-on: ubuntu-latest
    steps:
      - name: Check job status
        env:
          BUILD_AND_TEST_RESULT: ${{ needs.build-and-test.result }}
        run: |
          if [ "$BUILD_AND_TEST_RESULT" = "success" ]; then
            echo "All jobs completed successfully"
            exit 0
          fi
          echo "Job failed with result: $BUILD_AND_TEST_RESULT"
          exit 1
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -0,0 +1,78 @@
 # SGLang image for Intel XPU, built on Intel's deep learning essentials base image (Ubuntu 24.04).
 # Usage: docker build --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .
 # NOTE(review): this header used to say Battlemage devices need UBUNTU_VERSION=24.10, but no
 # UBUNTU_VERSION build argument is defined and the base image tag below is fixed - confirm.
 FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04
 # Avoid interactive prompts during package install
 ENV DEBIAN_FRONTEND=noninteractive
 # Build arguments: Python version plus the repositories/branches to install from
 ARG PYTHON_VERSION=3.10
 ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
 ARG SG_LANG_BRANCH=main
 ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
 ARG SG_LANG_KERNEL_BRANCH=main
 # Create the non-root user 'sdp' that owns everything installed below
 RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
    chown -R sdp:sdp /home/sdp
 # Switch to non-root user 'sdp'
 USER sdp
 # Set HOME and WORKDIR to user's home directory
 ENV HOME=/home/sdp
 WORKDIR /home/sdp
 # Install Miniforge and create the conda environment py${PYTHON_VERSION}.
 # Only -o is passed to curl (the former extra -O and -v were redundant/conflicting).
 RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh && \
    # Initialize conda environment and install pip
    . ./miniforge3/bin/activate && \
    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
    conda activate py${PYTHON_VERSION} && \
    # -y keeps the build non-interactive should conda need confirmation
    conda install -y pip && \
    # Append environment activation to .bashrc for interactive shells
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc
 # Installing system packages requires root
 USER root
 RUN apt-get update && \
    apt-get install -y intel-ocloc && \
    # Drop the package lists so they do not bloat the layer
    rm -rf /var/lib/apt/lists/*
 # Switch back to user sdp
 USER sdp
 # Install the XPU builds of PyTorch into the conda environment.
 # NOTE(review): the github_token secret is mounted in the next two RUN steps but is not
 # referenced by their commands - confirm whether it is still needed.
 RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
 # Install SGLang and the XPU kernel package from source, then the remaining Python dependencies
 RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
    cd sglang && cd python && \
    # Use the XPU-specific project file for the install
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
    # The working directory is still sglang/python, so the kernel repo is cloned beneath it
    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
    cd sgl-kernel-xpu && \
    pip install -v . && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
    # Replace the installed pytorch-triton-xpu with the pre-release build from the XPU index
    pip uninstall pytorch-triton-xpu -y && \
    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install libsqlite=3.48.0 -y && \
    # Add environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc
 # Use bash as default shell with initialization from .bashrc
 SHELL ["bash", "-c"]
 # Start an interactive bash shell with all environment set up
 USER sdp
 CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
    is_cuda,
    is_hip,
    is_npu,
    is_xpu,
 )
 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
 _is_xpu = is_xpu()
 if _is_cuda:
    from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
            cache = cache.to(dtype)
        if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            and not (_is_cpu and _is_cpu_amx_available)
            and not _is_xpu
        ):
            from vllm._custom_ops import rotary_embedding
            self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        return s
    def forward_xpu(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        offsets: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """XPU forward path; currently a plain delegation to ``forward_native``.

        All four arguments are passed through positionally and whatever
        ``forward_native`` returns is handed back unchanged.
        """
        # TODO: make a wrapper, and XPU will implement this kernel later.
        native_result = self.forward_native(positions, query, key, offsets)
        return native_result
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 # INT4 models
 # Default AWQ INT4 model identifier for tests (presumably a Hugging Face Hub
 # repo id, like the neighbouring constants - TODO confirm).
 DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
 )
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -316,6 +316,13 @@ suite_xeon = {
    ],
 }
 # Add Intel XPU tests
 # "per-commit-xpu" is the suite name the XPU CI workflow passes to
 # `run_suite.py --suite`; test file paths are relative to test/srt.
 suite_xpu = {
    "per-commit-xpu": [
        TestFile("xpu/test_intel_xpu_backend.py"),
    ],
 }
 # Add Ascend NPU tests
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_ascend = {
@@ -341,6 +348,7 @@ suite_ascend = {
 # Merge each platform-specific suite table into the shared `suites` mapping.
 suites.update(suite_amd)
 suites.update(suite_xeon)
 suites.update(suite_ascend)
 suites.update(suite_xpu)
 def auto_partition(files, rank, size):
--- a/test/srt/xpu/test_intel_xpu_backend.py
+++ b/test/srt/xpu/test_intel_xpu_backend.py
@@ -0,0 +1,60 @@
 """
 Usage:
 python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
 """
 import os
 import unittest
 from functools import wraps
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    CustomTestCase,
    is_in_ci,
    run_bench_one_batch,
 )
 def intel_xpu_benchmark(extra_args=None, min_throughput=None):
    """Build a decorator that turns a test method into a one-batch XPU benchmark.

    The decorated method is only called to obtain the model name. The wrapper
    then runs ``run_bench_one_batch`` with a fixed set of XPU arguments plus
    ``extra_args``, prints the measurements, and returns ``None``.

    Args:
        extra_args: Optional list of additional command-line arguments that is
            appended after the common ones.
        min_throughput: Optional lower bound for the decode throughput. It is
            asserted only when ``is_in_ci()`` is true and a value was given.
    """

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            # Common arguments first, caller-supplied extras last.
            full_args = [
                "--disable-radix",
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.3",
                "--batch-size",
                "1",
                "--device",
                "xpu",
            ] + (extra_args or [])

            model = test_func(self)
            bench_result = run_bench_one_batch(model, full_args)
            prefill_latency, decode_throughput, decode_latency = bench_result

            # Same "name=repr(value)" lines that f"{name=}" would print.
            report = (
                ("model", model),
                ("prefill_latency", prefill_latency),
                ("decode_throughput", decode_throughput),
                ("decode_latency", decode_latency),
            )
            for label, value in report:
                print(f"{label}={value!r}")

            # The throughput floor is enforced only in CI and only when one was given.
            if not (is_in_ci() and min_throughput is not None):
                return
            self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
 class TestIntelXPUBackend(CustomTestCase):
    """Benchmark-style tests for the Intel XPU backend, driven by ``intel_xpu_benchmark``."""
    # The test body only names the model; the decorator runs the benchmark and,
    # in CI, asserts that decode throughput is greater than 10.
    @intel_xpu_benchmark(min_throughput=10)
    def test_latency_qwen_model(self):
        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
 if __name__ == "__main__":
    unittest.main()