diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml
new file mode 100644
index 000000000..f4cc7c952
--- /dev/null
+++ b/.github/workflows/pr-test-xpu.yml
@@ -0,0 +1,99 @@
+name: PR Test (XPU)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-xpu.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-xpu.yml"
+    types: [synchronize, labeled]
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-xpu-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-and-test:
+    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
+    runs-on: intel-bmg
+    env:
+      HF_HOME: /home/sdp/.cache/huggingface
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image
+        run: |
+          PR_REPO=${{ github.event.pull_request.head.repo.clone_url }}
+          PR_HEAD_REF=${{ github.head_ref }}
+          docker build \
+            ${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO} \
+            ${PR_HEAD_REF:+--build-arg SG_LANG_BRANCH=$PR_HEAD_REF} \
+            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .
+
+      - name: Run container
+        id: start_container
+        run: |
+          container_id=$(docker run -dt \
+            --group-add 992 \
+            --group-add $(getent group video | cut -d: -f3) \
+            -v ${HF_HOME}:/home/sdp/.cache/huggingface \
+            --device /dev/dri \
+            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
+            xpu_sglang_main:bmg)
+          echo "Started container: $container_id"
+          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        run: |
+          cid="${{ steps.start_container.outputs.container_id }}"
+          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
+          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
+          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
+          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token ${HF_TOKEN}'
+          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"
+
+      - name: Run E2E Bfloat16 tests
+        timeout-minutes: 20
+        run: |
+          cid="${{ steps.start_container.outputs.container_id }}"
+          docker exec -w /home/sdp/sglang/ "$cid" \
+            bash -c 'export LD_LIBRARY_PATH=/home/sdp/miniforge3/envs/py3.10/lib:$LD_LIBRARY_PATH && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu'
+
+      - name: Cleanup container
+        if: always()
+        run: |
+          cid="${{ steps.start_container.outputs.container_id }}"
+          docker rm -f "$cid" || true
+
+  finish:
+    if: always()
+    needs: [build-and-test]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check job status
+        run: |
+          if [ "${{ needs.build-and-test.result }}" != "success" ]; then
+            echo "Job failed with result: ${{ needs.build-and-test.result }}"
+            exit 1
+          fi
+          echo "All jobs completed successfully"
+          exit 0
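On `push` events there is no PR head, so `PR_REPO` and `PR_HEAD_REF` expand to empty strings and the `${VAR:+...}` parameter expansions drop the corresponding `--build-arg` flags entirely, letting the Dockerfile defaults (sgl-project/sglang, main) apply. A minimal sketch of that shell pattern, with hypothetical values:

    #!/usr/bin/env bash
    # ${VAR:+word} expands to `word` only when VAR is set and non-empty,
    # so an empty PR_REPO contributes no --build-arg flag at all.
    PR_REPO=""                    # hypothetical: empty on push events
    PR_HEAD_REF="feature/xpu-ci"  # hypothetical: set on pull_request events
    echo docker build \
      ${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO} \
      ${PR_HEAD_REF:+--build-arg SG_LANG_BRANCH=$PR_HEAD_REF} \
      -f docker/Dockerfile.xpu .
    # prints: docker build --build-arg SG_LANG_BRANCH=feature/xpu-ci -f docker/Dockerfile.xpu .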
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
new file mode 100644
index 000000000..bd32551f5
--- /dev/null
+++ b/docker/Dockerfile.xpu
@@ -0,0 +1,78 @@
+# NOTE: for Battlemage devices, switch the base image below to its Ubuntu 24.10 variant.
+
+# Usage: docker build --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .
+
+# Use Intel deep learning essentials base image with Ubuntu 24.04
+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04
+
+# Avoid interactive prompts during package install
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Define build arguments
+ARG PYTHON_VERSION=3.10
+
+ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
+ARG SG_LANG_BRANCH=main
+
+ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
+ARG SG_LANG_KERNEL_BRANCH=main
+
+RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
+    chown -R sdp:sdp /home/sdp
+
+# Switch to non-root user 'sdp'
+USER sdp
+
+# Set HOME and WORKDIR to user's home directory
+ENV HOME=/home/sdp
+WORKDIR /home/sdp
+
+RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
+    bash miniforge.sh -b -p ./miniforge3 && \
+    rm miniforge.sh && \
+    # Initialize conda environment and install pip
+    . ./miniforge3/bin/activate && \
+    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
+    conda activate py${PYTHON_VERSION} && \
+    conda install pip && \
+    # Append environment activation to .bashrc for interactive shells
+    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc
+
+USER root
+RUN apt-get update && apt-get install -y intel-ocloc
+
+# Switch back to user sdp
+USER sdp
+
+RUN --mount=type=secret,id=github_token \
+    cd /home/sdp && \
+    . /home/sdp/miniforge3/bin/activate && \
+    conda activate py${PYTHON_VERSION} && \
+    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
+
+RUN --mount=type=secret,id=github_token \
+    cd /home/sdp && \
+    . /home/sdp/miniforge3/bin/activate && \
+    conda activate py${PYTHON_VERSION} && \
+    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
+    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
+    cd sglang && cd python && \
+    cp pyproject_xpu.toml pyproject.toml && \
+    pip install . && \
+    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
+    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
+    cd sgl-kernel-xpu && \
+    pip install -v . && \
+    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
+    pip uninstall pytorch-triton-xpu -y && \
+    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
+    conda install libsqlite=3.48.0 -y && \
+    # Add environment setup commands to .bashrc again (in case it was overwritten)
+    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc
+
+# Use bash as default shell with initialization from .bashrc
+SHELL ["/bin/bash", "-c"]
+
+# Start an interactive bash shell with all environment set up
+USER sdp
+CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
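The image ends up with sglang and sgl-kernel-xpu installed into the py${PYTHON_VERSION} conda environment under /home/sdp. A quick local smoke test of the built image, assuming a host with an Intel GPU exposed under /dev/dri (the tag and device path are illustrative):

    # Build with the defaults (clones sgl-project/sglang@main).
    docker build --build-arg PYTHON_VERSION=3.10 \
      -f docker/Dockerfile.xpu -t sglang:xpu_kernel .

    # Map the GPU in and check that the XPU build of PyTorch sees a device.
    docker run --rm --device /dev/dri sglang:xpu_kernel \
      /home/sdp/miniforge3/envs/py3.10/bin/python3 \
      -c "import torch; print(torch.xpu.is_available())"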
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index a18f0bff9..15b112539 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    is_xpu,
 )
 
 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()
 
 if _is_cuda:
     from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
         cache = cache.to(dtype)
 
         if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu and _is_cpu_amx_available)
+            and not _is_xpu
+        ):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
         s += f", base={self.base}, is_neox_style={self.is_neox_style}"
         return s
 
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # TODO: dispatch to a dedicated XPU kernel once available; fall back to the native path for now.
+        return self.forward_native(positions, query, key, offsets)
+
 
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with linear scaling.
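`forward_xpu` falls back to `forward_native`, which applies the rotary embedding in plain PyTorch. As rough context for what that path computes in the neox style, here is a simplified, self-contained sketch (not sglang's exact implementation; shapes and `base` are illustrative):

    import torch

    def apply_neox_rope(x, cos, sin):
        # neox style: split the head dim into two halves and rotate them jointly.
        x1, x2 = torch.chunk(x, 2, dim=-1)
        return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)

    # Toy shapes: seq_len=4, head_size=8, base=10000.
    head_size, seq_len, base = 8, 4, 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, head_size, 2).float() / head_size))
    angles = torch.outer(torch.arange(seq_len).float(), inv_freq)  # [seq_len, head_size // 2]
    q = torch.randn(seq_len, head_size)
    print(apply_neox_rope(q, angles.cos(), angles.sin()).shape)  # torch.Size([4, 8])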
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 94ac56a55..edbcdefd7 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
+# INT4 models
+DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+)
+
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 69ae5b9c5..87903ef47 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -316,6 +316,13 @@ suite_xeon = {
     ],
 }
 
+# Add Intel XPU tests
+suite_xpu = {
+    "per-commit-xpu": [
+        TestFile("xpu/test_intel_xpu_backend.py"),
+    ],
+}
+
 # Add Ascend NPU tests
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_ascend = {
@@ -341,6 +348,7 @@
 suites.update(suite_amd)
 suites.update(suite_xeon)
 suites.update(suite_ascend)
+suites.update(suite_xpu)
 
 
 def auto_partition(files, rank, size):
diff --git a/test/srt/xpu/test_intel_xpu_backend.py b/test/srt/xpu/test_intel_xpu_backend.py
new file mode 100644
index 000000000..91ebd57a2
--- /dev/null
+++ b/test/srt/xpu/test_intel_xpu_backend.py
@@ -0,0 +1,59 @@
+"""
+Usage:
+python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
+"""
+
+import unittest
+from functools import wraps
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
+    CustomTestCase,
+    is_in_ci,
+    run_bench_one_batch,
+)
+
+
+def intel_xpu_benchmark(extra_args=None, min_throughput=None):
+    def decorator(test_func):
+        @wraps(test_func)
+        def wrapper(self):
+            common_args = [
+                "--disable-radix",
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.3",
+                "--batch-size",
+                "1",
+                "--device",
+                "xpu",
+            ]
+            full_args = common_args + (extra_args or [])
+
+            model = test_func(self)
+            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
+                model, full_args
+            )
+
+            print(f"{model=}")
+            print(f"{prefill_latency=}")
+            print(f"{decode_throughput=}")
+            print(f"{decode_latency=}")
+
+            if is_in_ci() and min_throughput is not None:
+                self.assertGreater(decode_throughput, min_throughput)
+
+        return wrapper
+
+    return decorator
+
+
+class TestIntelXPUBackend(CustomTestCase):
+
+    @intel_xpu_benchmark(min_throughput=10)
+    def test_latency_qwen_model(self):
+        return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
+
+
+if __name__ == "__main__":
+    unittest.main()
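Since `intel_xpu_benchmark` takes the model name from the wrapped test's return value and appends `extra_args` to the shared XPU flags, further benchmarks stay one method each. A hypothetical second case (flags and threshold are illustrative; it reuses the decorator above and the AWQ-INT4 constant added to test_utils.py in this diff):

    from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4

    class TestIntelXPUBackendAWQ(CustomTestCase):

        # Hypothetical: benchmark an AWQ-INT4 model with extra server flags
        # appended to the common XPU arguments.
        @intel_xpu_benchmark(extra_args=["--quantization", "awq"], min_throughput=5)
        def test_latency_awq_int4_model(self):
            return DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4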