[CPU] enable CI for PRs, add Dockerfile and auto build task (#6458)

Co-authored-by: diwei sun <diwei.sun@intel.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
2025-06-06 04:43:54 +08:00
parent 8b2474898b
commit 562f279a2d
6 changed files with 239 additions and 2 deletions
--- a/.github/workflows/pr-test-xeon.yml
+++ b/.github/workflows/pr-test-xeon.yml
@@ -0,0 +1,86 @@
+name: PR Test (Xeon)
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:  # NOTE(review): build-test and unit-test are gated on pull_request events, so a manual run only executes finish
+
+concurrency:
+  group: pr-test-xeon-${{ github.ref }}  # one concurrency group per ref
+  cancel-in-progress: true  # a newer run in the same group cancels the one in progress
+
+jobs:
+  build-test:
+    if: github.event_name == 'pull_request'
+    runs-on: sgl-kernel-build-node
+    environment: 'prod'
+    strategy:
+      matrix:
+        build_type: ['all']  # single-entry matrix; matrix.build_type is not referenced by any step here
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Build Docker image
+        run: |
+          # Build only: nothing is pushed. The image stays in this runner's
+          # local Docker store under the tag sglang_xeon.
+
+          docker build . -f docker/Dockerfile.xeon  -t sglang_xeon --no-cache
+  unit-test:
+    if: github.event_name == 'pull_request'
+    needs: [build-test]  # runs the sglang_xeon image produced by build-test
+    runs-on: sgl-kernel-build-node  # NOTE(review): no checkout step here; assumes the runner still holds build-test's workspace and image
+    steps:
+      - name: Run container
+        run: |
+          docker run -dt \
+            -v "${{ github.workspace }}:/sglang-checkout/" --ipc=host \
+            --name ci_sglang_xeon \
+            sglang_xeon
+
+      - name: Install Dependency
+        timeout-minutes: 20
+        run: |
+          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
+          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
+          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
+          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c 'pip install -e "python[all_cpu]"'
+          docker exec ci_sglang_xeon bash -c "python3 -m pip install pytest expecttest"
+
+      - name: Check AMX Support
+        id: check_amx
+        timeout-minutes: 5
+        run: |
+          docker exec -w /sglang-checkout/ ci_sglang_xeon \
+            bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
+        continue-on-error: true  # a failed probe does not fail the job; it only gates the UT step
+
+      - name: Run UT Cases
+        if: steps.check_amx.outcome == 'success'  # skipped when the AMX probe did not pass
+        timeout-minutes: 20
+        run: |
+          docker exec -w /sglang-checkout/ ci_sglang_xeon \
+            bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"
+
+      - name: Cleanup container
+        if: always()  # remove the named container even after a failure so the next run can reuse the name
+        run: |
+          docker rm -f ci_sglang_xeon || true
+
+  finish:
+    if: always()  # gate job: evaluated regardless of how the needed jobs ended
+    needs: [build-test, unit-test]
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          for result in ${{ join(needs.*.result, ' ') }}; do
+            case "$result" in
+              failure|cancelled)
+                echo "Job failed with result: $result"
+                exit 1
+                ;;
+            esac
+          done
+          echo "All jobs completed successfully"
--- a/.github/workflows/release-docker-xeon.yml
+++ b/.github/workflows/release-docker-xeon.yml
@@ -0,0 +1,35 @@
+name: Release Docker Xeon Images
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"  # only a change to the version file triggers a release build
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'  # skip in any other repository (e.g. forks)
+    runs-on: ubuntu-24.04
+    environment: 'prod'
+    strategy:
+      matrix:
+        build_type: ['all']  # single-entry matrix; matrix.build_type is not referenced by any step here
+    steps:
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cut -d'"' -f2 python/sglang/version.py)  # no pipe: a failed read aborts the step instead of yielding an empty version
+          tag="v${version}-xeon"
+
+          docker build . -f docker/Dockerfile.xeon -t "lmsysorg/sglang:${tag}" --no-cache
+          docker push "lmsysorg/sglang:${tag}"
--- a/docker/Dockerfile.xeon
+++ b/docker/Dockerfile.xeon
@@ -0,0 +1,44 @@
+FROM ubuntu:24.04
+SHELL ["/bin/bash", "-c"]
+# Build arguments: sglang git ref to check out, and the torch/torchvision versions pulled from the CPU wheel index.
+ARG VER_SGLANG=main
+ARG VER_TORCH=2.6.0
+ARG VER_TORCHVISION=0.21.0
+
+RUN apt-get update && \
+    apt-get full-upgrade -y && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    ca-certificates \
+    git \
+    curl \
+    wget \
+    vim \
+    gcc \
+    g++ \
+    make
+
+WORKDIR /sgl-workspace
+# Install Miniforge into /sgl-workspace/miniforge3, then conda-install the packages listed on the last line.
+RUN curl -fsSL -v -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.11.3-2/Miniforge3-24.11.3-2-Linux-x86_64.sh && \
+    bash miniforge.sh -b -p ./miniforge3 && \
+    rm -f miniforge.sh && \
+    . miniforge3/bin/activate && \
+    conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
+
+ENV PATH=/sgl-workspace/miniforge3/bin:/sgl-workspace/miniforge3/condabin:${PATH}
+ENV PIP_ROOT_USER_ACTION=ignore
+
+RUN pip install intel-openmp
+# Clone sglang at VER_SGLANG, install it with the all_cpu extra, force the pinned CPU torch wheels, then build sgl-kernel from its CPU pyproject.
+RUN git clone https://github.com/sgl-project/sglang.git && \
+    cd sglang && \
+    git checkout ${VER_SGLANG} && \
+    pip install -e "python[all_cpu]" && \
+    pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} --index-url https://download.pytorch.org/whl/cpu --force-reinstall && \
+    cd sgl-kernel && \
+    cp pyproject_cpu.toml pyproject.toml && \
+    pip install -v .
+# Preload libiomp5, libtcmalloc and libtbbmalloc from the conda prefix for every process in the image.
+ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2
+
+WORKDIR /sgl-workspace/sglang
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -89,7 +89,7 @@ srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
+srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
 srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]

--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -26,6 +26,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import (
    get_bool_env_var,
+    get_device,
    is_port_available,
    kill_process_tree,
    retry,
@@ -305,13 +306,33 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
    return args


+def auto_config_device() -> str:
+    """Auto-config available device platform"""
+
+    try:
+        device = get_device()
+    except (RuntimeError, ImportError) as e:
+        print(f"Warning: {e} - Falling back to CPU")
+        device = "cpu"
+
+    return device
+
+
 def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=30000)
    parser.add_argument("--backend", type=str, default="srt")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",  # same "auto" sentinel that popen_launch_server / run_bench_serving resolve via auto_config_device()
+        choices=["auto", "cuda", "rocm", "cpu"],
+        help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
+    )
    parser.add_argument("--result-file", type=str, default="result.jsonl")
    args = parser.parse_args()
+
    return args


@@ -397,11 +418,25 @@ def popen_launch_server(
    base_url: str,
    timeout: float,
    api_key: Optional[str] = None,
-    other_args: list[str] = (),
+    other_args: list[str] = [],
    env: Optional[dict] = None,
    return_stdout_stderr: Optional[tuple] = None,
+    device: str = "auto",
    pd_separated: bool = False,
 ):
+    """Launch a server process with automatic device detection.
+
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+    if device == "auto":
+        device = auto_config_device()
+        print(f"Auto-configed device: {device}", flush=True)
+        other_args = list(other_args)
+        other_args += ["--device", str(device)]
+
    _, host, port = base_url.split(":")
    host = host[2:]

@@ -457,6 +492,15 @@ def popen_launch_server(
    start_time = time.perf_counter()
    with requests.Session() as session:
        while time.perf_counter() - start_time < timeout:
+
+            return_code = process.poll()
+            if return_code is not None:
+                # Server failed to start (non-zero exit code) or crashed
+                raise Exception(
+                    f"Server process exited with code {return_code}. "
+                    "Check server logs for errors."
+                )
+
            try:
                headers = {
                    "Content-Type": "application/json; charset=utf-8",
@@ -627,6 +671,7 @@ def get_benchmark_args(
    disable_stream=False,
    disable_ignore_eos=False,
    seed: int = 0,
+    device="auto",
    pd_separated: bool = False,
 ):
    return SimpleNamespace(
@@ -657,6 +702,7 @@ def get_benchmark_args(
        profile=None,
        lora_name=None,
        prompt_suffix="",
+        device=device,
        pd_separated=pd_separated,
    )

@@ -676,7 +722,10 @@ def run_bench_serving(
    disable_ignore_eos=False,
    need_warmup=False,
    seed: int = 0,
+    device="auto",
 ):
+    if device == "auto":
+        device = auto_config_device()
    # Launch the server
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
@@ -700,6 +749,7 @@ def run_bench_serving(
        disable_stream=disable_stream,
        disable_ignore_eos=disable_ignore_eos,
        seed=seed,
+        device=device,
    )

    try:
@@ -750,6 +800,18 @@ def run_bench_serving_multi(


 def run_bench_one_batch(model, other_args):
+    """Launch a offline process with automatic device detection.
+
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+
+    device = auto_config_device()
+    print(f"Auto-configed device: {device}", flush=True)
+    other_args += ["--device", str(device)]
+
    command = [
        "python3",
        "-m",
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -127,6 +127,16 @@ suites = {
    "per-commit-8-gpu-amd": [
        TestFile("test_full_deepseek_v3.py", 250),
    ],
+    "per-commit-cpu": [
+        TestFile("cpu/test_activation.py"),
+        TestFile("cpu/test_decode.py"),
+        TestFile("cpu/test_extend.py"),
+        TestFile("cpu/test_gemm.py"),
+        TestFile("cpu/test_moe.py"),
+        TestFile("cpu/test_norm.py"),
+        TestFile("cpu/test_qkv_proj_with_rope.py"),
+        TestFile("cpu/test_shared_expert.py"),
+    ],
    "nightly": [
        TestFile("test_nightly_gsm8k_eval.py"),
    ],