[CPU] enable CI for PRs, add Dockerfile and auto build task (#6458)
Co-authored-by: diwei sun <diwei.sun@intel.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
86
.github/workflows/pr-test-xeon.yml
vendored
Normal file
86
.github/workflows/pr-test-xeon.yml
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
# CI for the Intel Xeon (CPU) backend: build the Xeon image from
# docker/Dockerfile.xeon, then run the CPU unit-test suite inside a container
# created from that image.
name: PR Test (Xeon)

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

# One active run per ref; a newer run cancels the one still in progress.
concurrency:
  group: pr-test-xeon-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-test:
    if: github.event_name == 'pull_request'
    runs-on: sgl-kernel-build-node
    environment: 'prod'
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Only builds the local `sglang_xeon` image; nothing is pushed here.
      - name: Build and Push
        run: |
          docker build . -f docker/Dockerfile.xeon -t sglang_xeon --no-cache

  unit-test:
    if: github.event_name == 'pull_request'
    needs: [build-test]
    runs-on: sgl-kernel-build-node
    steps:
      # The container below bind-mounts the workspace and installs sgl-kernel
      # and sglang from it, so this job needs its own checkout.
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Run container
        run: |
          # The container name is fixed; drop any leftover from an earlier run
          # so `docker run --name` cannot fail on a name conflict.
          docker rm -f ci_sglang_xeon || true
          docker run -dt \
            -v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
            --name ci_sglang_xeon \
            sglang_xeon

      - name: Install Dependency
        timeout-minutes: 20
        run: |
          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
          # Single quotes inside the double-quoted command keep the extras
          # spec `python[all_cpu]` away from shell glob expansion.
          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "pip install -e 'python[all_cpu]'"
          docker exec ci_sglang_xeon bash -c "python3 -m pip install pytest expecttest"

      # Probe step: a failure here does not fail the job (continue-on-error);
      # the unit-test step below is gated on this step's outcome instead.
      - name: Check AMX Support
        id: check_amx
        timeout-minutes: 5
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
        continue-on-error: true

      - name: Run UT Cases
        if: steps.check_amx.outcome == 'success'
        timeout-minutes: 20
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"

      - name: Cleanup container
        if: always()
        run: |
          docker rm -f ci_sglang_xeon || true

  # Aggregates the results of the jobs above into a single pass/fail status.
  finish:
    if: always()
    needs: [build-test, unit-test]
    runs-on: ubuntu-24.04
    steps:
      - name: Check all dependent job statuses
        run: |
          results=(${{ join(needs.*.result, ' ') }})
          for result in "${results[@]}"; do
            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
              echo "Job failed with result: $result"
              exit 1
            fi
          done
          echo "All jobs completed successfully"
          exit 0
|
||||
35
.github/workflows/release-docker-xeon.yml
vendored
Normal file
35
.github/workflows/release-docker-xeon.yml
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
# Builds the Xeon (CPU) image and pushes it to Docker Hub whenever
# python/sglang/version.py changes on main, or on manual dispatch.
name: Release Docker Images

on:
  push:
    branches:
      - main
    paths:
      - "python/sglang/version.py"
  workflow_dispatch:

jobs:
  publish:
    # Only publish from the upstream repository, never from forks.
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-24.04
    environment: 'prod'
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push
        run: |
          # The version is the text between the first pair of double quotes
          # in python/sglang/version.py.
          version=$(cut -d'"' -f2 python/sglang/version.py)
          if [ -z "${version}" ]; then
            echo "Could not read a version from python/sglang/version.py"
            exit 1
          fi
          tag=v${version}-xeon

          docker build . -f docker/Dockerfile.xeon -t "lmsysorg/sglang:${tag}" --no-cache
          docker push "lmsysorg/sglang:${tag}"
|
||||
44
docker/Dockerfile.xeon
Normal file
44
docker/Dockerfile.xeon
Normal file
@@ -0,0 +1,44 @@
|
||||
# SGLang image for Intel Xeon (CPU): Ubuntu base, Miniforge-provided Python,
# CPU builds of torch/torchvision, and sgl-kernel built from source.
FROM ubuntu:24.04
SHELL ["/bin/bash", "-c"]

# Git ref of sglang to check out, and the CPU torch/torchvision versions.
ARG VER_SGLANG=main
ARG VER_TORCH=2.6.0
ARG VER_TORCHVISION=0.21.0

# DEBIAN_FRONTEND is exported for the whole command so that full-upgrade, not
# only install, runs without prompts. The apt lists are removed afterwards to
# keep them out of the layer.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && \
    apt-get full-upgrade -y && \
    apt-get install --no-install-recommends -y \
        ca-certificates \
        git \
        curl \
        wget \
        vim \
        gcc \
        g++ \
        make && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /sgl-workspace

# Install Miniforge into /sgl-workspace/miniforge3 and the native libraries
# that are preloaded via LD_PRELOAD further down.
RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.11.3-2/Miniforge3-24.11.3-2-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm -f miniforge.sh && \
    . miniforge3/bin/activate && \
    conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl

ENV PATH=/sgl-workspace/miniforge3/bin:/sgl-workspace/miniforge3/condabin:${PATH}
ENV PIP_ROOT_USER_ACTION=ignore

RUN pip install intel-openmp

# Install sglang in editable mode, then force the pinned CPU wheels of
# torch/torchvision, then build sgl-kernel with its CPU pyproject.
RUN git clone https://github.com/sgl-project/sglang.git && \
    cd sglang && \
    git checkout "${VER_SGLANG}" && \
    pip install -e "python[all_cpu]" && \
    pip install "torch==${VER_TORCH}" "torchvision==${VER_TORCHVISION}" --index-url https://download.pytorch.org/whl/cpu --force-reinstall && \
    cd sgl-kernel && \
    cp pyproject_cpu.toml pyproject.toml && \
    pip install -v .

ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2

WORKDIR /sgl-workspace/sglang
|
||||
@@ -89,7 +89,7 @@ srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||
# To install vllm for CPU, please follow the instruction here:
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
|
||||
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
||||
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ from sglang.lang.backend.openai import OpenAI
|
||||
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
||||
from sglang.srt.utils import (
|
||||
get_bool_env_var,
|
||||
get_device,
|
||||
is_port_available,
|
||||
kill_process_tree,
|
||||
retry,
|
||||
@@ -305,13 +306,33 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
|
||||
return args
|
||||
|
||||
|
||||
def auto_config_device() -> str:
    """Return the device reported by ``get_device()``, or ``"cpu"`` on failure.

    If ``get_device()`` raises ``RuntimeError`` or ``ImportError``, a warning
    containing the error is printed and ``"cpu"`` is returned instead.
    """
    try:
        return get_device()
    except (RuntimeError, ImportError) as err:
        print(f"Warning: {err} - Falling back to CPU")
        return "cpu"
|
||||
|
||||
|
||||
def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
    """Register the common sglang benchmark options on ``parser`` and parse.

    Adds ``--parallel``, ``--host``, ``--port``, ``--backend``, ``--device``
    and ``--result-file`` (in that order), then returns the namespace produced
    by ``parser.parse_args()``.
    """
    # Options that only need a flag, a type and a default.
    plain_options = (
        ("--parallel", int, 64),
        ("--host", str, "http://127.0.0.1"),
        ("--port", int, 30000),
        ("--backend", str, "srt"),
    )
    for flag, value_type, default_value in plain_options:
        parser.add_argument(flag, type=value_type, default=default_value)

    # --device is restricted to a fixed set of choices.
    device_choices = ["auto", "cuda", "rocm", "cpu"]
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=device_choices,
        help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
    )
    parser.add_argument("--result-file", type=str, default="result.jsonl")

    return parser.parse_args()
|
||||
|
||||
|
||||
@@ -397,11 +418,25 @@ def popen_launch_server(
|
||||
base_url: str,
|
||||
timeout: float,
|
||||
api_key: Optional[str] = None,
|
||||
other_args: list[str] = (),
|
||||
other_args: list[str] = [],
|
||||
env: Optional[dict] = None,
|
||||
return_stdout_stderr: Optional[tuple] = None,
|
||||
device: str = "auto",
|
||||
pd_separated: bool = False,
|
||||
):
|
||||
"""Launch a server process with automatic device detection.
|
||||
|
||||
Args:
|
||||
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||
If "auto", will detect available platforms automatically.
|
||||
"""
|
||||
# Auto-detect device if needed
|
||||
if device == "auto":
|
||||
device = auto_config_device()
|
||||
print(f"Auto-configed device: {device}", flush=True)
|
||||
other_args = list(other_args)
|
||||
other_args += ["--device", str(device)]
|
||||
|
||||
_, host, port = base_url.split(":")
|
||||
host = host[2:]
|
||||
|
||||
@@ -457,6 +492,15 @@ def popen_launch_server(
|
||||
start_time = time.perf_counter()
|
||||
with requests.Session() as session:
|
||||
while time.perf_counter() - start_time < timeout:
|
||||
|
||||
return_code = process.poll()
|
||||
if return_code is not None:
|
||||
# Server failed to start (non-zero exit code) or crashed
|
||||
raise Exception(
|
||||
f"Server process exited with code {return_code}. "
|
||||
"Check server logs for errors."
|
||||
)
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"Content-Type": "application/json; charset=utf-8",
|
||||
@@ -627,6 +671,7 @@ def get_benchmark_args(
|
||||
disable_stream=False,
|
||||
disable_ignore_eos=False,
|
||||
seed: int = 0,
|
||||
device="auto",
|
||||
pd_separated: bool = False,
|
||||
):
|
||||
return SimpleNamespace(
|
||||
@@ -657,6 +702,7 @@ def get_benchmark_args(
|
||||
profile=None,
|
||||
lora_name=None,
|
||||
prompt_suffix="",
|
||||
device=device,
|
||||
pd_separated=pd_separated,
|
||||
)
|
||||
|
||||
@@ -676,7 +722,10 @@ def run_bench_serving(
|
||||
disable_ignore_eos=False,
|
||||
need_warmup=False,
|
||||
seed: int = 0,
|
||||
device="auto",
|
||||
):
|
||||
if device == "auto":
|
||||
device = auto_config_device()
|
||||
# Launch the server
|
||||
base_url = DEFAULT_URL_FOR_TEST
|
||||
process = popen_launch_server(
|
||||
@@ -700,6 +749,7 @@ def run_bench_serving(
|
||||
disable_stream=disable_stream,
|
||||
disable_ignore_eos=disable_ignore_eos,
|
||||
seed=seed,
|
||||
device=device,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -750,6 +800,18 @@ def run_bench_serving_multi(
|
||||
|
||||
|
||||
def run_bench_one_batch(model, other_args):
|
||||
"""Launch a offline process with automatic device detection.
|
||||
|
||||
Args:
|
||||
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||
If "auto", will detect available platforms automatically.
|
||||
"""
|
||||
# Auto-detect device if needed
|
||||
|
||||
device = auto_config_device()
|
||||
print(f"Auto-configed device: {device}", flush=True)
|
||||
other_args += ["--device", str(device)]
|
||||
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
|
||||
@@ -127,6 +127,16 @@ suites = {
|
||||
"per-commit-8-gpu-amd": [
|
||||
TestFile("test_full_deepseek_v3.py", 250),
|
||||
],
|
||||
"per-commit-cpu": [
|
||||
TestFile("cpu/test_activation.py"),
|
||||
TestFile("cpu/test_decode.py"),
|
||||
TestFile("cpu/test_extend.py"),
|
||||
TestFile("cpu/test_gemm.py"),
|
||||
TestFile("cpu/test_moe.py"),
|
||||
TestFile("cpu/test_norm.py"),
|
||||
TestFile("cpu/test_qkv_proj_with_rope.py"),
|
||||
TestFile("cpu/test_shared_expert.py"),
|
||||
],
|
||||
"nightly": [
|
||||
TestFile("test_nightly_gsm8k_eval.py"),
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user