[CPU] enable CI for PRs, add Dockerfile and auto build task (#6458)
Co-authored-by: diwei sun <diwei.sun@intel.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
86
.github/workflows/pr-test-xeon.yml
vendored
Normal file
86
.github/workflows/pr-test-xeon.yml
vendored
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
# CI for pull requests against main: builds the Xeon (CPU) Docker image, then
# runs the per-commit CPU unit-test suite inside a container from that image.
name: PR Test (Xeon)

on:
  pull_request:
    branches:
      - main
  # Manual dispatch is accepted, but build-test and unit-test are gated on
  # pull_request events below, so a dispatched run only executes `finish`.
  workflow_dispatch:

# A newer run for the same ref cancels the one still in progress.
concurrency:
  group: pr-test-xeon-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-test:
    if: github.event_name == 'pull_request'
    runs-on: sgl-kernel-build-node
    environment: 'prod'
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Despite the step name, nothing is pushed from PR runs: the image is
      # only built and tagged locally as `sglang_xeon` for the unit-test job.
      - name: Build and Push
        run: |
          docker build . -f docker/Dockerfile.xeon -t sglang_xeon --no-cache

  unit-test:
    if: github.event_name == 'pull_request'
    needs: [build-test]
    runs-on: sgl-kernel-build-node
    # NOTE(review): this job has no checkout step and uses the locally built
    # `sglang_xeon` image, so it presumably relies on running on the same
    # runner (and workspace) as build-test - confirm.
    steps:
      - name: Run container
        run: |
          docker run -dt \
            -v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
            --name ci_sglang_xeon \
            sglang_xeon

      - name: Install Dependency
        timeout-minutes: 20
        run: |
          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
          # Single-quote the outer command so the inner double quotes reach the
          # container's shell intact and `python[all_cpu]` is never treated as
          # a glob pattern by the host shell.
          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c 'pip install -e "python[all_cpu]"'
          docker exec ci_sglang_xeon bash -c "python3 -m pip install pytest expecttest"

      # Probe step: it is allowed to fail (continue-on-error) and its outcome
      # decides whether the unit tests below run at all.
      - name: Check AMX Support
        id: check_amx
        timeout-minutes: 5
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
        continue-on-error: true

      # Skipped (not failed) whenever the AMX probe did not succeed.
      - name: Run UT Cases
        if: steps.check_amx.outcome == 'success'
        timeout-minutes: 20
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"

      # Always remove the named container so the fixed name can be reused.
      - name: Cleanup container
        if: always()
        run: |
          docker rm -f ci_sglang_xeon || true

  # Aggregates the results of the jobs above into a single pass/fail status.
  finish:
    if: always()
    needs: [build-test, unit-test]
    runs-on: ubuntu-24.04
    steps:
      - name: Check all dependent job statuses
        run: |
          results=(${{ join(needs.*.result, ' ') }})
          for result in "${results[@]}"; do
            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
              echo "Job failed with result: $result"
              exit 1
            fi
          done
          echo "All jobs completed successfully"
          exit 0
|
||||||
35
.github/workflows/release-docker-xeon.yml
vendored
Normal file
35
.github/workflows/release-docker-xeon.yml
vendored
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Builds the Xeon (CPU) Docker image and pushes it to Docker Hub whenever
# python/sglang/version.py changes on main, or on manual dispatch.
name: Release Docker Images

on:
  push:
    branches:
      - main
    paths:
      - "python/sglang/version.py"
  workflow_dispatch:

jobs:
  publish:
    # Only publish from the upstream repository, never from forks.
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-24.04
    environment: 'prod'
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push
        run: |
          # Take the text between the first pair of double quotes in
          # version.py. Reading the file with cut directly (no `cat |`) lets a
          # missing or unreadable file fail the step.
          # NOTE(review): assumes version.py is a single line holding the
          # quoted version string - confirm.
          version=$(cut -d'"' -f2 python/sglang/version.py)
          if [ -z "${version}" ]; then
            echo "Could not parse a version from python/sglang/version.py" >&2
            exit 1
          fi
          tag=v${version}-xeon

          docker build . -f docker/Dockerfile.xeon -t lmsysorg/sglang:${tag} --no-cache
          docker push lmsysorg/sglang:${tag}
|
||||||
44
docker/Dockerfile.xeon
Normal file
44
docker/Dockerfile.xeon
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# Image for running SGLang on CPU: Ubuntu base, a Miniforge Python
# environment, SGLang installed from source with the `all_cpu` extra, and the
# CPU build of sgl-kernel.
FROM ubuntu:24.04
SHELL ["/bin/bash", "-c"]

# Build-time knobs: the SGLang git ref to check out and the torch /
# torchvision versions to install.
ARG VER_SGLANG=main
ARG VER_TORCH=2.6.0
ARG VER_TORCHVISION=0.21.0

# System toolchain. Both apt steps run non-interactively so a package prompt
# cannot stall the build; the package lists are removed afterwards to keep the
# layer small.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get full-upgrade -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
        ca-certificates \
        git \
        curl \
        wget \
        vim \
        gcc \
        g++ \
        make && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /sgl-workspace

# Install Miniforge into ./miniforge3 and add the native libraries used below.
# Only `-o` is passed to curl: one URL takes exactly one output option.
RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.11.3-2/Miniforge3-24.11.3-2-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm -f miniforge.sh && \
    . miniforge3/bin/activate && \
    conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl

ENV PATH=/sgl-workspace/miniforge3/bin:/sgl-workspace/miniforge3/condabin:${PATH}
ENV PIP_ROOT_USER_ACTION=ignore

RUN pip install intel-openmp

# Install SGLang from source at the requested ref, then force-reinstall the
# pinned torch/torchvision from the PyTorch CPU wheel index, and finally build
# sgl-kernel using its CPU pyproject.
RUN git clone https://github.com/sgl-project/sglang.git && \
    cd sglang && \
    git checkout ${VER_SGLANG} && \
    pip install -e "python[all_cpu]" && \
    pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} --index-url https://download.pytorch.org/whl/cpu --force-reinstall && \
    cd sgl-kernel && \
    cp pyproject_cpu.toml pyproject.toml && \
    pip install -v .

# Preload the OpenMP runtime and the tcmalloc / tbbmalloc libraries from the
# Miniforge environment for every process started in the container.
ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2

WORKDIR /sgl-workspace/sglang
|
||||||
@@ -89,7 +89,7 @@ srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
|||||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||||
# To install vllm for CPU, please follow the instruction here:
|
# To install vllm for CPU, please follow the instruction here:
|
||||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
|
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
|
||||||
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
||||||
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ from sglang.lang.backend.openai import OpenAI
|
|||||||
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
||||||
from sglang.srt.utils import (
|
from sglang.srt.utils import (
|
||||||
get_bool_env_var,
|
get_bool_env_var,
|
||||||
|
get_device,
|
||||||
is_port_available,
|
is_port_available,
|
||||||
kill_process_tree,
|
kill_process_tree,
|
||||||
retry,
|
retry,
|
||||||
@@ -305,13 +306,33 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
|
|||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def auto_config_device() -> str:
    """Pick the device platform to use, defaulting to CPU when detection fails.

    Returns:
        Whatever ``get_device()`` reports, or ``"cpu"`` if that call raises
        ``RuntimeError`` or ``ImportError`` (a warning is printed first).
    """
    try:
        return get_device()
    except (RuntimeError, ImportError) as err:
        # Detection failed; report why and use the CPU fallback.
        print(f"Warning: {err} - Falling back to CPU")
        return "cpu"
|
||||||
|
|
||||||
|
|
||||||
def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
|
def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
|
||||||
parser.add_argument("--parallel", type=int, default=64)
|
parser.add_argument("--parallel", type=int, default=64)
|
||||||
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
||||||
parser.add_argument("--port", type=int, default=30000)
|
parser.add_argument("--port", type=int, default=30000)
|
||||||
parser.add_argument("--backend", type=str, default="srt")
|
parser.add_argument("--backend", type=str, default="srt")
|
||||||
|
parser.add_argument(
|
||||||
|
"--device",
|
||||||
|
type=str,
|
||||||
|
default="auto",
|
||||||
|
choices=["auto", "cuda", "rocm", "cpu"],
|
||||||
|
help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
|
||||||
|
)
|
||||||
parser.add_argument("--result-file", type=str, default="result.jsonl")
|
parser.add_argument("--result-file", type=str, default="result.jsonl")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
@@ -397,11 +418,25 @@ def popen_launch_server(
|
|||||||
base_url: str,
|
base_url: str,
|
||||||
timeout: float,
|
timeout: float,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
other_args: list[str] = (),
|
other_args: list[str] = [],
|
||||||
env: Optional[dict] = None,
|
env: Optional[dict] = None,
|
||||||
return_stdout_stderr: Optional[tuple] = None,
|
return_stdout_stderr: Optional[tuple] = None,
|
||||||
|
device: str = "auto",
|
||||||
pd_separated: bool = False,
|
pd_separated: bool = False,
|
||||||
):
|
):
|
||||||
|
"""Launch a server process with automatic device detection.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||||
|
If "auto", will detect available platforms automatically.
|
||||||
|
"""
|
||||||
|
# Auto-detect device if needed
|
||||||
|
if device == "auto":
|
||||||
|
device = auto_config_device()
|
||||||
|
print(f"Auto-configed device: {device}", flush=True)
|
||||||
|
other_args = list(other_args)
|
||||||
|
other_args += ["--device", str(device)]
|
||||||
|
|
||||||
_, host, port = base_url.split(":")
|
_, host, port = base_url.split(":")
|
||||||
host = host[2:]
|
host = host[2:]
|
||||||
|
|
||||||
@@ -457,6 +492,15 @@ def popen_launch_server(
|
|||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
with requests.Session() as session:
|
with requests.Session() as session:
|
||||||
while time.perf_counter() - start_time < timeout:
|
while time.perf_counter() - start_time < timeout:
|
||||||
|
|
||||||
|
return_code = process.poll()
|
||||||
|
if return_code is not None:
|
||||||
|
# Server failed to start (non-zero exit code) or crashed
|
||||||
|
raise Exception(
|
||||||
|
f"Server process exited with code {return_code}. "
|
||||||
|
"Check server logs for errors."
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json; charset=utf-8",
|
"Content-Type": "application/json; charset=utf-8",
|
||||||
@@ -627,6 +671,7 @@ def get_benchmark_args(
|
|||||||
disable_stream=False,
|
disable_stream=False,
|
||||||
disable_ignore_eos=False,
|
disable_ignore_eos=False,
|
||||||
seed: int = 0,
|
seed: int = 0,
|
||||||
|
device="auto",
|
||||||
pd_separated: bool = False,
|
pd_separated: bool = False,
|
||||||
):
|
):
|
||||||
return SimpleNamespace(
|
return SimpleNamespace(
|
||||||
@@ -657,6 +702,7 @@ def get_benchmark_args(
|
|||||||
profile=None,
|
profile=None,
|
||||||
lora_name=None,
|
lora_name=None,
|
||||||
prompt_suffix="",
|
prompt_suffix="",
|
||||||
|
device=device,
|
||||||
pd_separated=pd_separated,
|
pd_separated=pd_separated,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -676,7 +722,10 @@ def run_bench_serving(
|
|||||||
disable_ignore_eos=False,
|
disable_ignore_eos=False,
|
||||||
need_warmup=False,
|
need_warmup=False,
|
||||||
seed: int = 0,
|
seed: int = 0,
|
||||||
|
device="auto",
|
||||||
):
|
):
|
||||||
|
if device == "auto":
|
||||||
|
device = auto_config_device()
|
||||||
# Launch the server
|
# Launch the server
|
||||||
base_url = DEFAULT_URL_FOR_TEST
|
base_url = DEFAULT_URL_FOR_TEST
|
||||||
process = popen_launch_server(
|
process = popen_launch_server(
|
||||||
@@ -700,6 +749,7 @@ def run_bench_serving(
|
|||||||
disable_stream=disable_stream,
|
disable_stream=disable_stream,
|
||||||
disable_ignore_eos=disable_ignore_eos,
|
disable_ignore_eos=disable_ignore_eos,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
|
device=device,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -750,6 +800,18 @@ def run_bench_serving_multi(
|
|||||||
|
|
||||||
|
|
||||||
def run_bench_one_batch(model, other_args):
|
def run_bench_one_batch(model, other_args):
|
||||||
|
"""Launch a offline process with automatic device detection.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||||
|
If "auto", will detect available platforms automatically.
|
||||||
|
"""
|
||||||
|
# Auto-detect device if needed
|
||||||
|
|
||||||
|
device = auto_config_device()
|
||||||
|
print(f"Auto-configed device: {device}", flush=True)
|
||||||
|
other_args += ["--device", str(device)]
|
||||||
|
|
||||||
command = [
|
command = [
|
||||||
"python3",
|
"python3",
|
||||||
"-m",
|
"-m",
|
||||||
|
|||||||
@@ -127,6 +127,16 @@ suites = {
|
|||||||
"per-commit-8-gpu-amd": [
|
"per-commit-8-gpu-amd": [
|
||||||
TestFile("test_full_deepseek_v3.py", 250),
|
TestFile("test_full_deepseek_v3.py", 250),
|
||||||
],
|
],
|
||||||
|
"per-commit-cpu": [
|
||||||
|
TestFile("cpu/test_activation.py"),
|
||||||
|
TestFile("cpu/test_decode.py"),
|
||||||
|
TestFile("cpu/test_extend.py"),
|
||||||
|
TestFile("cpu/test_gemm.py"),
|
||||||
|
TestFile("cpu/test_moe.py"),
|
||||||
|
TestFile("cpu/test_norm.py"),
|
||||||
|
TestFile("cpu/test_qkv_proj_with_rope.py"),
|
||||||
|
TestFile("cpu/test_shared_expert.py"),
|
||||||
|
],
|
||||||
"nightly": [
|
"nightly": [
|
||||||
TestFile("test_nightly_gsm8k_eval.py"),
|
TestFile("test_nightly_gsm8k_eval.py"),
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user