[CPU] enable CI for PRs, add Dockerfile and auto build task (#6458)
Co-authored-by: diwei sun <diwei.sun@intel.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
@@ -89,7 +89,7 @@ srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||
# To install vllm for CPU, please follow the instructions here:
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
|
||||
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
||||
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ from sglang.lang.backend.openai import OpenAI
|
||||
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
||||
from sglang.srt.utils import (
|
||||
get_bool_env_var,
|
||||
get_device,
|
||||
is_port_available,
|
||||
kill_process_tree,
|
||||
retry,
|
||||
@@ -305,13 +306,33 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
|
||||
return args
|
||||
|
||||
|
||||
def auto_config_device() -> str:
    """Resolve the device platform to run on.

    Asks ``get_device()`` for the device. If that call raises
    ``RuntimeError`` or ``ImportError``, a warning is printed and the
    string ``"cpu"`` is used as the fallback.

    Returns:
        The value returned by ``get_device()``, or ``"cpu"`` on failure.
    """
    try:
        return get_device()
    except (RuntimeError, ImportError) as e:
        # Detection failed; report why and degrade to CPU instead of raising.
        print(f"Warning: {e} - Falling back to CPU")
        return "cpu"
|
||||
|
||||
|
||||
def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
    """Register the shared sglang command-line options and parse them.

    Adds ``--parallel``, ``--host``, ``--port``, ``--backend``, ``--device``
    and ``--result-file`` to ``parser``, then parses the process command line.

    Args:
        parser: The argument parser to extend; it is modified in place.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    device_choices = ["auto", "cuda", "rocm", "cpu"]
    device_help = (
        "Device type (auto/cuda/rocm/cpu). Auto will detect available platforms"
    )

    parser.add_argument("--parallel", default=64, type=int)
    parser.add_argument("--host", default="http://127.0.0.1", type=str)
    parser.add_argument("--port", default=30000, type=int)
    parser.add_argument("--backend", default="srt", type=str)
    parser.add_argument(
        "--device",
        default="auto",
        type=str,
        choices=device_choices,
        help=device_help,
    )
    parser.add_argument("--result-file", default="result.jsonl", type=str)

    return parser.parse_args()
|
||||
|
||||
|
||||
@@ -397,11 +418,25 @@ def popen_launch_server(
|
||||
base_url: str,
|
||||
timeout: float,
|
||||
api_key: Optional[str] = None,
|
||||
other_args: list[str] = (),
|
||||
other_args: list[str] = [],
|
||||
env: Optional[dict] = None,
|
||||
return_stdout_stderr: Optional[tuple] = None,
|
||||
device: str = "auto",
|
||||
pd_separated: bool = False,
|
||||
):
|
||||
"""Launch a server process with automatic device detection.
|
||||
|
||||
Args:
|
||||
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||
If "auto", will detect available platforms automatically.
|
||||
"""
|
||||
# Auto-detect device if needed
|
||||
if device == "auto":
|
||||
device = auto_config_device()
|
||||
print(f"Auto-configed device: {device}", flush=True)
|
||||
other_args = list(other_args)
|
||||
other_args += ["--device", str(device)]
|
||||
|
||||
_, host, port = base_url.split(":")
|
||||
host = host[2:]
|
||||
|
||||
@@ -457,6 +492,15 @@ def popen_launch_server(
|
||||
start_time = time.perf_counter()
|
||||
with requests.Session() as session:
|
||||
while time.perf_counter() - start_time < timeout:
|
||||
|
||||
return_code = process.poll()
|
||||
if return_code is not None:
|
||||
# Server failed to start (non-zero exit code) or crashed
|
||||
raise Exception(
|
||||
f"Server process exited with code {return_code}. "
|
||||
"Check server logs for errors."
|
||||
)
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"Content-Type": "application/json; charset=utf-8",
|
||||
@@ -627,6 +671,7 @@ def get_benchmark_args(
|
||||
disable_stream=False,
|
||||
disable_ignore_eos=False,
|
||||
seed: int = 0,
|
||||
device="auto",
|
||||
pd_separated: bool = False,
|
||||
):
|
||||
return SimpleNamespace(
|
||||
@@ -657,6 +702,7 @@ def get_benchmark_args(
|
||||
profile=None,
|
||||
lora_name=None,
|
||||
prompt_suffix="",
|
||||
device=device,
|
||||
pd_separated=pd_separated,
|
||||
)
|
||||
|
||||
@@ -676,7 +722,10 @@ def run_bench_serving(
|
||||
disable_ignore_eos=False,
|
||||
need_warmup=False,
|
||||
seed: int = 0,
|
||||
device="auto",
|
||||
):
|
||||
if device == "auto":
|
||||
device = auto_config_device()
|
||||
# Launch the server
|
||||
base_url = DEFAULT_URL_FOR_TEST
|
||||
process = popen_launch_server(
|
||||
@@ -700,6 +749,7 @@ def run_bench_serving(
|
||||
disable_stream=disable_stream,
|
||||
disable_ignore_eos=disable_ignore_eos,
|
||||
seed=seed,
|
||||
device=device,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -750,6 +800,18 @@ def run_bench_serving_multi(
|
||||
|
||||
|
||||
def run_bench_one_batch(model, other_args):
|
||||
"""Launch an offline process with automatic device detection.
|
||||
|
||||
Args:
|
||||
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
||||
If "auto", will detect available platforms automatically.
|
||||
"""
|
||||
# Auto-detect device if needed
|
||||
|
||||
device = auto_config_device()
|
||||
print(f"Auto-configed device: {device}", flush=True)
|
||||
other_args += ["--device", str(device)]
|
||||
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
|
||||
Reference in New Issue
Block a user