[CI] Fix ci tests (#5769)

This commit is contained in:
Lianmin Zheng
2025-04-27 07:18:10 -07:00
committed by GitHub
parent 35ca04d2fa
commit 621e96bf9b
18 changed files with 126 additions and 295 deletions

View File

@@ -16,23 +16,29 @@ suites = {
TestFile("models/lora/test_lora.py", 76),
TestFile("models/lora/test_lora_backend.py", 99),
TestFile("models/lora/test_multi_lora_backend.py", 60),
TestFile("models/test_embedding_models.py", 35),
TestFile("models/test_embedding_models.py", 184),
TestFile("models/test_clip_models.py", 52),
TestFile("models/test_compressed_tensors_models.py", 42),
TestFile("models/test_generation_models.py", 103),
TestFile("models/test_gme_qwen_models.py", 45),
# TestFile("models/test_grok_models.py", 60), # Disabled due to illegal memory access
TestFile("models/test_qwen_models.py", 82),
TestFile("models/test_compressed_tensors_models.py", 100),
TestFile("models/test_reward_models.py", 83),
TestFile("models/test_gme_qwen_models.py", 45),
TestFile("models/test_clip_models.py", 52),
TestFile("models/test_vlm_models.py", 581),
TestFile("models/test_reward_models.py", 132),
TestFile("models/test_vlm_models.py", 317),
TestFile("test_abort.py", 51),
TestFile("test_block_int8.py", 22),
TestFile("test_create_kvindices.py", 2),
TestFile("test_chunked_prefill.py", 285),
TestFile("test_eagle_infer.py", 584),
TestFile("test_ebnf_constrained.py"),
TestFile("test_fa3.py", 376),
TestFile("test_fp8_kernel.py", 8),
TestFile("test_ebnf_constrained.py", 108),
TestFile("test_embedding_openai_server.py", 141),
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_fa3.py", 376),
TestFile("test_fim_completion.py", 40),
TestFile("test_fp8_kernel.py", 8),
TestFile("test_fused_moe.py", 30),
TestFile("test_hicache.py", 116),
TestFile("test_hicache_mla.py", 254),
TestFile("test_hidden_states.py", 55),
TestFile("test_int8_kernel.py", 8),
TestFile("test_input_embeddings.py", 38),
@@ -41,11 +47,11 @@ suites = {
TestFile("test_metrics.py", 32),
TestFile("test_mla.py", 242),
TestFile("test_mla_deepseek_v3.py", 221),
TestFile("test_mla_int8_deepseek_v3.py", 674),
TestFile("test_mla_int8_deepseek_v3.py", 389),
TestFile("test_mla_flashinfer.py", 395),
TestFile("test_mla_fp8.py", 153),
TestFile("test_no_chunked_prefill.py", 126),
TestFile("test_no_overlap_scheduler.py", 262),
TestFile("test_no_chunked_prefill.py", 108),
TestFile("test_no_overlap_scheduler.py", 216),
TestFile("test_openai_server.py", 149),
TestFile("test_penalty.py", 41),
TestFile("test_page_size.py", 60),
@@ -59,27 +65,21 @@ suites = {
TestFile("test_server_args.py", 1),
TestFile("test_skip_tokenizer_init.py", 117),
TestFile("test_srt_engine.py", 237),
TestFile("test_srt_endpoint.py", 94),
TestFile("test_srt_endpoint.py", 130),
TestFile("test_torch_compile.py", 76),
TestFile("test_torch_compile_moe.py", 235),
TestFile("test_torch_compile_moe.py", 172),
TestFile("test_torch_native_attention_backend.py", 123),
TestFile("test_torchao.py", 70),
TestFile("test_triton_attention_kernels.py", 4),
TestFile("test_triton_attention_backend.py", 134),
TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
TestFile("test_update_weights_from_disk.py", 114),
TestFile("test_update_weights_from_tensor.py", 48),
TestFile("test_vertex_endpoint.py", 31),
TestFile("test_vision_chunked_prefill.py", 119),
TestFile("test_vision_chunked_prefill.py", 175),
TestFile("test_vlm_accuracy.py", 60),
TestFile("test_vision_openai_server.py", 637),
TestFile("test_fim_completion.py", 40),
TestFile("test_w8a8_quantization.py", 46),
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_create_kvindices.py", 2),
TestFile("test_hicache.py", 116),
TestFile("test_hicache_mla.py", 254),
TestFile("test_fused_moe.py", 30),
TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
],
"per-commit-2-gpu": [
TestFile("models/lora/test_lora_tp.py", 116),

View File

@@ -29,13 +29,9 @@ class TestBenchOneBatch(CustomTestCase):
DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
)
use_vllm_custom_allreduce = get_bool_env_var(
"USE_VLLM_CUSTOM_ALLREDUCE", default="false"
)
if is_in_ci():
write_github_step_summary(
f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
f"### test_moe_tp2_bs1\n"
f"output_throughput : {output_throughput:.2f} token/s\n"
)
self.assertGreater(output_throughput, 124)

View File

@@ -3,8 +3,8 @@ import unittest
from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
DEFAULT_FP8_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_FP8,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
CustomTestCase,
is_in_ci,
@@ -28,7 +28,7 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3350)
self.assertGreater(res["output_throughput"], 3800)
def test_offline_throughput_non_stream_small_batch_size(self):
res = run_bench_serving(
@@ -48,9 +48,7 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_non_stream_small_batch_size\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
# There is a regression with torch 2.5
# This number was 950 for torch 2.4
self.assertGreater(res["output_throughput"], 1000)
self.assertGreater(res["output_throughput"], 1050)
def test_offline_throughput_without_radix_cache(self):
res = run_bench_serving(
@@ -65,7 +63,7 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3350)
self.assertGreater(res["output_throughput"], 3800)
def test_offline_throughput_without_chunked_prefill(self):
res = run_bench_serving(
@@ -100,11 +98,11 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_with_triton_attention_backend\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3450)
self.assertGreater(res["output_throughput"], 3600)
def test_offline_throughput_default_fp8(self):
res = run_bench_serving(
model=DEFAULT_FP8_MODEL_NAME_FOR_TEST,
model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
num_prompts=500,
request_rate=float("inf"),
other_server_args=[],
@@ -115,7 +113,7 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_default_fp8\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3900)
self.assertGreater(res["output_throughput"], 4200)
def test_online_latency_default(self):
res = run_bench_serving(
@@ -166,8 +164,8 @@ class TestBenchServing(CustomTestCase):
f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length : {res["accept_length"]:.2f} \n'
)
self.assertLess(res["median_e2e_latency_ms"], 900)
self.assertGreater(res["accept_length"], 2.99)
self.assertLess(res["median_e2e_latency_ms"], 800)
self.assertGreater(res["accept_length"], 3.0)
def test_moe_offline_throughput_default(self):
res = run_bench_serving(

View File

@@ -4,8 +4,8 @@ from types import SimpleNamespace
from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST,
DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8,
DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
@@ -17,7 +17,7 @@ from sglang.test.test_utils import (
class TestEvalFP8Accuracy(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
cls.model = DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8
cls.base_url = DEFAULT_URL_FOR_TEST
cls.process = popen_launch_server(
cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
@@ -76,7 +76,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
def test_mmlu_offline_only(self):
"""Test with offline quantization only."""
self._run_test(
model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
other_args=[],
expected_score=0.64,
)
@@ -84,7 +84,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
def test_mmlu_offline_and_online_override(self):
"""Test with both offline and online quantization."""
self._run_test(
model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
other_args=["--quantization", "w8a8_fp8"],
# inference will use sgl kernel w/ online quant override
# we observed that the accuracy is higher than offline only

View File

@@ -48,7 +48,7 @@ if OFFLINE_MODE:
DEFAULT_SERVER_ARGS = [
"--trust-remote-code",
"--cuda-graph-max-bs",
"4",
"8",
"--attention-backend",
"fa3",
]

View File

@@ -6,8 +6,8 @@ import torch
from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST,
DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION,
DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
@@ -49,10 +49,10 @@ class TestEvalFP8ModelOptQuantAccuracy(CustomTestCase):
def test_mmlu_offline_only(self):
"""Test with offline quantization only."""
self._run_test(
model=DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST,
model=DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
other_args=[
"--revision",
DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION,
DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
],
expected_score=0.64,
)

View File

@@ -14,7 +14,6 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
write_github_step_summary,
@@ -45,25 +44,10 @@ def parse_models(model_string):
return [model.strip() for model in model_string.split(",") if model.strip()]
def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
def popen_launch_server_wrapper(base_url, model, is_tp2):
other_args = ["--log-level-http", "warning", "--trust-remote-code"]
if is_fp8:
if "Llama-3" in model or "gemma-2" in model:
other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
elif "Qwen2-72B-Instruct-FP8" in model:
other_args.extend(["--quantization", "fp8"])
elif "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" in model:
other_args.extend([])
else:
other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
if is_tp2:
other_args.extend(["--tp", "2"])
if "DeepSeek" in model:
other_args.extend(["--mem-frac", "0.85"])
if "AWQ" in model:
other_args.extend(["--quantization", "awq"])
elif "GPTQ" in model:
other_args.extend(["--quantization", "gptq"])
process = popen_launch_server(
model,
@@ -150,9 +134,7 @@ class TestNightlyGsm8KEval(unittest.TestCase):
for model_group, is_fp8, is_tp2 in self.model_groups:
for model in model_group:
with self.subTest(model=model):
process = popen_launch_server_wrapper(
self.base_url, model, is_fp8, is_tp2
)
process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
args = SimpleNamespace(
base_url=self.base_url,

View File

@@ -1,105 +0,0 @@
import os
import shutil
import signal
import subprocess
import unittest
from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
)
class TestNightlyHumanEval(CustomTestCase):
    """Run the evalplus HumanEval benchmark against a freshly launched server.

    For every selected model a server is started with
    ``popen_launch_server_wrapper`` and ``evalplus.evaluate`` is executed
    against it as a subprocess. The server and evaluator handles are stored
    on the class (not on the instance) so that ``tearDownClass`` can actually
    see and kill them.
    """

    @classmethod
    def setUpClass(cls):
        """Select the model groups to evaluate and reset the process handles."""
        # Each entry is (list of model names, is_fp8, is_tp2).
        if is_in_ci():
            cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)]
        else:
            cls.model_groups = [
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
                (
                    parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1),
                    True,
                    False,
                ),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
            ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
        """Kill the server and evaluator process trees, if any, and forget them."""
        if cls.process:
            kill_process_tree(cls.process.pid)
            # Reset so a later call does not try to kill a stale pid again.
            cls.process = None
        if cls.eval_process:
            kill_process_tree(cls.eval_process.pid)
            cls.eval_process = None

    @staticmethod
    def _terminate_process_group(proc):
        """Send SIGTERM to ``proc``'s process group and try to reap it."""
        try:
            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
        except ProcessLookupError:
            # The process (group) is already gone; nothing left to signal.
            return
        try:
            # Drain the pipes and collect the exit status after the signal.
            proc.communicate(timeout=10)
        except subprocess.TimeoutExpired:
            # Still alive: tearDownClass kills the whole tree afterwards.
            pass

    def run_evalplus(self, model):
        """Run ``evalplus.evaluate`` on HumanEval for ``model`` and print its output.

        Failures are reported by printing only (best effort); this method does
        not raise on a non-zero exit status, a timeout, or a launch error.
        """
        print("Delete evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)

        # Point the evaluator at the same address the server was launched on
        # instead of a hard-coded host/port.
        openai_base_url = self.base_url.rstrip("/") + "/v1"
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            openai_base_url,
            "--greedy",
        ]

        proc = None
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                # Own session/process group so killpg reaches any children.
                preexec_fn=os.setsid,
            )
            # Store on the class: tearDownClass reads cls.eval_process.
            type(self).eval_process = proc

            stdout, stderr = proc.communicate(timeout=600)

            if proc.returncode != 0:
                print(f"Fail to human eval model={model} err={stderr}")

            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            self._terminate_process_group(proc)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model} {str(e)}")
            if proc is not None:
                self._terminate_process_group(proc)

    def test_human_eval_all_models(self):
        """Launch a server for each Llama model, evaluate it, then shut it down."""
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" not in model:
                    continue
                with self.subTest(model=model):
                    try:
                        # Store on the class: tearDownClass reads cls.process.
                        type(self).process = popen_launch_server_wrapper(
                            self.base_url, model, is_fp8, is_tp2
                        )
                        self.run_evalplus(model)
                    finally:
                        # Always stop this model's server before the next one.
                        self.tearDownClass()
if __name__ == "__main__":
unittest.main()

View File

@@ -1,47 +0,0 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestEvalAccuracyLarge(CustomTestCase):
    """Large accuracy check: run the "math" eval against a launched server."""

    @classmethod
    def setUpClass(cls):
        """Start one server for the whole class using the default test model."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_options = {
            "timeout": DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            "other_args": ["--log-level-http", "warning"],
        }
        cls.process = popen_launch_server(cls.model, cls.base_url, **launch_options)

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree started in ``setUpClass``."""
        server_pid = cls.process.pid
        kill_process_tree(server_pid)

    def test_math(self):
        """The "math" eval score must reach the reference score minus a margin."""
        eval_config = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="math",
            num_examples=5000,
            num_threads=1024,
        )
        score = run_eval(eval_config)["score"]

        # -2% to account for sampling variance
        minimum_score = 0.519 - 0.02
        self.assertGreaterEqual(score, minimum_score)
if __name__ == "__main__":
unittest.main()