Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

This commit is contained in:
Lianmin Zheng
2025-08-08 19:56:50 -07:00
committed by GitHub
parent 23f2afb2ce
commit 706bd69cc5
24 changed files with 201 additions and 340 deletions

View File

@@ -8,8 +8,6 @@ suites = {
TestFile("test_srt_backend.py"),
# Skip this due to some OPENAI_API_KEY issues
# "test_openai_backend.py",
TestFile("test_separate_reasoning.py"),
TestFile("test_separate_reasoning_execution.py"),
],
}

View File

@@ -38,7 +38,6 @@ suites = {
TestFile("openai_server/basic/test_serving_embedding.py", 10),
TestFile("openai_server/basic/test_openai_embedding.py", 141),
TestFile("openai_server/basic/test_openai_server.py", 149),
TestFile("openai_server/features/test_cache_report.py", 100),
TestFile("openai_server/features/test_enable_thinking.py", 70),
TestFile("openai_server/features/test_json_constrained.py", 98),
TestFile("openai_server/features/test_json_mode.py", 90),
@@ -103,7 +102,6 @@ suites = {
TestFile("test_update_weights_from_disk.py", 114),
TestFile("test_update_weights_from_tensor.py", 48),
TestFile("test_utils_update_weights.py", 48),
TestFile("test_vertex_endpoint.py", 31),
TestFile("test_vision_chunked_prefill.py", 175),
TestFile("test_vlm_input_format.py", 300),
TestFile("test_vision_openai_server_a.py", 584),
@@ -167,7 +165,6 @@ suites = {
TestFile("models/lora/test_lora_tp.py", 116),
TestFile("test_data_parallelism.py", 73),
TestFile("test_dp_attention.py", 277),
TestFile("test_mla_tp.py", 170),
TestFile("test_patch_torch.py", 19),
TestFile("test_update_weights_from_distributed.py", 103),
TestFile("test_release_memory_occupation.py", 127),
@@ -175,7 +172,6 @@ suites = {
"per-commit-2-gpu-amd": [
TestFile("models/lora/test_lora_tp.py", 116),
TestFile("test_data_parallelism.py", 73),
TestFile("test_mla_tp.py", 170),
TestFile("test_patch_torch.py", 19),
TestFile("test_update_weights_from_distributed.py", 103),
],

View File

@@ -15,7 +15,7 @@ from sglang.test.test_utils import (
TEST_MODEL_MATRIX = {
"Qwen/Qwen2.5-7B-Instruct": {
"accuracy": 0.85,
"accuracy": 0.84,
"latency": 150,
"output_throughput": 30,
},

View File

@@ -1,7 +1,6 @@
import asyncio
import itertools
import unittest
from random import random, uniform
import requests

View File

@@ -149,66 +149,5 @@ class TestDeepseekV3MTP(CustomTestCase):
self.assertGreater(avg_spec_accept_length, 2.5)
# compatible with old APIs
class TestDeepseekV3MTPWithDraft(CustomTestCase):
    """GSM8K check against a server launched with ``--speculative-draft``.

    The server is started once for the whole class with EAGLE speculative
    decoding options and torn down after the last test.
    """

    @classmethod
    def setUpClass(cls):
        """Start the server process shared by every test in this class."""
        cls.model = "lmsys/sglang-ci-dsv3-test"
        cls.base_url = DEFAULT_URL_FOR_TEST

        # Flag/value pairs are kept adjacent; the flat list is handed to the
        # launcher unchanged.
        launch_flags = []
        launch_flags += ["--trust-remote-code"]
        launch_flags += ["--cuda-graph-max-bs", "2"]
        launch_flags += ["--disable-radix"]
        launch_flags += ["--enable-torch-compile"]
        launch_flags += ["--torch-compile-max-bs", "1"]
        launch_flags += ["--speculative-algorithm", "EAGLE"]
        launch_flags += ["--speculative-draft", "lmsys/sglang-ci-dsv3-test-NextN"]
        launch_flags += ["--speculative-num-steps", "2"]
        launch_flags += ["--speculative-eagle-topk", "4"]
        launch_flags += ["--speculative-num-draft-tokens", "4"]

        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process together with its child processes."""
        server_pid = cls.process.pid
        kill_process_tree(server_pid)

    def test_gsm8k(self):
        """Assert GSM8K accuracy > 0.60 and average spec accept length > 2.5."""
        server_url = self.base_url
        requests.get(server_url + "/flush_cache")

        # The eval client receives host and port separately; the port is the
        # text after the last ':' of the base URL.
        port_text = server_url.split(":")[-1]
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(port_text),
        )

        metrics = run_eval_few_shot_gsm8k(eval_args)
        print(metrics)
        self.assertGreater(metrics["accuracy"], 0.60)

        info_response = requests.get(server_url + "/get_server_info")
        first_state = info_response.json()["internal_states"][0]
        # Name kept as-is: the f-string below prints the variable name too.
        avg_spec_accept_length = first_state["avg_spec_accept_length"]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, 2.5)
# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

View File

@@ -25,7 +25,7 @@ class TestFlashinferMLA(CustomTestCase):
[
"--enable-torch-compile",
"--cuda-graph-max-bs",
"2",
"4",
"--attention-backend",
"flashinfer",
]
@@ -68,7 +68,6 @@ class TestFlashinferMLAMTP(CustomTestCase):
[
"--cuda-graph-max-bs",
"4",
"--disable-radix",
"--enable-torch-compile",
"--torch-compile-max-bs",
"1",

View File

@@ -10,6 +10,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
@@ -112,6 +113,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
self.assertGreater(avg_spec_accept_length, 2.5)
@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
class TestMLADeepseekV3BlockInt8(CustomTestCase):
@classmethod
def setUpClass(cls):