Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

2025-08-08 19:56:50 -07:00
parent 23f2afb2ce
commit 706bd69cc5
24 changed files with 201 additions and 340 deletions
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -8,8 +8,6 @@ suites = {
        TestFile("test_srt_backend.py"),
        # Skip this due to some OPENAI_API_KEY issues
        # "test_openai_backend.py",
-        TestFile("test_separate_reasoning.py"),
-        TestFile("test_separate_reasoning_execution.py"),
    ],
 }

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -38,7 +38,6 @@ suites = {
        TestFile("openai_server/basic/test_serving_embedding.py", 10),
        TestFile("openai_server/basic/test_openai_embedding.py", 141),
        TestFile("openai_server/basic/test_openai_server.py", 149),
-        TestFile("openai_server/features/test_cache_report.py", 100),
        TestFile("openai_server/features/test_enable_thinking.py", 70),
        TestFile("openai_server/features/test_json_constrained.py", 98),
        TestFile("openai_server/features/test_json_mode.py", 90),
@@ -103,7 +102,6 @@ suites = {
        TestFile("test_update_weights_from_disk.py", 114),
        TestFile("test_update_weights_from_tensor.py", 48),
        TestFile("test_utils_update_weights.py", 48),
-        TestFile("test_vertex_endpoint.py", 31),
        TestFile("test_vision_chunked_prefill.py", 175),
        TestFile("test_vlm_input_format.py", 300),
        TestFile("test_vision_openai_server_a.py", 584),
@@ -167,7 +165,6 @@ suites = {
        TestFile("models/lora/test_lora_tp.py", 116),
        TestFile("test_data_parallelism.py", 73),
        TestFile("test_dp_attention.py", 277),
-        TestFile("test_mla_tp.py", 170),
        TestFile("test_patch_torch.py", 19),
        TestFile("test_update_weights_from_distributed.py", 103),
        TestFile("test_release_memory_occupation.py", 127),
@@ -175,7 +172,6 @@ suites = {
    "per-commit-2-gpu-amd": [
        TestFile("models/lora/test_lora_tp.py", 116),
        TestFile("test_data_parallelism.py", 73),
-        TestFile("test_mla_tp.py", 170),
        TestFile("test_patch_torch.py", 19),
        TestFile("test_update_weights_from_distributed.py", 103),
    ],
--- a/test/srt/test_ascend_tp1_bf16.py
+++ b/test/srt/test_ascend_tp1_bf16.py
@@ -15,7 +15,7 @@ from sglang.test.test_utils import (

 TEST_MODEL_MATRIX = {
    "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
+        "accuracy": 0.84,
        "latency": 150,
        "output_throughput": 30,
    },
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,7 +1,6 @@
 import asyncio
 import itertools
 import unittest
-from random import random, uniform

 import requests

--- a/test/srt/test_mla_deepseek_v3.py
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -149,66 +149,5 @@ class TestDeepseekV3MTP(CustomTestCase):
        self.assertGreater(avg_spec_accept_length, 2.5)


-# compatible with old APIs
-class TestDeepseekV3MTPWithDraft(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "lmsys/sglang-ci-dsv3-test"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        other_args = [
-            "--trust-remote-code",
-            "--cuda-graph-max-bs",
-            "2",
-            "--disable-radix",
-            "--enable-torch-compile",
-            "--torch-compile-max-bs",
-            "1",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-draft",
-            "lmsys/sglang-ci-dsv3-test-NextN",
-            "--speculative-num-steps",
-            "2",
-            "--speculative-eagle-topk",
-            "4",
-            "--speculative-num-draft-tokens",
-            "4",
-        ]
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
-
-        server_info = requests.get(self.base_url + "/get_server_info")
-        avg_spec_accept_length = server_info.json()["internal_states"][0][
-            "avg_spec_accept_length"
-        ]
-        print(f"{avg_spec_accept_length=}")
-        self.assertGreater(avg_spec_accept_length, 2.5)
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_mla_flashinfer.py
+++ b/test/srt/test_mla_flashinfer.py
@@ -25,7 +25,7 @@ class TestFlashinferMLA(CustomTestCase):
                [
                    "--enable-torch-compile",
                    "--cuda-graph-max-bs",
-                    "2",
+                    "4",
                    "--attention-backend",
                    "flashinfer",
                ]
@@ -68,7 +68,6 @@ class TestFlashinferMLAMTP(CustomTestCase):
                [
                    "--cuda-graph-max-bs",
                    "4",
-                    "--disable-radix",
                    "--enable-torch-compile",
                    "--torch-compile-max-bs",
                    "1",
--- a/test/srt/test_mla_int8_deepseek_v3.py
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -10,6 +10,7 @@ from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
+    is_in_ci,
    popen_launch_server,
 )

@@ -112,6 +113,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
        self.assertGreater(avg_spec_accept_length, 2.5)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
 class TestMLADeepseekV3BlockInt8(CustomTestCase):
    @classmethod
    def setUpClass(cls):