[CI] Split test cases in CI for better load balancing (#2180)

2024-11-25 04:58:16 -08:00
parent 538fa0ae13
commit 254fd130e2
6 changed files with 37 additions and 23 deletions
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -6,3 +6,7 @@ pip install --upgrade pip
 pip install -e "python[all]"
 pip install transformers==4.45.2 sentence_transformers accelerate peft
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
 # for compling eagle kernels
 pip install cutex
 # for compling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -17,7 +17,8 @@ suites = {
        "test_json_constrained.py",
        "test_large_max_new_tokens.py",
        "test_metrics.py",
-        "test_non_overlap_scheduler.py",
+        "test_no_chunked_prefill.py",
        "test_no_overlap_scheduler.py",
        "test_openai_server.py",
        "test_pytorch_sampling_backend.py",
        "test_radix_attention.py",
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -4,12 +4,7 @@ python3 -m unittest test_chunked_prefill.TestChunkedPrefill.test_mixed_chunked_p
 import unittest
-from sglang.test.test_utils import (
+from sglang.test.test_utils import run_mmlu_test, run_mulit_request_test
    DEFAULT_MODEL_NAME_FOR_TEST,
    run_bench_serving,
    run_mmlu_test,
    run_mulit_request_test,
 )
 class TestChunkedPrefill(unittest.TestCase):
@@ -25,21 +20,6 @@ class TestChunkedPrefill(unittest.TestCase):
    def test_mixed_chunked_prefill_without_radix_cache(self):
        run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=True)
    def test_no_chunked_prefill(self):
        run_mmlu_test(
            disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1
        )
    def test_no_chunked_prefill_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=10,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache", "--chunked-prefill-size", "-1"],
        )
        assert res["completed"] == 10
    def test_mixed_chunked_prefill_multi_requests(self):
        run_mulit_request_test(
            enable_mixed_chunk=True,
--- a/test/srt/test_no_chunked_prefill.py
+++ b/test/srt/test_no_chunked_prefill.py
@@ -0,0 +1,29 @@
 import unittest
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    run_bench_serving,
    run_mmlu_test,
 )
 class TestNoChunkedPrefill(unittest.TestCase):
    def test_no_chunked_prefill(self):
        run_mmlu_test(
            disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1
        )
    def test_no_chunked_prefill_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=10,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache", "--chunked-prefill-size", "-1"],
        )
        assert res["completed"] == 10
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_non_overlap_scheduler.py
+++ b/test/srt/test_non_overlap_scheduler.py
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -211,7 +211,7 @@ class TestSRTEndpoint(unittest.TestCase):
        diff = np.abs(output_logprobs - output_logprobs_score)
        max_diff = np.max(diff)
-        self.assertLess(max_diff, 0.2)
+        self.assertLess(max_diff, 0.25)
    def test_get_server_info(self):
        response = requests.get(self.base_url + "/get_server_info")