From 67008f4b320d8950803fcb14b1e5dc6e80bf75e4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 13 Jan 2025 03:55:33 -0800 Subject: [PATCH] Use only one GPU for MLA CI tests (#2858) --- .github/workflows/pr-test.yml | 8 +++----- test/srt/run_suite.py | 2 ++ test/srt/test_mla.py | 35 ++++++++++++++++++++++++++++++++++- test/srt/test_mla_fp8.py | 2 -- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f1c7871de..274c97c63 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -87,18 +87,16 @@ jobs: run: | bash scripts/ci_install_dependency.sh - - name: Evaluate data parallelism accuracy (DP=2) + - name: Test data parallelism (DP=2) timeout-minutes: 10 run: | cd test/srt python3 test_data_parallelism.py - - name: Evaluate MLA accuracy (TP=2) + - name: Test data parallelism attention (DP=2) timeout-minutes: 10 run: | cd test/srt - python3 test_mla.py - python3 test_mla_fp8.py python3 test_dp_attention.py - name: Test update weights from distributed @@ -107,7 +105,7 @@ jobs: cd test/srt python3 test_update_weights_from_distributed.py - - name: Evaluate MoE EP accuracy (TP=2) + - name: Test expert parallelism (EP=2) timeout-minutes: 10 run: | cd test/srt diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 320fea729..d617fcf69 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -22,6 +22,8 @@ suites = { "test_json_constrained.py", "test_large_max_new_tokens.py", "test_metrics.py", + "test_mla.py", + "test_mla_fp8.py", "test_no_chunked_prefill.py", "test_no_overlap_scheduler.py", "test_openai_server.py", diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index b8105a84a..34bc4b446 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -2,6 +2,7 @@ import unittest from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_MODEL_NAME_FOR_TEST, @@ -20,7 +21,7 @@ class TestMLA(unittest.TestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--tp", "2", "--trust-remote-code"], + other_args=["--trust-remote-code"], ) @classmethod @@ -52,5 +53,37 @@ class TestMLA(unittest.TestCase): self.assertGreater(metrics["score"], 0.8) +class TestDeepseekV3(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "lmzheng/sglang-ci-dsv3-test" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--trust-remote-code"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py index 769bdf34d..4fe18b526 100644 --- a/test/srt/test_mla_fp8.py +++ b/test/srt/test_mla_fp8.py @@ -21,8 +21,6 @@ class TestMLA(unittest.TestCase): cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ - "--tp", - "2", "--trust-remote-code", "--kv-cache-dtype", "fp8_e5m2",