From 67008f4b320d8950803fcb14b1e5dc6e80bf75e4 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Mon, 13 Jan 2025 03:55:33 -0800
Subject: [PATCH] Use only one GPU for MLA CI tests (#2858)

---
 .github/workflows/pr-test.yml |  8 +++-----
 test/srt/run_suite.py         |  2 ++
 test/srt/test_mla.py          | 35 ++++++++++++++++++++++++++++++++++-
 test/srt/test_mla_fp8.py      |  2 --
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index f1c7871de..274c97c63 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -87,18 +87,16 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh
 
-      - name: Evaluate data parallelism accuracy (DP=2)
+      - name: Test data parallelism (DP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
           python3 test_data_parallelism.py
 
-      - name: Evaluate MLA accuracy (TP=2)
+      - name: Test data parallelism attention (DP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 test_mla.py
-          python3 test_mla_fp8.py
           python3 test_dp_attention.py
 
       - name: Test update weights from distributed
@@ -107,7 +105,7 @@ jobs:
           cd test/srt
           python3 test_update_weights_from_distributed.py
 
-      - name: Evaluate MoE EP accuracy (TP=2)
+      - name: Test expert parallelism (EP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 320fea729..d617fcf69 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -22,6 +22,8 @@ suites = {
         "test_json_constrained.py",
         "test_large_max_new_tokens.py",
         "test_metrics.py",
+        "test_mla.py",
+        "test_mla_fp8.py",
         "test_no_chunked_prefill.py",
         "test_no_overlap_scheduler.py",
         "test_openai_server.py",
diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py
index b8105a84a..34bc4b446 100644
--- a/test/srt/test_mla.py
+++ b/test/srt/test_mla.py
@@ -2,6 +2,7 @@ import unittest
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MLA_MODEL_NAME_FOR_TEST,
@@ -20,7 +21,7 @@ class TestMLA(unittest.TestCase):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--tp", "2", "--trust-remote-code"],
+            other_args=["--trust-remote-code"],
         )
 
     @classmethod
@@ -52,5 +53,37 @@ class TestMLA(unittest.TestCase):
         self.assertGreater(metrics["score"], 0.8)
 
 
+class TestDeepseekV3(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):  # launch one server shared by every test in this class
+        cls.model = "lmzheng/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--trust-remote-code"],  # no --tp flag is passed here
+        )
+
+    @classmethod
+    def tearDownClass(cls):  # kill the server launched in setUpClass
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):  # named for the eval it runs: few-shot GSM8K, not MMLU
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),  # text after the last ":"
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py
index 769bdf34d..4fe18b526 100644
--- a/test/srt/test_mla_fp8.py
+++ b/test/srt/test_mla_fp8.py
@@ -21,8 +21,6 @@ class TestMLA(unittest.TestCase):
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
             other_args=[
-                "--tp",
-                "2",
                 "--trust-remote-code",
                 "--kv-cache-dtype",
                 "fp8_e5m2",