[Fix] Compatibility between DP attention and pipeline parallelism (#10100)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-06 01:34:10 -07:00
parent 012584ecd5
commit 21af5c0404
2 changed files with 53 additions and 6 deletions
--- a/test/srt/test_pp_single_node.py
+++ b/test/srt/test_pp_single_node.py
@@ -14,11 +14,14 @@ import requests
 from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    run_bench_one_batch_server,
@@ -57,7 +60,7 @@ class TestPPAccuracy(unittest.TestCase):
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
-        metrics = run_eval(args)
+        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        self.assertGreater(metrics["accuracy"], 0.74)
@@ -88,6 +91,45 @@ class TestPPAccuracy(unittest.TestCase):
        assert len(output_top_logprobs) == 16


+class TestDPAttentionDP2PP2(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--pp-size",
+                "2",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.8)
+
+
 class TestQwenPPAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -117,7 +159,7 @@ class TestQwenPPAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally:
@@ -172,7 +214,7 @@ class TestQwenPPTieWeightsAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally:
@@ -224,7 +266,7 @@ class TestQwenMoePPAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally: