[Fix] Compatibility between DP attention and pipeline parallelism (#10100)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-06 01:34:10 -07:00
parent 012584ecd5
commit 21af5c0404
2 changed files with 53 additions and 6 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -32,6 +32,7 @@ from sglang.srt.configs.model_config import AttentionArch, ModelConfig
 from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
 from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
 from sglang.srt.distributed import (
    get_pp_group,
    get_tp_group,
    get_world_group,
    init_distributed_environment,
@@ -639,6 +640,7 @@ class ModelRunner:
            cpu_group=get_world_group().cpu_group,
        )
        self.tp_group = get_tp_group()
        self.pp_group = get_pp_group()
        self.attention_tp_group = get_attention_tp_group()
        # Check memory for tensor parallelism
@@ -1825,7 +1827,10 @@ class ModelRunner:
        else:
            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
-        if forward_batch.global_num_tokens_cpu is not None:
+        if (
            forward_batch.global_num_tokens_cpu is not None
            and self.pp_group.is_last_rank
        ):
            forward_batch.post_forward_mlp_sync_batch(ret)
        return ret, can_run_cuda_graph
--- a/test/srt/test_pp_single_node.py
+++ b/test/srt/test_pp_single_node.py
@@ -14,11 +14,14 @@ import requests
 from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    run_bench_one_batch_server,
@@ -57,7 +60,7 @@ class TestPPAccuracy(unittest.TestCase):
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
-        metrics = run_eval(args)
+        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.74)
@@ -88,6 +91,45 @@ class TestPPAccuracy(unittest.TestCase):
        assert len(output_top_logprobs) == 16
 class TestDPAttentionDP2PP2(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--tp",
                "2",
                "--pp-size",
                "2",
                "--enable-dp-attention",
                "--dp",
                "2",
            ],
        )
    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)
    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["score"], 0.8)
 class TestQwenPPAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -117,7 +159,7 @@ class TestQwenPPAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally:
@@ -172,7 +214,7 @@ class TestQwenPPTieWeightsAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally:
@@ -224,7 +266,7 @@ class TestQwenMoePPAccuracy(unittest.TestCase):
                host="http://127.0.0.1",
                port=int(self.base_url.split(":")[-1]),
            )
-            metrics = run_eval(args)
+            metrics = run_eval_few_shot_gsm8k(args)
            time.sleep(5)
            return metrics
        finally: