piecewise cuda graph support qwen3-moe (#11845)

This commit is contained in:
Xiaoyu Zhang
2025-10-21 10:55:49 +08:00
committed by GitHub
parent 74de76c685
commit 8374a96e49
4 changed files with 71 additions and 6 deletions

View File

@@ -212,6 +212,10 @@ class LayerCommunicator:
)
)
self._speculative_algo = SpeculativeAlgorithm.from_string(
get_global_server_args().speculative_algorithm
)
def prepare_attn(
self,
hidden_states: torch.Tensor,
@@ -315,13 +319,10 @@ class LayerCommunicator:
def should_fuse_mlp_allreduce_with_next_layer(
self, forward_batch: ForwardBatch
) -> bool:
speculative_algo = SpeculativeAlgorithm.from_string(
get_global_server_args().speculative_algorithm
)
if (
is_dp_attention_enabled() is_dp_attention_enabled()
and speculative_algo is not None and self._speculative_algo is not None
and speculative_algo.is_eagle() and self._speculative_algo.is_eagle()
):
return False

View File

@@ -1831,3 +1831,21 @@ def triton_scaled_mm(
)
return result.to(out_dtype)
# Piecewise CUDA graph capture traces these sgl_kernel quantization ops with
# torch.compile, which needs a fake (meta) implementation for each custom op.
# The real kernels presumably write their results into output_q / output_s
# in place (outputs are passed as arguments — TODO confirm against sgl_kernel),
# so the fake implementation has nothing to return.
if _is_cuda:
    # Exactly one of the two ops is registered, matching whichever kernel the
    # per-token-group quantization flag selects at import time.
    _fake_quant_op = (
        "sgl_kernel::sgl_per_token_group_quant_8bit"
        if enable_sgl_per_token_group_quant_8bit
        else "sgl_kernel::sgl_per_token_group_quant_fp8"
    )

    @torch.library.register_fake(_fake_quant_op)
    def _(input, output_q, output_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0):
        return

View File

@@ -17,6 +17,7 @@
"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
import logging
from contextlib import nullcontext
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import torch
@@ -590,7 +591,12 @@ class Qwen2MoeModel(nn.Module):
if residual is not None if residual is not None
else hidden_states else hidden_states
) )
with get_global_expert_distribution_recorder().with_current_layer(i): ctx = (
nullcontext()
if get_global_server_args().enable_piecewise_cuda_graph
else get_global_expert_distribution_recorder().with_current_layer(i)
)
with ctx:
layer = self.layers[i]
hidden_states, residual = layer(
positions, hidden_states, forward_batch, residual

View File

@@ -55,5 +55,45 @@ class TestPiecewiseCudaGraphBenchmark(CustomTestCase):
self.assertLess(prefill_latency, 0.015)
class TestPiecewiseCudaGraphQwen3MoE(CustomTestCase):
    """Accuracy test for piecewise CUDA graph with the Qwen3-Coder-30B-A3B-Instruct MoE model.

    Launches one server for the whole class with --enable-piecewise-cuda-graph
    and verifies that accuracy on the mgsm_en eval (the English split of MGSM,
    i.e. GSM8K-style math problems) is not degraded by the piecewise-graph path.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # The "eager" compiler backend avoids a full compile pass; we are
        # testing the piecewise CUDA graph capture path, not the compiler.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--enable-piecewise-cuda-graph",
                "--piecewise-cuda-graph-compiler",
                "eager",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        # Kill the server and any child processes it spawned.
        kill_process_tree(cls.process.pid)

    def test_gsm8k_accuracy(self):
        """Run the mgsm_en eval and require a score of at least 0.90."""
        num_examples = 2000
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=num_examples,
            # Cap concurrency; with 2000 examples this resolves to 1024 threads.
            num_threads=min(num_examples, 1024),
        )
        metrics = run_eval(args)
        # Label matches the eval actually run (mgsm_en), not plain GSM8K.
        print(f"MGSM-EN Accuracy: {metrics['score']:.3f}")
        self.assertGreaterEqual(metrics["score"], 0.90)
if __name__ == "__main__":
unittest.main()