From 91a066ec6a4a70e1db60237576b4d6023fe614b3 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Tue, 17 Jun 2025 00:13:57 +0800 Subject: [PATCH] Tiny remove comments about DeepEP on H20 (#7234) --- .../srt/layers/moe/ep_moe/token_dispatcher.py | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py b/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py index 339f2e8bc..f1f9dbeb2 100644 --- a/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +++ b/python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py @@ -542,38 +542,6 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase): topk_idx: torch.Tensor, use_fp8: bool = False, ): - """ - # For H20, there will be an CUDA error: DeepEP/csrc/kernels/internode_ll.cu:337 'too many blocks in cooperative launch'. - # Please make sure to change DeepEP code in internode_ll.cu dispatch / combine as below first and then reinstall. - # More details refer: https://github.com/deepseek-ai/DeepEP/issues/15#issuecomment-2709715782 - - diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu - index 76ae2e2..8ecd08f 100644 - --- a/csrc/kernels/internode_ll.cu - +++ b/csrc/kernels/internode_ll.cu - @@ -310,8 +310,8 @@ void dispatch(void* packed_recv_x, float* packed_recv_x_scales, - int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8, - void* workspace, cudaStream_t stream, int phases) { - constexpr int kNumMaxTopK = 9; - - constexpr int kNumWarpsPerGroup = 10; - - constexpr int kNumWarpGroups = 3; - + constexpr int kNumWarpsPerGroup = 8; - + constexpr int kNumWarpGroups = 4; - EP_STATIC_ASSERT(kNumMaxTopK + 1 <= kNumWarpGroups * kNumWarpsPerGroup, "Too many top-k selections"); - - const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup; - @@ -501,8 +501,8 @@ void combine(void* combined_x, - int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank, - int num_topk, int num_experts, int rank, int num_ranks, - void* workspace, cudaStream_t stream, int phases) { - - constexpr int kNumWarpsPerGroup = 10; - - constexpr int kNumWarpGroups = 3; - + constexpr int kNumWarpsPerGroup = 8; - + constexpr int kNumWarpGroups = 4; - constexpr int kNumMaxTopk = 9; - - const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup; - """ buffer = self._get_buffer() packed_recv_hidden, packed_recv_count, self.handle, event, hook = ( buffer.low_latency_dispatch(