diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index b0b57d68d..91ca00c6e 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -141,7 +141,9 @@ def biased_grouped_topk( .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) .reshape(num_token, -1) ) # [n, e] - tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # [n, e] + tmp_scores = scores_for_choice.masked_fill( + ~score_mask.bool(), float("-inf") + ) # [n, e] _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) topk_weights = scores.gather(1, topk_ids)