From 2f6bacee03187b08741ef3cb8b48f54f6c9a3190 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Thu, 13 Mar 2025 01:22:13 -0400
Subject: [PATCH] [moe] fix: correct the cache size in the last chunk (#3679)

Co-authored-by: Abatom
---
 python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 4ccaf59e6..2c2e213aa 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -1064,7 +1064,9 @@ def fused_experts_impl(
             # so the cache size and config are already set correctly and
             # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
-            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
+            intermediate_cache2 = intermediate_cache2[
+                : tokens_in_chunk * topk_ids.shape[1]
+            ]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
             config = get_config_func(tokens_in_chunk)
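
Notes (not part of the applied patch):

The fix is needed because intermediate_cache2 is allocated with its first
dimension flattened over (token, expert) pairs, i.e. M * topk_ids.shape[1]
rows, whereas intermediate_cache1 and intermediate_cache3 keep tokens and
experts in separate dimensions. Slicing cache2 by tokens_in_chunk alone
therefore retained too few rows whenever the last chunk is smaller than
CHUNK_SIZE. Below is a minimal sketch of the shape arithmetic, assuming
that cache layout; the concrete sizes (CHUNK_SIZE, topk, N) are
hypothetical and chosen only for illustration.

    import torch

    # Illustrative sizes only (hypothetical, not from the patch):
    CHUNK_SIZE, topk, N = 64, 2, 128  # chunk capacity, experts per token, hidden dim
    tokens_in_chunk = 10              # a final, partially filled chunk

    # Caches as allocated earlier in fused_experts_impl: cache1 keeps a
    # per-token first dimension; cache2 is flattened over (token, expert).
    intermediate_cache1 = torch.empty((CHUNK_SIZE, topk, N))
    intermediate_cache2 = torch.empty((CHUNK_SIZE * topk, N // 2))

    # Old slicing kept only tokens_in_chunk rows of the flattened cache,
    # covering just a fraction of the chunk's (token, expert) activations.
    assert intermediate_cache2[:tokens_in_chunk].shape[0] == 10

    # Fixed slicing keeps one row per (token, expert) pair in the chunk.
    assert intermediate_cache2[: tokens_in_chunk * topk].shape[0] == 20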