fix: fix illegal CUDA memory access in fused_moe_kernel (#4727)

Co-authored-by: yuethe <yuethe@tencent.com>
This commit is contained in:
saltyfish66
2025-04-03 15:07:32 +08:00
committed by GitHub
parent cccfc10e9c
commit e41549c3d6

View File

@@ -152,6 +152,7 @@ def fused_moe_kernel(
return
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
offs_token = offs_token.to(tl.int64)
token_mask = offs_token < num_valid_tokens
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N