fix: illegal CUDA memory access in fused_moe_kernel (#4727)
Co-authored-by: yuethe <yuethe@tencent.com>
This commit is contained in:
@@ -152,6 +152,7 @@ def fused_moe_kernel(
|
||||
return
|
||||
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
|
||||
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
|
||||
offs_token = offs_token.to(tl.int64)
|
||||
token_mask = offs_token < num_valid_tokens
|
||||
|
||||
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
|
||||
|
||||
Reference in New Issue
Block a user