diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py index 0c12e8ce3..17e45599d 100644 --- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py +++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py @@ -341,8 +341,8 @@ def extend_attention_fwd( else: BLOCK_M, BLOCK_N = (32, 64) elif is_cuda_available and CUDA_CAPABILITY[0] >= 8: - # 8.9 has a much smaller shared memory size (100K) than 8.0 (160K) - if CUDA_CAPABILITY[1] == 9: + # sm86/sm89 have a much smaller shared memory size (100K) than sm80 (160K) + if CUDA_CAPABILITY[1] == 9 or CUDA_CAPABILITY[1] == 6: if Lq <= 128: BLOCK_M, BLOCK_N = (64, 128) elif Lq <= 256: diff --git a/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu b/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu index 918faad50..a81dba3d9 100644 --- a/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu +++ b/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu @@ -703,8 +703,8 @@ torch::Tensor int8_scaled_mm( sm75_dispatch_shape>( out, mat_a, mat_b, scales_a, scales_b, bias); } else if (sm_version >= 80 && sm_version < 90) { - // sm89 has a much smaller shared memory size (100K) than sm80 (160K) - if (sm_version == 89) { + // sm86/sm89 have a much smaller shared memory size (100K) than sm80 (160K) + if (sm_version == 86 || sm_version == 89) { if (out_dtype == torch::kBFloat16) { sm89_dispatch_shape>( out, mat_a, mat_b, scales_a, scales_b, bias);