From 5d15fb8c9d0c62d4f9238ccfa2bc89eb78a99946 Mon Sep 17 00:00:00 2001
From: Tao He
Date: Thu, 31 Jul 2025 22:41:39 +0800
Subject: [PATCH] [bugfix] QWen-1M context support[2/3] using current cuda
 stream in the DCA's kernel for bugfix. (#8611)

Signed-off-by: Tao He
Co-authored-by: sa-buc
---
 sgl-kernel/csrc/attention/vertical_slash_index.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sgl-kernel/csrc/attention/vertical_slash_index.cu b/sgl-kernel/csrc/attention/vertical_slash_index.cu
index 93c936fdd..118f780dd 100644
--- a/sgl-kernel/csrc/attention/vertical_slash_index.cu
+++ b/sgl-kernel/csrc/attention/vertical_slash_index.cu
@@ -3,6 +3,7 @@
 // This file is for blocksparse attention utils cuda kernel.

 #include <cuda.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <cuda_runtime.h>
 #include <torch/all.h>

@@ -176,7 +177,8 @@ void convert_vertical_slash_indexes_64x64(
   const dim3 dimBlock((int32_t)N_THREADS);
   const dim3 dimGrid(
       (int32_t)N_HEADS, (int32_t)BATCH_SIZE, ((int32_t)N_ROWS + (int32_t)N_THREADS - 1) / (int32_t)N_THREADS);
-  convert_vertical_slash_indexes_kernel<<<dimGrid, dimBlock>>>(
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  convert_vertical_slash_indexes_kernel<<<dimGrid, dimBlock, 0, stream>>>(
       q_seqlens,
       kv_seqlens,
       vertical_indexes,
@@ -393,7 +395,8 @@ void convert_vertical_slash_indexes_64x64_mergehead(
   const int N_THREADS = 64;
   const dim3 dimBlock(N_THREADS);
   const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
-  convert_vertical_slash_indexes_kernel_mergehead<<<dimGrid, dimBlock>>>(
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  convert_vertical_slash_indexes_kernel_mergehead<<<dimGrid, dimBlock, 0, stream>>>(
       q_seqlens,
       kv_seqlens,
       vertical_indexes,