Update extend/decode attention kernel for CPU in sgl-kernel and add UTs (#6405)

Co-authored-by: mingfeima <mingfei.ma@intel.com>
This commit is contained in:
YanbingJiang
2025-05-20 12:23:17 +08:00
committed by GitHub
parent 83f2d9d4ed
commit 32cc66efa5
4 changed files with 464 additions and 19 deletions

View File

@@ -49,9 +49,12 @@ std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
// attention
void decode_attention_cpu(
at::Tensor& query,
at::Tensor& output,
at::Tensor& k_cache,
at::Tensor& v_cahce,
at::Tensor& v_cache,
at::Tensor& output,
at::Tensor& key,
at::Tensor& value,
at::Tensor& loc,
at::Tensor& attn_logits,
at::Tensor& req_to_token,
at::Tensor& req_pool_indices,