Update extend/decode attention kernel for CPU in sgl-kernel and add UTs (#6405)
Co-authored-by: mingfeima <mingfei.ma@intel.com>
This commit is contained in:
@@ -49,9 +49,12 @@ std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
|
||||
// attention
|
||||
void decode_attention_cpu(
|
||||
at::Tensor& query,
|
||||
at::Tensor& output,
|
||||
at::Tensor& k_cache,
|
||||
at::Tensor& v_cahce,
|
||||
at::Tensor& v_cache,
|
||||
at::Tensor& output,
|
||||
at::Tensor& key,
|
||||
at::Tensor& value,
|
||||
at::Tensor& loc,
|
||||
at::Tensor& attn_logits,
|
||||
at::Tensor& req_to_token,
|
||||
at::Tensor& req_pool_indices,
|
||||
|
||||
Reference in New Issue
Block a user