add mla_preprocess kernel (#3226)
### What this PR does / why we need it? - Adds the `mla_preprocess` custom kernel to provide an optimized pre-processing operator for Multi-head Latent Attention (MLA) on Ascend NPUs. - Wires the new kernel into the C++ extension pipeline so vLLM can invoke it directly, cutting Python-side tensor shuffling and memory copies that previously bottlenecked MLA compilation paths. ### Does this PR introduce any user-facing change? - No. The change only introduces a low-level kernel; public APIs and inference behavior remain unchanged. ### How was this patch tested? - Dedicated Ascend kernels are not covered by our CI yet, so no extra automated tests were added. Future MLA-focused regression runs will cover this path. - vLLM version: v0.11.0 Signed-off-by: Chen Chen <0109chenchen@gmail.com>
This commit is contained in:
36
csrc/ops.h
36
csrc/ops.h
@@ -124,4 +124,40 @@ namespace vllm_ascend {
|
||||
uint32_t output_hidden_dim,
|
||||
uint32_t slice_offset,
|
||||
uint32_t output_full_dim);
|
||||
|
||||
// Launches the fused MLA (Multi-head Latent Attention) pre-processing kernel
// on an Ascend NPU stream. All tensor arguments are type-erased device
// pointers; their dtypes/shapes are fixed by the kernel implementation and
// the tiling data — TODO(review): confirm exact layouts against the kernel.
//
// NOTE(review): 31 parameters is well past the readability threshold; a
// follow-up could group these into a params struct passed by pointer
// (backward-compatible wrapper kept for existing callers).
//
// Parameter grouping (names suggest, not proven here — verify in kernel):
//   stream                    launch stream handle (ACL runtime stream, presumably)
//   hidden_state              input activations
//   gamma1/beta1, gamma2/beta2, gamma3
//                             normalization weights for the successive norm stages
//   quant_scale1/quant_offset1, quant_scale2/quant_offset2
//                             per-stage quantization parameters
//   wdqkv, bias1              down-projection QKV weight and bias
//   wuq, bias2                up-projection Q weight and bias
//   wuk                       up-projection K weight
//   descale1, descale2        dequantization scales for the two matmul stages
//   sin1/cos1, sin2/cos2      rotary-embedding tables for the two RoPE applications
//   keycache, slot_mapping    paged KV-cache buffer and slot indices to scatter into
//   ctkv_scale, qnope_scale   output quantization scales (presumably ctKV / q-nope)
//   q, keycache_out, q2, keycache_out2
//                             kernel outputs (query tensors and updated caches)
//   workspace, tiling         scratch buffer and host-prepared tiling metadata
//   block_dim                 number of AI cores to launch the kernel on
extern void mla_preprocess_impl(
    void *stream,
    void *hidden_state,
    void *gamma1,
    void *beta1,
    void *quant_scale1,
    void *quant_offset1,
    void *wdqkv,
    void *bias1,
    void *gamma2,
    void *beta2,
    void *quant_scale2,
    void *quant_offset2,
    void *gamma3,
    void *sin1,
    void *cos1,
    void *sin2,
    void *cos2,
    void *keycache,
    void *slot_mapping,
    void *wuq,
    void *bias2,
    void *wuk,
    void *descale1,
    void *descale2,
    void *ctkv_scale,
    void *qnope_scale,
    void *q,
    void *keycache_out,
    void *q2,
    void *keycache_out2,
    void *workspace,
    void *tiling,
    // Top-level const dropped from the declaration: it is a definition-side
    // detail and does not affect the function's signature.
    uint32_t block_dim);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user