add mla_preprocess kernel (#3226)
### What this PR does / why we need it? - Adds the `mla_preprocess` custom kernel to provide an optimized pre-processing operator for Multi-head Latent Attention (MLA) on Ascend NPUs. - Wires the new kernel into the C++ extension pipeline so vLLM can invoke it directly, cutting Python-side tensor shuffling and memory copies that previously bottlenecked MLA compilation paths. ### Does this PR introduce any user-facing change? - No. The change only introduces a low-level kernel; public APIs and inference behavior remain unchanged. ### How was this patch tested? - Dedicated Ascend kernels are not covered by our CI yet, so no extra automated tests were added. Future MLA-focused regression runs will cover this path. - vLLM version: v0.11.0 Signed-off-by: Chen Chen <0109chenchen@gmail.com>
This commit is contained in:
@@ -44,11 +44,13 @@ else()
|
||||
endif()
|
||||
|
||||
# Load the AscendC CMake helpers from the CANN toolkit; this module is
# presumably what defines the `ascendc_library()` command used below —
# TODO(review): confirm against the toolkit docs.
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
# Collect the generic AscendC device-kernel sources.
# NOTE(review): a bare file(GLOB) only re-scans at configure time, so newly
# added kernel files are silently ignored until CMake is re-run manually.
# CONFIGURE_DEPENDS (CMake >= 3.12) re-evaluates the glob at build time —
# confirm the project's cmake_minimum_required satisfies this. An explicit
# source list would be more robust still.
file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)
# Device-side kernel library: every generic kernel under csrc/kernels plus the
# dedicated MLA-preprocess kernel is compiled into one shared object through
# the AscendC toolchain helper.
ascendc_library(vllm_ascend_kernels SHARED
    ${KERNEL_FILES}
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
)
# Surface the resolved torch_npu location in the configure log.
# Mode-less message() emits at NOTICE level on stderr; STATUS is the correct
# level for informational configure output (stdout, standard "--" prefix).
message(STATUS "TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
@@ -90,7 +92,11 @@ target_link_libraries(
|
||||
libtorch_npu.so
|
||||
vllm_ascend_kernels
|
||||
ascendcl
|
||||
tiling_api
|
||||
register
|
||||
platform
|
||||
ascendalog
|
||||
dl
|
||||
)
|
||||
|
||||
# Embed a runtime search path so the extension resolves its bundled shared
# libraries relative to its own on-disk location ($ORIGIN) and an adjacent
# lib/ directory at load time.
# NOTE(review): assumes libvllm_ascend_kernels.so is packaged next to the
# extension or under ./lib — confirm against the wheel layout.
target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib")
Reference in New Issue
Block a user