adopt rope in vllm-ascend (#530)
### What this PR does / why we need it? Adopt the custom rotary-embedding kernel in actual model inference. The customized rotary_embedding generates contiguous query and key tensors on the C++ side, reducing the overhead of the two `contiguous` calls and the `index_select` required by the rotary_embedding in torch_npu. For now, rotary_embedding only supports the `is_neox = true` scenario; the non-neox version of rope will be added in a future update. --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
This commit is contained in:
@@ -28,9 +28,9 @@
|
||||
using vllm_ascend::AccType;
|
||||
using vllm_ascend::local_mem_copy;
|
||||
template <typename scalar_t, bool isNeox> class RotaryEmbedding {
|
||||
// NOTE(ganyi): we use 32K as load stride for pipe, need to find another way to
|
||||
// NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to
|
||||
// retrieve this size from runtime for more Soc support
|
||||
static int constexpr loadSize = 1024 * 4;
|
||||
static int constexpr loadSize = 512;
|
||||
using dst_t = scalar_t;
|
||||
using acc_t = typename AccType<scalar_t>::type;
|
||||
// only half tensor have cast instruct to int8, hardcode acc_dst_t as half
|
||||
|
||||
Reference in New Issue
Block a user