### What this PR does / why we need it?
This PR supports W8A8C8 in dsv3.2/glm5 with lightning_indexer_quant ops
in pd-mix stage mainly.
Because the code for the current PD-disaggregated scenario is still
under refactoring and cleanup, this PR prioritizes ensuring the C8
functionality in the pd-mix scenario.
The next steps are planned in two parts:
① Once the optimized scatter operator is updated, we will replace the
original operator to improve the performance of storing k_scale.
② Once the code logic for the PD-disaggregated scenario becomes stable,
we will carry out more comprehensive validation and make appropriate
adaptations.
③ Because enabling C8 currently introduces several new operators whose
performance still needs improvement, performance may regress in some
scenarios. Therefore, only after all the operators are fully ready can
we ensure that this feature does not cause any performance degradation.
At that point, we will enable this feature by default and remove the
switch in `additional_config`.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI passed with new added/existing test.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: rjg-lyh <1318825571@qq.com>
51 lines
2.8 KiB
C++
51 lines
2.8 KiB
C++
/**
|
|
* This program is free software, you can redistribute it and/or modify it.
|
|
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
|
* This file is a part of the CANN Open Software.
|
|
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
|
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
|
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
|
* See LICENSE in the root of the software repository for the full text of the License.
|
|
*/
|
|
|
|
/*!
|
|
* \file lightning_indexer_quant.cpp
|
|
* \brief
|
|
*/
|
|
|
|
#include "kernel_operator.h"
|
|
#include "lib/matmul_intf.h"
|
|
#include "lightning_indexer_quant_kernel.h"
|
|
#include "lightning_indexer_quant_template_tiling_key.h"
|
|
|
|
using namespace LIQKernel;
|
|
|
|
#define INVOKE_LI_NO_KFC_OP_IMPL(templateClass, ...) \
|
|
do { \
|
|
templateClass<LIQType<__VA_ARGS__>> op; \
|
|
GET_TILING_DATA_WITH_STRUCT(LIQTilingData, tiling_data_in, tiling); \
|
|
const LIQTilingData *__restrict tiling_data = &tiling_data_in; \
|
|
op.Init(query, key, weights, queryScale, keyScale, actualSeqLengthsQ, actualSeqLengthsK, blocktable, \
|
|
sparseIndices, user, tiling_data, &tPipe); \
|
|
op.Process(); \
|
|
} while (0)
|
|
|
|
template <int DT_Q, int DT_K, int DT_OUT, int PAGE_ATTENTION, int Q_LAYOUT_T, int K_LAYOUT_T>
|
|
__global__ __aicore__ void lightning_indexer_quant(__gm__ uint8_t *query, __gm__ uint8_t *key, __gm__ uint8_t *weights,
|
|
__gm__ uint8_t *queryScale, __gm__ uint8_t *keyScale,
|
|
__gm__ uint8_t *actualSeqLengthsQ, __gm__ uint8_t *actualSeqLengthsK,
|
|
__gm__ uint8_t *blocktable, __gm__ uint8_t *sparseIndices,
|
|
__gm__ uint8_t *workspace, __gm__ uint8_t *tiling)
|
|
{
|
|
#if (__CCE_AICORE__ == 310) || (defined __DAV_310R6__) || (__CCE_AICORE__ == 200)
|
|
|
|
#else
|
|
TPipe tPipe;
|
|
__gm__ uint8_t *user = GetUserWorkspace(workspace);
|
|
KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_2);
|
|
|
|
INVOKE_LI_NO_KFC_OP_IMPL(LIQPreload, int8_t, int8_t, int32_t,
|
|
PAGE_ATTENTION, LI_LAYOUT(Q_LAYOUT_T), LI_LAYOUT(K_LAYOUT_T));
|
|
#endif
|
|
}
|