mlapo add qdown output (#4707)
### What this PR does / why we need it?
This PR adds mlapo operation support qdown of output.
### Does this PR introduce _any_ user-facing change?
mlapo operation add enable_inner_out of input
### How was this patch tested?
CI passed with new added/existing test.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: h1074112368 <h1074112368@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -103,6 +103,9 @@ constexpr uint32_t KEY_FP16_CACHEMODE_1_QUANTMODE_0 = 1;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_0_QUANTMODE_0 = 256;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_1_QUANTMODE_0 = 257;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_3_QUANTMODE_0 = 259;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_0_QUANTMODE_0_INNER = 256 + 512;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_1_QUANTMODE_0_INNER = 257 + 512;
|
||||
constexpr uint32_t KEY_BF16_CACHEMODE_3_QUANTMODE_0_INNER = 259 + 512;
|
||||
|
||||
enum class QuantMode : int32_t {
|
||||
PER_TENSOR_ASYMM_QUANT = 0,
|
||||
|
||||
Reference in New Issue
Block a user