[bugfix] restore pr-7029 and fix patch error (#7294)
### What this PR does / why we need it?
This PR restores #7029, which adds W8A8C8 support for dsv3.2/glm5 using
the `lightning_indexer_quant` ops in the pd-mix stage.
The original PR was reverted by #7288 because the patch did not work
with the recompute scheduler.
This PR also fixes the patching issue so that it works correctly with
the recompute scheduler.
### Does this PR introduce _any_ user-facing change?
Yes. To enable Lightning Indexer (LI) C8, users need to set the `enable_sparse_c8` option to
`"true"` in `additional_config`.
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
@@ -42,6 +42,7 @@
|
||||
#include "moe_gating_top_k/moe_gating_top_k_torch_adpt.h"
|
||||
#include "moe_init_routing_custom/moe_init_routing_custom_torch_adpt.h"
|
||||
#include "sparse_flash_attention/sparse_flash_attention_torch_adpt.h"
|
||||
#include "lightning_indexer_quant/lightning_indexer_quant_torch_adpt.h"
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Logging.h>
|
||||
@@ -918,4 +919,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
"-> Tensor[]"
|
||||
);
|
||||
ops.impl("moe_grouped_matmul", torch::kPrivateUse1,&vllm_ascend::moe_grouped_matmul);
|
||||
|
||||
// This operator is planned to be integrated into PTA in the near future.
|
||||
// Once that happens, the implementation in csrc will be removed.
|
||||
ops.def(
|
||||
"npu_lightning_indexer_quant(Tensor query, Tensor key, Tensor weights, Tensor query_dequant_scale, "
|
||||
" Tensor key_dequant_scale, *, Tensor? actual_seq_lengths_query=None, "
|
||||
" Tensor? actual_seq_lengths_key=None, Tensor? block_table=None, "
|
||||
" int query_quant_mode=0, int key_quant_mode=0, "
|
||||
" str layout_query='BSND', str layout_key='BSND',"
|
||||
" int sparse_count=2048, int sparse_mode=3) -> Tensor"
|
||||
);
|
||||
ops.impl("npu_lightning_indexer_quant", torch::kPrivateUse1, &vllm_ascend::npu_lightning_indexer_quant);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user