[bugfix] restore pr-7029 and fix patch error (#7294)

### What this PR does / why we need it?

This PR restores #7029, which adds W8A8C8 support for dsv3.2/glm5 using the `lightning_indexer_quant` ops in the pd-mix stage. The original PR was reverted by #7288 because the patch did not work with the recompute scheduler. This PR also fixes the patching issue so that it works correctly with the recompute scheduler.

### Does this PR introduce _any_ user-facing change?

Yes. To enable LI C8, users need to set the `enable_sparse_c8` option to `"true"` in `additional_config`.

- vLLM version: v0.17.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
2026-03-16 15:39:42 +08:00
parent 9320365dab
commit 4d443b9228
25 changed files with 4309 additions and 78 deletions
--- a/csrc/torch_binding.cpp
+++ b/csrc/torch_binding.cpp
@@ -42,6 +42,7 @@
 #include "moe_gating_top_k/moe_gating_top_k_torch_adpt.h"
 #include "moe_init_routing_custom/moe_init_routing_custom_torch_adpt.h"
 #include "sparse_flash_attention/sparse_flash_attention_torch_adpt.h"
+#include "lightning_indexer_quant/lightning_indexer_quant_torch_adpt.h"
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Logging.h>
@@ -918,4 +919,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
        "-> Tensor[]"
    );
    ops.impl("moe_grouped_matmul", torch::kPrivateUse1,&vllm_ascend::moe_grouped_matmul);
+
+    // This operator is planned to be integrated into PTA in the near future.
+    // Once that happens, the implementation in csrc will be removed.
+    ops.def(
+        "npu_lightning_indexer_quant(Tensor query, Tensor key, Tensor weights, Tensor query_dequant_scale, "
+        "                            Tensor key_dequant_scale, *, Tensor? actual_seq_lengths_query=None, "
+        "                            Tensor? actual_seq_lengths_key=None, Tensor? block_table=None, "
+        "                            int query_quant_mode=0, int key_quant_mode=0, "
+        "                            str layout_query='BSND', str layout_key='BSND',"
+        "                            int sparse_count=2048, int sparse_mode=3) -> Tensor"
+    );
+    ops.impl("npu_lightning_indexer_quant", torch::kPrivateUse1, &vllm_ascend::npu_lightning_indexer_quant);
 }