[Model] GLM5 adaptation (#6642)

### What this PR does / why we need it? GLM5 adaptation: 1. Use torch_npu.npu_lightning_indexer for GLM5. 2. Forbid the eagle proposer when fullgraph mode is enabled, because of bugs. 3. Add quantization config for GLM5. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? By CI. - vLLM main: 978a37c823 --------- Signed-off-by: yydyzr <liuyuncong1@huawei.com> Signed-off-by: shenchuxiaofugui <1311027364@qq.com> Co-authored-by: shenchuxiaofugui <1311027364@qq.com>
2026-02-11 22:22:22 +08:00
parent 140fcaffc3
commit ff3a50d011
17 changed files with 77 additions and 34 deletions
--- a/csrc/build_aclnn.sh
+++ b/csrc/build_aclnn.sh
@@ -24,7 +24,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
    ABSOLUTE_CATLASS_PATH=$(cd "${CATLASS_PATH}" && pwd)
    export CPATH=${ABSOLUTE_CATLASS_PATH}:${CPATH}

-    CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm;moe_init_routing_custom;moe_gating_top_k;add_rms_norm_bias;apply_top_k_top_p_custom;transpose_kv_cache_by_block;"
+    CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer_vllm;sparse_flash_attention;matmul_allreduce_add_rmsnorm;moe_init_routing_custom;moe_gating_top_k;add_rms_norm_bias;apply_top_k_top_p_custom;transpose_kv_cache_by_block;"
    SOC_ARG="ascend910b"
 elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
    # ASCEND910C (A3) series
@@ -68,7 +68,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then

    CUSTOM_OPS_ARRAY=(
        "grouped_matmul_swiglu_quant_weight_nz_tensor_list"
-        "lightning_indexer"
+        "lightning_indexer_vllm"
        "sparse_flash_attention"
        "dispatch_ffn_combine"
        "dispatch_ffn_combine_bf16"