From 8ed6f98a5af91d023b6faa33a4d27cf426512aa5 Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Sat, 27 Dec 2025 02:01:06 +0800
Subject: [PATCH] [Build] Add installation script of
 fused_infer_attention_score kernel with flash decoding (#5402)

### What this PR does / why we need it?
Add installation script of `fused_infer_attention_score` kernel with flash
decoding

### User-facing changes
Users can install the kernel `fused_infer_attention_score` with flash decoding
feature by `bash tools/install_flash_infer_attention_score_ops_a2.sh` or
`bash tools/install_flash_infer_attention_score_ops_a3.sh`

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/254f6b986720c92ddf97fbb1a6a6465da8e87e29

---------

Signed-off-by: MengqingCao
---
 .pre-commit-config.yaml                       |  2 +-
 ...tall_flash_infer_attention_score_ops_a2.sh | 35 +++++++++++++++++++
 ...tall_flash_infer_attention_score_ops_a3.sh | 34 ++++++++++++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 tools/install_flash_infer_attention_score_ops_a2.sh
 create mode 100644 tools/install_flash_infer_attention_score_ops_a3.sh

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 82bde178..f7dd0e1d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
         args: [
           --toml, pyproject.toml,
           '--skip', 'csrc/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
-          '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND'
+          '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND,tbe'
         ]
         additional_dependencies:
           - tomli
diff --git a/tools/install_flash_infer_attention_score_ops_a2.sh b/tools/install_flash_infer_attention_score_ops_a2.sh
new file mode 100644
index 00000000..aa3126bc
--- /dev/null
+++ b/tools/install_flash_infer_attention_score_ops_a2.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+
+cd /vllm-workspace
+# download fused_infer_attention_score related source files
+wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/fused_infer_attention_score_a2_$(uname -i).tar.gz
+tar -zxvf ./fused_infer_attention_score_a2_$(uname -i).tar.gz
+
+# replace fused_infer_attention_score operation files
+cd $ASCEND_TOOLKIT_HOME/opp/built-in/op_impl/ai_core/tbe/kernel/ascend910b
+rm -rf fused_infer_attention_score
+cp -r /vllm-workspace/fused_infer_attention_score_a2_$(uname -i)/fused_infer_attention_score .
+
+# replace related so
+cd $ASCEND_TOOLKIT_HOME/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/$(uname -i)
+rm libopmaster_ct.so libopmaster_rt2.0.so liboptiling.so
+cp /vllm-workspace/fused_infer_attention_score_a2_$(uname -i)/*.so .
diff --git a/tools/install_flash_infer_attention_score_ops_a3.sh b/tools/install_flash_infer_attention_score_ops_a3.sh
new file mode 100644
index 00000000..1ad0a356
--- /dev/null
+++ b/tools/install_flash_infer_attention_score_ops_a3.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+cd /vllm-workspace
+# download fused_infer_attention_score related source files
+wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/fused_infer_attention_score_a3_$(uname -i).tar.gz
+tar -zxvf ./fused_infer_attention_score_a3_$(uname -i).tar.gz
+
+# replace fused_infer_attention_score operation files
+cd $ASCEND_TOOLKIT_HOME/opp/built-in/op_impl/ai_core/tbe/kernel/ascend910_93
+rm -rf fused_infer_attention_score
+cp -r /vllm-workspace/fused_infer_attention_score_a3_$(uname -i)/fused_infer_attention_score .
+
+# replace related so
+cd $ASCEND_TOOLKIT_HOME/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/$(uname -i)
+rm libopmaster_ct.so libopmaster_rt2.0.so liboptiling.so
+cp /vllm-workspace/fused_infer_attention_score_a3_$(uname -i)/*.so .