[CI] upgrade to newest pta (#187)

Upgrade to newest torch-npu

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
This commit is contained in:
wangxiyuan
2025-02-27 16:40:23 +08:00
committed by GitHub
parent fd18ae6494
commit 6042c210bc
4 changed files with 27 additions and 56 deletions

View File

@@ -108,9 +108,9 @@ jobs:
run: | run: |
mkdir pta mkdir pta
cd pta cd pta
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
tar -xvf pytorch_v2.5.1_py310.tar.gz tar -xvf pytorch_v2.5.1_py310.tar.gz
pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
cd .. cd ..
rm -rf pta rm -rf pta

View File

@@ -12,7 +12,7 @@ This document describes how to install vllm-ascend manually.
| Software | Supported version | Note | | Software | Supported version | Note |
| ------------ | ----------------- | ---- | | ------------ | ----------------- | ---- |
| CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu | | CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
| torch-npu | >= 2.5.1.dev20250218 | Required for vllm-ascend | | torch-npu | >= 2.5.1.dev20250226 | Required for vllm-ascend |
| torch | >= 2.5.1 | Required for torch-npu and vllm | | torch | >= 2.5.1 | Required for torch-npu and vllm |
You have 2 ways to install: You have 2 ways to install:
@@ -134,15 +134,15 @@ pip install vllm-ascend==|pip_vllm_ascend_version| --extra-index https://downloa
# #
# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See: # Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
# #
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py39.tar.gz # https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py39.tar.gz
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz # https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py311.tar.gz # https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py311.tar.gz
# #
mkdir pta mkdir pta
cd pta cd pta
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
tar -xvf pytorch_v2.5.1_py310.tar.gz tar -xvf pytorch_v2.5.1_py310.tar.gz
pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
``` ```
or build from **source code**: or build from **source code**:

View File

@@ -577,13 +577,11 @@ class AscendAttentionBackendImpl(AttentionImpl):
self.num_kv_heads, self.num_kv_heads,
self.head_size) self.head_size)
slots = attn_metadata.slot_mapping slots = attn_metadata.slot_mapping
torch_npu.npu_reshapecache(key=key, torch_npu._npu_reshape_and_cache(key=key,
value=value, value=value,
keyCache=key_cache, key_cache=key_cache,
valueCache=value_cache, value_cache=value_cache,
slotMapping=slots, slot_indices=slots)
compressType=0,
kvCacheCfg=0)
if attn_metadata.num_prefills > 0: if attn_metadata.num_prefills > 0:
@@ -596,32 +594,15 @@ class AscendAttentionBackendImpl(AttentionImpl):
np.array( np.array(
attn_metadata.prefill_metadata.seq_lens).astype( attn_metadata.prefill_metadata.seq_lens).astype(
np.int32)) np.int32))
torch_npu.npu_selfattention( torch_npu._npu_flash_attention(
query=query, query=query,
key=key, key=key,
value=value, value=value,
mask=mask, mask=mask,
maskType=1, seq_len=self.seq_lens_tensor_cpu,
isTriuMask=0, scale_value=self.scale,
seqLen=self.seq_lens_tensor_cpu, num_heads=self.num_heads,
scale=self.scale, num_kv_heads=self.num_kv_heads,
qScale=1,
headNum=self.num_heads,
kvHeadNum=self.num_kv_heads,
mlaVHeadSize=0,
calcType=3,
kernelType=0,
clampType=0,
scaleType=0,
quantType=0,
cacheType=0,
batchRunStatusEnable=False,
kvcacheCfg=0,
clampMin=0,
clampMax=0,
inputLayout=0,
windowSize=0,
outDataType=0,
out=output) out=output)
else: else:
# TODO: Will support prefix cache and chunked prefill soon. # TODO: Will support prefix cache and chunked prefill soon.
@@ -634,26 +615,16 @@ class AscendAttentionBackendImpl(AttentionImpl):
np.array(attn_metadata.decode_metadata.seq_lens).astype( np.array(attn_metadata.decode_metadata.seq_lens).astype(
np.int32)) np.int32))
block_tables = attn_metadata.decode_metadata.block_tables block_tables = attn_metadata.decode_metadata.block_tables
torch_npu.npu_pagedattention( torch_npu._npu_paged_attention(
query=query, query=query,
keyCache=key_cache, key_cache=key_cache,
valueCache=value_cache, value_cache=value_cache,
contextLens=self.seq_lens_tensor_cpu, num_kv_heads=self.num_kv_heads,
maskType=0, num_heads=self.num_heads,
kvHeadNum=self.num_kv_heads, scale_value=self.scale,
headNum=self.num_heads, block_table=block_tables,
mlaVHeadSize=0, context_lens=self.seq_lens_tensor_cpu,
qkScale=self.scale, out=output)
scaleType=0,
blockTables=block_tables,
batchRunStatusEnable=False,
hasQuantOffset=False,
calcType=3,
quantType=0,
compressType=0,
inputLayout=0,
outDataType=0,
attnOut=output)
return output.view(num_tokens, self.hidden_size) return output.view(num_tokens, self.hidden_size)

View File

@@ -42,7 +42,7 @@ def rope_forward_oot(
# TODO: Remove the contiguous in the future. # TODO: Remove the contiguous in the future.
query = query.contiguous() query = query.contiguous()
key = key.contiguous() key = key.contiguous()
torch_npu.npu_rope( torch_npu._npu_rotary_embedding(
positions, positions,
query, query,
key, key,