[CI] upgrade to newest pta (#187)
Upgrade to the newest torch-npu.
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
This commit is contained in:
4
.github/workflows/vllm_ascend_test.yaml
vendored
4
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -108,9 +108,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir pta
|
mkdir pta
|
||||||
cd pta
|
cd pta
|
||||||
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
|
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
|
||||||
tar -xvf pytorch_v2.5.1_py310.tar.gz
|
tar -xvf pytorch_v2.5.1_py310.tar.gz
|
||||||
pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
|
pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
|
||||||
cd ..
|
cd ..
|
||||||
rm -rf pta
|
rm -rf pta
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ This document describes how to install vllm-ascend manually.
|
|||||||
| Software | Supported version | Note |
|
| Software | Supported version | Note |
|
||||||
| ------------ | ----------------- | ---- |
|
| ------------ | ----------------- | ---- |
|
||||||
| CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
|
| CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
|
||||||
| torch-npu | >= 2.5.1.dev20250218 | Required for vllm-ascend |
|
| torch-npu | >= 2.5.1.dev20250226 | Required for vllm-ascend |
|
||||||
| torch | >= 2.5.1 | Required for torch-npu and vllm |
|
| torch | >= 2.5.1 | Required for torch-npu and vllm |
|
||||||
|
|
||||||
You have 2 ways to install:
|
You have 2 ways to install:
|
||||||
@@ -134,15 +134,15 @@ pip install vllm-ascend==|pip_vllm_ascend_version| --extra-index https://downloa
|
|||||||
#
|
#
|
||||||
# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
|
# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
|
||||||
#
|
#
|
||||||
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py39.tar.gz
|
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py39.tar.gz
|
||||||
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
|
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
|
||||||
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py311.tar.gz
|
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py311.tar.gz
|
||||||
#
|
#
|
||||||
mkdir pta
|
mkdir pta
|
||||||
cd pta
|
cd pta
|
||||||
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
|
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
|
||||||
tar -xvf pytorch_v2.5.1_py310.tar.gz
|
tar -xvf pytorch_v2.5.1_py310.tar.gz
|
||||||
pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
|
pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
|
||||||
```
|
```
|
||||||
|
|
||||||
or build from **source code**:
|
or build from **source code**:
|
||||||
|
|||||||
@@ -577,13 +577,11 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
self.num_kv_heads,
|
self.num_kv_heads,
|
||||||
self.head_size)
|
self.head_size)
|
||||||
slots = attn_metadata.slot_mapping
|
slots = attn_metadata.slot_mapping
|
||||||
torch_npu.npu_reshapecache(key=key,
|
torch_npu._npu_reshape_and_cache(key=key,
|
||||||
value=value,
|
value=value,
|
||||||
keyCache=key_cache,
|
key_cache=key_cache,
|
||||||
valueCache=value_cache,
|
value_cache=value_cache,
|
||||||
slotMapping=slots,
|
slot_indices=slots)
|
||||||
compressType=0,
|
|
||||||
kvCacheCfg=0)
|
|
||||||
|
|
||||||
if attn_metadata.num_prefills > 0:
|
if attn_metadata.num_prefills > 0:
|
||||||
|
|
||||||
@@ -596,32 +594,15 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
np.array(
|
np.array(
|
||||||
attn_metadata.prefill_metadata.seq_lens).astype(
|
attn_metadata.prefill_metadata.seq_lens).astype(
|
||||||
np.int32))
|
np.int32))
|
||||||
torch_npu.npu_selfattention(
|
torch_npu._npu_flash_attention(
|
||||||
query=query,
|
query=query,
|
||||||
key=key,
|
key=key,
|
||||||
value=value,
|
value=value,
|
||||||
mask=mask,
|
mask=mask,
|
||||||
maskType=1,
|
seq_len=self.seq_lens_tensor_cpu,
|
||||||
isTriuMask=0,
|
scale_value=self.scale,
|
||||||
seqLen=self.seq_lens_tensor_cpu,
|
num_heads=self.num_heads,
|
||||||
scale=self.scale,
|
num_kv_heads=self.num_kv_heads,
|
||||||
qScale=1,
|
|
||||||
headNum=self.num_heads,
|
|
||||||
kvHeadNum=self.num_kv_heads,
|
|
||||||
mlaVHeadSize=0,
|
|
||||||
calcType=3,
|
|
||||||
kernelType=0,
|
|
||||||
clampType=0,
|
|
||||||
scaleType=0,
|
|
||||||
quantType=0,
|
|
||||||
cacheType=0,
|
|
||||||
batchRunStatusEnable=False,
|
|
||||||
kvcacheCfg=0,
|
|
||||||
clampMin=0,
|
|
||||||
clampMax=0,
|
|
||||||
inputLayout=0,
|
|
||||||
windowSize=0,
|
|
||||||
outDataType=0,
|
|
||||||
out=output)
|
out=output)
|
||||||
else:
|
else:
|
||||||
# TODO: Will support prefix cache and chunked prefill soon.
|
# TODO: Will support prefix cache and chunked prefill soon.
|
||||||
@@ -634,26 +615,16 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
np.array(attn_metadata.decode_metadata.seq_lens).astype(
|
np.array(attn_metadata.decode_metadata.seq_lens).astype(
|
||||||
np.int32))
|
np.int32))
|
||||||
block_tables = attn_metadata.decode_metadata.block_tables
|
block_tables = attn_metadata.decode_metadata.block_tables
|
||||||
torch_npu.npu_pagedattention(
|
torch_npu._npu_paged_attention(
|
||||||
query=query,
|
query=query,
|
||||||
keyCache=key_cache,
|
key_cache=key_cache,
|
||||||
valueCache=value_cache,
|
value_cache=value_cache,
|
||||||
contextLens=self.seq_lens_tensor_cpu,
|
num_kv_heads=self.num_kv_heads,
|
||||||
maskType=0,
|
num_heads=self.num_heads,
|
||||||
kvHeadNum=self.num_kv_heads,
|
scale_value=self.scale,
|
||||||
headNum=self.num_heads,
|
block_table=block_tables,
|
||||||
mlaVHeadSize=0,
|
context_lens=self.seq_lens_tensor_cpu,
|
||||||
qkScale=self.scale,
|
out=output)
|
||||||
scaleType=0,
|
|
||||||
blockTables=block_tables,
|
|
||||||
batchRunStatusEnable=False,
|
|
||||||
hasQuantOffset=False,
|
|
||||||
calcType=3,
|
|
||||||
quantType=0,
|
|
||||||
compressType=0,
|
|
||||||
inputLayout=0,
|
|
||||||
outDataType=0,
|
|
||||||
attnOut=output)
|
|
||||||
|
|
||||||
return output.view(num_tokens, self.hidden_size)
|
return output.view(num_tokens, self.hidden_size)
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ def rope_forward_oot(
|
|||||||
# TODO: Remove the contiguous in the future.
|
# TODO: Remove the contiguous in the future.
|
||||||
query = query.contiguous()
|
query = query.contiguous()
|
||||||
key = key.contiguous()
|
key = key.contiguous()
|
||||||
torch_npu.npu_rope(
|
torch_npu._npu_rotary_embedding(
|
||||||
positions,
|
positions,
|
||||||
query,
|
query,
|
||||||
key,
|
key,
|
||||||
|
|||||||
Reference in New Issue
Block a user