[Platform] Add initial experimental support for Altlas 300I series (#1333)

### What this PR does / why we need it?
Add initial experimental support for Ascend 310P, this patch squash
below PR into one to help validation:

- https://github.com/vllm-project/vllm-ascend/pull/914
- https://github.com/vllm-project/vllm-ascend/pull/1318
- https://github.com/vllm-project/vllm-ascend/pull/1327


### Does this PR introduce _any_ user-facing change?
User can run vLLM on Altlas 300I DUO series

### How was this patch tested?
CI passed with:
- E2E image build for 310P
- CI test on A2 with e2e test and longterm test
- Unit test missing because need a real 310P image to have the test,
will add in a separate PR later.
- Manually e2e test:
- Qwen2.5-7b-instruct, Qwen2.5-0.5b, Qwen3-0.6B, Qwen3-4B, Qwen3-8B:
https://github.com/vllm-project/vllm-ascend/pull/914#issuecomment-2942989322
  - Pangu MGoE 72B


The patch has been tested locally on Ascend 310P hardware to ensure that
the changes do not break existing functionality and that the new
features work as intended.

#### ENV information

CANN, NNAL version: 8.1.RC1
> [!IMPORTANT]  
> PTA 2.5.1 version >= torch_npu-2.5.1.post1.dev20250528 to support NZ
format and calling NNAL operators on 310P

#### Code example

##### Build vllm-ascend from source code

```shell
# download source code as vllm-ascend
cd vllm-ascend
export SOC_VERSION=Ascend310P3
pip install -v -e .
cd ..
```

##### Run offline inference

```python
from vllm import LLM, SamplingParams
prompts = ["水的沸点是100摄氏度吗?请回答是或者否。", "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。",
           "水的沸点是100摄氏度吗?请回答是或者否。", "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。"]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=10)
# Create an LLM.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    max_model_len=4096,
    max_num_seqs=4,
    dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 310P
    disable_custom_all_reduce=True,
    trust_remote_code=True,
    tensor_parallel_size=2,
    compilation_config={"custom_ops":['none', "+rms_norm", "+rotary_embedding"]},
)

# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

```

---------

Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: Vincent Yuan <farawayboat@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: shen-shanshan <467638484@qq.com>
This commit is contained in:
Yikun Jiang
2025-06-21 09:00:16 +08:00
committed by GitHub
parent 2009fdb8da
commit 097e7149f7
23 changed files with 839 additions and 62 deletions

View File

@@ -30,6 +30,8 @@ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm_ascend.ops.attention import vanilla_chunked_prefill
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
nd_to_nz_2d, nd_to_nz_spec)
class AscendAttentionBackend(AttentionBackend):
@@ -62,6 +64,9 @@ class AscendAttentionBackend(AttentionBackend):
num_kv_heads: int,
head_size: int,
) -> Tuple[int, ...]:
if is_310p():
return (2, num_blocks, num_kv_heads * head_size // 16, block_size,
16)
return (2, num_blocks, block_size, num_kv_heads, head_size)
@staticmethod
@@ -167,6 +172,16 @@ class AscendAttentionMetadataBuilder:
query_start_loc = query_start_loc_cpu.to(self.runner.device,
non_blocking=True)
if is_310p():
if attn_state == AscendAttentionState.PrefillNoCache:
mask_nz = nd_to_nz_2d(attn_mask)
attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
ACL_FORMAT_FRACTAL_NZ)
elif attn_state == AscendAttentionState.ChunkedPrefill:
mask_nz = nd_to_nz_spec(attn_mask)
attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
ACL_FORMAT_FRACTAL_NZ)
attn_metadata = AscendMetadata(
num_actual_tokens=num_actual_tokens,
block_tables=block_table,
@@ -250,6 +265,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
self.head_size,
dtype=query.dtype,
device=query.device)
ori_output = output
if trace_flag:
torch.ops.vllm.unified_ascend_attention_with_output(
query=query,
@@ -294,6 +310,18 @@ class AscendAttentionBackendImpl(AttentionImpl):
assert attn_metadata is not None
assert attn_metadata.attn_mask is not None
mask = attn_metadata.attn_mask
if is_310p():
# align q k v output tensors
query = aligned_16(query)
key = aligned_16(key)
value = aligned_16(value)
output = aligned_16(output)
# do reformat in case of broadcasted tensors
mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1)
mask = torch_npu.npu_format_cast(mask.contiguous(),
ACL_FORMAT_FRACTAL_NZ)
torch_npu._npu_flash_attention(query=query,
key=key,
value=value,
@@ -303,6 +331,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
out=output)
output = output[:num_tokens, :, :]
elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
assert attn_metadata is not None
assert attn_metadata.attn_mask is not None
@@ -320,6 +349,10 @@ class AscendAttentionBackendImpl(AttentionImpl):
scale_value=self.scale,
out=output)
elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
if is_310p():
# # seq_lens_tensor needs to be transferred to the device for 310P
attn_metadata.seq_lens = \
attn_metadata.seq_lens.to(device=query.device)
torch_npu._npu_paged_attention(
query=query,
key_cache=self.key_cache,
@@ -353,6 +386,14 @@ class AscendAttentionBackendImpl(AttentionImpl):
self.scale, None, True)
else:
# use paged attention
assert attn_metadata is not None
assert attn_metadata.attn_mask is not None
if is_310p():
# do reformat in case of broadcasted tensors
attn_metadata.attn_mask = \
torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), ACL_FORMAT_FRACTAL_NZ)
attn_metadata.seq_lens = \
attn_metadata.seq_lens.to(device=query.device)
torch_npu._npu_paged_attention_splitfuse(
query=query,
key_cache=self.key_cache,
@@ -365,6 +406,9 @@ class AscendAttentionBackendImpl(AttentionImpl):
num_heads=self.num_heads,
scale_value=self.scale,
out=output)
# to make in-place change to the output tensor
ori_output[:, :, :] = output[:num_tokens, :, :]
return output.view(num_tokens, self.hidden_size)