### What this PR does / why we need it?

Add initial experimental support for Ascend 310P. This patch squashes the PRs below into one to help validation:

- https://github.com/vllm-project/vllm-ascend/pull/914
- https://github.com/vllm-project/vllm-ascend/pull/1318
- https://github.com/vllm-project/vllm-ascend/pull/1327

### Does this PR introduce _any_ user-facing change?

Users can now run vLLM on the Atlas 300I Duo series.

### How was this patch tested?

CI passed with:

- E2E image build for 310P
- CI test on A2 with e2e test and longterm test
- Unit tests are missing because a real 310P image is required to run them; they will be added in a separate PR later.
- Manual e2e tests:
  - Qwen2.5-7b-instruct, Qwen2.5-0.5b, Qwen3-0.6B, Qwen3-4B, Qwen3-8B: https://github.com/vllm-project/vllm-ascend/pull/914#issuecomment-2942989322
  - Pangu MGoE 72B

The patch has been tested locally on Ascend 310P hardware to ensure that the changes do not break existing functionality and that the new features work as intended.

#### ENV information

CANN, NNAL version: 8.1.RC1

> [!IMPORTANT]
> PTA 2.5.1 version >= torch_npu-2.5.1.post1.dev20250528 is required to support the NZ format and to call NNAL operators on 310P (see the version-check snippet at the end of this description).

#### Code example

##### Build vllm-ascend from source code

```shell
# download source code as vllm-ascend
cd vllm-ascend
export SOC_VERSION=Ascend310P3
pip install -v -e .
cd ..
```

##### Run offline inference

```python
from vllm import LLM, SamplingParams

prompts = [
    "水的沸点是100摄氏度吗?请回答是或者否。",  # "Is the boiling point of water 100 °C? Answer yes or no."
    "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。",  # "Is a person with an armpit temperature of 38 °C running a fever? Answer yes or no."
    "水的沸点是100摄氏度吗?请回答是或者否。",
    "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=10)

# Create an LLM.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    max_model_len=4096,
    max_num_seqs=4,
    dtype="float16",  # IMPORTANT: some ATB operators do not support bf16 on 310P
    disable_custom_all_reduce=True,
    trust_remote_code=True,
    tensor_parallel_size=2,
    compilation_config={"custom_ops": ["none", "+rms_norm", "+rotary_embedding"]},
)

# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

---------

Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: Vincent Yuan <farawayboat@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: shen-shanshan <467638484@qq.com>
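To verify the PTA requirement above, the installed torch_npu build can be inspected directly. A minimal sketch (the exact version-string format may vary between builds, so treat the expected value as an assumption):

```python
import torch_npu

# Expect a dev build tag of 20250528 or newer, e.g. "2.5.1.post1.dev20250528".
print(torch_npu.__version__)
```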
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from vllm/model_executor/models/qwen2_vl.py
# This file is a part of the vllm-ascend project.

import torch
import vllm
import vllm.distributed
import vllm.envs as envs
from torch.distributed import ProcessGroup
from vllm.config import ParallelConfig
from vllm.distributed.utils import \
    stateless_init_torch_distributed_process_group

from vllm_ascend.utils import NullHandle, is_310p

def ascend_destroy_model_parallel():
    """Set the groups to none and destroy them."""
    from vllm.distributed.parallel_state import _DP, _PP, _TP
    if _TP:
        _TP.destroy()
    _TP = None

    if _PP:
        _PP.destroy()
    _PP = None

    if _DP:
        _DP.destroy()
    _DP = None
    from vllm_ascend.distributed.parallel_state import \
        destory_ascend_model_parallel
    destory_ascend_model_parallel()

def parallel_config_get_dp_port(self) -> int:
    """
    We might need to initialize process groups related to data parallelism
    in multiple processes, e.g. both in the worker and in the engine, which
    can live in different processes. To avoid port conflicts, we increment
    the port number each time we need to initialize a new process group
    related to data parallelism.
    """
    answer = self.data_parallel_master_port
    self.data_parallel_master_port += 1

    # NOTE: Get port from envs directly when using torchrun
    port = envs.VLLM_DP_MASTER_PORT if envs.VLLM_DP_MASTER_PORT else answer
    return port
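# Illustrative behavior (hypothetical values): with data_parallel_master_port
# starting at 29500 and VLLM_DP_MASTER_PORT unset, successive calls to
# get_next_dp_init_port() return 29500, 29501, 29502, ..., so every DP-related
# process group binds a distinct port; when VLLM_DP_MASTER_PORT is set (e.g.
# under torchrun), each call returns that fixed port instead.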
def stateless_init_dp_group(self) -> "ProcessGroup":
|
|
# TODO(Yizhou): Currently we have to set the backend to gloo
|
|
# because in vllm.config.ParallelConfig.has_unfinished_dp the
|
|
# device is set to cpu. We need to fix this in the future.
|
|
# We need to compare the performance of gloo and hccl and then
|
|
# decide which one to use.
|
|
dp_group = stateless_init_torch_distributed_process_group(
|
|
self.data_parallel_master_ip,
|
|
self.get_next_dp_init_port(),
|
|
self.data_parallel_rank,
|
|
self.data_parallel_size,
|
|
backend="gloo")
|
|
|
|
return dp_group
|
|
|
|
|
|
vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
ParallelConfig.stateless_init_dp_group = stateless_init_dp_group
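# NOTE: the three assignments above take effect as soon as this patch module
# is imported, replacing vllm's stock implementations for the whole process.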
def communication_adaptation_310p():

    def broadcast310p(tensor, src, group=None, async_op=False):
        # Emulate broadcast with all_gather: every rank contributes its
        # tensor, then copies the slot written by the source rank.
        rank = torch.distributed.get_rank(group)
        world_size = torch.distributed.get_world_size(group)
        tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
        tensor_list[rank] = tensor
        torch.distributed.all_gather(tensor_list, tensor, group=group)
        tensor[...] = tensor_list[src]
        if async_op:
            return NullHandle()
        else:
            return None

    torch.distributed.broadcast = broadcast310p
    torch.distributed.distributed_c10d.broadcast = broadcast310p

    def all_reduce_wrapper_310p(fn):

        def all_reduce(
            tensor,
            op=torch.distributed.ReduceOp.SUM,
            group=None,
            async_op=False,
        ):
            # Non-int64 tensors take the original all_reduce path; int64
            # tensors are reduced via all_gather plus a local reduction.
            if tensor.dtype != torch.int64:
                return fn(tensor, op, group, async_op)
            rank = torch.distributed.get_rank(group)
            world_size = torch.distributed.get_world_size(group)
            tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
            tensor_list[rank] = tensor
            torch.distributed.all_gather(tensor_list, tensor, group=group)
            if op == torch.distributed.ReduceOp.SUM:
                return torch.stack(tensor_list).sum(0)
            elif op == torch.distributed.ReduceOp.MAX:
                return torch.tensor(
                    torch.stack(tensor_list).cpu().numpy().max(0),
                    device=tensor.device,
                )
            else:
                raise RuntimeError(f"op {op} is not implemented")

        return all_reduce

    torch.distributed.all_reduce = all_reduce_wrapper_310p(
        torch.distributed.all_reduce)
    torch.distributed.distributed_c10d.all_reduce = all_reduce_wrapper_310p(
        torch.distributed.distributed_c10d.all_reduce)

    def reduce_scatter_310p(output_tensor, input_tensor, group=None):
        # Emulate reduce_scatter: all_reduce the full input in place, then
        # slice out this rank's chunk into the output tensor.
        rank = torch.distributed.get_rank(group)
        world_size = torch.distributed.get_world_size(group)
        torch.distributed.all_reduce(input_tensor,
                                     torch.distributed.ReduceOp.SUM,
                                     group,
                                     async_op=False)
        interval = input_tensor.shape[0] // world_size
        output_tensor[:] = input_tensor[rank * interval:(rank + 1) * interval]

    torch.distributed._reduce_scatter_base = reduce_scatter_310p
    torch.distributed.distributed_c10d._reduce_scatter_base = reduce_scatter_310p

if is_310p():
    communication_adaptation_310p()
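# Illustrative effect on 310P (hypothetical 2-rank run): with the patch above
# applied, an int64 reduction such as
#     t = torch.tensor([rank], dtype=torch.int64, device="npu")
#     t = torch.distributed.all_reduce(t)  # returns the result for int64 input
# is served by the all_gather-based emulation, since (per this patch) the
# native collective does not handle these cases on 310P.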