[DP][V1] Fix rank set in DP scenario & Bump torch-npu version to 2.5.1.post1.dev20250528 (#1235)

### What this PR does / why we need it?
1. Fix rank set in DP scenario. The new PoC version of torch-npu supports
setting `ASCEND_RT_VISIBLE_DEVICES` dynamically, so we can use the
rank set in `DPEngineCoreProc` directly instead of calculating the local
rank across DP by hand in the patched `_init_data_parallel`.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1170

2. Bump torch-npu version to 2.5.1.post1.dev20250528

Closes: https://github.com/vllm-project/vllm-ascend/pull/1242
Closes: https://github.com/vllm-project/vllm-ascend/issues/1232


### How was this patch tested?
CI passed with newly added test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: Icey <1790571317@qq.com>
This commit is contained in:
Mengqing Cao
2025-06-16 23:09:53 +08:00
committed by GitHub
parent f5404dc650
commit 96fa7ff63b
19 changed files with 114 additions and 47 deletions

View File

@@ -173,6 +173,8 @@ jobs:
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
working-directory: ./vllm-ascend working-directory: ./vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -e . pip install -e .

View File

@@ -19,6 +19,12 @@ on:
- '.github/workflows/image_openeuler.yml' - '.github/workflows/image_openeuler.yml'
- 'Dockerfile.openEuler' - 'Dockerfile.openEuler'
- 'vllm_ascend/**' - 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
push: push:
# Publish image when tagging, the Dockerfile in tag will be build as tag image # Publish image when tagging, the Dockerfile in tag will be build as tag image
branches: branches:

View File

@@ -19,6 +19,12 @@ on:
- '.github/workflows/image_ubuntu.yml' - '.github/workflows/image_ubuntu.yml'
- 'Dockerfile' - 'Dockerfile'
- 'vllm_ascend/**' - 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
push: push:
# Publish image when tagging, the Dockerfile in tag will be build as tag image # Publish image when tagging, the Dockerfile in tag will be build as tag image
branches: branches:

View File

@@ -115,6 +115,8 @@ jobs:
VLLM_TARGET_DEVICE=empty pip install -e . VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -e . pip install -e .
pip install -r benchmarks/requirements-bench.txt pip install -r benchmarks/requirements-bench.txt

View File

@@ -151,6 +151,7 @@ jobs:
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
run: | run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
@@ -214,6 +215,8 @@ jobs:
VLLM_TARGET_DEVICE=empty pip install -e . VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -v -e . pip install -v -e .
@@ -311,6 +314,8 @@ jobs:
VLLM_TARGET_DEVICE=empty pip install -e . VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -v -e . pip install -v -e .

View File

@@ -88,6 +88,8 @@ jobs:
VLLM_TARGET_DEVICE=empty pip install -e . VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -v -e . pip install -v -e .

View File

@@ -97,6 +97,8 @@ jobs:
VLLM_TARGET_DEVICE=empty pip install -e . VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend - name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: | run: |
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
pip install -v -e . pip install -v -e .

View File

@@ -46,7 +46,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm
# Install vllm-ascend # Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH # Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -43,7 +43,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ -
python3 -m pip cache purge python3 -m pip cache purge
# Install vllm-ascend # Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -38,7 +38,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
- Software: - Software:
* Python >= 3.9, < 3.12 * Python >= 3.9, < 3.12
* CANN >= 8.1.RC1 * CANN >= 8.1.RC1
* PyTorch >= 2.5.1, torch-npu >= 2.5.1 * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528
* vLLM (the same version as vllm-ascend) * vLLM (the same version as vllm-ascend)
## Getting Started ## Getting Started

View File

@@ -39,7 +39,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
- 软件: - 软件:
* Python >= 3.9, < 3.12 * Python >= 3.9, < 3.12
* CANN >= 8.1.RC1 * CANN >= 8.1.RC1
* PyTorch >= 2.5.1, torch-npu >= 2.5.1 * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528
* vLLM (与vllm-ascend版本一致) * vLLM (与vllm-ascend版本一致)
## 开始使用 ## 开始使用

View File

@@ -9,11 +9,11 @@ This document describes how to install vllm-ascend manually.
- A hardware with Ascend NPU. It's usually the Atlas 800 A2 series. - A hardware with Ascend NPU. It's usually the Atlas 800 A2 series.
- Software: - Software:
| Software | Supported version | Note | | Software | Supported version | Note |
|-----------|-------------------|----------------------------------------| |---------------|----------------------------------|-------------------------------------------|
| CANN | >= 8.1.RC1 | Required for vllm-ascend and torch-npu | | CANN | >= 8.1.RC1 | Required for vllm-ascend and torch-npu |
| torch-npu | >= 2.5.1 | Required for vllm-ascend | | torch-npu | >= 2.5.1.post1.dev20250528 | Required for vllm-ascend |
| torch | >= 2.5.1 | Required for torch-npu and vllm | | torch | >= 2.5.1 | Required for torch-npu and vllm |
You have 2 way to install: You have 2 way to install:
- **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip. - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
@@ -156,6 +156,7 @@ cd ..
# Install vLLM Ascend # Install vLLM Ascend
git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend cd vllm-ascend
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
pip install -v -e . pip install -v -e .
cd .. cd ..
``` ```

View File

@@ -12,7 +12,7 @@ requires = [
"scipy", "scipy",
"setuptools>=64", "setuptools>=64",
"setuptools-scm>=8", "setuptools-scm>=8",
"torch-npu==2.5.1", "torch-npu==2.5.1.post1.dev20250528",
"torch>=2.5.1", "torch>=2.5.1",
"torchvision<0.21.0", "torchvision<0.21.0",
"wheel", "wheel",

View File

@@ -10,7 +10,6 @@ pyyaml
scipy scipy
setuptools>=64 setuptools>=64
setuptools-scm>=8 setuptools-scm>=8
torch-npu==2.5.1
torch>=2.5.1 torch>=2.5.1
torchvision<0.21.0 torchvision<0.21.0
wheel wheel
@@ -21,3 +20,8 @@ quart
# Required for N-gram speculative decoding # Required for N-gram speculative decoding
numba numba
# Install torch_npu
--pre
--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
torch-npu==2.5.1.post1.dev20250528

View File

@@ -152,7 +152,7 @@ class cmake_build_ext(build_ext):
# if pybind11 is installed via pip # if pybind11 is installed via pip
pybind11_cmake_path = (subprocess.check_output( pybind11_cmake_path = (subprocess.check_output(
[python_executable, "-m", "pybind11", [python_executable, "-m", "pybind11",
"--cmake"]).decode().strip()) "--cmakedir"]).decode().strip())
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
# else specify pybind11 path installed from source code on CI container # else specify pybind11 path installed from source code on CI container
raise RuntimeError(f"CMake configuration failed: {e}") raise RuntimeError(f"CMake configuration failed: {e}")

View File

@@ -0,0 +1,66 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/multicard/test_data_parallel.py`.
"""
import os
import pytest
from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="Data parallel only support on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_data_parallel_correctness(
    model: str,
    max_tokens: int,
) -> None:
    """Greedy outputs with data parallelism (dp=2) must match a plain run."""
    # The same prompts feed both runs so the outputs are directly comparable.
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "The capital of France is", "The future of AI is"
    ]

    # Run with data parallelism enabled (2 DP ranks, multiprocessing backend).
    with VllmRunner(model_name=model,
                    max_model_len=1024,
                    max_num_seqs=16,
                    data_parallel_size=2,
                    distributed_executor_backend="mp") as runner:
        dp_outputs = runner.generate_greedy(prompts, max_tokens)

    # Reference run: identical settings, no data parallelism.
    with VllmRunner(
            model_name=model,
            max_model_len=1024,
            max_num_seqs=16,
    ) as runner:
        reference_outputs = runner.generate_greedy(prompts, max_tokens)

    # Greedy decoding is deterministic, so both runs must agree token-for-token.
    check_outputs_equal(
        outputs_0_lst=reference_outputs,
        outputs_1_lst=dp_outputs,
        name_0="vllm_outputs",
        name_1="vllm_dp_outputs",
    )

View File

@@ -47,16 +47,7 @@
# Related PR (if no, explain why): # Related PR (if no, explain why):
# Future Plan: # Future Plan:
# Remove those patch when vllm merged them # Remove those patch when vllm merged them
# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` # 2. `vllm.config.ParallelConfig.get_next_dp_init_port`
# Why:
# There is some bug for ASCEND_RT_VISIBLE_DEVICES usage.
# How
# The ASCEND_RT_VISIBLE_DEVICES related code is dropped.
# Related PR (if no, explain why):
# No, this is a bug for vllm ascend
# Future Plan:
# Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed.
# 3. `vllm.config.ParallelConfig.get_next_dp_init_port`
# Why: # Why:
# vllm doesn't support get port from environment. # vllm doesn't support get port from environment.
# How # How
@@ -65,7 +56,7 @@
# Need a PR to vllm to support get port from environment. # Need a PR to vllm to support get port from environment.
# Future Plan: # Future Plan:
# Remove those patch when vllm merged them # Remove those patch when vllm merged them
# 4. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` # 3. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group`
# Why: # Why:
# vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to
# get better performance # get better performance

View File

@@ -21,10 +21,9 @@ import vllm
import vllm.distributed import vllm.distributed
import vllm.envs as envs import vllm.envs as envs
from torch.distributed import ProcessGroup from torch.distributed import ProcessGroup
from vllm.config import ParallelConfig, VllmConfig from vllm.config import ParallelConfig
from vllm.distributed.utils import \ from vllm.distributed.utils import \
stateless_init_torch_distributed_process_group stateless_init_torch_distributed_process_group
from vllm.v1.engine.core import DPEngineCoreProc
def ascend_destroy_model_parallel(): def ascend_destroy_model_parallel():
@@ -79,21 +78,6 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
return dp_group return dp_group
def _init_data_parallel(self, vllm_config: VllmConfig):
# Configure NPUs and stateless process group for data parallel.
dp_rank = vllm_config.parallel_config.data_parallel_rank
dp_size = vllm_config.parallel_config.data_parallel_size
local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
assert dp_size > 1
assert 0 <= local_dp_rank <= dp_rank < dp_size
self.local_dp_rank = local_dp_rank
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
self.current_wave = 0
vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
DPEngineCoreProc._init_data_parallel = _init_data_parallel
ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
ParallelConfig.stateless_init_dp_group = stateless_init_dp_group ParallelConfig.stateless_init_dp_group = stateless_init_dp_group

View File

@@ -75,12 +75,6 @@ class NPUWorker(WorkerBase):
distributed_init_method=distributed_init_method, distributed_init_method=distributed_init_method,
is_driver_worker=is_driver_worker) is_driver_worker=is_driver_worker)
# NOTE(Yizhou): Since we do not set ASCEND_RT_VISIBLE_DEVICES in
# vllm_ascend, we need to set the device id manually.
local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local
world_size = self.vllm_config.parallel_config.world_size
self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank
# Try to import mindie_turbo to accelerate vLLM inference. # Try to import mindie_turbo to accelerate vLLM inference.
try_register_lib( try_register_lib(
"mindie_turbo", "mindie_turbo",
@@ -124,7 +118,7 @@ class NPUWorker(WorkerBase):
def init_device(self): def init_device(self):
if self.device_config.device.type == "npu": if self.device_config.device.type == "npu":
self.device = torch.device(f"npu:{self.local_rank_across_dp}") self.device = torch.device(f"npu:{self.local_rank}")
NPUPlatform.set_device(self.device) NPUPlatform.set_device(self.device)
NPUPlatform.empty_cache() NPUPlatform.empty_cache()
self.init_npu_memory = NPUPlatform.mem_get_info()[0] self.init_npu_memory = NPUPlatform.mem_get_info()[0]