[DP][V1] Fix rank set in DP scenario & Bump torch-npu version to 2.5.1.post1.dev20250528 (#1235)
### What this PR does / why do we need it?

1. Fix the rank setting in the data-parallel (DP) scenario. The new proof-of-concept (PoC) version of torch-npu supports setting `ASCEND_RT_VISIBLE_DEVICES` dynamically, so we can use the rank set in `DPEngineCoreProc` directly instead of calculating the local rank across DP groups by hand in the patched `_init_data_parallel`.
   Closes: https://github.com/vllm-project/vllm-ascend/issues/1170
2. Bump the torch-npu version to 2.5.1.post1.dev20250528.
   Closes: https://github.com/vllm-project/vllm-ascend/pull/1242
   Closes: https://github.com/vllm-project/vllm-ascend/issues/1232

### How was this patch tested?

CI passed with the newly added test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -21,10 +21,9 @@ import vllm
|
||||
import vllm.distributed
|
||||
import vllm.envs as envs
|
||||
from torch.distributed import ProcessGroup
|
||||
from vllm.config import ParallelConfig, VllmConfig
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed.utils import \
|
||||
stateless_init_torch_distributed_process_group
|
||||
from vllm.v1.engine.core import DPEngineCoreProc
|
||||
|
||||
|
||||
def ascend_destroy_model_parallel():
|
||||
@@ -79,21 +78,6 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
|
||||
return dp_group
|
||||
|
||||
|
||||
def _init_data_parallel(self, vllm_config: VllmConfig):
|
||||
# Configure NPUs and stateless process group for data parallel.
|
||||
dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||
dp_size = vllm_config.parallel_config.data_parallel_size
|
||||
local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
|
||||
|
||||
assert dp_size > 1
|
||||
assert 0 <= local_dp_rank <= dp_rank < dp_size
|
||||
|
||||
self.local_dp_rank = local_dp_rank
|
||||
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
|
||||
self.current_wave = 0
|
||||
|
||||
|
||||
vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
|
||||
DPEngineCoreProc._init_data_parallel = _init_data_parallel
|
||||
ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
|
||||
ParallelConfig.stateless_init_dp_group = stateless_init_dp_group
|
||||
|
||||
Reference in New Issue
Block a user