[BugFix] NetLoader: No backend type associated with device type npu (#5700)

**What this PR does / why we need it?**
This PR fixes a bug in NetLoader
([PR#2888](https://github.com/vllm-project/vllm-ascend/pull/2888)). The
bug was introduced by
[PR#3612](https://github.com/vllm-project/vllm-ascend/pull/3612)
([1/N][Refactor] Refactor code to adapt with vllm main), which removed
the `stateless_init_device_torch_dist_pg` function from platform.py,
causing NetLoader's process-group initialization call to fail. This PR
adds a way to create a stateless process group that does not depend on
external code.

**Does this PR introduce any user-facing change?**
No

**How was this patch tested?**
Same as
[PR#2888](https://github.com/vllm-project/vllm-ascend/pull/2888)
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: destinysky <kangrui10@126.com>
This commit is contained in:
Rui Kang
2026-01-09 15:54:54 +08:00
committed by GitHub
parent 64904ab5b6
commit be941cab71
2 changed files with 197 additions and 9 deletions

View File

@@ -16,11 +16,11 @@
import torch
import torch_npu
from vllm.distributed.utils import (
stateless_destroy_torch_distributed_process_group,
stateless_init_torch_distributed_process_group)
from vllm.logger import logger
from .netloader_pg import (destroy_stateless_process_group,
stateless_init_process_group)
class P2PLoad:
"""
@@ -62,12 +62,12 @@ class P2PLoad:
receiver_pg = None
loaded_model = None
try:
receiver_pg = stateless_init_torch_distributed_process_group(
receiver_pg = stateless_init_process_group(
host=self.world_name.split(":")[0],
port=self.source_port,
rank=0,
world_size=2,
backend='hccl',
group_name='netloader',
)
logger.info(
f"Finish init_process_group, name: {self.world_name}, addr: {self.source_ip}:{self.source_port}"
@@ -97,7 +97,7 @@ class P2PLoad:
logger.error("Failed to recv model: {}".format(e))
finally:
if receiver_pg:
stateless_destroy_torch_distributed_process_group(receiver_pg)
destroy_stateless_process_group(receiver_pg)
return loaded_model
@@ -134,12 +134,12 @@ class P2PSend:
)
sender_pg = None
try:
sender_pg = stateless_init_torch_distributed_process_group(
sender_pg = stateless_init_process_group(
host=self.comm_name.split(":")[0],
port=self.listen_port,
rank=1,
world_size=2,
backend='hccl',
group_name='netloader',
)
logger.info(
f"Finish init_process_group, name: {self.comm_name}, addr: {self.listen_ip}:{self.listen_port}"
@@ -167,4 +167,4 @@ class P2PSend:
)
finally:
if sender_pg:
stateless_destroy_torch_distributed_process_group(sender_pg)
destroy_stateless_process_group(sender_pg)