[BugFix] NetLoader: No backend type associated with device type npu (#5700)
**What this PR does / why we need it?**
This PR fixes a bug in NetLoader
([PR#2888](https://github.com/vllm-project/vllm-ascend/pull/2888)). The
bug was caused by
[PR#3612](https://github.com/vllm-project/vllm-ascend/pull/3612)
([1/N][Refactor] Refactor code to adapt with vllm main), which removed
the `stateless_init_device_torch_dist_pg` function from platform.py,
causing the process group initialization call to fail. This PR adds a
way to create a stateless process group that does not depend on external code.
**Does this PR introduce any user-facing change?**
No
**How was this patch tested?**
Same as
[PR#2888](https://github.com/vllm-project/vllm-ascend/pull/2888)
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
---------
Signed-off-by: destinysky <kangrui10@126.com>
This commit is contained in:
@@ -16,11 +16,11 @@
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.distributed.utils import (
|
||||
stateless_destroy_torch_distributed_process_group,
|
||||
stateless_init_torch_distributed_process_group)
|
||||
from vllm.logger import logger
|
||||
|
||||
from .netloader_pg import (destroy_stateless_process_group,
|
||||
stateless_init_process_group)
|
||||
|
||||
|
||||
class P2PLoad:
|
||||
"""
|
||||
@@ -62,12 +62,12 @@ class P2PLoad:
|
||||
receiver_pg = None
|
||||
loaded_model = None
|
||||
try:
|
||||
receiver_pg = stateless_init_torch_distributed_process_group(
|
||||
receiver_pg = stateless_init_process_group(
|
||||
host=self.world_name.split(":")[0],
|
||||
port=self.source_port,
|
||||
rank=0,
|
||||
world_size=2,
|
||||
backend='hccl',
|
||||
group_name='netloader',
|
||||
)
|
||||
logger.info(
|
||||
f"Finish init_process_group, name: {self.world_name}, addr: {self.source_ip}:{self.source_port}"
|
||||
@@ -97,7 +97,7 @@ class P2PLoad:
|
||||
logger.error("Failed to recv model: {}".format(e))
|
||||
finally:
|
||||
if receiver_pg:
|
||||
stateless_destroy_torch_distributed_process_group(receiver_pg)
|
||||
destroy_stateless_process_group(receiver_pg)
|
||||
return loaded_model
|
||||
|
||||
|
||||
@@ -134,12 +134,12 @@ class P2PSend:
|
||||
)
|
||||
sender_pg = None
|
||||
try:
|
||||
sender_pg = stateless_init_torch_distributed_process_group(
|
||||
sender_pg = stateless_init_process_group(
|
||||
host=self.comm_name.split(":")[0],
|
||||
port=self.listen_port,
|
||||
rank=1,
|
||||
world_size=2,
|
||||
backend='hccl',
|
||||
group_name='netloader',
|
||||
)
|
||||
logger.info(
|
||||
f"Finish init_process_group, name: {self.comm_name}, addr: {self.listen_ip}:{self.listen_port}"
|
||||
@@ -167,4 +167,4 @@ class P2PSend:
|
||||
)
|
||||
finally:
|
||||
if sender_pg:
|
||||
stateless_destroy_torch_distributed_process_group(sender_pg)
|
||||
destroy_stateless_process_group(sender_pg)
|
||||
Reference in New Issue
Block a user