[bugfix] Fix Ray startup failure: "local_world_size must be less than or equal to the visible device count" assertion error (#4457)
### What this PR does / why we need it? Fix the bug where Ray fails to start with the assertion error "local_world_size must be less than or equal to the number of visible devices"; for details, see issue #4456. The fix is ported from the corresponding vLLM fix, PR: [#28873](https://github.com/vllm-project/vllm/pull/28873) - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 --------- Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -59,6 +59,7 @@ class TestNPUTorchairWorker(TestBase):
|
|||||||
worker.vllm_config = MagicMock()
|
worker.vllm_config = MagicMock()
|
||||||
worker.parallel_config = MagicMock()
|
worker.parallel_config = MagicMock()
|
||||||
worker.parallel_config.local_world_size = 0
|
worker.parallel_config.local_world_size = 0
|
||||||
|
worker.parallel_config.data_parallel_size = 1
|
||||||
|
|
||||||
result = worker._init_device()
|
result = worker._init_device()
|
||||||
|
|
||||||
@@ -93,6 +94,7 @@ class TestNPUTorchairWorker(TestBase):
|
|||||||
worker.vllm_config = MagicMock()
|
worker.vllm_config = MagicMock()
|
||||||
worker.parallel_config = MagicMock()
|
worker.parallel_config = MagicMock()
|
||||||
worker.parallel_config.local_world_size = 0
|
worker.parallel_config.local_world_size = 0
|
||||||
|
worker.parallel_config.data_parallel_size = 1
|
||||||
|
|
||||||
result = worker._init_device()
|
result = worker._init_device()
|
||||||
|
|
||||||
|
|||||||
@@ -329,6 +329,8 @@ class TestNPUWorker(TestBase):
|
|||||||
worker.model_config = MagicMock()
|
worker.model_config = MagicMock()
|
||||||
worker.parallel_config = MagicMock()
|
worker.parallel_config = MagicMock()
|
||||||
worker.parallel_config.local_world_size = 0
|
worker.parallel_config.local_world_size = 0
|
||||||
|
worker.parallel_config.data_parallel_size = 1
|
||||||
|
|
||||||
worker.model_config.seed = 42
|
worker.model_config.seed = 42
|
||||||
|
|
||||||
# Test _init_device
|
# Test _init_device
|
||||||
|
|||||||
@@ -208,12 +208,18 @@ class NPUWorker(WorkerBase):
|
|||||||
NPUPlatform.set_device(device)
|
NPUPlatform.set_device(device)
|
||||||
NPUPlatform.empty_cache()
|
NPUPlatform.empty_cache()
|
||||||
|
|
||||||
visible_device_count = (torch.npu.device_count()
|
if (self.parallel_config.data_parallel_size > 1
|
||||||
if torch.npu.is_available() else 0)
|
and self.parallel_config.data_parallel_size_local > 0
|
||||||
assert self.parallel_config.local_world_size <= visible_device_count, (
|
and self.parallel_config.distributed_executor_backend
|
||||||
f"local_world_size ({self.parallel_config.local_world_size}) must be "
|
not in ["ray", "external_launcher"] and
|
||||||
f"less than or equal to the number of visible devices "
|
self.vllm_config.parallel_config.data_parallel_backend != "ray"
|
||||||
f"({visible_device_count}).")
|
and self.vllm_config.parallel_config.nnodes_within_dp == 1):
|
||||||
|
visible_device_count = (torch.npu.device_count()
|
||||||
|
if torch.npu.is_available() else 0)
|
||||||
|
assert self.parallel_config.local_world_size <= visible_device_count, (
|
||||||
|
f"local_world_size ({self.parallel_config.local_world_size}) must "
|
||||||
|
f"be less than or equal to the number of visible devices "
|
||||||
|
f"({visible_device_count}).")
|
||||||
|
|
||||||
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
|
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
|
||||||
# Initialize the distributed environment.
|
# Initialize the distributed environment.
|
||||||
|
|||||||
Reference in New Issue
Block a user