Fix the deadlock in multi-node tp (#1122)

This commit is contained in:
Lianmin Zheng
2024-08-16 01:39:24 -07:00
committed by GitHub
parent 6aa8ad14f8
commit 5a261bd055
7 changed files with 54 additions and 16 deletions

View File

@@ -38,6 +38,7 @@ from vllm.distributed import (
init_distributed_environment,
initialize_model_parallel,
)
+from vllm.distributed.parallel_state import in_the_same_node_as
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import ModelRegistry
@@ -112,10 +113,13 @@ class ModelRunner:
distributed_init_method=nccl_init_method,
)
initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
-self.tp_group = get_tp_group()
total_gpu_memory = get_available_gpu_memory(
self.gpu_id, distributed=self.tp_size > 1
)
+self.tp_group = get_tp_group()
+self.is_multi_node_tp = not all(
+in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)
+)
if self.tp_size > 1:
total_local_gpu_memory = get_available_gpu_memory(self.gpu_id)