Fix the deadlock in multi-node tensor parallelism (TP) (#1122)

2024-08-16 01:39:24 -07:00
parent 6aa8ad14f8
commit 5a261bd055
7 changed files with 54 additions and 16 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -38,6 +38,7 @@ from vllm.distributed import (
    init_distributed_environment,
    initialize_model_parallel,
 )
+from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import ModelRegistry

@@ -112,10 +113,13 @@ class ModelRunner:
            distributed_init_method=nccl_init_method,
        )
        initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
-        self.tp_group = get_tp_group()
        total_gpu_memory = get_available_gpu_memory(
            self.gpu_id, distributed=self.tp_size > 1
        )
+        self.tp_group = get_tp_group()
+        self.is_multi_node_tp = not all(
+            in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)
+        )

        if self.tp_size > 1:
            total_local_gpu_memory = get_available_gpu_memory(self.gpu_id)