[1/N] Introduce Mooncake Backend and Mooncake EP to Support Elastic EP (#10423)
Co-authored-by: Hank Han <hanhan7630@outlook.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
This commit is contained in:
@@ -677,7 +677,18 @@ class ModelRunner:
             raise

         if self.device == "cuda":
-            backend = "nccl"
+            if self.server_args.elastic_ep_backend == "mooncake":
+                backend = "mooncake"
+                if self.server_args.mooncake_ib_device:
+                    mooncake_ib_device = self.server_args.mooncake_ib_device.split(",")
+                    try:
+                        from mooncake import ep as mooncake_ep
+
+                        mooncake_ep.set_device_filter(mooncake_ib_device)
+                    except:
+                        pass  # A warning will be raised in `init_distributed_environment`
+            else:
+                backend = "nccl"
         elif self.device == "xpu":
             backend = "xccl"
         elif self.device == "hpu":
@@ -885,17 +896,23 @@ class ModelRunner:
             f"mem usage={self.weight_load_mem_usage:.2f} GB."
         )

-        # Handle the case where some ranks do not finish loading.
-        try:
-            dist.monitored_barrier(
-                group=get_tp_group().cpu_group,
-                timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S),
-                wait_all_ranks=True,
-            )
-        except RuntimeError:
-            raise ValueError(
-                f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
-            ) from None
+        if self.server_args.elastic_ep_backend == "mooncake":
+            # Mooncake does not support `monitored_barrier`
+            dist.barrier(group=get_tp_group().cpu_group)
+        else:
+            # Handle the case where some ranks do not finish loading.
+            try:
+                dist.monitored_barrier(
+                    group=get_tp_group().cpu_group,
+                    timeout=datetime.timedelta(
+                        seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S
+                    ),
+                    wait_all_ranks=True,
+                )
+            except RuntimeError:
+                raise ValueError(
+                    f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
+                ) from None

     def update_expert_location(
         self,
Reference in New Issue
Block a user