[1/N] Introduce Mooncake Backend and Mooncake EP to Support Elastic EP (#10423)

Co-authored-by: Hank Han <hanhan7630@outlook.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
Authored by: Xun Sun
Date: 2025-10-15 10:40:54 +08:00
Committed by: GitHub
parent 74737b2863
commit a40229f6f8
13 changed files with 798 additions and 32 deletions

View File

@@ -677,7 +677,18 @@ class ModelRunner:
raise
if self.device == "cuda":
backend = "nccl"
if self.server_args.elastic_ep_backend == "mooncake":
backend = "mooncake"
if self.server_args.mooncake_ib_device:
mooncake_ib_device = self.server_args.mooncake_ib_device.split(",")
try:
from mooncake import ep as mooncake_ep
mooncake_ep.set_device_filter(mooncake_ib_device)
except:
pass # A warning will be raised in `init_distributed_environment`
else:
backend = "nccl"
elif self.device == "xpu":
backend = "xccl"
elif self.device == "hpu":
@@ -885,17 +896,23 @@ class ModelRunner:
f"mem usage={self.weight_load_mem_usage:.2f} GB."
)
# Handle the case where some ranks do not finish loading.
try:
dist.monitored_barrier(
group=get_tp_group().cpu_group,
timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S),
wait_all_ranks=True,
)
except RuntimeError:
raise ValueError(
f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
) from None
if self.server_args.elastic_ep_backend == "mooncake":
# Mooncake does not support `monitored_barrier`
dist.barrier(group=get_tp_group().cpu_group)
else:
# Handle the case where some ranks do not finish loading.
try:
dist.monitored_barrier(
group=get_tp_group().cpu_group,
timeout=datetime.timedelta(
seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S
),
wait_all_ranks=True,
)
except RuntimeError:
raise ValueError(
f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
) from None
def update_expert_location(
self,