55 lines
2.2 KiB
Python
55 lines
2.2 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
"""Utility functions for EPLB (Expert Parallel Load Balancing)."""
|
||
|
|
|
||
|
|
import os
|
||
|
|
|
||
|
|
from vllm.config import ParallelConfig
|
||
|
|
from vllm.logger import init_logger
|
||
|
|
|
||
|
|
# Logger for this module, created by vLLM's init_logger from the module's
# __name__; used below to report environment overrides.
logger = init_logger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:
    """
    Cap ``NCCL_MAX_CTAS`` when async EPLB runs on the DeepEP low-latency
    backend.

    The environment variable is written only when all of the following hold:
    ``data_parallel_size > 1``, EPLB is enabled, EPLB is asynchronous, and
    the all2all backend is ``deepep_low_latency``. If ``NCCL_MAX_CTAS`` is
    already set to a non-empty, digits-only string, it is left untouched.

    Args:
        parallel_config: The parallel configuration object.
    """
    # Read every relevant setting up front, before any decision is made.
    multi_dp = parallel_config.data_parallel_size > 1
    eplb_on = parallel_config.enable_eplb
    eplb_async = parallel_config.eplb_config.use_async
    deepep_ll = parallel_config.all2all_backend == "deepep_low_latency"

    # Why NCCL_MAX_CTAS is capped for async EPLB + DeepEP low-latency (LL):
    #
    # Async EPLB exchanges weights with NCCL collectives, while DeepEP LL
    # kernels use a cooperative launch that tries to reserve a large fraction
    # of the GPU's SMs. When NCCL currently occupies those SMs, the DeepEP LL
    # launch blocks until enough of them are freed.
    #
    # Ranks may interleave these launches differently. If rank A enters
    # DeepEP LL in its main thread while rank B is still running NCCL in its
    # async thread, rank A can wait for SMs while rank B waits inside NCCL
    # for rank A to join the collective -- a circular wait, i.e. a deadlock.
    #
    # Bounding NCCL occupancy through NCCL_MAX_CTAS leaves room for the
    # DeepEP cooperative kernel to launch and finish, which breaks the cycle.
    # See: https://github.com/deepseek-ai/DeepEP/issues/496
    if not (multi_dp and eplb_on and deepep_ll and eplb_async):
        return

    # A value the user already supplied (digits only) takes precedence.
    existing = os.environ.get("NCCL_MAX_CTAS")
    if existing and existing.isdigit():
        return

    max_ctas = 8
    os.environ["NCCL_MAX_CTAS"] = str(max_ctas)
    logger.info_once(
        f"EPLB: Setting NCCL_MAX_CTAS={max_ctas} "
        "for expert parallel with EPLB and deepep_low_latency backend",
        scope="global",
    )
|