enginex-bi_150-vllm/vllm/distributed/eplb/eplb_utils.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for EPLB (Expert Parallel Load Balancing)."""

import os

from vllm.config import ParallelConfig
from vllm.logger import init_logger

logger = init_logger(__name__)


def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:
    """
    Apply EPLB-specific environment variable overrides.

    The only override performed here is ``NCCL_MAX_CTAS``. It is set to 8
    when all of the following hold: data parallel size is greater than 1,
    EPLB is enabled, async EPLB is in use, and the all2all backend is
    ``deepep_low_latency``. A pre-existing value made up solely of digits
    is left untouched; an unset, empty, or non-digit value is replaced.

    Args:
        parallel_config: The parallel configuration object.
    """
    # Read every flag up front so that each attribute is accessed on every
    # call, regardless of how the other flags evaluate.
    dp_active = parallel_config.data_parallel_size > 1
    eplb_on = parallel_config.enable_eplb
    eplb_is_async = parallel_config.eplb_config.use_async
    uses_deepep_ll = parallel_config.all2all_backend == "deepep_low_latency"

    # Nothing to override unless the full combination is active.
    if not (dp_active and eplb_on and uses_deepep_ll and eplb_is_async):
        return

    # Why cap NCCL_MAX_CTAS for async EPLB + DeepEP low-latency (LL):
    #
    # Async EPLB exchanges weights through NCCL collectives, while DeepEP LL
    # kernels use a cooperative launch that tries to reserve a large share
    # of the GPU's SMs. When NCCL currently occupies those SMs, the DeepEP LL
    # launch blocks until enough of them are released.
    #
    # Ranks may interleave these launches differently. Suppose rank A reaches
    # DeepEP LL on its main thread while rank B is still inside NCCL on its
    # async thread: rank A stalls waiting for SMs, and rank B stalls inside
    # NCCL waiting for rank A to join the collective. Neither can progress,
    # which is a deadlock.
    #
    # Capping NCCL's occupancy through NCCL_MAX_CTAS keeps room free for the
    # DeepEP cooperative kernel to launch and finish, so the cycle cannot
    # form.
    # See: https://github.com/deepseek-ai/DeepEP/issues/496
    existing = os.environ.get("NCCL_MAX_CTAS")
    if existing is not None and existing.isdigit():
        # An all-digit value is already present; keep it as is.
        return

    override_value = 8
    os.environ["NCCL_MAX_CTAS"] = str(override_value)
    logger.info_once(
        f"EPLB: Setting NCCL_MAX_CTAS={override_value} "
        "for expert parallel with EPLB and deepep_low_latency backend",
        scope="global",
    )
update 2026-04-09 11:23:47 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""Utility functions for EPLB (Expert Parallel Load Balancing)."""`

			`import os`

			`from vllm.config import ParallelConfig`
			`from vllm.logger import init_logger`

			`logger = init_logger(__name__)`


			`def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:`
			`"""`
			`Override environment variables for EPLB when specific conditions are met.`

			`Args:`
			`parallel_config: The parallel configuration object.`
			`"""`
			`is_data_parallel = parallel_config.data_parallel_size > 1`
			`is_eplb_enabled = parallel_config.enable_eplb`
			`async_eplb = parallel_config.eplb_config.use_async`
			`is_deepep_ll = parallel_config.all2all_backend == "deepep_low_latency"`

			`# Override NCCL_MAX_CTAS to avoid hangs when using async EPLB with the`
			`# DeepEP low-latency backend.`
			`#`
			`# The hang happens when two ranks interleave kernel launches differently`
			`# between NCCL collectives (used by async EPLB weight exchange) and DeepEP`
			`# low-latency (LL) kernels. DeepEP LL uses a cooperative launch and tries`
			`# to reserve a large fraction of the GPU's SMs; if those SMs are currently`
			`# occupied by NCCL, the DeepEP LL launch blocks until enough SMs are`
			`# freed.`
			`#`
			`# If rank A enters DeepEP LL in main thread while rank B is still executing`
			`# NCCL in async thread, rank A can block waiting for SMs, while rank B can`
			`# block inside NCCL waiting for rank A to participate in the collective.`
			`# This circular wait causes a deadlock.`
			`# Limiting NCCL occupancy via NCCL_MAX_CTAS leaves space for the DeepEP`
			`# cooperative kernel to launch and complete, breaking the deadlock.`
			`# See: https://github.com/deepseek-ai/DeepEP/issues/496`
			`if is_data_parallel and is_eplb_enabled and is_deepep_ll and async_eplb:`
			`current_value_str = os.getenv("NCCL_MAX_CTAS")`

			`if current_value_str and current_value_str.isdigit():`
			`return`

			`override_value = 8`
			`os.environ["NCCL_MAX_CTAS"] = str(override_value)`
			`logger.info_once(`
			`f"EPLB: Setting NCCL_MAX_CTAS={override_value} "`
			`"for expert parallel with EPLB and deepep_low_latency backend",`
			`scope="global",`
			`)`