Support updating expert locations dynamically (#6388)

2025-05-22 12:59:33 +08:00
parent 121f92c583
commit fc992a09f9
5 changed files with 723 additions and 0 deletions
--- a/python/sglang/srt/managers/expert_location.py
+++ b/python/sglang/srt/managers/expert_location.py
@@ -22,6 +22,7 @@ import torch.distributed
 import torch.nn.functional as F
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.managers import deepseek_eplb
 from sglang.srt.model_loader import get_model_architecture
 from sglang.srt.server_args import ServerArgs
@@ -207,6 +208,26 @@ class ExpertLocationMetadata:
            ),
        )
    # -------------------------------- mutation ------------------------------------
    def update(
        self,
        other: "ExpertLocationMetadata",
    ):
        """Overwrite this object's mapping fields in place with those of ``other``.

        ``ep_size`` must be equal on both objects (checked with ``assert``).
        The four mapping fields are written through ``[...]`` assignment, so
        the objects already stored on ``self`` are kept and only their
        contents change; the attributes themselves are not rebound.
        """
        must_match = ("ep_size",)
        overwritten_in_place = (
            "physical_to_logical_map",
            "logical_to_all_physical_map",
            "logical_to_all_physical_map_num_valid",
            "logical_to_rank_dispatch_physical_map",
        )

        for attr_name in must_match:
            assert getattr(self, attr_name) == getattr(other, attr_name)

        for attr_name in overwritten_in_place:
            target = getattr(self, attr_name)
            # Ellipsis assignment copies into the existing storage.
            target[...] = getattr(other, attr_name)
    # -------------------------------- usage ------------------------------------
    def logical_to_all_physical(
--- a/python/sglang/srt/model_executor/expert_location_updater.py
+++ b/python/sglang/srt/model_executor/expert_location_updater.py
@@ -0,0 +1,420 @@
 # Copyright 2023-2025 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 import logging
 from typing import Dict, List, Tuple
 import torch
 import torch.distributed
 from torch.distributed import P2POp
 from sglang.srt.managers.expert_location import (
    ExpertLocationMetadata,
    get_global_expert_location_metadata,
 )
 logger = logging.getLogger(__name__)
 def update_expert_location(
    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
    new_expert_location_metadata: ExpertLocationMetadata,
    nnodes: int,
    rank: int,
 ):
    """Move expert weights to a new placement, then adopt the new metadata.

    The global expert-location metadata is read first and used as the "old"
    placement while the weights are rearranged; only afterwards is it
    overwritten in place with ``new_expert_location_metadata``.
    """
    current_metadata = get_global_expert_location_metadata()

    # Weights first: current_metadata must still describe the old placement
    # while the transfer plan is computed.
    _update_expert_weights(
        routed_experts_weights_of_layer=routed_experts_weights_of_layer,
        old_expert_location_metadata=current_metadata,
        new_expert_location_metadata=new_expert_location_metadata,
        nnodes=nnodes,
        rank=rank,
    )

    current_metadata.update(new_expert_location_metadata)
 def _update_expert_weights(
    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
    old_expert_location_metadata: ExpertLocationMetadata,
    new_expert_location_metadata: ExpertLocationMetadata,
    nnodes: int,
    rank: int,
 ):
    """Rearrange the expert weights of every layer from the old to the new placement.

    Args:
        routed_experts_weights_of_layer: layer id -> list of weight tensors of
            that layer; the tensors are updated in place.
        old_expert_location_metadata: placement the weights currently follow.
        new_expert_location_metadata: placement the weights should follow.
        nnodes: number of nodes; GPUs per node is ``world_size // nnodes``.
        rank: rank of the calling process.

    Layers are processed in ascending layer-id order. Does nothing when
    ``routed_experts_weights_of_layer`` is empty.
    """
    if not routed_experts_weights_of_layer:
        # Nothing to move. Without this guard the next(iter(...)) below would
        # raise a bare StopIteration for a model without routed-expert layers.
        return

    # One set of staging buffers, shaped after the first layer's tensors, is
    # reused for all layers.
    # NOTE(review): assumes every layer's tensors have the same shapes/dtypes
    # as the first layer's -- confirm.
    temp_buffers = create_temp_buffers(
        next(iter(routed_experts_weights_of_layer.values()))
    )

    world_size = torch.distributed.get_world_size()
    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
    num_gpu_per_node = world_size // nnodes

    # Converted to nested Python lists once, so the per-layer planner works
    # on plain ints.
    old_physical_to_logical_map = (
        old_expert_location_metadata.physical_to_logical_map.tolist()
    )
    new_physical_to_logical_map = (
        new_expert_location_metadata.physical_to_logical_map.tolist()
    )

    for layer_id in sorted(routed_experts_weights_of_layer.keys()):
        update_expert_weights_single_layer(
            routed_experts_weights=routed_experts_weights_of_layer[layer_id],
            temp_buffers=temp_buffers,
            old_physical_to_logical_map=old_physical_to_logical_map[layer_id],
            new_physical_to_logical_map=new_physical_to_logical_map[layer_id],
            num_local_physical_experts=num_local_physical_experts,
            num_gpu_per_node=num_gpu_per_node,
            rank=rank,
        )
 def create_temp_buffers(sample_tensors):
    """Return a list with one ``torch.empty_like`` tensor per tensor in ``sample_tensors``.

    The returned tensors are uninitialized; only their shape, dtype and
    device follow the samples.
    """
    buffers = []
    for sample in sample_tensors:
        buffers.append(torch.empty_like(sample))
    return buffers
 def update_expert_weights_single_layer(
    routed_experts_weights: List[torch.Tensor],
    temp_buffers: List[torch.Tensor],
    old_physical_to_logical_map: List[int],  # (num_physical_experts,)
    new_physical_to_logical_map: List[int],  # (num_physical_experts,)
    num_local_physical_experts: int,
    num_gpu_per_node: int,
    rank: int,
    debug: bool = False,
 ):
    """Rearrange one layer's expert weights on this rank to match a new placement.

    For every physical slot owned by ``rank`` the weights of the logical
    expert assigned by ``new_physical_to_logical_map`` are obtained, in order
    of preference, from: the slot itself (unchanged), another slot on this
    rank, the temp buffer of an earlier local slot that needs the same
    expert, a rank on the same node, or a rank on another node. New data is
    staged in ``temp_buffers`` and copied into ``routed_experts_weights`` only
    after all P2P operations have completed.

    Args:
        routed_experts_weights: tensors whose first dimension is indexed by
            local slot (asserted: ``shape[0] == num_local_physical_experts``).
            Updated in place.
        temp_buffers: staging tensors, indexed like ``routed_experts_weights``
            (see ``create_temp_buffers``).
        old_physical_to_logical_map: logical expert id per global physical
            slot, before the update.
        new_physical_to_logical_map: same, after the update. Logical ids are
            stored in a ``set`` and used as sort keys below, so plain Python
            ints are expected, as annotated.
        num_local_physical_experts: number of physical slots per rank; the
            rank owning global slot ``x`` is ``x // num_local_physical_experts``.
        num_gpu_per_node: the node of rank ``r`` is ``r // num_gpu_per_node``.
        rank: rank of the calling process; also used as the P2P peer id.
        debug: when true, collect a textual trace of the decisions taken.

    Returns:
        The list of trace strings when ``debug`` is true, otherwise ``None``.

    NOTE(review): P2P ops are issued via ``torch.distributed``; presumably
    every rank must call this with identical maps for sends and receives to
    pair up -- confirm against callers.
    """
    assert all(
        tensor.shape[0] == num_local_physical_experts
        for tensor in routed_experts_weights
    ), f"{num_local_physical_experts=} {[x.shape for x in routed_experts_weights]=}"
    output_logs = [] if debug else None
    num_physical_experts = len(old_physical_to_logical_map)
    num_tensors = len(routed_experts_weights)
    self_node_id = rank // num_gpu_per_node
    # Half-open range [start, end) of the global physical slots owned by this rank.
    local_expert_location_range = (
        rank * num_local_physical_experts,
        (rank + 1) * num_local_physical_experts,
    )
    def _entrypoint():
        """Plan receives and sends, run them, then commit staged data to the weights."""
        # List[Tuple[logical_expert_id, List[P2POp]]]
        p2p_op_infos: List[Tuple[int, List[P2POp]]] = []
        # List[Tuple[temp_buffers_expert_location, routed_experts_weights_expert_location]]
        buffer2weight_copy_infos: List[Tuple[int, int]] = []
        # Same-GPU copies into temp_buffers happen eagerly inside _handle_recv;
        # routed_experts_weights itself is only written by the final step, so
        # the isend ops created below still read the old weights.
        _handle_recv(buffer2weight_copy_infos, p2p_op_infos)
        _create_isend_ops(p2p_op_infos)
        _execute_p2p_ops(p2p_op_infos)
        _execute_buffer2weight_copies(buffer2weight_copy_infos)
        if debug:
            output_logs.append(f"{p2p_op_infos=}")
            output_logs.append(f"{buffer2weight_copy_infos=}")
    def _handle_recv(buffer2weight_copy_infos, p2p_op_infos):
        """Decide, for each local slot in ascending order, where its new weights come from."""
        for dst_expert_location in range(*local_expert_location_range):
            _handle_recv_of_dst_expert_location(
                dst_expert_location, buffer2weight_copy_infos, p2p_op_infos
            )
    def _handle_recv_of_dst_expert_location(
        dst_expert_location: int, buffer2weight_copy_infos, p2p_op_infos
    ):
        """Handle one local destination slot; the first matching case wins."""
        logical_expert_id = new_physical_to_logical_map[dst_expert_location]
        # case 1: unchanged -- the slot already holds the required expert
        if old_physical_to_logical_map[dst_expert_location] == logical_expert_id:
            if debug:
                output_logs.append(
                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=unchanged"
                )
            return
        # case 2: same-gpu -- another slot on this rank held the expert in the
        # old placement; stage a copy of it in this slot's temp buffer now
        for src_expert_location in range(*local_expert_location_range):
            if old_physical_to_logical_map[src_expert_location] == logical_expert_id:
                for i in range(num_tensors):
                    _get_tensor(temp_buffers, i, dst_expert_location).copy_(
                        _get_tensor(routed_experts_weights, i, src_expert_location)
                    )
                buffer2weight_copy_infos.append(
                    (dst_expert_location, dst_expert_location)
                )
                if debug:
                    output_logs.append(
                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-gpu {src_expert_location=}"
                    )
                return
        # case 3: free-rider -- an earlier local slot needs the same expert in
        # the new placement; reuse that slot's temp buffer instead of fetching
        # the data again. The scan is ascending, so the lowest such slot is used.
        for src_expert_location in range(
            rank * num_local_physical_experts, dst_expert_location
        ):
            if new_physical_to_logical_map[src_expert_location] == logical_expert_id:
                buffer2weight_copy_infos.append(
                    (src_expert_location, dst_expert_location)
                )
                if debug:
                    output_logs.append(
                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=free-rider {src_expert_location=}"
                    )
                return
        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
            _compute_comm_info(logical_expert_id=logical_expert_id)
        )
        # case 4: same-node -- some rank on this node held the expert; receive from it
        if rank in need_comm_self_node_dst_ranks:
            chosen_src_rank = same_node_mapping.chunk_value_from_element_value(
                element_value=rank
            )
            _create_p2p_recv_and_buffer2weight_copy(
                buffer2weight_copy_infos,
                p2p_op_infos,
                src_rank=chosen_src_rank,
                logical_expert_id=logical_expert_id,
                dst_expert_location=dst_expert_location,
            )
            if debug:
                output_logs.append(
                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-node {chosen_src_rank=}"
                )
            return
        # case 5: cross-node -- no rank on this node held the expert; receive
        # from a rank on another node
        # Future work: can optimize when there are multiple ranks in the same dst node that uses the same logical expert
        chosen_src_rank = cross_node_mapping.chunk_value_from_element_value(
            element_value=rank
        )
        _create_p2p_recv_and_buffer2weight_copy(
            buffer2weight_copy_infos,
            p2p_op_infos,
            src_rank=chosen_src_rank,
            logical_expert_id=logical_expert_id,
            dst_expert_location=dst_expert_location,
        )
        if debug:
            output_logs.append(
                f"handle_recv_of_dst_expert_location {dst_expert_location=} case=cross-node {chosen_src_rank=}"
            )
        return
    def _create_p2p_recv_and_buffer2weight_copy(
        buffer2weight_copy_infos,
        p2p_op_infos,
        *,
        logical_expert_id: int,
        src_rank: int,
        dst_expert_location: int,
    ):
        """Queue one irecv per weight tensor into the slot's temp buffer, plus the later commit copy."""
        p2p_op_infos.append(
            (
                logical_expert_id,
                [
                    P2POp(
                        op=torch.distributed.irecv,
                        tensor=_get_tensor(temp_buffers, i, dst_expert_location),
                        peer=src_rank,
                    )
                    for i in range(num_tensors)
                ],
            )
        )
        buffer2weight_copy_infos.append((dst_expert_location, dst_expert_location))
    def _create_isend_ops(p2p_op_infos):
        """Queue sends for each distinct logical expert this rank held in the old placement."""
        # When several local slots held the same expert, only the first slot is
        # used as the send source.
        handled_logical_expert_ids = set()
        for src_expert_location in range(*local_expert_location_range):
            logical_expert_id = old_physical_to_logical_map[src_expert_location]
            if logical_expert_id in handled_logical_expert_ids:
                continue
            handled_logical_expert_ids.add(logical_expert_id)
            _create_isend_ops_of_logical_expert_id(
                logical_expert_id, src_expert_location, p2p_op_infos
            )
    def _create_isend_ops_of_logical_expert_id(
        logical_expert_id, src_expert_location, p2p_op_infos
    ):
        """Queue isends of one expert to the destination ranks assigned to this rank."""
        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
            _compute_comm_info(logical_expert_id=logical_expert_id)
        )
        # This rank is a source ("chunk value"); look up the destinations
        # ("element values") that fall into its chunk.
        same_node_dst_ranks = same_node_mapping.element_values_from_chunk_value(
            chunk_value=rank
        )
        cross_node_dst_ranks = cross_node_mapping.element_values_from_chunk_value(
            chunk_value=rank
        )
        all_dst_ranks = same_node_dst_ranks + cross_node_dst_ranks
        if debug:
            output_logs.append(
                f"create_isend_ops_of_logical_expert_id {logical_expert_id=} {src_expert_location=} {same_node_dst_ranks=} {cross_node_dst_ranks=}"
            )
        # Sends read from routed_experts_weights, which still holds the old
        # weights until _execute_buffer2weight_copies runs.
        p2p_op_infos.append(
            (
                logical_expert_id,
                [
                    P2POp(
                        op=torch.distributed.isend,
                        tensor=_get_tensor(
                            routed_experts_weights, i, src_expert_location
                        ),
                        peer=dst_rank,
                    )
                    for dst_rank in all_dst_ranks
                    for i in range(num_tensors)
                ],
            )
        )
    def _compute_comm_info(logical_expert_id: int):
        """Compute the source/destination rank assignment for one logical expert.

        Returns ``(same_node_mapping, cross_node_mapping,
        need_comm_self_node_dst_ranks)``. Both mappings are ``_ChunkUtils``
        with source ranks as chunk values and destination ranks as element
        values. "Self node" is the node of the calling rank.
        """
        # Ranks that held the expert in the old placement (ascending).
        all_src_ranks = _deduplicate_ordered(
            [
                x // num_local_physical_experts
                for x in range(num_physical_experts)
                if old_physical_to_logical_map[x] == logical_expert_id
            ]
        )
        all_src_nodes = [x // num_gpu_per_node for x in all_src_ranks]
        self_node_src_ranks = [
            x for x in all_src_ranks if x // num_gpu_per_node == self_node_id
        ]
        # Ranks that need the expert in the new placement and held it on no
        # slot in the old placement.
        need_comm_dst_ranks = _deduplicate_ordered(
            [
                x // num_local_physical_experts
                for x in range(num_physical_experts)
                if new_physical_to_logical_map[x] == logical_expert_id
                and x // num_local_physical_experts not in all_src_ranks
            ]
        )
        # Destinations on this node, served locally only if this node has a source.
        need_comm_self_node_dst_ranks = (
            [x for x in need_comm_dst_ranks if x // num_gpu_per_node == self_node_id]
            if len(self_node_src_ranks) > 0
            else []
        )
        # Destinations whose node has no source rank at all.
        need_comm_cross_node_dst_ranks = [
            x
            for x in need_comm_dst_ranks
            if (x // num_gpu_per_node) not in all_src_nodes
        ]
        same_node_mapping = _ChunkUtils(
            chunk_values=self_node_src_ranks,
            element_values=need_comm_self_node_dst_ranks,
        )
        cross_node_mapping = _ChunkUtils(
            chunk_values=all_src_ranks,
            element_values=need_comm_cross_node_dst_ranks,
        )
        return same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks
    def _execute_p2p_ops(p2p_op_infos):
        """Issue all queued sends/receives as one batch and wait for every request."""
        # Ops are ordered by logical expert id before being flattened.
        # NOTE(review): presumably so that peers post matching ops in a
        # consistent order -- confirm.
        sorted_infos = sorted(p2p_op_infos, key=lambda info: info[0])
        p2p_ops = [op for _, ops in sorted_infos for op in ops]
        if len(p2p_ops) == 0:
            return
        reqs = torch.distributed.batch_isend_irecv(p2p_ops)
        for req in reqs:
            req.wait()
    def _execute_buffer2weight_copies(buffer2weight_copy_infos):
        """Copy staged data from temp buffers into the weight tensors."""
        for (
            temp_buffers_expert_location,
            routed_experts_weights_expert_location,
        ) in buffer2weight_copy_infos:
            for i in range(num_tensors):
                _get_tensor(
                    routed_experts_weights, i, routed_experts_weights_expert_location
                ).copy_(_get_tensor(temp_buffers, i, temp_buffers_expert_location))
    def _get_tensor(tensors, tensor_index: int, expert_location: int) -> torch.Tensor:
        """Index ``tensors[tensor_index]`` by the local slot of a global expert location."""
        return tensors[tensor_index][_get_local_expert_location(expert_location)]
    def _get_local_expert_location(expert_location: int) -> int:
        """Convert a global slot index owned by this rank into a local one."""
        assert (
            local_expert_location_range[0]
            <= expert_location
            < local_expert_location_range[1]
        )
        return expert_location % num_local_physical_experts
    _entrypoint()
    return output_logs
 class _ChunkUtils:
    """Assigns ``element_values`` to ``chunk_values`` in contiguous, near-equal runs.

    The elements are split, in order, into ``len(chunk_values)`` consecutive
    chunks. With ``base, extra = divmod(num_elements, num_chunks)`` the first
    ``extra`` chunks hold ``base + 1`` elements and the remaining ones hold
    ``base``. Chunk ``i`` is identified by ``chunk_values[i]``.
    """

    def __init__(self, *, chunk_values: List, element_values: List):
        self.chunk_values = chunk_values
        self.element_values = element_values

    def chunk_value_from_element_value(self, element_value):
        """Return the chunk value whose chunk contains ``element_value``."""
        position = self.element_values.index(element_value)
        owner = self._chunk_index_from_element_index(
            num_elements=len(self.element_values),
            num_chunks=len(self.chunk_values),
            element_index=position,
        )
        return self.chunk_values[owner]

    def element_values_from_chunk_value(self, chunk_value) -> List:
        """Return the elements in the chunk identified by ``chunk_value``.

        Returns ``[]`` without looking at ``chunk_value`` when there are no
        elements at all.
        """
        total = len(self.element_values)
        if total == 0:
            return []
        picked = self._element_slice_from_chunk_index(
            num_elements=total,
            num_chunks=len(self.chunk_values),
            chunk_index=self.chunk_values.index(chunk_value),
        )
        return self.element_values[picked]

    @staticmethod
    def _chunk_index_from_element_index(
        num_elements: int, num_chunks: int, element_index: int
    ) -> int:
        """Map an element position to the index of the chunk holding it."""
        base, extra = divmod(num_elements, num_chunks)
        # Elements [0, boundary) live in the `extra` longer chunks.
        boundary = extra * (base + 1)
        if element_index >= boundary:
            return extra + (element_index - boundary) // base
        return element_index // (base + 1)

    @staticmethod
    def _element_slice_from_chunk_index(
        num_elements: int, num_chunks: int, chunk_index: int
    ) -> slice:
        """Return the slice of element positions that belong to chunk ``chunk_index``."""
        base, extra = divmod(num_elements, num_chunks)
        begin = chunk_index * base + min(chunk_index, extra)
        length = base + 1 if chunk_index < extra else base
        return slice(begin, begin + length)
 def _deduplicate_ordered(arr: List[int]):
    """Collapse runs of equal adjacent items, keeping the first item of each run.

    Only neighbouring duplicates are removed; a value that reappears after a
    different value is kept again.
    """
    collapsed = []
    for value in arr:
        if collapsed and value == collapsed[-1]:
            continue
        collapsed.append(value)
    return collapsed
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -57,6 +57,7 @@ from sglang.srt.managers.expert_distribution import (
    set_global_expert_distribution_recorder,
 )
 from sglang.srt.managers.expert_location import (
    ExpertLocationMetadata,
    compute_initial_expert_location_metadata,
    get_global_expert_location_metadata,
    set_global_expert_location_metadata,
@@ -70,6 +71,7 @@ from sglang.srt.mem_cache.memory_pool import (
    TokenToKVPoolAllocator,
 )
 from sglang.srt.mem_cache.paged_allocator import PagedTokenToKVPoolAllocator
 from sglang.srt.model_executor import expert_location_updater
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader import get_model
@@ -575,6 +577,16 @@ class ModelRunner:
                f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
            ) from None
    def update_expert_location(
        self, new_expert_location_metadata: ExpertLocationMetadata
    ):
        """Apply ``new_expert_location_metadata`` to this runner's model.

        Delegates to ``expert_location_updater.update_expert_location`` with
        the model's per-layer routed-expert weights, the configured number of
        nodes and this runner's TP rank.
        """
        layer_weights = self.model.routed_experts_weights_of_layer
        expert_location_updater.update_expert_location(
            layer_weights,
            new_expert_location_metadata,
            nnodes=self.server_args.nnodes,
            rank=self.tp_rank,
        )
    def update_weights_from_disk(
        self, model_path: str, load_format: str
    ) -> tuple[bool, str]:
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -317,6 +317,13 @@ class DeepseekV2MoE(nn.Module):
    def _enable_deepep_moe(self):
        """Return the ``enable_deepep_moe`` entry of ``global_server_args_dict``."""
        return global_server_args_dict["enable_deepep_moe"]
    def get_moe_weights(self):
        """Return ``.data`` of every parameter of ``self.experts`` except ``correction_bias``.

        The order follows ``self.experts.named_parameters()``.
        """
        excluded_names = ("correction_bias",)
        weights = []
        for param_name, param in self.experts.named_parameters():
            if param_name in excluded_names:
                continue
            weights.append(param.data)
        return weights
    def op_gate(self, state):
        if (not self._enable_deepep_moe) or is_non_idle_and_non_empty(
            state.forward_batch.forward_mode, state.hidden_states_mlp_input
@@ -1599,6 +1606,14 @@ class DeepseekV2ForCausalLM(nn.Module):
                self_attn.w_vc = w_vc.contiguous()
                self_attn.use_deep_gemm_bmm = True
        # TODO support nextn later
        if not is_nextn:
            self.routed_experts_weights_of_layer = {
                layer_id: layer.mlp.get_moe_weights()
                for layer_id, layer in enumerate(self.model.layers)
                if isinstance(layer.mlp, DeepseekV2MoE)
            }
    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
        if is_nextn:
            if hasattr(self.config, "num_nextn_predict_layers"):
--- a/test/srt/test_expert_location_updater.py
+++ b/test/srt/test_expert_location_updater.py
@@ -0,0 +1,255 @@
 import os
 import traceback
 import unittest
 from dataclasses import dataclass
 from typing import List
 import torch
 import torch.distributed
 import torch.multiprocessing as mp
 from torch.multiprocessing import Process
 from sglang.srt.model_executor import expert_location_updater
 from sglang.test.test_utils import CustomTestCase, find_available_port
 from sglang.utils import is_in_ci
@dataclass
 class _TestInfo:
    """Parameters of one randomized expert-relocation scenario."""
    # Number of simulated nodes; GPUs per node is num_gpus // nnodes.
    nnodes: int
    # Logical expert ids are drawn from [0, num_logical_experts).
    num_logical_experts: int
    # Total number of physical slots across all ranks.
    num_physical_experts: int
    # Number of consecutive random re-placements to execute and verify.
    num_repeat: int = 5000
 class TestExpertLocationUpdater(CustomTestCase):
    """Multi-process tests for ``update_expert_weights_single_layer``.

    Every test spawns one subprocess per simulated rank. Each subprocess
    reports a single boolean through a pipe; the test passes only when all of
    them report success.
    """

    # Seconds that still-running ranks get to finish (and print their
    # diagnostics) after a failure has been observed, before being terminated.
    _FAILURE_GRACE_SECONDS = 30.0

    @classmethod
    def setUpClass(cls):
        # Force the "spawn" start method for all subprocesses created below.
        mp.set_start_method("spawn", force=True)

    def test_cpu(self):
        self._test_common(device="cpu")
        self._test_core(
            num_gpus=32,
            device="cpu",
            infos=[
                _TestInfo(
                    nnodes=4,
                    num_logical_experts=256,
                    num_physical_experts=288,
                    num_repeat=10000,
                )
            ],
        )

    def test_cpu_slow(self):
        # Not executed in CI.
        if is_in_ci():
            return
        self._test_core(
            num_gpus=144,
            device="cpu",
            infos=[
                _TestInfo(
                    nnodes=18,
                    num_logical_experts=256,
                    num_physical_experts=288,
                    num_repeat=10000,
                )
            ],
        )

    def test_gpu(self):
        # Not executed in CI.
        if is_in_ci():
            return
        self._test_common(device="cuda")

    def _test_common(self, device):
        """Run a grid of small scenarios on 8 ranks, skipping impossible combinations."""
        infos = []
        for nnodes in [1, 2, 4]:
            for num_logical_experts in [2, 5, 20, 256]:
                for num_physical_experts in [8, 16, 256, 288]:
                    # Every logical expert needs at least one physical slot.
                    if num_logical_experts > num_physical_experts:
                        continue
                    infos.append(
                        _TestInfo(
                            nnodes=nnodes,
                            num_logical_experts=num_logical_experts,
                            num_physical_experts=num_physical_experts,
                        )
                    )
        self._test_core(num_gpus=8, device=device, infos=infos)

    def _test_core(
        self,
        num_gpus: int,
        device: str,
        infos: List[_TestInfo],
    ):
        """Spawn ``num_gpus`` rank subprocesses, run ``infos`` in each, assert all succeed.

        Subprocesses are always reaped. If an assertion fails (or anything
        else raises) while some ranks are still running, e.g. blocked in a
        collective whose peer has already failed, they get a bounded grace
        period and are then terminated, so a failure cannot leave stray
        processes behind.
        """
        import time  # local: only needed for the failure-path deadline

        master_port = find_available_port(23456)
        output_reader, output_writer = mp.Pipe(duplex=False)
        processes = []
        succeeded = False
        try:
            for rank in range(num_gpus):
                p = Process(
                    target=_run_subprocess,
                    kwargs=dict(
                        rank=rank,
                        num_gpus=num_gpus,
                        output_writer=output_writer,
                        master_port=master_port,
                        device=device,
                        infos=infos,
                    ),
                )
                p.start()
                processes.append(p)

            # One boolean per rank, in arrival order.
            for _ in range(num_gpus):
                self.assertTrue(
                    output_reader.recv(), "Subprocess has error, please see logs above."
                )
            succeeded = True
        finally:
            if not succeeded:
                # Shared deadline: the grace period is not multiplied by the
                # number of ranks.
                deadline = time.monotonic() + self._FAILURE_GRACE_SECONDS
                for p in processes:
                    p.join(timeout=max(0.0, deadline - time.monotonic()))
                    if p.is_alive():
                        p.terminate()
            for p in processes:
                p.join()
 def _run_subprocess(
    rank: int,
    num_gpus: int,
    master_port: int,
    device: str,
    infos: List[_TestInfo],
    output_writer,
 ):
    """Entry point of one rank subprocess: run all scenarios and report the outcome.

    Args:
        rank: rank of this subprocess within the process group.
        num_gpus: world size of the process group.
        master_port: rendezvous port on localhost.
        device: ``"cpu"`` (gloo backend) or ``"cuda"`` (default backend).
        infos: scenarios to execute in order.
        output_writer: pipe end that receives exactly one boolean, ``True``
            only if every scenario completed without raising.

    The result is sent from a ``finally`` clause, so the parent, which blocks
    on one ``recv()`` per rank, also gets an answer when a non-``Exception``
    (e.g. ``SystemExit`` or ``KeyboardInterrupt``) unwinds this function.
    Such exceptions still propagate after the result has been sent.
    """
    execution_ok = False
    try:
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = str(master_port)
        # Same seed on every rank.
        torch.random.manual_seed(42)
        torch.distributed.init_process_group(
            rank=rank,
            world_size=num_gpus,
            backend={"cpu": "gloo", "cuda": None}[device],
        )
        if device == "cuda":
            torch.cuda.set_device(f"cuda:{rank}")
        for info in infos:
            _execute_test(info, rank=rank, num_gpus=num_gpus, device=device)
        execution_ok = True
    except Exception as e:
        # Process boundary: report the failure instead of dying silently.
        print(f"subprocess[{rank=}] has error: {e}", flush=True)
        traceback.print_exc()
    finally:
        output_writer.send(execution_ok)
        output_writer.close()
 def _execute_test(info: _TestInfo, rank: int, num_gpus: int, device: str):
    """Run ``info.num_repeat`` random re-placements on this rank and verify the weights.

    Each expert's "weights" are synthetic values derived from its logical id,
    so after every update the local tensors can be compared with the values
    expected for the new placement. When any rank detects a mismatch, all
    ranks send their diagnostics to rank 0, which prints them, and every rank
    raises ``AssertionError``.
    """
    if rank == 0:
        print(f"Test: {num_gpus=} {info=}", flush=True)
    assert info.num_physical_experts % num_gpus == 0
    num_local_physical_experts = info.num_physical_experts // num_gpus
    assert num_gpus % info.nnodes == 0
    num_gpu_per_node = num_gpus // info.nnodes
    def _create_routed_experts_weights(physical_to_logical_map):
        """Build this rank's synthetic weight tensors for the given placement."""
        local_logical_expert_ids = physical_to_logical_map[
            rank * num_local_physical_experts : (rank + 1) * num_local_physical_experts
        ].cpu()
        return [
            # Tensor 0: the logical id itself, one entry per local slot.
            local_logical_expert_ids.to(device).clone(),
            # Tensor 1: two values per local slot derived from the logical id.
            torch.tensor(
                [
                    [local_logical_expert_id * 10, local_logical_expert_id * 100]
                    for local_logical_expert_id in local_logical_expert_ids.tolist()
                ],
                device=device,
            ),
        ]
    def _create_physical_to_logical_map():
        """Create a random placement on rank 0 and broadcast it to all ranks."""
        if rank == 0:
            # Every logical expert appears at least once; the remaining slots
            # are filled with random logical ids, then everything is shuffled.
            ans = torch.concat(
                [
                    torch.arange(0, info.num_logical_experts),
                    torch.randint(
                        0,
                        info.num_logical_experts,
                        (info.num_physical_experts - info.num_logical_experts,),
                    ),
                ]
            )
            ans = ans[torch.randperm(ans.shape[0])]
        else:
            ans = torch.empty((info.num_physical_experts,), dtype=torch.int64)
        assert ans.dtype == torch.int64 and ans.shape == (info.num_physical_experts,)
        ans = ans.to(device)
        torch.distributed.broadcast(ans, src=0)
        return ans.cpu()
    physical_to_logical_map = _create_physical_to_logical_map()
    routed_experts_weights = _create_routed_experts_weights(physical_to_logical_map)
    for i in range(info.num_repeat):
        if rank == 0 and ((i % 500 == 0) or (i == info.num_repeat - 1)):
            print(f"Step {i}/{info.num_repeat}", flush=True)
        new_physical_to_logical_map = _create_physical_to_logical_map()
        expect_new_weights = _create_routed_experts_weights(new_physical_to_logical_map)
        output_logs = expert_location_updater.update_expert_weights_single_layer(
            routed_experts_weights=routed_experts_weights,
            temp_buffers=expert_location_updater.create_temp_buffers(
                routed_experts_weights
            ),
            # Pass plain int lists, as the updater's signature declares. The
            # updater keeps logical ids in a set; tensor elements hash by
            # object identity, so with tensors an already-handled id would
            # never be recognised and sends would be queued more than once.
            old_physical_to_logical_map=physical_to_logical_map.tolist(),
            new_physical_to_logical_map=new_physical_to_logical_map.tolist(),
            num_local_physical_experts=num_local_physical_experts,
            num_gpu_per_node=num_gpu_per_node,
            rank=rank,
            debug=True,
        )
        local_has_error = not all(
            torch.all(x == y)
            for x, y in zip(routed_experts_weights, expect_new_weights, strict=True)
        )
        # MAX-reduce the flag so that every rank takes the same branch below;
        # the gather_object call inside it is a collective.
        global_has_error = torch.tensor(local_has_error, device=device)
        torch.distributed.all_reduce(
            global_has_error, op=torch.distributed.ReduceOp.MAX
        )
        if global_has_error.cpu().item():
            output_logs_str = "\n".join(output_logs)
            local_message = (
                f"===================== rank {rank} ============================\n"
                f"{num_gpus=} {info=}\n"
                f"{routed_experts_weights[0].tolist()=}\n"
                f"{expect_new_weights[0].tolist()=}\n"
                f"{physical_to_logical_map.tolist()=}\n"
                f"{new_physical_to_logical_map.tolist()=}\n"
                f"===logs===\n"
                f"{output_logs_str}\n"
                f"==============================================================\n"
            )
            global_messages = ([None] * num_gpus) if rank == 0 else None
            torch.distributed.gather_object(local_message, global_messages, dst=0)
            if rank == 0:
                print("\n\n".join(global_messages), flush=True)
            raise AssertionError("Error happens, see logs above")
        physical_to_logical_map = new_physical_to_logical_map
 # Run the tests in this file when it is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()