[BugFix] Fix eplb problems when using dynamic eplb. (#3364)

### What this PR does / why we need it? When using dynamic eplb, it gets blocked by nz tensors. We fix these problems by cloning the src tensor and the recv tensor. ### Does this PR introduce any user-facing change? ### How was this patch tested? Qwen3_moe in A3. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-11 14:04:02 +08:00
parent ca05f7d632
commit 82b6c846ca
8 changed files with 58 additions and 34 deletions
--- a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py
+++ b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py
@@ -45,7 +45,7 @@ class D2DExpertWeightLoader:
                                          layer_id):
        # When current send/recv and weight.expert_map update tasks are not finished, cannot accept new d2d task
        if self.state != ExpertWeightUpdateState.WAITING:
-            logger.error(
+            logger.warning_once(
                "current d2d weight update tasks are on-going, cannot accept new weight update task"
            )
            return
@@ -64,6 +64,7 @@ class D2DExpertWeightLoader:
                layer_id][global_expert_id_to_send].item()
            for src_tensor in self.eplb_adaptor.expert_param_per_layer[
                    layer_id][local_expert_id]:
+                src_tensor = src_tensor.clone()
                self.comm_op_list.append(
                    dist.P2POp(dist.isend, src_tensor, dst_rank))