[BugFix]Fix eplb problems when using dynamic eplb. (#3364)
### What this PR does / why we need it? When using dynamic EPLB, execution would be blocked by NZ-format tensors. We fix these problems by cloning the src tensor and the recv tensor. ### Does this PR introduce any user-facing change? ### How was this patch tested? Qwen3_moe in A3. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
This commit is contained in:
@@ -80,15 +80,15 @@ class VllmEplbAdaptor(EplbAdaptor):
|
||||
self.all_topk_ids = []
|
||||
|
||||
def init_buffer_tensor(self, num_buffer_tensor):
|
||||
for name in self.expert_weight_names:
|
||||
complete_name = "model.layers." + str(
|
||||
self.num_dense_layers) + ".mlp.experts." + name
|
||||
expert_tensor = self.param_dict[complete_name].data[
|
||||
0:num_buffer_tensor]
|
||||
buffer_tensors = torch.empty_like(expert_tensor)
|
||||
for buffer_id in range(num_buffer_tensor):
|
||||
self.buffer_tensor_list[buffer_id].append(
|
||||
buffer_tensors[buffer_id])
|
||||
for buffer_id in range(num_buffer_tensor):
|
||||
for name in self.expert_weight_names:
|
||||
complete_name = "model.layers." + str(
|
||||
self.num_dense_layers) + ".mlp.experts." + name
|
||||
expert_tensor = self.param_dict[complete_name].data[0]
|
||||
if name in ["w13_weight", "w2_weight"]:
|
||||
expert_tensor = expert_tensor.clone()
|
||||
buffer_tensor = torch.empty_like(expert_tensor)
|
||||
self.buffer_tensor_list[buffer_id].append(buffer_tensor)
|
||||
|
||||
def init_expert_param_per_layer(self):
|
||||
num_local_expert = self.param_dict["model.layers." + str(self.num_dense_layers) + \
|
||||
|
||||
@@ -45,7 +45,7 @@ class D2DExpertWeightLoader:
|
||||
layer_id):
|
||||
# When current send/recv and weight.expert_map update tasks are not finished, cannot accept new d2d task
|
||||
if self.state != ExpertWeightUpdateState.WAITING:
|
||||
logger.error(
|
||||
logger.warning_once(
|
||||
"current d2d weight update tasks are on-going, cannot accept new weight update task"
|
||||
)
|
||||
return
|
||||
@@ -64,6 +64,7 @@ class D2DExpertWeightLoader:
|
||||
layer_id][global_expert_id_to_send].item()
|
||||
for src_tensor in self.eplb_adaptor.expert_param_per_layer[
|
||||
layer_id][local_expert_id]:
|
||||
src_tensor = src_tensor.clone()
|
||||
self.comm_op_list.append(
|
||||
dist.P2POp(dist.isend, src_tensor, dst_rank))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user