shared_experts+router_experts merge all_reduce(Improve TTOP 5ms) (#1395)
### What this PR does / why we need it?
When all_reduce_merge is in progress, shared_experts does not do
all_reduce in mlp, but waits until shared_experts+router_experts are
completed before doing all_reduce
In prefill and decode, as long as shared_experts+router_experts are
all_reduce, there will be benefits.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
bash examples/run_dp_attention_etp16.sh
bash examples/run_dp_attention_etp16_benmark.sh
- vLLM version: v0.9.1
- vLLM main:
977180c912
---------
Signed-off-by: ttanzhiqiang <389825161@qq.com>
This commit is contained in:
@@ -303,7 +303,6 @@ class CustomDeepseekV2MoE(nn.Module):
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
self.routed_scaling_factor = config.routed_scaling_factor
|
||||
self.n_shared_experts = config.n_shared_experts
|
||||
self.routed_scaling_factor = config.routed_scaling_factor
|
||||
if self.tp_size > config.n_routed_experts:
|
||||
raise ValueError(
|
||||
f"Tensor parallel size {self.tp_size} is greater than "
|
||||
@@ -345,6 +344,8 @@ class CustomDeepseekV2MoE(nn.Module):
|
||||
e_score_correction_bias=self.gate.e_score_correction_bias)
|
||||
|
||||
if config.n_shared_experts is not None:
|
||||
self.all_reduce_merge = self.experts.all_reduce_merge
|
||||
reduce_results = not self.all_reduce_merge
|
||||
intermediate_size = (config.moe_intermediate_size *
|
||||
config.n_shared_experts)
|
||||
self.shared_experts = CustomDeepseekV2MLP(
|
||||
@@ -352,7 +353,7 @@ class CustomDeepseekV2MoE(nn.Module):
|
||||
intermediate_size=intermediate_size,
|
||||
hidden_act=config.hidden_act,
|
||||
quant_config=quant_config,
|
||||
reduce_results=True,
|
||||
reduce_results=reduce_results,
|
||||
force_replicate=self.enable_multistream_moe,
|
||||
prefix=f"{prefix}.shared_experts",
|
||||
)
|
||||
@@ -403,6 +404,9 @@ class CustomDeepseekV2MoE(nn.Module):
|
||||
hidden_states = (
|
||||
experts_hidden_states[0] * self.routed_scaling_factor +
|
||||
experts_hidden_states[1])
|
||||
if self.all_reduce_merge:
|
||||
# When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
|
||||
hidden_states = tensor_model_parallel_all_reduce(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
Reference in New Issue
Block a user