[Feature] Support fine-grained shared expert overlap (#5482)

Add fine-grained control over shared expert overlap to prevent resource
contention.

- vLLM version: v0.13.0
- vLLM main: 5326c89803

---------

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
This commit is contained in:
Jade Zheng
2026-01-17 11:53:22 +08:00
committed by GitHub
parent 48e10de8c9
commit 22f253142a
9 changed files with 203 additions and 130 deletions

View File

@@ -204,9 +204,6 @@ class AscendW4A16FusedMoEMethod:
enable_force_load_balance: bool = True,
log2phy: torch.Tensor = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
**kwargs,
) -> torch.Tensor:
assert router_logits.shape[
@@ -229,24 +226,21 @@ class AscendW4A16FusedMoEMethod:
topk_weights = topk_weights.to(x.dtype)
moe_comm_method = get_forward_context().moe_comm_method
return moe_comm_method.fused_experts(
hidden_states=x,
w1=layer.w13_weight_packed,
w2=layer.w2_weight_packed,
w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale,
w1_offset=layer.w13_weight_offset,
w2_offset=layer.w2_weight_offset,
topk_weights=topk_weights,
topk_ids=topk_ids,
use_int4_w4a16=True,
expert_map=expert_map,
log2phy=log2phy,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,
dynamic_eplb=self.dynamic_eplb,
mc2_mask=kwargs.get("mc2_mask", None))
return moe_comm_method.fused_experts(hidden_states=x,
w1=layer.w13_weight_packed,
w2=layer.w2_weight_packed,
w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale,
w1_offset=layer.w13_weight_offset,
w2_offset=layer.w2_weight_offset,
topk_weights=topk_weights,
topk_ids=topk_ids,
use_int4_w4a16=True,
expert_map=expert_map,
log2phy=log2phy,
dynamic_eplb=self.dynamic_eplb,
mc2_mask=kwargs.get(
"mc2_mask", None))
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
if self.transpose_weight:

View File

@@ -341,9 +341,6 @@ class AscendW4A8DynamicFusedMoEMethod:
enable_force_load_balance: bool = False,
log2phy: torch.Tensor = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
**kwargs,
) -> torch.Tensor:
assert router_logits.shape[
@@ -390,9 +387,6 @@ class AscendW4A8DynamicFusedMoEMethod:
use_int4_w4a8=True,
expert_map=expert_map,
log2phy=log2phy,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,
dynamic_eplb=self.dynamic_eplb,
mc2_mask=kwargs.get("mc2_mask", None))

View File

@@ -190,9 +190,6 @@ class AscendW8A8DynamicFusedMoEMethod:
enable_force_load_balance: bool = False,
log2phy: torch.Tensor = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
pertoken_scale: Optional[Any] = None,
**kwargs,
) -> torch.Tensor:
@@ -280,9 +277,6 @@ class AscendW8A8DynamicFusedMoEMethod:
use_int8_w8a8=True,
expert_map=expert_map,
log2phy=log2phy,
shared_experts=shared_experts,
quantized_x_for_share=quantized_x_for_share,
dynamic_scale_for_share=dynamic_scale_for_share,
dynamic_eplb=self.dynamic_eplb,
mc2_mask=kwargs.get("mc2_mask", None))
if zero_expert_num > 0 and zero_expert_type is not None: