[Feat] 310p support MoE W8A8 quantization (#6641)
### What this PR does / why we need it?
This PR introduces support for W8A8 dynamic quantization for
Mixture-of-Experts (MoE) models on Ascend 310P devices. This is achieved
by:
- Implementing a new quantization scheme
`AscendW8A8DynamicFusedMoEMethod310`.
- Adding a unified MLP implementation (`unified_apply_mlp`) for 310P
that handles both quantized and unquantized paths.
- Refactoring the MoE and quantization configuration logic to correctly
route to the new 310P-specific implementations.
- Adding new e2e and unit tests to verify the functionality of MoE W8A8
quantization.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- Added a new e2e test `test_qwen3_moe_tp2_w8a8` to test MoE W8A8
quantization in a multi-card setup.
- Added several new unit tests for the 310P-specific MoE components,
including `experts_selector`, `fused_moe`, `moe_comm_method`, `moe_mlp`,
and the new `w8a8_dynamic` quantization method.
- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd
---------
Signed-off-by: pu-zhe <zpuaa@outlook.com>
This commit is contained in:
@@ -58,7 +58,6 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
|
||||
num_expert_group: int | None = None,
|
||||
custom_routing_function: Callable | None = None,
|
||||
scoring_func: str = "softmax",
|
||||
routed_scaling_factor: float = 1.0,
|
||||
e_score_correction_bias: torch.Tensor | None = None,
|
||||
global_num_experts: int = -1,
|
||||
expert_map: torch.Tensor | None = None,
|
||||
@@ -67,7 +66,6 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
|
||||
) -> torch.Tensor:
|
||||
zero_expert_num = getattr(layer, "zero_expert_num", 0)
|
||||
zero_expert_type = getattr(layer, "zero_expert_type", None)
|
||||
assert routed_scaling_factor == 1.0
|
||||
|
||||
topk_weights, topk_ids = select_experts(
|
||||
hidden_states=x,
|
||||
@@ -195,44 +193,36 @@ class AscendFusedMoE310(FusedMoE):
|
||||
|
||||
method = quant_method.quant_method
|
||||
quant_type = getattr(method, "quant_type", QuantType.NONE)
|
||||
if quant_type != QuantType.NONE:
|
||||
# TODO: w8a8 quantization will be supported soon, and only reject w4a8 here.
|
||||
raise RuntimeError("W8A8 is not supported currently.")
|
||||
return QuantType.NONE
|
||||
if quant_type not in [QuantType.NONE, QuantType.W8A8]:
|
||||
raise RuntimeError("Only Unquant and W8A8 is supported.")
|
||||
return quant_type
|
||||
|
||||
def forward_impl( # type: ignore[override]
|
||||
self, hidden_states: torch.Tensor, router_logits: torch.Tensor
|
||||
) -> torch.Tensor:
|
||||
assert self.quant_method is not None
|
||||
assert self.routed_scaling_factor == 1.0, "routed_scaling_factor != 1.0 is not supported."
|
||||
forward_context = get_forward_context()
|
||||
|
||||
hidden_states, router_logits, _, context_metadata = forward_context.moe_comm_method.prepare(
|
||||
hidden_states=hidden_states, router_logits=router_logits, quant_type=self.quant_type
|
||||
)
|
||||
|
||||
if isinstance(hidden_states, tuple):
|
||||
hidden_states, pertoken_scale = hidden_states
|
||||
else:
|
||||
pertoken_scale = None
|
||||
|
||||
# Matrix multiply.
|
||||
fused_experts_results: FusedExpertsResult = self.quant_method.apply(
|
||||
layer=self,
|
||||
x=hidden_states,
|
||||
router_logits=router_logits,
|
||||
pertoken_scale=pertoken_scale,
|
||||
top_k=self.top_k,
|
||||
renormalize=self.renormalize,
|
||||
use_grouped_topk=self.use_grouped_topk,
|
||||
global_num_experts=self.global_num_experts,
|
||||
expert_map=self.local_expert_map,
|
||||
top_k=self.top_k,
|
||||
router_logits=router_logits,
|
||||
renormalize=self.renormalize,
|
||||
topk_group=self.topk_group,
|
||||
num_expert_group=self.num_expert_group,
|
||||
custom_routing_function=self.custom_routing_function,
|
||||
scoring_func=self.scoring_func,
|
||||
routed_scaling_factor=self.routed_scaling_factor,
|
||||
e_score_correction_bias=self.e_score_correction_bias,
|
||||
activation=self.activation,
|
||||
global_num_experts=self.global_num_experts,
|
||||
expert_map=self.local_expert_map,
|
||||
apply_router_weight_on_input=self.apply_router_weight_on_input,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user