[releases/v0.18.0][bugfix][eplb] remove unnecessary weight_scale wrap behaviour (#7732)
### What this PR does / why we need it?
This PR simplifies the `apply` method in `w8a8_dynamic.py` by removing the conditional logic that passed `[layer.fused_w1_scale]` and `[layer.fused_w2_scale]` in place of `w1_scale` and `w2_scale` whenever `fused_scale_flag` was set. This redundant wrapping behavior breaks EPLB in int8 quantization scenarios. Cherry-picked from #7188; note that only the bugfix lines of that PR are picked.
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -267,8 +267,8 @@ class AscendW8A8DynamicFusedMoEMethod(AscendMoEScheme):
|
|||||||
log2phy=log2phy,
|
log2phy=log2phy,
|
||||||
pertoken_scale=pertoken_scale,
|
pertoken_scale=pertoken_scale,
|
||||||
activation=activation,
|
activation=activation,
|
||||||
w1_scale=[layer.fused_w1_scale] if fused_scale_flag else w1_scale,
|
w1_scale=w1_scale,
|
||||||
w2_scale=[layer.fused_w2_scale] if fused_scale_flag else w2_scale,
|
w2_scale=w2_scale,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if zero_expert_num > 0 and zero_expert_type is not None:
|
if zero_expert_num > 0 and zero_expert_type is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user