[Bugfix] Fix deepseek percision issue and add acc ci for it (#905)
### What this PR does / why we need it? Fix deepseek percision issue on V0 and add acc ci for it Fixes https://github.com/vllm-project/vllm-ascend/issues/1062 ### How was this patch tested? CI passed with new added test. Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -629,6 +629,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||
is_prefill: bool = False,
|
||||
enable_force_load_balance: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
# NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
|
||||
@@ -660,6 +661,13 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
)
|
||||
|
||||
topk_weights = topk_weights.to(x.dtype)
|
||||
# this is a naive implementation for experts load balance so as
|
||||
# to avoid accumulating too much tokens on a single rank.
|
||||
# currently it is only activated when doing profile runs.
|
||||
if enable_force_load_balance:
|
||||
topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
|
||||
|
||||
if VLLM_ENABLE_MC2 and not is_prefill:
|
||||
return fused_experts_with_mc2(
|
||||
hidden_states=x,
|
||||
|
||||
Reference in New Issue
Block a user