From 2cd036ee8ec737cefa8d30c0acab56e4f18ea189 Mon Sep 17 00:00:00 2001 From: linfeng-yuan <1102311262@qq.com> Date: Tue, 6 May 2025 22:09:56 +0800 Subject: [PATCH] [Bugfix] fix accuracy problem for quantized deepseek models (#768) ### What this PR does / why we need it? The root cause of the bug is that numerical computations involving NaNs cannot eliminate them. We addressed it by using `masked_fill_` to eliminate NaNs while avoiding the memory-wasting `torch.where` approach. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? This patch was tested with vllm v0.8.5 and vllm-ascend master. I ran the deepseek_v3 model with offline inference scripts (examples/dp_offline/run_dp.sh & data_parallel.py). Signed-off-by: linfeng-yuan <1102311262@qq.com> --- vllm_ascend/quantization/w8a8_dynamic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index bcd313d..4fbfadc 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -285,7 +285,8 @@ def fused_experts(hidden_states: torch.Tensor, valid_token_mask = torch.arange( 0, sorted_token_indices.shape[0], device=device).unsqueeze(1) < num_valid_tokens - down_out_list.mul_(valid_token_mask) + down_out_list = down_out_list.masked_fill_(~valid_token_mask, 0).to(dtype) final_hidden_states.index_add_(0, sorted_token_indices, down_out_list) else: # TODO: Reorder device memory 2 times here, replace the current