From c55d99d13eb5705836e51c1e19f463c17b5addaa Mon Sep 17 00:00:00 2001 From: linfeng-yuan <1102311262@qq.com> Date: Tue, 14 Oct 2025 21:11:05 +0800 Subject: [PATCH] [bugfix][torchair] fix missing weight nz cast for w13_weight in torchair_w8a8_dynamic.py (#3446) ### What this PR does / why we need it? Fix the issue of missing NZ conversion for quantized weights in GMM after moe_dispatch operator in torchair scenario, which does not involve aclgraph & single scenarios. ### How was this patch tested? vllm serving passed with lower latency (~5ms TPOT with bs_per_rank=28 & ep_size=32) - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: linfeng-yuan <1102311262@qq.com> --- vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py index 6517127..b933db6 100644 --- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py +++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py @@ -1052,6 +1052,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod: layer.w2_weight.data = layer.w2_weight.data.transpose( 1, 2).contiguous() if is_enable_nz(): + torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ) torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ) layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( layer.w13_weight_scale.data.shape[0], -1)