From c55d99d13eb5705836e51c1e19f463c17b5addaa Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Tue, 14 Oct 2025 21:11:05 +0800
Subject: [PATCH] [bugfix][torchair] fix missing weight nz cast for w13_weight
 in torchair_w8a8_dynamic.py (#3446)

### What this PR does / why we need it?
Fix the missing NZ format conversion of the quantized `w13_weight` used
by GMM after the moe_dispatch operator in the torchair scenario. The
aclgraph & single scenarios are not involved.

### How was this patch tested?
vLLM serving passed with lower latency (~5 ms TPOT with bs_per_rank=28
and ep_size=32).

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
index 6517127..b933db6 100644
--- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
@@ -1052,6 +1052,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
             layer.w2_weight.data = layer.w2_weight.data.transpose(
                 1, 2).contiguous()
         if is_enable_nz():
+            torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
             torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)