From 09682e075118aaacb0a717f2b7078bad040599a9 Mon Sep 17 00:00:00 2001
From: ice_rain <72488516+icerain-alt@users.noreply.github.com>
Date: Fri, 9 Jan 2026 16:05:32 +0800
Subject: [PATCH] [Bugfix] Fix matmul allreduce precision issue by using original weight (#4939)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

This PR fixes a precision issue caused by improper Tensor maintenance in
`vllm_ascend/ops/linear_op.py` under the Verl reinforcement learning (RL)
scenario.

Issue: https://github.com/vllm-project/vllm-ascend/issues/5747

Key changes:
1. Remove the custom class member `self.weight_t` in
   `vllm_ascend/ops/linear_op.py`;
2. Change the input of the `npu_mm_all_reduce_base` operator to fetch the
   weight directly from the model's `nn.Parameter`, instead of using a
   pre-created Tensor.

> In a vLLM model, avoid creating extra copies of parameters (such as
> `self.weight_t`) for computation; if such copies are created, they must be
> kept in sync with the model's original parameters. In the Verl RL scenario,
> parameter synchronization between training and inference may change the
> memory addresses of `nn.Parameter`s, so an unsynchronized extra Tensor keeps
> referencing the old memory and never picks up the updated weights, which
> ultimately leads to precision issues. A minimal repro sketch is appended
> after the diff below.

### Does this PR introduce _any_ user-facing change?

No.

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: icerain-alt <450125138@qq.com>
Co-authored-by: Shangwei-Li
---
 vllm_ascend/ops/linear_op.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index 8d8ecbe0..ef612ce9 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -423,7 +423,7 @@ class MatmulAllreduceRowParallelOp(CustomRowParallelOp):
         bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
         if self.reduce_results and self.tp_size > 1:
             output = torch_npu.npu_mm_all_reduce_base(input_parallel,
-                                                      self.weight_t,
+                                                      self.layer.weight.t(),
                                                       self.hcomm_info,
                                                       bias=bias_)
         else:
@@ -450,10 +450,6 @@ class MatmulAllreduceRowParallelOp(CustomRowParallelOp):
         cls._HCOMM_INFO = group.get_hccl_comm_name(rank)
         return cls._HCOMM_INFO
 
-    def update_attrs(self):
-        super().update_attrs()
-        self.weight_t = self.layer.weight.t()
-
 
 
 class SequenceColumnParallelOp(CustomColumnParallelOp):
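
For reviewers who want to see the failure mode in isolation, below is a
minimal, CPU-only sketch in plain `torch` (no `torch_npu`). The names
`CachedOp`, `FixedOp`, and `verl_style_weight_sync` are hypothetical
stand-ins, not vllm-ascend APIs; the sketch only demonstrates how a tensor
cached the way the removed `self.weight_t` was goes stale once a Verl-style
weight sync rebuilds the `nn.Parameter`:

```python
import torch
import torch.nn as nn


class CachedOp:
    """Pre-fix behavior: caches weight.t() once, like the removed self.weight_t."""

    def __init__(self, layer: nn.Linear):
        self.layer = layer
        self.weight_t = layer.weight.t()  # a view of the storage at init time

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ self.weight_t  # keeps using whatever storage it captured


class FixedOp:
    """Post-fix behavior: re-derives the transpose from the live parameter."""

    def __init__(self, layer: nn.Linear):
        self.layer = layer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ self.layer.weight.t()  # always reads the current parameter


def verl_style_weight_sync(layer: nn.Linear) -> None:
    """Simulate an RL training -> inference sync: the parameter object is
    replaced, so its underlying memory address changes."""
    with torch.no_grad():
        layer.weight = nn.Parameter(torch.randn_like(layer.weight))


torch.manual_seed(0)
layer = nn.Linear(4, 4, bias=False)
cached, fixed = CachedOp(layer), FixedOp(layer)
x = torch.randn(2, 4)

verl_style_weight_sync(layer)  # weights change after the ops were built

expected = x @ layer.weight.t()
print(torch.allclose(cached.forward(x), expected))  # False: stale cached copy
print(torch.allclose(fixed.forward(x), expected))   # True: follows the sync
```

Note that re-deriving the transpose on every call, as the patched
`self.layer.weight.t()` does, adds no copy: `.t()` returns a view of the
parameter's current storage.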