From 4ee58e213b421ece745dba6e94967e6f557263ce Mon Sep 17 00:00:00 2001 From: zzhxxx <96690582+zzhx1@users.noreply.github.com> Date: Wed, 24 Sep 2025 18:44:15 +0800 Subject: [PATCH] [BugFix] explicitly setting the tensor shape of otp output (#3027) When MTP and oprojTP are both enabled, the torchair graph is recompiled, which degrades performance. This PR fixes the issue by explicitly setting the tensor shape of the otp output after the reduce-scatter. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/486c5599e3ab7d721c94dd01e89c87742c01e1ac --------- Signed-off-by: zzhx1 --- vllm_ascend/ops/linear_op.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py index 57044f5..f6feadd 100644 --- a/vllm_ascend/ops/linear_op.py +++ b/vllm_ascend/ops/linear_op.py @@ -299,6 +299,7 @@ class OProjRowParallelOp(CustomRowParallelOp): # otp-specific: Combine partial results across devices output = self.comm_group.reduce_scatter(output_parallel, dim=0) + output = output.view(input_.shape[0], self.layer.output_size) # Handle bias return based on configuration output_bias = self.bias if self.skip_bias_add else None