From 4ee58e213b421ece745dba6e94967e6f557263ce Mon Sep 17 00:00:00 2001
From: zzhxxx <96690582+zzhx1@users.noreply.github.com>
Date: Wed, 24 Sep 2025 18:44:15 +0800
Subject: [PATCH] [BugFix] explicitly setting the tensor shape of otp output
 (#3027)

When MTP and oprojTP are both enabled, the torchair graph is recompiled,
which degrades performance. This PR fixes the issue by explicitly
setting the shape of the otp output tensor.

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/486c5599e3ab7d721c94dd01e89c87742c01e1ac

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
---
 vllm_ascend/ops/linear_op.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index 57044f5..f6feadd 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -299,6 +299,7 @@ class OProjRowParallelOp(CustomRowParallelOp):
 
         # otp-specific: Combine partial results across devices
         output = self.comm_group.reduce_scatter(output_parallel, dim=0)
+        output = output.view(input_.shape[0], self.layer.output_size)
 
         # Handle bias return based on configuration
         output_bias = self.bias if self.skip_bias_add else None