From eb390545ec486c48916cf42f246ff693171210c0 Mon Sep 17 00:00:00 2001 From: Vincent Yuan Date: Sat, 5 Jul 2025 16:29:21 +0800 Subject: [PATCH] [Performance] Disable JIT and nd2nz to improve performance for Atlas 300I series (#1591) ### What this PR does / why we need it? Since running on Atlas 300I Duo was initially supported after #1333 , this PR disables the JIT compiler for the 310P and changes the data format to NZ for the weight in the vocabulary embedding and QKV projection layers, which helps improve performance. See #1563 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Test manually: https://github.com/vllm-project/vllm-ascend/pull/1591#issuecomment-3028352339 Signed-off-by: Vincent Yuan --- vllm_ascend/worker/model_runner_v1.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index abb7e5b..fd40d13 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -89,6 +89,7 @@ if TYPE_CHECKING: else: xgr = LazyLoader("xgr", globals(), "xgrammar") +import torch_npu import vllm.envs as envs_vllm import vllm_ascend.envs as envs_ascend @@ -96,6 +97,9 @@ import vllm_ascend.envs as envs_ascend if vllm_version_is("0.9.1"): from vllm.v1.spec_decode.utils import is_spec_decode_supported +if is_310p(): + torch_npu.npu.set_compile_mode(jit_compile=False) + @dataclass class GraphCaptureContext: @@ -2007,6 +2011,18 @@ class NPUModelRunner(LoRAModelRunnerMixin): with DeviceMemoryProfiler() as m: # noqa: SIM117 self.model = get_model(vllm_config=self.vllm_config) + + if is_310p(): + from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, QKVParallelLinear, + RowParallelLinear) + for module in self.model.modules(): + if isinstance(module, + (MergedColumnParallelLinear, + QKVParallelLinear, RowParallelLinear)): + module.weight.data = torch_npu.npu_format_cast( 
module.weight.data, ACL_FORMAT_FRACTAL_NZ) + try: # For version compatibility, remove this after we abort vllm v0.9.1 support from vllm.model_executor.models.interfaces import \