diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py
index af3d904..7292a29 100644
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -1009,7 +1009,10 @@ class TestNPUWorker(TestBase):
 
     @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
     @patch("vllm_ascend.worker.worker_v1.logger")
-    def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger,
+    @patch("torch_npu._npu_matmul_add_fp32")
+    def test_compile_or_warm_up_model_with_eager_mode(self,
+                                                      mock_npu_matmul_add,
+                                                      mock_logger,
                                                       mock_seed_everything):
         """Test compile_or_warm_up_model method - eager mode"""
         from vllm_ascend.worker.worker_v1 import NPUWorker
@@ -1051,10 +1054,14 @@ class TestNPUWorker(TestBase):
             # Verify seed setting
             mock_seed_everything.assert_called_once_with(12345)
 
+            # Verify the ATB warm-up matmul was invoked exactly once
+            mock_npu_matmul_add.assert_called_once()
+
     @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
     @patch("vllm_ascend.worker.worker_v1.logger")
+    @patch("torch_npu._npu_matmul_add_fp32")
     def test_compile_or_warm_up_model_with_graph_capture(
-            self, mock_logger, mock_seed_everything):
+            self, mock_npu_matmul_add, mock_logger, mock_seed_everything):
         """Test compile_or_warm_up_model method - with graph capture enabled"""
         from vllm_ascend.worker.worker_v1 import NPUWorker
 
@@ -1087,6 +1094,9 @@ class TestNPUWorker(TestBase):
             # Verify seed setting
             mock_seed_everything.assert_called_once_with(67890)
 
+            # Verify the ATB warm-up matmul was invoked exactly once
+            mock_npu_matmul_add.assert_called_once()
+
     @patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
     def test_initialize_from_config_with_sleep_mode(self,
                                                     mock_allocator_class):
diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py
index c373fe1..4ee41eb 100644
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -20,7 +20,6 @@
 from typing import Optional, Union
 
 import torch
-import torch_npu
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.compilation.decorators import support_torch_compile
@@ -277,11 +276,6 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
-        # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) may cause performance degradation at runtime.
-        x = torch.rand((2, 4), dtype=torch.float16).npu()
-        weight = torch.rand((2, 4), dtype=torch.float16).npu()
-        c = torch.rand((4, 4), dtype=torch.float32).npu()
-        torch_npu._npu_matmul_add_fp32(x, weight, c)
 
     def forward(
         self,
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index a3efe46..cc788db 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -250,10 +250,19 @@ class NPUWorker(WorkerBase):
             self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
+        # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
+        # may cause performance degradation at runtime.
+        self._warm_up_atb()
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         NPUPlatform.seed_everything(self.model_config.seed)
 
+    def _warm_up_atb(self):
+        """Run one throwaway ATB matmul-add on tiny NPU tensors as warm-up."""
+        fp16 = [torch.rand((2, 4), dtype=torch.float16).npu() for _ in (0, 1)]
+        fp32 = torch.rand((4, 4), dtype=torch.float32).npu()
+        torch_npu._npu_matmul_add_fp32(*fp16, fp32)
+
     def get_model(self) -> nn.Module:
         return self.model_runner.get_model()