diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index af3d904..7292a29 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -1009,7 +1009,10 @@ class TestNPUWorker(TestBase): @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything") @patch("vllm_ascend.worker.worker_v1.logger") - def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger, + @patch("torch_npu._npu_matmul_add_fp32") + def test_compile_or_warm_up_model_with_eager_mode(self, + mock_npu_matmul_add, + mock_logger, mock_seed_everything): """Test compile_or_warm_up_model method - eager mode""" from vllm_ascend.worker.worker_v1 import NPUWorker @@ -1051,10 +1054,14 @@ class TestNPUWorker(TestBase): # Verify seed setting mock_seed_everything.assert_called_once_with(12345) + # Verify calls + mock_npu_matmul_add.assert_called_once() + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything") @patch("vllm_ascend.worker.worker_v1.logger") + @patch("torch_npu._npu_matmul_add_fp32") def test_compile_or_warm_up_model_with_graph_capture( - self, mock_logger, mock_seed_everything): + self, mock_npu_matmul_add, mock_logger, mock_seed_everything): """Test compile_or_warm_up_model method - with graph capture enabled""" from vllm_ascend.worker.worker_v1 import NPUWorker @@ -1087,6 +1094,9 @@ class TestNPUWorker(TestBase): # Verify seed setting mock_seed_everything.assert_called_once_with(67890) + # Verify calls + mock_npu_matmul_add.assert_called_once() + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") def test_initialize_from_config_with_sleep_mode(self, mock_allocator_class): diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index c373fe1..4ee41eb 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -20,7 +20,6 @@ from typing import Optional, Union import torch -import torch_npu from torch import nn from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile @@ -277,11 +276,6 @@ class CustomQwen3MoeModel(Qwen3MoeModel): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) may cause performance degradation at runtime. - x = torch.rand((2, 4), dtype=torch.float16).npu() - weight = torch.rand((2, 4), dtype=torch.float16).npu() - c = torch.rand((4, 4), dtype=torch.float32).npu() - torch_npu._npu_matmul_add_fp32(x, weight, c) def forward( self, diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index a3efe46..cc788db 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -250,10 +250,19 @@ class NPUWorker(WorkerBase): self.model_runner._dummy_run(size) if not self.model_config.enforce_eager: self.model_runner.capture_model() + # Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) + # may cause performance degradation at runtime. + self._warm_up_atb() # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. NPUPlatform.seed_everything(self.model_config.seed) + def _warm_up_atb(self): + x = torch.rand((2, 4), dtype=torch.float16).npu() + weight = torch.rand((2, 4), dtype=torch.float16).npu() + c = torch.rand((4, 4), dtype=torch.float32).npu() + torch_npu._npu_matmul_add_fp32(x, weight, c) + def get_model(self) -> nn.Module: return self.model_runner.get_model()