[main] adjust the position of warm_up_atb (#2823)
### What this PR does / why we need it?
Adjust the position of warm_up_atb.
### Does this PR introduce _any_ user-facing change?
No. This change does not introduce any user-facing change.
### How was this patch tested?
CI passed with existing tests.
- vLLM version: main
- vLLM main:
b23fb78623
Signed-off-by: huangxialu <huangxialu1@huawei.com>
This commit is contained in:
@@ -1009,7 +1009,10 @@ class TestNPUWorker(TestBase):
|
|||||||
|
|
||||||
@patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
|
@patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
|
||||||
@patch("vllm_ascend.worker.worker_v1.logger")
|
@patch("vllm_ascend.worker.worker_v1.logger")
|
||||||
def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger,
|
@patch("torch_npu._npu_matmul_add_fp32")
|
||||||
|
def test_compile_or_warm_up_model_with_eager_mode(self,
|
||||||
|
mock_npu_matmul_add,
|
||||||
|
mock_logger,
|
||||||
mock_seed_everything):
|
mock_seed_everything):
|
||||||
"""Test compile_or_warm_up_model method - eager mode"""
|
"""Test compile_or_warm_up_model method - eager mode"""
|
||||||
from vllm_ascend.worker.worker_v1 import NPUWorker
|
from vllm_ascend.worker.worker_v1 import NPUWorker
|
||||||
@@ -1051,10 +1054,14 @@ class TestNPUWorker(TestBase):
|
|||||||
# Verify seed setting
|
# Verify seed setting
|
||||||
mock_seed_everything.assert_called_once_with(12345)
|
mock_seed_everything.assert_called_once_with(12345)
|
||||||
|
|
||||||
|
# Verify calls
|
||||||
|
mock_npu_matmul_add.assert_called_once()
|
||||||
|
|
||||||
@patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
|
@patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything")
|
||||||
@patch("vllm_ascend.worker.worker_v1.logger")
|
@patch("vllm_ascend.worker.worker_v1.logger")
|
||||||
|
@patch("torch_npu._npu_matmul_add_fp32")
|
||||||
def test_compile_or_warm_up_model_with_graph_capture(
|
def test_compile_or_warm_up_model_with_graph_capture(
|
||||||
self, mock_logger, mock_seed_everything):
|
self, mock_npu_matmul_add, mock_logger, mock_seed_everything):
|
||||||
"""Test compile_or_warm_up_model method - with graph capture enabled"""
|
"""Test compile_or_warm_up_model method - with graph capture enabled"""
|
||||||
from vllm_ascend.worker.worker_v1 import NPUWorker
|
from vllm_ascend.worker.worker_v1 import NPUWorker
|
||||||
|
|
||||||
@@ -1087,6 +1094,9 @@ class TestNPUWorker(TestBase):
|
|||||||
# Verify seed setting
|
# Verify seed setting
|
||||||
mock_seed_everything.assert_called_once_with(67890)
|
mock_seed_everything.assert_called_once_with(67890)
|
||||||
|
|
||||||
|
# Verify calls
|
||||||
|
mock_npu_matmul_add.assert_called_once()
|
||||||
|
|
||||||
@patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
|
@patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
|
||||||
def test_initialize_from_config_with_sleep_mode(self,
|
def test_initialize_from_config_with_sleep_mode(self,
|
||||||
mock_allocator_class):
|
mock_allocator_class):
|
||||||
|
|||||||
@@ -20,7 +20,6 @@
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch_npu
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
from vllm.compilation.decorators import support_torch_compile
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
@@ -277,11 +276,6 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
|
|||||||
self.make_empty_intermediate_tensors = (
|
self.make_empty_intermediate_tensors = (
|
||||||
make_empty_intermediate_tensors_factory(
|
make_empty_intermediate_tensors_factory(
|
||||||
["hidden_states", "residual"], config.hidden_size))
|
["hidden_states", "residual"], config.hidden_size))
|
||||||
# Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache) may cause performance degradation at runtime.
|
|
||||||
x = torch.rand((2, 4), dtype=torch.float16).npu()
|
|
||||||
weight = torch.rand((2, 4), dtype=torch.float16).npu()
|
|
||||||
c = torch.rand((4, 4), dtype=torch.float32).npu()
|
|
||||||
torch_npu._npu_matmul_add_fp32(x, weight, c)
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -250,10 +250,19 @@ class NPUWorker(WorkerBase):
|
|||||||
self.model_runner._dummy_run(size)
|
self.model_runner._dummy_run(size)
|
||||||
if not self.model_config.enforce_eager:
|
if not self.model_config.enforce_eager:
|
||||||
self.model_runner.capture_model()
|
self.model_runner.capture_model()
|
||||||
|
# Call ATB matmul to warm up; otherwise, the first operation (ReshapeAndCache)
|
||||||
|
# may cause performance degradation at runtime.
|
||||||
|
self._warm_up_atb()
|
||||||
# Reset the seed to ensure that the random state is not affected by
|
# Reset the seed to ensure that the random state is not affected by
|
||||||
# the model initialization and profiling.
|
# the model initialization and profiling.
|
||||||
NPUPlatform.seed_everything(self.model_config.seed)
|
NPUPlatform.seed_everything(self.model_config.seed)
|
||||||
|
|
||||||
|
def _warm_up_atb(self):
|
||||||
|
x = torch.rand((2, 4), dtype=torch.float16).npu()
|
||||||
|
weight = torch.rand((2, 4), dtype=torch.float16).npu()
|
||||||
|
c = torch.rand((4, 4), dtype=torch.float32).npu()
|
||||||
|
torch_npu._npu_matmul_add_fp32(x, weight, c)
|
||||||
|
|
||||||
def get_model(self) -> nn.Module:
|
def get_model(self) -> nn.Module:
|
||||||
return self.model_runner.get_model()
|
return self.model_runner.get_model()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user