fix profile run for vl model (#5136)

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
Author: shaopeng-666
Date: 2025-12-17 23:51:31 +08:00
Committed by: GitHub
Parent: 43d974c6f7
Commit: 39bdd4cfaa
4 changed files with 1 addition and 3 deletions


```diff
@@ -232,6 +232,7 @@ class NPUWorker(WorkerBase):
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = NPUModelRunner(self.vllm_config, self.device)

+    @torch.inference_mode()
     def determine_available_memory(self) -> int:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
```
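The fix decorates `determine_available_memory()` with `@torch.inference_mode()`, so the dummy forward pass executed during the profile run does not record autograd state while free device memory is measured. A minimal torch-free sketch of the decorator pattern (the `inference_mode` class below is a hypothetical stand-in for the real `torch.inference_mode`, and the byte count is a placeholder, not vLLM's actual logic):

```python
from contextlib import ContextDecorator

# Hypothetical stand-in for torch.inference_mode(): a context manager that
# doubles as a decorator, mirroring how the real decorator wraps
# determine_available_memory() so profiling runs without autograd tracking.
class inference_mode(ContextDecorator):
    active = False

    def __enter__(self):
        self._prev = inference_mode.active
        inference_mode.active = True
        return self

    def __exit__(self, *exc):
        inference_mode.active = self._prev
        return False

@inference_mode()
def determine_available_memory() -> int:
    # While the decorated method runs, the mode is active, so a real
    # profile run would allocate tensors without building autograd graphs.
    assert inference_mode.active
    return 1024  # placeholder free-memory figure for the sketch

free_bytes = determine_available_memory()
# After the call returns, the mode is restored to its previous state.
```

Using the decorator form (rather than a `with` block inside the method) keeps the whole profiling path, including any helpers it calls, inside the no-grad scope.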