From ad9d9569ea61807d3d6d4bad570ad95dad637f1f Mon Sep 17 00:00:00 2001
From: Canlin Guo
Date: Wed, 25 Feb 2026 14:37:53 +0800
Subject: [PATCH] [Bugfix] Add the missing parentheses to @torch.inference_mode
 (#6757)

### What this PR does / why we need it?
This PR fixes a bug in `vllm_ascend/worker/model_runner_v1.py` where the
`@torch.inference_mode` decorator was used without parentheses. Using the
decorator without instantiation is deprecated and may not correctly disable
gradient calculations, leading to performance degradation and increased
memory usage during inference. This change adds the required parentheses to
ensure `torch.inference_mode` is applied correctly.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
The change is a minor syntax correction. Existing CI tests should cover this.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007

Signed-off-by: gcanlin
---
 vllm_ascend/worker/model_runner_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index a03c0b3b..1f0ff0bf 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1325,7 +1325,7 @@ class NPUModelRunner(GPUModelRunner):
         self.kv_connector_output = kv_connector_output
         return None
 
-    @torch.inference_mode
+    @torch.inference_mode()
     def sample_tokens(
         self, grammar_output: "GrammarOutput | None"
     ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
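
For context, a minimal standalone sketch of the corrected usage (the `double` function below is hypothetical, not from this codebase); it demonstrates what the instantiated decorator guarantees:

```python
import torch

# With parentheses, an inference_mode instance wraps the function,
# so autograd tracking is disabled for everything computed inside it.
@torch.inference_mode()
def double(x: torch.Tensor) -> torch.Tensor:  # hypothetical example function
    return x * 2

x = torch.ones(3, requires_grad=True)
y = double(x)
print(y.requires_grad)   # False: computed with gradient tracking disabled
print(y.is_inference())  # True: y is an inference tensor
```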