diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a03c0b3b..1f0ff0bf 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1325,7 +1325,7 @@ class NPUModelRunner(GPUModelRunner): self.kv_connector_output = kv_connector_output return None - @torch.inference_mode + @torch.inference_mode() def sample_tokens( self, grammar_output: "GrammarOutput | None" ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: