diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3c9fc126..20891e50 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -521,8 +521,12 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.speculative_config else 0) self.use_aclgraph = self._use_aclgraph() - self.aclgraph_batch_sizes = list( - reversed(self.compilation_config.cudagraph_capture_sizes)) + + # self.aclgraph_batch_sizes is sorted in ascending order. + if (self.compilation_config.cudagraph_capture_sizes and + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE): + self.aclgraph_batch_sizes = sorted( + self.compilation_config.cudagraph_capture_sizes) self.uniform_decode_query_len = 1 if not self.speculative_config else \ 1 + self.speculative_config.num_speculative_tokens @@ -4101,7 +4105,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): if aclgraph_mode.mixed_mode() != CUDAGraphMode.NONE: aclgraph_runtime_mode = aclgraph_mode.mixed_mode() - compilation_cases = sorted(self.aclgraph_batch_sizes) + # make sure we capture the largest batch size first + compilation_cases = list(reversed(self.aclgraph_batch_sizes)) try: self._capture_aclgraphs(