diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 80148785..757da788 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2072,10 +2072,14 @@ class NPUModelRunner(GPUModelRunner):
         if self.is_kv_producer and not self.is_kv_consumer:
             with_prefill = True
 
+        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
+        _ag_mode, batch_descriptor = \
+            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp,
-         with_prefill) = self._sync_metadata_across_dp(num_tokens,
-                                                       with_prefill)
+         with_prefill) = self._sync_metadata_across_dp(
+             batch_descriptor.num_tokens, with_prefill)
 
         # If cudagraph_mode.decode_mode() == FULL and
         # cudagraph_mode.seperate_routine(). This means that we are using
@@ -2122,9 +2126,11 @@ class NPUModelRunner(GPUModelRunner):
         if not is_profile and self.dynamic_eplb:
             self.eplb_updator.forward_before()
 
-        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
-        _ag_mode, batch_descriptor = \
-            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+        if num_tokens != batch_descriptor.num_tokens:
+            _ag_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
+                num_tokens=num_tokens,
+                uniform_decode=uniform_decode,
+                has_lora=has_lora)
 
         num_tokens_padded = batch_descriptor.num_tokens
         num_reqs_padded = (batch_descriptor.num_reqs if