diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 85bbad89..14db8976 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -748,6 +748,7 @@ class MtpProposer(Proposer):
         has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+        original_aclgraph_runtime_mode = aclgraph_runtime_mode
         if self.use_async_scheduling:
             # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
@@ -802,6 +803,17 @@ class MtpProposer(Proposer):
                 hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
                     hidden_states)
 
+            if original_aclgraph_runtime_mode == CUDAGraphMode.FULL and \
+                    self.use_async_scheduling:
+                for layer_name in self.attn_layer_name:
+                    if attn_metadata[layer_name].decode is None:
+                        continue
+                    actual_size = len(attn_metadata[layer_name].decode.actual_seq_lengths_q)
+                    attn_metadata[layer_name].decode.seq_lens_list = \
+                        attn_metadata[layer_name].decode.seq_lens_list[:actual_size]
+                    attn_metadata[layer_name].decode.block_table = \
+                        attn_metadata[layer_name].decode.block_table[:actual_size]
+
             hidden_states = self.model(input_ids=input_ids,
                                        positions=positions,
                                        hidden_states=hidden_states)
@@ -1133,8 +1145,9 @@ class MtpProposer(Proposer):
             num_computed_tokens_cpu,
             seq_lens=common_attn_metadata.seq_lens)
 
-        token_indices_to_sample = (common_attn_metadata.query_start_loc[1:] -
-                                   1 - num_rejected_tokens_gpu)
+        query_start_loc = common_attn_metadata.query_start_loc[
+            1:1 + num_rejected_tokens_gpu.shape[0]]
+        token_indices_to_sample = query_start_loc - 1 - num_rejected_tokens_gpu
 
         return spec_common_attn_metadata, token_indices, token_indices_to_sample
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 091f4e59..0b514e60 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1019,7 +1019,7 @@ class NPUModelRunner(GPUModelRunner):
         # TODO: We should make this official ASAP. Also note that if we pad here,
         # the builders won’t need to add any extra padding.
         if self.compilation_config.cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
-                uniform_decode:
+                uniform_decode and num_input_tokens <= self.cudagraph_batch_sizes[-1]:
             num_reqs_padded = num_input_tokens // self.uniform_decode_query_len
             pad_size = num_reqs_padded - num_reqs
             if pad_size > 0: