From a5ea699e29c28c72cacf1e7a55348691d313f380 Mon Sep 17 00:00:00 2001
From: lilinsiman
Date: Tue, 10 Mar 2026 16:32:49 +0800
Subject: [PATCH] [eagle][cp] fix eagle_cp enable bug2 (#7079)

### What this PR does / why we need it?
Fixes an acceptance bug and a high-concurrency bug that appear when EAGLE3
speculative decoding and context parallelism (CP) are enabled together.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tests and unit tests.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: lilinsiman
---
 vllm_ascend/attention/context_parallel/attention_cp.py | 2 +-
 vllm_ascend/worker/model_runner_v1.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/attention/context_parallel/attention_cp.py b/vllm_ascend/attention/context_parallel/attention_cp.py
index eadd4fd7..655fec1d 100644
--- a/vllm_ascend/attention/context_parallel/attention_cp.py
+++ b/vllm_ascend/attention/context_parallel/attention_cp.py
@@ -143,7 +143,7 @@ class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
         chunked_context_metadata = None
         attn_mask_seqlens = common_long_seq_metadata.attn_mask_seqlens
         if num_prefills > 0:
-            query_lens = query_lens[num_decode_tokens:]
+            query_lens = query_lens[num_decodes:]
             context_lens_cpu = num_computed_tokens_cpu[num_decodes:num_reqs]
             max_context_len_cpu = context_lens_cpu.max().item()
             if self.chunked_prefill_enabled and max_context_len_cpu > 0:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1bb40291..6ed5caa6 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1016,7 +1016,7 @@ class NPUModelRunner(GPUModelRunner):
             target_positions = self._get_positions(num_scheduled_tokens)
             target_hidden_states = hidden_states
             if self.use_aux_hidden_state_outputs:
-                target_hidden_states = torch.cat([h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1)
+                target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
         else:
             token_indices_to_sample = None
             # input_ids can be None for multimodal models.
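
### Note: why the attention_cp fix matters (illustrative)

In the metadata builder, requests are laid out decodes-first and `query_lens` holds one entry per request, so the prefill slice must start at the request count `num_decodes`, not the token count `num_decode_tokens`. The sketch below is not vllm-ascend code; it is a minimal reproduction of the indexing mismatch with made-up batch sizes, showing how the two counts diverge once EAGLE3 adds draft tokens to each decode request.

```python
import torch

# Hypothetical batch: 2 decode requests followed by 2 prefill requests.
# With EAGLE3, each decode request carries the target token plus draft
# tokens, so the decode portion spans more tokens than requests.
num_decodes = 2        # decode requests (per-request granularity)
num_decode_tokens = 6  # decode tokens (per-token granularity)

# One entry per request: [decode, decode, prefill, prefill].
query_lens = torch.tensor([3, 3, 128, 64])

buggy = query_lens[num_decode_tokens:]  # tensor([]) -- prefill entries lost
fixed = query_lens[num_decodes:]        # tensor([128, 64]) -- correct slice
print(buggy, fixed)
```

Without speculative decoding, each decode request contributes exactly one token, so `num_decode_tokens == num_decodes` and the old slice happened to work, which would explain why the bug only surfaced with EAGLE3 and CP enabled together.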