[eagle][cp] fix eagle_cp enable bug2 (#7079)
### What this PR does / why we need it?
Fix the acceptance and high-concurrency bugs that occur when eagle3 (speculative decoding) and CP (context parallelism) are enabled together.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Tests and unit tests.
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: lilinsiman <lilinsiman@gmail.com>
@@ -143,7 +143,7 @@ class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
         chunked_context_metadata = None
         attn_mask_seqlens = common_long_seq_metadata.attn_mask_seqlens
         if num_prefills > 0:
-            query_lens = query_lens[num_decode_tokens:]
+            query_lens = query_lens[num_decodes:]
             context_lens_cpu = num_computed_tokens_cpu[num_decodes:num_reqs]
             max_context_len_cpu = context_lens_cpu.max().item()
             if self.chunked_prefill_enabled and max_context_len_cpu > 0:
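Context on the first hunk: `query_lens` holds one entry per request, so dropping the decode portion has to slice by the request count (`num_decodes`) rather than the token count (`num_decode_tokens`). With eagle3 each decode request schedules more than one token per step, so the two counts diverge and the old slice cut into or past the prefill entries. A minimal sketch with hypothetical values (the `num_spec_tokens` and `query_lens` numbers below are illustrative, not taken from the patch):

```python
# Illustrative sketch (hypothetical sizes): why slicing query_lens by the
# decode *token* count differs from slicing by the decode *request* count
# once eagle3 schedules several tokens per decode request.
import torch

num_spec_tokens = 2                                      # draft tokens per step (assumed)
num_decodes = 3                                          # decode requests in the batch
num_decode_tokens = num_decodes * (1 + num_spec_tokens)  # 9 decode tokens in total

# One entry per request: 3 decode requests followed by 2 prefill requests.
query_lens = torch.tensor([3, 3, 3, 128, 64])

old_slice = query_lens[num_decode_tokens:]  # index 9 is past the end -> empty tensor
new_slice = query_lens[num_decodes:]        # keeps the two prefill entries

print(old_slice)  # tensor([], dtype=torch.int64)
print(new_slice)  # tensor([128,  64])
```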
@@ -1016,7 +1016,7 @@ class NPUModelRunner(GPUModelRunner):
         target_positions = self._get_positions(num_scheduled_tokens)
         target_hidden_states = hidden_states
         if self.use_aux_hidden_state_outputs:
-            target_hidden_states = torch.cat([h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1)
+            target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
         else:
             token_indices_to_sample = None
         # input_ids can be None for multimodal models.
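For the second hunk: the auxiliary hidden states used by the eagle3 drafter are concatenated along the feature dimension, and the fix keeps each tensor's full token dimension instead of truncating it to `num_scheduled_tokens`. A shape sketch with hypothetical dimensions (three aux layers, `hidden_size`, and `num_local_tokens` are assumptions for illustration; the reasoning that the local token count can differ from the global scheduled count under CP is likewise an assumption, not stated in the patch):

```python
# Shape sketch (hypothetical dimensions): concatenating auxiliary hidden states
# along the feature dimension for the eagle3 drafter, without truncating the
# token dimension to num_scheduled_tokens.
import torch

hidden_size = 4096
num_local_tokens = 96   # tokens held by this rank; under CP this may differ from
                        # the scheduler's global num_scheduled_tokens (assumed)

# One tensor per auxiliary layer tapped from the target model (three assumed here).
aux_hidden_states = [torch.randn(num_local_tokens, hidden_size) for _ in range(3)]

target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
print(target_hidden_states.shape)  # torch.Size([96, 12288])
```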