[eagle][cp] fix eagle_cp enable bug2 (#7079)
### What this PR does / why we need it?
Fix the acceptance and high-concurrency bugs that occur when eagle3 (speculative decoding) and CP (context parallelism) are enabled together.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Tests and unit tests.
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: lilinsiman <lilinsiman@gmail.com>
@@ -143,7 +143,7 @@ class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
         chunked_context_metadata = None
         attn_mask_seqlens = common_long_seq_metadata.attn_mask_seqlens
         if num_prefills > 0:
-            query_lens = query_lens[num_decode_tokens:]
+            query_lens = query_lens[num_decodes:]
             context_lens_cpu = num_computed_tokens_cpu[num_decodes:num_reqs]
             max_context_len_cpu = context_lens_cpu.max().item()
             if self.chunked_prefill_enabled and max_context_len_cpu > 0:
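Context on the first hunk: `query_lens` holds one entry per request, so dropping the decode portion has to slice by the request count (`num_decodes`) rather than the token count (`num_decode_tokens`). With eagle3 each decode request schedules more than one token per step, so the two counts diverge and the old slice cut into or past the prefill entries. A minimal sketch with hypothetical values (the `num_spec_tokens` and `query_lens` numbers below are illustrative, not taken from the patch):

```python
# Illustrative sketch (hypothetical sizes): why slicing query_lens by the
# decode *token* count differs from slicing by the decode *request* count
# once eagle3 schedules several tokens per decode request.
import torch

num_spec_tokens = 2                                      # draft tokens per step (assumed)
num_decodes = 3                                          # decode requests in the batch
num_decode_tokens = num_decodes * (1 + num_spec_tokens)  # 9 decode tokens in total

# One entry per request: 3 decode requests followed by 2 prefill requests.
query_lens = torch.tensor([3, 3, 3, 128, 64])

old_slice = query_lens[num_decode_tokens:]  # index 9 is past the end -> empty tensor
new_slice = query_lens[num_decodes:]        # keeps the two prefill entries

print(old_slice)  # tensor([], dtype=torch.int64)
print(new_slice)  # tensor([128,  64])
```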
@@ -1016,7 +1016,7 @@ class NPUModelRunner(GPUModelRunner):
         target_positions = self._get_positions(num_scheduled_tokens)
         target_hidden_states = hidden_states
         if self.use_aux_hidden_state_outputs:
-            target_hidden_states = torch.cat([h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1)
+            target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
         else:
             token_indices_to_sample = None
         # input_ids can be None for multimodal models.
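For the second hunk: the auxiliary hidden states used by the eagle3 drafter are concatenated along the feature dimension, and the fix keeps each tensor's full token dimension instead of truncating it to `num_scheduled_tokens`. A shape sketch with hypothetical dimensions (three aux layers, `hidden_size`, and `num_local_tokens` are assumptions for illustration; the reasoning that the local token count can differ from the global scheduled count under CP is likewise an assumption, not stated in the patch):

```python
# Shape sketch (hypothetical dimensions): concatenating auxiliary hidden states
# along the feature dimension for the eagle3 drafter, without truncating the
# token dimension to num_scheduled_tokens.
import torch

hidden_size = 4096
num_local_tokens = 96   # tokens held by this rank; under CP this may differ from
                        # the scheduler's global num_scheduled_tokens (assumed)

# One tensor per auxiliary layer tapped from the target model (three assumed here).
aux_hidden_states = [torch.randn(num_local_tokens, hidden_size) for _ in range(3)]

target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1)
print(target_hidden_states.shape)  # torch.Size([96, 12288])
```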