Drop 0.10.2 (#3284)

Drop v0.10.2 support; we support vLLM 0.11.0rc3 now.
- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-10-09 10:28:38 +08:00
committed by GitHub
parent 2dde1268c7
commit f12f76d7ba
17 changed files with 202 additions and 653 deletions

View File

@@ -21,7 +21,6 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
from vllm_ascend.utils import vllm_version_is
PADDING_SLOT_ID = -1
@@ -352,10 +351,7 @@ class EagleProposer(Proposer):
decode_token_per_req=self.runner.decode_token_per_req,
num_computed_tokens_cpu=None,
seq_lens=None)
if vllm_version_is("0.10.2"):
builder = self.runner.attn_groups[0][0].metadata_builder
else:
builder = self.runner.attn_groups[0][0].get_metadata_builder()
builder = self.runner.attn_groups[0][0].get_metadata_builder()
attn_metadata_i = builder.build(0, common_attn_metadata,
self.runner.get_model())
for layer_name in kv_cache_group_spec.layer_names:
@@ -447,10 +443,7 @@ class EagleProposer(Proposer):
num_computed_tokens_cpu=None,
seq_lens=None)
# FIXME(woosuk): The below two ops cause synchronization. Optimize.
if vllm_version_is("0.10.2"):
builder = self.runner.attn_groups[0][0].metadata_builder
else:
builder = self.runner.attn_groups[0][0].get_metadata_builder()
builder = self.runner.attn_groups[0][0].get_metadata_builder()
attn_metadata = builder.build(0, common_attn_metadata,
self.runner.get_model())
if self.use_cuda_graph and \
@@ -479,10 +472,7 @@ class EagleProposer(Proposer):
hidden_states=self.hidden_states[:num_input_tokens],
)
sample_hidden_states = last_hidden_states[last_token_indices]
if vllm_version_is("0.10.2"):
logits = self.model.compute_logits(sample_hidden_states, None)
else:
logits = self.model.compute_logits(sample_hidden_states)
logits = self.model.compute_logits(sample_hidden_states)
draft_token_ids = logits.argmax(dim=-1)
# Early exit if there is only one draft token to be generated.
@@ -586,12 +576,7 @@ class EagleProposer(Proposer):
hidden_states=self.hidden_states[:input_batch_size],
)
hidden_states = hidden_states[:batch_size]
if vllm_version_is("0.10.2"):
logits = self.model.compute_logits(
last_hidden_states[:batch_size], None)
else:
logits = self.model.compute_logits(
last_hidden_states[:batch_size])
logits = self.model.compute_logits(last_hidden_states[:batch_size])
# TODO(wenlong): get more than one token for tree attention
draft_token_ids = logits.argmax(dim=-1)