[CI] Fix broken CI (#6599)

Revert
4fb3d5e1b2
it breaks E2E Test

- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd
This commit is contained in:
wangxiyuan
2026-02-06 17:23:58 +08:00
committed by GitHub
parent 19b5d44ea8
commit 06c0aed124
17 changed files with 1147 additions and 947 deletions

View File

@@ -1,11 +1,13 @@
import torch
from vllm.config import CUDAGraphMode
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer as VllmSuffixDecodingProposer
from vllm.v1.spec_decode.suffix_decoding import \
SuffixDecodingProposer as VllmSuffixDecodingProposer
from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer):
def __init__(self, vllm_config, device, runner):
super().__init__(vllm_config)
self.name = SpecDcodeType.SUFFIX
@@ -17,30 +19,27 @@ class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer):
pass
@torch.inference_mode()
def dummy_run(
self,
num_tokens,
with_prefill=None,
in_graph_capturing=None,
num_reqs=None,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None,
is_profile=False,
):
def dummy_run(self,
num_tokens,
with_prefill=None,
in_graph_capturing=None,
num_reqs=None,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None,
is_profile=False):
pass
def generate_token_ids(
self,
valid_sampled_token_ids,
sampling_metadata=None,
scheduler_output=None,
spec_decode_metadata=None,
positions=None,
num_scheduled_tokens=None,
hidden_states=None,
aux_hidden_states=None,
) -> list[list[int]]:
draft_token_ids = self.propose(self.runner.input_batch, valid_sampled_token_ids)
def generate_token_ids(self,
valid_sampled_token_ids,
sampling_metadata=None,
scheduler_output=None,
spec_decode_metadata=None,
positions=None,
num_scheduled_tokens=None,
hidden_states=None,
aux_hidden_states=None) -> list[list[int]]:
draft_token_ids = self.propose(self.runner.input_batch,
valid_sampled_token_ids)
return draft_token_ids