[Spec Decode] Clean up spec decode interface (#6947)

This pull request refactors the speculative decoding proposer interface
to align with upstream vLLM, removing the local `Proposer` interface and
renaming methods to `propose`.

This is the first step. In the future, we should remove the class
registration and add only a few Ascend-specific methods once the
architecture in vLLM is ready.

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2026-03-05 14:30:10 +08:00
committed by GitHub
parent 2bd9c35788
commit 13777bf3f0
11 changed files with 194 additions and 315 deletions

View File

@@ -30,8 +30,7 @@ from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
from vllm.v1.spec_decode.eagle import EagleProposer as VllmEagleProposer
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID, EagleProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -81,7 +80,7 @@ def split_inputs_tp_to_sp(hidden_states, out):
return out[:padded_num_tokens_per_rank]
class EagleProposer(VllmEagleProposer):
class AscendEagleProposer(EagleProposer):
_runnable: ACLGraphWrapper | Callable
def __init__(self, vllm_config: VllmConfig, device: torch.device, runner=None):