[Spec Decode]clean up spec decode interface (#6947)
This pull request refactors the speculative decoding proposer interface
to align with upstream vLLM, removing the local `Proposer` interface and
renaming methods to `propose`.
This is the first step. In the future we should remove the class
register and just add few Ascend specified method once the arch in vLLM
is ready.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -1,36 +1,20 @@
|
||||
import torch
|
||||
from vllm.config import CUDAGraphMode, VllmConfig
|
||||
from vllm.config import CUDAGraphMode
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.medusa import MedusaProposer as VllmMedusaProposer
|
||||
from vllm.v1.spec_decode.medusa import MedusaProposer
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
|
||||
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
|
||||
from vllm_ascend.spec_decode.interface import SpecDcodeType
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MedusaProposer(VllmMedusaProposer):
|
||||
class AscendMedusaProposer(MedusaProposer):
|
||||
"""
|
||||
Medusa proposer class for generating token sequences
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
runner,
|
||||
):
|
||||
# Save config parameters
|
||||
self.name = SpecDcodeType.MEDUSA
|
||||
self.vllm_config = vllm_config
|
||||
self.device = device
|
||||
self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
|
||||
self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size()
|
||||
self.dtype = vllm_config.model_config.dtype
|
||||
self.runner = runner
|
||||
|
||||
@torch.inference_mode()
|
||||
def dummy_run(
|
||||
self,
|
||||
@@ -62,14 +46,12 @@ class MedusaProposer(VllmMedusaProposer):
|
||||
self.model(hidden_states)
|
||||
dummy_compute_logits(hidden_states)
|
||||
|
||||
def generate_token_ids(
|
||||
def propose(
|
||||
self,
|
||||
valid_sampled_token_ids: list[list[int]],
|
||||
sampling_metadata: SamplingMetadata,
|
||||
spec_decode_metadata: SpecDecodeMetadata,
|
||||
sample_hidden_states: torch.Tensor,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
if sample_hidden_states.shape[0] == len(valid_sampled_token_ids):
|
||||
# The input to the target model does not include draft tokens.
|
||||
@@ -84,7 +66,7 @@ class MedusaProposer(VllmMedusaProposer):
|
||||
indices = offsets + num_accepted_tokens - 1
|
||||
hidden_states = sample_hidden_states[indices]
|
||||
|
||||
spec_token_ids = self.propose(
|
||||
spec_token_ids = super().propose(
|
||||
target_hidden_states=hidden_states,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user