[CI/UT][bugfix] fix v0 spec decode (#1321)
### What this PR does / why we need it? 1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913) introduced an error that caused V0's spec decode function to fail. [PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109) attempted to fix this problem. Unfortunately, that fix broke the ngram function. I fixed the ngram function in this PR. **PS**: Q: Why was the ngram problem not caught when PR1109 was merged? A: The newly introduced problem only appears when tp>1, and the use cases on CI are all tp=1. 2. In versions after 0.7.3, vllm-ascend deleted some spec decode UTs to avoid CI taking too long, including the eagle speculative UTs, which left CI unable to cover the eagle function. I added the eagle UT (`test_eagle_correctness.py`) back in this PR. 3. Because of the reason mentioned in 2, the current version of Eagle has a problem. I located and fixed this problem. It was caused by vllm's `draft_model_runner.py` being changed while vllm-ascend was not synchronized in time. 4. Currently, the UTs of v0 and v1 are mixed in the spec_decode directory. I split them into two directories: spec_decode_v0 and spec_decode_v1. 5. I found that `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` have changed in vllm, so I removed the corresponding patches in this PR. ### Does this PR introduce _any_ user-facing change? This PR fixes the ngram and eagle spec decode functions in the v0 engine. ### How was this patch tested? Tested by CI. Signed-off-by: mengwei805 <mengwei25@huawei.com>
This commit is contained in:
@@ -100,18 +100,6 @@
|
||||
# Future Plan:
|
||||
# Revert it when the related pr is merged in vllm and vllm-ascend.
|
||||
#
|
||||
# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and
|
||||
# `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace`
|
||||
# Why:
|
||||
# vLLM's `Remove Sampler from Model Code` change requires vllm-ascend to adapt to it.
|
||||
# How:
|
||||
# Use vLLM 0.8.4 method to patch it.
|
||||
# Related PR (if no, explain why):
|
||||
# - https://github.com/vllm-project/vllm/pull/15195
|
||||
# - https://github.com/vllm-project/vllm-ascend/pull/395
|
||||
# Future Plan:
|
||||
# Remove it when we identify the reasons clearly.
|
||||
#
|
||||
# ** File: worker/patch_common/patch_spec_decode_worker.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker`
|
||||
|
||||
@@ -88,20 +88,4 @@ def sampler_output(
|
||||
return filtered_model_outputs, True
|
||||
|
||||
|
||||
def set_include_gpu_probs_tensor(self) -> None:
|
||||
# Need include_gpu_probs_tensor for MultiStepWorker
|
||||
if hasattr(self.model_runner.model, "sampler"):
|
||||
self.model_runner.model.sampler.include_gpu_probs_tensor = True
|
||||
self.model_runner.sampler.include_gpu_probs_tensor = True
|
||||
|
||||
|
||||
def set_should_modify_greedy_probs_inplace(self) -> None:
|
||||
if hasattr(self.model_runner.model, "sampler"):
|
||||
self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
|
||||
True)
|
||||
self.model_runner.sampler.should_modify_greedy_probs_inplace = True
|
||||
|
||||
|
||||
MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
|
||||
MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor
|
||||
MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace
|
||||
|
||||
@@ -57,11 +57,6 @@ def create_worker(
|
||||
ngram_prompt_lookup_min = (
|
||||
draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
|
||||
|
||||
# TODO(Yizhou): A quick fix, must be refactored ASAP
|
||||
draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1
|
||||
draft_worker_kwargs[
|
||||
"vllm_config"].parallel_config.expert_tensor_parallel_size = 1
|
||||
|
||||
draft_model_config = draft_worker_kwargs["vllm_config"].model_config
|
||||
draft_parallel_config: ParallelConfig = draft_worker_kwargs[
|
||||
'vllm_config'].parallel_config
|
||||
@@ -72,6 +67,13 @@ def create_worker(
|
||||
proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
|
||||
ngram_prompt_lookup_max)
|
||||
else:
|
||||
# TODO(Yizhou): A quick fix, must be refactored ASAP
|
||||
# ngram does not need this fix.
|
||||
draft_worker_kwargs[
|
||||
"vllm_config"].parallel_config.expert_parallel_size = 1
|
||||
draft_worker_kwargs[
|
||||
"vllm_config"].parallel_config.expert_tensor_parallel_size = 1
|
||||
|
||||
draft_tp = draft_parallel_config.tensor_parallel_size
|
||||
target_tp = scorer_worker.parallel_config.tensor_parallel_size
|
||||
|
||||
|
||||
@@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
||||
"""
|
||||
|
||||
def __init__(self, model_runner: ModelRunnerBase):
|
||||
if hasattr(
|
||||
model_runner,
|
||||
"return_hidden_states") and model_runner.return_hidden_states:
|
||||
raise ValueError(
|
||||
"return_hidden_states is not supported for TP1DraftModelRunner."
|
||||
)
|
||||
super().__init__(model_runner)
|
||||
|
||||
self.indices_of_seq_with_bonus_tokens = None
|
||||
@@ -211,6 +205,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
||||
if self.prompt_adapter_config is not None:
|
||||
raise ValueError("TP1DraftModelRunner has no support for "
|
||||
"prompt_adapter_config")
|
||||
if model_input.inputs_embeds is not None:
|
||||
raise ValueError("TP1DraftModelRunner has no support for "
|
||||
"inputs_embeds")
|
||||
if model_input.multi_modal_kwargs:
|
||||
raise ValueError(
|
||||
"TP1DraftModelRunner has no support for multi_modal_kwargs"
|
||||
@@ -272,6 +269,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
||||
|
||||
hidden_states = model_executable(
|
||||
input_ids=model_input.input_tokens,
|
||||
inputs_embeds=None,
|
||||
positions=model_input.input_positions,
|
||||
intermediate_tensors=intermediate_tensors,
|
||||
**MultiModalKwargs.as_kwargs(multi_modal_kwargs,
|
||||
@@ -293,6 +291,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
|
||||
)
|
||||
outputs.append(output)
|
||||
|
||||
if self.return_hidden_states and is_fallback:
|
||||
output.hidden_states = hidden_states
|
||||
|
||||
if model_input.attn_metadata.num_prefills == 0 \
|
||||
and self.indices_of_seq_with_bonus_tokens is not None:
|
||||
assert output.sampled_token_ids is not None
|
||||
|
||||
Reference in New Issue
Block a user