[Feat] Refactor rejection sampler (#4975)
### What this PR does / why we need it?
Currently, we are using `AscendRejctionSampler` that extends from
`RejctionSampler` in spec decoding. `AscendRejctionSampler` override
`forward` of `RejctionSampler`, only aming to replace `rejection_sample`
func. This
causes a lot of code of `RejctionSampler` cannot be reused, for example:
- https://github.com/vllm-project/vllm/pull/19482
- https://github.com/vllm-project/vllm/pull/26060
- https://github.com/vllm-project/vllm/pull/29223
#### Proposed Change:
- Delete `AscendRejctionSampler` and use `RejctionSampler` directly in
model runner.
- Patch `RejctionSampler.expand_batch_to_tokens` and
`RejctionSampler.rejection_sample`, maybe a better way is to make them
as custom ops.
- Modify `NPUModelRunner` following
https://github.com/vllm-project/vllm/pull/26060
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- [x] test logits processor for spec decoding
- [x] test logprobs for spec decoding
- [x] test logprobs for spec decoding + async shcheduling (test with
https://github.com/vllm-project/vllm-ascend/pull/4893/)
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from typing import Any
|
||||
@@ -239,3 +240,56 @@ def test_suffix_acceptance(
|
||||
|
||||
# Heuristic: expect at least 80% acceptance rate at the end.
|
||||
assert last_accept_rate > 0.60
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_eagle3", [True], ids=["eagle3"])
|
||||
def test_eagle_logprobs(
|
||||
model_name: str,
|
||||
use_eagle3: bool,
|
||||
):
|
||||
prompt = {"role": "user", "content": "Hello world " * 10}
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
logprobs=1,
|
||||
max_tokens=10,
|
||||
ignore_eos=False)
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
|
||||
ref_outputs = ref_llm.chat([prompt], sampling_params)
|
||||
ref_logprobs = []
|
||||
for output in ref_outputs[0].outputs:
|
||||
for logprobs in output.logprobs:
|
||||
for token_id in logprobs:
|
||||
ref_logprobs.append(logprobs[token_id])
|
||||
del ref_llm
|
||||
|
||||
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
gpu_memory_utilization=0.6,
|
||||
speculative_config={
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 2,
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
enforce_eager=False,
|
||||
) as runner:
|
||||
spec_outputs = runner.model.chat([prompt], sampling_params)
|
||||
|
||||
# Collect logprobs outputs from spec decode LLM.
|
||||
spec_logprobs = []
|
||||
for output in spec_outputs[0].outputs:
|
||||
for logprobs in output.logprobs:
|
||||
for token_id in logprobs:
|
||||
spec_logprobs.append(logprobs[token_id])
|
||||
|
||||
for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
|
||||
assert math.isclose(ref_logprob.logprob,
|
||||
spec_logprob.logprob,
|
||||
rel_tol=5e-2,
|
||||
abs_tol=1e-1)
|
||||
assert ref_logprob.rank == spec_logprob.rank
|
||||
assert ref_logprob.decoded_token == spec_logprob.decoded_token
|
||||
|
||||
Reference in New Issue
Block a user