[Feat][Bugfix][main] Adapted SP to eagle3 (#5562)
### What this PR does / why we need it?
Adapts sequence parallelism (`sp`) to `eagle3`.
There may still be some problems, e.g., accuracy in some scenarios and `sp` + `dp`; we will fix them later.
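For context, `sp` only kicks in for long prefills; the new e2e test below pads its prompts so that `query_lens > 1000`. A minimal, hypothetical sketch of that kind of length-based gating (the helper name and exact rule are illustrative assumptions, not the actual implementation):

```python
# Hypothetical sketch of length-gated sequence parallelism (SP).
# The 1000-token threshold mirrors the comment in the new e2e test;
# this helper is illustrative, not the real vllm-ascend code path.
def should_enable_sp(query_len: int, tp_size: int, threshold: int = 1000) -> bool:
    # SP splits the sequence across parallel ranks, so it only pays off for
    # long prefills; short decode/verify steps stay on the normal path.
    return tp_size > 1 and query_len > threshold
```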
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
We tested it mainly with a new `e2e` test:
```shell
pytest -s tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py::test_llama_qwen_eagle_acceptance
```
```text
.
=============================== warnings summary ===============================
<frozen importlib._bootstrap>:241
<frozen importlib._bootstrap>:241: DeprecationWarning: builtin type SwigPyPacked has no __module__ attribute
<frozen importlib._bootstrap>:241
<frozen importlib._bootstrap>:241: DeprecationWarning: builtin type SwigPyObject has no __module__ attribute
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
============= 3 passed, 1 skipped, 2 warnings in 142.05s (0:02:22) =============
```
It passed.
- vLLM version: v0.13.0
- vLLM main: 7157596103
Signed-off-by: drslark <slarksblood@qq.com>
```diff
@@ -34,6 +34,10 @@ BASELINES = {
     "eagle3": [0.68, 0.40, 0.18],
 }
 
+BASELINES_SP = {
+    "eagle3": [0.68, 0.40, 0.18],
+}
+
 
 @pytest.fixture
 def test_prompts():
@@ -371,3 +375,111 @@ def test_llama_qwen_eagle_acceptance(
         print(f"golden: {golden}")
 
     assert match
+
+
+# TODO the function of sp in eagle3 is improving gradually,
+# there are still problems when enable sp + dp and some unknown scenes.
+# this e2e should also be improving gradually.
+@pytest.mark.parametrize("method", ["eagle3"])
+@pytest.mark.parametrize("num_speculative_tokens", [3])
+@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
+@pytest.mark.parametrize("async_scheduling", [True, False])
+def test_eagle3_sp_acceptance(
+    method: str,
+    num_speculative_tokens: int,
+    disable_padded_drafter_batch: bool,
+    async_scheduling: bool,
+):
+    if disable_padded_drafter_batch and async_scheduling:
+        pytest.skip(
+            "skip disable_padded_drafter_batch=True and async_scheduling=True",
+        )
+
+    main_model_name = MODELS[method]["main"]
+    spec_model_name = MODELS[method]["spec"]
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        main_model_name,
+        trust_remote_code=True,
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        ignore_eos=False,
+        max_tokens=256,
+    )
+
+    # sp will only be enabled when query_lens > 1000
+    prompts = [
+        {
+            "role": "user",
+            "content": " " * 1000 + "Hello, my name is",
+        },
+        {
+            "role": "user",
+            "content": " " * 1000 + "The president of the United States is",
+        },
+        {
+            "role": "user",
+            "content": " " * 1000 + "The capital of France is",
+        },
+        {
+            "role": "user",
+            "content": " " * 1000 + "The future of AI is",
+        },
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [prompt],
+            tokenize=False,
+            add_generation_prompt=True,
+        ) for prompt in prompts
+    ]
+
+    speculative_config = {
+        "method": method,
+        "num_speculative_tokens": num_speculative_tokens,
+        "disable_padded_drafter_batch": disable_padded_drafter_batch,
+        "model": spec_model_name,
+    }
+
+    compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])
+
+    with VllmRunner(
+        main_model_name,
+        enforce_eager=True,
+        max_model_len=8192,
+        disable_log_stats=False,
+        tensor_parallel_size=1,
+        max_num_seqs=256,
+        distributed_executor_backend="mp",
+        gpu_memory_utilization=0.7,
+        speculative_config=speculative_config,
+        compilation_config=compilation_config,
+        async_scheduling=async_scheduling,
+    ) as llm:
+        _ = llm.generate(prompts, sampling_params)
+        metrics = llm.model.get_metrics()
+
+    num_drafts = 0
+    num_accepted_tokens_per_pos = [0] * num_speculative_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                num_accepted_tokens_per_pos[pos] += metric.values[pos]
+
+    acceptance_per_pos = [
+        num_accepted_tokens / num_drafts
+        for num_accepted_tokens in num_accepted_tokens_per_pos
+    ]
+    golden = BASELINES_SP[method]
+
+    match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
+    if not match:
+        print(f"acceptance_per_pos: {acceptance_per_pos}")
+        print(f"golden: {golden}")
+
+    assert match
```
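For reference, the acceptance check in `test_eagle3_sp_acceptance` boils down to the arithmetic below. The counts are made up for illustration; only the `eagle3` baseline `[0.68, 0.40, 0.18]` and the 0.06 tolerance come from the diff above:

```python
# Illustrative numbers; the baseline and tolerance are the ones used in the test above.
num_drafts = 500                              # hypothetical total number of drafts
num_accepted_tokens_per_pos = [345, 198, 92]  # hypothetical per-position counters

acceptance_per_pos = [n / num_drafts for n in num_accepted_tokens_per_pos]
# -> [0.69, 0.396, 0.184]

golden = [0.68, 0.40, 0.18]                   # BASELINES_SP["eagle3"]
assert all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
```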