#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py
#
import gc
import multiprocessing
from multiprocessing import Queue

import lm_eval
import pytest
import torch

# Pre-trained model path on Hugging Face.
MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
# Math reasoning benchmark (Grade School Math 8K).
TASK = "gsm8k"
# Answer validation requiring format consistency.
FILTER = "exact_match,strict-match"
# 3% relative tolerance for numerical accuracy.
RTOL = 0.03
# Baseline accuracy after vLLM optimization.
EXPECTED_VALUE = 0.3843821076573162
def run_test(model_name, queue, more_args=None):
    """Evaluate ``model_name`` on GSM8K via lm-eval and report the score.

    Intended to run in a child process (see ``test_lm_eval_accuracy``) so
    that all NPU memory it allocates is reclaimed when the process exits.

    Args:
        model_name: Hugging Face model identifier to evaluate.
        queue: ``multiprocessing.Queue`` that receives the accuracy score.
        more_args: Optional extra ``key=value`` pairs (comma-joined) to
            append to the vLLM ``model_args`` string.
    """
    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=TASK,
        batch_size="auto",
    )
    result = results["results"][TASK][FILTER]
    print(100 * "*", "\nThe accuracy test result:", result)
    # Hand the score back to the parent before cleanup.
    queue.put(result)
    # Release the evaluation results and NPU cache before the process exits.
    del results
    torch.npu.empty_cache()
    gc.collect()
@pytest.mark.parametrize("model", MODELS)
def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):
    """Check GSM8K accuracy of ``model`` stays within RTOL of the baseline.

    The benchmark runs in a subprocess so the NPU memory it allocates is
    fully reclaimed by the OS when the child exits.
    """
    with monkeypatch.context():
        result_queue: Queue[float] = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_test,
                                    args=(
                                        model,
                                        result_queue,
                                    ))
        p.start()
        p.join()
        # If the child crashed it never put a result on the queue; fail
        # fast with a clear message instead of blocking forever on get().
        assert p.exitcode == 0, \
            f"Accuracy subprocess exited with code {p.exitcode}"
        result = result_queue.get(timeout=60)
        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"