[Test][Feature] Add e2e test for QuaRot model with eagle3 (#7128)
### What this PR does / why we need it?
Add an e2e test for the QuaRot model with eagle3. The test runs both the
QuaRot model and the float model, and then compares their acceptance rates.
Related PRs that adapt the QuaRot model to eagle3: #6914, #7038.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
This commit is contained in:
3
.github/workflows/misc/model_list.json
vendored
3
.github/workflows/misc/model_list.json
vendored
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"models": [
|
||||
"AngelSlim/Qwen3-32B_eagle3",
|
||||
"AngelSlim/Qwen3-a3B_eagle3",
|
||||
"Anionex/Qwen3-1.7B-W4A8-V1",
|
||||
"ArthurZ/ilama-3.2-1B",
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
@@ -207,10 +208,12 @@
|
||||
"vllm-ascend/Qwen3-30B-A3B-Puring",
|
||||
"vllm-ascend/Qwen3-30B-A3B-W8A8",
|
||||
"vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
|
||||
"vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot",
|
||||
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
|
||||
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
|
||||
"vllm-ascend/Qwen3-32B-W4A4",
|
||||
"vllm-ascend/Qwen3-32B-W8A8",
|
||||
"vllm-ascend/Qwen3-32B-W8A8-QuaRot",
|
||||
"vllm-ascend/Qwen3-8B",
|
||||
"vllm-ascend/Qwen3-8B-W4A8",
|
||||
"vllm-ascend/Qwen3-8B-W8A8",
|
||||
|
||||
4
.github/workflows/scripts/config.yaml
vendored
4
.github/workflows/scripts/config.yaml
vendored
@@ -54,7 +54,7 @@ e2e-singlecard:
|
||||
- name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
|
||||
estimated_time: 1500
|
||||
- name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
|
||||
estimated_time: 1800
|
||||
estimated_time: 600
|
||||
- name: tests/e2e/singlecard/model_runner_v2/test_basic.py
|
||||
estimated_time: 80
|
||||
is_skipped: true
|
||||
@@ -101,6 +101,8 @@ e2e-multicard-2-cards:
|
||||
estimated_time: 60
|
||||
- name: tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
|
||||
estimated_time: 223
|
||||
- name: tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py
|
||||
estimated_time: 600
|
||||
# Run the test in a separate step to avoid oom
|
||||
- name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
|
||||
estimated_time: 100
|
||||
|
||||
Reference in New Issue
Block a user