[Test][Feature] Add e2e test for QuaRot model with eagle3 (#7128)

### What this PR does / why we need it?

Add an e2e test for the QuaRot model with eagle3. The test runs both the QuaRot model and the float model, and then compares their acceptance rates. The PRs that adapt the QuaRot model to eagle3 are #6914 and #7038.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
2026-03-16 15:35:55 +08:00
parent 71c21f76f5
commit 9320365dab
3 changed files with 215 additions and 1 deletions
--- a/.github/workflows/misc/model_list.json
+++ b/.github/workflows/misc/model_list.json
@@ -1,6 +1,7 @@
 {
    "models": [
      "AngelSlim/Qwen3-32B_eagle3",
+      "AngelSlim/Qwen3-a3B_eagle3",
      "Anionex/Qwen3-1.7B-W4A8-V1",
      "ArthurZ/ilama-3.2-1B",
      "BAAI/bge-base-en-v1.5",
@@ -207,10 +208,12 @@
      "vllm-ascend/Qwen3-30B-A3B-Puring",
      "vllm-ascend/Qwen3-30B-A3B-W8A8",
      "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
+      "vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot",
      "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
      "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
      "vllm-ascend/Qwen3-32B-W4A4",
      "vllm-ascend/Qwen3-32B-W8A8",
+      "vllm-ascend/Qwen3-32B-W8A8-QuaRot",
      "vllm-ascend/Qwen3-8B",
      "vllm-ascend/Qwen3-8B-W4A8",
      "vllm-ascend/Qwen3-8B-W8A8",
--- a/.github/workflows/scripts/config.yaml
+++ b/.github/workflows/scripts/config.yaml
@@ -54,7 +54,7 @@ e2e-singlecard:
  - name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
    estimated_time: 1500
  - name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
-    estimated_time: 1800
+    estimated_time: 600
  - name: tests/e2e/singlecard/model_runner_v2/test_basic.py
    estimated_time: 80
    is_skipped: true
@@ -101,6 +101,8 @@ e2e-multicard-2-cards:
    estimated_time: 60
  - name: tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
    estimated_time: 223
+  - name: tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py
+    estimated_time: 600
  # Run the test in a separate step to avoid oom
  - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
    estimated_time: 100