From 70606e0bb93cc23c1f8d5dfb1b681bd24e66d2ab Mon Sep 17 00:00:00 2001
From: SILONG ZENG <2609716663@qq.com>
Date: Mon, 15 Dec 2025 15:04:20 +0800
Subject: [PATCH] [Test] Update accuracy tests of models (#4911)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Delete accuracy tests for models that are no longer retained:
- Meta-Llama-3.1-8B-Instruct
- llava-1.5-7b-hf
- InternVL2-8B
- InternVL2_5-8B
- InternVL3-8B

Add accuracy tests for the new models:
- Llama-3.2-3B-Instruct
- llava-onevision-qwen2-0.5b-ov-hf
- Qwen3-VL-30B-A3B-Instruct

- vLLM version: v0.12.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
---
 .github/workflows/nightly_test_a2.yaml                |  7 +++----
 tests/e2e/models/configs/InternVL2-8B.yaml            | 11 -----------
 tests/e2e/models/configs/InternVL2_5-8B.yaml          | 11 -----------
 tests/e2e/models/configs/InternVL3-8B.yaml            | 11 -----------
 ....1-8B-Instruct.yaml => Llama-3.2-3B-Instruct.yaml} |  7 +++----
 .../models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml   | 11 +++++++++++
 tests/e2e/models/configs/accuracy.txt                 | 10 ++++------
 tests/e2e/models/configs/gemma-3-4b-it.yaml           |  1 +
 ...-hf.yaml => llava-onevision-qwen2-0.5b-ov-hf.yaml} |  5 ++---
 9 files changed, 24 insertions(+), 50 deletions(-)
 delete mode 100644 tests/e2e/models/configs/InternVL2-8B.yaml
 delete mode 100644 tests/e2e/models/configs/InternVL2_5-8B.yaml
 delete mode 100644 tests/e2e/models/configs/InternVL3-8B.yaml
 rename tests/e2e/models/configs/{Meta-Llama-3.1-8B-Instruct.yaml => Llama-3.2-3B-Instruct.yaml} (64%)
 create mode 100644 tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml
 rename tests/e2e/models/configs/{llava-1.5-7b-hf.yaml => llava-onevision-qwen2-0.5b-ov-hf.yaml} (68%)

diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml
index d13e79f1..dee605cc 100644
--- a/.github/workflows/nightly_test_a2.yaml
+++ b/.github/workflows/nightly_test_a2.yaml
@@ -86,15 +86,13 @@ jobs:
               - Qwen3-8B-W8A8
               - Qwen3-VL-8B-Instruct
               - Qwen2.5-Omni-7B
-              - Meta-Llama-3.1-8B-Instruct
           - os: linux-aarch64-a2-1
             model_list:
               - ERNIE-4.5-21B-A3B-PT
-              - gemma-3-4b-it
-              - internlm-7b
               - InternVL3_5-8B-hf
-              - llava-1.5-7b-hf
               - Molmo-7B-D-0924
+              - Llama-3.2-3B-Instruct
+              - llava-onevision-qwen2-0.5b-ov-hf
           - os: linux-aarch64-a2-2
             model_list:
               - Qwen3-30B-A3B
@@ -103,6 +101,7 @@ jobs:
           - os: linux-aarch64-a2-4
             model_list:
               - Qwen3-Next-80B-A3B-Instruct
+              - Qwen3-VL-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
       vllm: v0.12.0
diff --git a/tests/e2e/models/configs/InternVL2-8B.yaml b/tests/e2e/models/configs/InternVL2-8B.yaml
deleted file mode 100644
index bf705365..00000000
--- a/tests/e2e/models/configs/InternVL2-8B.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-model_name: "OpenGVLab/InternVL2-8B"
-runner: "linux-aarch64-a2-1"
-hardware: "Atlas A2 Series"
-model: "vllm-vlm"
-tasks:
-  - name: "mmmu_val"
-    metrics:
-    - name: "acc,none"
-      value: 0.58
-max_model_len: 32768
-trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL2_5-8B.yaml b/tests/e2e/models/configs/InternVL2_5-8B.yaml
deleted file mode 100644
index d8c1fafe..00000000
--- a/tests/e2e/models/configs/InternVL2_5-8B.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-model_name: "OpenGVLab/InternVL2_5-8B"
-runner: "linux-aarch64-a2-1"
-hardware: "Atlas A2 Series"
-model: "vllm-vlm"
-tasks:
-  - name: "mmmu_val"
-    metrics:
-    - name: "acc,none"
-      value: 0.58
-max_model_len: 32768
-trust_remote_code: True
diff --git a/tests/e2e/models/configs/InternVL3-8B.yaml b/tests/e2e/models/configs/InternVL3-8B.yaml
deleted file mode 100644
index d07dc6f9..00000000
--- a/tests/e2e/models/configs/InternVL3-8B.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-model_name: "OpenGVLab/InternVL3-8B"
-runner: "linux-aarch64-a2-1"
-hardware: "Atlas A2 Series"
-model: "vllm-vlm"
-tasks:
-  - name: "mmmu_val"
-    metrics:
-    - name: "acc,none"
-      value: 0.58
-max_model_len: 32768
-trust_remote_code: True
diff --git a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml b/tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml
similarity index 64%
rename from tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml
rename to tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml
index 4590116c..0b9a1009 100644
--- a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml
+++ b/tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml
@@ -1,11 +1,10 @@
-model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct"
+model_name: "LLM-Research/Llama-3.2-3B-Instruct"
 hardware: "Atlas A2 Series"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.82
+    value: 0.71
   - name: "exact_match,flexible-extract"
-    value: 0.84
-
+    value: 0.76
 num_fewshot: 5
diff --git a/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml
new file mode 100644
index 00000000..cdf3866b
--- /dev/null
+++ b/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml
@@ -0,0 +1,11 @@
+model_name: "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.52
+max_model_len: 8192
+tensor_parallel_size: 4
+enable_expert_parallel: True
diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt
index d4238488..3361b949 100644
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -5,13 +5,11 @@ Qwen2-Audio-7B-Instruct.yaml
 Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
-Meta-Llama-3.1-8B-Instruct.yaml
-InternVL2-8B.yaml
-InternVL2_5-8B.yaml
-InternVL3-8B.yaml
-InternVL3_5-8B.yaml
+Qwen3-Omni-30B-A3B-Instruct.yaml
+InternVL3_5-8B-hf.yaml
 ERNIE-4.5-21B-A3B-PT.yaml
 gemma-3-4b-it.yaml
 internlm3-8b-instruct.yaml
 Molmo-7B-D-0924.yaml
-llava-1.5-7b-hf.yaml
+llava-onevision-qwen2-0.5b-ov-hf.yaml
+Llama-3.2-3B-Instruct.yaml
diff --git a/tests/e2e/models/configs/gemma-3-4b-it.yaml b/tests/e2e/models/configs/gemma-3-4b-it.yaml
index 42366800..4305db95 100644
--- a/tests/e2e/models/configs/gemma-3-4b-it.yaml
+++ b/tests/e2e/models/configs/gemma-3-4b-it.yaml
@@ -11,3 +11,4 @@ num_fewshot: 5
 apply_chat_template: False
 fewshot_as_multiturn: False
 gpu_memory_utilization: 0.7
+enforce_eager: True
diff --git a/tests/e2e/models/configs/llava-1.5-7b-hf.yaml b/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml
similarity index 68%
rename from tests/e2e/models/configs/llava-1.5-7b-hf.yaml
rename to tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml
index 7bd69de9..40ce9b5e 100644
--- a/tests/e2e/models/configs/llava-1.5-7b-hf.yaml
+++ b/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml
@@ -1,11 +1,10 @@
-model_name: "llava-hf/llava-1.5-7b-hf"
+model_name: "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
 - name: "ceval-valid"
   metrics:
   - name: "acc,none"
-    value: 0.30
+    value: 0.42
 trust_remote_code: True
 gpu_memory_utilization: 0.8
-dtype: "bfloat16"