From 70606e0bb93cc23c1f8d5dfb1b681bd24e66d2ab Mon Sep 17 00:00:00 2001 From: SILONG ZENG <2609716663@qq.com> Date: Mon, 15 Dec 2025 15:04:20 +0800 Subject: [PATCH] [Test]update accuracy test of models (#4911) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Delete accuracy tests for models that are no longer retained: - Meta-Llama-3.1-8B-Instruct - llava-1.5-7b-hf - InternVL2-8B - InternVL2_5-8B - InternVL3-8B Add accuracy tests for the new models: - Llama-3.2-3B-Instruct - llava-onevision-qwen2-0.5b-ov-hf - Qwen3-VL-30B-A3B-Instruct - Qwen3-Omni-30B-A3B-Instruct - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: MrZ20 <2609716663@qq.com> --- .github/workflows/nightly_test_a2.yaml | 7 +++---- tests/e2e/models/configs/InternVL2-8B.yaml | 11 ----------- tests/e2e/models/configs/InternVL2_5-8B.yaml | 11 ----------- tests/e2e/models/configs/InternVL3-8B.yaml | 11 ----------- ....1-8B-Instruct.yaml => Llama-3.2-3B-Instruct.yaml} | 7 +++---- .../models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml | 11 +++++++++++ tests/e2e/models/configs/accuracy.txt | 10 ++++------ tests/e2e/models/configs/gemma-3-4b-it.yaml | 1 + ...-hf.yaml => llava-onevision-qwen2-0.5b-ov-hf.yaml} | 5 ++--- 9 files changed, 24 insertions(+), 50 deletions(-) delete mode 100644 tests/e2e/models/configs/InternVL2-8B.yaml delete mode 100644 tests/e2e/models/configs/InternVL2_5-8B.yaml delete mode 100644 tests/e2e/models/configs/InternVL3-8B.yaml rename tests/e2e/models/configs/{Meta-Llama-3.1-8B-Instruct.yaml => Llama-3.2-3B-Instruct.yaml} (64%) create mode 100644 tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml rename tests/e2e/models/configs/{llava-1.5-7b-hf.yaml => llava-onevision-qwen2-0.5b-ov-hf.yaml} (68%) diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml index d13e79f1..dee605cc 100644 --- 
a/.github/workflows/nightly_test_a2.yaml +++ b/.github/workflows/nightly_test_a2.yaml @@ -86,15 +86,13 @@ jobs: - Qwen3-8B-W8A8 - Qwen3-VL-8B-Instruct - Qwen2.5-Omni-7B - - Meta-Llama-3.1-8B-Instruct - os: linux-aarch64-a2-1 model_list: - ERNIE-4.5-21B-A3B-PT - - gemma-3-4b-it - - internlm-7b - InternVL3_5-8B-hf - - llava-1.5-7b-hf - Molmo-7B-D-0924 + - Llama-3.2-3B-Instruct + - llava-onevision-qwen2-0.5b-ov-hf - os: linux-aarch64-a2-2 model_list: - Qwen3-30B-A3B @@ -103,6 +101,7 @@ jobs: - os: linux-aarch64-a2-4 model_list: - Qwen3-Next-80B-A3B-Instruct + - Qwen3-VL-30B-A3B-Instruct uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: vllm: v0.12.0 diff --git a/tests/e2e/models/configs/InternVL2-8B.yaml b/tests/e2e/models/configs/InternVL2-8B.yaml deleted file mode 100644 index bf705365..00000000 --- a/tests/e2e/models/configs/InternVL2-8B.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model_name: "OpenGVLab/InternVL2-8B" -runner: "linux-aarch64-a2-1" -hardware: "Atlas A2 Series" -model: "vllm-vlm" -tasks: - - name: "mmmu_val" - metrics: - - name: "acc,none" - value: 0.58 -max_model_len: 32768 -trust_remote_code: True diff --git a/tests/e2e/models/configs/InternVL2_5-8B.yaml b/tests/e2e/models/configs/InternVL2_5-8B.yaml deleted file mode 100644 index d8c1fafe..00000000 --- a/tests/e2e/models/configs/InternVL2_5-8B.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model_name: "OpenGVLab/InternVL2_5-8B" -runner: "linux-aarch64-a2-1" -hardware: "Atlas A2 Series" -model: "vllm-vlm" -tasks: - - name: "mmmu_val" - metrics: - - name: "acc,none" - value: 0.58 -max_model_len: 32768 -trust_remote_code: True diff --git a/tests/e2e/models/configs/InternVL3-8B.yaml b/tests/e2e/models/configs/InternVL3-8B.yaml deleted file mode 100644 index d07dc6f9..00000000 --- a/tests/e2e/models/configs/InternVL3-8B.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model_name: "OpenGVLab/InternVL3-8B" -runner: "linux-aarch64-a2-1" -hardware: "Atlas A2 Series" -model: "vllm-vlm" -tasks: - - name: "mmmu_val" - 
metrics: - - name: "acc,none" - value: 0.58 -max_model_len: 32768 -trust_remote_code: True diff --git a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml b/tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml similarity index 64% rename from tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml rename to tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml index 4590116c..0b9a1009 100644 --- a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml +++ b/tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml @@ -1,11 +1,10 @@ -model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct" +model_name: "LLM-Research/Llama-3.2-3B-Instruct" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.82 + value: 0.71 - name: "exact_match,flexible-extract" - value: 0.84 - + value: 0.76 num_fewshot: 5 diff --git a/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml new file mode 100644 index 00000000..cdf3866b --- /dev/null +++ b/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-Omni-30B-A3B-Instruct" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.52 +max_model_len: 8192 +tensor_parallel_size: 4 +enable_expert_parallel: True diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt index d4238488..3361b949 100644 --- a/tests/e2e/models/configs/accuracy.txt +++ b/tests/e2e/models/configs/accuracy.txt @@ -5,13 +5,11 @@ Qwen2-Audio-7B-Instruct.yaml Qwen3-VL-30B-A3B-Instruct.yaml Qwen3-VL-8B-Instruct.yaml Qwen2.5-Omni-7B.yaml -Meta-Llama-3.1-8B-Instruct.yaml -InternVL2-8B.yaml -InternVL2_5-8B.yaml -InternVL3-8B.yaml -InternVL3_5-8B.yaml +Qwen3-Omni-30B-A3B-Instruct.yaml +InternVL3_5-8B-hf.yaml ERNIE-4.5-21B-A3B-PT.yaml gemma-3-4b-it.yaml internlm3-8b-instruct.yaml Molmo-7B-D-0924.yaml -llava-1.5-7b-hf.yaml 
+llava-onevision-qwen2-0.5b-ov-hf.yaml +Llama-3.2-3B-Instruct.yaml diff --git a/tests/e2e/models/configs/gemma-3-4b-it.yaml b/tests/e2e/models/configs/gemma-3-4b-it.yaml index 42366800..4305db95 100644 --- a/tests/e2e/models/configs/gemma-3-4b-it.yaml +++ b/tests/e2e/models/configs/gemma-3-4b-it.yaml @@ -11,3 +11,4 @@ num_fewshot: 5 apply_chat_template: False fewshot_as_multiturn: False gpu_memory_utilization: 0.7 +enforce_eager: True diff --git a/tests/e2e/models/configs/llava-1.5-7b-hf.yaml b/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml similarity index 68% rename from tests/e2e/models/configs/llava-1.5-7b-hf.yaml rename to tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml index 7bd69de9..40ce9b5e 100644 --- a/tests/e2e/models/configs/llava-1.5-7b-hf.yaml +++ b/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml @@ -1,11 +1,10 @@ -model_name: "llava-hf/llava-1.5-7b-hf" +model_name: "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" hardware: "Atlas A2 Series" model: "vllm-vlm" tasks: - name: "ceval-valid" metrics: - name: "acc,none" - value: 0.30 + value: 0.42 trust_remote_code: True gpu_memory_utilization: 0.8 -dtype: "bfloat16"