From dc1a6cb5039b5ee1520fde8cf85b99f7ce33ec7b Mon Sep 17 00:00:00 2001 From: ZengSilong <121143079+MrZ20@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:46:39 +0800 Subject: [PATCH] [Test]Add accuracy test for multiple models (#3823) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Add accuracy test for multiple models: - Meta_Llama_3.1_8B_Instruct - Qwen2.5-Omni-7B - Qwen3-VL-8B-Instruct - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac --------- Signed-off-by: MrZ20 <2609716663@qq.com> --- .github/workflows/accuracy_test.yaml | 11 +++++++++-- tests/e2e/models/configs/DeepSeek-V2-Lite.yaml | 1 - .../models/configs/Meta-Llama-3.1-8B-Instruct.yaml | 11 +++++++++++ tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml | 10 ++++++++++ tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml | 3 +-- tests/e2e/models/configs/Qwen3-30B-A3B.yaml | 3 +-- tests/e2e/models/configs/Qwen3-8B-Base.yaml | 1 - tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml | 11 +++++++++++ tests/e2e/models/configs/accuracy.txt | 3 +++ 9 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml create mode 100644 tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml create mode 100644 tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 0822fc59..5621b615 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -49,8 +49,9 @@ jobs: model_name: Qwen3-8B - runner: a2-1 model_name: Qwen2.5-VL-7B-Instruct - - runner: a2-1 - model_name: Qwen2-Audio-7B-Instruct + # To do: This model has a bug that needs to be fixed and readded + # - runner: a2-1 + # model_name: Qwen2-Audio-7B-Instruct - runner: a2-2 model_name: Qwen3-30B-A3B - runner: a2-2 @@ -61,6 +62,12 @@ jobs: model_name: Qwen3-Next-80B-A3B-Instruct - runner: a2-1 model_name: Qwen3-8B-W8A8 + - runner: a2-1 + model_name: Qwen3-VL-8B-Instruct + - runner: a2-1 + model_name: Qwen2.5-Omni-7B + - runner: a2-1 + model_name: Meta-Llama-3.1-8B-Instruct fail-fast: false # test will be triggered when tag 'accuracy-test' & 'ready-for-test' if: >- diff --git a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml index 848a4911..c23be35b 100644 --- a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml +++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml @@ -1,5 +1,4 @@ model_name: "deepseek-ai/DeepSeek-V2-Lite" -runner: "linux-aarch64-a2-2" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" diff --git a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml new file mode 100644 index 00000000..4590116c --- /dev/null +++ b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,11 @@ +model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct" +hardware: "Atlas A2 Series" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.82 + - name: "exact_match,flexible-extract" + value: 0.84 + +num_fewshot: 5 diff --git a/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml b/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml new file mode 100644 index 00000000..dec228dd --- /dev/null +++ b/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen2.5-Omni-7B" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.52 +max_model_len: 8192 +gpu_memory_utilization: 0.7 diff --git a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml index 3543e0c2..85489899 100644 --- a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml +++ b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen2.5-VL-7B-Instruct" -runner: "linux-aarch64-a2-1" hardware: "Atlas A2 Series" model: "vllm-vlm" tasks: @@ -7,4 +6,4 @@ tasks: metrics: - name: "acc,none" value: 0.51 -max_model_len: 8192 \ No newline at end of file +max_model_len: 8192 diff --git a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml index 6b042523..b97f6dae 100644 --- a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml +++ b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen3-30B-A3B" -runner: "linux-aarch64-a2-2" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" @@ -17,4 +16,4 @@ gpu_memory_utilization: 0.6 enable_expert_parallel: True tensor_parallel_size: 2 apply_chat_template: False -fewshot_as_multiturn: False \ No newline at end of file +fewshot_as_multiturn: False diff --git a/tests/e2e/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/models/configs/Qwen3-8B-Base.yaml index 21243615..73026446 100644 --- a/tests/e2e/models/configs/Qwen3-8B-Base.yaml +++ b/tests/e2e/models/configs/Qwen3-8B-Base.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen3-8B-Base" -runner: "linux-aarch64-a2-1" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" diff --git a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml new file mode 100644 index 00000000..8803a120 --- /dev/null +++ b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-VL-8B-Instruct" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.55 +max_model_len: 8192 +batch_size: 32 +gpu_memory_utilization: 0.7 diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt index 3bdcfd8a..5a839071 100644 --- a/tests/e2e/models/configs/accuracy.txt +++ b/tests/e2e/models/configs/accuracy.txt @@ -6,3 +6,6 @@ Qwen2-7B.yaml Qwen2-VL-7B-Instruct.yaml Qwen2-Audio-7B-Instruct.yaml Qwen3-VL-30B-A3B-Instruct.yaml +Qwen3-VL-8B-Instruct.yaml +Qwen2.5-Omni-7B.yaml +Meta-Llama-3.1-8B-Instruct.yaml