[Test] Update accuracy tests of models (#4911)

### What this PR does / why we need it?

Delete accuracy tests for models that are no longer retained:
- Meta-Llama-3.1-8B-Instruct
- llava-1.5-7b-hf
- InternVL2-8B.yaml
- InternVL2_5-8B.yaml
- InternVL3-8B.yaml

Add accuracy tests for the new models:
- Llama-3.2-3B-Instruct
- llava-onevision-qwen2-0.5b-ov-hf
- Qwen3-VL-30B-A3B-Instruct

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
2025-12-15 15:04:20 +08:00
parent b75bfc58f6
commit 70606e0bb9
9 changed files with 24 additions and 50 deletions
--- a/.github/workflows/nightly_test_a2.yaml
+++ b/.github/workflows/nightly_test_a2.yaml
@@ -86,15 +86,13 @@ jobs:
              - Qwen3-8B-W8A8
              - Qwen3-VL-8B-Instruct
              - Qwen2.5-Omni-7B
              - Meta-Llama-3.1-8B-Instruct
          - os: linux-aarch64-a2-1
            model_list:
              - ERNIE-4.5-21B-A3B-PT
              - gemma-3-4b-it
              - internlm-7b
              - InternVL3_5-8B-hf
              - llava-1.5-7b-hf
              - Molmo-7B-D-0924
              - Llama-3.2-3B-Instruct
              - llava-onevision-qwen2-0.5b-ov-hf
          - os: linux-aarch64-a2-2
            model_list:
              - Qwen3-30B-A3B
@@ -103,6 +101,7 @@ jobs:
          - os: linux-aarch64-a2-4
            model_list:
              - Qwen3-Next-80B-A3B-Instruct
              - Qwen3-VL-30B-A3B-Instruct
    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
    with:
      vllm: v0.12.0
--- a/tests/e2e/models/configs/InternVL2-8B.yaml
+++ b/tests/e2e/models/configs/InternVL2-8B.yaml
@@ -1,11 +0,0 @@
 model_name: "OpenGVLab/InternVL2-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/InternVL2_5-8B.yaml
+++ b/tests/e2e/models/configs/InternVL2_5-8B.yaml
@@ -1,11 +0,0 @@
 model_name: "OpenGVLab/InternVL2_5-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/InternVL3-8B.yaml
+++ b/tests/e2e/models/configs/InternVL3-8B.yaml
@@ -1,11 +0,0 @@
 model_name: "OpenGVLab/InternVL3-8B"
 runner: "linux-aarch64-a2-1"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
  - name: "mmmu_val"
    metrics:
    - name: "acc,none"
      value: 0.58
 max_model_len: 32768
 trust_remote_code: True
--- a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml
+++ b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml
@@ -1,11 +1,10 @@
-model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct"
+model_name: "LLM-Research/Llama-3.2-3B-Instruct"
 hardware: "Atlas A2 Series"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.82
+    value: 0.71
  - name: "exact_match,flexible-extract"
-    value: 0.84
+    value: 0.76
 num_fewshot: 5
--- a/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml
+++ b/tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml
@@ -0,0 +1,11 @@
 model_name: "Qwen/Qwen3-Omni-30B-A3B-Instruct"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
 - name: "mmmu_val"
  metrics:
  - name: "acc,none"
    value: 0.52
 max_model_len: 8192
 tensor_parallel_size: 4
 enable_expert_parallel: True
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -5,13 +5,11 @@ Qwen2-Audio-7B-Instruct.yaml
 Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
-Meta-Llama-3.1-8B-Instruct.yaml
+Qwen3-Omni-30B-A3B-Instruct.yaml
-InternVL2-8B.yaml
+InternVL3_5-8B-hf.yaml
 InternVL2_5-8B.yaml
 InternVL3-8B.yaml
 InternVL3_5-8B.yaml
 ERNIE-4.5-21B-A3B-PT.yaml
 gemma-3-4b-it.yaml
 internlm3-8b-instruct.yaml
 Molmo-7B-D-0924.yaml
-llava-1.5-7b-hf.yaml
+llava-onevision-qwen2-0.5b-ov-hf.yaml
 Llama-3.2-3B-Instruct.yaml
--- a/tests/e2e/models/configs/gemma-3-4b-it.yaml
+++ b/tests/e2e/models/configs/gemma-3-4b-it.yaml
@@ -11,3 +11,4 @@ num_fewshot: 5
 apply_chat_template: False
 fewshot_as_multiturn: False
 gpu_memory_utilization: 0.7
 enforce_eager: True
--- a/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml
+++ b/tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml
@@ -1,11 +1,10 @@
-model_name: "llava-hf/llava-1.5-7b-hf"
+model_name: "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
 - name: "ceval-valid"
  metrics:
  - name: "acc,none"
-    value: 0.30
+    value: 0.42
 trust_remote_code: True
 gpu_memory_utilization: 0.8
 dtype: "bfloat16"