[Test] Add accuracy nightly test for new models (#4262)

### What this PR does / why we need it?
Add accuracy nightly test for new models:

PaddlePaddle/ERNIE-4.5-21B-A3B-PT
LLM-Research/Molmo-7B-D-0924
LLM-Research/gemma-2-9b-it
LLM-Research/gemma-3-4b-it
Shanghai_AI_Laboratory/internlm-7b
llava-hf/llava-1.5-7b-hf

- vLLM version: v0.11.2

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2025-12-01 22:28:46 +08:00
committed by GitHub
parent 8e7f5cff6d
commit b6afec73e1
11 changed files with 97 additions and 4 deletions

View File

@@ -0,0 +1,9 @@
# Nightly accuracy-test config for PaddlePaddle/ERNIE-4.5-21B-A3B-PT (added in PR #4262).
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,flexible-extract"
value: 0.71  # expected score for this metric on gsm8k
num_fewshot: 5
trust_remote_code: True  # model repo ships custom code; required to load

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for LLM-Research/Molmo-7B-D-0924 (added in PR #4262).
model_name: "LLM-Research/Molmo-7B-D-0924"
hardware: "Atlas A2 Series"
model: "vllm-vlm"  # vision-language model backend (vs. the default text-only runner)
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.71  # expected accuracy on ceval-valid
max_model_len: 4096
trust_remote_code: True  # model repo ships custom code; required to load
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.8

View File

@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
Qwen3-VL-8B-Instruct.yaml
Qwen2.5-Omni-7B.yaml
Meta-Llama-3.1-8B-Instruct.yaml
InternVL3_5-8B.yaml
InternVL3_5-8B.yaml
ERNIE-4.5-21B-A3B-PT.yaml
gemma-2-9b-it.yaml
gemma-3-4b-it.yaml
internlm-7b.yaml
Molmo-7B-D-0924.yaml
llava-1.5-7b-hf.yaml

View File

@@ -0,0 +1,11 @@
# Nightly accuracy-test config for LLM-Research/gemma-2-9b-it (added in PR #4262).
model_name: "LLM-Research/gemma-2-9b-it"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
# Two scoring modes of the same task are checked independently.
- name: "exact_match,strict-match"
value: 0.46  # expected score, strict answer extraction
- name: "exact_match,flexible-extract"
value: 0.79  # expected score, flexible answer extraction
num_fewshot: 5
gpu_memory_utilization: 0.8

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for LLM-Research/gemma-3-4b-it (added in PR #4262).
model_name: "LLM-Research/gemma-3-4b-it"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
# Both scoring modes happen to have the same expected value for this model.
- name: "exact_match,strict-match"
value: 0.59
- name: "exact_match,flexible-extract"
value: 0.59
num_fewshot: 5
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.7  # NOTE(review): lower than the 0.8 used by sibling configs — presumably tuned for this model; confirm

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for Shanghai_AI_Laboratory/internlm-7b (added in PR #4262).
model_name: "Shanghai_AI_Laboratory/internlm-7b"
hardware: "Atlas A2 Series"
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.42  # expected accuracy on ceval-valid
num_fewshot: 5
max_model_len: 2048  # smaller context than the 4096 default used elsewhere in this suite
trust_remote_code: True  # model repo ships custom code; required to load
dtype: "bfloat16"  # explicit dtype override (harness default is "auto")
apply_chat_template: False
fewshot_as_multiturn: False

View File

@@ -0,0 +1,11 @@
# Nightly accuracy-test config for llava-hf/llava-1.5-7b-hf (added in PR #4262).
model_name: "llava-hf/llava-1.5-7b-hf"
hardware: "Atlas A2 Series"
model: "vllm-vlm"  # vision-language model backend (vs. the default text-only runner)
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.30  # expected accuracy on ceval-valid
trust_remote_code: True
gpu_memory_utilization: 0.8
dtype: "bfloat16"  # explicit dtype override (harness default is "auto")

View File

@@ -39,10 +39,11 @@ def env_config() -> EnvConfig:
def build_model_args(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
dtype = eval_config.get("dtype", "auto")
model_args = {
"pretrained": eval_config["model_name"],
"tensor_parallel_size": tp_size,
"dtype": "auto",
"dtype": dtype,
"trust_remote_code": trust_remote_code,
"max_model_len": max_model_len,
}