[Test] Add accuracy nightly test for new models (#4262)

### What this PR does / why we need it?
Add accuracy nightly test for new models:

PaddlePaddle/ERNIE-4.5-21B-A3B-PT
LLM-Research/Molmo-7B-D-0924
LLM-Research/gemma-2-9b-it
LLM-Research/gemma-3-4b-it
Shanghai_AI_Laboratory/internlm-7b
llava-hf/llava-1.5-7b-hf

- vLLM version: v0.11.2

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2025-12-01 22:28:46 +08:00
committed by GitHub
parent 8e7f5cff6d
commit b6afec73e1
11 changed files with 97 additions and 4 deletions

View File

@@ -0,0 +1,9 @@
# Nightly accuracy-test config for PaddlePaddle/ERNIE-4.5-21B-A3B-PT (added in PR #4262).
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,flexible-extract"
value: 0.71  # expected score for this metric on gsm8k
num_fewshot: 5
trust_remote_code: True  # model repo ships custom code; required to load

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for LLM-Research/Molmo-7B-D-0924 (added in PR #4262).
model_name: "LLM-Research/Molmo-7B-D-0924"
hardware: "Atlas A2 Series"
model: "vllm-vlm"  # vision-language model backend (vs. the default text-only runner)
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.71  # expected accuracy on ceval-valid
max_model_len: 4096
trust_remote_code: True  # model repo ships custom code; required to load
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.8

View File

@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
Qwen3-VL-8B-Instruct.yaml
Qwen2.5-Omni-7B.yaml
Meta-Llama-3.1-8B-Instruct.yaml
InternVL3_5-8B.yaml
InternVL3_5-8B.yaml
ERNIE-4.5-21B-A3B-PT.yaml
gemma-2-9b-it.yaml
gemma-3-4b-it.yaml
internlm-7b.yaml
Molmo-7B-D-0924.yaml
llava-1.5-7b-hf.yaml

View File

@@ -0,0 +1,11 @@
# Nightly accuracy-test config for LLM-Research/gemma-2-9b-it (added in PR #4262).
model_name: "LLM-Research/gemma-2-9b-it"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
# Two scoring modes of the same task are checked independently.
- name: "exact_match,strict-match"
value: 0.46  # expected score, strict answer extraction
- name: "exact_match,flexible-extract"
value: 0.79  # expected score, flexible answer extraction
num_fewshot: 5
gpu_memory_utilization: 0.8

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for LLM-Research/gemma-3-4b-it (added in PR #4262).
model_name: "LLM-Research/gemma-3-4b-it"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
# Both scoring modes happen to have the same expected value for this model.
- name: "exact_match,strict-match"
value: 0.59
- name: "exact_match,flexible-extract"
value: 0.59
num_fewshot: 5
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.7  # NOTE(review): lower than the 0.8 used by sibling configs — presumably tuned for this model; confirm

View File

@@ -0,0 +1,13 @@
# Nightly accuracy-test config for Shanghai_AI_Laboratory/internlm-7b (added in PR #4262).
model_name: "Shanghai_AI_Laboratory/internlm-7b"
hardware: "Atlas A2 Series"
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.42  # expected accuracy on ceval-valid
num_fewshot: 5
max_model_len: 2048  # smaller context than the 4096 default used elsewhere in this suite
trust_remote_code: True  # model repo ships custom code; required to load
dtype: "bfloat16"  # explicit dtype override (harness default is "auto")
apply_chat_template: False
fewshot_as_multiturn: False

View File

@@ -0,0 +1,11 @@
# Nightly accuracy-test config for llava-hf/llava-1.5-7b-hf (added in PR #4262).
model_name: "llava-hf/llava-1.5-7b-hf"
hardware: "Atlas A2 Series"
model: "vllm-vlm"  # vision-language model backend (vs. the default text-only runner)
tasks:
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.30  # expected accuracy on ceval-valid
trust_remote_code: True
gpu_memory_utilization: 0.8
dtype: "bfloat16"  # explicit dtype override (harness default is "auto")

View File

@@ -39,10 +39,11 @@ def env_config() -> EnvConfig:
def build_model_args(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
dtype = eval_config.get("dtype", "auto")
model_args = {
"pretrained": eval_config["model_name"],
"tensor_parallel_size": tp_size,
"dtype": "auto",
"dtype": dtype,
"trust_remote_code": trust_remote_code,
"max_model_len": max_model_len,
}