[Test] Add accuracy nightly test for new models (#4262)
### What this PR does / why we need it?

Add accuracy nightly tests for new models:

- PaddlePaddle/ERNIE-4.5-21B-A3B-PT
- LLM-Research/Molmo-7B-D-0924
- LLM-Research/gemma-2-9b-it
- LLM-Research/gemma-3-4b-it
- Shanghai_AI_Laboratory/internlm-7b
- llava-hf/llava-1.5-7b-hf

- vLLM version: v0.11.2

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
9
tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml
Normal file
9
tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.71
|
||||
num_fewshot: 5
|
||||
trust_remote_code: True
|
||||
13
tests/e2e/models/configs/Molmo-7B-D-0924.yaml
Normal file
13
tests/e2e/models/configs/Molmo-7B-D-0924.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
model_name: "LLM-Research/Molmo-7B-D-0924"
|
||||
hardware: "Atlas A2 Series"
|
||||
model: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "ceval-valid"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.71
|
||||
max_model_len: 4096
|
||||
trust_remote_code: True
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
gpu_memory_utilization: 0.8
|
||||
@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
|
||||
Qwen3-VL-8B-Instruct.yaml
|
||||
Qwen2.5-Omni-7B.yaml
|
||||
Meta-Llama-3.1-8B-Instruct.yaml
|
||||
InternVL3_5-8B.yaml
|
||||
InternVL3_5-8B.yaml
|
||||
ERNIE-4.5-21B-A3B-PT.yaml
|
||||
gemma-2-9b-it.yaml
|
||||
gemma-3-4b-it.yaml
|
||||
internlm-7b.yaml
|
||||
Molmo-7B-D-0924.yaml
|
||||
llava-1.5-7b-hf.yaml
|
||||
|
||||
11
tests/e2e/models/configs/gemma-2-9b-it.yaml
Normal file
11
tests/e2e/models/configs/gemma-2-9b-it.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
model_name: "LLM-Research/gemma-2-9b-it"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.46
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.79
|
||||
num_fewshot: 5
|
||||
gpu_memory_utilization: 0.8
|
||||
13
tests/e2e/models/configs/gemma-3-4b-it.yaml
Normal file
13
tests/e2e/models/configs/gemma-3-4b-it.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
model_name: "LLM-Research/gemma-3-4b-it"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.59
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.59
|
||||
num_fewshot: 5
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
gpu_memory_utilization: 0.7
|
||||
13
tests/e2e/models/configs/internlm-7b.yaml
Normal file
13
tests/e2e/models/configs/internlm-7b.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
model_name: "Shanghai_AI_Laboratory/internlm-7b"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "ceval-valid"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.42
|
||||
num_fewshot: 5
|
||||
max_model_len: 2048
|
||||
trust_remote_code: True
|
||||
dtype: "bfloat16"
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
11
tests/e2e/models/configs/llava-1.5-7b-hf.yaml
Normal file
11
tests/e2e/models/configs/llava-1.5-7b-hf.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
model_name: "llava-hf/llava-1.5-7b-hf"
|
||||
hardware: "Atlas A2 Series"
|
||||
model: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "ceval-valid"
|
||||
metrics:
|
||||
- name: "acc,none"
|
||||
value: 0.30
|
||||
trust_remote_code: True
|
||||
gpu_memory_utilization: 0.8
|
||||
dtype: "bfloat16"
|
||||
@@ -39,10 +39,11 @@ def env_config() -> EnvConfig:
|
||||
def build_model_args(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
max_model_len = eval_config.get("max_model_len", 4096)
|
||||
dtype = eval_config.get("dtype", "auto")
|
||||
model_args = {
|
||||
"pretrained": eval_config["model_name"],
|
||||
"tensor_parallel_size": tp_size,
|
||||
"dtype": "auto",
|
||||
"dtype": dtype,
|
||||
"trust_remote_code": trust_remote_code,
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user