[Test] Add accuracy nightly test for new models (#4262)
### What this PR does / why we need it?

Add nightly accuracy tests for new models:

- PaddlePaddle/ERNIE-4.5-21B-A3B-PT
- LLM-Research/Molmo-7B-D-0924
- LLM-Research/gemma-2-9b-it
- LLM-Research/gemma-3-4b-it
- Shanghai_AI_Laboratory/internlm-7b
- llava-hf/llava-1.5-7b-hf

- vLLM version: v0.11.2

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -59,7 +59,7 @@ jobs:
|
|||||||
name: ${{inputs.model_list}} accuracy test
|
name: ${{inputs.model_list}} accuracy test
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
|
image: "${{ inputs.image }}"
|
||||||
env:
|
env:
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
|
GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
|
||||||
@@ -111,6 +111,12 @@ jobs:
|
|||||||
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
|
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
|
||||||
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
|
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
|
||||||
|
|
||||||
|
- name: Install tensorflow (for Molmo-7B-D-0924)
|
||||||
|
if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
|
||||||
|
shell: bash -l {0}
|
||||||
|
run: |
|
||||||
|
pip install tensorflow --no-cache-dir
|
||||||
|
|
||||||
- name: Resolve vllm-ascend version
|
- name: Resolve vllm-ascend version
|
||||||
run: |
|
run: |
|
||||||
VERSION_INPUT="${{ inputs.vllm-ascend }}"
|
VERSION_INPUT="${{ inputs.vllm-ascend }}"
|
||||||
@@ -172,6 +178,7 @@ jobs:
|
|||||||
id: report
|
id: report
|
||||||
env:
|
env:
|
||||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||||
|
HF_DATASETS_OFFLINE: True
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
VLLM_CI_RUNNER: ${{ inputs.runner }}
|
VLLM_CI_RUNNER: ${{ inputs.runner }}
|
||||||
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
|
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
|
||||||
|
|||||||
@@ -114,6 +114,15 @@ jobs:
|
|||||||
- Qwen3-VL-8B-Instruct
|
- Qwen3-VL-8B-Instruct
|
||||||
- Qwen2.5-Omni-7B
|
- Qwen2.5-Omni-7B
|
||||||
- Meta-Llama-3.1-8B-Instruct
|
- Meta-Llama-3.1-8B-Instruct
|
||||||
|
- os: linux-aarch64-a2-1
|
||||||
|
model_list:
|
||||||
|
- ERNIE-4.5-21B-A3B-PT
|
||||||
|
- gemma-2-9b-it
|
||||||
|
- gemma-3-4b-it
|
||||||
|
- internlm-7b
|
||||||
|
- InternVL3_5-8B-hf
|
||||||
|
- llava-1.5-7b-hf
|
||||||
|
- Molmo-7B-D-0924
|
||||||
- os: linux-aarch64-a2-2
|
- os: linux-aarch64-a2-2
|
||||||
model_list:
|
model_list:
|
||||||
- Qwen3-30B-A3B
|
- Qwen3-30B-A3B
|
||||||
@@ -128,5 +137,5 @@ jobs:
|
|||||||
vllm: v0.11.2
|
vllm: v0.11.2
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
||||||
upload: false
|
upload: false
|
||||||
|
|||||||
9
tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml
Normal file
9
tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.71
|
||||||
|
num_fewshot: 5
|
||||||
|
trust_remote_code: True
|
||||||
13
tests/e2e/models/configs/Molmo-7B-D-0924.yaml
Normal file
13
tests/e2e/models/configs/Molmo-7B-D-0924.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
model_name: "LLM-Research/Molmo-7B-D-0924"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "ceval-valid"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.71
|
||||||
|
max_model_len: 4096
|
||||||
|
trust_remote_code: True
|
||||||
|
apply_chat_template: False
|
||||||
|
fewshot_as_multiturn: False
|
||||||
|
gpu_memory_utilization: 0.8
|
||||||
@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
|
|||||||
Qwen3-VL-8B-Instruct.yaml
|
Qwen3-VL-8B-Instruct.yaml
|
||||||
Qwen2.5-Omni-7B.yaml
|
Qwen2.5-Omni-7B.yaml
|
||||||
Meta-Llama-3.1-8B-Instruct.yaml
|
Meta-Llama-3.1-8B-Instruct.yaml
|
||||||
InternVL3_5-8B.yaml
|
InternVL3_5-8B.yaml
|
||||||
|
ERNIE-4.5-21B-A3B-PT.yaml
|
||||||
|
gemma-2-9b-it.yaml
|
||||||
|
gemma-3-4b-it.yaml
|
||||||
|
internlm-7b.yaml
|
||||||
|
Molmo-7B-D-0924.yaml
|
||||||
|
llava-1.5-7b-hf.yaml
|
||||||
|
|||||||
11
tests/e2e/models/configs/gemma-2-9b-it.yaml
Normal file
11
tests/e2e/models/configs/gemma-2-9b-it.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
model_name: "LLM-Research/gemma-2-9b-it"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.46
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.79
|
||||||
|
num_fewshot: 5
|
||||||
|
gpu_memory_utilization: 0.8
|
||||||
13
tests/e2e/models/configs/gemma-3-4b-it.yaml
Normal file
13
tests/e2e/models/configs/gemma-3-4b-it.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
model_name: "LLM-Research/gemma-3-4b-it"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.59
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.59
|
||||||
|
num_fewshot: 5
|
||||||
|
apply_chat_template: False
|
||||||
|
fewshot_as_multiturn: False
|
||||||
|
gpu_memory_utilization: 0.7
|
||||||
13
tests/e2e/models/configs/internlm-7b.yaml
Normal file
13
tests/e2e/models/configs/internlm-7b.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
model_name: "Shanghai_AI_Laboratory/internlm-7b"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
tasks:
|
||||||
|
- name: "ceval-valid"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.42
|
||||||
|
num_fewshot: 5
|
||||||
|
max_model_len: 2048
|
||||||
|
trust_remote_code: True
|
||||||
|
dtype: "bfloat16"
|
||||||
|
apply_chat_template: False
|
||||||
|
fewshot_as_multiturn: False
|
||||||
11
tests/e2e/models/configs/llava-1.5-7b-hf.yaml
Normal file
11
tests/e2e/models/configs/llava-1.5-7b-hf.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
model_name: "llava-hf/llava-1.5-7b-hf"
|
||||||
|
hardware: "Atlas A2 Series"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "ceval-valid"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.30
|
||||||
|
trust_remote_code: True
|
||||||
|
gpu_memory_utilization: 0.8
|
||||||
|
dtype: "bfloat16"
|
||||||
@@ -39,10 +39,11 @@ def env_config() -> EnvConfig:
|
|||||||
def build_model_args(eval_config, tp_size):
|
def build_model_args(eval_config, tp_size):
|
||||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||||
max_model_len = eval_config.get("max_model_len", 4096)
|
max_model_len = eval_config.get("max_model_len", 4096)
|
||||||
|
dtype = eval_config.get("dtype", "auto")
|
||||||
model_args = {
|
model_args = {
|
||||||
"pretrained": eval_config["model_name"],
|
"pretrained": eval_config["model_name"],
|
||||||
"tensor_parallel_size": tp_size,
|
"tensor_parallel_size": tp_size,
|
||||||
"dtype": "auto",
|
"dtype": dtype,
|
||||||
"trust_remote_code": trust_remote_code,
|
"trust_remote_code": trust_remote_code,
|
||||||
"max_model_len": max_model_len,
|
"max_model_len": max_model_len,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user