Enable accuracy test for PR labeled with "*accuracy-test" (#1040)
### What this PR does / why we need it? This PR enables accuracy test for PR labeled with "*accuracy-test" and workflow_dispatch. Only one model test running for each type test to reduce execution time. - The dense test costs about `25mins` to complete (gsm8k 7mins, ~mmlu 3h24mins,~ cEval 18mins) - The vl test costs about `40mins` to complete In future, we might consider enabling all job tests as a nightly scheduled job. Below are the main changes: - the dense/vl accuracy test will be triggered by labeling `accuracy-test` and `ready-for-test` - the dense accuracy test will be triggered by labeling `dense-accuracy-test` and `ready-for-test` - the vl accuracy test will be triggered by labeling `vl-accuracy-test` and `ready-for-test` - accuracy test will also be triggered by workflow_dispatch - Support V1 and V0 for qwen and V0 for VL For PR test we also generate summary in test summary. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - CI passed with accuracy-test label - Preview: https://github.com/vllm-project/vllm-ascend/actions/runs/15407628722?pr=1040 Closes: https://github.com/vllm-project/vllm-ascend/pull/953 --------- Signed-off-by: hfadzxy <starmoon_zhang@163.com> Signed-off-by: Yikun Jiang <yikunkero@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -26,11 +26,8 @@ from multiprocessing import Queue
|
||||
import lm_eval
|
||||
import torch
|
||||
|
||||
UNIMODAL_MODEL_NAME = [
|
||||
"Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"Qwen/Qwen3-8B"
|
||||
]
|
||||
UNIMODAL_TASK = ["ceval-valid", "mmlu", "gsm8k"]
|
||||
UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"]
|
||||
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
|
||||
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
|
||||
MULTIMODAL_TASK = ["mmmu_val"]
|
||||
|
||||
@@ -38,22 +35,17 @@ batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
|
||||
|
||||
MODEL_RUN_INFO = {
|
||||
"Qwen/Qwen2.5-7B-Instruct":
|
||||
("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
||||
("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
||||
"lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
||||
),
|
||||
"LLM-Research/Meta-Llama-3.1-8B-Instruct":
|
||||
("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
||||
"lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
||||
),
|
||||
"Qwen/Qwen3-8B":
|
||||
("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
||||
"Qwen/Qwen3-8B-Base":
|
||||
("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
||||
"lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
||||
),
|
||||
"Qwen/Qwen2.5-VL-7B-Instruct":
|
||||
("export MODEL_AEGS='{model}, max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
|
||||
("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n"
|
||||
"lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"),
|
||||
}
|
||||
@@ -85,7 +77,7 @@ def run_accuracy_unimodal(queue, model, dataset):
|
||||
|
||||
def run_accuracy_multimodal(queue, model, dataset):
|
||||
try:
|
||||
model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2"
|
||||
model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2"
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm-vlm",
|
||||
model_args=model_args,
|
||||
@@ -110,7 +102,7 @@ def generate_md(model_name, tasks_list, args, datasets):
|
||||
run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
|
||||
datasets=datasets)
|
||||
model = model_name.split("/")[1]
|
||||
preamble = f"""# {model} Accuracy Test
|
||||
preamble = f"""# 🎯 {model} Accuracy Test
|
||||
<div>
|
||||
<strong>vLLM version:</strong> vLLM: {args.vllm_version}, vLLM Ascend: {args.vllm_ascend_version} <br>
|
||||
</div>
|
||||
@@ -228,4 +220,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--vllm_version", type=str, required=False)
|
||||
parser.add_argument("--cann_version", type=str, required=False)
|
||||
args = parser.parse_args()
|
||||
# TODO(yikun):
|
||||
# 1. add a exit 1 if accuracy is not as expected
|
||||
# 2. Add ✅, ❌ to markdown if accuracy is not as expected
|
||||
main(args)
|
||||
|
||||
Reference in New Issue
Block a user