# # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. # Copyright 2023 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This file is a part of the vllm-ascend project. # import argparse import gc import json import multiprocessing import sys from multiprocessing import Queue import lm_eval import torch UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } def run_accuracy_unimodal(queue, model, dataset): try: model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6" results = lm_eval.simple_evaluate( model="vllm", model_args=model_args, tasks=dataset, apply_chat_template=True, fewshot_as_multiturn=True, batch_size=batch_size_dict[dataset], num_fewshot=5, ) print(f"Success: {model} on {dataset}") measured_value = results["results"] queue.put(measured_value) except Exception as e: print(f"Error in run_accuracy_unimodal: {e}") queue.put(e) sys.exit(1) finally: torch.npu.empty_cache() gc.collect() def run_accuracy_multimodal(queue, model, dataset): try: model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2" results = lm_eval.simple_evaluate( model="vllm-vlm", model_args=model_args, tasks=dataset, apply_chat_template=True, fewshot_as_multiturn=True, batch_size=batch_size_dict[dataset], ) print(f"Success: {model} on {dataset}") measured_value = results["results"] queue.put(measured_value) except Exception as e: print(f"Error in run_accuracy_multimodal: {e}") queue.put(e) sys.exit(1) finally: torch.npu.empty_cache() gc.collect() def generate_md(model_name, tasks_list, args, datasets): run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets) model = model_name.split("/")[1] preamble = f"""# 🎯 {model} Accuracy Test