[Test] Remove V0 accuracy test and enable MoE and VL test on V1 (#1574)
### What this PR does / why we need it?
Update accuracy test:
1. remove accuracy report on V0
2. add parallel and execution mode
3. add Qwen/Qwen3-30B-A3B and remove Qwen/Qwen2.5-7B-Instruct

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
48
.github/workflows/accuracy_test.yaml
vendored
48
.github/workflows/accuracy_test.yaml
vendored
@@ -53,9 +53,9 @@ on:
|
|||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- all
|
- all
|
||||||
- Qwen/Qwen2.5-7B-Instruct
|
|
||||||
- Qwen/Qwen2.5-VL-7B-Instruct
|
- Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
- Qwen/Qwen3-8B-Base
|
- Qwen/Qwen3-8B-Base
|
||||||
|
- Qwen/Qwen3-30B-A3B
|
||||||
default: 'all'
|
default: 'all'
|
||||||
|
|
||||||
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
||||||
@@ -77,48 +77,48 @@ jobs:
|
|||||||
${{
|
${{
|
||||||
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
|
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
|
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
|
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
|
||||||
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
||||||
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
||||||
}}
|
}}
|
||||||
runs-on: >-
|
runs-on: >-
|
||||||
${{
|
${{
|
||||||
(matrix.model_name == 'Qwen/Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-4') ||
|
(matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') ||
|
||||||
'linux-arm64-npu-2'
|
'linux-arm64-npu-2'
|
||||||
}}
|
}}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_use_version: [0, 1]
|
vllm_use_version: [1]
|
||||||
# the accuracy test will run:
|
# the accuracy test will run:
|
||||||
# 1. workflow_dispatch with models input
|
# 1. workflow_dispatch with models input
|
||||||
# - all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
# - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
||||||
# - specified but not all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
# - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
||||||
# 2. PR labeled with "*-accuracy-test"
|
# 2. PR labeled with "*-accuracy-test"
|
||||||
# - accuracy-test: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct
|
# - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B
|
||||||
# - dense-accuracy-test: Qwen/Qwen2.5-7B-Instruct
|
# - dense-accuracy-test: Qwen/Qwen3-8B-Base
|
||||||
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
|
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
# - moe-accuracy-test: Qwen/Qwen3-30B-A3B
|
||||||
model_name: ${{ fromJSON(
|
model_name: ${{ fromJSON(
|
||||||
(github.event_name == 'schedule' &&
|
(github.event_name == 'schedule' &&
|
||||||
'["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
||||||
(github.event.inputs.models == 'all' &&
|
(github.event.inputs.models == 'all' &&
|
||||||
'["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
||||||
(github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' &&
|
(github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' &&
|
||||||
'["Qwen/Qwen2.5-7B-Instruct"]') ||
|
'["Qwen/Qwen3-30B-A3B"]') ||
|
||||||
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
|
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
|
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
|
||||||
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
|
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
|
||||||
'["Qwen/Qwen3-8B-Base"]') ||
|
'["Qwen/Qwen3-8B-Base"]') ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
||||||
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct"]' ||
|
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
|
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
|
||||||
'["Qwen/Qwen3-8B-Base"]' ||
|
'["Qwen/Qwen3-8B-Base"]' ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
|
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]'
|
'["Qwen/Qwen2.5-VL-7B-Instruct"]' ||
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') &&
|
||||||
|
'["Qwen/Qwen3-30B-A3B"]'
|
||||||
) }}
|
) }}
|
||||||
# Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved
|
|
||||||
exclude:
|
|
||||||
- model_name: Qwen/Qwen2.5-VL-7B-Instruct
|
|
||||||
vllm_use_version: 1
|
|
||||||
|
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
|
name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
|
||||||
@@ -187,23 +187,19 @@ jobs:
|
|||||||
- name: Get vLLM commit hash and URL
|
- name: Get vLLM commit hash and URL
|
||||||
working-directory: ./vllm-empty
|
working-directory: ./vllm-empty
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=$(git rev-parse HEAD)
|
VLLM_COMMIT=$(git rev-parse --short=7 HEAD)
|
||||||
echo "VLLM_COMMIT=$VLLM_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_COMMIT=$VLLM_COMMIT" >> $GITHUB_ENV
|
||||||
echo "VLLM_COMMIT_URL=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Get vLLM-Ascend commit hash and URL
|
- name: Get vLLM-Ascend commit hash and URL
|
||||||
working-directory: ./vllm-ascend
|
working-directory: ./vllm-ascend
|
||||||
run: |
|
run: |
|
||||||
VLLM_ASCEND_COMMIT=$(git rev-parse HEAD)
|
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
|
||||||
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
||||||
echo "VLLM_ASCEND_COMMIT_URL=https://github.com/vllm-project/vllm-ascend/commit/$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Print resolved hashes and URLs
|
- name: Print resolved hashes
|
||||||
run: |
|
run: |
|
||||||
echo "vLLM : ${{ env.VLLM_COMMIT }}"
|
echo "vLLM : ${{ env.VLLM_COMMIT }}"
|
||||||
echo "vLLM link : ${{ env.VLLM_COMMIT_URL }}"
|
|
||||||
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
|
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
|
||||||
echo "Ascend link: ${{ env.VLLM_ASCEND_COMMIT_URL }}"
|
|
||||||
|
|
||||||
- name: Install lm-eval, ray, and datasets
|
- name: Install lm-eval, ray, and datasets
|
||||||
run: |
|
run: |
|
||||||
@@ -262,8 +258,6 @@ jobs:
|
|||||||
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
|
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
|
||||||
--vllm_commit "${{ env.VLLM_COMMIT }}" \
|
--vllm_commit "${{ env.VLLM_COMMIT }}" \
|
||||||
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
|
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
|
||||||
--vllm_commit_url "${{ env.VLLM_COMMIT_URL }}" \
|
|
||||||
--vllm_ascend_commit_url "${{ env.VLLM_ASCEND_COMMIT_URL }}" \
|
|
||||||
--vllm_use_v1 "$VLLM_USE_V1"
|
--vllm_use_v1 "$VLLM_USE_V1"
|
||||||
|
|
||||||
- name: Generate step summary
|
- name: Generate step summary
|
||||||
@@ -385,7 +379,7 @@ jobs:
|
|||||||
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for:
|
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for:
|
||||||
${{
|
${{
|
||||||
github.event.inputs.models == 'all'
|
github.event.inputs.models == 'all'
|
||||||
&& 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|
&& 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|
||||||
|| github.event.inputs.models
|
|| github.event.inputs.models
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
|||||||
@@ -21,21 +21,36 @@ import gc
|
|||||||
import json
|
import json
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
from multiprocessing import Queue
|
from multiprocessing import Queue
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"]
|
# URLs for version information in Markdown report
|
||||||
|
VLLM_URL = "https://github.com/vllm-project/vllm/commit/"
|
||||||
|
VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/"
|
||||||
|
|
||||||
|
# Model and task configurations
|
||||||
|
UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"]
|
||||||
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
|
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
|
||||||
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
|
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
|
||||||
MULTIMODAL_TASK = ["mmmu_val"]
|
MULTIMODAL_TASK = ["mmmu_val"]
|
||||||
|
|
||||||
|
# Batch size configurations per task
|
||||||
BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
|
BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
|
||||||
|
|
||||||
|
# Model type mapping (vllm for text, vllm-vlm for vision-language)
|
||||||
|
MODEL_TYPE = {
|
||||||
|
"Qwen/Qwen3-8B-Base": "vllm",
|
||||||
|
"Qwen/Qwen3-30B-A3B": "vllm",
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Command templates for running evaluations
|
||||||
MODEL_RUN_INFO = {
|
MODEL_RUN_INFO = {
|
||||||
"Qwen/Qwen2.5-7B-Instruct":
|
"Qwen/Qwen3-30B-A3B":
|
||||||
("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
|
("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
|
||||||
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
||||||
),
|
),
|
||||||
@@ -45,19 +60,23 @@ MODEL_RUN_INFO = {
|
|||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
||||||
),
|
),
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct":
|
"Qwen/Qwen2.5-VL-7B-Instruct":
|
||||||
("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n"
|
("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
|
||||||
"lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
"lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
||||||
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"),
|
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Evaluation metric filters per task
|
||||||
FILTER = {
|
FILTER = {
|
||||||
"gsm8k": "exact_match,flexible-extract",
|
"gsm8k": "exact_match,flexible-extract",
|
||||||
"ceval-valid": "acc,none",
|
"ceval-valid": "acc,none",
|
||||||
"mmmu_val": "acc,none"
|
"mmmu_val": "acc,none"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Expected accuracy values for models
|
||||||
EXPECTED_VALUE = {
|
EXPECTED_VALUE = {
|
||||||
"Qwen/Qwen2.5-7B-Instruct": {
|
"Qwen/Qwen3-30B-A3B": {
|
||||||
"ceval-valid": 0.80,
|
"ceval-valid": 0.83,
|
||||||
"gsm8k": 0.72
|
"gsm8k": 0.85
|
||||||
},
|
},
|
||||||
"Qwen/Qwen3-8B-Base": {
|
"Qwen/Qwen3-8B-Base": {
|
||||||
"ceval-valid": 0.82,
|
"ceval-valid": 0.82,
|
||||||
@@ -67,73 +86,102 @@ EXPECTED_VALUE = {
|
|||||||
"mmmu_val": 0.51
|
"mmmu_val": 0.51
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
PARALLEL_MODE = {
|
||||||
|
"Qwen/Qwen3-8B-Base": "TP",
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct": "TP",
|
||||||
|
"Qwen/Qwen3-30B-A3B": "EP"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Execution backend configuration
|
||||||
|
EXECUTION_MODE = {
|
||||||
|
"Qwen/Qwen3-8B-Base": "ACLGraph",
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph",
|
||||||
|
"Qwen/Qwen3-30B-A3B": "ACLGraph"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Model arguments for evaluation
|
||||||
|
MODEL_ARGS = {
|
||||||
|
"Qwen/Qwen3-8B-Base":
|
||||||
|
"pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct":
|
||||||
|
"pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
|
||||||
|
"Qwen/Qwen3-30B-A3B":
|
||||||
|
"pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Whether to apply chat template formatting
|
||||||
|
APPLY_CHAT_TEMPLATE = {
|
||||||
|
"Qwen/Qwen3-8B-Base": True,
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
||||||
|
"Qwen/Qwen3-30B-A3B": False
|
||||||
|
}
|
||||||
|
# Few-shot examples handling as multi-turn dialogues.
|
||||||
|
FEWSHOT_AS_MULTITURN = {
|
||||||
|
"Qwen/Qwen3-8B-Base": True,
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
||||||
|
"Qwen/Qwen3-30B-A3B": False
|
||||||
|
}
|
||||||
|
|
||||||
|
# Relative tolerance for accuracy checks
|
||||||
RTOL = 0.03
|
RTOL = 0.03
|
||||||
ACCURACY_FLAG = {}
|
ACCURACY_FLAG = {}
|
||||||
|
|
||||||
|
|
||||||
def run_accuracy_unimodal(queue, model, dataset):
|
def run_accuracy_test(queue, model, dataset):
|
||||||
|
"""Run accuracy evaluation for a model on a dataset in separate process"""
|
||||||
try:
|
try:
|
||||||
model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6"
|
eval_params = {
|
||||||
results = lm_eval.simple_evaluate(
|
"model": MODEL_TYPE[model],
|
||||||
model="vllm",
|
"model_args": MODEL_ARGS[model],
|
||||||
model_args=model_args,
|
"tasks": dataset,
|
||||||
tasks=dataset,
|
"apply_chat_template": APPLY_CHAT_TEMPLATE[model],
|
||||||
apply_chat_template=True,
|
"fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model],
|
||||||
fewshot_as_multiturn=True,
|
"batch_size": BATCH_SIZE[dataset]
|
||||||
batch_size=BATCH_SIZE[dataset],
|
}
|
||||||
num_fewshot=5,
|
|
||||||
)
|
if MODEL_TYPE[model] == "vllm":
|
||||||
print(f"Success: {model} on {dataset}")
|
eval_params["num_fewshot"] = 5
|
||||||
|
|
||||||
|
results = lm_eval.simple_evaluate(**eval_params)
|
||||||
|
print(f"Success: {model} on {dataset} ")
|
||||||
measured_value = results["results"]
|
measured_value = results["results"]
|
||||||
queue.put(measured_value)
|
queue.put(measured_value)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in run_accuracy_unimodal: {e}")
|
print(f"Error in run_accuracy_test: {e}")
|
||||||
queue.put(e)
|
queue.put(e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
finally:
|
finally:
|
||||||
torch.npu.empty_cache()
|
if 'results' in locals():
|
||||||
|
del results
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
def run_accuracy_multimodal(queue, model, dataset):
|
|
||||||
try:
|
|
||||||
model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2"
|
|
||||||
results = lm_eval.simple_evaluate(
|
|
||||||
model="vllm-vlm",
|
|
||||||
model_args=model_args,
|
|
||||||
tasks=dataset,
|
|
||||||
apply_chat_template=True,
|
|
||||||
fewshot_as_multiturn=True,
|
|
||||||
batch_size=BATCH_SIZE[dataset],
|
|
||||||
)
|
|
||||||
print(f"Success: {model} on {dataset}")
|
|
||||||
measured_value = results["results"]
|
|
||||||
queue.put(measured_value)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error in run_accuracy_multimodal: {e}")
|
|
||||||
queue.put(e)
|
|
||||||
sys.exit(1)
|
|
||||||
finally:
|
|
||||||
torch.npu.empty_cache()
|
torch.npu.empty_cache()
|
||||||
gc.collect()
|
time.sleep(5)
|
||||||
|
|
||||||
|
|
||||||
def generate_md(model_name, tasks_list, args, datasets):
|
def generate_md(model_name, tasks_list, args, datasets):
|
||||||
|
"""Generate Markdown report with evaluation results"""
|
||||||
|
# Format the run command
|
||||||
run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
|
run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
|
||||||
datasets=datasets)
|
datasets=datasets)
|
||||||
model = model_name.split("/")[1]
|
model = model_name.split("/")[1]
|
||||||
|
|
||||||
|
# Version information section
|
||||||
version_info = (
|
version_info = (
|
||||||
f"**vLLM Version**: vLLM: {args.vllm_version} "
|
f"**vLLM Version**: vLLM: {args.vllm_version} "
|
||||||
f"([{args.vllm_commit}]({args.vllm_commit_url})), "
|
f"([{args.vllm_commit}]({VLLM_URL+args.vllm_commit})), "
|
||||||
f"**vLLM Ascend**: {args.vllm_ascend_version} "
|
f"vLLM Ascend: {args.vllm_ascend_version} "
|
||||||
f"([{args.vllm_ascend_commit}]({args.vllm_ascend_commit_url}))")
|
f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL+args.vllm_ascend_commit})) "
|
||||||
|
)
|
||||||
|
|
||||||
preamble = f"""# 🎯 {model}
|
# Report header with system info
|
||||||
|
preamble = f"""# {model}
|
||||||
{version_info}
|
{version_info}
|
||||||
**vLLM Engine**: V{args.vllm_use_v1}
|
|
||||||
**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
|
**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
|
||||||
**Hardware Environment**: Atlas A2 Series
|
**Hardware Environment**: Atlas A2 Series
|
||||||
**Datasets**: {datasets}
|
**Datasets**: {datasets}
|
||||||
|
**vLLM Engine**: V{args.vllm_use_v1}
|
||||||
|
**Parallel Mode**: {PARALLEL_MODE[model_name]}
|
||||||
|
**Execution Mode**: {EXECUTION_MODE[model_name]}
|
||||||
**Command**:
|
**Command**:
|
||||||
```bash
|
```bash
|
||||||
{run_cmd}
|
{run_cmd}
|
||||||
@@ -146,6 +194,7 @@ def generate_md(model_name, tasks_list, args, datasets):
|
|||||||
)
|
)
|
||||||
rows = []
|
rows = []
|
||||||
rows_sub = []
|
rows_sub = []
|
||||||
|
# Process results for each task
|
||||||
for task_dict in tasks_list:
|
for task_dict in tasks_list:
|
||||||
for key, stats in task_dict.items():
|
for key, stats in task_dict.items():
|
||||||
alias = stats.get("alias", key)
|
alias = stats.get("alias", key)
|
||||||
@@ -181,6 +230,7 @@ def generate_md(model_name, tasks_list, args, datasets):
|
|||||||
" details" + "</summary>" + "\n" * 2 + header)
|
" details" + "</summary>" + "\n" * 2 + header)
|
||||||
rows_sub.append(row)
|
rows_sub.append(row)
|
||||||
rows_sub.append("</details>")
|
rows_sub.append("</details>")
|
||||||
|
# Combine all Markdown sections
|
||||||
md = preamble + "\n" + header + "\n" + "\n".join(rows) + "\n" + "\n".join(
|
md = preamble + "\n" + header + "\n" + "\n".join(rows) + "\n" + "\n".join(
|
||||||
rows_sub) + "\n"
|
rows_sub) + "\n"
|
||||||
print(md)
|
print(md)
|
||||||
@@ -188,6 +238,9 @@ def generate_md(model_name, tasks_list, args, datasets):
|
|||||||
|
|
||||||
|
|
||||||
def safe_md(args, accuracy, datasets):
|
def safe_md(args, accuracy, datasets):
|
||||||
|
"""
|
||||||
|
Safely generate and save Markdown report from accuracy results.
|
||||||
|
"""
|
||||||
data = json.loads(json.dumps(accuracy))
|
data = json.loads(json.dumps(accuracy))
|
||||||
for model_key, tasks_list in data.items():
|
for model_key, tasks_list in data.items():
|
||||||
md_content = generate_md(model_key, tasks_list, args, datasets)
|
md_content = generate_md(model_key, tasks_list, args, datasets)
|
||||||
@@ -197,50 +250,45 @@ def safe_md(args, accuracy, datasets):
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
"""Main evaluation workflow"""
|
||||||
accuracy = {}
|
accuracy = {}
|
||||||
accuracy[args.model] = []
|
accuracy[args.model] = []
|
||||||
result_queue: Queue[float] = multiprocessing.Queue()
|
result_queue: Queue[float] = multiprocessing.Queue()
|
||||||
if args.model in UNIMODAL_MODEL_NAME:
|
if args.model in UNIMODAL_MODEL_NAME:
|
||||||
datasets = ",".join(UNIMODAL_TASK)
|
datasets = UNIMODAL_TASK
|
||||||
for dataset in UNIMODAL_TASK:
|
else:
|
||||||
accuracy_expected = EXPECTED_VALUE[args.model][dataset]
|
datasets = MULTIMODAL_TASK
|
||||||
p = multiprocessing.Process(target=run_accuracy_unimodal,
|
datasets_str = ",".join(datasets)
|
||||||
args=(result_queue, args.model,
|
# Evaluate model on each dataset
|
||||||
dataset))
|
for dataset in datasets:
|
||||||
p.start()
|
accuracy_expected = EXPECTED_VALUE[args.model][dataset]
|
||||||
|
p = multiprocessing.Process(target=run_accuracy_test,
|
||||||
|
args=(result_queue, args.model, dataset))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
if p.is_alive():
|
||||||
|
p.terminate()
|
||||||
p.join()
|
p.join()
|
||||||
result = result_queue.get()
|
gc.collect()
|
||||||
print(result)
|
torch.npu.empty_cache()
|
||||||
if accuracy_expected - RTOL < result[dataset][
|
time.sleep(10)
|
||||||
FILTER[dataset]] < accuracy_expected + RTOL:
|
result = result_queue.get()
|
||||||
ACCURACY_FLAG[dataset] = "✅"
|
print(result)
|
||||||
else:
|
if accuracy_expected - RTOL < result[dataset][
|
||||||
ACCURACY_FLAG[dataset] = "❌"
|
FILTER[dataset]] < accuracy_expected + RTOL:
|
||||||
accuracy[args.model].append(result)
|
ACCURACY_FLAG[dataset] = "✅"
|
||||||
if args.model in MULTIMODAL_NAME:
|
else:
|
||||||
datasets = ",".join(MULTIMODAL_TASK)
|
ACCURACY_FLAG[dataset] = "❌"
|
||||||
for dataset in MULTIMODAL_TASK:
|
accuracy[args.model].append(result)
|
||||||
accuracy_expected = EXPECTED_VALUE[args.model][dataset]
|
|
||||||
p = multiprocessing.Process(target=run_accuracy_multimodal,
|
|
||||||
args=(result_queue, args.model,
|
|
||||||
dataset))
|
|
||||||
p.start()
|
|
||||||
p.join()
|
|
||||||
result = result_queue.get()
|
|
||||||
print(result)
|
|
||||||
if accuracy_expected - RTOL < result[dataset][
|
|
||||||
FILTER[dataset]] < accuracy_expected + RTOL:
|
|
||||||
ACCURACY_FLAG[dataset] = "✅"
|
|
||||||
else:
|
|
||||||
ACCURACY_FLAG[dataset] = "❌"
|
|
||||||
accuracy[args.model].append(result)
|
|
||||||
print(accuracy)
|
print(accuracy)
|
||||||
safe_md(args, accuracy, datasets)
|
safe_md(args, accuracy, datasets_str)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
multiprocessing.set_start_method('spawn', force=True)
|
multiprocessing.set_start_method('spawn', force=True)
|
||||||
parser = argparse.ArgumentParser()
|
# Initialize argument parser
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Run model accuracy evaluation and generate report")
|
||||||
parser.add_argument("--output", type=str, required=True)
|
parser.add_argument("--output", type=str, required=True)
|
||||||
parser.add_argument("--model", type=str, required=True)
|
parser.add_argument("--model", type=str, required=True)
|
||||||
parser.add_argument("--vllm_ascend_version", type=str, required=False)
|
parser.add_argument("--vllm_ascend_version", type=str, required=False)
|
||||||
@@ -248,12 +296,8 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--torch_npu_version", type=str, required=False)
|
parser.add_argument("--torch_npu_version", type=str, required=False)
|
||||||
parser.add_argument("--vllm_version", type=str, required=False)
|
parser.add_argument("--vllm_version", type=str, required=False)
|
||||||
parser.add_argument("--cann_version", type=str, required=False)
|
parser.add_argument("--cann_version", type=str, required=False)
|
||||||
parser.add_argument("--vllm_commit", type=lambda s: s[:7], required=False)
|
parser.add_argument("--vllm_commit", type=str, required=False)
|
||||||
parser.add_argument("--vllm_commit_url", type=str, required=False)
|
parser.add_argument("--vllm_ascend_commit", type=str, required=False)
|
||||||
parser.add_argument("--vllm_ascend_commit",
|
|
||||||
type=lambda s: s[:7],
|
|
||||||
required=False)
|
|
||||||
parser.add_argument("--vllm_ascend_commit_url", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_use_v1", type=str, required=False)
|
parser.add_argument("--vllm_use_v1", type=str, required=False)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
Reference in New Issue
Block a user