diff --git a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml index e4adbde..58af318 100644 --- a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml +++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml @@ -1,4 +1,6 @@ model_name: "deepseek-ai/DeepSeek-V2-Lite" +runner: "linux-aarch64-a2-2" +hardware: "Atlas A2 Series" tasks: - name: "gsm8k" metrics: diff --git a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml index eb7196a..3543e0c 100644 --- a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml +++ b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -1,4 +1,6 @@ model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +runner: "linux-aarch64-a2-1" +hardware: "Atlas A2 Series" model: "vllm-vlm" tasks: - name: "mmmu_val" diff --git a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml index be1bbb0..6b04252 100644 --- a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml +++ b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml @@ -1,4 +1,6 @@ model_name: "Qwen/Qwen3-30B-A3B" +runner: "linux-aarch64-a2-2" +hardware: "Atlas A2 Series" tasks: - name: "gsm8k" metrics: diff --git a/tests/e2e/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/models/configs/Qwen3-8B-Base.yaml index e60cc9a..2124361 100644 --- a/tests/e2e/models/configs/Qwen3-8B-Base.yaml +++ b/tests/e2e/models/configs/Qwen3-8B-Base.yaml @@ -1,4 +1,6 @@ model_name: "Qwen/Qwen3-8B-Base" +runner: "linux-aarch64-a2-1" +hardware: "Atlas A2 Series" tasks: - name: "gsm8k" metrics: diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt index e29ff1a..2184a59 100644 --- a/tests/e2e/models/configs/accuracy.txt +++ b/tests/e2e/models/configs/accuracy.txt @@ -1,3 +1,4 @@ +DeepSeek-V2-Lite.yaml Qwen3-8B-Base.yaml Qwen2.5-VL-7B-Instruct.yaml Qwen3-30B-A3B.yaml \ No newline at end of file diff --git a/tests/e2e/models/report_template.md b/tests/e2e/models/report_template.md 
index 8402545..81dd717 100644 --- a/tests/e2e/models/report_template.md +++ b/tests/e2e/models/report_template.md @@ -2,16 +2,28 @@ - **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) - **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }} -- **Hardware Environment**: Atlas A2 Series +- **Hardware Environment**: {{ hardware }} - **Parallel mode**: {{ parallel_mode }} -- **Execution mode**: ACLGraph +- **Execution mode**: {{ execution_model }} **Command**: ```bash export MODEL_ARGS={{ model_args }} lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \ -{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}} +{% if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) -%} + --apply_chat_template \ +{%- endif %} +{% if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) -%} + --fewshot_as_multiturn \ +{%- endif %} +{% if num_fewshot is defined and num_fewshot != "N/A" -%} + --num_fewshot {{ num_fewshot }} \ +{%- endif %} +{% if limit is defined and limit != "N/A" -%} + --limit {{ limit }} \ +{%- endif %} +--batch_size {{ batch_size }} ``` | Task | Metric | Value | Stderr | diff --git a/tests/e2e/models/test_lm_eval_correctness.py b/tests/e2e/models/test_lm_eval_correctness.py index 7d023b1..eaef67d 100644 --- 
a/tests/e2e/models/test_lm_eval_correctness.py +++ b/tests/e2e/models/test_lm_eval_correctness.py @@ -69,6 +69,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config): if model_args.get('enable_expert_parallel', False): parallel_mode += " + EP" + execution_model = 'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph' + report_content = template.render( vllm_version=env_config.vllm_version, vllm_commit=env_config.vllm_commit, @@ -77,6 +79,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config): cann_version=env_config.cann_version, torch_version=env_config.torch_version, torch_npu_version=env_config.torch_npu_version, + hardware=eval_config.get("hardware", "unknown"), model_name=eval_config["model_name"], model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'", model_type=eval_config.get("model", "vllm"), @@ -87,7 +90,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config): batch_size=eval_config.get("batch_size", "auto"), num_fewshot=eval_config.get("num_fewshot", "N/A"), rows=report_data["rows"], - parallel_mode=parallel_mode) + parallel_mode=parallel_mode, + execution_model=execution_model) report_output = os.path.join( report_dir, f"{os.path.basename(eval_config['model_name'])}.md")