init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml
+++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml
@@ -1,12 +1,16 @@
 model_name: "deepseek-ai/DeepSeek-V2-Lite"
+runner: "linux-aarch64-a2-2"
+hardware: "Atlas A2 Series"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.375
+    value: 0.385
  - name: "exact_match,flexible-extract"
-    value: 0.375
+    value: 0.385
 tensor_parallel_size: 2
+batch_size: 32
+gpu_memory_utilization: 0.7
 apply_chat_template: False
 fewshot_as_multiturn: False
 trust_remote_code: True
--- a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml
+++ b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -1,4 +1,6 @@
 model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
 model: "vllm-vlm"
 tasks:
 - name: "mmmu_val"
--- a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml
+++ b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml
@@ -1,4 +1,6 @@
 model_name: "Qwen/Qwen3-30B-A3B"
+runner: "linux-aarch64-a2-2"
+hardware: "Atlas A2 Series"
 tasks:
 - name: "gsm8k"
  metrics:
--- a/tests/e2e/models/configs/Qwen3-8B-Base.yaml
+++ b/tests/e2e/models/configs/Qwen3-8B-Base.yaml
@@ -1,4 +1,6 @@
 model_name: "Qwen/Qwen3-8B-Base"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
 tasks:
 - name: "gsm8k"
  metrics:
--- a/tests/e2e/models/configs/accuracy.txt
+++ b/tests/e2e/models/configs/accuracy.txt
@@ -1,3 +1,4 @@
+DeepSeek-V2-Lite.yaml
 Qwen3-8B-Base.yaml
 Qwen2.5-VL-7B-Instruct.yaml
 Qwen3-30B-A3B.yaml
--- a/tests/e2e/models/report_template.md
+++ b/tests/e2e/models/report_template.md
@@ -2,16 +2,28 @@

 - **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))  
 - **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}  
- **Hardware Environment**: Atlas A2 Series  
+- **Hardware Environment**: {{ hardware }}
 - **Parallel mode**: {{ parallel_mode }}
- **Execution mode**: ACLGraph
+- **Execution mode**: {{ execution_model }}

 **Command**:  

 ```bash
 export MODEL_ARGS={{ model_args }}
 lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
-{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}}
+{%- if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) %}
+  --apply_chat_template \
+{%- endif %}
+{%- if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) %}
+  --fewshot_as_multiturn \
+{%- endif %}
+{%- if num_fewshot is defined and num_fewshot != "N/A" %}
+  --num_fewshot {{ num_fewshot }} \
+{%- endif %}
+{%- if limit is defined and limit != "N/A" %}
+  --limit {{ limit }} \
+{%- endif %}
+--batch_size {{ batch_size }}
 ```

 | Task                  | Metric      | Value     | Stderr |
--- a/tests/e2e/models/test_lm_eval_correctness.py
+++ b/tests/e2e/models/test_lm_eval_correctness.py
@@ -69,6 +69,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
    if model_args.get('enable_expert_parallel', False):
        parallel_mode += " + EP"

+    execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
+
    report_content = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
@@ -77,6 +79,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
+        hardware=eval_config.get("hardware", "unknown"),
        model_name=eval_config["model_name"],
        model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        model_type=eval_config.get("model", "vllm"),
@@ -84,10 +87,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", "N/A"),
-        batch_size="auto",
+        batch_size=eval_config.get("batch_size", "auto"),
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"],
-        parallel_mode=parallel_mode)
+        parallel_mode=parallel_mode,
+        execution_model=execution_model)

    report_output = os.path.join(
        report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
@@ -110,7 +114,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
-        "batch_size": "auto",
+        "batch_size": eval_config.get("batch_size", "auto"),
    }
    for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
        val = eval_config.get(s, None)