[Test] Add accuracy test report workflow (#542)

### What this PR does / why we need it? 1. Provide accuracy test report for development branch release. 2. Models and datasets for accuracy test： | Model | datasets | |---------------------------- | --------------------------- | | Qwen2.5-7B-Instruct | ceval-val, gsm8k, mmlu | | Qwen3-8B | ceval-val, gsm8k, mmlu | | Llama-3.1-8B-Instruct | ceval-val, gsm8k, mmlu | | Qwen2.5-VL-7B-Instruct | mmmu_val | ### Does this PR introduce _any_ user-facing change? This PR will display the accuracy test report of the release versionin docs/source/developer_guide/accuracy_report。 Qwen2.5-7B-Instruct.md Qwen3-8B.md Llama-3.1-8B-Instruct.md Qwen2.5-VL-7B-Instruct .md Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-04-30 14:53:58 +08:00
parent ba9714ccee
commit affca6f348
6 changed files with 587 additions and 2 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -2,4 +2,5 @@ self-hosted-runner:
  # Labels of self-hosted runner in array of strings.
  labels:
    - linux-arm64-npu-1
    - linux-arm64-npu-2
    - linux-arm64-npu-4
--- a/.github/workflows/accuracy_report.yaml
+++ b/.github/workflows/accuracy_report.yaml
@@ -0,0 +1,150 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
 name: Accuracy Report
 on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'choose a dev branch to pr'
        required: true
      vllm-ascend-version:
        description: 'what vllm-ascend version to accuracy test?'
        required: true
        type: string
 jobs:
  download:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.branch }}
      - name: Debug List Artifacts
        run: gh api /repos/${{ github.repository }}/actions/artifacts
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Query artifact run id for Qwen2.5-VL-7B-Instruct V0 latest artifact
        id: get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0
        run: |
          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
          RUN_ID=$(echo "$ARTIFACT_JSON" | \
            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Query artifact run id for Qwen2.5-7B-Instruct V0 latest artifact
        id: get_Qwen2_5_7B_Instruct_latest_run_id_V0
        run: |
          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
          RUN_ID=$(echo "$ARTIFACT_JSON" | \
            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Query artifact run id for Llama-3.1-8B-Instruct V0 latest artifact
        id: get_Llama_3_1_8B_Instruct_latest_run_id_V0
        run: |
          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
          RUN_ID=$(echo "$ARTIFACT_JSON" | \
            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Query artifact run id for Qwen3-8B V0 latest artifact
        id: get_Qwen3_8B_latest_run_id_V0
        run: |
          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
          RUN_ID=$(echo "$ARTIFACT_JSON" | \
            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Download Qwen/Qwen2.5-VL-7B-Instruct V0 Artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report
          path: ./docs/source/developer_guide/evaluation/accuracy_report
          github-token: ${{ secrets.GITHUB_TOKEN }}
          repository: vllm-project/vllm-ascend
          run-id: ${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
      - name: Download Qwen/Qwen2.5-7B-Instruct Artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report
          path: ./docs/source/developer_guide/evaluation/accuracy_report
          github-token: ${{ secrets.GITHUB_TOKEN }}
          repository: vllm-project/vllm-ascend
          run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
      - name: Download meta-llama/Llama-3.1-8B-Instruct Artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report
          path: ./docs/source/developer_guide/evaluation/accuracy_report
          github-token: ${{ secrets.GITHUB_TOKEN }}
          repository: vllm-project/vllm-ascend
          run-id: ${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
      - name: Download Qwen/Qwen3-8B Artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-V0-report
          path: ./docs/source/developer_guide/evaluation/accuracy_report
          github-token: ${{ secrets.GITHUB_TOKEN }}
          repository: vllm-project/vllm-ascend
          run-id: ${{ steps.get_Qwen3_8B_latest_run_id_V0.outputs.runid }}
      - name: Display Files
        working-directory: ./docs/source/developer_guide/evaluation/accuracy_report
        run: |
          cat ./Qwen2.5-VL-7B-Instruct.md
          cat ./Llama-3.1-8B-Instruct.md
          cat ./Qwen2.5-7B-Instruct.md
          cat ./Qwen3-8B.md
      - name: Create Pull Request for markdown update
        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.PR_TOKEN }}
          base: ${{ github.ref_name }}
          branch: auto-pr/accuracy-test
          commit-message: "Update accuracy report for ${{ github.event.inputs.branch }}"
          add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md
          title: "[Doc]Update accuracy report for ${{ github.event.inputs.branch }}"
          body: |
            The accuracy results running on Ascend NPU have changed, I'm updating the report.
            Please review the changes.
            - [Workflow run][1]
            - [Qwen2.5-7B-Instruct accuracy report][2]
            - [Llama-3.1-8B-Instruct accuracy report][3]
            - [Qwen2.5-VL-7B-Instruct accuracy report][4]
            - [Qwen3-8B accuracy report][5]
            [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
            [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
            [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
            [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
            [5]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_latest_run_id_V0.outputs.runid }}
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -0,0 +1,203 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
 name: Accuracy Tests
 on:
  workflow_dispatch:
    inputs:
      vllm-version:
        description: 'what vllm version to accuracy test?'
        required: true
        type: string
      vllm-ascend-version:
        description: 'what vllm-ascend version to accuracy test?'
        required: true
        type: string
      models:
        description: 'choose model(all/Qwen2.5-7B-Instruct/Llama-3.1-8B-Instruct/Qwen2.5-VL-7B-Instruct/Qwen3-8B)'
        required: true
        type: choice
        options:
          - all
          - Qwen/Qwen2.5-7B-Instruct
          - meta-llama/Llama-3.1-8B-Instruct
          - Qwen/Qwen2.5-VL-7B-Instruct
          - Qwen/Qwen3-8B
        default: 'all'
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
 # It's used to activate ascend-toolkit environment variables.
 defaults:
  run:
    shell: bash -el {0}
 jobs:
  model_tests:
    name: Model Test - ${{ matrix.model_name }}
    runs-on: 'linux-arm64-npu-2'
    strategy:
      matrix:
        include: ${{ fromJSON(
          (github.event.inputs.models == 'all' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct","output_file":"Qwen2.5-7B-Instruct"},{"model_name":"meta-llama/Llama-3.1-8B-Instruct","output_file":"Llama-3.1-8B-Instruct"},{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct","output_file":"Qwen2.5-VL-7B-Instruct"}, {"model_name":"Qwen/Qwen3-8B","output_file":"Qwen3-8B"}]') ||
          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct","output_file":"Qwen2.5-7B-Instruct"}]') ||
          (github.event.inputs.models == 'meta-llama/Llama-3.1-8B-Instruct' && '[{"model_name":"meta-llama/Llama-3.1-8B-Instruct","output_file":"Llama-3.1-8B-Instruct"}]') ||
          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct","output_file":"Qwen2.5-VL-7B-Instruct"}]') ||
          (github.event.inputs.models == 'Qwen/Qwen3-8B' && '[{"model_name":"Qwen/Qwen3-8B","output_file":"Qwen3-8B"}]')
         ) }}
      fail-fast: false
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
      env:
        HF_ENDPOINT: https://hf-mirror.com
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        DATASET_SOURCE: ModelScope
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev
      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev
      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty
          ref: ${{ github.event.inputs.vllm-version }}
      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: VLLM_TARGET_DEVICE=empty pip install -e .
      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm-ascend
          path: ./vllm-ascend
          ref: ${{ github.event.inputs.vllm-ascend-version }}
          fetch-depth: 0
      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi
          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi
          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
      - name: Install vllm-project/vllm-ascend
        working-directory: ./vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .
      - name: Checkout EleutherAI/lm-evaluation-harness repo
        uses: actions/checkout@v4
        with:
          repository: EleutherAI/lm-evaluation-harness
          path: ./lm-eval
          fetch-depth: 0
      - name: Install EleutherAI/lm-evaluation-harness
        working-directory: ./lm-eval
        run: |
            pip install -e .
            pip install ray datasets==2.16.0 transformers==4.50.3 huggingface-hub==0.29.3
      - name: Collect version info
        run: |
          for dir in /usr/local/Ascend/ascend-toolkit/*; do
            dname=$(basename "$dir")
            if [ "$dname" != "latest" ]; then
              TOOLKIT_DIR="$dname"
              break
            fi
          done
          INFO_FILE="/usr/local/Ascend/ascend-toolkit/${TOOLKIT_DIR}/$(uname -i)-linux/ascend_toolkit_install.info"
          CANN_VERSION=$(grep "version=" "$INFO_FILE" \
                           | head -n1 \
                           | cut -d'=' -f2 \
                           | tr -d '"')
          {
            echo "CANN_VERSION=$CANN_VERSION"
            pip show torch | grep "Version:" | awk '{print "TORCH_VERSION="$2}'
            pip show torch_npu | grep "Version:" | awk '{print "TORCH_NPU_VERSION="$2}'
            pip show vllm | grep "Version:" | awk '{print "VLLM_VERSION="$2}' | sed 's/+.*//'
          } >> "$GITHUB_ENV"
      - name: Print versions
        run: |
          echo "CANN: ${{ env.CANN_VERSION }}"
          echo "Torch NPU: ${{ env.TORCH_NPU_VERSION }}"
          echo "Torch: ${{ env.TORCH_VERSION }}"
          echo "vLLM: ${{ env.VLLM_VERSION }}"
      - name: Run Accuracy Test for V0
        working-directory: ./benchmarks
        env:
          VLLM_USE_V1: 0
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        run: |
          mkdir -p ./accuracy/V0
          python ./scripts/run_accuracy.py \
            --model "${{ matrix.model_name }}" \
            --output "./accuracy/V0/${{ matrix.output_file }}.md" \
            --vllm_ascend_version "${{ github.event.inputs.vllm-ascend-version }}" \
            --cann_version "${{ env.CANN_VERSION }}" \
            --torch_npu_version "${{ env.TORCH_NPU_VERSION }}" \
            --torch_version "${{ env.TORCH_VERSION }}" \
            --vllm_version "${{ env.VLLM_VERSION }}"
      - name: Upload Report for V0
        uses: actions/upload-artifact@v4
        with:
          name: "${{ github.event.inputs.vllm-ascend-version }}-${{ matrix.output_file }}-V0-report"
          path: ./benchmarks/accuracy/V0/${{ matrix.output_file }}.md
          if-no-files-found: warn
          retention-days: 90
          overwrite: true
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -0,0 +1,231 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
 import argparse
 import gc
 import json
 import multiprocessing
 import sys
 from multiprocessing import Queue
 import lm_eval
 import torch
 UNIMODAL_MODEL_NAME = [
    "Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen3-8B"
 ]
 UNIMODAL_TASK = ["ceval-valid", "mmlu", "gsm8k"]
 MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
 MULTIMODAL_TASK = ["mmmu_val"]
 batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
 MODEL_RUN_INFO = {
    "Qwen/Qwen2.5-7B-Instruct":
    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
     ),
    "LLM-Research/Meta-Llama-3.1-8B-Instruct":
    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
     ),
    "Qwen/Qwen3-8B":
    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
     ),
    "Qwen/Qwen2.5-VL-7B-Instruct":
    ("export MODEL_AEGS='{model}, max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
     "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"),
 }
 def run_accuracy_unimodal(queue, model, dataset):
    try:
        model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6"
        results = lm_eval.simple_evaluate(
            model="vllm",
            model_args=model_args,
            tasks=dataset,
            apply_chat_template=True,
            fewshot_as_multiturn=True,
            batch_size=batch_size_dict[dataset],
            num_fewshot=5,
        )
        print(f"Success: {model} on {dataset}")
        measured_value = results["results"]
        queue.put(measured_value)
    except Exception as e:
        print(f"Error in run_accuracy_unimodal: {e}")
        queue.put(e)
        sys.exit(1)
    finally:
        torch.npu.empty_cache()
        gc.collect()
 def run_accuracy_multimodal(queue, model, dataset):
    try:
        model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2"
        results = lm_eval.simple_evaluate(
            model="vllm-vlm",
            model_args=model_args,
            tasks=dataset,
            apply_chat_template=True,
            fewshot_as_multiturn=True,
            batch_size=batch_size_dict[dataset],
        )
        print(f"Success: {model} on {dataset}")
        measured_value = results["results"]
        queue.put(measured_value)
    except Exception as e:
        print(f"Error in run_accuracy_multimodal: {e}")
        queue.put(e)
        sys.exit(1)
    finally:
        torch.npu.empty_cache()
        gc.collect()
 def generate_md(model_name, tasks_list, args, datasets):
    run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
                                                datasets=datasets)
    model = model_name.split("/")[1]
    preamble = f"""# {model} Accuracy Test
  <div>
    <strong>vLLM version:</strong> vLLM: {args.vllm_version}, vLLM Ascend: {args.vllm_ascend_version} <br>
  </div>
  <div>
      <strong>Software Environment:</strong> CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version} <br>
  </div>
  <div>
      <strong>Hardware Environment</strong>: Atlas A2 Series <br>
  </div>
  <div>
      <strong>Datasets</strong>: {datasets} <br>
  </div>
  <div>
      <strong>Command</strong>: 
  ```bash
  {run_cmd}
  ```
  </div>
  <div>&nbsp;</div>
  """
    header = (
        "| Task                  | Filter | n-shot | Metric   | Value   | Stderr |\n"
        "|-----------------------|-------:|-------:|----------|--------:|-------:|"
    )
    rows = []
    rows_sub = []
    for task_dict in tasks_list:
        for key, stats in task_dict.items():
            alias = stats.get("alias", key)
            task_name = alias.strip()
            if "exact_match,flexible-extract" in stats:
                metric_key = "exact_match,flexible-extract"
            else:
                metric_key = None
                for k in stats:
                    if "," in k and not k.startswith("acc_stderr"):
                        metric_key = k
                        break
            if metric_key is None:
                continue
            metric, flt = metric_key.split(",", 1)
            value = stats[metric_key]
            stderr = stats.get(f"{metric}_stderr,{flt}", 0)
            if model_name in UNIMODAL_MODEL_NAME:
                n_shot = "5"
            else:
                n_shot = "0"
            row = (f"| {task_name:<37} "
                   f"| {flt:<6} "
                   f"| {n_shot:6} "
                   f"| {metric:<6} "
                   f"| ↑ {value:>5.4f} "
                   f"| ± {stderr:>5.4f} |")
            if not task_name.startswith("-"):
                rows.append(row)
                rows_sub.append("<details>" + "\n" + "<summary>" + task_name +
                                " details" + "</summary>" + "\n" * 2 + header)
            rows_sub.append(row)
        rows_sub.append("</details>")
    md = preamble + "\n" + header + "\n" + "\n".join(rows) + "\n" + "\n".join(
        rows_sub) + "\n"
    print(md)
    return md
 def safe_md(args, accuracy, datasets):
    data = json.loads(json.dumps(accuracy))
    for model_key, tasks_list in data.items():
        md_content = generate_md(model_key, tasks_list, args, datasets)
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(md_content)
        print(f"create Markdown file:{args.output}")
 def main(args):
    accuracy = {}
    accuracy[args.model] = []
    result_queue: Queue[float] = multiprocessing.Queue()
    if args.model in UNIMODAL_MODEL_NAME:
        datasets = ",".join(UNIMODAL_TASK)
        for dataset in UNIMODAL_TASK:
            p = multiprocessing.Process(target=run_accuracy_unimodal,
                                        args=(result_queue, args.model,
                                              dataset))
            p.start()
            p.join()
            result = result_queue.get()
            print(result)
            accuracy[args.model].append(result)
    if args.model in MULTIMODAL_NAME:
        datasets = ",".join(MULTIMODAL_TASK)
        for dataset in MULTIMODAL_TASK:
            p = multiprocessing.Process(target=run_accuracy_multimodal,
                                        args=(result_queue, args.model,
                                              dataset))
            p.start()
            p.join()
            result = result_queue.get()
            print(result)
            accuracy[args.model].append(result)
    print(accuracy)
    safe_md(args, accuracy, datasets)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--vllm_ascend_version", type=str, required=False)
    parser.add_argument("--torch_version", type=str, required=False)
    parser.add_argument("--torch_npu_version", type=str, required=False)
    parser.add_argument("--vllm_version", type=str, required=False)
    parser.add_argument("--cann_version", type=str, required=False)
    args = parser.parse_args()
    main(args)
--- a/docs/source/developer_guide/evaluation/index.md
+++ b/docs/source/developer_guide/evaluation/index.md
@@ -3,8 +3,8 @@
 :::{toctree}
 :caption: Accuracy
 :maxdepth: 1
 using_opencompass
 using_lm_eval
 using_opencompass
 using_evalscope
 :::
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -52,7 +52,7 @@ user_guide/release_notes
 % How to contribute to the vLLM Ascend project
 :::{toctree}
 :caption: Developer Guide
-:maxdepth: 2
+:maxdepth: 1
 developer_guide/contributing
 developer_guide/versioning_policy
 developer_guide/evaluation/index