[CI] Enable auto upgrade e2e estimated time for auto-partition suites (#6840)

### What this PR does / why we need it? This patch add a schedule triggered workflow for auto upgrade e2e estimated-time for batter load balance 1. The workflow will run the full e2e test to get the duration of each test. 2. The script `update_estimated_time.py` will upgrade the [config.json](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/scripts/config.yaml) according to the latest time 3. The workflow will submit a pull request that includes changes to `config.json` automatically <img width="2484" height="764" alt="image" src="https://github.com/user-attachments/assets/02f3459c-bb3b-4f8e-9966-8bb2e5c1bbea" /> ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: 83b47f67b1 - ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: 83b47f67b1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2026-03-04 10:38:34 +08:00
parent c7fd7a25f7
commit d431d7d526
5 changed files with 575 additions and 262 deletions
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -15,6 +15,10 @@ on:
      contains_310:
        required: true
        type: boolean
+      continue_on_error:
+        required: false
+        type: boolean
+        default: false

 jobs:
  e2e-light:
@@ -80,7 +84,29 @@ jobs:
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          if [ "${{ inputs.continue_on_error }}" = "true" ]; then
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-singlecard-light \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1 \
+              --auto-upgrade-estimated-times \
+              --continue-on-error
+          else
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-singlecard-light \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1
+          fi
+
+
+      - name: Upload timing data
+        uses: actions/upload-artifact@v4
+        if: ${{ inputs.continue_on_error == true }}
+        with:
+          name: timing-data-singlecard-light-part${{ matrix.part }}
+          path: test_timing_data.json
+          if-no-files-found: warn
+          retention-days: 5

  e2e-full:
    name: singlecard-full
@@ -146,7 +172,28 @@ jobs:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+          if [ "${{ inputs.continue_on_error }}" = "true" ]; then
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-singlecard \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 2 \
+              --auto-upgrade-estimated-times \
+              --continue-on-error
+          else
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-singlecard \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 2
+          fi
+
+      - name: Upload timing data
+        uses: actions/upload-artifact@v4
+        if: ${{ inputs.continue_on_error == true }}
+        with:
+          name: timing-data-singlecard-full-part${{ matrix.part }}
+          path: test_timing_data.json
+          if-no-files-found: warn
+          retention-days: 5

  e2e-2-cards-light:
    name: multicard-2-light
@@ -210,7 +257,29 @@ jobs:
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-2card-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          if [ "${{ inputs.continue_on_error }}" = "true" ]; then
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-2card-light \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1 \
+              --auto-upgrade-estimated-times \
+              --continue-on-error
+          else
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-2card-light \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1
+          fi
+
+
+      - name: Upload timing data
+        uses: actions/upload-artifact@v4
+        if: ${{ inputs.continue_on_error == true }}
+        with:
+          name: timing-data-2card-light-part${{ matrix.part }}
+          path: test_timing_data.json
+          if-no-files-found: warn
+          retention-days: 5

  e2e-2-cards-full:
    name: multicard-2-full
@@ -274,7 +343,29 @@ jobs:
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-2-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          if [ "${{ inputs.continue_on_error }}" = "true" ]; then
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-multicard-2-cards \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1 \
+              --auto-upgrade-estimated-times \
+              --continue-on-error
+          else
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-multicard-2-cards \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1
+          fi
+
+
+      - name: Upload timing data
+        uses: actions/upload-artifact@v4
+        if: ${{ inputs.continue_on_error == true }}
+        with:
+          name: timing-data-2card-full-part${{ matrix.part }}
+          path: test_timing_data.json
+          if-no-files-found: warn
+          retention-days: 5

      - name: Run vllm-project/vllm-ascend test (non triton)
        if: ${{ inputs.type == 'full' && matrix.part == 0 }}
@@ -346,7 +437,29 @@ jobs:
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-4-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          if [ "${{ inputs.continue_on_error }}" = "true" ]; then
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-multicard-4-cards \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1 \
+              --auto-upgrade-estimated-times \
+              --continue-on-error
+          else
+            python3 .github/workflows/scripts/run_suite.py \
+              --suite e2e-multicard-4-cards \
+              --auto-partition-id "${{ matrix.part }}" \
+              --auto-partition-size 1
+          fi
+
+
+      - name: Upload timing data
+        uses: actions/upload-artifact@v4
+        if: ${{ inputs.continue_on_error == true }}
+        with:
+          name: timing-data-4card-full-part${{ matrix.part }}
+          path: test_timing_data.json
+          if-no-files-found: warn
+          retention-days: 5

  e2e_310p:
    name: 310p singlecard
--- a/.github/workflows/schedule_update_estimated_time.yaml
+++ b/.github/workflows/schedule_update_estimated_time.yaml
@@ -0,0 +1,111 @@
+name: Update estimated test times
+
+on:
+  schedule:
+    - cron: '0 2 * * 1'  # Every Monday at 02:00 UTC
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - 'main'
+    paths:
+      - '.github/workflows/schedule_update_estimated_time.yaml'
+
+permissions:
+  contents: write
+  pull-requests: write
+
+concurrency:
+  group: update-estimated-times-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e-test:
+    name: e2e-test
+    strategy:
+      matrix:
+        vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7]
+        type: [full, light]
+    uses: ./.github/workflows/_e2e_test.yaml
+    with:
+      vllm: ${{ matrix.vllm_version }}
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:main
+      contains_310: false
+      type: ${{ matrix.type }}
+      continue_on_error: true  # Continue even if some tests fail, we want to collect as much timing data as possible
+
+  update-estimated-times:
+    name: Update estimated_time in config.yaml
+    needs: [e2e-test]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Download all timing artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: timing-data-*
+          path: timing-artifacts/
+          merge-multiple: false
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pyyaml
+
+      - name: Update config.yaml from timing data
+        run: |
+          python3 .github/workflows/scripts/update_estimated_time.py \
+            --timing-dir timing-artifacts/ \
+            --config .github/workflows/scripts/config.yaml
+
+      - name: Check for changes
+        id: check_changes
+        run: |
+          if git diff --quiet .github/workflows/scripts/config.yaml; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+            echo "No changes to config.yaml."
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+            echo "config.yaml has been updated:"
+            git diff .github/workflows/scripts/config.yaml
+          fi
+
+      - name: Create pull request
+        if: steps.check_changes.outputs.changed == 'true' && github.event_name != 'pull_request'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          BRANCH="auto/update-estimated-times-${{ github.run_id }}"
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git checkout -b "$BRANCH"
+          git add .github/workflows/scripts/config.yaml
+          git commit -m "[CI] Auto-update estimated test times in config.yaml
+
+          Computed from timing-data artifacts of workflow run ${{ github.run_id }}.
+          Buffer ratio: 1.1x median, rounded to the nearest 10 s."
+          git push origin "$BRANCH"
+          gh pr create \
+            --repo "${{ github.repository }}" \
+            --base main \
+            --head "$BRANCH" \
+            --title "chore: Auto-update estimated test times in config.yaml" \
+            --body "## Summary
+
+          This PR was auto-generated by the **Update estimated test times** workflow.
+
+          It updates the \`estimated_time\` values in \`.github/workflows/scripts/config.yaml\`
+          based on actual elapsed times collected from workflow run \`${{ github.run_id }}\`.
+
+          ### Methodology
+          - Timing data is uploaded as \`timing-data-*\` artifacts by each e2e test job.
+          - For each test file, the **median** of all collected elapsed times is taken.
+          - A **10 % safety buffer** is applied and the result is rounded to the nearest 10 s.
+
+          Please review the diff and merge if the new values look reasonable."
--- a/.github/workflows/scripts/ci_utils.py
+++ b/.github/workflows/scripts/ci_utils.py
@@ -1,24 +1,13 @@
-import logging
-import os
 import subprocess
 import time
 from dataclasses import dataclass

-# Configure logger to output to stdout
-logging.basicConfig(level=logging.INFO, format="%(message)s")
-logger = logging.getLogger(__name__)

-
-class Colors:
+class _Color:
    HEADER = "\033[95m"
-    OKBLUE = "\033[94m"
-    OKCYAN = "\033[96m"
-    OKGREEN = "\033[92m"
-    WARNING = "\033[93m"
-    FAIL = "\033[91m"
-    ENDC = "\033[0m"
-    BOLD = "\033[1m"
-    UNDERLINE = "\033[4m"
+    GREEN = "\033[92m"
+    RED = "\033[91m"
+    RESET = "\033[0m"


@dataclass
@@ -28,74 +17,77 @@ class TestFile:
    is_skipped: bool = False


-def run_e2e_files(
+@dataclass
+class TestRecord:
+    name: str
+    passed: bool
+    elapsed: float
+    estimated: float
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "passed": self.passed,
+            "elapsed": self.elapsed,
+            "estimated": self.estimated,
+        }
+
+
+def run_tests(
    files: list[TestFile],
    continue_on_error: bool = False,
-):
+) -> tuple[int, list[TestRecord]]:
    """
-    Run a list of test files.
+    Run each TestFile with pytest and collect timing results.

    Args:
-        files: List of TestFile objects to run
-        continue_on_error: If True, continue running remaining tests even if one fails.
-                          If False, stop at first failure (default behavior for PR tests).
+        files: Tests to run (skipped entries should already be filtered out).
+        continue_on_error: If True, keep running after a failure.
+        report_path: If provided, write a Markdown timing report here.
+
+    Returns:
+        (exit_code, records) — exit_code is 0 on full success, -1 otherwise.
    """
-    tic = time.perf_counter()
-    success = True
-    passed_tests = []
-    failed_tests = []
+    records: list[TestRecord] = []
+    all_passed = True
+    total_start = time.perf_counter()

-    for i, file in enumerate(files):
-        filename, estimated_time = file.name, file.estimated_time
-
-        full_path = os.path.join(os.getcwd(), filename)
-        logger.info(f".\n.\n{Colors.HEADER}Begin ({i}/{len(files)}):{Colors.ENDC}\npytest -sv {full_path}\n.\n.\n")
-        file_tic = time.perf_counter()
-
-        process = subprocess.Popen(
-            ["pytest", "-sv", "--durations=0", "--color=yes", full_path],
-            stdout=None,
-            stderr=None,
-            env=os.environ,
-        )
-        process.wait()
-
-        elapsed = time.perf_counter() - file_tic
-        ret_code = process.returncode
-
-        logger.info(
-            f".\n.\n{Colors.HEADER}End ({i}/{len(files)}):{Colors.ENDC}\n{filename=}, \
-                {elapsed=:.0f}, {estimated_time=}\n.\n.\n"
+    for i, test in enumerate(files):
+        print(f"\n{'.' * 60}", flush=True)
+        print(
+            f"{_Color.HEADER}[{i + 1}/{len(files)}] START  {test.name}{_Color.RESET}",
+            flush=True,
        )

-        if ret_code == 0:
-            passed_tests.append(filename)
-        else:
-            logger.info(f"\n{Colors.FAIL}✗ FAILED: {filename} returned exit code {ret_code}{Colors.ENDC}\n")
-            failed_tests.append((filename, f"exit code {ret_code}"))
-            success = False
+        start = time.perf_counter()
+        result = subprocess.run(["pytest", "-sv", "--durations=0", "--color=yes", test.name])
+        elapsed = time.perf_counter() - start
+        passed = result.returncode == 0
+
+        records.append(TestRecord(name=test.name, passed=passed, elapsed=elapsed, estimated=test.estimated_time))
+
+        color = _Color.GREEN if passed else _Color.RED
+        status = "PASSED" if passed else f"FAILED (exit code {result.returncode})"
+        print(
+            f"{color}[{i + 1}/{len(files)}] {status}  {test.name}  ({elapsed:.0f}s){_Color.RESET}",
+            flush=True,
+        )
+
+        if not passed:
+            all_passed = False
            if not continue_on_error:
                break

-    elapsed_total = time.perf_counter() - tic
+    total_elapsed = time.perf_counter() - total_start
+    passed_count = sum(1 for r in records if r.passed)

-    if success:
-        logger.info(f"{Colors.OKGREEN}Success. Time elapsed: {elapsed_total:.2f}s{Colors.ENDC}")
-    else:
-        logger.info(f"{Colors.FAIL}Fail. Time elapsed: {elapsed_total:.2f}s{Colors.ENDC}")
+    print(f"\n{'=' * 60}")
+    color = _Color.GREEN if all_passed else _Color.RED
+    print(f"{color}Summary: {passed_count}/{len(files)} passed  ({total_elapsed:.2f}s total){_Color.RESET}")
+    print("=" * 60)
+    for r in records:
+        icon = f"{_Color.GREEN}✓{_Color.RESET}" if r.passed else f"{_Color.RED}✗{_Color.RESET}"
+        print(f"  {icon} {r.name}  ({r.elapsed:.0f}s)")
+    print(flush=True)

-    # Print summary
-    logger.info(f"\n{'=' * 60}")
-    logger.info(f"Test Summary: {Colors.OKGREEN}{len(passed_tests)}/{len(files)} passed{Colors.ENDC}")
-    logger.info(f"{'=' * 60}")
-    if passed_tests:
-        logger.info(f"{Colors.OKGREEN}✓ PASSED:{Colors.ENDC}")
-        for test in passed_tests:
-            logger.info(f"  {test}")
-    if failed_tests:
-        logger.info(f"\n{Colors.FAIL}✗ FAILED:{Colors.ENDC}")
-        for test, reason in failed_tests:
-            logger.info(f"  {test} ({reason})")
-    logger.info(f"{'=' * 60}\n")
-
-    return 0 if success else -1
+    return (0 if all_passed else -1), records
--- a/.github/workflows/scripts/run_suite.py
+++ b/.github/workflows/scripts/run_suite.py
@@ -1,244 +1,239 @@
 import argparse
+import json
 import os
+import sys
+from datetime import datetime, timezone
 from pathlib import Path

 import tabulate
 import yaml
-from ci_utils import TestFile, run_e2e_files
+from ci_utils import TestFile, TestRecord, run_tests
+
+_CONFIG_PATH = Path(__file__).parent / "config.yaml"


-def load_suites_from_config(config_path: str = "config.yaml") -> dict[str, list[TestFile]]:
-    # Get absolute path relative to this script
-    script_dir = Path(__file__).parent
-    abs_config_path = script_dir / config_path
-
-    with open(abs_config_path) as f:
-        suites_data = yaml.safe_load(f)
-
-    suites = {}
-
-    for suite_name, test_files in suites_data.items():
-        suites[suite_name] = []
-        for file_data in test_files:
-            name = file_data.get("name")
-            estimated_time = file_data.get("estimated_time", 60)
-            is_skipped = file_data.get("is_skipped", False)
-            suites[suite_name].append(TestFile(name, estimated_time, is_skipped))
-
-    return suites
+def load_suites(config_path: Path = _CONFIG_PATH) -> dict[str, list[TestFile]]:
+    """Load all test suites from config.yaml."""
+    data = yaml.safe_load(config_path.read_text())
+    return {
+        suite_name: [
+            TestFile(
+                name=entry["name"],
+                estimated_time=entry.get("estimated_time", 60),
+                is_skipped=entry.get("is_skipped", False),
+            )
+            for entry in entries
+        ]
+        for suite_name, entries in data.items()
+    }


-suites = load_suites_from_config()
-
-
-def auto_partition(files, rank, size):
+def partition(files: list[TestFile], rank: int, size: int) -> list[TestFile]:
    """
-    Partition files into size sublists with approximately equal sums of estimated times
-    using stable sorting, and return the partition for the specified rank.
-
-    Args:
-        files (list): List of file objects with estimated_time attribute
-        rank (int): Index of the partition to return (0 to size-1)
-        size (int): Number of partitions
-
-    Returns:
-        list: List of file objects in the specified rank's partition
+    Split non-skipped files into `size` groups of approximately equal estimated
+    time using a greedy algorithm, and return the group at index `rank`.
+    Files within the returned group are sorted ascending by estimated_time.
    """
-    # Filter out skipped files
-    files = [f for f in files if not f.is_skipped]
-    weights = [f.estimated_time for f in files]
-
-    if not weights or size <= 0 or size > len(weights):
+    active = [f for f in files if not f.is_skipped]
+    if not active or size <= 0 or size > len(active):
        return []

-    # Create list of (weight, original_index) tuples
-    # Using negative index as secondary key to maintain original order for equal weights
-    indexed_weights = [(w, -i) for i, w in enumerate(weights)]
-    # Stable sort in descending order by weight
-    # If weights are equal, larger (negative) index comes first (i.e., earlier original position)
-    indexed_weights = sorted(indexed_weights, reverse=True)
+    # Sort descending by weight; use original index as tiebreaker to be stable
+    indexed = sorted(enumerate(active), key=lambda x: (-x[1].estimated_time, x[0]))

-    # Extract original indices (negate back to positive)
-    indexed_weights = [(w, -i) for w, i in indexed_weights]
-
-    # Initialize partitions and their sums
-    partitions = [[] for _ in range(size)]
+    buckets: list[list[int]] = [[] for _ in range(size)]
    sums = [0.0] * size

-    # Greedy approach: assign each weight to partition with smallest current sum
-    for weight, idx in indexed_weights:
-        # Find partition with minimum sum
-        min_sum_idx = sums.index(min(sums))
-        partitions[min_sum_idx].append(idx)
-        sums[min_sum_idx] += weight
+    for idx, test in indexed:
+        lightest = sums.index(min(sums))
+        buckets[lightest].append(idx)
+        sums[lightest] += test.estimated_time

-    # Return the files corresponding to the indices in the specified rank's partition
-    indices = partitions[rank]
-    indices.sort(key=lambda i: files[i].estimated_time)
-    return [files[i] for i in indices]
+    return sorted([active[i] for i in buckets[rank]], key=lambda f: f.estimated_time)


-def _get_disk_covered_dirs(all_suite_files: set[str], project_root: Path | str) -> list[str]:
-    covered_dirs = set()
-    for file_path in all_suite_files:
-        # e.g. tests/e2e/singlecard/test_foo.py -> tests/e2e/singlecard
-        parent_dir = (project_root / file_path).parent if os.path.isfile(file_path) else (project_root / file_path)
-        if parent_dir.exists():
-            # Store relative path to project root
-            try:
-                rel_dir = parent_dir.relative_to(project_root)
-
-                # Check if this directory is already covered by a parent directory
-                is_covered = False
-                for existing_dir in list(covered_dirs):
-                    # If existing_dir is a parent of rel_dir, rel_dir is already covered
-                    if existing_dir in rel_dir.parents or existing_dir == rel_dir:
-                        is_covered = True
-                        break
-                    # If rel_dir is a parent of existing_dir, replace existing_dir with rel_dir
-                    elif rel_dir in existing_dir.parents:
-                        covered_dirs.remove(existing_dir)
-                        # We continue checking other existing_dirs, but we know rel_dir should be added
-                        # unless another parent covers it (which is handled by the first if block logic effectively
-                        # but we need to be careful with modification during iteration, so we use list copy)
-
-                if not is_covered:
-                    covered_dirs.add(rel_dir)
-
-            except ValueError:
-                pass
-    return covered_dirs
+def _find_project_root() -> Path:
+    root = Path.cwd()
+    if (root / "tests").exists():
+        return root
+    # Fall back: assume script lives at .github/workflows/scripts/
+    return Path(__file__).parents[3]


-def _sanity_check_suites(suites: dict[str, list[TestFile]]):
+def _minimal_covered_dirs(file_paths: set[str], root: Path) -> set[Path]:
+    """Return the minimal set of directories that covers all file_paths."""
+    dirs: set[Path] = set()
+    for fp in file_paths:
+        candidate = (root / fp).parent
+        if not candidate.exists():
+            continue
+        try:
+            rel = candidate.relative_to(root)
+        except ValueError:
+            continue
+        # Drop any existing entries that are subdirectories of rel
+        dirs = {d for d in dirs if rel not in d.parents}
+        # Only add rel if no ancestor already covers it
+        if not any(d == rel or d in rel.parents for d in dirs):
+            dirs.add(rel)
+    return dirs
+
+
+def sanity_check(suites: dict[str, list[TestFile]]) -> None:
    """
-    Check if all test files defined in the suites exist on disk.
+    Verify that:
+    1. Every test file in any suite exists on disk.
+    2. No test_*.py files exist on disk (in covered dirs) that are absent from all suites.
+    Raises SystemExit with a descriptive message on failure.
    """
-    # 1. Collect all test files defined in all suites
-    all_suite_files = set()
-    for suite in suites.values():
-        for test_file in suite:
-            # Handle ::test_case syntax
-            file_path = test_file.name.split("::")[0]
-            all_suite_files.add(file_path)
+    suite_files = {f.name.split("::")[0] for tests in suites.values() for f in tests}
+    root = _find_project_root()
+    covered = _minimal_covered_dirs(suite_files, root)

-    # 2. Identify all directories covered by the suites
-    project_root = Path.cwd()
-    if not (project_root / "tests").exists():
-        script_dir = Path(__file__).parent
-        # .github/workflows/scripts -> ../../../ -> root
-        project_root = script_dir.parents[2]
-    # For now, we only check dirs under [tests/e2e/singlecard, tests/e2e/multicard]
-    covered_dirs = _get_disk_covered_dirs(all_suite_files, project_root)
+    disk_files = {str(p.relative_to(root)) for d in covered for p in (root / d).rglob("test_*.py")}

-    # 3. Scan disk for all test_*.py files in these directories
-    all_disk_files = set()
-    for dir_path in covered_dirs:
-        full_dir_path = project_root / dir_path
-        # rglob is equivalent to glob('**/' + pattern)
-        for py_file in full_dir_path.rglob("test_*.py"):
-            try:
-                rel_path = py_file.relative_to(project_root)
-                all_disk_files.add(str(rel_path))
-            except ValueError:
-                pass
+    missing_from_suite = sorted(disk_files - suite_files)
+    if missing_from_suite:
+        entries = "\n".join(f'  TestFile("{f}"),' for f in missing_from_suite)
+        raise SystemExit(f"Test files on disk are not in any suite (add them or mark is_skipped=True):\n{entries}")

-    # 4. Find files on disk but missing from ANY suite
-    # We check if a disk file is present in 'all_suite_files' (union of all suites)
-    missing_files = sorted(list(all_disk_files - all_suite_files))
+    missing_from_disk = sorted(suite_files - disk_files)
+    if missing_from_disk:
+        entries = "\n".join(f'  TestFile("{f}"),' for f in missing_from_disk)
+        raise SystemExit(f"Test files listed in suite do not exist on disk:\n{entries}")

-    missing_text = "\n".join(f'TestFile("{x}"),' for x in missing_files)

-    if missing_files:
-        assert len(missing_files) == 0, (
-            f"Some test files found on disk in covered directories are not in ANY test suite.\n"
-            f"Scanned directories: {sorted([str(d) for d in covered_dirs])}\n"
-            f"Missing files:\n"
-            f"{missing_text}\n"
-            f"If this is intentional, please label them as 'is_skipped=True' and add them to the test suite."
-        )
+def _print_plan(
+    suite: str,
+    files: list[TestFile],
+    skipped: list[TestFile],
+    partition_info: str,
+) -> None:
+    print(tabulate.tabulate([[suite, partition_info]], headers=["Suite", "Partition"], tablefmt="psql"))
+    total_est = sum(f.estimated_time for f in files)
+    print(f"✅ Enabled {len(files)} test(s)  (est. total {total_est:.1f}s):")
+    for f in files:
+        print(f"  - {f.name}  (est={f.estimated_time}s)")
+    if skipped:
+        print(f"\n❌ Skipped {len(skipped)} test(s) (consider recovering):")
+        for f in skipped:
+            print(f"  - {f.name}")
+    print(flush=True)

-    # 5. check if all files in suites exist on disk
-    non_existent_files = sorted(list(all_suite_files - all_disk_files))
-    non_existent_text = "\n".join(f'TestFile("{x}"),' for x in non_existent_files)
-    assert len(non_existent_files) == 0, (
-        f"Some test files in test suite do not exist on disk:\n"
-        f"{non_existent_text}\n"
-        f"Please check if the test files are correctly specified in the local repository."
+
+def _print_results(
+    suite: str,
+    records: list[TestRecord],
+    skipped: list[TestFile],
+    partition_info: str,
+) -> None:
+    print(tabulate.tabulate([[suite, partition_info]], headers=["Suite", "Partition"], tablefmt="psql"))
+    total_elapsed = sum(r.elapsed for r in records)
+    passed_count = sum(1 for r in records if r.passed)
+    print(f"Results: {passed_count}/{len(records)} passed  (actual total {total_elapsed:.1f}s):")
+    for r in records:
+        status = "✅ PASSED" if r.passed else "❌ FAILED"
+        print(f"  {status}  {r.name}  (actual={r.elapsed:.0f}s  est={r.estimated:.0f}s)")
+    if skipped:
+        print(f"\n❌ Skipped {len(skipped)} test(s) (consider recovering):")
+        for f in skipped:
+            print(f"  - {f.name}")
+    print(flush=True)
+
+
+def _save_timing_json(
+    records: list[TestRecord],
+    suite: str,
+    partition_id: int | None,
+    partition_size: int | None,
+    output_path: Path,
+) -> None:
+    passed_suites = [r.to_dict() for r in records if r.passed]
+    payload = {
+        "suite": suite,
+        "partition_id": partition_id,
+        "partition_size": partition_size,
+        "commit_sha": os.environ.get("GITHUB_SHA", ""),
+        "github_run_id": os.environ.get("GITHUB_RUN_ID", ""),
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "tests": passed_suites,
+    }
+    output_path.write_text(json.dumps(payload, indent=2))
+    print(
+        f"Timing data written to {output_path}  ({len(passed_suites)}/{len(records)} passed)",
+        flush=True,
    )


-def main():
-    arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument(
+def main() -> None:
+    suites = load_suites()
+
+    parser = argparse.ArgumentParser(description="Run a named e2e test suite")
+    parser.add_argument(
        "--suite",
-        type=str,
-        default=list(suites.keys())[0],
-        choices=list(suites.keys()) + ["all"],
-        help="The suite to run",
+        required=True,
+        choices=list(suites.keys()),
+        help="Name of the test suite to run",
    )
-    arg_parser.add_argument(
+    parser.add_argument(
        "--auto-partition-id",
        type=int,
-        help="Use auto load balancing. The part id.",
+        default=None,
+        metavar="ID",
+        help="Zero-based partition index (requires --auto-partition-size)",
    )
-    arg_parser.add_argument(
+    parser.add_argument(
        "--auto-partition-size",
        type=int,
-        help="Use auto load balancing. The number of parts.",
+        default=None,
+        metavar="N",
+        help="Total number of partitions",
    )
-    arg_parser.add_argument(
+    parser.add_argument(
+        "--auto-upgrade-estimated-times",
+        action="store_true",
+        help="Automatically update estimated times in config.yaml based on actual timings (default: False) \
+If enabled, the script always exit with 0, even if some tests fail, since the primary purpose is to gather \
+timing data to improve estimates.",
+    )
+    parser.add_argument(
        "--continue-on-error",
        action="store_true",
-        default=True,
-        help="Continue running remaining tests even if one fails (useful for nightly tests)",
+        help="Continue running after a test failure (default: True)",
    )
-    args = arg_parser.parse_args()
-    print(f"{args=}")
+    parser.add_argument(
+        "--timing-report-json",
+        type=Path,
+        default=Path("test_timing_data.json"),
+        help="Path to write the JSON timing data for CI aggregation",
+    )
+    args = parser.parse_args()

-    _sanity_check_suites(suites)
-    files = suites[args.suite]
+    sanity_check(suites)

-    files_disabled = [f for f in files if f.is_skipped]
+    all_files = suites[args.suite]
+    skipped = [f for f in all_files if f.is_skipped]

-    if args.auto_partition_size:
-        files = auto_partition(files, args.auto_partition_id, args.auto_partition_size)
-
-    # Print test info at beginning (similar to test/run_suite.py pretty_print_tests)
-    if args.auto_partition_size:
-        partition_info = (
-            f"{args.auto_partition_id + 1}/{args.auto_partition_size} (0-based id={args.auto_partition_id})"
-        )
+    if args.auto_partition_size is not None:
+        files = partition(all_files, args.auto_partition_id, args.auto_partition_size)
+        partition_info = f"{args.auto_partition_id + 1}/{args.auto_partition_size}"
    else:
+        files = [f for f in all_files if not f.is_skipped]
        partition_info = "full"

-    headers = ["Suite", "Partition"]
-    rows = [[args.suite, partition_info]]
-    msg = tabulate.tabulate(rows, headers=headers, tablefmt="psql") + "\n"
+    _print_plan(args.suite, files, skipped, partition_info)

-    total_est_time = sum(f.estimated_time for f in files)
-    msg += f"✅ Enabled {len(files)} test(s) (est total {total_est_time:.1f}s):\n"
-    for f in files:
-        msg += f"  - {f.name} (est_time={f.estimated_time})\n"
-    msg += f"\n❌ Disabled {len(files_disabled)} test(s)(Please consider to recover them):\n"
-    for f in files_disabled:
-        msg += f"  - {f.name} (est_time={f.estimated_time})\n"
-
-    print(msg, flush=True)
-
-    exit_code = run_e2e_files(
+    exit_code, records = run_tests(
        files,
        continue_on_error=args.continue_on_error,
    )

-    # Print tests again at the end for visibility
-    msg = "\n" + tabulate.tabulate(rows, headers=headers, tablefmt="psql") + "\n"
-    msg += f"✅ Executed {len(files)} test(s) (est total {total_est_time:.1f}s):\n"
-    for f in files:
-        msg += f"  - {f.name} (est_time={f.estimated_time})\n"
-    print(msg, flush=True)
+    _save_timing_json(records, args.suite, args.auto_partition_id, args.auto_partition_size, args.timing_report_json)

-    exit(exit_code)
+    _print_results(args.suite, records, skipped, partition_info)
+    if args.auto_upgrade_estimated_times:
+        sys.exit(0)
+    sys.exit(exit_code)


 if __name__ == "__main__":
--- a/.github/workflows/scripts/update_estimated_time.py
+++ b/.github/workflows/scripts/update_estimated_time.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""
+Update estimated_time in config.yaml from CI timing data.
+
+Usage:
+    python3 update_estimated_time.py \
+        --timing-dir ./timing-artifacts \
+        --config .github/workflows/scripts/config.yaml
+"""
+
+import argparse
+import json
+from pathlib import Path
+
+import yaml
+
+
+def collect_timings(timing_dir: Path) -> dict[str, int]:
+    """
+    Recursively scan timing_dir for JSON files produced by run_suite.py.
+    Returns {test_name: elapsed_seconds} for all passed tests.
+    Warns if the same test name appears in multiple files.
+    """
+    json_files = list(timing_dir.rglob("*.json"))
+    print(f"Found {len(json_files)} timing file(s) in {timing_dir}")
+
+    timings: dict[str, int] = {}
+    for path in json_files:
+        try:
+            data = json.loads(path.read_text())
+        except (json.JSONDecodeError, OSError) as e:
+            print(f"  Warning: skipping {path}: {e}")
+            continue
+
+        for test in data.get("tests", []):
+            if not test.get("passed", False):
+                continue
+            name: str = test.get("name", "")
+            elapsed: float = test.get("elapsed", 0.0)
+            if not name or elapsed <= 0:
+                continue
+            if name in timings:
+                print(f"  Warning: duplicate entry for '{name}', overwriting {timings[name]}s with {int(elapsed)}s")
+            timings[name] = int(elapsed)
+
+    return timings
+
+
+def update_config(config_path: Path, timings: dict[str, int]) -> int:
+    """
+    Load config.yaml, update estimated_time for each test found in timings,
+    and write the result back. Returns the number of changed entries.
+    """
+    configs: dict = yaml.safe_load(config_path.read_text())
+
+    changed = 0
+    for suite_tests in configs.values():
+        for test in suite_tests:
+            name: str = test.get("name", "")
+            if name not in timings:
+                continue
+            old_time: int = test.get("estimated_time", 0)
+            new_time: int = timings[name]
+            if old_time == new_time:
+                continue
+            test["estimated_time"] = new_time
+            print(f"  {name}: {old_time}s -> {new_time}s")
+            changed += 1
+
+    config_path.write_text(yaml.dump(configs, default_flow_style=False, allow_unicode=True, sort_keys=False))
+    return changed
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Update estimated_time in config.yaml from CI timing data")
+    parser.add_argument(
+        "--timing-dir",
+        required=True,
+        type=Path,
+        help="Directory containing timing JSON files (searched recursively)",
+    )
+    parser.add_argument(
+        "--config",
+        default=".github/workflows/scripts/config.yaml",
+        type=Path,
+        help="Path to config.yaml (default: .github/workflows/scripts/config.yaml)",
+    )
+    args = parser.parse_args()
+
+    timings = collect_timings(args.timing_dir)
+    if not timings:
+        print("No timing data collected. Exiting without changes.")
+        return
+
+    print(f"\nCollected timing data for {len(timings)} test(s).")
+    print(f"Updating {args.config}...")
+    changed = update_config(args.config, timings)
+    print(f"\nDone. {changed} estimated_time value(s) changed.")
+
+
+if __name__ == "__main__":
+    main()