ci: fix night-ci with push retry mechanism (#11765)

This commit is contained in:
Mick
2025-10-24 02:31:05 +08:00
committed by GitHub
parent ab07cd3e5a
commit 8bd26dd4e6
2 changed files with 61 additions and 33 deletions

View File

@@ -7,6 +7,8 @@ import base64
import json import json
import os import os
import sys import sys
import time
from urllib.error import HTTPError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None):
try: try:
with urlopen(req) as response: with urlopen(req) as response:
return response.read().decode("utf-8") return response.read().decode("utf-8")
except Exception as e: except HTTPError as e:
print(f"GitHub API request failed: {e}") print(f"GitHub API request failed: {e}")
if hasattr(e, "read"):
try: try:
error_body = e.read().decode("utf-8") error_body = e.read().decode("utf-8")
print(f"Error response body: {error_body}") print(f"Error response body: {error_body}")
except: e.error_body = error_body # Attach for later inspection
pass except Exception:
e.error_body = ""
raise
except Exception as e:
print(f"GitHub API request failed with a non-HTTP error: {e}")
raise raise
@@ -196,6 +201,10 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
) )
sys.exit(1) sys.exit(1)
max_retries = 5
retry_delay = 5 # seconds
for attempt in range(max_retries):
try: try:
# Get current branch head # Get current branch head
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token) branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
@@ -214,7 +223,12 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
# Create commit # Create commit
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)" commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
commit_sha = create_commit( commit_sha = create_commit(
repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token repo_owner,
repo_name,
new_tree_sha,
branch_sha,
commit_message,
token,
) )
print(f"Created commit: {commit_sha}") print(f"Created commit: {commit_sha}")
@@ -223,8 +237,22 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
print("Updated branch reference") print("Updated branch reference")
print("Successfully published all traces in a single commit") print("Successfully published all traces in a single commit")
return
except Exception as e: except Exception as e:
is_ff_error = False
if (
hasattr(e, "error_body")
and "Update is not a fast forward" in e.error_body
):
is_ff_error = True
if is_ff_error and attempt < max_retries - 1:
print(
f"Attempt {attempt + 1} failed: not a fast-forward update. Retrying in {retry_delay} seconds..."
)
time.sleep(retry_delay)
else:
print(f"Failed to publish traces: {e}") print(f"Failed to publish traces: {e}")
raise raise

View File

@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = {
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings( ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"] "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1), ): ModelEvalMetrics(0.29, 37.0),
ModelLaunchSettings( ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503" "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7), ): ModelEvalMetrics(0.310, 16.7),