ci: fix night-ci with push retry mechanism (#11765)

This commit is contained in:
Mick
2025-10-24 02:31:05 +08:00
committed by GitHub
parent ab07cd3e5a
commit 8bd26dd4e6
2 changed files with 61 additions and 33 deletions

View File

@@ -7,6 +7,8 @@ import base64
import json import json
import os import os
import sys import sys
import time
from urllib.error import HTTPError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None):
try: try:
with urlopen(req) as response: with urlopen(req) as response:
return response.read().decode("utf-8") return response.read().decode("utf-8")
except Exception as e: except HTTPError as e:
print(f"GitHub API request failed: {e}") print(f"GitHub API request failed: {e}")
if hasattr(e, "read"): try:
try: error_body = e.read().decode("utf-8")
error_body = e.read().decode("utf-8") print(f"Error response body: {error_body}")
print(f"Error response body: {error_body}") e.error_body = error_body # Attach for later inspection
except: except Exception:
pass e.error_body = ""
raise
except Exception as e:
print(f"GitHub API request failed with a non-HTTP error: {e}")
raise raise
@@ -196,37 +201,60 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
) )
sys.exit(1) sys.exit(1)
try: max_retries = 5
# Get current branch head retry_delay = 5 # seconds
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
print(f"Current branch head: {branch_sha}")
# Get current tree for attempt in range(max_retries):
tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token) try:
print(f"Current tree SHA: {tree_sha}") # Get current branch head
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
print(f"Current branch head: {branch_sha}")
# Create new tree with all files # Get current tree
new_tree_sha = create_tree( tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
repo_owner, repo_name, tree_sha, files_to_upload, token print(f"Current tree SHA: {tree_sha}")
)
print(f"Created new tree: {new_tree_sha}")
# Create commit # Create new tree with all files
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)" new_tree_sha = create_tree(
commit_sha = create_commit( repo_owner, repo_name, tree_sha, files_to_upload, token
repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token )
) print(f"Created new tree: {new_tree_sha}")
print(f"Created commit: {commit_sha}")
# Update branch reference # Create commit
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token) commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
print("Updated branch reference") commit_sha = create_commit(
repo_owner,
repo_name,
new_tree_sha,
branch_sha,
commit_message,
token,
)
print(f"Created commit: {commit_sha}")
print("Successfully published all traces in a single commit") # Update branch reference
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
print("Updated branch reference")
except Exception as e: print("Successfully published all traces in a single commit")
print(f"Failed to publish traces: {e}") return
raise
except Exception as e:
is_ff_error = False
if (
hasattr(e, "error_body")
and "Update is not a fast forward" in e.error_body
):
is_ff_error = True
if is_ff_error and attempt < max_retries - 1:
print(
f"Attempt {attempt + 1} failed: not a fast-forward update. Retrying in {retry_delay} seconds..."
)
time.sleep(retry_delay)
else:
print(f"Failed to publish traces: {e}")
raise
def main(): def main():

View File

@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = {
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings( ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"] "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1), ): ModelEvalMetrics(0.29, 37.0),
ModelLaunchSettings( ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503" "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7), ): ModelEvalMetrics(0.310, 16.7),