ci: fix night-ci with push retry mechanism (#11765)
This commit is contained in:
@@ -7,6 +7,8 @@ import base64
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
from urllib.error import HTTPError
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None):
|
|||||||
try:
|
try:
|
||||||
with urlopen(req) as response:
|
with urlopen(req) as response:
|
||||||
return response.read().decode("utf-8")
|
return response.read().decode("utf-8")
|
||||||
except Exception as e:
|
except HTTPError as e:
|
||||||
print(f"GitHub API request failed: {e}")
|
print(f"GitHub API request failed: {e}")
|
||||||
if hasattr(e, "read"):
|
try:
|
||||||
try:
|
error_body = e.read().decode("utf-8")
|
||||||
error_body = e.read().decode("utf-8")
|
print(f"Error response body: {error_body}")
|
||||||
print(f"Error response body: {error_body}")
|
e.error_body = error_body # Attach for later inspection
|
||||||
except:
|
except Exception:
|
||||||
pass
|
e.error_body = ""
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
print(f"GitHub API request failed with a non-HTTP error: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
@@ -196,37 +201,60 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
|
|||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
try:
|
max_retries = 5
|
||||||
# Get current branch head
|
retry_delay = 5 # seconds
|
||||||
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
|
|
||||||
print(f"Current branch head: {branch_sha}")
|
|
||||||
|
|
||||||
# Get current tree
|
for attempt in range(max_retries):
|
||||||
tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
|
try:
|
||||||
print(f"Current tree SHA: {tree_sha}")
|
# Get current branch head
|
||||||
|
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
|
||||||
|
print(f"Current branch head: {branch_sha}")
|
||||||
|
|
||||||
# Create new tree with all files
|
# Get current tree
|
||||||
new_tree_sha = create_tree(
|
tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
|
||||||
repo_owner, repo_name, tree_sha, files_to_upload, token
|
print(f"Current tree SHA: {tree_sha}")
|
||||||
)
|
|
||||||
print(f"Created new tree: {new_tree_sha}")
|
|
||||||
|
|
||||||
# Create commit
|
# Create new tree with all files
|
||||||
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
|
new_tree_sha = create_tree(
|
||||||
commit_sha = create_commit(
|
repo_owner, repo_name, tree_sha, files_to_upload, token
|
||||||
repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token
|
)
|
||||||
)
|
print(f"Created new tree: {new_tree_sha}")
|
||||||
print(f"Created commit: {commit_sha}")
|
|
||||||
|
|
||||||
# Update branch reference
|
# Create commit
|
||||||
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
|
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
|
||||||
print("Updated branch reference")
|
commit_sha = create_commit(
|
||||||
|
repo_owner,
|
||||||
|
repo_name,
|
||||||
|
new_tree_sha,
|
||||||
|
branch_sha,
|
||||||
|
commit_message,
|
||||||
|
token,
|
||||||
|
)
|
||||||
|
print(f"Created commit: {commit_sha}")
|
||||||
|
|
||||||
print("Successfully published all traces in a single commit")
|
# Update branch reference
|
||||||
|
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
|
||||||
|
print("Updated branch reference")
|
||||||
|
|
||||||
except Exception as e:
|
print("Successfully published all traces in a single commit")
|
||||||
print(f"Failed to publish traces: {e}")
|
return
|
||||||
raise
|
|
||||||
|
except Exception as e:
|
||||||
|
is_ff_error = False
|
||||||
|
if (
|
||||||
|
hasattr(e, "error_body")
|
||||||
|
and "Update is not a fast forward" in e.error_body
|
||||||
|
):
|
||||||
|
is_ff_error = True
|
||||||
|
|
||||||
|
if is_ff_error and attempt < max_retries - 1:
|
||||||
|
print(
|
||||||
|
f"Attempt {attempt + 1} failed: not a fast-forward update. Retrying in {retry_delay} seconds..."
|
||||||
|
)
|
||||||
|
time.sleep(retry_delay)
|
||||||
|
else:
|
||||||
|
print(f"Failed to publish traces: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = {
|
|||||||
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
|
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
|
||||||
ModelLaunchSettings(
|
ModelLaunchSettings(
|
||||||
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
|
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
|
||||||
): ModelEvalMetrics(0.29, 29.1),
|
): ModelEvalMetrics(0.29, 37.0),
|
||||||
ModelLaunchSettings(
|
ModelLaunchSettings(
|
||||||
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
|
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
|
||||||
): ModelEvalMetrics(0.310, 16.7),
|
): ModelEvalMetrics(0.310, 16.7),
|
||||||
|
|||||||
Reference in New Issue
Block a user