Files
sglang/scripts/code_sync/copy_to_oss.py

426 lines
14 KiB
Python
Raw Normal View History

2025-08-30 16:02:29 -07:00
"""
Sync a specific commit from the local private repo to the OSS upstream and open a PR.
NOTE:
1. You need to execute this script in the git root folder.
2. A GH_TOKEN environment variable is required to create the pull request.
- see also https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
This script will:
1. Take a commit hash as an argument (or use the latest commit by default).
2. Create a patch for that commit.
3. Filter the patch to only include changes in specified directories.
4. Clone the sgl-project/sglang repository.
5. Create a new branch in the OSS repo.
6. Apply the filtered patch, commit, and force push.
7. Open a pull request to the OSS repo using the GitHub CLI (gh).
Usage:
# Sync the latest commit from the current branch
python3 scripts/copy_to_oss.py
# Run the full sync and PR creation process for a given commit
python3 scripts/copy_to_oss.py --commit <commit_hash>
# Perform a dry run without making any actual changes
python3 scripts/copy_to_oss.py --commit <commit_hash> --dry-run
"""
import argparse
import datetime
import os
import shutil
import subprocess
import tempfile
# --- Configuration Begin ---
# List of folders and files to copy to the OSS repo.
# Changes outside these paths will be ignored.
folder_names = [
"3rdparty",
"assets",
"benchmark",
"docker",
"docs",
"examples",
"sgl-kernel",
"README.md",
"python/sglang/lang",
"python/sglang/srt",
"python/sglang/test",
"test/lang",
"test/srt",
]
# --- Configuration End ---
def write_github_step_summary(content):
if not os.environ.get("GITHUB_STEP_SUMMARY"):
return
with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
f.write(content)
def get_commit_info(commit_ref):
"""
Retrieves the hash and message of a specific commit.
Args:
commit_ref (str): The commit hash, tag, or branch to inspect (e.g., 'HEAD').
Returns:
A tuple containing the (commit_hash, commit_message),
or (None, None) if an error occurs.
"""
try:
# Use a custom format to get the hash (%H) and the full message (%B)
# separated by a null character for safe parsing.
command = ["git", "log", "-1", f"--pretty=%H%x00%B", commit_ref]
result = subprocess.run(
command, capture_output=True, text=True, check=True, encoding="utf-8"
)
# Split the output by the null character separator
commit_hash, commit_message = result.stdout.strip().split("\x00", 1)
return commit_hash, commit_message
except FileNotFoundError:
print("❌ Error: 'git' command not found. Is Git installed and in your PATH?")
except subprocess.CalledProcessError as e:
print(f"❌ Error getting commit info for '{commit_ref}': {e.stderr.strip()}")
print(
"Hint: Make sure you are running this from within a Git repository and the commit exists."
)
return None, None
def check_dependencies():
"""Check for required command-line tools."""
if not shutil.which("git"):
raise EnvironmentError("git is not installed or not in PATH.")
if not shutil.which("gh"):
raise EnvironmentError("GitHub CLI (gh) is not installed or not in PATH.")
print("✅ All dependencies (git, gh) are available.")
def create_filtered_patch(commit_hash, dry_run):
"""
Create a patch file for the given commit, containing only changes
to files and directories specified in `folder_names`.
"""
print(f"Creating a filtered patch for commit {commit_hash}")
try:
# Get the list of all files changed in the commit
changed_files_raw = subprocess.run(
["git", "diff-tree", "--no-commit-id", "--name-only", "-r", commit_hash],
capture_output=True,
text=True,
check=True,
).stdout
changed_files = changed_files_raw.strip().split("\n")
# Filter the list of files
relevant_files = [
f for f in changed_files if any(f.startswith(path) for path in folder_names)
]
if not relevant_files:
msg = "\n😴 No relevant file changes found in this commit. Exiting."
print(msg)
write_github_step_summary(msg)
return None, None
print("Found relevant changes in the following files:")
for f in relevant_files:
print(f" - {f}")
# Create a patch containing only the changes for the relevant files
patch_command = [
"git",
"format-patch",
"--stdout",
f"{commit_hash}^..{commit_hash}",
"--",
] + relevant_files
print(f"Run: {' '.join(patch_command)}")
patch_content = subprocess.run(
patch_command, capture_output=True, text=True, check=True
).stdout
# Save the patch to a temporary file
patch_file = tempfile.NamedTemporaryFile(
mode="w", delete=False, suffix=".patch", encoding="utf-8"
)
patch_file.write(patch_content)
patch_file.close()
print(f"✅ Filtered patch created successfully at: {patch_file.name}")
return patch_file.name, relevant_files
except subprocess.CalledProcessError as e:
print(f"Error creating patch: {e.stderr}")
raise
def get_oss_repo(dry_run):
"""
Clones the OSS repository into a temporary directory.
Returns the path to the repo root and the temp directory itself.
"""
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
print("⚠️ Warning: GH_TOKEN environment variable not set. Skipping PR creation.")
if not dry_run:
return
temp_dir = tempfile.mkdtemp()
oss_root = os.path.join(temp_dir, "sglang")
print(f"\nCreated temporary directory for OSS repo: {temp_dir}")
repo_url = f"https://{gh_token}@github.com/sgl-project/sglang.git"
command = ["git", "clone", "--branch", "main", repo_url, oss_root]
print(f"Run: {' '.join(command)}")
if not dry_run:
try:
subprocess.run(command, check=True, capture_output=True)
print(f"✅ Successfully cloned repository to {oss_root}")
except subprocess.CalledProcessError as e:
print(f"Error cloning repository: {e.stderr.decode()}")
shutil.rmtree(temp_dir)
raise
return oss_root, temp_dir
def apply_patch_and_push(oss_root, patch_file, branch_name, commit_message, dry_run):
"""
In the OSS repo, create a branch, apply the patch, commit, and push.
"""
print("\nApplying patch and pushing to OSS repo...")
original_cwd = os.getcwd()
if not dry_run:
os.chdir(oss_root)
try:
# Define commands as lists to avoid shell injection issues
commands_to_run = [
["git", "checkout", "-b", branch_name],
["git", "apply", patch_file],
["git", "config", "user.name", "github-actions[bot]"],
[
"git",
"config",
"user.email",
"github-actions[bot]@users.noreply.github.com",
],
["git", "add", "."],
]
for cmd_list in commands_to_run:
print(f"Run: {' '.join(cmd_list)}")
if not dry_run:
subprocess.run(cmd_list, check=True, capture_output=True, text=True)
# Handle commit separately to pass multi-line message safely via stdin
commit_cmd = ["git", "commit", "-F", "-"]
print(f"Run: {' '.join(commit_cmd)}")
if not dry_run:
print(f"Commit Message:\n---\n{commit_message}\n---")
subprocess.run(
commit_cmd,
input=commit_message,
text=True,
check=True,
capture_output=True,
)
# Push the changes
push_cmd = ["git", "push", "origin", branch_name, "--force"]
print(f"Run: {' '.join(push_cmd)}")
if not dry_run:
subprocess.run(push_cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
print(f"Git command failed: {e.stderr}")
raise
finally:
if not dry_run:
os.chdir(original_cwd)
print("✅ Branch created, patch applied, and pushed successfully.")
def create_pull_request(oss_root, branch_name, title, body, dry_run):
"""Create a pull request in the OSS repo using the GitHub CLI."""
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
print("⚠️ Warning: GH_TOKEN environment variable not set. Skipping PR creation.")
if not dry_run:
return
print("\nCreating pull request...")
command = [
"gh",
"pr",
"create",
"--base",
"main",
"--head",
branch_name,
"--repo",
"sgl-project/sglang",
"--title",
title,
"--body",
body,
]
print(f"Run: {' '.join(command)}")
if not dry_run:
env = os.environ.copy()
env["GH_TOKEN"] = gh_token
try:
result = subprocess.run(
command,
check=True,
capture_output=True,
text=True,
env=env,
cwd=oss_root,
)
msg = f"✅ Successfully created pull request: {result.stdout.strip()}"
print(msg)
write_github_step_summary(msg)
except subprocess.CalledProcessError as e:
print(f"Error creating pull request: {e.stderr}")
# Check if a PR already exists
if "A pull request for" in e.stderr and "already exists" in e.stderr:
print(" A PR for this branch likely already exists.")
else:
raise
def get_commit_author(commit_hash):
"""Get the author name and email of a commit."""
try:
author_name = subprocess.run(
["git", "show", "-s", "--format=%an", commit_hash],
capture_output=True,
text=True,
check=True,
).stdout.strip()
author_email = subprocess.run(
["git", "show", "-s", "--format=%ae", commit_hash],
capture_output=True,
text=True,
check=True,
).stdout.strip()
return author_name, author_email
except subprocess.CalledProcessError as e:
print(f"Error getting commit author for {commit_hash}: {e.stderr}")
raise
def main():
parser = argparse.ArgumentParser(
description="Copy a commit from the private repo to OSS and open a PR."
)
parser.add_argument(
"--commit",
type=str,
default="LAST",
help="The commit hash to sync. Defaults to 'LAST' to use the latest commit.",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Dry run the script without executing git, rsync, or gh commands.",
)
args = parser.parse_args()
check_dependencies()
commit_ref = "HEAD" if args.commit == "LAST" else args.commit
commit_hash, original_commit_message = get_commit_info(commit_ref)
if not commit_hash:
return # Exit if we couldn't get commit info
# Display the details of the commit being processed
if args.commit == "LAST":
summary = (
f"\n No commit specified. Using the last commit:\n"
f" - **Hash:** `{commit_hash}`\n"
f" - **Message:** {original_commit_message}\n\n"
)
else:
summary = (
f"\n Using specified commit:\n"
f" - **Hash:** `{commit_hash}`\n"
f" - **Message:** {original_commit_message}\n\n"
)
print(summary)
write_github_step_summary(summary)
short_hash = commit_hash[:8]
patch_file = None
temp_dir = None
try:
# 1. Create a filtered patch from the local repo
patch_file, relevant_files = create_filtered_patch(commit_hash, args.dry_run)
if not patch_file:
return
# 2. Get the OSS repo
oss_root, temp_dir = get_oss_repo(args.dry_run)
# 3. Get original commit author for the co-author line
author_name, author_email = get_commit_author(commit_hash)
# 4. Prepare content for the commit and PR based on changed files
file_list_str = "\n".join([f"- {f}" for f in relevant_files])
filename_list_str = ", ".join([f.split("/")[-1] for f in relevant_files])
if len(filename_list_str) > 40:
filename_list_str = filename_list_str[:40] + "..."
current_date = datetime.datetime.now().strftime("%Y%m%d")
pr_title = f"[Auto Sync] Update {filename_list_str} ({current_date})"
pr_body = (
f"Sync changes from commit `{short_hash}`.\n\n"
f"**Relevant Files Changed:**\n{file_list_str}"
"\n\n---\n\n"
"*This is an automated PR created by a script.*"
)
# 5. Create branch, apply patch, and push
branch_name = f"sync-{short_hash}-{current_date}"
co_author_line = f"Co-authored-by: {author_name} <{author_email}>"
commit_message = f"{pr_title}\n\n{co_author_line}"
apply_patch_and_push(
oss_root, patch_file, branch_name, commit_message, args.dry_run
)
# 6. Create Pull Request
create_pull_request(oss_root, branch_name, pr_title, pr_body, args.dry_run)
finally:
# Cleanup temporary files
if patch_file and os.path.exists(patch_file):
os.remove(patch_file)
print(f"\nRemoved temporary patch file: {patch_file}")
if temp_dir and os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
print(f"Removed temporary directory: {temp_dir}")
if __name__ == "__main__":
main()