#!/usr/bin/env python3
"""
Fetch all commits between two tags from a GitHub repository.

Usage:
    python fetch_commits.py [--token YOUR_GITHUB_TOKEN]
"""

import argparse
import os
import re
from collections import deque

import dotenv
import requests

# Load .env.local first (higher priority), then .env as fallback
dotenv.load_dotenv(".env.local")
dotenv.load_dotenv()  # .env as fallback


def _build_headers(token: str | None = None) -> dict:
    """Build the request headers for the GitHub REST API.

    Args:
        token: Optional GitHub token; when given, an Authorization header is added.

    Returns:
        Header dict suitable for passing to ``requests.get``.
    """
    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if token:
        headers["Authorization"] = f"token {token}"
    return headers


def get_github_token() -> str | None:
    """Return the GitHub token from GITHUB_TOKEN or GH_TOKEN (None if neither has a value)."""
    return os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")


def resolve_tag_to_sha(base_url: str, tag: str, headers: dict) -> str:
    """Resolve a tag name to its commit SHA.

    Args:
        base_url: GitHub API base URL for the repo
        tag: Tag name to resolve
        headers: Request headers

    Returns:
        The SHA the tag ref points to; for annotated tags, the SHA of the
        object the tag object points to (when that lookup succeeds).

    Raises:
        Exception: If the tag ref cannot be fetched.
    """
    print(f"Resolving tag {tag}...")
    tag_resp = requests.get(f"{base_url}/git/refs/tags/{tag}", headers=headers)
    if tag_resp.status_code != 200:
        raise Exception(f"Failed to get tag {tag}: {tag_resp.text}")

    tag_data = tag_resp.json()
    sha = tag_data["object"]["sha"]

    # If it's an annotated tag, we need to get the commit it points to.
    # NOTE: if this second lookup fails, the tag object's own SHA is returned as-is.
    if tag_data["object"]["type"] == "tag":
        tag_obj_resp = requests.get(f"{base_url}/git/tags/{sha}", headers=headers)
        if tag_obj_resp.status_code == 200:
            sha = tag_obj_resp.json()["object"]["sha"]

    return sha


def resolve_commit_sha(base_url: str, commit_ref: str, headers: dict) -> str:
    """Resolve a commit reference (SHA or short SHA) to full SHA.

    Raises:
        Exception: If the commit cannot be fetched.
    """
    print(f"Resolving commit {commit_ref}...")
    commit_resp = requests.get(f"{base_url}/commits/{commit_ref}", headers=headers)
    if commit_resp.status_code != 200:
        raise Exception(f"Failed to get commit {commit_ref}: {commit_resp.text}")
    return commit_resp.json()["sha"]


def get_default_branch_head(base_url: str, headers: dict) -> tuple[str, str]:
    """
    Get the HEAD commit of the default branch.

    Returns:
        Tuple of (branch_name, head_sha)

    Raises:
        Exception: If the repository or the branch cannot be fetched.
    """
    print("Getting default branch HEAD...")

    # Get repository info to find default branch
    repo_resp = requests.get(base_url, headers=headers)
    if repo_resp.status_code != 200:
        raise Exception(f"Failed to get repository info: {repo_resp.text}")

    default_branch = repo_resp.json()["default_branch"]
    print(f" Default branch: {default_branch}")

    # Get the HEAD commit of the default branch
    branch_resp = requests.get(f"{base_url}/branches/{default_branch}", headers=headers)
    if branch_resp.status_code != 200:
        raise Exception(f"Failed to get branch {default_branch}: {branch_resp.text}")

    head_sha = branch_resp.json()["commit"]["sha"]
    print(f" HEAD: {head_sha[:8]}")

    return (default_branch, head_sha)


def get_all_tags(base_url: str, headers: dict) -> list[dict]:
    """Get all tags from the repository, following pagination.

    Returns:
        The tag dicts exactly as returned by the ``/tags`` endpoint.

    Raises:
        Exception: If any page request fails.
    """
    print("Fetching all tags...")
    all_tags = []
    page = 1
    per_page = 100

    while True:
        resp = requests.get(
            f"{base_url}/tags",
            headers=headers,
            params={"per_page": per_page, "page": page},
        )
        if resp.status_code != 200:
            raise Exception(f"Failed to get tags: {resp.text}")

        tags = resp.json()
        if not tags:
            break

        all_tags.extend(tags)
        page += 1

        # A short page means there is nothing left to fetch
        if len(tags) < per_page:
            break

    print(f" Found {len(all_tags)} tags")
    return all_tags


def get_commit_date(base_url: str, sha: str, headers: dict) -> str | None:
    """Get the committer date for a given SHA, or None if the commit cannot be fetched."""
    commit_resp = requests.get(f"{base_url}/commits/{sha}", headers=headers)
    if commit_resp.status_code != 200:
        return None
    return commit_resp.json()["commit"]["committer"]["date"]


def find_previous_tag(
    base_url: str, head_sha: str, headers: dict, tag_pattern: str | None = None
) -> tuple[str, str] | None:
    """
    Find the closest earlier tag relative to the given commit.

    Each tag is compared against ``head_sha`` with the Compare API. Tags whose
    comparison status is "ahead" or "diverged" and whose commit date is older
    than the head commit are candidates; the one with the smallest ``ahead_by``
    wins.

    Args:
        base_url: GitHub API base URL
        head_sha: The commit SHA to search from
        headers: Request headers
        tag_pattern: Optional regex pattern to filter tags (e.g., r'^v\\d+\\.\\d+\\.\\d+$')

    Returns:
        Tuple of (tag_name, tag_sha) or None if no tag found
    """
    print(f"Finding previous tag before commit {head_sha[:8]}...")

    # Get the date of the head commit
    head_date = get_commit_date(base_url, head_sha, headers)
    if not head_date:
        print(" Warning: Could not get head commit date")
        return None
    print(f" Head commit date: {head_date}")

    # Get all tags
    all_tags = get_all_tags(base_url, headers)

    # Filter tags by pattern if provided
    if tag_pattern:
        pattern = re.compile(tag_pattern)
        all_tags = [t for t in all_tags if pattern.match(t["name"])]
        print(f" After pattern filter: {len(all_tags)} tags")

    # For each tag, compare it against head_sha and get its date
    tag_candidates = []
    for tag in all_tags:
        tag_name = tag["name"]
        tag_commit_sha = tag["commit"]["sha"]

        # Skip if this is the same commit as head
        if tag_commit_sha == head_sha:
            continue

        # Compare with the tag as base and head_sha as head
        compare_resp = requests.get(f"{base_url}/compare/{tag_commit_sha}...{head_sha}", headers=headers)
        if compare_resp.status_code != 200:
            continue

        compare_data = compare_resp.json()
        # "ahead" means head has commits on top of the tag. "diverged" tags are
        # accepted too (presumably for tags cut on release branches -- confirm).
        if compare_data.get("status") in ["ahead", "diverged"]:
            # ISO 8601 timestamps are compared as strings
            tag_date = get_commit_date(base_url, tag_commit_sha, headers)
            if tag_date and tag_date < head_date:
                tag_candidates.append(
                    {
                        "name": tag_name,
                        "sha": tag_commit_sha,
                        "date": tag_date,
                        "ahead_by": compare_data.get("ahead_by", 0),
                    }
                )
                print(f" Found candidate: {tag_name} ({compare_data.get('ahead_by', 0)} commits behind)")

    if not tag_candidates:
        print(" No previous tag found")
        return None

    # The smallest ahead_by gives us the closest tag (first one wins on ties)
    best_tag = min(tag_candidates, key=lambda x: x["ahead_by"])
    print(f" Selected: {best_tag['name']} ({best_tag['ahead_by']} commits behind)")

    return (best_tag["name"], best_tag["sha"])


def fetch_commits_between_tags(
    owner: str, repo: str, base_tag: str, head_tag: str, token: str | None = None
) -> list[dict]:
    """
    Fetch all commits between two tags by walking the commit graph.

    This method traverses from head_tag back to base_tag, collecting all commits.
    It properly handles the commit history and doesn't rely on date filtering.

    NOTE(review): the walk only stops at the base commit itself. Parents that are
    reachable through a path that does not pass through the base commit are
    followed as well, one request per commit.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        base_tag: Base tag (older, e.g., 'v0.11.2')
        head_tag: Head tag (newer, e.g., 'v0.12.0')
        token: Optional GitHub token for higher rate limits

    Returns:
        List of commit dictionaries

    Raises:
        Exception: If either tag cannot be resolved.
    """
    headers = _build_headers(token)

    base_url = f"https://api.github.com/repos/{owner}/{repo}"

    # Resolve tags to commit SHAs
    base_sha = resolve_tag_to_sha(base_url, base_tag, headers)
    head_sha = resolve_tag_to_sha(base_url, head_tag, headers)

    print(f"\nBase SHA ({base_tag}): {base_sha}")
    print(f"Head SHA ({head_tag}): {head_sha}")

    # First, use Compare API to get total commit count (for progress info)
    print(f"\nComparing {base_tag}...{head_tag}...")
    compare_resp = requests.get(f"{base_url}/compare/{base_sha}...{head_sha}", headers=headers)
    if compare_resp.status_code == 200:
        compare_data = compare_resp.json()
        total_commits = compare_data.get("total_commits", "unknown")
        print(f"Total commits to fetch: {total_commits}")

    # Walk the commit history from head to base
    # We use the commits API starting from head_sha and stop when we reach base_sha
    all_commits = []
    seen_shas = set()
    seen_shas.add(base_sha)  # Don't include the base commit itself

    # BFS traversal of commit graph; deque gives O(1) pops from the front
    to_visit = deque([head_sha])

    print(f"\nFetching commits from {head_tag} back to {base_tag}...")

    while to_visit:
        current_sha = to_visit.popleft()

        if current_sha in seen_shas:
            continue
        seen_shas.add(current_sha)

        # Fetch commit details
        commit_resp = requests.get(f"{base_url}/commits/{current_sha}", headers=headers)
        if commit_resp.status_code != 200:
            print(f" Warning: Failed to fetch commit {current_sha[:8]}")
            continue

        commit = commit_resp.json()
        all_commits.append(commit)

        # Add parent commits to visit queue
        for parent in commit.get("parents", []):
            parent_sha = parent["sha"]
            if parent_sha not in seen_shas:
                to_visit.append(parent_sha)

        # Progress logging
        if len(all_commits) % 50 == 0:
            print(f" Fetched {len(all_commits)} commits...")

    print(f" Completed: {len(all_commits)} commits fetched")
    return all_commits


def fetch_commits_by_date_range(
    owner: str,
    repo: str,
    since: str,
    until: str,
    token: str | None = None,
    branch: str | None = None,
) -> list[dict]:
    """
    Fetch all commits within a date range.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        since: Start date (ISO 8601 format, e.g., '2025-01-01' or '2025-01-01T00:00:00Z')
        until: End date (ISO 8601 format, e.g., '2025-01-31' or '2025-01-31T23:59:59Z')
        token: Optional GitHub token for higher rate limits
        branch: Optional branch name (defaults to repository's default branch)

    Returns:
        List of commit dictionaries

    Raises:
        Exception: If any page request fails.
    """
    headers = _build_headers(token)

    base_url = f"https://api.github.com/repos/{owner}/{repo}"
    per_page = 100

    # Normalize date format - add time if not present
    if len(since) == 10:  # YYYY-MM-DD format
        since = f"{since}T00:00:00Z"
    if len(until) == 10:  # YYYY-MM-DD format
        until = f"{until}T23:59:59Z"

    print(f"\nFetching commits from {since} to {until}...")
    if branch:
        print(f" Branch: {branch}")

    all_commits = []
    page = 1

    while True:
        params = {"since": since, "until": until, "per_page": per_page, "page": page}
        if branch:
            params["sha"] = branch

        response = requests.get(f"{base_url}/commits", headers=headers, params=params)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch commits: {response.text}")

        commits = response.json()
        if not commits:
            break

        all_commits.extend(commits)
        print(f" Page {page}: fetched {len(commits)} commits (total: {len(all_commits)})")

        if len(commits) < per_page:
            break
        page += 1

    print(f" Completed: {len(all_commits)} commits fetched")
    return all_commits


def get_merge_base(base_url: str, base_sha: str, head_sha: str, headers: dict) -> str | None:
    """
    Get the merge base (common ancestor) of two commits.

    Args:
        base_url: GitHub API base URL for the repo
        base_sha: First commit SHA
        head_sha: Second commit SHA
        headers: Request headers

    Returns:
        Merge base commit SHA, or None if not found
    """
    # GitHub Compare API returns merge_base_commit
    compare_resp = requests.get(
        f"{base_url}/compare/{base_sha}...{head_sha}",
        headers=headers,
    )
    if compare_resp.status_code != 200:
        return None

    compare_data = compare_resp.json()
    # "or {}" guards against an explicit null merge_base_commit in the response
    merge_base = (compare_data.get("merge_base_commit") or {}).get("sha")
    return merge_base


def fetch_commits_by_walking_history(
    base_url: str,
    base_sha: str,
    head_sha: str,
    base_tag: str,
    head_tag: str,
    headers: dict,
    stop_sha: str | None = None,
) -> list[dict]:
    """
    Fetch commits by walking the commit history from head to a stop point.

    This method correctly handles release branches with cherry-picks.
    It walks the head's commit history until it reaches the stop commit.

    Args:
        base_url: GitHub API base URL for the repo
        base_sha: Base commit SHA (for display purposes)
        head_sha: Head commit SHA (newer)
        base_tag: Display name for base reference
        head_tag: Display name for head reference
        headers: Request headers
        stop_sha: SHA to stop at (if None, uses base_sha)

    Returns:
        List of commit dictionaries (excluding stop commit)
    """
    per_page = 100
    all_commits = []
    page = 1

    target_sha = stop_sha or base_sha

    print(f"\nWalking commit history from {head_tag} back to {base_tag}...")
    print(f" Stop SHA: {target_sha[:8]}")

    while True:
        response = requests.get(
            f"{base_url}/commits",
            headers=headers,
            params={"sha": head_sha, "per_page": per_page, "page": page},
        )

        if response.status_code != 200:
            # Best effort: return what was collected so far
            print(f" Warning: API error on page {page}, stopping")
            break

        commits = response.json()
        if not commits:
            print(f" No more commits found on page {page}")
            break

        found_stop = False
        for commit in commits:
            if commit["sha"] == target_sha:
                # Reached stop commit, stop (don't include it)
                found_stop = True
                break
            all_commits.append(commit)

        print(f" Page {page}: fetched {len(commits)} commits (total: {len(all_commits)})")

        if found_stop:
            print(f" Reached stop commit ({target_sha[:8]})")
            break

        if len(commits) < per_page:
            print(" Warning: Reached end of history without finding stop commit")
            break

        page += 1

    return all_commits


def fetch_commits_between_tags_fast(
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
    head_is_commit: bool = False,
    base_is_commit: bool = False,
) -> list[dict]:
    """
    Fetch all commits between two tags using GitHub Compare API with pagination.

    This properly fetches only the commits between the two tags.
    Automatically handles diverged branches (e.g., release branches with cherry-picks)
    by falling back to walking the commit history, and by dropping commits whose
    PR number already appears in the base side of the history.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        base_tag: Base tag (older, e.g., 'v0.11.2') or commit SHA if base_is_commit=True
        head_tag: Head tag (newer, e.g., 'v0.12.0') or commit SHA if head_is_commit=True
        token: Optional GitHub token for higher rate limits
        head_is_commit: If True, treat head_tag as a commit SHA instead of a tag
        base_is_commit: If True, treat base_tag as a commit SHA instead of a tag

    Returns:
        List of commit dictionaries

    Raises:
        Exception: If a reference cannot be resolved or the initial comparison fails.
    """
    headers = _build_headers(token)

    base_url = f"https://api.github.com/repos/{owner}/{repo}"
    per_page = 100

    # Resolve to commit SHAs
    if base_is_commit:
        base_sha = resolve_commit_sha(base_url, base_tag, headers)
    else:
        base_sha = resolve_tag_to_sha(base_url, base_tag, headers)

    if head_is_commit:
        head_sha = resolve_commit_sha(base_url, head_tag, headers)
    else:
        head_sha = resolve_tag_to_sha(base_url, head_tag, headers)

    print(f"\nBase SHA ({base_tag}): {base_sha}")
    print(f"Head SHA ({head_tag}): {head_sha}")

    # Use Compare API to check relationship and get commits
    print(f"\nComparing {base_tag}...{head_tag}...")
    compare_resp = requests.get(
        f"{base_url}/compare/{base_sha}...{head_sha}",
        headers=headers,
        params={"per_page": per_page},
    )

    if compare_resp.status_code != 200:
        raise Exception(f"Failed to compare: {compare_resp.text}")

    compare_data = compare_resp.json()
    status = compare_data.get("status", "unknown")
    total_commits = compare_data.get("total_commits", 0)

    print(f" Comparison status: {status}")
    print(f" Total commits: {total_commits}")

    # Get merge_base for potential fallback ("or {}" guards against an explicit null)
    merge_base = (compare_data.get("merge_base_commit") or {}).get("sha")

    # If branches have diverged (e.g., release branch with cherry-picks),
    # we need to filter by PR numbers to avoid duplicates
    is_diverged = status == "diverged"

    if is_diverged:
        print("\n Branches have diverged (likely a release branch scenario)")
        print(" Will filter by PR numbers to handle cherry-picks...")
        if merge_base:
            print(f" Merge base: {merge_base[:8]}")

    # Use Compare API results
    all_commits = compare_data.get("commits", [])
    print(f" Initial fetch: {len(all_commits)} commits")

    if len(all_commits) >= total_commits:
        # No early return here: the pagination and history-walk steps below are
        # no-ops in this case, and the diverged-branch filter must still run.
        print(" All commits fetched in initial response")

    # Need to paginate - try Compare API pagination first
    page = 1
    while len(all_commits) < total_commits:
        page += 1
        print(f" Fetching page {page}...")

        compare_resp = requests.get(
            f"{base_url}/compare/{base_sha}...{head_sha}",
            headers=headers,
            params={"per_page": per_page, "page": page},
        )

        if compare_resp.status_code != 200:
            # Compare API doesn't support pagination well for large diffs
            print(" Compare API pagination not supported, using commit walk...")
            break

        page_data = compare_resp.json()
        page_commits = page_data.get("commits", [])

        if not page_commits:
            break

        all_commits.extend(page_commits)
        print(f" Page {page}: got {len(page_commits)} commits (total: {len(all_commits)})")

    # If we still don't have all commits, walk the history
    if len(all_commits) < total_commits:
        print(f"\n Need to fetch remaining {total_commits - len(all_commits)} commits via history walk...")

        # For diverged branches, use merge_base as stop point
        # For non-diverged, use base_sha
        stop_sha = merge_base if (status == "diverged" and merge_base) else base_sha

        # Get commits we already have
        seen_shas = {c["sha"] for c in all_commits}

        # Walk from head, collecting commits not already seen, until we reach stop point
        walk_commits = []
        walk_page = 1
        found_stop = False

        while len(all_commits) + len(walk_commits) < total_commits and not found_stop:
            response = requests.get(
                f"{base_url}/commits",
                headers=headers,
                params={"sha": head_sha, "per_page": per_page, "page": walk_page},
            )

            if response.status_code != 200:
                print(f" Warning: API error on page {walk_page}")
                break

            commits = response.json()
            if not commits:
                break

            for commit in commits:
                sha = commit["sha"]
                if sha == stop_sha:
                    found_stop = True
                    break
                if sha not in seen_shas:
                    seen_shas.add(sha)
                    walk_commits.append(commit)

            print(f" Walk page {walk_page}: found {len(walk_commits)} additional commits")
            walk_page += 1

        # Compare API commits come first, followed by the extra unique commits from the walk
        all_commits.extend(walk_commits)
        print(f" Total after walk: {len(all_commits)} commits")

    # For diverged branches, filter out commits whose PRs are already in base release
    # This handles cherry-picks that exist in both releases
    if is_diverged:
        print(f"\n Filtering out PRs already in {base_tag}...")

        # Get base release commits to extract PR numbers
        print(f" Fetching {base_tag} commits...")
        base_commits = []
        base_page = 1

        while True:
            response = requests.get(
                f"{base_url}/commits",
                headers=headers,
                params={"sha": base_sha, "per_page": per_page, "page": base_page},
            )
            if response.status_code != 200:
                break

            commits = response.json()
            if not commits:
                break

            for commit in commits:
                if merge_base and commit["sha"] == merge_base:
                    break
                base_commits.append(commit)
            else:
                # Merge base not on this page: keep paging
                base_page += 1
                continue
            # Inner loop hit the merge base: stop paging
            break

        print(f" Found {len(base_commits)} commits in {base_tag}")

        # Extract PR numbers from base commits
        base_pr_numbers = set()
        for commit in base_commits:
            message = commit.get("commit", {}).get("message", "")
            pr_num = extract_pr_number(message)
            if pr_num:
                base_pr_numbers.add(pr_num)

        print(f" Found {len(base_pr_numbers)} unique PRs in {base_tag}")

        # Filter out commits whose PR is already in base
        filtered_commits = []
        skipped_count = 0
        for commit in all_commits:
            message = commit.get("commit", {}).get("message", "")
            pr_num = extract_pr_number(message)
            if pr_num and pr_num in base_pr_numbers:
                skipped_count += 1
                continue
            filtered_commits.append(commit)

        print(f" Skipped {skipped_count} commits (PRs already in {base_tag})")
        print(f" Final count: {len(filtered_commits)} new commits in {head_tag}")

        return filtered_commits

    return all_commits


def extract_contributors(commits: list[dict]) -> dict:
    """
    Extract unique contributors from commits.

    Commits with a GitHub user (``author.login``) are grouped by login. Other
    commits are grouped by the git author email; commits with neither a login
    nor an email are not counted.

    Returns a dict with:
    - by_login: dict mapping login -> contributor info
    - by_email: dict mapping email -> contributor info (for commits without GitHub user)
    - total: number of distinct logins plus number of distinct emails
    """
    contributors_by_login = {}
    contributors_by_email = {}

    for commit in commits:
        # Try to get GitHub user info first (author field)
        author = commit.get("author")
        if author and author.get("login"):
            login = author["login"]
            if login not in contributors_by_login:
                # Name and email are taken from the first commit seen for this login
                contributors_by_login[login] = {
                    "login": login,
                    "name": commit.get("commit", {}).get("author", {}).get("name", ""),
                    "email": commit.get("commit", {}).get("author", {}).get("email", ""),
                    "avatar_url": author.get("avatar_url", ""),
                    "html_url": author.get("html_url", ""),
                    "commits": 0,
                }
            contributors_by_login[login]["commits"] += 1
        else:
            # Fallback to git author info
            git_author = commit.get("commit", {}).get("author", {})
            email = git_author.get("email", "")
            name = git_author.get("name", "")
            if email and email not in contributors_by_email:
                contributors_by_email[email] = {
                    "login": None,
                    "name": name,
                    "email": email,
                    "avatar_url": "",
                    "html_url": "",
                    "commits": 0,
                }
            if email:
                contributors_by_email[email]["commits"] += 1

    return {
        "by_login": contributors_by_login,
        "by_email": contributors_by_email,
        "total": len(contributors_by_login) + len(contributors_by_email),
    }


def get_tag_date(base_url: str, tag: str, headers: dict) -> str | None:
    """Get the committer date of a tag's commit, or None if any lookup fails."""
    # First resolve the tag to a commit SHA
    tag_resp = requests.get(f"{base_url}/git/refs/tags/{tag}", headers=headers)
    if tag_resp.status_code != 200:
        return None

    tag_data = tag_resp.json()
    sha = tag_data["object"]["sha"]

    # If it's an annotated tag, get the underlying commit
    if tag_data["object"]["type"] == "tag":
        tag_obj_resp = requests.get(f"{base_url}/git/tags/{sha}", headers=headers)
        if tag_obj_resp.status_code == 200:
            sha = tag_obj_resp.json()["object"]["sha"]

    # Get the commit date
    commit_resp = requests.get(f"{base_url}/commits/{sha}", headers=headers)
    if commit_resp.status_code == 200:
        return commit_resp.json()["commit"]["committer"]["date"]
    return None


def check_contributor_is_new(owner: str, repo: str, login: str, before_date: str, headers: dict) -> bool:
    """
    Check if a contributor has any commits before a given date.

    Returns True if this is their first contribution (no commits before the date).
    Returns False when commits are found or when the API request fails.
    """
    base_url = f"https://api.github.com/repos/{owner}/{repo}"

    # Search for commits by this author before the base tag date
    response = requests.get(
        f"{base_url}/commits",
        headers=headers,
        params={"author": login, "until": before_date, "per_page": 1},
    )

    if response.status_code == 200:
        commits = response.json()
        # If no commits found before the date, they're a new contributor
        return len(commits) == 0

    return False


def find_first_contribution(commits: list[dict], login: str) -> dict | None:
    """
    Find the first (earliest) contribution by a user in the commit list.

    The list is assumed to be ordered newest first, so the last matching commit
    is returned.

    Returns the commit dict or None.
    """
    user_commits = []
    for commit in commits:
        author = commit.get("author")
        if author and author.get("login") == login:
            user_commits.append(commit)

    # Commits are usually newest first, so the last match is the oldest one
    if user_commits:
        return user_commits[-1]
    return None


def calculate_new_contributors_via_generate_notes(
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
) -> list[dict]:
    """
    Calculate new contributors using GitHub's generate-notes API.

    This is more accurate than checking commit history because GitHub
    tracks contributor status internally.

    The API is called through the ``gh`` CLI; an empty list is returned if the
    CLI is missing, fails, or times out.

    Args:
        owner: Repository owner
        repo: Repository name
        base_tag: The base tag (older version)
        head_tag: The head tag (newer version)
        token: GitHub token (not used by this function; the gh CLI is invoked as-is)

    Returns:
        List of new contributor info dicts with login and first_pr fields
    """
    import subprocess

    print("\nGetting new contributors via GitHub generate-notes API...")

    # Use gh CLI to call the generate-notes API
    cmd = [
        "gh",
        "api",
        f"repos/{owner}/{repo}/releases/generate-notes",
        "-f",
        f"tag_name={head_tag}",
        "-f",
        f"target_commitish={head_tag}",
        "-f",
        f"previous_tag_name={base_tag}",
        "--jq",
        ".body",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
        if result.returncode != 0:
            print(f" Warning: gh CLI failed: {result.stderr}")
            return []

        body = result.stdout

        # Parse new contributors from the generated notes
        # Format: "* @username made their first contribution in https://github.com/owner/repo/pull/12345"
        pattern = r"\* @(\S+) made their first contribution in https://github\.com/[^/]+/[^/]+/pull/(\d+)"
        matches = re.findall(pattern, body)

        new_contributors = []
        for login, pr_number in matches:
            new_contributors.append(
                {
                    "login": login,
                    "first_pr": pr_number,
                }
            )

        print(f" Found {len(new_contributors)} new contributors")
        return new_contributors

    except subprocess.TimeoutExpired:
        print(" Warning: gh CLI timed out")
        return []
    except FileNotFoundError:
        print(" Warning: gh CLI not found, falling back to legacy method")
        return []


def calculate_new_contributors(
    commits: list[dict],
    current_contributors: dict,
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str = "",
    token: str | None = None,
) -> list[dict]:
    """
    Calculate which contributors are new (first-time) in this release.

    First tries GitHub's generate-notes API (more accurate), then falls back
    to checking commit history if that fails or finds nobody.

    Args:
        commits: List of commits in the current release
        current_contributors: Output from extract_contributors()
        owner: Repository owner
        repo: Repository name
        base_tag: The base tag (older version)
        head_tag: The head tag (newer version)
        token: GitHub token

    Returns:
        List of new contributor info dicts with first_pr field
    """
    # Try the accurate method first (via generate-notes API)
    if head_tag:
        new_contributors = calculate_new_contributors_via_generate_notes(
            owner=owner,
            repo=repo,
            base_tag=base_tag,
            head_tag=head_tag,
            token=token,
        )
        if new_contributors:
            return new_contributors

    # Fall back to legacy method (checking commit history)
    print("\nFalling back to legacy new contributor detection...")

    headers = _build_headers(token)

    base_url = f"https://api.github.com/repos/{owner}/{repo}"

    # Get the date of the base tag
    print("Getting base tag date...")
    base_date = get_tag_date(base_url, base_tag, headers)
    if not base_date:
        print(f" Warning: Could not get date for tag {base_tag}")
        return []

    print(f" Base tag date: {base_date}")

    new_contributors = []
    logins = list(current_contributors["by_login"].keys())
    total = len(logins)

    print(f"\nChecking {total} contributors for first-time status...")

    # One API request per login; contributors known only by email are not checked
    for i, login in enumerate(logins):
        if (i + 1) % 20 == 0:
            print(f" Checked {i + 1}/{total} contributors...")

        is_new = check_contributor_is_new(owner, repo, login, base_date, headers)
        if is_new:
            info = current_contributors["by_login"][login].copy()

            # Find their first PR in this release
            first_commit = find_first_contribution(commits, login)
            if first_commit:
                message = first_commit.get("commit", {}).get("message", "")
                pr_number = extract_pr_number(message)
                info["first_pr"] = pr_number
                info["first_commit_sha"] = first_commit.get("sha", "")[:8]

            new_contributors.append(info)

    print(f" Found {len(new_contributors)} new contributors (legacy method)")
    return new_contributors


def generate_contributor_stats(
    commits: list[dict],
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
    check_new: bool = True,
) -> dict:
    """
    Generate contributor statistics for the release.

    Prints a report to stdout and returns the same data.

    Args:
        commits: List of commit dictionaries for the release
        owner: Repository owner
        repo: Repository name
        base_tag: The base tag (older version)
        head_tag: The head tag (newer version)
        token: GitHub token
        check_new: If False, skip new-contributor detection

    Returns a dict with all statistics data.
    """
    print("\n" + "=" * 60)
    print("CONTRIBUTOR STATISTICS")
    print("=" * 60)

    # Extract contributors from current commits
    contributors = extract_contributors(commits)

    print(f"\nTotal commits: {len(commits)}")
    print(f"Total contributors: {contributors['total']}")
    print(f" - With GitHub account: {len(contributors['by_login'])}")
    print(f" - Without GitHub account (by email): {len(contributors['by_email'])}")

    new_count = 0
    new_contributors_list = []

    if check_new:
        # Calculate new contributors (tries GitHub generate-notes API first, then falls back to commit history)
        new_contributors_list = calculate_new_contributors(
            commits=commits,
            current_contributors=contributors,
            owner=owner,
            repo=repo,
            base_tag=base_tag,
            head_tag=head_tag,
            token=token,
        )

        new_count = len(new_contributors_list)
        print(f"\nNew contributors (first-time): {new_count}")

        if new_contributors_list:
            print("\nNew contributors list:")
            for c in sorted(new_contributors_list, key=lambda x: x["login"].lower()):
                pr_info = f" in #{c['first_pr']}" if c.get("first_pr") else ""
                print(f" - @{c['login']} made their first contribution{pr_info}")

    # Print summary line for release notes
    print("\n" + "-" * 60)
    print("RELEASE NOTES SUMMARY LINE:")
    print("-" * 60)
    if check_new:
        summary_line = (
            f"This release features {len(commits)} commits from {contributors['total']} contributors ({new_count} new)!"
        )
    else:
        summary_line = f"This release features {len(commits)} commits from {contributors['total']} contributors!"
    print(summary_line)
    print("-" * 60)

    # Get all contributors sorted by commit count
    all_contributors_list = list(contributors["by_login"].values()) + list(contributors["by_email"].values())
    sorted_contributors = sorted(all_contributors_list, key=lambda x: x["commits"], reverse=True)

    # Print top contributors
    print("\nTop contributors by commit count:")
    for i, c in enumerate(sorted_contributors[:20], 1):
        if c.get("login"):
            print(f" {i:2}. @{c['login']:20} - {c['commits']:3} commits")
        else:
            print(f" {i:2}. {c['name']:20} - {c['commits']:3} commits (no GitHub account)")

    return {
        "total_commits": len(commits),
        "total_contributors": contributors["total"],
        "new_contributors": new_count if check_new else None,
        "new_contributors_list": new_contributors_list,
        "contributors": contributors,
        "sorted_contributors": sorted_contributors,
        "summary_line": summary_line,
        "base_tag": base_tag,
        "head_tag": head_tag,
        "owner": owner,
        "repo": repo,
    }


def save_contributor_stats(stats: dict, output_file: str, owner: str, repo: str):
    """
    Save contributor statistics to a markdown file.

    Args:
        stats: Statistics dict from generate_contributor_stats()
        output_file: Output file path
        owner: Repository owner
        repo: Repository name
    """
    lines = []

    # Header
    lines.append(f"# Contributor Statistics: {stats['base_tag']} → {stats['head_tag']}")
    lines.append("")

    # Summary for release notes
    lines.append("## Release Notes Summary")
    lines.append("")
    lines.append(f"> {stats['summary_line']}")
    lines.append("")

    # Overview stats
    lines.append("## Overview")
    lines.append("")
    lines.append(f"- **Total Commits**: {stats['total_commits']}")
    lines.append(f"- **Total Contributors**: {stats['total_contributors']}")
    if stats["new_contributors"] is not None:
        lines.append(f"- **New Contributors**: {stats['new_contributors']}")
    lines.append("")

    # Top contributors table
    lines.append("## Top Contributors")
    lines.append("")
    lines.append("| Rank | Contributor | Commits |")
    lines.append("|------|-------------|---------|")
    for i, c in enumerate(stats["sorted_contributors"][:30], 1):
        if c.get("login"):
            contributor_link = f"[@{c['login']}](https://github.com/{c['login']})"
        else:
            contributor_link = c["name"]
        lines.append(f"| {i} | {contributor_link} | {c['commits']} |")
    lines.append("")

    # New contributors section
    if stats["new_contributors_list"]:
        lines.append("## New Contributors 🎉")
        lines.append("")
        sorted_new = sorted(stats["new_contributors_list"], key=lambda x: x["login"].lower())
        for c in sorted_new:
            pr_num = c.get("first_pr")
            if pr_num:
                pr_link = f"https://github.com/{owner}/{repo}/pull/{pr_num}"
                lines.append(f"* @{c['login']} made their first contribution in {pr_link}")
            else:
                lines.append(f"* @{c['login']} made their first contribution")
        lines.append("")

    # All contributors section (collapsed)
    # The HTML wrapper tags were missing from these literals (leaving them broken
    # across lines); <details>/<summary> is what makes the section collapsible.
    lines.append("## All Contributors")
    lines.append("")
    lines.append("<details>")
    lines.append("<summary>Click to expand full list</summary>")
    lines.append("")  # blank line so the markdown table inside <details> renders
    lines.append("| Contributor | Commits |")
    lines.append("|-------------|---------|")
    for c in stats["sorted_contributors"]:
        if c.get("login"):
            contributor_link = f"[@{c['login']}](https://github.com/{c['login']})"
        else:
            contributor_link = c["name"]
        lines.append(f"| {contributor_link} | {c['commits']} |")
    lines.append("")
    lines.append("</details>")
    lines.append("")

    # Write to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"\nSaved contributor statistics to {output_file}")


def extract_pr_number(message: str) -> str | None:
    """Extract PR number from commit message.

    Patterns are tried in order and the first match anywhere in the message
    wins; returns the digits as a string, or None if nothing matches.
    """
    # Common patterns: (#12345), (https://github.com/.../pull/12345)
    patterns = [
        r"\(#(\d+)\)",  # (#12345)
        r"pull/(\d+)",  # https://github.com/.../pull/12345
        r"#(\d+)$",  # #12345 at end
    ]

    for pattern in patterns:
        match = re.search(pattern, message)
        if match:
            return match.group(1)
    return None


def format_commit_message(
    commit: dict,
    owner: str,
    repo: str,
    include_sha: bool = False,
    include_date: bool = False,
) -> str:
    """
    Format a commit message for the output file.

    Only the first line of the commit message is used.

    Format: [Category] Description in https://github.com/owner/repo/pull/XXXX
    or: [Category] Description (#XXXX)

    If include_sha is True, prepends the full SHA: `sha` Message (#XXXX)
    If include_date is True, prepends the date: [YYYY-MM-DD] Message (#XXXX)
    """
    message = commit["commit"]["message"]
    sha = commit.get("sha", "")

    # Get commit date (use committer date for when it was merged)
    commit_date = ""
    if include_date:
        date_str = commit.get("commit", {}).get("committer", {}).get("date", "")
        if date_str:
            # Parse ISO format and extract date part (YYYY-MM-DD)
            commit_date = date_str[:10]

    # Get the first line of the commit message
    first_line = message.split("\n")[0].strip()

    # Extract PR number if present
    pr_number = extract_pr_number(first_line)

    # The cleanup below strips the *trailing* "(#N)". Re-append that same
    # reference rather than the first one found, otherwise a title such as
    # 'Revert "x (#100)" (#200)' would come out as '... (#100) (#100)'.
    trailing_ref = re.search(r"\(#(\d+)\)\s*$", first_line)
    if trailing_ref:
        pr_number = trailing_ref.group(1)

    # Clean up the message - remove existing PR references for reformatting
    clean_message = first_line
    clean_message = re.sub(r"\s*\(#\d+\)\s*$", "", clean_message)
    clean_message = re.sub(r"\s*https://github\.com/[^/]+/[^/]+/pull/\d+\s*", "", clean_message)
    clean_message = re.sub(r"\s+in\s*$", "", clean_message)
    clean_message = clean_message.strip()

    # Format output
    if pr_number:
        # Check if message already contains the full URL pattern
        if f"https://github.com/{owner}/{repo}/pull/" in first_line:
            formatted = first_line
        else:
            formatted = f"{clean_message} (#{pr_number})"
    else:
        formatted = clean_message

    # Prepend metadata if requested
    prefix_parts = []
    if include_date and commit_date:
        prefix_parts.append(f"[{commit_date}]")
    if include_sha and sha:
        prefix_parts.append(f"`{sha}`")

    if prefix_parts:
        formatted = f"{' '.join(prefix_parts)} {formatted}"

    return formatted


def save_commits_to_file(
    commits: list[dict],
    output_file: str,
    owner: str,
    repo: str,
    sort_mode: str = "chronological",
    include_sha: bool = False,
    include_date: bool = False,
):
    """
    Save formatted commits to a markdown file.

    Args:
        commits: List of commit dictionaries
        output_file: Output file path
        owner: Repository owner
        repo: Repository name
        sort_mode: "chronological" (newest first, like GitHub),
            "alphabetical" (by commit message),
            "reverse" (oldest first)
        include_sha: If True, include full commit SHA in output
        include_date: If True, include commit date in output
    """
    print(f"\nFormatting and saving {len(commits)} commits to {output_file}...")

    formatted_lines = []
    for commit in commits:
        formatted = format_commit_message(commit, owner, repo, include_sha=include_sha, include_date=include_date)
        formatted_lines.append(formatted)

    # Sort based on mode
    if sort_mode == "alphabetical":
        formatted_lines.sort(key=lambda x: x.lower())
        print(" Sorted alphabetically by commit message")
    elif sort_mode == "reverse":
        formatted_lines.reverse()
        print(" Sorted chronologically (oldest first)")
    else:
        # chronological - keep original order (newest first, as returned by API)
        print(" Keeping chronological order (newest first)")

    with open(output_file, "w", encoding="utf-8") as f:
        for line in formatted_lines:
            f.write(line + "\n")

    print(f"Saved {len(formatted_lines)} commits to {output_file}")


def main():
    parser = argparse.ArgumentParser(description="Fetch commits between two GitHub tags or between a tag and a commit")
    parser.add_argument(
        "--owner",
        default="vllm-project",
        help="Repository owner (default: vllm-project)",
    )
parser.add_argument("--repo", default="vllm", help="Repository name (default: vllm)") parser.add_argument( "--base-tag", help="Base tag (older, e.g., v0.11.2). If not provided with --head-commit, will auto-detect previous tag.", ) parser.add_argument( "--head-tag", help="Head tag (newer, e.g., v0.12.0). Use this OR --head-commit. If neither specified, uses " "HEAD of default branch.", ) parser.add_argument( "--head-commit", help="Head commit SHA (can be short or full). If not specified and no --head-tag, uses HEAD of default branch.", ) parser.add_argument( "--tag-pattern", default=r"^v\d+\.\d+\.\d+$", help="Regex pattern to filter tags when auto-detecting previous tag (default: ^v\\d+\\.\\d+\\.\\d+$)", ) parser.add_argument( "--output", default="0-current-raw-commits.md", help="Output file (default: 0-current-raw-commits.md)", ) parser.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN env var)") parser.add_argument( "--slow", action="store_true", help="Use slower but more thorough commit-by-commit fetching", ) parser.add_argument( "--sort", choices=["chronological", "alphabetical", "reverse"], default="chronological", help="Sort mode: chronological (newest first, like GitHub), alphabetical (by message), reverse (oldest first)", ) parser.add_argument("--stats", action="store_true", help="Generate and save contributor statistics") parser.add_argument( "--stats-output", default="0-contributor-stats.md", help="Output file for contributor statistics (default: 0-contributor-stats.md)", ) parser.add_argument( "--no-new-check", action="store_true", help="Skip checking for new contributors (faster, avoids extra API calls)", ) parser.add_argument( "--include-sha", action="store_true", help="Include full commit SHA in output (format: `sha` message)", ) parser.add_argument( "--include-date", action="store_true", help="Include commit date in output (format: [YYYY-MM-DD] message)", ) parser.add_argument( "--since", help="Fetch commits since this date (ISO 8601: 
YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ). " "Use with --until for date range mode.", ) parser.add_argument( "--until", help="Fetch commits until this date (ISO 8601: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ). " "Use with --since for date range mode.", ) parser.add_argument( "--branch", help="Branch to fetch commits from (only used with --since/--until date range mode)", ) args = parser.parse_args() # Validate arguments if args.head_tag and args.head_commit: parser.error("Cannot specify both --head-tag and --head-commit") # Check for date range mode date_range_mode = args.since is not None or args.until is not None if date_range_mode: if not args.since or not args.until: parser.error("Both --since and --until must be specified for date range mode") if args.head_tag or args.head_commit or args.base_tag: parser.error("Cannot use --since/--until with --head-tag, --head-commit, or --base-tag") token = args.token or get_github_token() if not token: print("Warning: No GitHub token provided. Rate limits will be stricter.") print("Set GITHUB_TOKEN environment variable or use --token argument.") print() headers = { "Accept": "application/vnd.github.v3+json", } if token: headers["Authorization"] = f"token {token}" base_url = f"https://api.github.com/repos/{args.owner}/{args.repo}" try: # Date range mode if date_range_mode: print(f"\n{'=' * 60}") print(f"Fetching commits by date range: {args.since} → {args.until}") if args.branch: print(f"Branch: {args.branch}") print(f"{'=' * 60}") commits = fetch_commits_by_date_range( owner=args.owner, repo=args.repo, since=args.since, until=args.until, token=token, branch=args.branch, ) print(f"\nTotal commits found: {len(commits)}") save_commits_to_file( commits=commits, output_file=args.output, owner=args.owner, repo=args.repo, sort_mode=args.sort, include_sha=args.include_sha, include_date=args.include_date, ) # Stats not fully supported in date range mode (no base_tag for new contributor check) if args.stats: print("\nNote: Contributor statistics 
in date range mode won't check for new contributors.") stats = generate_contributor_stats( commits=commits, owner=args.owner, repo=args.repo, base_tag=args.since, head_tag=args.until, token=token, check_new=False, # Can't check new contributors without a base tag ) save_contributor_stats( stats=stats, output_file=args.stats_output, owner=args.owner, repo=args.repo, ) return # Tag/commit mode (existing logic) # Determine head reference head_is_commit = False head_ref = None head_display_name = None if args.head_tag: head_ref = args.head_tag head_is_commit = False head_display_name = args.head_tag elif args.head_commit: head_ref = args.head_commit head_is_commit = True head_display_name = args.head_commit[:8] if len(args.head_commit) > 8 else args.head_commit else: # Auto-detect HEAD of default branch branch_name, head_sha = get_default_branch_head(base_url, headers) head_ref = head_sha head_is_commit = True head_display_name = f"{branch_name} ({head_sha[:8]})" base_tag = args.base_tag base_is_commit = False # Auto-detect previous tag if needed if not base_tag and head_is_commit: print("Auto-detecting previous tag...") head_sha = resolve_commit_sha(base_url, head_ref, headers) result = find_previous_tag( base_url=base_url, head_sha=head_sha, headers=headers, tag_pattern=args.tag_pattern, ) if result is None: raise Exception("Could not find a previous tag. 
Please specify --base-tag manually.") base_tag, _ = result print(f"\nUsing auto-detected base tag: {base_tag}") elif not base_tag: parser.error("Must specify --base-tag when using --head-tag") print(f"\n{'=' * 60}") print(f"Fetching commits: {base_tag} → {head_display_name}") print(f"{'=' * 60}") if args.slow: # Note: slow mode doesn't support commit SHA yet, only tags if head_is_commit: print("Warning: --slow mode with --head-commit not fully supported, using fast mode") commits = fetch_commits_between_tags_fast( owner=args.owner, repo=args.repo, base_tag=base_tag, head_tag=head_ref, token=token, head_is_commit=head_is_commit, base_is_commit=base_is_commit, ) else: commits = fetch_commits_between_tags_fast( owner=args.owner, repo=args.repo, base_tag=base_tag, head_tag=head_ref, token=token, head_is_commit=head_is_commit, base_is_commit=base_is_commit, ) print(f"\nTotal commits found: {len(commits)}") save_commits_to_file( commits=commits, output_file=args.output, owner=args.owner, repo=args.repo, sort_mode=args.sort, include_sha=args.include_sha, include_date=args.include_date, ) # Generate and save contributor statistics if requested if args.stats: stats = generate_contributor_stats( commits=commits, owner=args.owner, repo=args.repo, base_tag=base_tag, head_tag=head_display_name, token=token, check_new=not args.no_new_check, ) save_contributor_stats( stats=stats, output_file=args.stats_output, owner=args.owner, repo=args.repo, ) except Exception as e: print(f"Error: {e}") raise if __name__ == "__main__": main()