Files
xc-llm-ascend/.agents/skills/vllm-ascend-release-note-writer/scripts/fetch_commits-optimize.py
Yikun Jiang ed175d6d92 [Doc][Release] Add release note skill (#6824)
### What this PR does / why we need it?
This PR adds the release note writing skill:
- `SKILL.md`: vLLM Ascend Releasing Note Writer
- `references/ref-past-release-notes-highlight.md`:
It also adds `output/v0.13.0` examples, which were used by
2da476d82f

Inspired: https://github.com/simon-mo/release-notes-writing/

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
83b47f67b1


Co-authored-by: esmeetu <jasonailu87@gmail.com>

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2026-02-26 21:01:21 +08:00

1547 lines
51 KiB
Python

#!/usr/bin/env python3
"""
Fetch all commits between two tags from a GitHub repository.
Usage: python fetch_commits.py [--token YOUR_GITHUB_TOKEN]
"""
import argparse
import os
import re
import dotenv
import requests
# Load .env.local first (higher priority), then .env as fallback.
# NOTE(review): the priority presumably relies on load_dotenv() leaving
# already-set variables untouched by default, so anything defined in
# .env.local is not replaced by .env - confirm against python-dotenv docs.
dotenv.load_dotenv(".env.local")
dotenv.load_dotenv()  # .env as fallback
def get_github_token():
    """Return the GitHub token found in the environment.

    ``GITHUB_TOKEN`` is consulted first; when it is unset or empty the value
    of ``GH_TOKEN`` (possibly ``None``) is returned instead.
    """
    primary = os.environ.get("GITHUB_TOKEN")
    if primary:
        return primary
    return os.environ.get("GH_TOKEN")
def resolve_tag_to_sha(base_url: str, tag: str, headers: dict) -> str:
    """Look up a tag ref and return the SHA it ultimately points at.

    Args:
        base_url: GitHub API base URL for the repository.
        tag: Tag name to resolve.
        headers: Request headers sent with every API call.

    Returns:
        The SHA stored on the ref, or - for an annotated tag whose tag object
        could be fetched - the SHA of the object that tag points to.

    Raises:
        Exception: If the tag ref request does not return HTTP 200.
    """
    print(f"Resolving tag {tag}...")
    ref_response = requests.get(f"{base_url}/git/refs/tags/{tag}", headers=headers)
    if ref_response.status_code != 200:
        raise Exception(f"Failed to get tag {tag}: {ref_response.text}")

    target = ref_response.json()["object"]
    resolved_sha = target["sha"]
    if target["type"] != "tag":
        # Lightweight tag: the ref already points at the target object.
        return resolved_sha

    # Annotated tag: the ref points at a tag object, so dereference it once.
    annotated = requests.get(f"{base_url}/git/tags/{resolved_sha}", headers=headers)
    if annotated.status_code != 200:
        # Best effort: keep the tag object's SHA when dereferencing fails.
        return resolved_sha
    return annotated.json()["object"]["sha"]
def resolve_commit_sha(base_url: str, commit_ref: str, headers: dict) -> str:
    """Expand a commit reference (full or short SHA) into the full SHA.

    Args:
        base_url: GitHub API base URL for the repository.
        commit_ref: Commit reference to look up.
        headers: Request headers sent with the API call.

    Returns:
        The ``sha`` field reported by the commits endpoint.

    Raises:
        Exception: If the commit request does not return HTTP 200.
    """
    print(f"Resolving commit {commit_ref}...")
    response = requests.get(f"{base_url}/commits/{commit_ref}", headers=headers)
    if response.status_code == 200:
        return response.json()["sha"]
    raise Exception(f"Failed to get commit {commit_ref}: {response.text}")
def get_default_branch_head(base_url: str, headers: dict) -> tuple[str, str]:
    """Report the repository's default branch together with its tip commit.

    Args:
        base_url: GitHub API base URL for the repository.
        headers: Request headers sent with every API call.

    Returns:
        Tuple of (branch_name, head_sha).

    Raises:
        Exception: If either the repository or the branch request fails.
    """
    print("Getting default branch HEAD...")

    # The repository resource tells us which branch is the default one.
    repo_info = requests.get(base_url, headers=headers)
    if repo_info.status_code != 200:
        raise Exception(f"Failed to get repository info: {repo_info.text}")
    branch_name = repo_info.json()["default_branch"]
    print(f" Default branch: {branch_name}")

    # The branch resource carries the commit currently at its tip.
    branch_info = requests.get(f"{base_url}/branches/{branch_name}", headers=headers)
    if branch_info.status_code != 200:
        raise Exception(f"Failed to get branch {branch_name}: {branch_info.text}")
    tip_sha = branch_info.json()["commit"]["sha"]
    print(f" HEAD: {tip_sha[:8]}")
    return branch_name, tip_sha
def get_all_tags(base_url: str, headers: dict) -> list[dict]:
    """Collect every tag entry returned by the repository's ``/tags`` endpoint.

    Pages of 100 are requested until a page comes back empty or short.

    Args:
        base_url: GitHub API base URL for the repository.
        headers: Request headers sent with every API call.

    Returns:
        List of tag dictionaries exactly as returned by the API.

    Raises:
        Exception: If any page request does not return HTTP 200.
    """
    print("Fetching all tags...")
    per_page = 100
    collected = []
    page = 1
    more_pages = True
    while more_pages:
        resp = requests.get(
            f"{base_url}/tags",
            headers=headers,
            params={"per_page": per_page, "page": page},
        )
        if resp.status_code != 200:
            raise Exception(f"Failed to get tags: {resp.text}")
        batch = resp.json()
        collected.extend(batch)
        # A short (or empty) page means there is nothing further to request.
        more_pages = len(batch) >= per_page
        page += 1
    print(f" Found {len(collected)} tags")
    return collected
def get_commit_date(base_url: str, sha: str, headers: dict) -> str | None:
    """Get the committer date for a given commit SHA.

    Args:
        base_url: GitHub API base URL for the repository.
        sha: Commit SHA to look up.
        headers: Request headers sent with the API call.

    Returns:
        The committer date string reported by the API, or None when the
        commit request does not return HTTP 200.
    """
    commit_resp = requests.get(f"{base_url}/commits/{sha}", headers=headers)
    if commit_resp.status_code != 200:
        # Best-effort lookup: callers check for a falsy result instead of
        # handling an exception, so failure is reported as None.
        return None
    return commit_resp.json()["commit"]["committer"]["date"]
def find_previous_tag(
    base_url: str, head_sha: str, headers: dict, tag_pattern: str | None = None
) -> tuple[str, str] | None:
    """
    Pick the tag closest to (and older than) the given commit.

    Every tag is compared against ``head_sha`` through the Compare API. Tags
    whose comparison status is "ahead" or "diverged" and whose commit date is
    earlier than the head commit's date become candidates; the candidate with
    the smallest ``ahead_by`` value wins.

    Args:
        base_url: GitHub API base URL
        head_sha: The commit SHA to search from
        headers: Request headers
        tag_pattern: Optional regex pattern to filter tags (e.g., r'^v\\d+\\.\\d+\\.\\d+$')

    Returns:
        Tuple of (tag_name, tag_sha) or None if no tag found
    """
    print(f"Finding previous tag before commit {head_sha[:8]}...")

    head_date = get_commit_date(base_url, head_sha, headers)
    if not head_date:
        print(" Warning: Could not get head commit date")
        return None
    print(f" Head commit date: {head_date}")

    tags = get_all_tags(base_url, headers)
    if tag_pattern:
        name_filter = re.compile(tag_pattern)
        tags = [entry for entry in tags if name_filter.match(entry["name"])]
        print(f" After pattern filter: {len(tags)} tags")

    candidates = []
    for entry in tags:
        candidate_sha = entry["commit"]["sha"]
        # A tag sitting on the head commit itself cannot be the *previous* tag.
        if candidate_sha == head_sha:
            continue

        comparison = requests.get(f"{base_url}/compare/{candidate_sha}...{head_sha}", headers=headers)
        if comparison.status_code != 200:
            continue
        details = comparison.json()

        # Keep tags that head is ahead of, as well as tags on a diverged line.
        if details.get("status") not in ["ahead", "diverged"]:
            continue

        candidate_date = get_commit_date(base_url, candidate_sha, headers)
        if not candidate_date or not candidate_date < head_date:
            continue

        distance = details.get("ahead_by", 0)
        candidates.append(
            {
                "name": entry["name"],
                "sha": candidate_sha,
                "date": candidate_date,
                "ahead_by": distance,
            }
        )
        print(f" Found candidate: {entry['name']} ({distance} commits behind)")

    if not candidates:
        print(" No previous tag found")
        return None

    # The smallest ahead_by is the closest tag; ties keep the earliest-listed tag.
    closest = min(candidates, key=lambda item: item["ahead_by"])
    print(f" Selected: {closest['name']} ({closest['ahead_by']} commits behind)")
    return (closest["name"], closest["sha"])
def fetch_commits_between_tags(
    owner: str, repo: str, base_tag: str, head_tag: str, token: str | None = None
) -> list[dict]:
    """
    Fetch all commits between two tags by walking the commit graph.

    This method traverses from head_tag back to base_tag, fetching each commit
    individually and following its parents breadth-first. It does not rely on
    date filtering.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        base_tag: Base tag (older, e.g., 'v0.11.2')
        head_tag: Head tag (newer, e.g., 'v0.12.0')
        token: Optional GitHub token for higher rate limits

    Returns:
        List of commit dictionaries (the base commit itself is excluded)

    Raises:
        Exception: If either tag cannot be resolved.
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the front of the BFS queue (list.pop(0) is O(n)).
    from collections import deque

    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if token:
        headers["Authorization"] = f"token {token}"
    base_url = f"https://api.github.com/repos/{owner}/{repo}"

    # Resolve tags to commit SHAs
    base_sha = resolve_tag_to_sha(base_url, base_tag, headers)
    head_sha = resolve_tag_to_sha(base_url, head_tag, headers)
    print(f"\nBase SHA ({base_tag}): {base_sha}")
    print(f"Head SHA ({head_tag}): {head_sha}")

    # The Compare API is only consulted for a progress hint; a failure here
    # is ignored and the walk proceeds regardless.
    print(f"\nComparing {base_tag}...{head_tag}...")
    compare_resp = requests.get(f"{base_url}/compare/{base_sha}...{head_sha}", headers=headers)
    if compare_resp.status_code == 200:
        total_commits = compare_resp.json().get("total_commits", "unknown")
        print(f"Total commits to fetch: {total_commits}")

    all_commits = []
    # Pre-seeding with base_sha makes the walk stop at (and exclude) the base commit.
    # NOTE(review): only base_sha itself blocks traversal; other parents of a
    # merge commit may lead past the base - confirm this is acceptable.
    seen_shas = {base_sha}
    to_visit = deque([head_sha])

    print(f"\nFetching commits from {head_tag} back to {base_tag}...")
    while to_visit:
        current_sha = to_visit.popleft()
        if current_sha in seen_shas:
            continue
        seen_shas.add(current_sha)

        # Fetch commit details
        commit_resp = requests.get(f"{base_url}/commits/{current_sha}", headers=headers)
        if commit_resp.status_code != 200:
            print(f" Warning: Failed to fetch commit {current_sha[:8]}")
            continue
        commit = commit_resp.json()
        all_commits.append(commit)

        # Queue parents that have not been visited yet
        for parent in commit.get("parents", []):
            parent_sha = parent["sha"]
            if parent_sha not in seen_shas:
                to_visit.append(parent_sha)

        # Progress logging
        if len(all_commits) % 50 == 0:
            print(f" Fetched {len(all_commits)} commits...")

    print(f" Completed: {len(all_commits)} commits fetched")
    return all_commits
def fetch_commits_by_date_range(
    owner: str,
    repo: str,
    since: str,
    until: str,
    token: str | None = None,
    branch: str | None = None,
) -> list[dict]:
    """
    List the commits whose dates fall inside the given window.

    Date-only bounds (exactly 10 characters, i.e. YYYY-MM-DD) are widened to
    the start of the ``since`` day and the end of the ``until`` day.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        since: Start date (ISO 8601 format, e.g., '2025-01-01' or '2025-01-01T00:00:00Z')
        until: End date (ISO 8601 format, e.g., '2025-01-31' or '2025-01-31T23:59:59Z')
        token: Optional GitHub token for higher rate limits
        branch: Optional branch name; when omitted no ``sha`` parameter is sent

    Returns:
        List of commit dictionaries

    Raises:
        Exception: If any page request does not return HTTP 200.
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"
    commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    per_page = 100

    # A bare date carries no time component, so pin it to the day's edges.
    date_only_length = 10
    if len(since) == date_only_length:
        since = f"{since}T00:00:00Z"
    if len(until) == date_only_length:
        until = f"{until}T23:59:59Z"

    print(f"\nFetching commits from {since} to {until}...")
    if branch:
        print(f" Branch: {branch}")

    collected = []
    page = 1
    while True:
        query = {"since": since, "until": until, "per_page": per_page, "page": page}
        if branch:
            query["sha"] = branch
        response = requests.get(commits_url, headers=headers, params=query)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch commits: {response.text}")

        batch = response.json()
        if not batch:
            break
        collected.extend(batch)
        print(f" Page {page}: fetched {len(batch)} commits (total: {len(collected)})")
        if len(batch) < per_page:
            # A short page is the last page.
            break
        page += 1

    print(f" Completed: {len(collected)} commits fetched")
    return collected
def get_merge_base(base_url: str, base_sha: str, head_sha: str, headers: dict) -> str | None:
    """
    Ask the Compare API for the merge base of two commits.

    Args:
        base_url: GitHub API base URL for the repo
        base_sha: First commit SHA
        head_sha: Second commit SHA
        headers: Request headers

    Returns:
        The ``sha`` of the response's ``merge_base_commit``, or None when the
        request fails or the response carries no such SHA
    """
    comparison = requests.get(
        f"{base_url}/compare/{base_sha}...{head_sha}",
        headers=headers,
    )
    if comparison.status_code == 200:
        merge_base_commit = comparison.json().get("merge_base_commit", {})
        return merge_base_commit.get("sha")
    return None
def fetch_commits_by_walking_history(
    base_url: str,
    base_sha: str,
    head_sha: str,
    base_tag: str,
    head_tag: str,
    headers: dict,
    stop_sha: str | None = None,
) -> list[dict]:
    """
    Page through the head's commit listing until a stop commit shows up.

    Commits are taken in the order the ``/commits`` endpoint lists them for
    ``head_sha``; collection ends at the first commit whose SHA equals the
    stop SHA, on an API error, or when the listing runs out.

    Args:
        base_url: GitHub API base URL for the repo
        base_sha: Base commit SHA (used as the stop point when stop_sha is not given)
        head_sha: Head commit SHA (newer)
        base_tag: Display name for base reference
        head_tag: Display name for head reference
        headers: Request headers
        stop_sha: SHA to stop at (if None, uses base_sha)

    Returns:
        List of commit dictionaries (excluding stop commit)
    """
    per_page = 100
    boundary_sha = stop_sha or base_sha
    collected = []

    print(f"\nWalking commit history from {head_tag} back to {base_tag}...")
    print(f" Stop SHA: {boundary_sha[:8]}")

    page = 1
    while True:
        response = requests.get(
            f"{base_url}/commits",
            headers=headers,
            params={"sha": head_sha, "per_page": per_page, "page": page},
        )
        if response.status_code != 200:
            print(f" Warning: API error on page {page}, stopping")
            break

        batch = response.json()
        if not batch:
            print(f" No more commits found on page {page}")
            break

        # Position of the stop commit inside this page, if it is present.
        stop_index = next(
            (index for index, item in enumerate(batch) if item["sha"] == boundary_sha),
            None,
        )
        collected.extend(batch if stop_index is None else batch[:stop_index])
        print(f" Page {page}: fetched {len(batch)} commits (total: {len(collected)})")

        if stop_index is not None:
            print(f" Reached stop commit ({boundary_sha[:8]})")
            break
        if len(batch) < per_page:
            print(" Warning: Reached end of history without finding stop commit")
            break
        page += 1

    return collected
def _drop_prs_already_in_base(
    all_commits: list[dict],
    base_url: str,
    base_sha: str,
    merge_base: str | None,
    base_tag: str,
    head_tag: str,
    headers: dict,
    per_page: int,
) -> list[dict]:
    """Remove commits whose PR number also appears on the base side of a diverged range."""
    print(f"\n Filtering out PRs already in {base_tag}...")

    # Collect the base side's commits, newest first, up to (not including) the merge base.
    print(f" Fetching {base_tag} commits...")
    base_commits = []
    base_page = 1
    reached_merge_base = False
    while not reached_merge_base:
        response = requests.get(
            f"{base_url}/commits",
            headers=headers,
            params={"sha": base_sha, "per_page": per_page, "page": base_page},
        )
        if response.status_code != 200:
            break
        commits = response.json()
        if not commits:
            break
        for commit in commits:
            if merge_base and commit["sha"] == merge_base:
                reached_merge_base = True
                break
            base_commits.append(commit)
        base_page += 1
    print(f" Found {len(base_commits)} commits in {base_tag}")

    # Extract PR numbers from base commits
    base_pr_numbers = set()
    for commit in base_commits:
        pr_num = extract_pr_number(commit.get("commit", {}).get("message", ""))
        if pr_num:
            base_pr_numbers.add(pr_num)
    print(f" Found {len(base_pr_numbers)} unique PRs in {base_tag}")

    # Commits without a PR number are always kept.
    filtered_commits = []
    skipped_count = 0
    for commit in all_commits:
        pr_num = extract_pr_number(commit.get("commit", {}).get("message", ""))
        if pr_num and pr_num in base_pr_numbers:
            skipped_count += 1
            continue
        filtered_commits.append(commit)
    print(f" Skipped {skipped_count} commits (PRs already in {base_tag})")
    print(f" Final count: {len(filtered_commits)} new commits in {head_tag}")
    return filtered_commits


def fetch_commits_between_tags_fast(
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
    head_is_commit: bool = False,
    base_is_commit: bool = False,
) -> list[dict]:
    """
    Fetch all commits between two tags using GitHub Compare API with pagination.

    The first Compare response is used as-is when it already holds every
    commit. Otherwise further Compare pages are requested and, if commits are
    still missing, the head's commit listing is walked to fill the gap.

    When the comparison status is "diverged" (e.g., release branches with
    cherry-picks), commits whose PR number also appears on the base side are
    removed. This filter is applied on every path, including the case where
    the first Compare response already contained all commits.

    Args:
        owner: Repository owner (e.g., 'vllm-project')
        repo: Repository name (e.g., 'vllm')
        base_tag: Base tag (older, e.g., 'v0.11.2') or commit SHA if base_is_commit=True
        head_tag: Head tag (newer, e.g., 'v0.12.0') or commit SHA if head_is_commit=True
        token: Optional GitHub token for higher rate limits
        head_is_commit: If True, treat head_tag as a commit SHA instead of a tag
        base_is_commit: If True, treat base_tag as a commit SHA instead of a tag

    Returns:
        List of commit dictionaries

    Raises:
        Exception: If a reference cannot be resolved or the initial comparison fails.
    """
    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if token:
        headers["Authorization"] = f"token {token}"
    base_url = f"https://api.github.com/repos/{owner}/{repo}"
    per_page = 100

    # Resolve to commit SHAs
    if base_is_commit:
        base_sha = resolve_commit_sha(base_url, base_tag, headers)
    else:
        base_sha = resolve_tag_to_sha(base_url, base_tag, headers)
    if head_is_commit:
        head_sha = resolve_commit_sha(base_url, head_tag, headers)
    else:
        head_sha = resolve_tag_to_sha(base_url, head_tag, headers)
    print(f"\nBase SHA ({base_tag}): {base_sha}")
    print(f"Head SHA ({head_tag}): {head_sha}")

    # Use Compare API to check relationship and get commits
    print(f"\nComparing {base_tag}...{head_tag}...")
    compare_resp = requests.get(
        f"{base_url}/compare/{base_sha}...{head_sha}",
        headers=headers,
        params={"per_page": per_page},
    )
    if compare_resp.status_code != 200:
        raise Exception(f"Failed to compare: {compare_resp.text}")
    compare_data = compare_resp.json()
    status = compare_data.get("status", "unknown")
    total_commits = compare_data.get("total_commits", 0)
    print(f" Comparison status: {status}")
    print(f" Total commits: {total_commits}")

    # Tolerate a missing or null merge_base_commit entry.
    merge_base = (compare_data.get("merge_base_commit") or {}).get("sha")

    # If branches have diverged (e.g., release branch with cherry-picks),
    # we need to filter by PR numbers to avoid duplicates
    is_diverged = status == "diverged"
    if is_diverged:
        print("\n Branches have diverged (likely a release branch scenario)")
        print(" Will filter by PR numbers to handle cherry-picks...")
        if merge_base:
            print(f" Merge base: {merge_base[:8]}")

    # Use Compare API results
    all_commits = compare_data.get("commits", [])
    print(f" Initial fetch: {len(all_commits)} commits")

    if len(all_commits) >= total_commits:
        # Nothing more to fetch, but do NOT return yet: a diverged range still
        # has to go through the cherry-pick filter below.
        print(" All commits fetched in initial response")
    else:
        # Need to paginate - try Compare API pagination first
        page = 1
        while len(all_commits) < total_commits:
            page += 1
            print(f" Fetching page {page}...")
            compare_resp = requests.get(
                f"{base_url}/compare/{base_sha}...{head_sha}",
                headers=headers,
                params={"per_page": per_page, "page": page},
            )
            if compare_resp.status_code != 200:
                # Compare API doesn't support pagination well for large diffs
                print(" Compare API pagination not supported, using commit walk...")
                break
            page_commits = compare_resp.json().get("commits", [])
            if not page_commits:
                break
            all_commits.extend(page_commits)
            print(f" Page {page}: got {len(page_commits)} commits (total: {len(all_commits)})")

        # If we still don't have all commits, walk the history
        if len(all_commits) < total_commits:
            print(f"\n Need to fetch remaining {total_commits - len(all_commits)} commits via history walk...")
            # For diverged branches, use merge_base as stop point
            # For non-diverged, use base_sha
            stop_sha = merge_base if (is_diverged and merge_base) else base_sha
            seen_shas = {c["sha"] for c in all_commits}

            # Walk from head, collecting commits not already seen, until the stop point
            walk_commits = []
            walk_page = 1
            found_stop = False
            while len(all_commits) + len(walk_commits) < total_commits and not found_stop:
                response = requests.get(
                    f"{base_url}/commits",
                    headers=headers,
                    params={"sha": head_sha, "per_page": per_page, "page": walk_page},
                )
                if response.status_code != 200:
                    print(f" Warning: API error on page {walk_page}")
                    break
                commits = response.json()
                if not commits:
                    break
                for commit in commits:
                    sha = commit["sha"]
                    if sha == stop_sha:
                        found_stop = True
                        break
                    if sha not in seen_shas:
                        seen_shas.add(sha)
                        walk_commits.append(commit)
                print(f" Walk page {walk_page}: found {len(walk_commits)} additional commits")
                walk_page += 1

            # Compare API commits come first, followed by the walked ones
            all_commits.extend(walk_commits)
            print(f" Total after walk: {len(all_commits)} commits")

    if not is_diverged:
        return all_commits

    # For diverged branches, filter out commits whose PRs are already in base release
    # This handles cherry-picks that exist in both releases
    return _drop_prs_already_in_base(
        all_commits,
        base_url,
        base_sha,
        merge_base,
        base_tag,
        head_tag,
        headers,
        per_page,
    )
def extract_contributors(commits: list[dict]) -> dict:
    """
    Group commits by contributor and count each contributor's commits.

    A commit is attributed to its GitHub login when the top-level ``author``
    carries one; otherwise it is attributed to the git author's email.
    Commits that have neither a login nor an email are not counted.

    Returns a dict with:
    - by_login: dict mapping login -> contributor info
    - by_email: dict mapping email -> contributor info (for commits without GitHub user)
    - total: number of distinct logins plus number of distinct emails
    """
    login_index = {}
    email_index = {}

    for entry in commits:
        gh_user = entry.get("author")
        login = gh_user.get("login") if gh_user else None

        if login:
            record = login_index.get(login)
            if record is None:
                # Name and email are captured from this login's first listed commit.
                record = login_index[login] = {
                    "login": login,
                    "name": entry.get("commit", {}).get("author", {}).get("name", ""),
                    "email": entry.get("commit", {}).get("author", {}).get("email", ""),
                    "avatar_url": gh_user.get("avatar_url", ""),
                    "html_url": gh_user.get("html_url", ""),
                    "commits": 0,
                }
            record["commits"] += 1
            continue

        # No GitHub user attached: fall back to the git author identity.
        git_identity = entry.get("commit", {}).get("author", {})
        email = git_identity.get("email", "")
        name = git_identity.get("name", "")
        if not email:
            continue
        record = email_index.get(email)
        if record is None:
            record = email_index[email] = {
                "login": None,
                "name": name,
                "email": email,
                "avatar_url": "",
                "html_url": "",
                "commits": 0,
            }
        record["commits"] += 1

    return {
        "by_login": login_index,
        "by_email": email_index,
        "total": len(login_index) + len(email_index),
    }
def get_tag_date(base_url: str, tag: str, headers: dict) -> str | None:
    """Get the committer date of the commit a tag points at.

    Args:
        base_url: GitHub API base URL for the repository.
        tag: Tag name to look up.
        headers: Request headers sent with every API call.

    Returns:
        The committer date string reported by the API, or None when the tag
        ref or the commit cannot be fetched.
    """
    # First resolve the tag to a commit SHA
    tag_resp = requests.get(f"{base_url}/git/refs/tags/{tag}", headers=headers)
    if tag_resp.status_code != 200:
        return None
    tag_data = tag_resp.json()
    sha = tag_data["object"]["sha"]

    # If it's an annotated tag, get the underlying commit
    if tag_data["object"]["type"] == "tag":
        tag_obj_resp = requests.get(f"{base_url}/git/tags/{sha}", headers=headers)
        if tag_obj_resp.status_code == 200:
            sha = tag_obj_resp.json()["object"]["sha"]

    # Get the commit date; any failure is reported as None rather than raised
    commit_resp = requests.get(f"{base_url}/commits/{sha}", headers=headers)
    if commit_resp.status_code == 200:
        return commit_resp.json()["commit"]["committer"]["date"]
    return None
def check_contributor_is_new(owner: str, repo: str, login: str, before_date: str, headers: dict) -> bool:
    """
    Tell whether a contributor has no commits up to a given date.

    A single commit authored by ``login`` with ``until=before_date`` is
    requested; an empty result means this is their first contribution.
    Returns False when the API request fails.
    """
    commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    # One result is enough: only the existence of an earlier commit matters.
    query = {"author": login, "until": before_date, "per_page": 1}
    response = requests.get(commits_url, headers=headers, params=query)
    if response.status_code != 200:
        return False
    earlier_commits = response.json()
    return len(earlier_commits) == 0
def find_first_contribution(commits: list[dict], login: str) -> dict | None:
"""
Find the first (earliest) contribution by a user in the commit list.
Returns the commit dict or None.
"""
user_commits = []
for commit in commits:
author = commit.get("author")
if author and author.get("login") == login:
user_commits.append(commit)
# Commits are usually newest first, so reverse to get oldest first
if user_commits:
return user_commits[-1] # Last one is the oldest/first contribution
return None
def calculate_new_contributors_via_generate_notes(
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
) -> list[dict]:
    """
    Derive new contributors from GitHub's generated release notes.

    The ``gh`` CLI is invoked against the ``releases/generate-notes`` endpoint
    and the returned body is scanned for "made their first contribution"
    lines. Any failure (missing CLI, timeout, non-zero exit) yields an empty
    list so the caller can fall back to another method.

    Args:
        owner: Repository owner
        repo: Repository name
        base_tag: The base tag (older version)
        head_tag: The head tag (newer version)
        token: GitHub token (accepted for signature parity; not used here)

    Returns:
        List of new contributor info dicts with login and first_pr fields
    """
    import subprocess

    print("\nGetting new contributors via GitHub generate-notes API...")

    # Assemble: gh api <endpoint> -f key=value ... --jq .body
    form_fields = (
        f"tag_name={head_tag}",
        f"target_commitish={head_tag}",
        f"previous_tag_name={base_tag}",
    )
    cmd = ["gh", "api", f"repos/{owner}/{repo}/releases/generate-notes"]
    for field in form_fields:
        cmd += ["-f", field]
    cmd += ["--jq", ".body"]

    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except subprocess.TimeoutExpired:
        print(" Warning: gh CLI timed out")
        return []
    except FileNotFoundError:
        print(" Warning: gh CLI not found, falling back to legacy method")
        return []

    if completed.returncode != 0:
        print(f" Warning: gh CLI failed: {completed.stderr}")
        return []

    # Format: "* @username made their first contribution in https://github.com/owner/repo/pull/12345"
    first_contribution = r"\* @(\S+) made their first contribution in https://github\.com/[^/]+/[^/]+/pull/(\d+)"
    newcomers = [
        {"login": login, "first_pr": pr_number}
        for login, pr_number in re.findall(first_contribution, completed.stdout)
    ]
    print(f" Found {len(newcomers)} new contributors")
    return newcomers
def calculate_new_contributors(
    commits: list[dict],
    current_contributors: dict,
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str = "",
    token: str | None = None,
) -> list[dict]:
    """
    Work out which contributors are first-timers in this release.

    When ``head_tag`` is given, the generate-notes based lookup is tried
    first and its result is returned if it is non-empty. Otherwise every
    login in ``current_contributors`` is checked for commits dated up to the
    base tag's commit date.

    Args:
        commits: List of commits in the current release
        current_contributors: Output from extract_contributors()
        owner: Repository owner
        repo: Repository name
        base_tag: The base tag (older version)
        head_tag: The head tag (newer version)
        token: GitHub token

    Returns:
        List of new contributor info dicts with first_pr field
    """
    if head_tag:
        from_notes = calculate_new_contributors_via_generate_notes(
            owner=owner,
            repo=repo,
            base_tag=base_tag,
            head_tag=head_tag,
            token=token,
        )
        if from_notes:
            return from_notes

    # An empty result above is treated the same as a failure: use the legacy path.
    print("\nFalling back to legacy new contributor detection...")
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"
    base_url = f"https://api.github.com/repos/{owner}/{repo}"

    print("Getting base tag date...")
    base_date = get_tag_date(base_url, base_tag, headers)
    if not base_date:
        print(f" Warning: Could not get date for tag {base_tag}")
        return []
    print(f" Base tag date: {base_date}")

    known_by_login = current_contributors["by_login"]
    logins = list(known_by_login.keys())
    total = len(logins)
    print(f"\nChecking {total} contributors for first-time status...")

    newcomers = []
    for position, login in enumerate(logins, start=1):
        if position % 20 == 0:
            print(f" Checked {position}/{total} contributors...")
        if not check_contributor_is_new(owner, repo, login, base_date, headers):
            continue

        info = known_by_login[login].copy()
        # Attach the PR and short SHA of their earliest commit in this release.
        first_commit = find_first_contribution(commits, login)
        if first_commit:
            message = first_commit.get("commit", {}).get("message", "")
            info["first_pr"] = extract_pr_number(message)
            info["first_commit_sha"] = first_commit.get("sha", "")[:8]
        newcomers.append(info)

    print(f" Found {len(newcomers)} new contributors (legacy method)")
    return newcomers
def generate_contributor_stats(
    commits: list[dict],
    owner: str,
    repo: str,
    base_tag: str,
    head_tag: str,
    token: str | None = None,
    check_new: bool = True,
) -> dict:
    """
    Print a contributor report for the release and return the same data.

    The returned dict holds the commit and contributor totals, the new
    contributor count (None when ``check_new`` is False) and list, the raw
    contributor index, the contributors ranked by commit count, the summary
    line, and the tag/owner/repo values that were passed in.
    """
    banner = "=" * 60
    rule = "-" * 60

    print("\n" + banner)
    print("CONTRIBUTOR STATISTICS")
    print(banner)

    contributors = extract_contributors(commits)
    commit_total = len(commits)
    print(f"\nTotal commits: {commit_total}")
    print(f"Total contributors: {contributors['total']}")
    print(f" - With GitHub account: {len(contributors['by_login'])}")
    print(f" - Without GitHub account (by email): {len(contributors['by_email'])}")

    new_count = 0
    newcomers = []
    if check_new:
        # Tries GitHub generate-notes first, then falls back to commit history.
        newcomers = calculate_new_contributors(
            commits=commits,
            current_contributors=contributors,
            owner=owner,
            repo=repo,
            base_tag=base_tag,
            head_tag=head_tag,
            token=token,
        )
        new_count = len(newcomers)
        print(f"\nNew contributors (first-time): {new_count}")
        if newcomers:
            print("\nNew contributors list:")
            for person in sorted(newcomers, key=lambda x: x["login"].lower()):
                pr_info = f" in #{person['first_pr']}" if person.get("first_pr") else ""
                print(f" - @{person['login']} made their first contribution{pr_info}")

    # Summary line intended for pasting into the release notes.
    print("\n" + rule)
    print("RELEASE NOTES SUMMARY LINE:")
    print(rule)
    if check_new:
        summary_line = (
            f"This release features {len(commits)} commits from {contributors['total']} contributors ({new_count} new)!"
        )
    else:
        summary_line = f"This release features {len(commits)} commits from {contributors['total']} contributors!"
    print(summary_line)
    print(rule)

    # Rank login-based and email-based contributors together by commit count.
    everyone = [*contributors["by_login"].values(), *contributors["by_email"].values()]
    ranked = sorted(everyone, key=lambda x: x["commits"], reverse=True)

    print("\nTop contributors by commit count:")
    for rank, person in enumerate(ranked[:20], 1):
        if person.get("login"):
            print(f" {rank:2}. @{person['login']:20} - {person['commits']:3} commits")
        else:
            print(f" {rank:2}. {person['name']:20} - {person['commits']:3} commits (no GitHub account)")

    return {
        "total_commits": commit_total,
        "total_contributors": contributors["total"],
        "new_contributors": new_count if check_new else None,
        "new_contributors_list": newcomers,
        "contributors": contributors,
        "sorted_contributors": ranked,
        "summary_line": summary_line,
        "base_tag": base_tag,
        "head_tag": head_tag,
        "owner": owner,
        "repo": repo,
    }
def save_contributor_stats(stats: dict, output_file: str, owner: str, repo: str):
    """
    Save contributor statistics to a markdown file.

    The report contains a header naming the compared range, the release-notes
    summary line, overview totals, a top-30 table, an optional new
    contributors section, and a collapsed table of all contributors.

    Args:
        stats: Statistics dict from generate_contributor_stats()
        output_file: Output file path (overwritten if it exists)
        owner: Repository owner (used to build PR links)
        repo: Repository name (used to build PR links)
    """

    def contributor_cell(entry: dict) -> str:
        # Contributors with a GitHub login get a profile link; others show their git name.
        login = entry.get("login")
        if login:
            return f"[@{login}](https://github.com/{login})"
        return entry["name"]

    lines = [
        # Separate the two tags; without a separator they ran together
        # (e.g. "v0.11.2v0.12.0") and the header was unreadable.
        f"# Contributor Statistics: {stats['base_tag']}...{stats['head_tag']}",
        "",
        "## Release Notes Summary",
        "",
        f"> {stats['summary_line']}",
        "",
        "## Overview",
        "",
        f"- **Total Commits**: {stats['total_commits']}",
        f"- **Total Contributors**: {stats['total_contributors']}",
    ]
    # None means new-contributor detection was not run, so the line is omitted.
    if stats["new_contributors"] is not None:
        lines.append(f"- **New Contributors**: {stats['new_contributors']}")
    lines.append("")

    # Top contributors table
    lines += [
        "## Top Contributors",
        "",
        "| Rank | Contributor | Commits |",
        "|------|-------------|---------|",
    ]
    for rank, entry in enumerate(stats["sorted_contributors"][:30], 1):
        lines.append(f"| {rank} | {contributor_cell(entry)} | {entry['commits']} |")
    lines.append("")

    # New contributors section
    if stats["new_contributors_list"]:
        lines += ["## New Contributors 🎉", ""]
        for entry in sorted(stats["new_contributors_list"], key=lambda x: x["login"].lower()):
            pr_num = entry.get("first_pr")
            if pr_num:
                pr_link = f"https://github.com/{owner}/{repo}/pull/{pr_num}"
                lines.append(f"* @{entry['login']} made their first contribution in {pr_link}")
            else:
                lines.append(f"* @{entry['login']} made their first contribution")
        lines.append("")

    # All contributors section (collapsed)
    lines += [
        "## All Contributors",
        "",
        "<details>",
        "<summary>Click to expand full list</summary>",
        "",
        "| Contributor | Commits |",
        "|-------------|---------|",
    ]
    for entry in stats["sorted_contributors"]:
        lines.append(f"| {contributor_cell(entry)} | {entry['commits']} |")
    lines += ["", "</details>", ""]

    # Write to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"\nSaved contributor statistics to {output_file}")
def extract_pr_number(message: str) -> str | None:
"""Extract PR number from commit message."""
# Common patterns: (#12345), (https://github.com/.../pull/12345)
patterns = [
r"\(#(\d+)\)", # (#12345)
r"pull/(\d+)", # https://github.com/.../pull/12345
r"#(\d+)$", # #12345 at end
]
for pattern in patterns:
match = re.search(pattern, message)
if match:
return match.group(1)
return None
def format_commit_message(
    commit: dict,
    owner: str,
    repo: str,
    include_sha: bool = False,
    include_date: bool = False,
) -> str:
    """
    Format a commit's subject line for the output file.

    Only the first line of the commit message is used. The result is one of:

    * the first line unchanged, when it already contains a
      ``https://github.com/{owner}/{repo}/pull/`` URL;
    * ``Description (#XXXX)`` when a PR number was found;
    * the cleaned description alone when no PR number was found.

    Args:
        commit: Commit dictionary; ``commit["commit"]["message"]`` is required,
            ``commit["sha"]`` and ``commit["commit"]["committer"]["date"]`` are
            read when present.
        owner: Repository owner, used to recognise an existing PR URL.
        repo: Repository name, used to recognise an existing PR URL.
        include_sha: If True, prepend the SHA as `` `sha` `` before the message.
        include_date: If True, prepend the committer date as ``[YYYY-MM-DD]``.
            When both prefixes are requested the date comes first.

    Returns:
        The formatted single-line string.
    """
    message = commit["commit"]["message"]
    sha = commit.get("sha", "")

    # Committer date, truncated to the YYYY-MM-DD prefix of the ISO timestamp.
    commit_date = ""
    if include_date:
        # "committer" may be present but null; `or {}` avoids calling .get on None.
        committer = commit.get("commit", {}).get("committer") or {}
        date_str = committer.get("date", "")
        if date_str:
            commit_date = date_str[:10]

    first_line = message.split("\n")[0].strip()
    pr_number = extract_pr_number(first_line)

    # Strip existing PR references so a single canonical one can be appended.
    clean_message, paren_refs_removed = re.subn(r"\s*\(#\d+\)\s*$", "", first_line)
    clean_message = re.sub(r"\s*https://github\.com/[^/]+/[^/]+/pull/\d+\s*", "", clean_message)
    clean_message = re.sub(r"\s+in\s*$", "", clean_message)
    if pr_number and not paren_refs_removed:
        # A bare trailing "#N" that IS the detected PR number would otherwise be
        # duplicated ("Fix foo #12 (#12)"). Only that exact number is removed,
        # and only as a standalone token, so unrelated issue references survive.
        clean_message = re.sub(rf"(?:^|\s+)#{re.escape(pr_number)}\s*$", "", clean_message)
    clean_message = clean_message.strip()

    if pr_number:
        if f"https://github.com/{owner}/{repo}/pull/" in first_line:
            # Already carries the full PR URL for this repository: keep verbatim.
            formatted = first_line
        else:
            # lstrip avoids a leading space when the subject was only a PR reference.
            formatted = f"{clean_message} (#{pr_number})".lstrip()
    else:
        formatted = clean_message

    # Prepend optional metadata: date first, then SHA.
    prefix_parts = []
    if include_date and commit_date:
        prefix_parts.append(f"[{commit_date}]")
    if include_sha and sha:
        prefix_parts.append(f"`{sha}`")
    if prefix_parts:
        formatted = f"{' '.join(prefix_parts)} {formatted}"

    return formatted
def save_commits_to_file(
    commits: list[dict],
    output_file: str,
    owner: str,
    repo: str,
    sort_mode: str = "chronological",
    include_sha: bool = False,
    include_date: bool = False,
):
    """
    Format every commit and write the result to a file, one commit per line.

    Args:
        commits: List of commit dictionaries
        output_file: Path of the file to (over)write, encoded as UTF-8
        owner: Repository owner (forwarded to format_commit_message)
        repo: Repository name (forwarded to format_commit_message)
        sort_mode: "alphabetical" sorts the formatted lines case-insensitively,
            "reverse" flips the input order, and any other value
            (normally "chronological") keeps the input order unchanged
        include_sha: If True, include full commit SHA in output
        include_date: If True, include commit date in output
    """
    print(f"\nFormatting and saving {len(commits)} commits to {output_file}...")

    entries = [
        format_commit_message(item, owner, repo, include_sha=include_sha, include_date=include_date)
        for item in commits
    ]

    # Ordering is applied to the formatted text, not to the raw commits.
    if sort_mode == "alphabetical":
        entries = sorted(entries, key=str.lower)
        print("  Sorted alphabetically by commit message")
    elif sort_mode == "reverse":
        entries = entries[::-1]
        print("  Sorted chronologically (oldest first)")
    else:
        # Input order is preserved as-is.
        print("  Keeping chronological order (newest first)")

    with open(output_file, "w", encoding="utf-8") as out:
        out.writelines(f"{entry}\n" for entry in entries)

    print(f"Saved {len(entries)} commits to {output_file}")
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with every option the script accepts."""
    parser = argparse.ArgumentParser(description="Fetch commits between two GitHub tags or between a tag and a commit")
    parser.add_argument(
        "--owner",
        default="vllm-project",
        help="Repository owner (default: vllm-project)",
    )
    parser.add_argument("--repo", default="vllm", help="Repository name (default: vllm)")
    parser.add_argument(
        "--base-tag",
        help="Base tag (older, e.g., v0.11.2). If not provided with --head-commit, will auto-detect previous tag.",
    )
    parser.add_argument(
        "--head-tag",
        help="Head tag (newer, e.g., v0.12.0). Use this OR --head-commit. If neither specified, uses "
        "HEAD of default branch.",
    )
    parser.add_argument(
        "--head-commit",
        help="Head commit SHA (can be short or full). If not specified and no --head-tag, uses HEAD of default branch.",
    )
    parser.add_argument(
        "--tag-pattern",
        default=r"^v\d+\.\d+\.\d+$",
        help="Regex pattern to filter tags when auto-detecting previous tag (default: ^v\\d+\\.\\d+\\.\\d+$)",
    )
    parser.add_argument(
        "--output",
        default="0-current-raw-commits.md",
        help="Output file (default: 0-current-raw-commits.md)",
    )
    parser.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN env var)")
    parser.add_argument(
        "--slow",
        action="store_true",
        help="Use slower but more thorough commit-by-commit fetching",
    )
    parser.add_argument(
        "--sort",
        choices=["chronological", "alphabetical", "reverse"],
        default="chronological",
        help="Sort mode: chronological (newest first, like GitHub), alphabetical (by message), reverse (oldest first)",
    )
    parser.add_argument("--stats", action="store_true", help="Generate and save contributor statistics")
    parser.add_argument(
        "--stats-output",
        default="0-contributor-stats.md",
        help="Output file for contributor statistics (default: 0-contributor-stats.md)",
    )
    parser.add_argument(
        "--no-new-check",
        action="store_true",
        help="Skip checking for new contributors (faster, avoids extra API calls)",
    )
    parser.add_argument(
        "--include-sha",
        action="store_true",
        help="Include full commit SHA in output (format: `sha` message)",
    )
    parser.add_argument(
        "--include-date",
        action="store_true",
        help="Include commit date in output (format: [YYYY-MM-DD] message)",
    )
    parser.add_argument(
        "--since",
        help="Fetch commits since this date (ISO 8601: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ). "
        "Use with --until for date range mode.",
    )
    parser.add_argument(
        "--until",
        help="Fetch commits until this date (ISO 8601: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ). "
        "Use with --since for date range mode.",
    )
    parser.add_argument(
        "--branch",
        help="Branch to fetch commits from (only used with --since/--until date range mode)",
    )
    return parser


def main():
    """
    Command-line entry point.

    Two mutually exclusive modes are supported:

    * Date range mode (``--since`` and ``--until``): commits are fetched with
      ``fetch_commits_by_date_range``; contributor statistics, if requested,
      are generated with the new-contributor check disabled.
    * Tag/commit mode: the head is ``--head-tag``, ``--head-commit``, or the
      HEAD of the default branch; when the head is a commit and no
      ``--base-tag`` is given, the base tag is auto-detected with
      ``find_previous_tag``.

    In both modes the commits are written with ``save_commits_to_file`` and,
    when ``--stats`` is set, statistics are written with
    ``save_contributor_stats``.

    Invalid argument combinations exit through ``parser.error``. Any other
    exception is printed as ``Error: ...`` and re-raised.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # Validate arguments
    if args.head_tag and args.head_commit:
        parser.error("Cannot specify both --head-tag and --head-commit")

    # Supplying either date flag selects date range mode; both are then required.
    date_range_mode = args.since is not None or args.until is not None
    if date_range_mode:
        if not args.since or not args.until:
            parser.error("Both --since and --until must be specified for date range mode")
        if args.head_tag or args.head_commit or args.base_tag:
            parser.error("Cannot use --since/--until with --head-tag, --head-commit, or --base-tag")

    token = args.token or get_github_token()
    if not token:
        print("Warning: No GitHub token provided. Rate limits will be stricter.")
        print("Set GITHUB_TOKEN environment variable or use --token argument.")
        print()

    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if token:
        headers["Authorization"] = f"token {token}"

    base_url = f"https://api.github.com/repos/{args.owner}/{args.repo}"

    try:
        # Date range mode
        if date_range_mode:
            print(f"\n{'=' * 60}")
            # An explicit separator keeps the two dates from running together.
            print(f"Fetching commits by date range: {args.since} -> {args.until}")
            if args.branch:
                print(f"Branch: {args.branch}")
            print(f"{'=' * 60}")

            commits = fetch_commits_by_date_range(
                owner=args.owner,
                repo=args.repo,
                since=args.since,
                until=args.until,
                token=token,
                branch=args.branch,
            )
            print(f"\nTotal commits found: {len(commits)}")

            save_commits_to_file(
                commits=commits,
                output_file=args.output,
                owner=args.owner,
                repo=args.repo,
                sort_mode=args.sort,
                include_sha=args.include_sha,
                include_date=args.include_date,
            )

            # Stats not fully supported in date range mode (no base_tag for new contributor check)
            if args.stats:
                print("\nNote: Contributor statistics in date range mode won't check for new contributors.")
                stats = generate_contributor_stats(
                    commits=commits,
                    owner=args.owner,
                    repo=args.repo,
                    base_tag=args.since,
                    head_tag=args.until,
                    token=token,
                    check_new=False,  # Can't check new contributors without a base tag
                )
                save_contributor_stats(
                    stats=stats,
                    output_file=args.stats_output,
                    owner=args.owner,
                    repo=args.repo,
                )
            return

        # Tag/commit mode: determine the head reference.
        if args.head_tag:
            head_ref = args.head_tag
            head_is_commit = False
            head_display_name = args.head_tag
        elif args.head_commit:
            head_ref = args.head_commit
            head_is_commit = True
            # Slicing already leaves strings of 8 characters or fewer untouched.
            head_display_name = args.head_commit[:8]
        else:
            # Auto-detect HEAD of default branch
            branch_name, head_sha = get_default_branch_head(base_url, headers)
            head_ref = head_sha
            head_is_commit = True
            head_display_name = f"{branch_name} ({head_sha[:8]})"

        base_tag = args.base_tag
        base_is_commit = False

        # Auto-detect previous tag if needed
        if not base_tag and head_is_commit:
            print("Auto-detecting previous tag...")
            head_sha = resolve_commit_sha(base_url, head_ref, headers)
            result = find_previous_tag(
                base_url=base_url,
                head_sha=head_sha,
                headers=headers,
                tag_pattern=args.tag_pattern,
            )
            if result is None:
                raise Exception("Could not find a previous tag. Please specify --base-tag manually.")
            base_tag, _ = result
            print(f"\nUsing auto-detected base tag: {base_tag}")
        elif not base_tag:
            parser.error("Must specify --base-tag when using --head-tag")

        print(f"\n{'=' * 60}")
        # An explicit separator keeps the base and head names from running together.
        print(f"Fetching commits: {base_tag} -> {head_display_name}")
        print(f"{'=' * 60}")

        # NOTE: both the --slow and the default path used the same fast fetcher
        # with identical arguments, so the duplicated call is collapsed here.
        # --slow therefore only controls whether the warning below is shown.
        if args.slow and head_is_commit:
            print("Warning: --slow mode with --head-commit not fully supported, using fast mode")
        commits = fetch_commits_between_tags_fast(
            owner=args.owner,
            repo=args.repo,
            base_tag=base_tag,
            head_tag=head_ref,
            token=token,
            head_is_commit=head_is_commit,
            base_is_commit=base_is_commit,
        )

        print(f"\nTotal commits found: {len(commits)}")

        save_commits_to_file(
            commits=commits,
            output_file=args.output,
            owner=args.owner,
            repo=args.repo,
            sort_mode=args.sort,
            include_sha=args.include_sha,
            include_date=args.include_date,
        )

        # Generate and save contributor statistics if requested
        if args.stats:
            stats = generate_contributor_stats(
                commits=commits,
                owner=args.owner,
                repo=args.repo,
                base_tag=base_tag,
                head_tag=head_display_name,
                token=token,
                check_new=not args.no_new_check,
            )
            save_contributor_stats(
                stats=stats,
                output_file=args.stats_output,
                owner=args.owner,
                repo=args.repo,
            )
    except Exception as e:
        # Print a short message for the user, then re-raise so the traceback
        # and non-zero exit status are preserved.
        print(f"Error: {e}")
        raise
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()