[CI/Build] Modify biweekly report readme files (#131)
Co-authored-by: v_lihao66 <v_lihao66@baidu.com>
This commit is contained in:
421
community/collect_github_data.py
Normal file
421
community/collect_github_data.py
Normal file
@@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
GitHub open-source repository statistics script.
|
||||
|
||||
This tool collects activity metrics (PR/Issue/Contributors/Stars/Forks) for one or more
|
||||
GitHub repositories within a specified time window, and outputs a summary report.
|
||||
It also generates a visualization chart and saves it as 'github_stats.png'.
|
||||
"""
|
||||
|
||||
import os
import sys
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
|
||||
|
||||
|
||||
"""
|
||||
# Initialization & configuration
|
||||
"""
|
||||
|
||||
# Load configuration from environment variables
|
||||
GH_PAT = os.getenv("GH_PAT")
|
||||
TARGET_REPOS = os.getenv("TARGET_REPOS")
|
||||
HEADERS = {"Authorization": f"token {GH_PAT}"}
|
||||
|
||||
# Configure request retry strategy (to handle transient GitHub API instability)
|
||||
session = requests.Session()
|
||||
retry = Retry(
|
||||
total=3,
|
||||
backoff_factor=1,
|
||||
status_forcelist=[429, 500, 502, 503, 504]
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry)
|
||||
session.mount("https://", adapter)
|
||||
|
||||
|
||||
def validate_config():
    """
    Check required environment configuration and build the repository list.

    Returns:
        List[str]: repositories as ["owner/repo1", "owner/repo2", ...],
        each entry stripped of surrounding whitespace.

    Raises:
        ValueError: if GH_PAT or TARGET_REPOS is missing, or the
        comma-separated list contains an empty item.
    """
    if not GH_PAT:
        raise ValueError("ERROR: GH_PAT is not set. Please add it in Secrets/environment variables.")
    if not TARGET_REPOS or TARGET_REPOS.strip() == "":
        raise ValueError(
            "ERROR: TARGET_REPOS is not set. Please provide a comma-separated repo list "
            "(e.g. 'owner/repo1,owner/repo2')."
        )

    entries = TARGET_REPOS.split(",")
    # An entry that is empty (or whitespace-only) indicates a malformed list,
    # e.g. a trailing comma.
    if any(not entry.strip() for entry in entries):
        raise ValueError("ERROR: Invalid TARGET_REPOS format. Repo list cannot contain empty items.")
    return [entry.strip() for entry in entries]
|
||||
|
||||
|
||||
def get_stat_period():
    """
    Get the statistics time range.

    Prefers an explicit window from the START_DATE / END_DATE environment
    variables (format YYYY-MM-DD); otherwise falls back to the last 14 days.

    Returns:
        tuple[str, str]: (since, until) as ISO-8601 UTC timestamps
            - since: 00:00:00Z of START_DATE or (now - 14 days)
            - until: 23:59:59Z of END_DATE or today

    Raises:
        ValueError: if START_DATE/END_DATE are set but not 'YYYY-MM-DD'.
    """
    # Prefer manually specified window via env vars
    # Format: START_DATE=2025-10-31, END_DATE=2026-01-31
    start_str = os.getenv("START_DATE")
    end_str = os.getenv("END_DATE")

    if start_str and end_str:
        try:
            datetime.strptime(start_str, "%Y-%m-%d")
            datetime.strptime(end_str, "%Y-%m-%d")
        except ValueError as exc:
            # Chain the original parse error so debugging shows which value failed.
            raise ValueError("ERROR: Invalid START_DATE/END_DATE format. Expected 'YYYY-MM-DD'.") from exc
        return f"{start_str}T00:00:00Z", f"{end_str}T23:59:59Z"

    # Auto-calculate biweekly window: last 14 days to today.
    # Fix: use UTC so the dates actually match the 'Z' (UTC) suffix sent to the
    # GitHub API; the original used local time, which could be off by a day
    # near midnight in non-UTC timezones.
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=14)
    return start_date.strftime("%Y-%m-%dT00:00:00Z"), end_date.strftime("%Y-%m-%dT23:59:59Z")
|
||||
|
||||
|
||||
def get_github_contributor_stats(
    repo,
    since,
    until,
    token=None,
    per_page=1000,
):
    """
    Collect contributor statistics for a repository.

    Args:
        repo (str): repository in "owner/name" form.
        since (str): ISO-8601 lower bound of the commit window.
        until (str): ISO-8601 upper bound of the commit window.
        token (str | None): GitHub token; sent as a Bearer header when given.
        per_page (int): requested page size. NOTE(review): the GitHub REST API
            caps per_page at 100, so larger values are clamped server-side;
            the pagination loop below still terminates correctly — confirm
            whether the default should simply be 100.

    Returns:
        (int, int):
            - total_contributors_count: total historical contributors count
            - period_contributors_count: contributors within the given time window

    Raises:
        requests.HTTPError: if any GitHub API call fails.
    """
    headers = {}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    def _paged(base_url):
        # Yield JSON pages until the API returns an empty list.
        # Fix: add a timeout so a stalled connection cannot hang the script.
        page = 1
        while True:
            resp = requests.get(f"{base_url}&page={page}", headers=headers, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            if not data:
                break
            yield data
            page += 1

    # -------------------------
    # 1) Total historical contributors
    # -------------------------
    total_contributors = set()
    contributors_url = f"https://api.github.com/repos/{repo}/contributors?per_page={per_page}"
    for contributors in _paged(contributors_url):
        for c in contributors:
            login = c.get("login")
            if login:
                total_contributors.add(login)

    # -------------------------
    # 2) Contributors in time window (from commits)
    # -------------------------
    period_contributors = set()
    commits_url = (
        f"https://api.github.com/repos/{repo}/commits"
        f"?since={since}&until={until}&per_page={per_page}"
    )
    for commits in _paged(commits_url):
        for c in commits:
            author = c.get("author")
            # Fix: also guard against an author object without a login, which
            # would previously have added None to the set (or raised KeyError).
            if author and author.get("login"):
                period_contributors.add(author["login"])
            else:
                # Fallback to email (anonymous commits without a GitHub account)
                email = c["commit"]["author"].get("email")
                if email:
                    period_contributors.add(email)

    return len(total_contributors), len(period_contributors)
|
||||
|
||||
|
||||
def fetch_prs(repo, start_str, end_str, is_main_repo=False):
    """
    Fetch PR statistics within the time window.
    Also supports counting PRs merged into a main/community repo from target repos.

    Args:
        repo (str): repository in "owner/name" form.
        start_str (str): window start, ISO-8601 (e.g. "2025-01-01T00:00:00Z").
        end_str (str): window end, ISO-8601.
        is_main_repo (bool): when True, additionally count merged PRs whose
            head repository is in the TARGET_REPOS list (via validate_config()).

    Returns:
        (int, int, int, int, int):
            - total PRs
            - merged PRs (within time window)
            - closed PRs (closed but not merged)
            - open PRs
            - prs_from_target (only for main repo scenario; 0 otherwise)

    Raises:
        Exception: wrapping any HTTP or unexpected failure.

    NOTE(review): the GitHub "list pull requests" endpoint does not document
    "since"/"until" query parameters (unknown params are ignored) and caps
    per_page at 100. Only the first page is fetched here, so the counts can
    undercount for busy repositories, and only merged PRs are date-filtered
    (client-side, below) — confirm whether pagination and created-date
    filtering should be added.
    """
    try:
        url = f"https://api.github.com/repos/{repo}/pulls"
        params = {"state": "all", "since": start_str, "until": end_str, "per_page": 1000}
        response = session.get(url, headers=HEADERS, params=params)
        response.raise_for_status()
        prs = response.json()

        open_prs = [pr for pr in prs if pr["state"] == "open"]
        # The time window is enforced client-side, and only for merge times;
        # string comparison works because the timestamps are ISO-8601.
        merged_prs = [pr for pr in prs if pr.get("merged_at") and start_str <= pr["merged_at"] <= end_str]
        closed_prs = [pr for pr in prs if pr["state"] == "closed" and not pr.get("merged_at")]

        if is_main_repo:
            target_repos = validate_config()
            prs_from_target = 0
            for pr in merged_prs:
                # Determine whether PR head repo is from a target repo list
                # (head.repo can be null when the fork was deleted, hence the guards).
                if pr.get("head") and pr["head"].get("repo") and pr["head"]["repo"].get("full_name") in target_repos:
                    prs_from_target += 1
            return len(prs), len(merged_prs), len(closed_prs), len(open_prs), prs_from_target

        return len(prs), len(merged_prs), len(closed_prs), len(open_prs), 0

    except requests.exceptions.HTTPError as e:
        raise Exception(f"Failed to fetch PR data for {repo}: {e.response.status_code} - {e.response.text}")
    except Exception as e:
        raise Exception(f"Exception while fetching PR data for {repo}: {str(e)}")
|
||||
|
||||
|
||||
def fetch_issues(repo, start_str, end_str):
    """
    Fetch issue statistics within the time window (Open/Closed).

    The GitHub issues endpoint also returns pull requests; any item carrying
    a "pull_request" key is discarded before counting.

    Args:
        repo (str): repository in "owner/name" form.
        start_str (str): window start, passed as the "since" query parameter.
        end_str (str): window end, passed as the "until" query parameter.

    Returns:
        (int, int): open_issues_count, closed_issues_count

    Raises:
        Exception: wrapping any HTTP or unexpected failure.
    """
    try:
        response = session.get(
            f"https://api.github.com/repos/{repo}/issues",
            headers=HEADERS,
            params={"state": "all", "since": start_str, "until": end_str, "per_page": 1000},
        )
        response.raise_for_status()

        # Drop PR objects; the issues API mixes them in with real issues.
        real_issues = [item for item in response.json() if "pull_request" not in item]

        open_count = sum(1 for item in real_issues if item["state"] == "open")
        closed_count = sum(1 for item in real_issues if item["state"] == "closed")
        return open_count, closed_count

    except requests.exceptions.HTTPError as e:
        raise Exception(f"Failed to fetch Issue data for {repo}: {e.response.status_code} - {e.response.text}")
    except Exception as e:
        raise Exception(f"Exception while fetching Issue data for {repo}: {str(e)}")
|
||||
|
||||
|
||||
def fetch_stars_forks(repo, start_str):
    """
    Fetch stars/forks totals and estimate new stars/forks since start date.

    Note: the estimate walks the repo /events feed, which GitHub limits to a
    short recent history, so the "new" counts are approximate.

    Args:
        repo (str): repository in "owner/name" form.
        start_str (str): window start; only the date part (before 'T') is used.

    Returns:
        (int, int, int, int):
            - new_stars
            - current_stars_total
            - new_forks
            - current_forks_total

    Raises:
        Exception: wrapping any HTTP or unexpected failure.
    """
    try:
        # Current totals from the repository metadata endpoint.
        response = session.get(f"https://api.github.com/repos/{repo}", headers=HEADERS)
        response.raise_for_status()
        repo_info = response.json()
        current_stars = repo_info["stargazers_count"]
        current_forks = repo_info["forks_count"]

        # Walk recent events and tally stars/forks from the start date onward.
        start_date = start_str.split("T")[0]
        response = session.get(
            f"https://api.github.com/repos/{repo}/events?per_page=1000",
            headers=HEADERS,
        )
        response.raise_for_status()

        new_stars = 0
        new_forks = 0
        for event in response.json():
            # ISO-8601 date strings compare correctly as plain strings.
            if event["created_at"].split("T")[0] < start_date:
                continue
            kind = event["type"]
            if kind == "WatchEvent":
                new_stars += 1
            elif kind == "ForkEvent":
                new_forks += 1

        return new_stars, current_stars, new_forks, current_forks

    except requests.exceptions.HTTPError as e:
        raise Exception(f"Failed to fetch Stars/Forks data for {repo}: {e.response.status_code} - {e.response.text}")
    except Exception as e:
        raise Exception(f"Exception while fetching Stars/Forks data for {repo}: {str(e)}")
|
||||
|
||||
|
||||
def plot_github_stats(stats: dict, start_time: str, end_time: str):
    """
    Visualize GitHub community metrics (PR/Issue/Contributors/Stars/Forks).

    Renders a bar chart of the aggregated totals and writes it next to this
    script as 'github_stats.png'.

    Args:
        stats (dict): aggregated statistics; must contain the keys
            'prs_total', 'issues_total', 'total_contributors',
            'stars_total' and 'forks_total'.
        start_time (str): start time string (several formats accepted).
        end_time (str): end time string (several formats accepted).
    """
    labels = ["PR Total", "Issue Total", "Contributors", "Stars", "Forks"]
    values = [
        stats["prs_total"],
        stats["issues_total"],
        stats["total_contributors"],
        stats["stars_total"],
        stats["forks_total"],
    ]
    colors = ['#4E79A7', '#F28E2B', '#E15759', '#76B7B2', '#59A14F']

    def extract_date(date_str):
        # Normalize a timestamp to YYYY-MM-DD, trying each known format in turn.
        known_formats = ('%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S', '%Y/%m/%d')
        cleaned = date_str.strip()
        for fmt in known_formats:
            try:
                return datetime.strptime(cleaned, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        # Unknown format: best effort, keep the first 10 characters.
        return date_str[:10] if len(date_str) >= 10 else date_str

    start_date = extract_date(start_time)
    end_date = extract_date(end_time)

    plt.figure(figsize=(9, 6))
    bars = plt.bar(labels, values, color=colors)

    # Annotate each bar with its value, nudged slightly above the bar top.
    label_offset = max(values) * 0.01
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            height + label_offset,
            f'{int(height):,}',
            ha='center',
            va='bottom',
            fontweight='bold',
            fontsize=10,
        )

    plt.title(f'GitHub Community Statistics\n({start_date} ~ {end_date})', fontsize=14, fontweight='bold')
    plt.ylabel("Count", fontsize=12)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    save_path = os.path.join(os.path.dirname(__file__), "github_stats.png")
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
|
||||
|
||||
|
||||
def main():
    """
    Main entry point.

    Workflow:
        1. Validate configuration (GH_PAT / TARGET_REPOS).
        2. Compute the statistics time window.
        3. Fetch PR/Issue/Contributor/Star/Fork data for each target repo.
        4. Print a summary report.
        5. Generate the 'github_stats.png' chart.

    Exits with status 1 (after printing the error) on any failure.
    """
    try:
        # 1) Validate config
        target_repos_list = validate_config()

        # 2) Get time window
        start_str, end_str = get_stat_period()
        print(f"Statistics period: {start_str} ~ {end_str}")

        # 3) Aggregation container
        total_data = {
            "total_contributors": 0,
            "period_contributors": 0,
            "prs_total": 0,
            "prs_merged": 0,
            "prs_closed": 0,
            "prs_open": 0,
            "stars_new": 0,
            "stars_total": 0,
            "forks_new": 0,
            "forks_total": 0,
            "open_issues": 0,
            "close_issues": 0,
            "issues_total": 0
        }

        # 4) Fetch data per repo
        for repo in target_repos_list:
            total_contributors, period_contributors = get_github_contributor_stats(repo, start_str, end_str, GH_PAT)
            prs_t, prs_m, prs_c, prs_o, _ = fetch_prs(repo, start_str, end_str)
            open_issues, close_issues = fetch_issues(repo, start_str, end_str)
            stars_n, stars_t, forks_n, forks_t = fetch_stars_forks(repo, start_str)

            # Aggregate counters across repos
            total_data["total_contributors"] += total_contributors
            total_data["period_contributors"] += period_contributors
            total_data["prs_total"] += prs_t
            total_data["prs_merged"] += prs_m
            total_data["prs_closed"] += prs_c
            total_data["prs_open"] += prs_o
            total_data["open_issues"] += open_issues
            total_data["close_issues"] += close_issues
            total_data["issues_total"] += open_issues + close_issues
            total_data["stars_new"] += stars_n
            # NOTE(review): stars/forks totals are overwritten each iteration,
            # so with multiple repos only the LAST repo's totals are reported
            # ("simplified" per the original comment) — confirm whether +=
            # was intended here.
            total_data["stars_total"] = stars_t  # simplified: last repo's total
            total_data["forks_new"] += forks_n
            total_data["forks_total"] = forks_t

        prs_closed_total = total_data["prs_merged"] + total_data["prs_closed"]

        print("\n================ GitHub Community Summary ================\n")

        print("[Pull Request]")
        print(f" - Total PRs: {total_data['prs_total']}")
        print(f" - Closed PRs (Merged + Closed): {prs_closed_total}")
        print(f" - Current Open PRs: {total_data['prs_open']}\n")

        print("[Issues]")
        print(f" - Total Issues: {total_data['issues_total']}")
        print(f" - Closed Issues: {total_data['close_issues']}")
        print(f" - Current Open Issues: {total_data['open_issues']}\n")

        print("[Community Metrics]")
        print(f" - Total Stars: {total_data['stars_total']}")
        print(f" - Total Forks: {total_data['forks_total']}")
        print(f" - Total Contributors: {total_data['total_contributors']}")

        plot_github_stats(total_data, start_str, end_str)
        print("\nChart generated successfully: github_stats.png\n")

    except Exception as e:
        print(f"Data collection failed: {str(e)}")
        # Fix: use sys.exit instead of the site-injected builtin exit(),
        # which is intended for interactive use and absent under `python -S`.
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
84
community/readme.md
Normal file
84
community/readme.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# GitHub CI/CD Data Collection Tool
|
||||
|
||||
This tool is used to pull activity data from specified GitHub repositories within a given time window (e.g., PRs, Issues, Contributors, Stars, etc.), which can then be used for further analysis and visualization.
|
||||
|
||||
Suitable for:
|
||||
- Engineering productivity metrics (PR/Issue/Contributors/Stars within a period)
|
||||
- CI/CD change tracking (by time range)
|
||||
- Batch data collection across one or more repositories
|
||||
|
||||
---
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```text
|
||||
community/
|
||||
collect_github_data.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.8+ (recommended 3.10+)
|
||||
- Network access to the GitHub API
|
||||
- GitHub Personal Access Token (PAT)
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1) Create a virtual environment (recommended)
|
||||
|
||||
```bash
|
||||
python3 -m venv venv/dev-cicd
|
||||
source venv/dev-cicd/bin/activate
|
||||
```
|
||||
|
||||
(Optional) Upgrade pip:
|
||||
|
||||
```bash
|
||||
pip install -U pip
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2) Configure a GitHub Token (required)
|
||||
|
||||
This tool fetches data through the GitHub API. A personal token is required — the script exits immediately if `GH_PAT` is unset — and it also raises your API rate limits and enables access to private repositories.
|
||||
|
||||
1. Open the token creation page:
|
||||
https://github.com/settings/tokens
|
||||
2. Create a token (read-only permissions are sufficient)
|
||||
3. Export the token as an environment variable:
|
||||
|
||||
```bash
|
||||
export GH_PAT=xxx
|
||||
```
|
||||
|
||||
> ⚠️ Do NOT hardcode your token or commit it into the repository.
|
||||
|
||||
---
|
||||
|
||||
### 3) Set collection parameters and run
|
||||
|
||||
```bash
|
||||
export START_DATE=2025-12-08
|
||||
export END_DATE=2026-01-24
|
||||
export TARGET_REPOS=baidu/vLLM-Kunlun
|
||||
|
||||
python community/collect_github_data.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Parameters
|
||||
|
||||
The tool is configured via environment variables:
|
||||
|
||||
| Parameter | Required | Example | Description |
|
||||
|----------|----------|---------|-------------|
|
||||
| `GH_PAT` | ✅ | `ghp_xxx` | GitHub Personal Access Token |
|
||||
| `START_DATE` | Optional | `2025-12-08` | Collection start date (format: YYYY-MM-DD); if START_DATE/END_DATE are not both set, the last 14 days are used |
|
||||
| `END_DATE` | Optional | `2026-01-24` | Collection end date (format: YYYY-MM-DD); if START_DATE/END_DATE are not both set, the last 14 days are used |
|
||||
| `TARGET_REPOS` | ✅ | `baidu/vLLM-Kunlun` | Target repositories (comma-separated for multiple) |
|
||||
Reference in New Issue
Block a user