From 561a235a3fcc47e53290bfbd15fce7aa3147590b Mon Sep 17 00:00:00 2001
From: haoli5009-debug
Date: Tue, 20 Jan 2026 16:58:36 +0800
Subject: [PATCH] [CI/Build] Modify biweekly report readme files (#131)

Co-authored-by: v_lihao66
---
 community/collect_github_data.py | 440 +++++++++++++++++++++++++++++++
 community/readme.md              |  84 ++++++
 2 files changed, 524 insertions(+)
 create mode 100644 community/collect_github_data.py
 create mode 100644 community/readme.md

diff --git a/community/collect_github_data.py b/community/collect_github_data.py
new file mode 100644
index 0000000..21d4c59
--- /dev/null
+++ b/community/collect_github_data.py
@@ -0,0 +1,440 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+GitHub open-source repository statistics script.
+
+This tool collects activity metrics (PR/Issue/Contributors/Stars/Forks) for one or more
+GitHub repositories within a specified time window, and outputs a summary report.
+It also generates a visualization chart and saves it as 'github_stats.png'.
+"""
+
+import os
+import sys
+from datetime import datetime, timedelta
+
+import matplotlib
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+# Non-interactive backend so the chart also renders in headless CI runners.
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+# ---------------------------------------------------------------------------
+# Initialization & configuration
+# ---------------------------------------------------------------------------
+
+# Load configuration from environment variables
+GH_PAT = os.getenv("GH_PAT")
+TARGET_REPOS = os.getenv("TARGET_REPOS")
+# Only send an Authorization header when a token is actually configured,
+# otherwise we would send the literal string "token None".
+HEADERS = {"Authorization": f"token {GH_PAT}"} if GH_PAT else {}
+
+# The GitHub REST API caps page size at 100; larger values are silently clamped.
+MAX_PER_PAGE = 100
+
+# Configure request retry strategy (to handle transient GitHub API instability)
+session = requests.Session()
+retry = Retry(
+    total=3,
+    backoff_factor=1,
+    status_forcelist=[429, 500, 502, 503, 504]
+)
+adapter = HTTPAdapter(max_retries=retry)
+session.mount("https://", adapter)
+
+
+def validate_config():
+    """
+    Validate required configuration and return the target repository list.
+
+    Returns:
+        List[str]: Repositories list in format ["owner/repo1", "owner/repo2", ...]
+
+    Raises:
+        ValueError: if GH_PAT or TARGET_REPOS is missing or malformed.
+    """
+    if not GH_PAT:
+        raise ValueError("ERROR: GH_PAT is not set. Please add it in Secrets/environment variables.")
+    if not TARGET_REPOS or TARGET_REPOS.strip() == "":
+        raise ValueError(
+            "ERROR: TARGET_REPOS is not set. Please provide a comma-separated repo list "
+            "(e.g. 'owner/repo1,owner/repo2')."
+        )
+
+    target_repos_list = TARGET_REPOS.split(",")
+    if not all(repo.strip() for repo in target_repos_list):
+        raise ValueError("ERROR: Invalid TARGET_REPOS format. Repo list cannot contain empty items.")
+    return [repo.strip() for repo in target_repos_list]
+
+
+def get_stat_period():
+    """
+    Get the statistics time range.
+
+    Returns:
+        tuple[str, str]: (since, until)
+            - since: 00:00:00Z of START_DATE or (now - 14 days)
+            - until: 23:59:59Z of END_DATE or today
+
+    Raises:
+        ValueError: if START_DATE/END_DATE are set but not 'YYYY-MM-DD'.
+    """
+    # Prefer manually specified window via env vars
+    # Format: START_DATE=2025-10-31, END_DATE=2026-01-31
+    start_str = os.getenv("START_DATE")
+    end_str = os.getenv("END_DATE")
+
+    if start_str and end_str:
+        try:
+            datetime.strptime(start_str, "%Y-%m-%d")
+            datetime.strptime(end_str, "%Y-%m-%d")
+        except ValueError:
+            raise ValueError("ERROR: Invalid START_DATE/END_DATE format. Expected 'YYYY-MM-DD'.")
+        return f"{start_str}T00:00:00Z", f"{end_str}T23:59:59Z"
+
+    # Auto-calculate biweekly window: last 14 days to today
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=14)
+    return start_date.strftime("%Y-%m-%dT00:00:00Z"), end_date.strftime("%Y-%m-%dT23:59:59Z")
+
+
+def _get_paged(url, params=None, headers=None):
+    """
+    Yield items from a paginated GitHub list endpoint until the last page.
+
+    Args:
+        url (str): full API endpoint URL.
+        params (dict): extra query parameters; 'per_page'/'page' are managed here.
+        headers (dict): request headers; defaults to the module-level HEADERS.
+    """
+    params = dict(params or {})
+    params["per_page"] = MAX_PER_PAGE
+    page = 1
+    while True:
+        params["page"] = page
+        resp = session.get(url, headers=HEADERS if headers is None else headers, params=params)
+        resp.raise_for_status()
+        items = resp.json()
+        if not items:
+            return
+        yield from items
+        if len(items) < MAX_PER_PAGE:
+            # A short page means this was the final page; skip one extra request.
+            return
+        page += 1
+
+
+def get_github_contributor_stats(
+    repo,
+    since,
+    until,
+    token=None,
+    per_page=1000,
+):
+    """
+    Collect contributor statistics for a repository.
+
+    Args:
+        repo (str): repository in "owner/name" form.
+        since (str): ISO-8601 start of the time window.
+        until (str): ISO-8601 end of the time window.
+        token (str): optional token; anonymous requests are used otherwise.
+        per_page (int): kept for backward compatibility; the GitHub API caps
+            page size at 100, so values above 100 have no effect.
+
+    Returns:
+        (int, int):
+            - total_contributors_count: total historical contributors count
+            - period_contributors_count: contributors within the given time window
+    """
+    headers = {"Authorization": f"Bearer {token}"} if token else {}
+
+    # -------------------------
+    # 1) Total historical contributors
+    # -------------------------
+    total_contributors = set()
+    url = f"https://api.github.com/repos/{repo}/contributors"
+    for c in _get_paged(url, headers=headers):
+        if c.get("login"):
+            total_contributors.add(c["login"])
+
+    # -------------------------
+    # 2) Contributors in time window (from commits)
+    # -------------------------
+    period_contributors = set()
+    url = f"https://api.github.com/repos/{repo}/commits"
+    for c in _get_paged(url, params={"since": since, "until": until}, headers=headers):
+        # 'author' may be null or lack a login for commits not linked to an account
+        author = c.get("author")
+        if author and author.get("login"):
+            period_contributors.add(author["login"])
+        else:
+            # Fallback to email (anonymous commits)
+            email = c["commit"]["author"].get("email")
+            if email:
+                period_contributors.add(email)
+
+    return len(total_contributors), len(period_contributors)
+
+
+def fetch_prs(repo, start_str, end_str, is_main_repo=False):
+    """
+    Fetch PR statistics within the time window.
+    Also supports counting PRs merged into a main/community repo from target repos.
+
+    Note: the /pulls endpoint ignores 'since'/'until' query parameters, so PRs
+    are fetched sorted by update time (newest first) and filtered client-side;
+    paging stops as soon as results fall before the window start.
+
+    Returns:
+        (int, int, int, int, int):
+            - total PRs (updated within the time window)
+            - merged PRs (merged within the time window)
+            - closed PRs (closed but not merged)
+            - open PRs
+            - prs_from_target (only for main repo scenario)
+    """
+    try:
+        url = f"https://api.github.com/repos/{repo}/pulls"
+        params = {"state": "all", "sort": "updated", "direction": "desc"}
+        prs = []
+        for pr in _get_paged(url, params=params):
+            if pr["updated_at"] < start_str:
+                break  # results sorted desc by update time: the rest are older
+            prs.append(pr)
+
+        open_prs = [pr for pr in prs if pr["state"] == "open"]
+        merged_prs = [pr for pr in prs if pr.get("merged_at") and start_str <= pr["merged_at"] <= end_str]
+        closed_prs = [pr for pr in prs if pr["state"] == "closed" and not pr.get("merged_at")]
+
+        prs_from_target = 0
+        if is_main_repo:
+            target_repos = validate_config()
+            for pr in merged_prs:
+                # Determine whether the PR head repo is in the target repo list;
+                # 'head.repo' is null when the fork was deleted.
+                head_repo = (pr.get("head") or {}).get("repo") or {}
+                if head_repo.get("full_name") in target_repos:
+                    prs_from_target += 1
+
+        return len(prs), len(merged_prs), len(closed_prs), len(open_prs), prs_from_target
+
+    except requests.exceptions.HTTPError as e:
+        raise Exception(f"Failed to fetch PR data for {repo}: {e.response.status_code} - {e.response.text}")
+    except Exception as e:
+        raise Exception(f"Exception while fetching PR data for {repo}: {str(e)}")
+
+
+def fetch_issues(repo, start_str, end_str):
+    """
+    Fetch issue statistics within the time window (Open/Closed).
+    Note: GitHub issues API also returns PRs; we filter PR objects out.
+    'since' filters by update time server-side; 'until' is not supported by the
+    API, so the upper bound is applied client-side.
+
+    Returns:
+        (int, int): open_issues_count, closed_issues_count
+    """
+    try:
+        url = f"https://api.github.com/repos/{repo}/issues"
+        issues = [
+            issue
+            for issue in _get_paged(url, params={"state": "all", "since": start_str})
+            # Drop PR objects and anything updated after the window end
+            if "pull_request" not in issue and issue["updated_at"] <= end_str
+        ]
+
+        open_issues = [issue for issue in issues if issue["state"] == "open"]
+        close_issues = [issue for issue in issues if issue["state"] == "closed"]
+
+        return len(open_issues), len(close_issues)
+
+    except requests.exceptions.HTTPError as e:
+        raise Exception(f"Failed to fetch Issue data for {repo}: {e.response.status_code} - {e.response.text}")
+    except Exception as e:
+        raise Exception(f"Exception while fetching Issue data for {repo}: {str(e)}")
+
+
+def fetch_stars_forks(repo, start_str):
+    """
+    Fetch stars/forks totals and estimate new stars/forks since the start date.
+    Note: this uses /events, which only covers roughly the last 90 days
+    (max ~300 events), so the 'new' counts are a lower-bound estimate.
+
+    Returns:
+        (int, int, int, int):
+            - new_stars
+            - current_stars_total
+            - new_forks
+            - current_forks_total
+    """
+    try:
+        # Current totals
+        url = f"https://api.github.com/repos/{repo}"
+        response = session.get(url, headers=HEADERS)
+        response.raise_for_status()
+        repo_info = response.json()
+        current_stars = repo_info["stargazers_count"]
+        current_forks = repo_info["forks_count"]
+
+        # Count star/fork events since the start date
+        start_date = start_str.split("T")[0]
+        new_stars = 0
+        new_forks = 0
+        for event in _get_paged(f"https://api.github.com/repos/{repo}/events"):
+            if event["created_at"].split("T")[0] < start_date:
+                continue
+            if event["type"] == "WatchEvent":
+                new_stars += 1
+            elif event["type"] == "ForkEvent":
+                new_forks += 1
+
+        return new_stars, current_stars, new_forks, current_forks
+
+    except requests.exceptions.HTTPError as e:
+        raise Exception(f"Failed to fetch Stars/Forks data for {repo}: {e.response.status_code} - {e.response.text}")
+    except Exception as e:
+        raise Exception(f"Exception while fetching Stars/Forks data for {repo}: {str(e)}")
+
+
+def plot_github_stats(stats: dict, start_time: str, end_time: str):
+    """
+    Visualize GitHub community metrics (PR/Issue/Contributors/Stars/Forks)
+    as a bar chart saved next to this script as 'github_stats.png'.
+
+    Args:
+        stats (dict): aggregated statistics
+        start_time (str): start time string
+        end_time (str): end time string
+    """
+    labels = ["PR Total", "Issue Total", "Contributors", "Stars", "Forks"]
+    values = [
+        stats["prs_total"],
+        stats["issues_total"],
+        stats["total_contributors"],
+        stats["stars_total"],
+        stats["forks_total"],
+    ]
+
+    colors = ['#4E79A7', '#F28E2B', '#E15759', '#76B7B2', '#59A14F']
+
+    def extract_date(date_str):
+        # Support multiple time formats
+        for fmt in ('%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S', '%Y/%m/%d'):
+            try:
+                return datetime.strptime(date_str.strip(), fmt).strftime('%Y-%m-%d')
+            except ValueError:
+                continue
+        return date_str[:10] if len(date_str) >= 10 else date_str
+
+    start_date = extract_date(start_time)
+    end_date = extract_date(end_time)
+
+    plt.figure(figsize=(9, 6))
+    bars = plt.bar(labels, values, color=colors)
+
+    # Label each bar with its value, slightly above the bar top
+    for bar in bars:
+        height = bar.get_height()
+        plt.text(
+            bar.get_x() + bar.get_width() / 2,
+            height + max(values) * 0.01,
+            f'{int(height):,}',
+            ha='center',
+            va='bottom',
+            fontweight='bold',
+            fontsize=10
+        )
+
+    plt.title(f'GitHub Community Statistics\n({start_date} ~ {end_date})', fontsize=14, fontweight='bold')
+    plt.ylabel("Count", fontsize=12)
+    plt.xticks(rotation=45, ha="right")
+    plt.tight_layout()
+
+    save_path = os.path.join(os.path.dirname(__file__), "github_stats.png")
+    plt.savefig(save_path, dpi=300, bbox_inches="tight")
+    plt.close()
+
+
+def main():
+    """
+    Main entry:
+    - Validate configuration
+    - Compute time window
+    - Fetch statistics for target repos
+    - Print summary
+    - Generate chart
+    """
+    try:
+        # 1) Validate config
+        target_repos_list = validate_config()
+
+        # 2) Get time window
+        start_str, end_str = get_stat_period()
+        print(f"Statistics period: {start_str} ~ {end_str}")
+
+        # 3) Aggregation container
+        total_data = {
+            "total_contributors": 0,
+            "period_contributors": 0,
+            "prs_total": 0,
+            "prs_merged": 0,
+            "prs_closed": 0,
+            "prs_open": 0,
+            "stars_new": 0,
+            "stars_total": 0,
+            "forks_new": 0,
+            "forks_total": 0,
+            "open_issues": 0,
+            "close_issues": 0,
+            "issues_total": 0
+        }
+
+        # 4) Fetch data per repo
+        for repo in target_repos_list:
+            total_contributors, period_contributors = get_github_contributor_stats(repo, start_str, end_str, GH_PAT)
+            prs_t, prs_m, prs_c, prs_o, _ = fetch_prs(repo, start_str, end_str)
+            open_issues, close_issues = fetch_issues(repo, start_str, end_str)
+            stars_n, stars_t, forks_n, forks_t = fetch_stars_forks(repo, start_str)
+
+            # Aggregate (totals are summed across all target repos)
+            total_data["total_contributors"] += total_contributors
+            total_data["period_contributors"] += period_contributors
+            total_data["prs_total"] += prs_t
+            total_data["prs_merged"] += prs_m
+            total_data["prs_closed"] += prs_c
+            total_data["prs_open"] += prs_o
+            total_data["open_issues"] += open_issues
+            total_data["close_issues"] += close_issues
+            total_data["issues_total"] += open_issues + close_issues
+            total_data["stars_new"] += stars_n
+            total_data["stars_total"] += stars_t
+            total_data["forks_new"] += forks_n
+            total_data["forks_total"] += forks_t
+
+        prs_closed_total = total_data["prs_merged"] + total_data["prs_closed"]
+
+        print("\n================ GitHub Community Summary ================\n")
+
+        print("[Pull Request]")
+        print(f" - Total PRs: {total_data['prs_total']}")
+        print(f" - Closed PRs (Merged + Closed): {prs_closed_total}")
+        print(f" - Current Open PRs: {total_data['prs_open']}\n")
+
+        print("[Issues]")
+        print(f" - Total Issues: {total_data['issues_total']}")
+        print(f" - Closed Issues: {total_data['close_issues']}")
+        print(f" - Current Open Issues: {total_data['open_issues']}\n")
+
+        print("[Community Metrics]")
+        print(f" - Total Stars: {total_data['stars_total']}")
+        print(f" - Total Forks: {total_data['forks_total']}")
+        print(f" - Total Contributors: {total_data['total_contributors']}")
+
+        plot_github_stats(total_data, start_str, end_str)
+        print("\nChart generated successfully: github_stats.png\n")
+
+    except Exception as e:
+        print(f"Data collection failed: {str(e)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/community/readme.md b/community/readme.md
new file mode 100644
index 0000000..b3f5a15
--- /dev/null
+++ b/community/readme.md
@@ -0,0 +1,84 @@
+# GitHub CI/CD Data Collection Tool
+
+This tool is used to pull activity data from specified GitHub repositories within a given time window (e.g., PRs, Issues, Contributors, Stars, etc.), which can then be used for further analysis and visualization.
+
+Suitable for:
+- Engineering productivity metrics (PR/Issue/Contributors/Stars within a period)
+- CI/CD change tracking (by time range)
+- Batch data collection for a single repository
+
+---
+
+## Directory Structure
+
+```text
+community/
+  collect_github_data.py
+```
+
+---
+
+## Requirements
+
+- Python 3.8+ (recommended 3.10+)
+- Network access to the GitHub API
+- GitHub Personal Access Token (PAT)
+
+---
+
+## Quick Start
+
+### 1) Create a virtual environment (recommended)
+
+```bash
+python3 -m venv venv/dev-cicd
+source venv/dev-cicd/bin/activate
+```
+
+(Optional) Upgrade pip:
+
+```bash
+pip install -U pip
+```
+
+---
+
+### 2) Configure a GitHub Token (required)
+
+This tool fetches data through the GitHub API. To avoid API rate limits and ensure access to private repositories, it is recommended to configure a personal token.
+
+1. Open the token creation page:
+   https://github.com/settings/tokens
+2. Create a token (read-only permissions are sufficient)
+3. Export the token as an environment variable:
+
+```bash
+export GH_PAT=xxx
+```
+
+> ⚠️ Do NOT hardcode your token or commit it into the repository.
+
+---
+
+### 3) Set collection parameters and run
+
+```bash
+export START_DATE=2025-12-08
+export END_DATE=2026-01-24
+export TARGET_REPOS=baidu/vLLM-Kunlun
+
+python community/collect_github_data.py
+```
+
+---
+
+## Parameters
+
+The tool is configured via environment variables:
+
+| Parameter | Required | Example | Description |
+|----------|----------|---------|-------------|
+| `GH_PAT` | ✅ | `ghp_xxx` | GitHub Personal Access Token |
+| `START_DATE` | ✅ | `2025-12-08` | Collection start date (format: YYYY-MM-DD) |
+| `END_DATE` | ✅ | `2026-01-24` | Collection end date (format: YYYY-MM-DD) |
+| `TARGET_REPOS` | ✅ | `baidu/vLLM-Kunlun` | Target repositories (supports multiple) |