From 2387c22b5614288987ae35aef4fe344e852be77f Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Sat, 27 Sep 2025 09:11:21 +0800 Subject: [PATCH] Ci monitor support performance (#10965) --- .github/workflows/ci-monitor.yml | 15 +- scripts/ci_monitor/README.md | 189 ++++++- scripts/ci_monitor/ci_analyzer_perf.py | 732 +++++++++++++++++++++++++ 3 files changed, 922 insertions(+), 14 deletions(-) create mode 100755 scripts/ci_monitor/ci_analyzer_perf.py diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml index d5155e625..c29dff261 100644 --- a/.github/workflows/ci-monitor.yml +++ b/.github/workflows/ci-monitor.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install requests + pip install requests matplotlib pandas - name: Run CI Analysis env: @@ -43,9 +43,20 @@ jobs: cd scripts/ci_monitor python ci_analyzer.py --token $GITHUB_TOKEN --limit ${{ github.event.inputs.limit || '1000' }} --output ci_analysis_$(date +%Y%m%d_%H%M%S).json + - name: Run Performance Analysis + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }} + PYTHONUNBUFFERED: 1 + PYTHONIOENCODING: utf-8 + run: | + cd scripts/ci_monitor + python ci_analyzer_perf.py --token $GITHUB_TOKEN --limit 500 --output-dir performance_tables_$(date +%Y%m%d_%H%M%S) + - name: Upload Analysis Results uses: actions/upload-artifact@v4 with: name: ci-analysis-results-${{ github.run_number }} - path: scripts/ci_monitor/ci_analysis_*.json + path: | + scripts/ci_monitor/ci_analysis_*.json + scripts/ci_monitor/performance_tables_* retention-days: 30 diff --git a/scripts/ci_monitor/README.md b/scripts/ci_monitor/README.md index 4bd94d8b1..709bc09be 100644 --- a/scripts/ci_monitor/README.md +++ b/scripts/ci_monitor/README.md @@ -1,35 +1,61 @@ # SGLang CI Monitor -A simple tool to analyze CI failures for the SGLang project. 
This tool fetches recent CI run data from GitHub Actions and provides detailed analysis of failure patterns. +> **Note**: This README.md is primarily generated by Claude 4 with some manual adjustments. + +A comprehensive toolkit to analyze CI failures and performance trends for the SGLang project. This toolkit includes two main tools: + +1. **CI Analyzer** (`ci_analyzer.py`): Analyzes CI failures and provides detailed failure pattern analysis +2. **Performance Analyzer** (`ci_analyzer_perf.py`): Tracks performance metrics over time and generates trend charts ## Features +### CI Analyzer (`ci_analyzer.py`) - **Simple Analysis**: Analyze recent CI runs and identify failure patterns - **Category Classification**: Automatically categorize failures by type (unit-test, performance, etc.) - **Pattern Recognition**: Identify common failure patterns (timeouts, build failures, etc.) - **CI Links**: Direct links to recent failed CI runs for detailed investigation - **Last Success Tracking**: Track the last successful run for each failed job with PR information - **JSON Export**: Export detailed analysis data to JSON format -- **Automated Monitoring**: GitHub Actions workflow for continuous CI monitoring + +### Performance Analyzer (`ci_analyzer_perf.py`) +- **Performance Tracking**: Monitor performance metrics across CI runs over time +- **Automated Chart Generation**: Generate time-series charts for each performance metric +- **Multi-Test Support**: Track performance for all test types (throughput, latency, accept length) +- **CSV Export**: Export performance data in structured CSV format +- **Trend Analysis**: Visualize performance trends with time-series PNG charts +- **Comprehensive Metrics**: Track output throughput, E2E latency, TTFT, accept length, and more + +### Common Features +- **Automated Monitoring**: GitHub Actions workflow for continuous CI and performance monitoring ## Installation +### For CI Analyzer No additional dependencies required beyond Python standard library 
and `requests`: ```bash pip install requests ``` +### For Performance Analyzer +Additional dependencies required for chart generation: + +```bash +pip install requests matplotlib pandas +``` + ## Usage -### Basic Usage +### CI Analyzer + +#### Basic Usage ```bash # Replace YOUR_GITHUB_TOKEN with your actual token from https://github.com/settings/tokens python ci_analyzer.py --token YOUR_GITHUB_TOKEN ``` -### Advanced Usage +#### Advanced Usage ```bash # Analyze last 1000 runs @@ -39,16 +65,45 @@ python ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 1000 python ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 500 --output my_analysis.json ``` +### Performance Analyzer + +#### Basic Usage + +```bash +# Analyze performance trends from recent CI runs +python ci_analyzer_perf.py --token YOUR_GITHUB_TOKEN +``` + +#### Advanced Usage + +```bash +# Analyze last 1000 PR Test runs +python ci_analyzer_perf.py --token YOUR_GITHUB_TOKEN --limit 1000 + +# Custom output directory +python ci_analyzer_perf.py --token YOUR_GITHUB_TOKEN --limit 500 --output-dir my_performance_data +``` + **Important**: Make sure your GitHub token has `repo` and `workflow` permissions, otherwise you'll get 404 errors. ## Parameters +### CI Analyzer Parameters + | Parameter | Default | Description | |-----------|---------|-------------| | `--token` | Required | GitHub Personal Access Token | | `--limit` | 100 | Number of CI runs to analyze | | `--output` | ci_analysis.json | Output JSON file for detailed data | +### Performance Analyzer Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--token` | Required | GitHub Personal Access Token | +| `--limit` | 100 | Number of PR Test runs to analyze | +| `--output-dir` | performance_tables | Output directory for CSV tables and PNG charts | + ## Getting GitHub Token 1. 
Go to [GitHub Settings > Personal Access Tokens](https://github.com/settings/tokens) @@ -62,15 +117,15 @@ python ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 500 --output my_analysis ## Output -The tool provides: +### CI Analyzer Output -### Console Output +#### Console Output - Overall statistics (total runs, success rate, etc.) - Category failure breakdown - Most frequently failed jobs (Top 50) with direct CI links - Failure pattern analysis -### JSON Export +#### JSON Export Detailed analysis data including: - Complete failure statistics - Job failure counts @@ -78,8 +133,51 @@ Detailed analysis data including: - Failure patterns - Recent failure details +### Performance Analyzer Output + +#### Console Output +- Performance data collection progress +- Summary statistics of collected tests and records +- Generated file locations (CSV tables and PNG charts) + +#### File Outputs +- **CSV Tables**: Structured performance data with columns: + - `created_at`: Timestamp of the CI run + - `run_number`: GitHub Actions run number + - `pr_number`: Pull request number (if applicable) + - `author`: Author of the run's head commit + - `head_sha`: Git commit SHA (first 8 characters) + - Performance metrics (varies by test type): + - `output_throughput_token_s`: Output throughput in tokens/second + - `median_e2e_latency_ms`: Median end-to-end latency in milliseconds + - `median_ttft_ms`: Median time-to-first-token in milliseconds + - `accept_length`: Accept length for speculative decoding tests + - `url`: Direct link to the GitHub Actions run + +- **PNG Charts**: Time-series visualization charts for each metric: + - X-axis: Time (MM-DD HH:MM format) + - Y-axis: Performance metric values + - File naming: `{test_name}_{metric_name}.png` + +#### Directory Structure +``` +performance_tables/ +├── performance-test-1-gpu-part-1_summary/ +│ ├── test_bs1_default.csv +│ ├── test_bs1_default_output_throughput_token_s.png +│ ├── test_online_latency_default.csv +│ ├── 
test_online_latency_default_median_e2e_latency_ms.png +│ └── ... +├── performance-test-1-gpu-part-2_summary/ +│ └── ... +└── performance-test-2-gpu_summary/ + └── ... +``` + ## Example Output +### CI Analyzer Example + ``` ============================================================ @@ -412,6 +510,58 @@ Failure Pattern Analysis: Build Failure: 15 times ``` +### Performance Analyzer Example + +``` +============================================================ +SGLang Performance Analysis Report +============================================================ + +Getting recent 100 PR Test runs... +Got 100 PR test runs... + +Collecting performance data from CI runs... +Processing run 34882 (2025-09-26 03:16)... + Found performance-test-1-gpu-part-1 job (success) + Found performance-test-1-gpu-part-2 job (success) + Found performance-test-2-gpu job (success) +Processing run 34881 (2025-09-26 02:45)... + Found performance-test-1-gpu-part-1 job (success) + Found performance-test-1-gpu-part-2 job (success) +... + +Performance data collection completed! + +Generating performance tables to directory: performance_tables + Generated table: performance_tables/performance-test-1-gpu-part-1_summary/test_bs1_default.csv + Generated chart: performance_tables/performance-test-1-gpu-part-1_summary/test_bs1_default_output_throughput_token_s.png + Generated table: performance_tables/performance-test-1-gpu-part-1_summary/test_online_latency_default.csv + Generated chart: performance_tables/performance-test-1-gpu-part-1_summary/test_online_latency_default_median_e2e_latency_ms.png + ... + +Performance tables and charts generation completed! 
+ +============================================================ +Performance Analysis Summary +============================================================ + +Total PR Test runs processed: 100 +Total performance tests found: 18 +Total performance records collected: 1,247 + +Performance test breakdown: + performance-test-1-gpu-part-1: 7 tests, 423 records + performance-test-1-gpu-part-2: 5 tests, 387 records + performance-test-2-gpu: 6 tests, 437 records + +Generated files: + CSV tables: 18 files + PNG charts: 18 files + Output directory: performance_tables/ + +Analysis completed successfully! +``` + ## CI Job Categories The tool automatically categorizes CI jobs into: @@ -459,11 +609,17 @@ logging.basicConfig(level=logging.DEBUG) ## Automated Monitoring -The CI monitor is also available as a GitHub Actions workflow that runs automatically every 6 hours. The workflow: +Both CI and Performance analyzers are available as a GitHub Actions workflow that runs automatically every 6 hours. The workflow: -- Analyzes the last 500 CI runs -- Generates detailed reports -- Uploads analysis results as artifacts +### CI Analysis +- Analyzes the last 1000 CI runs (configurable) +- Generates detailed failure reports +- Uploads analysis results as JSON artifacts + +### Performance Analysis +- Analyzes the last 500 PR Test runs (fixed `--limit 500` in the workflow; not affected by the `limit` input) +- Generates performance trend data and charts +- Uploads CSV tables and PNG charts as artifacts ### Workflow Configuration @@ -472,7 +628,16 @@ The workflow is located at `.github/workflows/ci-monitor.yml` and uses the `GH_P ### Manual Trigger You can manually trigger the workflow from the GitHub Actions tab with custom parameters: -- `limit`: Number of CI runs to analyze (default: 500) +- `limit`: Number of CI runs to analyze (default: 1000) + +### Artifacts Generated + +The workflow generates and uploads the following artifacts: +- **CI Analysis**: JSON files with failure analysis data +- **Performance Analysis**: + - CSV files with 
performance metrics organized by test type + - PNG charts showing performance trends over time + - Directory structure: `performance_tables_{timestamp}/` ## License diff --git a/scripts/ci_monitor/ci_analyzer_perf.py b/scripts/ci_monitor/ci_analyzer_perf.py new file mode 100755 index 000000000..d1535bca5 --- /dev/null +++ b/scripts/ci_monitor/ci_analyzer_perf.py @@ -0,0 +1,732 @@ +#!/usr/bin/env python3 +""" +SGLang CI Performance Analyzer - Simplified Version +Collect performance data based on actual log format +""" + +import argparse +import csv +import os +import re +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from typing import Dict, List, Optional + +import matplotlib.dates as mdates +import matplotlib.pyplot as plt +import pandas as pd +import requests +from matplotlib import rcParams + + +class SGLangPerfAnalyzer: + """SGLang CI Performance Analyzer""" + + def __init__(self, token: str): + self.token = token + self.base_url = "https://api.github.com" + self.repo = "sgl-project/sglang" + self.headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": "SGLang-Perf-Analyzer/1.0", + } + self.session = requests.Session() + self.session.headers.update(self.headers) + + # Performance test job names + self.performance_jobs = [ + "performance-test-1-gpu-part-1", + "performance-test-1-gpu-part-2", + "performance-test-2-gpu", + ] + + # Strictly match tests and metrics shown in the images + self.target_tests_and_metrics = { + "performance-test-1-gpu-part-1": { + "test_bs1_default": ["output_throughput_token_s"], + "test_online_latency_default": ["median_e2e_latency_ms"], + "test_offline_throughput_default": ["output_throughput_token_s"], + "test_offline_throughput_non_stream_small_batch_size": [ + "output_throughput_token_s" + ], + "test_online_latency_eagle": ["median_e2e_latency_ms", "accept_length"], + "test_lora_online_latency": 
["median_e2e_latency_ms", "median_ttft_ms"], + "test_lora_online_latency_with_concurrent_adapter_updates": [ + "median_e2e_latency_ms", + "median_ttft_ms", + ], + }, + "performance-test-1-gpu-part-2": { + "test_offline_throughput_without_radix_cache": [ + "output_throughput_token_s" + ], + "test_offline_throughput_with_triton_attention_backend": [ + "output_throughput_token_s" + ], + "test_offline_throughput_default_fp8": ["output_throughput_token_s"], + "test_vlm_offline_throughput": ["output_throughput_token_s"], + "test_vlm_online_latency": ["median_e2e_latency_ms"], + }, + "performance-test-2-gpu": { + "test_moe_tp2_bs1": ["output_throughput_token_s"], + "test_torch_compile_tp2_bs1": ["output_throughput_token_s"], + "test_moe_offline_throughput_default": ["output_throughput_token_s"], + "test_moe_offline_throughput_without_radix_cache": [ + "output_throughput_token_s" + ], + "test_pp_offline_throughput_default_decode": [ + "output_throughput_token_s" + ], + "test_pp_long_context_prefill": ["input_throughput_token_s"], + }, + } + + # Performance metric patterns - only keep metrics needed in images + self.perf_patterns = { + # Key metrics shown in images + "output_throughput_token_s": r"Output token throughput \(tok/s\):\s*([\d.]+)", + "Output_throughput_token_s": r"Output throughput:\s*([\d.]+)\s*token/s", + "median_e2e_latency_ms": r"Median E2E Latency \(ms\):\s*([\d.]+)", + "median_ttft_ms": r"Median TTFT \(ms\):\s*([\d.]+)", + "accept_length": r"Accept length:\s*([\d.]+)", + "input_throughput_token_s": r"Input token throughput \(tok/s\):\s*([\d.]+)", + } + + # Pre-compile regex patterns for better performance + self.compiled_patterns = { + name: re.compile(pattern, re.IGNORECASE) + for name, pattern in self.perf_patterns.items() + } + + # Pre-compile test pattern + self.test_pattern = re.compile( + r"python3 -m unittest (test_bench_\w+\.TestBench\w+\.test_\w+)" + ) + + # Setup matplotlib fonts and styles + self._setup_matplotlib() + + def 
_setup_matplotlib(self): + """Setup matplotlib fonts and styles""" + # Set fonts + rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans"] + rcParams["axes.unicode_minus"] = False # Fix minus sign display issue + + # Set chart styles + plt.style.use("default") + rcParams["figure.figsize"] = (12, 6) + rcParams["font.size"] = 10 + rcParams["axes.grid"] = True + rcParams["grid.alpha"] = 0.3 + + def get_recent_runs(self, limit: int = 100) -> List[Dict]: + """Get recent CI run data""" + print(f"Getting recent {limit} PR Test runs...") + + pr_test_runs = [] + page = 1 + per_page = 100 + + while len(pr_test_runs) < limit: + url = f"{self.base_url}/repos/{self.repo}/actions/runs" + params = {"per_page": per_page, "page": page} + + try: + response = self.session.get(url, params=params) + response.raise_for_status() + data = response.json() + + if not data.get("workflow_runs"): + break + + # Filter PR Test runs + current_pr_tests = [ + run for run in data["workflow_runs"] if run.get("name") == "PR Test" + ] + + # Add to result list, but not exceed limit + for run in current_pr_tests: + if len(pr_test_runs) < limit: + pr_test_runs.append(run) + else: + break + + print(f"Got {len(pr_test_runs)} PR test runs...") + + # Exit if no more data on this page or reached limit + if len(data["workflow_runs"]) < per_page or len(pr_test_runs) >= limit: + break + + page += 1 + time.sleep(0.1) # Avoid API rate limiting + + except requests.exceptions.RequestException as e: + print(f"Error getting CI data: {e}") + break + + return pr_test_runs + + def get_job_logs(self, run_id: int, job_name: str) -> Optional[str]: + """Get logs for specific job with early exit optimization""" + try: + # First get job list + jobs_url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs" + response = self.session.get(jobs_url) + response.raise_for_status() + jobs_data = response.json() + + # Find matching job with early exit + target_job = None + for job in jobs_data.get("jobs", 
[]): + if job_name in job.get("name", ""): + # Early exit if job failed or was skipped + if job.get("conclusion") not in ["success", "neutral"]: + return None + target_job = job + break + + if not target_job: + return None + + # Get logs + logs_url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{target_job['id']}/logs" + response = self.session.get(logs_url) + response.raise_for_status() + + return response.text + + except Exception as e: + # Reduce verbose error logging for common failures + if "404" not in str(e): + print(f"Failed to get job {job_name} logs: {e}") + return None + + def get_all_job_logs_parallel(self, run_id: int) -> Dict[str, Optional[str]]: + """Get logs for all performance jobs in parallel""" + + def fetch_job_logs(job_name: str) -> tuple[str, Optional[str]]: + """Fetch logs for a single job""" + logs = self.get_job_logs(run_id, job_name) + return job_name, logs + + results = {} + with ThreadPoolExecutor( + max_workers=8 + ) as executor: # Increased concurrent requests + # Submit all job log requests + future_to_job = { + executor.submit(fetch_job_logs, job_name): job_name + for job_name in self.performance_jobs + } + + # Collect results as they complete + for future in as_completed(future_to_job): + job_name, logs = future.result() + results[job_name] = logs + + return results + + def parse_performance_data( + self, log_content: str, job_name: str + ) -> Dict[str, Dict[str, str]]: + """Parse specified performance data from logs""" + if not log_content: + return {} + + test_data = {} + + # Get target tests for current job + target_tests = self.target_tests_and_metrics.get(job_name, {}) + if not target_tests: + return test_data + + # Find all unittest tests using pre-compiled pattern + test_matches = self.test_pattern.findall(log_content) + + for test_match in test_matches: + test_name = test_match.split(".")[-1] # Extract test name + + # Only process target tests + if test_name not in target_tests: + continue + + # Find performance data 
after this test + test_section = self._extract_test_section(log_content, test_match) + if test_section: + # Only find metrics needed for this test + target_metrics = target_tests[test_name] + perf_data = {} + + for metric_name in target_metrics: + if metric_name in self.compiled_patterns: + compiled_pattern = self.compiled_patterns[metric_name] + matches = compiled_pattern.findall(test_section) + if matches: + perf_data[metric_name] = matches[-1] # Take the last match + + if perf_data: + test_data[test_name] = perf_data + + return test_data + + def _extract_test_section(self, log_content: str, test_pattern: str) -> str: + """Extract log section for specific test""" + lines = log_content.split("\n") + test_start = -1 + test_end = len(lines) + + # Find test start position + for i, line in enumerate(lines): + if test_pattern in line: + test_start = i + break + + if test_start == -1: + return "" + + # Find test end position (next test start or major separator) + for i in range(test_start + 1, len(lines)): + line = lines[i] + if ( + "python3 -m unittest" in line and "test_" in line + ) or "##[group]" in line: + test_end = i + break + + return "\n".join(lines[test_start:test_end]) + + def collect_performance_data(self, runs: List[Dict]) -> Dict[str, List[Dict]]: + """Collect all performance data""" + print("Starting performance data collection...") + + # Create data list for each test + all_test_data = {} + + total_runs = len(runs) + for i, run in enumerate(runs, 1): + print(f"Processing run {i}/{total_runs}: #{run.get('run_number')}") + + run_info = { + "run_number": run.get("run_number"), + "created_at": run.get("created_at"), + "head_sha": run.get("head_sha", "")[:8], + "author": run.get("head_commit", {}) + .get("author", {}) + .get("name", "Unknown"), + "pr_number": None, + "url": f"https://github.com/{self.repo}/actions/runs/{run.get('id')}", + } + + # Extract PR number + pull_requests = run.get("pull_requests", []) + if pull_requests: + run_info["pr_number"] = 
pull_requests[0].get("number") + + # Get all job logs in parallel + all_job_logs = self.get_all_job_logs_parallel(run.get("id")) + + # Process each performance test job + for job_name, logs in all_job_logs.items(): + if not logs: + continue + + # Parse performance data + test_results = self.parse_performance_data(logs, job_name) + + for test_name, perf_data in test_results.items(): + # Create full test name including job info + full_test_name = f"{job_name}_{test_name}" + + if full_test_name not in all_test_data: + all_test_data[full_test_name] = [] + + test_entry = {**run_info, **perf_data} + all_test_data[full_test_name].append(test_entry) + print( + f" Found {test_name} performance data: {list(perf_data.keys())}" + ) + + time.sleep(0.2) # Slightly longer delay between runs to be API-friendly + + return all_test_data + + def generate_performance_tables( + self, test_data: Dict[str, List[Dict]], output_dir: str = "performance_tables" + ): + """Generate performance data tables""" + print(f"Generating performance tables to directory: {output_dir}") + + # Create output directory structure + os.makedirs(output_dir, exist_ok=True) + + # Create subdirectory for each job + job_dirs = {} + for job_name in self.performance_jobs: + job_dir = os.path.join(output_dir, f"{job_name}_summary") + os.makedirs(job_dir, exist_ok=True) + job_dirs[job_name] = job_dir + + # Generate table for each test + for full_test_name, data_list in test_data.items(): + if not data_list: + continue + + # Determine which job this test belongs to + job_name = None + test_name = full_test_name + for job in self.performance_jobs: + if full_test_name.startswith(job): + job_name = job + test_name = full_test_name[len(job) + 1 :] # Remove job prefix + break + + if not job_name: + continue + + job_dir = job_dirs[job_name] + table_file = os.path.join(job_dir, f"{test_name}.csv") + + # Generate CSV table + self._write_csv_table(table_file, test_name, data_list) + + # Generate corresponding chart + print(f" 
Generating chart for {test_name}...") + self._generate_chart(table_file, test_name, data_list, job_dir) + + print("Performance tables and charts generation completed!") + + def _write_csv_table(self, file_path: str, test_name: str, data_list: List[Dict]): + """Write CSV table""" + if not data_list: + return + + # Get all possible columns + all_columns = set() + for entry in data_list: + all_columns.update(entry.keys()) + + # Define column order + base_columns = ["created_at", "run_number", "pr_number", "author", "head_sha"] + perf_columns = [col for col in all_columns if col not in base_columns + ["url"]] + columns = base_columns + sorted(perf_columns) + ["url"] + + with open(file_path, "w", encoding="utf-8", newline="") as f: + writer = csv.writer(f) + + # Write header + writer.writerow(columns) + + # Write data rows + for entry in sorted( + data_list, key=lambda x: x.get("created_at", ""), reverse=True + ): + row = [] + for col in columns: + value = entry.get(col, "") + if col == "created_at" and value: + # Format time to consistent format + try: + # Handle ISO 8601 format: "2025-09-26T11:16:40Z" + if "T" in value and "Z" in value: + dt = datetime.fromisoformat( + value.replace("Z", "+00:00") + ) + value = dt.strftime("%Y-%m-%d %H:%M") + # If already in desired format, keep it + elif len(value) == 16 and " " in value: + # Validate format + datetime.strptime(value, "%Y-%m-%d %H:%M") + else: + # Try to parse and reformat + dt = datetime.fromisoformat(value) + value = dt.strftime("%Y-%m-%d %H:%M") + except: + # If all parsing fails, keep original value + pass + elif col == "pr_number" and value: + value = f"#{value}" + row.append(str(value)) + writer.writerow(row) + + print(f" Generated table: {file_path} ({len(data_list)} records)") + + def _generate_chart( + self, csv_file_path: str, test_name: str, data_list: List[Dict], output_dir: str + ): + """Generate corresponding time series charts for tables""" + print( + f" Starting chart generation for {test_name} with 
{len(data_list)} data points" + ) + + if not data_list or len(data_list) < 2: + print( + f" Skipping chart for {test_name}: insufficient data ({len(data_list) if data_list else 0} records)" + ) + return + + try: + # Prepare data + timestamps = [] + metrics_data = {} + + # Get performance metric columns (exclude basic info columns) + base_columns = { + "created_at", + "run_number", + "pr_number", + "author", + "head_sha", + "url", + } + perf_metrics = [] + + for entry in data_list: + for key in entry.keys(): + if key not in base_columns and key not in perf_metrics: + perf_metrics.append(key) + + if not perf_metrics: + print( + f" Skipping chart for {test_name}: no performance metrics found" + ) + return + + print(f" Found performance metrics: {perf_metrics}") + + # Parse data + for entry in data_list: + # Parse time + try: + time_str = entry.get("created_at", "") + if time_str: + # Handle different time formats + timestamp = None + + # Try ISO 8601 format first (from GitHub API): "2025-09-26T11:16:40Z" + if "T" in time_str and "Z" in time_str: + try: + # Parse and convert to naive datetime (remove timezone info) + dt_with_tz = datetime.fromisoformat( + time_str.replace("Z", "+00:00") + ) + timestamp = dt_with_tz.replace(tzinfo=None) + except: + # Fallback for older Python versions + timestamp = datetime.strptime( + time_str, "%Y-%m-%dT%H:%M:%SZ" + ) + + # Try CSV format: "2025-09-26 08:43" + elif " " in time_str and len(time_str) == 16: + timestamp = datetime.strptime(time_str, "%Y-%m-%d %H:%M") + + # Try other common formats + else: + formats_to_try = [ + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d", + ] + for fmt in formats_to_try: + try: + timestamp = datetime.strptime(time_str, fmt) + break + except: + continue + + if timestamp: + timestamps.append(timestamp) + + # Collect metric data + for metric in perf_metrics: + if metric not in metrics_data: + metrics_data[metric] = [] + + value = entry.get(metric, "") + try: + numeric_value = float(value) + 
metrics_data[metric].append(numeric_value) + except: + metrics_data[metric].append(None) + else: + print( + f" Failed to parse timestamp format: '{time_str}'" + ) + + except Exception as e: + print(f" Error processing entry: {e}") + continue + + if not timestamps: + print( + f" Skipping chart for {test_name}: no valid timestamps found" + ) + return + + print(f" Parsed {len(timestamps)} timestamps") + + # Sort by time + sorted_data = sorted( + zip(timestamps, *[metrics_data[m] for m in perf_metrics]) + ) + timestamps = [item[0] for item in sorted_data] + for i, metric in enumerate(perf_metrics): + metrics_data[metric] = [item[i + 1] for item in sorted_data] + + # Create chart for each metric + for metric in perf_metrics: + values = metrics_data[metric] + valid_data = [ + (t, v) for t, v in zip(timestamps, values) if v is not None + ] + + if len(valid_data) < 2: + print( + f" Skipping chart for {test_name}_{metric}: insufficient valid data ({len(valid_data)} points)" + ) + continue + + valid_timestamps, valid_values = zip(*valid_data) + + # Create chart + plt.figure(figsize=(12, 6)) + plt.plot( + valid_timestamps, + valid_values, + marker="o", + linewidth=2, + markersize=4, + ) + + # Set title and labels + title = f"{test_name} - {self._format_metric_name(metric)}" + plt.title(title, fontsize=14, fontweight="bold") + plt.xlabel("Time", fontsize=12) + plt.ylabel(self._get_metric_unit(metric), fontsize=12) + + # Format x-axis + plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%m-%d %H:%M")) + plt.gca().xaxis.set_major_locator( + mdates.HourLocator(interval=max(1, len(valid_timestamps) // 10)) + ) + plt.xticks(rotation=45) + + # Add grid + plt.grid(True, alpha=0.3) + + # Adjust layout + plt.tight_layout() + + # Save chart + chart_filename = f"{test_name}_{metric}.png" + chart_path = os.path.join(output_dir, chart_filename) + plt.savefig(chart_path, dpi=300, bbox_inches="tight") + plt.close() + + print(f" Generated chart: {chart_path}") + + except Exception as 
e: + print(f" Failed to generate chart for {test_name}: {e}") + import traceback + + traceback.print_exc() + + def _format_metric_name(self, metric: str) -> str: + """Format metric name for display""" + name_mapping = { + "output_throughput_token_s": "Output Throughput", + "median_e2e_latency_ms": "Median E2E Latency", + "median_ttft_ms": "Median TTFT", + "accept_length": "Accept Length", + "input_throughput_token_s": "Input Throughput", + } + return name_mapping.get(metric, metric) + + def _get_metric_unit(self, metric: str) -> str: + """Get metric unit""" + if "throughput" in metric and "token_s" in metric: + return "token/s" + elif "latency" in metric and "ms" in metric: + return "ms" + elif "accept_length" in metric: + return "length" + else: + return "value" + + def generate_summary_report(self, test_data: Dict[str, List[Dict]]): + """Generate summary report""" + print("\n" + "=" * 60) + print("SGLang CI Performance Data Collection Report") + print("=" * 60) + + total_tests = len([test for test, data in test_data.items() if data]) + total_records = sum(len(data) for data in test_data.values()) + + print(f"\nOverall Statistics:") + print(f" Number of tests collected: {total_tests}") + print(f" Total records: {total_records}") + + print(f"\nStatistics by job:") + for job_name in self.performance_jobs: + job_tests = [test for test in test_data.keys() if test.startswith(job_name)] + job_records = sum(len(test_data[test]) for test in job_tests) + print(f" {job_name}: {len(job_tests)} tests, {job_records} records") + + for test in job_tests: + data = test_data[test] + test_short_name = test[len(job_name) + 1 :] + print(f" - {test_short_name}: {len(data)} records") + + print("\n" + "=" * 60) + + +def main(): + parser = argparse.ArgumentParser(description="SGLang CI Performance Analyzer") + parser.add_argument("--token", required=True, help="GitHub Personal Access Token") + parser.add_argument( + "--limit", + type=int, + default=100, + help="Number of runs to analyze 
(default: 100)", + ) + parser.add_argument( + "--output-dir", + default="performance_tables", + help="Output directory (default: performance_tables)", + ) + + args = parser.parse_args() + + # Create analyzer + analyzer = SGLangPerfAnalyzer(args.token) + + try: + # Get CI run data + runs = analyzer.get_recent_runs(args.limit) + + if not runs: + print("No CI run data found") + return + + # Collect performance data + test_data = analyzer.collect_performance_data(runs) + + # Generate performance tables + analyzer.generate_performance_tables(test_data, args.output_dir) + + # Generate summary report + analyzer.generate_summary_report(test_data) + + except Exception as e: + print(f"Error during analysis: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main()