From c1f39013b78f1c6fc6d03d60faa1c8210e2a3d6f Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:16:29 +0800 Subject: [PATCH] [ci feature] add ci monitor (#10872) --- scripts/ci_monitor/README.md | 479 ++++++++++++++++++++++++++++++ scripts/ci_monitor/ci_analyzer.py | 389 ++++++++++++++++++++++++ scripts/ci_monitor/example.sh | 14 + 3 files changed, 882 insertions(+) create mode 100644 scripts/ci_monitor/README.md create mode 100755 scripts/ci_monitor/ci_analyzer.py create mode 100755 scripts/ci_monitor/example.sh diff --git a/scripts/ci_monitor/README.md b/scripts/ci_monitor/README.md new file mode 100644 index 000000000..4bd94d8b1 --- /dev/null +++ b/scripts/ci_monitor/README.md @@ -0,0 +1,479 @@ +# SGLang CI Monitor + +A simple tool to analyze CI failures for the SGLang project. This tool fetches recent CI run data from GitHub Actions and provides detailed analysis of failure patterns. + +## Features + +- **Simple Analysis**: Analyze recent CI runs and identify failure patterns +- **Category Classification**: Automatically categorize failures by type (unit-test, performance, etc.) +- **Pattern Recognition**: Identify common failure patterns (timeouts, build failures, etc.) 
+- **CI Links**: Direct links to recent failed CI runs for detailed investigation +- **Last Success Tracking**: Track the last successful run for each failed job with PR information +- **JSON Export**: Export detailed analysis data to JSON format +- **Automated Monitoring**: GitHub Actions workflow for continuous CI monitoring + +## Installation + +No additional dependencies are required beyond the Python standard library and `requests`: + +```bash +pip install requests +``` + +## Usage + +### Basic Usage + +```bash +# Replace YOUR_GITHUB_TOKEN with your actual token from https://github.com/settings/tokens +python ci_analyzer.py --token YOUR_GITHUB_TOKEN +``` + +### Advanced Usage + +```bash +# Analyze last 1000 runs +python ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 1000 + +# Custom output file +python ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 500 --output my_analysis.json +``` + +**Important**: Make sure your GitHub token has the `repo` and `workflow` permissions; otherwise, you'll get 404 errors. + +## Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--token` | Required | GitHub Personal Access Token | +| `--limit` | 100 | Number of CI runs to analyze | +| `--output` | ci_analysis.json | Output JSON file for detailed data | + +## Getting GitHub Token + +1. Go to [GitHub Settings > Personal Access Tokens](https://github.com/settings/tokens) +2. Click "Generate new token" > "Generate new token (classic)" +3. **Important**: Select the following permissions: + - `repo` (Full control of private repositories) - **Required for accessing repository data** + - `workflow` (Update GitHub Action workflows) - **Required for reading CI/CD data** +4. Copy the generated token and use it as `YOUR_GITHUB_TOKEN` + +**Note**: Without the `repo` and `workflow` permissions, the tool will not be able to access CI run data and will return 404 errors. 
+ +## Output + +The tool provides: + +### Console Output +- Overall statistics (total runs, success rate, etc.) +- Category failure breakdown +- Most frequently failed jobs (Top 50) with direct CI links +- Failure pattern analysis + +### JSON Export +Detailed analysis data including: +- Complete failure statistics +- Job failure counts +- Workflow failure counts +- Failure patterns +- Recent failure details + +## Example Output + +``` + +============================================================ +SGLang CI Analysis Report +============================================================ + +Overall Statistics: + Total runs: 1000 + Successful: 392 + Failed: 187 + Cancelled: 181 + Skipped: 150 + Success rate: 39.2% + +Category Failure Statistics: + unit-test: 351 failures + accuracy: 84 failures + performance: 55 failures + deepep: 1 failures + +Most Frequently Failed Jobs (Top 50): + 1. unit-test-backend-1-gpu-amd-mi35x (linux-mi35x-gpu-1): 32 times + Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 + Recent Failures: + - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 + - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + 2. 
unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 3): 31 times + Last Success: Run #28903 (2025-09-24 15:38) by gholmes829: https://github.com/sgl-project/sglang/actions/runs/17981905113 + Recent Failures: + - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 + - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + 3. accuracy-test-2-gpu-amd (linux-mi35x-gpu-2): 29 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 + - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + 4. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 5): 23 times + Last Success: Run #28906 (2025-09-24 15:43) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17982029749 + Recent Failures: + - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + 5. 
unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 0): 23 times + Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 + Recent Failures: + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + 6. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 7): 18 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + 7. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 3): 17 times + Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 8. 
build-test (all): 16 times + Last Success: Run #15748 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435618 + Recent Failures: + - Run #15824 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892894 + - Run #15814 (2025-09-25 00:53) by diwei sun: https://github.com/sgl-project/sglang/actions/runs/17993616261 + - Run #15812 (2025-09-25 00:35) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993338746 + 9. bench-test-2-gpu-amd (linux-mi300-gpu-2): 15 times + Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 + Recent Failures: + - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 10. performance-test-1-gpu-part-2-amd (linux-mi300-gpu-1): 15 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 11. 
accuracy-test-1-gpu-amd (linux-mi325-gpu-1): 15 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 12. unit-test-backend-8-gpu-amd (linux-mi300-gpu-8): 15 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 13. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 1): 14 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 14. 
unit-test-backend-2-gpu-amd (linux-mi300-gpu-2): 14 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 15. performance-test-1-gpu-part-1-amd (linux-mi325-gpu-1): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 16. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 2): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 17. 
unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 4): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 18. accuracy-test-2-gpu-amd (linux-mi325-gpu-2): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 19. mla-test-1-gpu-amd (linux-mi325-gpu-1): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 20. 
accuracy-test-2-gpu-amd (linux-mi300-gpu-2): 13 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 21. accuracy-test-1-gpu-amd (linux-mi300-gpu-1): 12 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 22. performance-test-1-gpu-part-2-amd (linux-mi325-gpu-1): 12 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 23. 
bench-test-2-gpu-amd (linux-mi325-gpu-2): 11 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 24. unit-test-sgl-kernel-amd (linux-mi325-gpu-1): 11 times + Last Success: Run #28891 (2025-09-24 12:44) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17977053408 + Recent Failures: + - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 25. performance-test-1-gpu-part-1-amd (linux-mi300-gpu-1): 11 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 26. 
unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 6): 11 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 27. unit-test-backend-2-gpu-amd (linux-mi325-gpu-2): 11 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 28. unit-test-backend-1-gpu (9): 10 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 + - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 + - Run #34581 (2025-09-24 19:49) by Yineng Zhang: https://github.com/sgl-project/sglang/actions/runs/17987860976 + 29. 
unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 0): 10 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + 30. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 1): 10 times + Last Success: Run #28891 (2025-09-24 12:44) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17977053408 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 31. mla-test-1-gpu-amd (linux-mi300-gpu-1): 10 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 32. 
unit-test-backend-1-gpu (5): 9 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 + - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 + - Run #34560 (2025-09-24 17:01) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17983919007 + 33. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 2): 9 times + Last Success: Run #28906 (2025-09-24 15:43) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17982029749 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 34. unit-test-sgl-kernel-amd (linux-mi300-gpu-1): 9 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 + - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 + 35. 
unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 4): 7 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 + - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 + - Run #28949 (2025-09-24 23:44) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992591372 + 36. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 6): 7 times + Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 + Recent Failures: + - Run #28950 (2025-09-24 23:45) (PR #1 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992598523 + - Run #28946 (2025-09-24 23:39) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992521547 + - Run #28936 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244192 + 37. vllm-dependency-test: 6 times + Last Success: Run #22949 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435651 + Recent Failures: + - Run #23028 (2025-09-25 02:39) by xuyongfei.xyf: https://github.com/sgl-project/sglang/actions/runs/17995251178 + - Run #23021 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892873 + - Run #22993 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244213 + 38. 
per-commit-4-ascend-npu: 6 times + Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 + Recent Failures: + - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 + - Run #10137 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892896 + - Run #10124 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619819 + 39. unit-test-backend-2-gpu (0): 6 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 + - Run #34593 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244227 + - Run #34576 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403452 + 40. unit-test-backend-1-gpu (4): 6 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 + - Run #34609 (2025-09-24 23:25) (PR #10853 by Yineng Zhang): https://github.com/sgl-project/sglang/actions/runs/17992311361 + - Run #34560 (2025-09-24 17:01) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17983919007 + 41. 
run-all-notebooks: 6 times + Last Success: Run #26939 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435610 + Recent Failures: + - Run #26988 (2025-09-24 23:25) (PR #10853 by Yineng Zhang): https://github.com/sgl-project/sglang/actions/runs/17992311396 + - Run #26982 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244193 + - Run #26973 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403458 + 42. per-commit-2-ascend-npu: 5 times + Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 + Recent Failures: + - Run #10135 (2025-09-25 02:16) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17994888152 + - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 + - Run #10085 (2025-09-24 16:42) by likesen: https://github.com/sgl-project/sglang/actions/runs/17983486537 + 43. unit-test-backend-8-gpu (0): 5 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 + - Run #34621 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426098 + - Run #34619 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178853 + 44. 
pytest-rust: 5 times + Last Success: Run #1761 (2025-09-24 16:39) by Chang Su: https://github.com/sgl-project/sglang/actions/runs/17983415401 + Recent Failures: + - Run #1770 (2025-09-24 21:02) by Simo Lin: https://github.com/sgl-project/sglang/actions/runs/17989538977 + - Run #1769 (2025-09-24 20:54) by Simo Lin: https://github.com/sgl-project/sglang/actions/runs/17989380799 + - Run #1767 (2025-09-24 20:36) by Ata Fatahi: https://github.com/sgl-project/sglang/actions/runs/17988964074 + 45. per-commit-16-ascend-a3: 4 times + Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 + Recent Failures: + - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 + - Run #10135 (2025-09-25 02:16) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17994888152 + - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 + 46. unit-test-backend-1-gpu (7): 4 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 + - Run #34573 (2025-09-24 18:45) by Tejesh Anand: https://github.com/sgl-project/sglang/actions/runs/17986382981 + - Run #34565 (2025-09-24 17:35) by YAMY: https://github.com/sgl-project/sglang/actions/runs/17984740528 + 47. 
unit-test-backend-2-gpu (1): 4 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34593 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244227 + - Run #34576 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403452 + - Run #34565 (2025-09-24 17:35) by YAMY: https://github.com/sgl-project/sglang/actions/runs/17984740528 + 48. per-commit-1-ascend-npu: 3 times + Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 + Recent Failures: + - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 + - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 + - Run #10085 (2025-09-24 16:42) by likesen: https://github.com/sgl-project/sglang/actions/runs/17983486537 + 49. unit-test-backend-1-gpu (1): 3 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 + - Run #34554 (2025-09-24 16:29) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17983177051 + - Run #34548 (2025-09-24 15:38) by gholmes829: https://github.com/sgl-project/sglang/actions/runs/17981905143 + 50. 
unit-test-backend-1-gpu (8): 3 times + Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 + Recent Failures: + - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 + - Run #34581 (2025-09-24 19:49) by Yineng Zhang: https://github.com/sgl-project/sglang/actions/runs/17987860976 + - Run #34554 (2025-09-24 16:29) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17983177051 + +Failure Pattern Analysis: + GPU Related Failure: 223 times + Unit Test Failure: 190 times + Accuracy Test Failure: 84 times + Performance Test Failure: 54 times + Other: 34 times + Dependency Installation Failure: 19 times + Build Failure: 15 times +``` + +## CI Job Categories + +The tool automatically categorizes CI jobs into: + +- **sgl-kernel**: Kernel-related tests (build, unit tests, MLA tests) +- **unit-test**: Unit tests (frontend, backend with different GPU counts) +- **performance**: Performance tests (latency, throughput benchmarks) +- **accuracy**: Accuracy tests (model evaluation) +- **deepep**: DeepEP-related tests +- **b200**: B200 hardware-specific tests + +## Failure Patterns + +The tool recognizes these failure patterns: + +- **Timeout**: Step execution timeout +- **Unit Test Failure**: Unit test execution failures +- **Performance Test Failure**: Performance benchmark failures +- **Accuracy Test Failure**: Model accuracy evaluation failures +- **Build Failure**: Compilation/build process failures +- **Dependency Installation Failure**: Package installation issues +- **GPU Related Failure**: GPU-specific test failures +- **Other**: Unclassified failures + +## Troubleshooting + +### Common Issues + +1. 
**404 Error**: + - Ensure the repository name is correct (`sgl-project/sglang`) + - **Most common cause**: Missing `repo` or `workflow` permissions in your GitHub token + - Go to [GitHub Settings > Personal Access Tokens](https://github.com/settings/tokens) and regenerate with correct permissions +2. **403 Error**: Check that your GitHub token has the correct permissions (`repo` and `workflow`) +3. **Rate Limiting**: The tool includes built-in delays to avoid API rate limits +4. **Network Issues**: Ensure stable internet connection + +### Debug Mode + +For detailed API call information, you can modify the code to include logging: + +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +## Automated Monitoring + +The CI monitor is also available as a GitHub Actions workflow that runs automatically every 6 hours. The workflow: + +- Analyzes the last 500 CI runs +- Generates detailed reports +- Uploads analysis results as artifacts + +### Workflow Configuration + +The workflow is located at `.github/workflows/ci-monitor.yml` and uses the `GH_PAT_FOR_NIGHTLY_CI` secret for GitHub API access. + +### Manual Trigger + +You can manually trigger the workflow from the GitHub Actions tab with custom parameters: +- `limit`: Number of CI runs to analyze (default: 500) + +## License + +This tool follows the same license as the SGLang project. 
class SGLangCIAnalyzer:
    """Fetch recent GitHub Actions runs for SGLang and summarize their failures.

    All HTTP traffic goes through ``self.session``, a ``requests.Session``
    that carries the authentication headers. The analyzer aggregates per-job
    failure statistics, prints a plain-text report and can dump the raw
    statistics as JSON.
    """

    # Page size used for list endpoints (the GitHub REST API caps it at 100).
    API_PAGE_SIZE = 100
    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Pause in seconds between consecutive API calls, to avoid rate limits.
    REQUEST_DELAY = 0.1
    # How many failure links are kept per job.
    MAX_FAILURE_LINKS = 3

    # Bookkeeping jobs that say nothing about a specific test suite.
    IGNORED_JOBS = frozenset(
        {"check-changes", "pr-test-finish", "pr-test-h20-finish", "lint"}
    )

    # SGLang specific job categories: category -> job-name substrings.
    JOB_CATEGORIES = {
        "sgl-kernel": [
            "sgl-kernel-build-wheels",
            "sgl-kernel-unit-test",
            "sgl-kernel-mla-test",
        ],
        "unit-test": [
            "unit-test-frontend",
            "unit-test-backend-1-gpu",
            "unit-test-backend-2-gpu",
            "unit-test-backend-4-gpu",
            "unit-test-backend-8-gpu",
        ],
        "performance": [
            "performance-test-1-gpu-part-1",
            "performance-test-1-gpu-part-2",
            "performance-test-2-gpu",
        ],
        "accuracy": ["accuracy-test-1-gpu", "accuracy-test-2-gpu"],
        "deepep": ["unit-test-deepep-4-gpu", "unit-test-deepep-8-gpu"],
        "b200": ["unit-test-backend-4-gpu-b200"],
    }

    def __init__(self, token: str):
        """Create an analyzer authenticated with a GitHub personal access token."""
        self.token = token
        self.base_url = "https://api.github.com"
        self.repo = "sgl-project/sglang"
        self.headers = {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "SGLang-CI-Analyzer/1.0",
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get_recent_runs(self, limit: int = 100) -> List[Dict]:
        """Return up to ``limit`` workflow runs, in the order the API lists them.

        Fetching stops early (returning what was collected so far) when the
        API runs out of data or a request fails.
        """
        print(f"Fetching {limit} recent CI runs...")

        all_runs: List[Dict] = []
        if limit <= 0:
            return all_runs

        url = f"{self.base_url}/repos/{self.repo}/actions/runs"
        # The page size must stay constant across requests: with page-based
        # pagination, shrinking ``per_page`` on a later page moves the page
        # boundary and returns items that were already fetched.
        page_size = min(self.API_PAGE_SIZE, limit)
        page = 1

        while len(all_runs) < limit:
            params = {"per_page": page_size, "page": page}
            try:
                response = self.session.get(
                    url, params=params, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                batch = response.json().get("workflow_runs") or []
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Error fetching CI data: {e}")
                break

            if not batch:
                break

            all_runs.extend(batch)
            print(f"Fetched {min(len(all_runs), limit)} runs so far...")

            # A short page means the API has nothing more to offer.
            if len(batch) < page_size or len(all_runs) >= limit:
                break

            page += 1
            time.sleep(self.REQUEST_DELAY)  # Avoid API rate limits

        return all_runs[:limit]

    def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
        """Aggregate run and job outcomes into a statistics dictionary.

        For every run the job list is fetched from the API. The result holds
        run counters, per-job and per-category failure counts, failure
        patterns, up to ``MAX_FAILURE_LINKS`` failure links per job and the
        most recent successful run of each job.
        """
        print("Analyzing CI failure data...")

        stats = {
            "total_runs": len(runs),
            "failed_runs": 0,
            "successful_runs": 0,
            "cancelled_runs": 0,
            "skipped_runs": 0,
            "category_failures": defaultdict(int),
            "job_failures": defaultdict(int),
            "failure_patterns": defaultdict(int),
            # Recent failure links for each job
            "job_failure_links": defaultdict(list),
            # Most recent successful run for each job
            "job_last_success": {},
        }
        counter_for_status = {
            "failure": "failed_runs",
            "success": "successful_runs",
            "cancelled": "cancelled_runs",
            "skipped": "skipped_runs",
        }

        total_runs = len(runs)
        # Show progress every 10% or every 50 runs, whichever is smaller
        progress_step = max(1, min(50, total_runs // 10))

        for i, run in enumerate(runs, 1):
            if i % progress_step == 0 or i == total_runs:
                progress = (i / total_runs) * 100
                print(f"Progress: {i}/{total_runs} ({progress:.1f}%)")

            run_status = run.get("conclusion", "unknown")
            counter_key = counter_for_status.get(run_status)
            if counter_key is not None:
                stats[counter_key] += 1

            run_id = run.get("id")
            run_record = {
                "url": f"https://github.com/{self.repo}/actions/runs/{run_id}",
                "run_number": run.get("run_number"),
                "created_at": run.get("created_at"),
                "pr_info": self._get_pr_info(run),
            }

            # Job details are needed for every run, not only failed ones,
            # because successful jobs feed the "last success" tracking.
            for job in self._get_job_details(run_id):
                self._record_job(job, run_status, run_record, stats)

            time.sleep(self.REQUEST_DELAY)  # Avoid API rate limits

        return stats

    def _record_job(self, job: Dict, run_status, run_record: Dict, stats: Dict):
        """Fold one job result of one run into ``stats``."""
        job_name = job.get("name", "Unknown")
        if job_name in self.IGNORED_JOBS:
            return

        job_conclusion = job.get("conclusion", "unknown")

        if job_conclusion == "success":
            # Keep the newest success by timestamp, so the result does not
            # depend on the order in which runs are processed.
            previous = stats["job_last_success"].get(job_name)
            if previous is None or self._is_newer(run_record, previous):
                stats["job_last_success"][job_name] = dict(run_record)

        elif job_conclusion == "failure" and run_status == "failure":
            stats["job_failures"][job_name] += 1

            links = stats["job_failure_links"][job_name]
            if len(links) < self.MAX_FAILURE_LINKS:
                links.append(dict(run_record))

            category = self._categorize_job(job_name)
            if category is not None:
                stats["category_failures"][category] += 1

            self._analyze_failure_pattern(job, stats)

    @staticmethod
    def _is_newer(candidate: Dict, current: Dict) -> bool:
        """Tell whether ``candidate`` was created strictly after ``current``.

        Timestamps are compared as strings, which is only correct while both
        use the same fixed-width ISO 8601 UTC format.
        """
        return (candidate.get("created_at") or "") > (current.get("created_at") or "")

    @classmethod
    def _categorize_job(cls, job_name: str):
        """Return the category whose longest pattern occurs in ``job_name``.

        Longest match wins so that a specific pattern such as
        ``unit-test-backend-4-gpu-b200`` is not shadowed by its prefix
        ``unit-test-backend-4-gpu``. Returns ``None`` when nothing matches.
        """
        best_category = None
        best_length = 0
        for category, patterns in cls.JOB_CATEGORIES.items():
            for pattern in patterns:
                if pattern in job_name and len(pattern) > best_length:
                    best_category, best_length = category, len(pattern)
        return best_category

    def _get_job_details(self, run_id: int) -> List[Dict]:
        """Return all jobs of a run, following pagination.

        This is best effort: on a request or decoding error a warning is
        printed and the jobs collected so far (possibly none) are returned.
        """
        url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs"
        jobs: List[Dict] = []
        page = 1

        while True:
            params = {"per_page": self.API_PAGE_SIZE, "page": page}
            try:
                response = self.session.get(
                    url, params=params, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                batch = response.json().get("jobs") or []
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Warning: could not fetch jobs for run {run_id}: {e}")
                break

            jobs.extend(batch)
            if len(batch) < self.API_PAGE_SIZE:
                break
            page += 1

        return jobs

    def _get_pr_info(self, run: Dict) -> Dict:
        """Extract author, commit and pull-request number from a run payload.

        Fields that are missing or null fall back to ``"Unknown"``, ``""`` or
        ``None`` respectively.
        """
        # ``dict.get(key, {})`` still yields None when the key is present
        # with a null value, so normalise with ``or {}`` instead.
        head_commit = run.get("head_commit") or {}
        author = head_commit.get("author") or {}

        pr_info = {
            "pr_number": None,
            "author": author.get("name") or "Unknown",
            "head_sha": run.get("head_sha", ""),
            "head_branch": run.get("head_branch", ""),
        }

        # Try to extract PR number from pull_requests
        pull_requests = run.get("pull_requests") or []
        if pull_requests:
            pr_info["pr_number"] = pull_requests[0].get("number")

        return pr_info

    def _analyze_failure_pattern(self, job: Dict, stats: Dict):
        """Classify every failed step of ``job`` into one failure pattern.

        The first matching rule wins, so the order of the checks matters.
        """
        job_name = (job.get("name") or "").lower()

        for step in job.get("steps") or []:
            if step.get("conclusion") != "failure":
                continue

            step_name = (step.get("name") or "").lower()

            # SGLang specific failure pattern recognition
            if "timeout" in step_name:
                pattern = "Timeout"
            elif "test" in step_name and "unit" in job_name:
                pattern = "Unit Test Failure"
            elif "performance" in job_name:
                pattern = "Performance Test Failure"
            elif "accuracy" in job_name:
                pattern = "Accuracy Test Failure"
            elif "build" in step_name:
                pattern = "Build Failure"
            elif "install" in step_name:
                pattern = "Dependency Installation Failure"
            elif "gpu" in job_name:
                pattern = "GPU Related Failure"
            else:
                pattern = "Other"

            stats["failure_patterns"][pattern] += 1

    @staticmethod
    def _by_count(counts: Dict) -> List:
        """Return ``(key, count)`` pairs sorted by descending count."""
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)

    @staticmethod
    def _describe_run(record: Dict) -> str:
        """Format one stored run record as ``Run #N (date) by author: url``."""
        pr_info = record.get("pr_info") or {}
        author = pr_info.get("author", "Unknown")
        if pr_info.get("pr_number"):
            who = f" (PR #{pr_info['pr_number']} by {author})"
        else:
            who = f" by {author}"

        created_at = record.get("created_at")
        if created_at:
            # fromisoformat() on older Python versions rejects a trailing "Z".
            when = datetime.fromisoformat(
                created_at.replace("Z", "+00:00")
            ).strftime("%Y-%m-%d %H:%M")
        else:
            when = "unknown time"

        return f"Run #{record['run_number']} ({when}){who}: {record['url']}"

    def generate_report(self, stats: Dict):
        """Print the human-readable analysis report to stdout."""
        separator = "=" * 60
        print("\n" + separator)
        print("SGLang CI Analysis Report")
        print(separator)

        # Overall statistics
        total = stats["total_runs"]
        success = stats["successful_runs"]
        success_rate = (success / total * 100) if total > 0 else 0

        print("\nOverall Statistics:")
        print(f" Total runs: {total}")
        print(f" Successful: {success}")
        print(f" Failed: {stats['failed_runs']}")
        print(f" Cancelled: {stats['cancelled_runs']}")
        print(f" Skipped: {stats['skipped_runs']}")
        print(f" Success rate: {success_rate:.1f}%")

        # Category failure statistics
        if stats["category_failures"]:
            print("\nCategory Failure Statistics:")
            for category, count in self._by_count(stats["category_failures"]):
                print(f" {category}: {count} failures")

        # Most frequently failed jobs with links
        if stats["job_failures"]:
            print("\nMost Frequently Failed Jobs (Top 50):")
            ranked = self._by_count(stats["job_failures"])[:50]
            for rank, (job, count) in enumerate(ranked, 1):
                print(f" {rank:2d}. {job}: {count} times")

                last_success = stats["job_last_success"].get(job)
                if last_success:
                    print(f" Last Success: {self._describe_run(last_success)}")

                failure_links = stats["job_failure_links"].get(job)
                if failure_links:
                    print(" Recent Failures:")
                    for link_info in failure_links:
                        print(f" - {self._describe_run(link_info)}")

        # Failure pattern analysis
        if stats["failure_patterns"]:
            print("\nFailure Pattern Analysis:")
            for pattern, count in self._by_count(stats["failure_patterns"]):
                print(f" {pattern}: {count} times")

        print("\n" + separator)

    def save_detailed_report(self, stats: Dict, output_file: str = "ci_analysis.json"):
        """Write ``stats`` as indented UTF-8 JSON to ``output_file``."""
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)
        print(f"\nDetailed report saved to: {output_file}")
def main(argv=None):
    """Command-line entry point: fetch runs, analyze them, print and save the report.

    Args:
        argv: Optional list of command-line arguments without the program
            name. ``None`` (the default) makes argparse read ``sys.argv[1:]``.

    Exits with status 2 on invalid arguments and status 1 when the analysis
    raises an unexpected error.
    """
    parser = argparse.ArgumentParser(description="SGLang CI Analyzer")
    parser.add_argument(
        "--token",
        # Allow the token to come from the environment so it does not have
        # to appear on the command line (and in shell history).
        default=os.environ.get("GITHUB_TOKEN"),
        help="GitHub Personal Access Token (default: $GITHUB_TOKEN)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Number of runs to analyze (default: 100)",
    )
    parser.add_argument(
        "--output",
        default="ci_analysis.json",
        help="Output file (default: ci_analysis.json)",
    )

    args = parser.parse_args(argv)

    if args.limit <= 0:
        parser.error("--limit must be a positive integer")
    if not args.token:
        parser.error("a GitHub token is required (use --token or set GITHUB_TOKEN)")

    # Create analyzer
    analyzer = SGLangCIAnalyzer(args.token)

    try:
        # Get CI run data
        runs = analyzer.get_recent_runs(args.limit)

        if not runs:
            print("No CI run data found")
            return

        # Analyze failures, print the report, then persist the raw data
        stats = analyzer.analyze_ci_failures(runs)
        analyzer.generate_report(stats)
        analyzer.save_detailed_report(stats, args.output)

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error during analysis: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()