From 6806c4e63ed5aa0c51e599260e2b43678baf7c71 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Mon, 13 Oct 2025 13:31:09 +0800 Subject: [PATCH] [CI monitor] Improve CI analyzer: fix job failure tracking and add CUDA-focused filtering (#11505) --- scripts/ci_monitor/README.md | 528 ------------------------------ scripts/ci_monitor/ci_analyzer.py | 102 ++++-- 2 files changed, 76 insertions(+), 554 deletions(-) diff --git a/scripts/ci_monitor/README.md b/scripts/ci_monitor/README.md index 94c09098d..4e87f7ab1 100644 --- a/scripts/ci_monitor/README.md +++ b/scripts/ci_monitor/README.md @@ -193,531 +193,3 @@ Use `--start-date` and `--end-date` parameters to get **ALL** CI runs within a s 4. Copy the generated token and use it as `YOUR_GITHUB_TOKEN` **Note**: Without the `repo` and `workflow` permissions, the tool will not be able to access CI run data and will return 404 errors. - -## Output - -### CI Analyzer Output - -#### Console Output -- Overall statistics (total runs, success rate, etc.) -- Category failure breakdown -- Most frequently failed jobs (Top 50) with direct CI links -- Failure pattern analysis - -#### JSON Export -Detailed analysis data including: -- Complete failure statistics -- Job failure counts -- Workflow failure counts -- Failure patterns -- Recent failure details - -### Performance Analyzer Output - -#### Console Output -- Performance data collection progress -- Summary statistics of collected tests and records -- Generated file locations (CSV tables and PNG charts) - -#### File Outputs -- **CSV Tables**: Structured performance data with columns: - - `created_at`: Timestamp of the CI run - - `run_number`: GitHub Actions run number - - `pr_number`: Pull request number (if applicable) - - `author`: Developer who triggered the run - - `head_sha`: Git commit SHA - - Performance metrics (varies by test type): - - `output_throughput_token_s`: Output throughput in tokens/second - - `median_e2e_latency_ms`: Median end-to-end latency in milliseconds - - `median_ttft_ms`: Median time-to-first-token in milliseconds - - `accept_length`: Accept length for speculative decoding tests - - `url`: Direct link to the GitHub Actions run - -- **PNG Charts**: Time-series visualization charts for each metric: - - X-axis: Time (MM-DD HH:MM format) - - Y-axis: Performance metric values - - File naming: `{test_name}_{metric_name}.png` - -#### Directory Structure -``` -performance_tables/ -├── performance-test-1-gpu-part-1_summary/ -│ ├── test_bs1_default.csv -│ ├── test_bs1_default_output_throughput_token_s.png -│ ├── test_online_latency_default.csv -│ ├── test_online_latency_default_median_e2e_latency_ms.png -│ └── ... -├── performance-test-1-gpu-part-2_summary/ -│ └── ... -└── performance-test-2-gpu_summary/ - └── ... -``` - -## Example Output - -### CI Analyzer Example - -``` - -============================================================ -SGLang CI Analysis Report -============================================================ - -Overall Statistics: - Total runs: 1000 - Successful: 392 - Failed: 187 - Cancelled: 181 - Skipped: 150 - Success rate: 39.2% - -Category Failure Statistics: - unit-test: 351 failures - accuracy: 84 failures - performance: 55 failures - deepep: 1 failures - -Most Frequently Failed Jobs (Top 50): - 1. unit-test-backend-1-gpu-amd-mi35x (linux-mi35x-gpu-1): 32 times - Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 - Recent Failures: - - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 - - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - 2. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 3): 31 times - Last Success: Run #28903 (2025-09-24 15:38) by gholmes829: https://github.com/sgl-project/sglang/actions/runs/17981905113 - Recent Failures: - - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 - - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - 3. accuracy-test-2-gpu-amd (linux-mi35x-gpu-2): 29 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 - - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - 4. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 5): 23 times - Last Success: Run #28906 (2025-09-24 15:43) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17982029749 - Recent Failures: - - Run #28958 (2025-09-25 01:51) (PR #1 by Yuhao Yao): https://github.com/sgl-project/sglang/actions/runs/17994520789 - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - 5. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 0): 23 times - Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 - Recent Failures: - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - 6. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 7): 18 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - 7. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 3): 17 times - Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 8. build-test (all): 16 times - Last Success: Run #15748 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435618 - Recent Failures: - - Run #15824 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892894 - - Run #15814 (2025-09-25 00:53) by diwei sun: https://github.com/sgl-project/sglang/actions/runs/17993616261 - - Run #15812 (2025-09-25 00:35) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993338746 - 9. bench-test-2-gpu-amd (linux-mi300-gpu-2): 15 times - Last Success: Run #28893 (2025-09-24 13:35) by Xiaoze Fan: https://github.com/sgl-project/sglang/actions/runs/17978451434 - Recent Failures: - - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 10. performance-test-1-gpu-part-2-amd (linux-mi300-gpu-1): 15 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 11. accuracy-test-1-gpu-amd (linux-mi325-gpu-1): 15 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 12. unit-test-backend-8-gpu-amd (linux-mi300-gpu-8): 15 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 13. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 1): 14 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 14. unit-test-backend-2-gpu-amd (linux-mi300-gpu-2): 14 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 15. performance-test-1-gpu-part-1-amd (linux-mi325-gpu-1): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 16. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 2): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 17. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 4): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 18. accuracy-test-2-gpu-amd (linux-mi325-gpu-2): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 19. mla-test-1-gpu-amd (linux-mi325-gpu-1): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 20. accuracy-test-2-gpu-amd (linux-mi300-gpu-2): 13 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 21. accuracy-test-1-gpu-amd (linux-mi300-gpu-1): 12 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 22. performance-test-1-gpu-part-2-amd (linux-mi325-gpu-1): 12 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 23. bench-test-2-gpu-amd (linux-mi325-gpu-2): 11 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28957 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860400 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 24. unit-test-sgl-kernel-amd (linux-mi325-gpu-1): 11 times - Last Success: Run #28891 (2025-09-24 12:44) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17977053408 - Recent Failures: - - Run #28956 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826732 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 25. performance-test-1-gpu-part-1-amd (linux-mi300-gpu-1): 11 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 26. unit-test-backend-1-gpu-amd (linux-mi300-gpu-1, 6): 11 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 27. unit-test-backend-2-gpu-amd (linux-mi325-gpu-2): 11 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 28. unit-test-backend-1-gpu (9): 10 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 - - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 - - Run #34581 (2025-09-24 19:49) by Yineng Zhang: https://github.com/sgl-project/sglang/actions/runs/17987860976 - 29. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 0): 10 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - 30. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 1): 10 times - Last Success: Run #28891 (2025-09-24 12:44) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17977053408 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 31. mla-test-1-gpu-amd (linux-mi300-gpu-1): 10 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 32. unit-test-backend-1-gpu (5): 9 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 - - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 - - Run #34560 (2025-09-24 17:01) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17983919007 - 33. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 2): 9 times - Last Success: Run #28906 (2025-09-24 15:43) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17982029749 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 34. unit-test-sgl-kernel-amd (linux-mi300-gpu-1): 9 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28952 (2025-09-24 23:57) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992751764 - - Run #28951 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619816 - 35. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 4): 7 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28955 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426068 - - Run #28953 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178855 - - Run #28949 (2025-09-24 23:44) (PR #10372 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992591372 - 36. unit-test-backend-1-gpu-amd (linux-mi325-gpu-1, 6): 7 times - Last Success: Run #28890 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435645 - Recent Failures: - - Run #28950 (2025-09-24 23:45) (PR #1 by Xiaoyu Zhang): https://github.com/sgl-project/sglang/actions/runs/17992598523 - - Run #28946 (2025-09-24 23:39) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992521547 - - Run #28936 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244192 - 37. vllm-dependency-test: 6 times - Last Success: Run #22949 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435651 - Recent Failures: - - Run #23028 (2025-09-25 02:39) by xuyongfei.xyf: https://github.com/sgl-project/sglang/actions/runs/17995251178 - - Run #23021 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892873 - - Run #22993 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244213 - 38. per-commit-4-ascend-npu: 6 times - Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 - Recent Failures: - - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 - - Run #10137 (2025-09-25 02:16) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17994892896 - - Run #10124 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619819 - 39. unit-test-backend-2-gpu (0): 6 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 - - Run #34593 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244227 - - Run #34576 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403452 - 40. unit-test-backend-1-gpu (4): 6 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 - - Run #34609 (2025-09-24 23:25) (PR #10853 by Yineng Zhang): https://github.com/sgl-project/sglang/actions/runs/17992311361 - - Run #34560 (2025-09-24 17:01) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17983919007 - 41. run-all-notebooks: 6 times - Last Success: Run #26939 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435610 - Recent Failures: - - Run #26988 (2025-09-24 23:25) (PR #10853 by Yineng Zhang): https://github.com/sgl-project/sglang/actions/runs/17992311396 - - Run #26982 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244193 - - Run #26973 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403458 - 42. per-commit-2-ascend-npu: 5 times - Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 - Recent Failures: - - Run #10135 (2025-09-25 02:16) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17994888152 - - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 - - Run #10085 (2025-09-24 16:42) by likesen: https://github.com/sgl-project/sglang/actions/runs/17983486537 - 43. unit-test-backend-8-gpu (0): 5 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 - - Run #34621 (2025-09-25 00:40) by Hubert Lu: https://github.com/sgl-project/sglang/actions/runs/17993426098 - - Run #34619 (2025-09-25 00:24) (PR #10372 by BBuf): https://github.com/sgl-project/sglang/actions/runs/17993178853 - 44. pytest-rust: 5 times - Last Success: Run #1761 (2025-09-24 16:39) by Chang Su: https://github.com/sgl-project/sglang/actions/runs/17983415401 - Recent Failures: - - Run #1770 (2025-09-24 21:02) by Simo Lin: https://github.com/sgl-project/sglang/actions/runs/17989538977 - - Run #1769 (2025-09-24 20:54) by Simo Lin: https://github.com/sgl-project/sglang/actions/runs/17989380799 - - Run #1767 (2025-09-24 20:36) by Ata Fatahi: https://github.com/sgl-project/sglang/actions/runs/17988964074 - 45. per-commit-16-ascend-a3: 4 times - Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 - Recent Failures: - - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 - - Run #10135 (2025-09-25 02:16) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17994888152 - - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 - 46. unit-test-backend-1-gpu (7): 4 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34624 (2025-09-25 01:10) (PR #10883 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993860412 - - Run #34573 (2025-09-24 18:45) by Tejesh Anand: https://github.com/sgl-project/sglang/actions/runs/17986382981 - - Run #34565 (2025-09-24 17:35) by YAMY: https://github.com/sgl-project/sglang/actions/runs/17984740528 - 47. unit-test-backend-2-gpu (1): 4 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34593 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244227 - - Run #34576 (2025-09-24 18:46) by eigen: https://github.com/sgl-project/sglang/actions/runs/17986403452 - - Run #34565 (2025-09-24 17:35) by YAMY: https://github.com/sgl-project/sglang/actions/runs/17984740528 - 48. per-commit-1-ascend-npu: 3 times - Last Success: Run #10065 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435703 - Recent Failures: - - Run #10138 (2025-09-25 02:17) by wangyi: https://github.com/sgl-project/sglang/actions/runs/17994908950 - - Run #10109 (2025-09-24 21:32) by xiafang: https://github.com/sgl-project/sglang/actions/runs/17990244207 - - Run #10085 (2025-09-24 16:42) by likesen: https://github.com/sgl-project/sglang/actions/runs/17983486537 - 49. unit-test-backend-1-gpu (1): 3 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34623 (2025-09-25 01:07) (PR #10495 by Lianmin Zheng): https://github.com/sgl-project/sglang/actions/runs/17993826751 - - Run #34554 (2025-09-24 16:29) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17983177051 - - Run #34548 (2025-09-24 15:38) by gholmes829: https://github.com/sgl-project/sglang/actions/runs/17981905143 - 50. unit-test-backend-1-gpu (8): 3 times - Last Success: Run #34533 (2025-09-24 12:20) by Yuhong Guo: https://github.com/sgl-project/sglang/actions/runs/17976435636 - Recent Failures: - - Run #34617 (2025-09-24 23:47) (PR #10881 by Chang Su): https://github.com/sgl-project/sglang/actions/runs/17992619818 - - Run #34581 (2025-09-24 19:49) by Yineng Zhang: https://github.com/sgl-project/sglang/actions/runs/17987860976 - - Run #34554 (2025-09-24 16:29) by Yuan Luo: https://github.com/sgl-project/sglang/actions/runs/17983177051 - -Failure Pattern Analysis: - GPU Related Failure: 223 times - Unit Test Failure: 190 times - Accuracy Test Failure: 84 times - Performance Test Failure: 54 times - Other: 34 times - Dependency Installation Failure: 19 times - Build Failure: 15 times -``` - -### Performance Analyzer Example - -``` -============================================================ -SGLang Performance Analysis Report -============================================================ - -Getting recent 100 PR Test runs... -Got 100 PR test runs... - -Collecting performance data from CI runs... -Processing run 34882 (2025-09-26 03:16)... - Found performance-test-1-gpu-part-1 job (success) - Found performance-test-1-gpu-part-2 job (success) - Found performance-test-2-gpu job (success) -Processing run 34881 (2025-09-26 02:45)... - Found performance-test-1-gpu-part-1 job (success) - Found performance-test-1-gpu-part-2 job (success) -... - -Performance data collection completed! - -Generating performance tables to directory: performance_tables - Generated table: performance_tables/performance-test-1-gpu-part-1_summary/test_bs1_default.csv - Generated chart: performance_tables/performance-test-1-gpu-part-1_summary/test_bs1_default_output_throughput_token_s.png - Generated table: performance_tables/performance-test-1-gpu-part-1_summary/test_online_latency_default.csv - Generated chart: performance_tables/performance-test-1-gpu-part-1_summary/test_online_latency_default_median_e2e_latency_ms.png - ... - -Performance tables and charts generation completed! - -============================================================ -Performance Analysis Summary -============================================================ - -Total PR Test runs processed: 100 -Total performance tests found: 15 -Total performance records collected: 1,247 - -Performance test breakdown: - performance-test-1-gpu-part-1: 7 tests, 423 records - performance-test-1-gpu-part-2: 5 tests, 387 records - performance-test-2-gpu: 6 tests, 437 records - -Generated files: - CSV tables: 18 files - PNG charts: 18 files - Output directory: performance_tables/ - -Analysis completed successfully! -``` - -## CI Job Categories - -The tool automatically categorizes CI jobs into: - -- **sgl-kernel**: Kernel-related tests (build, unit tests, MLA tests) -- **unit-test**: Unit tests (frontend, backend with different GPU counts) -- **performance**: Performance tests (latency, throughput benchmarks) -- **accuracy**: Accuracy tests (model evaluation) -- **deepep**: DeepEP-related tests -- **b200**: B200 hardware-specific tests - -## Failure Patterns - -The tool recognizes these failure patterns: - -- **Timeout**: Step execution timeout -- **Unit Test Failure**: Unit test execution failures -- **Performance Test Failure**: Performance benchmark failures -- **Accuracy Test Failure**: Model accuracy evaluation failures -- **Build Failure**: Compilation/build process failures -- **Dependency Installation Failure**: Package installation issues -- **GPU Related Failure**: GPU-specific test failures -- **Other**: Unclassified failures - -## Troubleshooting - -### Common Issues - -1. **404 Error**: - - Ensure the repository name is correct (`sgl-project/sglang`) - - **Most common cause**: Missing `repo` or `workflow` permissions in your GitHub token - - Go to [GitHub Settings > Personal Access Tokens](https://github.com/settings/tokens) and regenerate with correct permissions -2. **403 Error**: Check that your GitHub token has the correct permissions (`repo` and `workflow`) -3. **Rate Limiting**: The tool includes built-in delays to avoid API rate limits -4. **Network Issues**: Ensure stable internet connection - -### Debug Mode - -For detailed API call information, you can modify the code to include logging: - -```python -import logging -logging.basicConfig(level=logging.DEBUG) -``` - -## Automated Monitoring - -Both CI and Performance analyzers are available as a GitHub Actions workflow that runs automatically every 6 hours. The workflow: - -### CI Analysis -- Analyzes the last 1000 CI runs (configurable) -- Generates detailed failure reports -- Uploads analysis results as JSON artifacts - -### Performance Analysis -- Analyzes the last 1000 PR Test runs (configurable) -- Generates performance trend data and charts -- Uploads CSV tables and PNG charts as artifacts - -### Workflow Configuration - -The workflow is located at `.github/workflows/ci-monitor.yml` and uses the `GH_PAT_FOR_NIGHTLY_CI` secret for GitHub API access. - -### Manual Trigger - -You can manually trigger the workflow from the GitHub Actions tab with custom parameters: -- `limit`: Number of CI runs to analyze (default: 1000) - -### Artifacts Generated - -The workflow generates and uploads the following artifacts: -- **CI Analysis**: JSON files with failure analysis data -- **Performance Analysis**: - - CSV files with performance metrics organized by test type - - PNG charts showing performance trends over time - - Directory structure: `performance_tables_{timestamp}/` - -## License - -This tool follows the same license as the SGLang project. diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py index 20089f20d..1655c7e53 100755 --- a/scripts/ci_monitor/ci_analyzer.py +++ b/scripts/ci_monitor/ci_analyzer.py @@ -67,15 +67,14 @@ class SGLangCIAnalyzer: return all_runs[:limit] def analyze_ci_failures(self, runs: List[Dict]) -> Dict: - """Analyze CI failure patterns""" - print("Analyzing CI failure data...") + """Analyze CI failure patterns (CUDA jobs only)""" + print("Analyzing CI failure data (CUDA only)...") - # SGLang specific job categories + # SGLang specific job categories (CUDA only) job_categories = { - "sgl-kernel": [ + "build": [ + "build-test", "sgl-kernel-build-wheels", - "sgl-kernel-unit-test", - "sgl-kernel-mla-test", ], "unit-test": [ "unit-test-frontend", @@ -87,11 +86,35 @@ class SGLangCIAnalyzer: "performance": [ "performance-test-1-gpu-part-1", "performance-test-1-gpu-part-2", + "performance-test-1-gpu-part-3", "performance-test-2-gpu", ], - "accuracy": ["accuracy-test-1-gpu", "accuracy-test-2-gpu"], - "deepep": ["unit-test-deepep-4-gpu", "unit-test-deepep-8-gpu"], - "b200": ["unit-test-backend-4-gpu-b200"], + "accuracy": [ + "accuracy-test-1-gpu", + "accuracy-test-2-gpu", + ], + "mla-test": [ + "sgl-kernel-mla-test", + ], + "deepep": [ + "unit-test-deepep-4-gpu", + "unit-test-deepep-8-gpu", + ], + "per-commit": [ + "per-commit-8-gpu-h20", + ], + "nightly": [ + "nightly-test-perf-text-models", + "nightly-test-eval-text-models", + ], + "integration": [ + "run-all-notebooks", + "vllm-dependency-test", + "test-disaggregation", + ], + "b200": [ + "unit-test-backend-4-gpu-b200", + ], } stats = { @@ -141,13 +164,26 @@ class SGLangCIAnalyzer: job_name = job.get("name", "Unknown") job_conclusion = job.get("conclusion", "unknown") - # Filter out non-specific CI jobs - if job_name not in [ - "check-changes", - "pr-test-finish", - "pr-test-h20-finish", - "lint", - ]: + # Filter out non-specific CI jobs and non-CUDA jobs + # Skip meta jobs and AMD/NPU related jobs + if ( + job_name + not in [ + "check-changes", + "pr-test-finish", + "pr-test-h20-finish", + "pr-test-amd-finish", + "pr-test-b200-finish", + "lint", + "Set up job", + ] + and "-amd" not in job_name.lower() + and "mi300" not in job_name.lower() + and "mi325" not in job_name.lower() + and "gfx" not in job_name.lower() + and "-npu" not in job_name.lower() + and "ascend" not in job_name.lower() + ): # Record successful jobs (update last success) if job_conclusion == "success": stats["job_last_success"][job_name] = { @@ -158,7 +194,7 @@ class SGLangCIAnalyzer: } # Record failed jobs - elif job_conclusion == "failure" and run_status == "failure": + elif job_conclusion == "failure": stats["job_failures"][job_name] += 1 # Store failure link (keep only last 3 for each job) @@ -216,7 +252,7 @@ class SGLangCIAnalyzer: return pr_info def _analyze_failure_pattern(self, job: Dict, stats: Dict): - """Analyze failure patterns""" + """Analyze failure patterns (CUDA jobs only)""" job_name = job.get("name", "") steps = job.get("steps", []) @@ -224,19 +260,33 @@ class SGLangCIAnalyzer: if step.get("conclusion") == "failure": step_name = step.get("name", "") - # SGLang specific failure pattern recognition + # SGLang specific failure pattern recognition (CUDA only) if "timeout" in step_name.lower(): stats["failure_patterns"]["Timeout"] += 1 - elif "test" in step_name.lower() and "unit" in job_name.lower(): + elif "build" in step_name.lower() or "build" in job_name.lower(): + stats["failure_patterns"]["Build Failure"] += 1 + elif "install" in step_name.lower() or "dependency" in job_name.lower(): + stats["failure_patterns"]["Dependency Installation Failure"] += 1 + elif "unit" in job_name.lower() or "unit-test" in job_name.lower(): stats["failure_patterns"]["Unit Test Failure"] += 1 - elif "performance" in job_name.lower(): + elif "performance" in job_name.lower() or "perf" in job_name.lower(): stats["failure_patterns"]["Performance Test Failure"] += 1 elif "accuracy" in job_name.lower(): stats["failure_patterns"]["Accuracy Test Failure"] += 1 - elif "build" in step_name.lower(): - stats["failure_patterns"]["Build Failure"] += 1 - elif "install" in step_name.lower(): - stats["failure_patterns"]["Dependency Installation Failure"] += 1 + elif "mla" in job_name.lower(): + stats["failure_patterns"]["MLA Test Failure"] += 1 + elif "deepep" in job_name.lower(): + stats["failure_patterns"]["DeepEP Test Failure"] += 1 + elif "nightly" in job_name.lower(): + stats["failure_patterns"]["Nightly Test Failure"] += 1 + elif "notebook" in job_name.lower(): + stats["failure_patterns"]["Notebook Test Failure"] += 1 + elif "disaggregation" in job_name.lower(): + stats["failure_patterns"]["Disaggregation Test Failure"] += 1 + elif "h20" in job_name.lower() or "h200" in job_name.lower(): + stats["failure_patterns"]["H20/H200 GPU Failure"] += 1 + elif "b200" in job_name.lower(): + stats["failure_patterns"]["B200 GPU Failure"] += 1 elif "gpu" in job_name.lower(): stats["failure_patterns"]["GPU Related Failure"] += 1 else: @@ -245,7 +295,7 @@ class SGLangCIAnalyzer: def generate_report(self, stats: Dict): """Generate CI analysis report""" print("\n" + "=" * 60) - print("SGLang CI Analysis Report") + print("SGLang CI Analysis Report (CUDA Only)") print("=" * 60) # Overall statistics