init

transformers/utils/process_bad_commit_report.py (new file, 130 lines)
@@ -0,0 +1,130 @@
"""An internal script to process `new_failures_with_bad_commit.json` produced by `utils/check_bad_commit.py`.

This is used by `.github/workflows/check_failed_model_tests.yml` to produce a Slack report of the following form:

```
<{url}|New failed tests>
{
    "GH_ydshieh": {
        "vit": 1
    }
}
```
"""

import json
import os
from collections import Counter
from copy import deepcopy

from get_previous_daily_ci import get_last_daily_ci_run
from huggingface_hub import HfApi


if __name__ == "__main__":
    api = HfApi()

    job_name = os.environ.get("JOB_NAME")

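    # The layout of `new_failures_with_bad_commit.json` (inferred from how it is consumed below) is roughly
    # {model: {device: [{"author": ..., "merged_by": ..., ...}, ...]}}, where `device` is e.g. `single-gpu`
    # or `multi-gpu`.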
    with open("new_failures_with_bad_commit.json") as fp:
        data = json.load(fp)

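    # `job_links.json` is assumed to map either model names or the job name to per-device job URLs,
    # e.g. {"vit": {"single": <url>, "multi": <url>}}; see the `key` / `device.replace("-gpu", "")` lookup below.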
    with open(f"ci_results_{job_name}/job_links.json") as fp:
        job_links = json.load(fp)

    # TODO: extend
    team_members = [
        "ArthurZucker",
        "Cyrilvallez",
        "LysandreJik",
        "MekkCyber",
        "Rocketknight1",
        "SunMarc",
        "ebezzam",
        "eustlb",
        "gante",
        "itazap",
        "ivarflakstad",
        "molbap",
        "muellerzr",
        "remi-or",
        "stevhliu",
        "vasqu",
        "ydshieh",
        "zucchini-nlp",
    ]

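    # Build `new_data`: {author: {model: number of new failed tests}}. A failure is attributed to the commit
    # author if they are a team member, otherwise to the person who merged the commit (`merged_by`). This is
    # the compact form shown in the module docstring, e.g. {"ydshieh": {"vit": 1}} (before the `GH_` prefix).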
    # Counting the number of failures grouped by authors
    new_data = {}
    for model, model_result in data.items():
        for device, failed_tests in model_result.items():
            for failed_test in failed_tests:
                author = failed_test["author"]

                if author not in team_members:
                    author = failed_test["merged_by"]

                if author not in new_data:
                    new_data[author] = Counter()
                new_data[author].update([model])
    for author in new_data:
        new_data[author] = dict(new_data[author])

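    # Build `new_data_full`: a per-author deep copy of `data` keeping only the failures attributed to that
    # author, with the corresponding job link attached to each entry; empty devices and models are dropped.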
    # Group by author
    new_data_full = {author: deepcopy(data) for author in new_data}
    for author, _data in new_data_full.items():
        for model, model_result in _data.items():
            for device, failed_tests in model_result.items():
                # Prepare the job link and add it to each entry of new failed test information.
                # `single-gpu` / `multi-gpu` need to be changed to `single` / `multi` to match the keys in `job_links`.
                key = model
                if list(job_links.keys()) == [job_name]:
                    key = job_name
                job_link = job_links[key][device.replace("-gpu", "")]

                failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author]
                for x in failed_tests:
                    x.update({"job_link": job_link})
                model_result[device] = failed_tests
            _data[model] = {k: v for k, v in model_result.items() if len(v) > 0}
        new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0}

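    # Destination layout on the Hub (as assembled below): reports from scheduled (daily) runs go directly
    # under a YYYY-MM-DD folder, while other runs go under `<date>/runs/<run_number>-<run_id>`.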
    # Upload to Hub and get the url
    # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
    report_repo_subfolder = ""
    if os.getenv("GITHUB_EVENT_NAME") != "schedule":
        report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}"
        report_repo_subfolder = f"runs/{report_repo_subfolder}"

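    # `get_last_daily_ci_run` is used to look up the workflow run (the current one, via `GITHUB_RUN_ID`)
    # so that its `created_at` date can serve as the top-level report folder below.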
    workflow_run = get_last_daily_ci_run(
        token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID")
    )
    workflow_run_created_time = workflow_run["created_at"]

    report_repo_folder = workflow_run_created_time.split("T")[0]

    if report_repo_subfolder:
        report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"

    report_repo_id = os.getenv("REPORT_REPO_ID")

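    # Write the grouped report locally, then upload it to the dataset repo pointed to by `REPORT_REPO_ID`
    # so it can be linked from the Slack message.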
    with open("new_failures_with_bad_commit_grouped_by_authors.json", "w") as fp:
        json.dump(new_data_full, fp, ensure_ascii=False, indent=4)
    commit_info = api.upload_file(
        path_or_fileobj="new_failures_with_bad_commit_grouped_by_authors.json",
        path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json",
        repo_id=report_repo_id,
        repo_type="dataset",
        token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
    )
    url = f"https://huggingface.co/datasets/{report_repo_id}/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json"

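    # Prefix each GitHub handle with `GH_` (as in the docstring example); presumably this is what the Slack
    # side keys on to turn the handle into a mention.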
    # Add `GH_` prefix as keyword mention
    output = {}
    for author, item in new_data.items():
        author = f"GH_{author}"
        output[author] = item

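    # The report is printed as an escaped, single-line string (literal `\n` and `\"` sequences), presumably
    # so the calling workflow can embed it directly in a JSON/Slack payload.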
    report = f"<{url}|New failed tests>\\n\\n"
    report += json.dumps(output, indent=4).replace('"', '\\"').replace("\n", "\\n")
    print(report)