init

2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions
--- a/transformers/scripts/check_tokenizers.py
+++ b/transformers/scripts/check_tokenizers.py
@@ -0,0 +1,179 @@
+from collections import Counter
+
+import datasets
+
+import transformers
+from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.utils import logging
+
+
+# Set the transformers logging verbosity to INFO for the whole run.
+logging.set_verbosity_info()
+
+# Maps each tokenizer name listed in SLOW_TO_FAST_CONVERTERS to a (slow class, fast class) pair.
+# Both classes are looked up as attributes of the `transformers` package; the fast class is
+# expected under the same name with a "Fast" suffix.
+TOKENIZER_CLASSES = {
+    name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
+}
+
+# Loaded once at import time from the "test+validation" split specification of facebook/xnli.
+dataset = datasets.load_dataset("facebook/xnli", split="test+validation")  # no-script
+
+# Running tallies updated by `test_string` (declared `global` there) and reset for every
+# checkpoint by the `__main__` loop at the bottom of this script.
+total = 0
+perfect = 0
+imperfect = 0
+wrong = 0
+
+
+def check_diff(
+    spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
+    """Decide whether two differing id spans are an acceptable slow/fast disagreement.
+
+    Args:
+        spm_diff: Ids produced by the slow tokenizer for the disputed span.
+        tok_diff: Ids produced by the fast tokenizer for the same span.
+        slow: Slow tokenizer, used to round-trip `spm_diff` through decode/encode.
+        fast: Fast tokenizer, used to decode both spans and to round-trip `spm_diff`.
+
+    Returns:
+        True when one of the three tolerated patterns below applies, False otherwise.
+    """
+    # Pattern 1: the same pieces in mirrored order (AAA -> AA+A vs A+AA).
+    if spm_diff == tok_diff[::-1]:
+        return True
+
+    # Pattern 2: same number of tokens spelling the same text
+    # (Barrich -> Barr + ich vs Bar + rich).
+    same_length = len(spm_diff) == len(tok_diff)
+    if same_length and fast.decode(spm_diff) == fast.decode(tok_diff):
+        return True
+
+    # Pattern 3: round-trip the slow ids through each tokenizer. Example:
+    #   Snehagatha ->
+    #       Sne, h, aga, th, a
+    #       Sne, ha, gat, ha
+    # The slow tokenizer does not reproduce its own ids when re-encoding its decoded text,
+    # yet its re-encoding agrees with the fast tokenizer's re-encoding.
+    slow_roundtrip = slow.encode(slow.decode(spm_diff))
+    fast_roundtrip = fast.encode(fast.decode(spm_diff))
+    return slow_roundtrip != spm_diff and slow_roundtrip == fast_roundtrip
+
+
+def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
+    """Report whether the fast token at `idx`, or the one just before it, is exactly "\\u200f".
+
+    The line is re-encoded with `fast.encode_plus` and the character offsets of the first
+    encoding are used to slice the token text back out of `line`.
+
+    NOTE(review): U+200F is the Unicode RIGHT-TO-LEFT MARK, despite the "LTR" in the name.
+    `idx - 1` wraps to the last token when `idx` is 0 (plain negative indexing).
+
+    Args:
+        line: The raw text that was tokenized.
+        idx: Position of the token to inspect within the fast encoding of `line`.
+        fast: Fast tokenizer providing `encode_plus` and per-token offsets.
+
+    Returns:
+        True if either inspected token covers exactly the "\\u200f" character.
+    """
+    offsets = fast.encode_plus(line)[0].offsets
+    # Current token first, then its predecessor; both lookups happen before any comparison.
+    candidates = (offsets[idx], offsets[idx - 1])
+    for span in candidates:
+        if span is None:
+            continue
+        start, end = span[0], span[1]
+        if line[start:end] == "\u200f":
+            return True
+    return False
+
+
+def check_details(
+    line: str,
+    spm_ids: list[int],
+    tok_ids: list[int],
+    slow: PreTrainedTokenizerBase,
+    fast: PreTrainedTokenizerBase,
+    tok_offset: int = 0,
+) -> bool:
+    """Decide whether a slow/fast tokenization mismatch is an acceptable one.
+
+    The common prefix and common suffix of the two id lists are stripped, and the remaining
+    middle spans are handed to `check_diff` and `check_LTR_mark`. If neither accepts and the
+    slow span is longer than 5 tokens, the spans are split at runs of 3 identical tokens and
+    each part is checked separately (recursively). When nothing accepts the mismatch, the
+    disputed tokens are printed for inspection.
+
+    Args:
+        line: The raw text that was tokenized.
+        spm_ids: Ids from the slow tokenizer (possibly a sub-span during recursion).
+        tok_ids: Ids from the fast tokenizer (possibly a sub-span during recursion).
+        slow: Slow tokenizer, forwarded to `check_diff`.
+        fast: Fast tokenizer, used for decoding and forwarded to the helper checks.
+        tok_offset: Position of `tok_ids[0]` inside the full fast encoding of `line`.
+            Callers outside this function can leave the default of 0; recursive calls set it
+            so that `check_LTR_mark`, which re-encodes the whole line, inspects the right token.
+
+    Returns:
+        True if the mismatch is one of the tolerated kinds, False otherwise.
+    """
+    shortest = min(len(spm_ids), len(tok_ids))
+
+    # Length of the common prefix. Counting (rather than reusing a loop index) also covers
+    # empty inputs and the case where one list is a prefix of the other.
+    first = 0
+    while first < shortest and spm_ids[first] == tok_ids[first]:
+        first += 1
+
+    # Length of the common suffix, capped so that it never overlaps the prefix.
+    suffix = 0
+    while suffix < shortest - first and spm_ids[-1 - suffix] == tok_ids[-1 - suffix]:
+        suffix += 1
+
+    # The two lists may differ in length, so each one gets its own end index.
+    spm_last = len(spm_ids) - suffix
+    tok_last = len(tok_ids) - suffix
+
+    spm_diff = spm_ids[first:spm_last]
+    tok_diff = tok_ids[first:tok_last]
+
+    if check_diff(spm_diff, tok_diff, slow, fast):
+        return True
+
+    if tok_ids:
+        # Clamp to the last fast token when the fast ids are exhausted by the common prefix,
+        # then translate to an index into the full-line encoding.
+        ltr_idx = tok_offset + min(first, len(tok_ids) - 1)
+        if check_LTR_mark(line, ltr_idx, fast):
+            return True
+
+    if len(spm_diff) > 5:
+        # We might have twice a single problem, attempt to subdivide the disjointed tokens into smaller problems
+        spm_counts = Counter(spm_diff)
+        tok_counts = Counter(tok_diff)
+
+        # Tokens occurring equally often on both sides can serve as alignment anchors.
+        removable_tokens = {tok for tok, count in spm_counts.items() if tok_counts.get(tok, 0) == count}
+        min_width = 3
+        for i in range(len(spm_diff) - min_width):
+            anchor = spm_diff[i : i + min_width]
+            if not all(tok in removable_tokens for tok in anchor):
+                continue
+            for j in range(len(tok_diff) - min_width):
+                if tok_diff[j : j + min_width] != anchor:
+                    continue
+                # Everything before the anchor must be an accepted diff, and everything from
+                # the anchor onwards must pass this same check.
+                if check_diff(spm_diff[:i], tok_diff[:j], slow, fast) and check_details(
+                    line,
+                    spm_diff[i:],
+                    tok_diff[j:],
+                    slow,
+                    fast,
+                    tok_offset + first + j,
+                ):
+                    return True
+
+    # Nothing accepted the mismatch: show both token spans and the decoded slow span.
+    print(f"Spm: {[fast.decode([tok]) for tok in spm_diff]}")
+    print(f"Tok: {[fast.decode([tok]) for tok in tok_diff]}")
+    print()
+    print(fast.decode(spm_diff))
+    return False
+
+
+def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, text: str) -> None:
+    """Encode `text` with both tokenizers, update the module tallies and assert agreement.
+
+    Identical encodings count as `perfect`; differing encodings accepted by `check_details`
+    count as `imperfect` and are not asserted on; anything else counts as `wrong` and trips
+    the assertion. A progress line is printed every 10000 calls.
+
+    Args:
+        slow: Slow tokenizer under test.
+        fast: Fast tokenizer under test.
+        text: The string to encode with both.
+
+    Raises:
+        AssertionError: If the encodings differ and the difference is not tolerated.
+    """
+    global total, perfect, imperfect, wrong
+
+    slow_ids = slow.encode(text)
+    fast_ids = fast.encode(text)
+
+    total += 1
+    tolerated = False
+
+    if slow_ids == fast_ids:
+        perfect += 1
+    elif check_details(text, slow_ids, fast_ids, slow, fast):
+        imperfect += 1
+        tolerated = True
+    else:
+        wrong += 1
+
+    if total % 10000 == 0:
+        print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
+
+    if not tolerated:
+        assert slow_ids == fast_ids, (
+            f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+        )
+
+
+def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:
+    """Run `test_string` on every premise and hypothesis text of the module-level `dataset`.
+
+    Args:
+        slow: Slow tokenizer under test.
+        fast: Fast tokenizer under test.
+    """
+    # Iterate rows directly so that each row is read once instead of once per field.
+    for row in dataset:
+        # premise, all languages
+        for text in row["premise"].values():
+            test_string(slow, fast, text)
+
+        # hypothesis, all languages
+        for text in row["hypothesis"]["translation"]:
+            test_string(slow, fast, text)
+
+
+if __name__ == "__main__":
+    # Walk every (slow, fast) tokenizer pair and every checkpoint name the slow class lists.
+    for name, (slow_class, fast_class) in TOKENIZER_CLASSES.items():
+        checkpoint_names = list(slow_class.max_model_input_sizes)
+        for checkpoint in checkpoint_names:
+            # Start each checkpoint with fresh module-level tallies (see `test_string`).
+            total = perfect = imperfect = wrong = 0
+
+            banner = f"========================== Checking {name}: {checkpoint} =========================="
+            print(banner)
+
+            # Both tokenizers are fetched with force_download=True.
+            slow = slow_class.from_pretrained(checkpoint, force_download=True)
+            fast = fast_class.from_pretrained(checkpoint, force_download=True)
+            test_tokenizer(slow, fast)
+
+            accuracy = perfect * 100 / total
+            print(f"Accuracy {accuracy:.2f}")
--- a/transformers/scripts/distributed/torch-distributed-gpu-test.py
+++ b/transformers/scripts/distributed/torch-distributed-gpu-test.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+#
+# This is a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
+# many nodes) can talk to each other via nccl and allocate gpu memory.
+#
+# To run first adjust the number of processes and nodes:
+#
+# python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+#
+# You may need to add --master_addr $MASTER_ADDR --master_port $MASTER_PORT if using a custom addr:port
+#
+# You can also use the rdzv API: --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d
+#
+# use torch.distributed.launch instead of torch.distributed.run for torch < 1.9
+#
+# If a `barrier` call hangs, you likely have a network issue; you can try to debug it with:
+#
+# NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+#
+# which should tell you what's going on behind the scenes.
+#
+#
+# This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
+# runs on 2 nodes of 4 gpus per node:
+#
+# #SBATCH --job-name=test-nodes        # name
+# #SBATCH --nodes=2                    # nodes
+# #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+# #SBATCH --cpus-per-task=10           # number of cores per task
+# #SBATCH --gres=gpu:4                 # number of gpus
+# #SBATCH --time 0:05:00               # maximum execution time (HH:MM:SS)
+# #SBATCH --output=%x-%j.out           # output file name
+#
+# GPUS_PER_NODE=4
+# MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+# MASTER_PORT=6000
+#
+# srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
+# --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
+# --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
+# torch-distributed-gpu-test.py'
+#
+
+import fcntl
+import os
+import socket
+
+import torch
+import torch.distributed as dist
+
+
+def printflock(*msgs):
+    """Print `msgs` while holding an exclusive `flock` on this script's own file.
+
+    Solves the multi-process interleaved print problem: every process locks the same file
+    (`__file__`) before printing, so only one of them writes at a time.
+
+    Args:
+        *msgs: Values forwarded unchanged to `print`.
+    """
+    with open(__file__, "r") as fh:
+        fcntl.flock(fh, fcntl.LOCK_EX)
+        try:
+            # Flush before unlocking; otherwise a buffered stdout may emit the bytes after
+            # the lock is released and the output can still interleave.
+            print(*msgs, flush=True)
+        finally:
+            fcntl.flock(fh, fcntl.LOCK_UN)
+
+
+# LOCAL_RANK is read from the environment (the header comments show launching this script
+# through `torch.distributed.run`); it selects the CUDA device this process uses.
+local_rank = int(os.environ["LOCAL_RANK"])
+torch.cuda.set_device(local_rank)
+device = torch.device("cuda", local_rank)
+hostname = socket.gethostname()
+
+# Prefix identifying this process in the report lines below.
+gpu = f"[{hostname}-{local_rank}]"
+
+try:
+    # test distributed
+    dist.init_process_group("nccl")
+    dist.all_reduce(torch.ones(1).to(device), op=dist.ReduceOp.SUM)
+    dist.barrier()
+
+    # test cuda is available and can allocate memory
+    if not torch.cuda.is_available():
+        # Act on the result instead of discarding it, so the failure is reported below.
+        raise RuntimeError("torch.cuda.is_available() returned False")
+    torch.ones(1).cuda(local_rank)
+
+    # global rank
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    printflock(f"{gpu} is OK (global rank: {rank}/{world_size})")
+
+    dist.barrier()
+    if rank == 0:
+        printflock(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
+
+except Exception:
+    printflock(f"{gpu} is broken")
+    raise
+else:
+    # Only on success: tear the process group down explicitly before the interpreter exits.
+    dist.destroy_process_group()
--- a/transformers/scripts/stale.py
+++ b/transformers/scripts/stale.py
@@ -0,0 +1,76 @@
+# Copyright 2021 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Script to close stale issues. Taken in part from the AllenNLP repository.
+https://github.com/allenai/allennlp.
+"""
+
+import os
+from datetime import datetime as dt
+
+import github.GithubException
+from github import Github
+
+
+# Issues carrying any of these labels are neither marked stale nor closed by `main`.
+# Label names are lower-cased before the membership test, so entries here must be lowercase.
+LABELS_TO_EXEMPT = [
+    "good first issue",
+    "good second issue",
+    "good difficult issue",
+    "feature request",
+    "new model",
+    "wip",
+]
+
+
+def _days_since(moment, now):
+    """Return the whole days elapsed from `moment` to `now`.
+
+    `moment` has its tzinfo dropped before subtracting, so `now` must be a naive datetime.
+    """
+    return (now - moment.replace(tzinfo=None)).days
+
+
+def _is_exempt(issue):
+    """Return True when any label of `issue`, lower-cased, is listed in LABELS_TO_EXEMPT."""
+    return any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
+
+
+def main():
+    """Close or flag stale open issues of huggingface/transformers.
+
+    Authenticates with the token in the GITHUB_TOKEN environment variable and walks every open
+    issue. Issues created less than 30 days ago, or carrying a label from LABELS_TO_EXEMPT,
+    are left alone. For the rest:
+
+    - if the newest comment was written by "github-actions[bot]" and the issue was last
+      updated more than 7 days ago, the issue is closed;
+    - otherwise, if the issue was last updated more than 23 days ago, a stale notice is posted.
+
+    GitHub API errors raised while closing or commenting are printed and do not stop the run.
+
+    Raises:
+        KeyError: If GITHUB_TOKEN is not set.
+    """
+    # Local import: only this function needs `timezone`.
+    from datetime import timezone
+
+    g = Github(os.environ["GITHUB_TOKEN"])
+    repo = g.get_repo("huggingface/transformers")
+    open_issues = repo.get_issues(state="open")
+
+    for i, issue in enumerate(open_issues):
+        print(i, issue)
+
+        # Naive UTC "now": the value `utcnow()` used to give, without the deprecated call.
+        now = dt.now(timezone.utc).replace(tzinfo=None)
+        idle_days = _days_since(issue.updated_at, now)
+
+        # Both actions need an issue at least 30 days old that has been idle for more than
+        # 7 days; skip the rest before fetching comments or labels.
+        if _days_since(issue.created_at, now) < 30 or idle_days <= 7:
+            continue
+
+        # Newest comment, or None when the issue has no comments.
+        last_comment = max(issue.get_comments(), key=lambda c: c.created_at, default=None)
+        bot_spoke_last = last_comment is not None and last_comment.user.login == "github-actions[bot]"
+
+        # Without a trailing bot comment only the 23-day stale notice can apply.
+        if not bot_spoke_last and idle_days <= 23:
+            continue
+
+        if _is_exempt(issue):
+            continue
+
+        if bot_spoke_last:
+            # print(f"Would close issue {issue.number} since it has been 7 days of inactivity since bot mention.")
+            try:
+                issue.edit(state="closed")
+            except github.GithubException as e:
+                print("Couldn't close the issue:", repr(e))
+        else:
+            # print(f"Would add stale comment to {issue.number}")
+            try:
+                issue.create_comment(
+                    "This issue has been automatically marked as stale because it has not had "
+                    "recent activity. If you think this still needs to be addressed "
+                    "please comment on this thread.\n\nPlease note that issues that do not follow the "
+                    "[contributing guidelines](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) "
+                    "are likely to be ignored."
+                )
+            except github.GithubException as e:
+                print("Couldn't create comment:", repr(e))
+
+
+if __name__ == "__main__":
+    main()