This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,179 @@
from collections import Counter
import datasets
import transformers
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import logging
logging.set_verbosity_info()
TOKENIZER_CLASSES = {
name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
}
dataset = datasets.load_dataset("facebook/xnli", split="test+validation") # no-script
total = 0
perfect = 0
imperfect = 0
wrong = 0
def check_diff(
spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
elif len(spm_diff) == len(tok_diff) and fast.decode(spm_diff) == fast.decode(tok_diff):
# Second order OK
# Barrich -> Barr + ich vs Bar + rich
return True
spm_reencoded = slow.encode(slow.decode(spm_diff))
tok_reencoded = fast.encode(fast.decode(spm_diff))
if spm_reencoded != spm_diff and spm_reencoded == tok_reencoded:
# Type 3 error.
# Snehagatha ->
# Sne, h, aga, th, a
# Sne, ha, gat, ha
# Encoding the wrong with sp does not even recover what spm gave us
# It fits tokenizer however...
return True
return False
def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
enc = fast.encode_plus(line)[0]
offsets = enc.offsets
curr, prev = offsets[idx], offsets[idx - 1]
if curr is not None and line[curr[0] : curr[1]] == "\u200f":
return True
if prev is not None and line[prev[0] : prev[1]] == "\u200f":
return True
return False
def check_details(
line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
if spm_id != tok_id:
break
first = i
for i, (spm_id, tok_id) in enumerate(zip(reversed(spm_ids), reversed(tok_ids))):
if spm_id != tok_id:
break
last = len(spm_ids) - i
spm_diff = spm_ids[first:last]
tok_diff = tok_ids[first:last]
if check_diff(spm_diff, tok_diff, slow, fast):
return True
if check_LTR_mark(line, first, fast):
return True
if last - first > 5:
# We might have twice a single problem, attempt to subdivide the disjointed tokens into smaller problems
spms = Counter(spm_ids[first:last])
toks = Counter(tok_ids[first:last])
removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si}
min_width = 3
for i in range(last - first - min_width):
if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)):
possible_matches = [
k
for k in range(last - first - min_width)
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
]
for j in possible_matches:
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
) and check_details(
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
slow,
fast,
):
return True
print(f"Spm: {[fast.decode([spm_ids[i]]) for i in range(first, last)]}")
try:
print(f"Tok: {[fast.decode([tok_ids[i]]) for i in range(first, last)]}")
except Exception:
pass
fast.decode(spm_ids[:first])
fast.decode(spm_ids[last:])
wrong = fast.decode(spm_ids[first:last])
print()
print(wrong)
return False
def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, text: str) -> None:
global perfect
global imperfect
global wrong
global total
slow_ids = slow.encode(text)
fast_ids = fast.encode(text)
skip_assert = False
total += 1
if slow_ids != fast_ids:
if check_details(text, slow_ids, fast_ids, slow, fast):
skip_assert = True
imperfect += 1
else:
wrong += 1
else:
perfect += 1
if total % 10000 == 0:
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
if skip_assert:
return
assert slow_ids == fast_ids, (
f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
)
def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:
global batch_total
for i in range(len(dataset)):
# premise, all languages
for text in dataset[i]["premise"].values():
test_string(slow, fast, text)
# hypothesis, all languages
for text in dataset[i]["hypothesis"]["translation"]:
test_string(slow, fast, text)
if __name__ == "__main__":
for name, (slow_class, fast_class) in TOKENIZER_CLASSES.items():
checkpoint_names = list(slow_class.max_model_input_sizes.keys())
for checkpoint in checkpoint_names:
imperfect = 0
perfect = 0
wrong = 0
total = 0
print(f"========================== Checking {name}: {checkpoint} ==========================")
slow = slow_class.from_pretrained(checkpoint, force_download=True)
fast = fast_class.from_pretrained(checkpoint, force_download=True)
test_tokenizer(slow, fast)
print(f"Accuracy {perfect * 100 / total:.2f}")

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python
#
# This a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
# many nodes) can talk to each other via nccl and allocate gpu memory.
#
# To run first adjust the number of processes and nodes:
#
# python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
#
# You may need to add --master_addr $MASTER_ADDR --master_port $MASTER_PORT if using a custom addr:port
#
# You can also use the rdzv API: --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d
#
# use torch.distributed.launch instead of torch.distributed.run for torch < 1.9
#
# If you get a hanging in `barrier` calls you have some network issues, you may try to debug this with:
#
# NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
#
# which should tell you what's going on behind the scenes.
#
#
# This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
# runs on 2 nodes of 4 gpus per node:
#
# #SBATCH --job-name=test-nodes # name
# #SBATCH --nodes=2 # nodes
# #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
# #SBATCH --cpus-per-task=10 # number of cores per tasks
# #SBATCH --gres=gpu:4 # number of gpus
# #SBATCH --time 0:05:00 # maximum execution time (HH:MM:SS)
# #SBATCH --output=%x-%j.out # output file name
#
# GPUS_PER_NODE=4
# MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# MASTER_PORT=6000
#
# srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
# --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
# --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
# torch-distributed-gpu-test.py'
#
import fcntl
import os
import socket
import torch
import torch.distributed as dist
def printflock(*msgs):
"""solves multi-process interleaved print problem"""
with open(__file__, "r") as fh:
fcntl.flock(fh, fcntl.LOCK_EX)
try:
print(*msgs)
finally:
fcntl.flock(fh, fcntl.LOCK_UN)
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()
gpu = f"[{hostname}-{local_rank}]"
try:
# test distributed
dist.init_process_group("nccl")
dist.all_reduce(torch.ones(1).to(device), op=dist.ReduceOp.SUM)
dist.barrier()
# test cuda is available and can allocate memory
torch.cuda.is_available()
torch.ones(1).cuda(local_rank)
# global rank
rank = dist.get_rank()
world_size = dist.get_world_size()
printflock(f"{gpu} is OK (global rank: {rank}/{world_size})")
dist.barrier()
if rank == 0:
printflock(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
except Exception:
printflock(f"{gpu} is broken")
raise

View File

@@ -0,0 +1,76 @@
# Copyright 2021 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
import os
from datetime import datetime as dt
import github.GithubException
from github import Github
LABELS_TO_EXEMPT = [
"good first issue",
"good second issue",
"good difficult issue",
"feature request",
"new model",
"wip",
]
def main():
g = Github(os.environ["GITHUB_TOKEN"])
repo = g.get_repo("huggingface/transformers")
open_issues = repo.get_issues(state="open")
for i, issue in enumerate(open_issues):
print(i, issue)
comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
last_comment = comments[0] if len(comments) > 0 else None
if (
last_comment is not None
and last_comment.user.login == "github-actions[bot]"
and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
):
# print(f"Would close issue {issue.number} since it has been 7 days of inactivity since bot mention.")
try:
issue.edit(state="closed")
except github.GithubException as e:
print("Couldn't close the issue:", repr(e))
elif (
(dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 23
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
):
# print(f"Would add stale comment to {issue.number}")
try:
issue.create_comment(
"This issue has been automatically marked as stale because it has not had "
"recent activity. If you think this still needs to be addressed "
"please comment on this thread.\n\nPlease note that issues that do not follow the "
"[contributing guidelines](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) "
"are likely to be ignored."
)
except github.GithubException as e:
print("Couldn't create comment:", repr(e))
if __name__ == "__main__":
main()