初始化项目,由ModelHub XC社区提供模型

Model: mikr/whisper-large-v3-czech-cv13
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-20 16:04:41 +08:00
commit 1425cf8cc5
23 changed files with 183228 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

67
README.md Normal file
View File

@@ -0,0 +1,67 @@
---
license: apache-2.0
base_model: openai/whisper-large-v3
tags:
- generated_from_trainer
metrics:
- wer
model-index:
- name: openai/whisper-large-v3
results: []
---
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->
# whisper-large-v3-czech-cv13
This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on the Czech (`cs`) configuration of the mozilla-foundation/common_voice_13_0 dataset.
It achieves the following results on the evaluation set:
- Loss: 0.1283
- Wer: 0.0789
## Model description
More information needed
## Intended uses & limitations
More information needed
## Training and evaluation data
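According to the `run.sh` launch script in this repository, the model was trained on the `train+validation` splits and evaluated on the `test` split of the Czech (`cs`) configuration of mozilla-foundation/common_voice_13_0.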
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 62
- eval_batch_size: 16
- seed: 42
- distributed_type: multi-GPU
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 500
- training_steps: 5000
- mixed_precision_training: Native AMP
### Training results
| Training Loss | Epoch | Step | Validation Loss | Wer |
|:-------------:|:-----:|:----:|:---------------:|:------:|
| 0.0138 | 2.24 | 1000 | 0.0962 | 0.0863 |
| 0.004 | 4.48 | 2000 | 0.1117 | 0.0844 |
| 0.0015 | 6.73 | 3000 | 0.1178 | 0.0807 |
| 0.0004 | 8.97 | 4000 | 0.1219 | 0.0792 |
| 0.0002 | 11.21 | 5000 | 0.1283 | 0.0789 |
### Framework versions
- Transformers 4.36.0.dev0
- Pytorch 2.0.0+cu117
- Datasets 2.14.6
- Tokenizers 0.14.1

1611
added_tokens.json Normal file

File diff suppressed because it is too large Load Diff

50
config.json Normal file
View File

@@ -0,0 +1,50 @@
{
"_name_or_path": "openai/whisper-large-v3",
"activation_dropout": 0.0,
"activation_function": "gelu",
"apply_spec_augment": false,
"architectures": [
"WhisperForConditionalGeneration"
],
"attention_dropout": 0.0,
"begin_suppress_tokens": [
220,
50256
],
"bos_token_id": 50256,
"classifier_proj_size": 256,
"d_model": 1280,
"decoder_attention_heads": 20,
"decoder_ffn_dim": 5120,
"decoder_layerdrop": 0.0,
"decoder_layers": 32,
"decoder_start_token_id": 50257,
"dropout": 0.0,
"encoder_attention_heads": 20,
"encoder_ffn_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 32,
"eos_token_id": 50256,
"forced_decoder_ids": null,
"init_std": 0.02,
"is_encoder_decoder": true,
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_prob": 0.05,
"max_source_positions": 1500,
"max_target_positions": 448,
"median_filter_width": 7,
"model_type": "whisper",
"num_hidden_layers": 32,
"num_mel_bins": 128,
"pad_token_id": 50256,
"scale_embedding": false,
"torch_dtype": "float16",
"transformers_version": "4.36.0.dev0",
"use_cache": true,
"use_weighted_layer_sum": false,
"vocab_size": 51866
}

50
ds_config.json Normal file
View File

@@ -0,0 +1,50 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"last_batch_iteration": -1,
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}

264
generation_config.json Normal file
View File

@@ -0,0 +1,264 @@
{
"alignment_heads": [
[
7,
0
],
[
10,
17
],
[
12,
18
],
[
13,
12
],
[
16,
1
],
[
17,
14
],
[
19,
11
],
[
21,
4
],
[
24,
1
],
[
25,
6
]
],
"begin_suppress_tokens": [
220,
50257
],
"bos_token_id": 50257,
"decoder_start_token_id": 50258,
"eos_token_id": 50257,
"forced_decoder_ids": [
[
1,
null
],
[
2,
50360
]
],
"is_multilingual": true,
"lang_to_id": {
"<|af|>": 50327,
"<|am|>": 50334,
"<|ar|>": 50272,
"<|as|>": 50350,
"<|az|>": 50304,
"<|ba|>": 50355,
"<|be|>": 50330,
"<|bg|>": 50292,
"<|bn|>": 50302,
"<|bo|>": 50347,
"<|br|>": 50309,
"<|bs|>": 50315,
"<|ca|>": 50270,
"<|cs|>": 50283,
"<|cy|>": 50297,
"<|da|>": 50285,
"<|de|>": 50261,
"<|el|>": 50281,
"<|en|>": 50259,
"<|es|>": 50262,
"<|et|>": 50307,
"<|eu|>": 50310,
"<|fa|>": 50300,
"<|fi|>": 50277,
"<|fo|>": 50338,
"<|fr|>": 50265,
"<|gl|>": 50319,
"<|gu|>": 50333,
"<|haw|>": 50352,
"<|ha|>": 50354,
"<|he|>": 50279,
"<|hi|>": 50276,
"<|hr|>": 50291,
"<|ht|>": 50339,
"<|hu|>": 50286,
"<|hy|>": 50312,
"<|id|>": 50275,
"<|is|>": 50311,
"<|it|>": 50274,
"<|ja|>": 50266,
"<|jw|>": 50356,
"<|ka|>": 50329,
"<|kk|>": 50316,
"<|km|>": 50323,
"<|kn|>": 50306,
"<|ko|>": 50264,
"<|la|>": 50294,
"<|lb|>": 50345,
"<|ln|>": 50353,
"<|lo|>": 50336,
"<|lt|>": 50293,
"<|lv|>": 50301,
"<|mg|>": 50349,
"<|mi|>": 50295,
"<|mk|>": 50308,
"<|ml|>": 50296,
"<|mn|>": 50314,
"<|mr|>": 50320,
"<|ms|>": 50282,
"<|mt|>": 50343,
"<|my|>": 50346,
"<|ne|>": 50313,
"<|nl|>": 50271,
"<|nn|>": 50342,
"<|no|>": 50288,
"<|oc|>": 50328,
"<|pa|>": 50321,
"<|pl|>": 50269,
"<|ps|>": 50340,
"<|pt|>": 50267,
"<|ro|>": 50284,
"<|ru|>": 50263,
"<|sa|>": 50344,
"<|sd|>": 50332,
"<|si|>": 50322,
"<|sk|>": 50298,
"<|sl|>": 50305,
"<|sn|>": 50324,
"<|so|>": 50326,
"<|sq|>": 50317,
"<|sr|>": 50303,
"<|su|>": 50357,
"<|sv|>": 50273,
"<|sw|>": 50318,
"<|ta|>": 50287,
"<|te|>": 50299,
"<|tg|>": 50331,
"<|th|>": 50289,
"<|tk|>": 50341,
"<|tl|>": 50348,
"<|tr|>": 50268,
"<|tt|>": 50351,
"<|uk|>": 50280,
"<|ur|>": 50290,
"<|uz|>": 50337,
"<|vi|>": 50278,
"<|yi|>": 50335,
"<|yo|>": 50325,
"<|yue|>": 50358,
"<|zh|>": 50260
},
"max_initial_timestamp_index": 1,
"max_length": 448,
"no_timestamps_token_id": 50364,
"pad_token_id": 50257,
"return_timestamps": false,
"suppress_tokens": [
1,
2,
7,
8,
9,
10,
14,
25,
26,
27,
28,
29,
31,
58,
59,
60,
61,
62,
63,
90,
91,
92,
93,
359,
503,
522,
542,
873,
893,
902,
918,
922,
931,
1350,
1853,
1982,
2460,
2627,
3246,
3253,
3268,
3536,
3846,
3961,
4183,
4667,
6585,
6647,
7273,
9061,
9383,
10428,
10929,
11938,
12033,
12331,
12562,
13793,
14157,
14635,
15265,
15618,
16553,
16604,
18362,
18956,
20075,
21675,
22520,
26130,
26161,
26435,
28279,
29464,
31650,
32302,
32470,
36865,
42863,
47425,
49870,
50254,
50258,
50359,
50360,
50361,
50362,
50363
],
"task_to_id": {
"transcribe": 50360,
"translate": 50359
},
"transformers_version": "4.36.0.dev0"
}

50001
merges.txt Normal file

File diff suppressed because it is too large Load Diff

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6fd0431d832cac84fbe6f016bacfac3397c95b143d67d9e0e8a60c90402c5c27
size 3219908024

1742
normalizer.json Normal file

File diff suppressed because it is too large Load Diff

14
preprocessor_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"chunk_length": 30,
"feature_extractor_type": "WhisperFeatureExtractor",
"feature_size": 128,
"hop_length": 160,
"n_fft": 400,
"n_samples": 480000,
"nb_max_frames": 3000,
"padding_side": "right",
"padding_value": 0.0,
"processor_class": "WhisperProcessor",
"return_attention_mask": false,
"sampling_rate": 16000
}

7
requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
datasets >= 1.18.0
git+https://github.com/huggingface/transformers
torch >= 1.5
torchaudio
librosa
jiwer
evaluate

39
run-stream.sh Normal file
View File

@@ -0,0 +1,39 @@
# Launches run_speech_recognition_seq2seq_streaming.py through the DeepSpeed
# launcher, using ds_config.json as the DeepSpeed configuration.
#
# Fine-tunes openai/whisper-large-v3 on the Czech ("cs") configuration of
# mozilla-foundation/common_voice_13_0: trains on train+validation, evaluates
# on test, for 5000 steps with evaluation and checkpointing every 1000 steps.
# The best checkpoint (lowest WER) is reloaded at the end and pushed to the Hub.
#
# NOTE(review): --streaming is passed as "False" even though this is the
# streaming variant of the script -- confirm this is intentional.
# NOTE(review): the learning rate here is 1e-6 and the per-device train batch
# size is 20; confirm these are the intended values for this variant.
deepspeed run_speech_recognition_seq2seq_streaming.py \
--deepspeed="ds_config.json" \
--model_name_or_path="openai/whisper-large-v3" \
--dataset_name="mozilla-foundation/common_voice_13_0" \
--dataset_config_name="cs" \
--language="czech" \
--train_split_name="train+validation" \
--eval_split_name="test" \
--max_steps="5000" \
--output_dir="./" \
--per_device_train_batch_size="20" \
--per_device_eval_batch_size="16" \
--gradient_accumulation_steps="1" \
--logging_steps="25" \
--learning_rate="1e-6" \
--warmup_steps="500" \
--evaluation_strategy="steps" \
--eval_steps="1000" \
--save_strategy="steps" \
--save_steps="1000" \
--generation_max_length="225" \
--length_column_name="input_length" \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
--report_to="tensorboard" \
--metric_for_best_model="wer" \
--greater_is_better="False" \
--load_best_model_at_end \
--gradient_checkpointing \
--fp16 \
--overwrite_output_dir \
--do_train \
--do_eval \
--predict_with_generate \
--do_normalize_eval \
--streaming="False" \
--use_auth_token \
--push_to_hub

40
run.sh Normal file
View File

@@ -0,0 +1,40 @@
# Launches run_speech_recognition_seq2seq.py through the DeepSpeed launcher,
# using ds_config.json as the DeepSpeed configuration.
#
# Fine-tunes openai/whisper-large-v3 on the Czech ("cs") configuration of
# mozilla-foundation/common_voice_13_0: trains on train+validation, evaluates
# on test, for 5000 steps (500 warmup steps, learning rate 1e-5, per-device
# train batch size 62) with evaluation and checkpointing every 1000 steps.
# Target text is not lower-cased (--do_lower_case="False") and audio longer
# than 30 seconds is excluded via --max_duration_in_seconds.
# The best checkpoint (lowest WER) is reloaded at the end and pushed to the Hub.
#
# NOTE(review): --use_auth_token is reported as deprecated by the training
# script's own help text in favour of --token; consider migrating.
deepspeed run_speech_recognition_seq2seq.py \
--deepspeed="ds_config.json" \
--model_name_or_path="openai/whisper-large-v3" \
--dataset_name="mozilla-foundation/common_voice_13_0" \
--dataset_config_name="cs" \
--language="czech" \
--train_split_name="train+validation" \
--eval_split_name="test" \
--max_steps="5000" \
--output_dir="./" \
--per_device_train_batch_size="62" \
--per_device_eval_batch_size="16" \
--gradient_accumulation_steps="1" \
--logging_steps="25" \
--learning_rate="1e-5" \
--warmup_steps="500" \
--evaluation_strategy="steps" \
--eval_steps="1000" \
--save_strategy="steps" \
--save_steps="1000" \
--do_lower_case="False" \
--generation_max_length="225" \
--preprocessing_num_workers="16" \
--length_column_name="input_length" \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
--report_to="tensorboard" \
--metric_for_best_model="wer" \
--greater_is_better="False" \
--load_best_model_at_end \
--gradient_checkpointing \
--group_by_length \
--fp16 \
--overwrite_output_dir \
--do_train \
--do_eval \
--predict_with_generate \
--use_auth_token \
--push_to_hub

View File

@@ -0,0 +1,625 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence speech recognition.
"""
# You can also adapt this script on your own sequence to sequence speech
# recognition task. Pointers for this are left as comments.
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import datasets
import evaluate
import torch
from datasets import DatasetDict, load_dataset
import transformers
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForSpeechSeq2Seq,
AutoProcessor,
AutoTokenizer,
HfArgumentParser,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.36.0.dev0")
# Require datasets>=1.18.0; the message string points at the example requirements file.
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
# Module-level logger named after this module; its level is configured inside main().
logger = logging.getLogger(__name__)
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Every field carries its command-line help text in ``metadata["help"]``.
    Only ``model_name_or_path`` has no default and is therefore required.
    """

    # --- Where to load the model and its companions from -------------------
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    feature_extractor_name: Optional[str] = field(
        default=None, metadata={"help": "feature extractor name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )

    # --- Hub authentication ------------------------------------------------
    # Defaults to None, so the annotation is Optional[str].
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    # NOTE(review): intentionally left annotated as plain `bool` although the default is None.
    # The launch scripts pass `--use_auth_token` as a bare flag, and the argument parser may treat
    # `Optional[bool]` differently from `bool` -- confirm against HfArgumentParser before changing.
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                # A trailing space was missing after "This option", which rendered as "optionshould".
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )

    # --- Which parts of the model to freeze --------------------------------
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )
    freeze_encoder: bool = field(
        default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."}
    )

    # --- Generation / augmentation settings written into the model config ---
    forced_decoder_ids: Optional[List[List[int]]] = field(
        default=None,
        metadata={
            "help": (
                "A list of pairs of integers which indicates a mapping from generation indices to token indices "
                "that will be forced before sampling. For example, [[0, 123]] means the first generated token "
                "will always be a token of index 123."
            )
        },
    )
    suppress_tokens: Optional[List[int]] = field(
        default=None, metadata={"help": "A list of tokens that will be suppressed at generation."}
    )
    apply_spec_augment: bool = field(
        default=False,
        metadata={
            "help": "Whether to apply *SpecAugment* data augmentation to the input features. This is currently only relevant for Wav2Vec2, HuBERT, WavLM and Whisper models."
        },
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field carries its command-line help text in ``metadata["help"]`` and has a
    default, so the class can be instantiated without arguments.
    """

    # --- Dataset selection --------------------------------------------------
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )

    # --- Debug-size truncation ----------------------------------------------
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )

    # --- Column names ---------------------------------------------------------
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )

    # --- Duration filtering ---------------------------------------------------
    # The training script drops samples outside (min, max) rather than truncating them,
    # so the help text says "Filter" to mirror `min_duration_in_seconds`.
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )

    # --- Splits -----------------------------------------------------------------
    train_split_name: str = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            # Previously a copy of the train-split help ("training ... Defaults to 'train'").
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )

    # --- Text normalisation and task ---------------------------------------------
    do_lower_case: bool = field(
        default=True,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
                "only. For English speech recognition, it should be set to `None`."
            )
        },
    )
    task: str = field(
        default="transcribe",
        metadata={"help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."},
    )
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate a list of preprocessed examples into one padded batch.

    Args:
        processor ([`WhisperProcessor`])
            Supplies the feature extractor and tokenizer whose `pad` methods are used.
        decoder_start_token_id (`int`)
            Token id that marks the start of the decoder sequence.
        forward_attention_mask (`bool`)
            When true, each example's `attention_mask` is stacked into the batch.
    """

    processor: Any
    decoder_start_token_id: int
    forward_attention_mask: bool

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and text labels differ in length and are padded by
        # different components, so they are collected separately.
        input_key = self.processor.model_input_names[0]
        audio_items = [{input_key: example[input_key]} for example in features]
        label_items = [{"input_ids": example["labels"]} for example in features]

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        if self.forward_attention_mask:
            per_example_masks = [example["attention_mask"] for example in features]
            batch["attention_mask"] = torch.LongTensor(per_example_masks)

        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Padding positions become -100 so that the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row already begins with the decoder start token (added during
        # tokenization), drop it here because it is appended again later.
        every_row_has_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
def main():
# 1. Parse input arguments
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if model_args.use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
FutureWarning,
)
if model_args.token is not None:
raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
model_args.token = model_args.use_auth_token
# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
# information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args)
# 2. Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
transformers.utils.logging.set_verbosity_info()
logger.info("Training/evaluation parameters %s", training_args)
# 3. Detecting last checkpoint and eventually continue from last checkpoint
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model.
set_seed(training_args.seed)
# 4. Load dataset
raw_datasets = DatasetDict()
if training_args.do_train:
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.train_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
)
if training_args.do_eval:
raw_datasets["eval"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
)
if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
raise ValueError(
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
)
if data_args.text_column_name not in next(iter(raw_datasets.values())).column_names:
raise ValueError(
f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--text_column_name` to the correct text column - one of "
f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
)
# 5. Load pretrained model, tokenizer, and feature extractor
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})
# SpecAugment for whisper models
if getattr(config, "model_type", None) == "whisper":
config.update({"apply_spec_augment": model_args.apply_spec_augment})
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()
if model_args.freeze_encoder:
model.freeze_encoder()
model.model.encoder.gradient_checkpointing = False
if data_args.language is not None:
# We only need to set the task id when the language is specified (i.e. in a multilingual setting)
tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)
# 6. Resample speech dataset if necessary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
)
# 7. Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
audio_column_name = data_args.audio_column_name
num_workers = data_args.preprocessing_num_workers
text_column_name = data_args.text_column_name
model_input_name = feature_extractor.model_input_names[0]
do_lower_case = data_args.do_lower_case
# if SpecAugment is used for whisper models, return attention_mask to guide the mask along time axis
forward_attention_mask = (
getattr(config, "model_type", None) == "whisper"
and getattr(config, "apply_spec_augment", False)
and getattr(config, "mask_time_prob", 0) > 0
)
if data_args.max_train_samples is not None:
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if data_args.max_eval_samples is not None:
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
def prepare_dataset(batch):
# process audio
sample = batch[audio_column_name]
inputs = feature_extractor(
sample["array"], sampling_rate=sample["sampling_rate"], return_attention_mask=forward_attention_mask
)
# process audio length
batch[model_input_name] = inputs.get(model_input_name)[0]
batch["input_length"] = len(sample["array"])
if forward_attention_mask:
batch["attention_mask"] = inputs.get("attention_mask")[0]
# process targets
input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
batch["labels"] = tokenizer(input_str).input_ids
return batch
with training_args.main_process_first(desc="dataset map pre-processing"):
vectorized_datasets = raw_datasets.map(
prepare_dataset,
remove_columns=next(iter(raw_datasets.values())).column_names,
num_proc=data_args.preprocessing_num_workers,
desc="preprocess train dataset",
)
# filter data that is shorter than min_input_length or longer than
# max_input_length
def is_audio_in_length_range(length):
return length > min_input_length and length < max_input_length
vectorized_datasets = vectorized_datasets.filter(
is_audio_in_length_range,
num_proc=num_workers,
input_columns=["input_length"],
)
# for large datasets it is advised to run the preprocessing on a
# single machine first with `args.preprocessing_only` since there will mostly likely
# be a timeout when running the script in distributed mode.
# In a second step `args.preprocessing_only` can then be set to `False` to load the
# cached dataset
if data_args.preprocessing_only:
cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
logger.info(f"Data preprocessing finished. Files cached at {cache}.")
return
# 8. Load Metric
metric = evaluate.load("wer")
def compute_metrics(pred):
    """
    Compute the word error rate for a batch of generated predictions.

    `pred.label_ids` is modified in place: every -100 entry is replaced by the tokenizer's pad token id
    so that the labels can be decoded. Returns `{"wer": <value returned by the metric>}`.
    """
    is_ignored = pred.label_ids == -100
    pred.label_ids[is_ignored] = tokenizer.pad_token_id

    decoded_predictions = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    # we do not want to group tokens when computing the metrics
    decoded_references = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

    return {"wer": metric.compute(predictions=decoded_predictions, references=decoded_references)}
# 9. Create a single speech processor
# make sure all processes wait until data is saved
with training_args.main_process_first():
# only the main process saves them
if is_main_process(training_args.local_rank):
# save feature extractor, tokenizer and config
feature_extractor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
config.save_pretrained(training_args.output_dir)
processor = AutoProcessor.from_pretrained(training_args.output_dir)
# 10. Define data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
processor=processor,
decoder_start_token_id=model.config.decoder_start_token_id,
forward_attention_mask=forward_attention_mask,
)
# 11. Initialize Trainer
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
tokenizer=feature_extractor,
data_collator=data_collator,
compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)
# 12. Training
if training_args.do_train:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the feature extractor too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples
if data_args.max_train_samples is not None
else len(vectorized_datasets["train"])
)
metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# 13. Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate(
metric_key_prefix="eval",
max_length=training_args.generation_max_length,
num_beams=training_args.generation_num_beams,
)
max_eval_samples = (
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
)
metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# 14. Write Training Stats
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "automatic-speech-recognition"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
return results
# Script entry point: call main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,629 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence speech recognition
with 🤗 Datasets' streaming mode.
"""
# You can also adapt this script for your own sequence to sequence speech
# recognition task. Pointers for this are left as comments.
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import datasets
import torch
from datasets import DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset
from torch.utils.data import IterableDataset
import evaluate
import transformers
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForSpeechSeq2Seq,
AutoProcessor,
AutoTokenizer,
HfArgumentParser,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
TrainerCallback,
set_seed,
)
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers.trainer_pt_utils import IterableDatasetShard
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.25.0.dev0")
# Same kind of guard for the `datasets` package; the second argument is the hint text telling the user
# how to install the requirements.
require_version("datasets>=1.18.2", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
# Module-level logger; its level is set inside main().
logger = logging.getLogger(__name__)
@dataclass
class ModelArguments:
    """
    Command-line options selecting the checkpoint to fine-tune and how its parts are loaded.

    Each `help` string below is the text attached to the corresponding field's metadata.
    """

    # --- which checkpoint / config / tokenizer / feature extractor to load ---
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    feature_extractor_name: Optional[str] = field(
        default=None,
        metadata={"help": "feature extractor name or path if not the same as model_name"},
    )

    # --- download / loading options ---
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )

    # --- which parts of the model are frozen ---
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )
    freeze_encoder: bool = field(
        default=False,
        metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."},
    )

    # --- generation-related config overrides ---
    forced_decoder_ids: List[List[int]] = field(
        default=None,
        metadata={
            "help": (
                "A list of pairs of integers which indicates a mapping from generation indices to token indices "
                "that will be forced before sampling. For example, [[0, 123]] means the first generated token "
                "will always be a token of index 123."
            )
        },
    )
    suppress_tokens: List[int] = field(
        default=None,
        metadata={"help": "A list of tokens that will be suppressed at generation."},
    )

    # --- model card ---
    model_index_name: str = field(
        default=None,
        metadata={"help": "Pretty name for the model card."},
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Each `help` string below is the text attached to the corresponding field's metadata.
    """

    dataset_name: str = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )
    # The script filters examples by length (it never truncates audio), so the help text says "filter".
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    # Help text describes the evaluation split and its actual default ('test').
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    do_lower_case: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    do_remove_punctuation: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be striped of punctuation."},
    )
    do_normalize_eval: bool = field(
        default=True,
        metadata={"help": "Whether to normalise the references and predictions in the eval WER calculation."},
    )
    language: str = field(
        default=None,
        metadata={
            "help": (
                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
                "only. For English speech recognition, it should be set to `None`."
            )
        },
    )
    task: str = field(
        default="transcribe",
        metadata={"help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."},
    )
    shuffle_buffer_size: Optional[int] = field(
        default=500,
        metadata={
            "help": (
                "The number of streamed examples to download before shuffling them. The large the buffer, "
                "the closer it is to real offline shuffling."
            )
        },
    )
    streaming: bool = field(
        default=True,
        metadata={"help": "Whether to use streaming mode to load and pre-process the data."},
    )
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collator that pads audio inputs and label sequences separately and merges them into one batch.

    Args:
        processor ([`WhisperProcessor`])
            Object exposing `model_input_names`, `feature_extractor.pad` and `tokenizer.pad`.
        decoder_start_token_id (`int`)
            Id of the token that starts decoder sequences; stripped from the labels when every row begins with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and use different padding methods,
        # so they are gathered into two separate lists first.
        input_key = self.processor.model_input_names[0]
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({input_key: example[input_key]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding: set them to -100 so the loss ignores them.
        label_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        labels = label_ids.masked_fill(is_padding, -100)

        # When every row already starts with the decoder start token, drop that first column,
        # since the token gets added again later.
        every_row_has_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
def load_maybe_streaming_dataset(dataset_name, dataset_config_name, split="train", streaming=True, **kwargs):
    """
    Load one or several splits of a dataset, optionally in streaming mode.

    Args:
        dataset_name: Name forwarded to `load_dataset`.
        dataset_config_name: Configuration name forwarded to `load_dataset`.
        split: A single split name, or several names joined by `+` (e.g. `"train+validation"`).
            Whitespace around each `+`-separated name is ignored.
        streaming: Forwarded to `load_dataset`.
        **kwargs: Extra keyword arguments forwarded unchanged to every `load_dataset` call.

    Returns:
        The dataset returned by `load_dataset` for a single split, or the result of `interleave_datasets`
        over the individually loaded splits when `split` contains `+`.
    """
    if "+" in split:
        # Load every `+`-separated split on its own. Names are stripped so that "train + validation"
        # asks for "train" and "validation" rather than "train " and " validation".
        dataset_splits = [
            load_dataset(dataset_name, dataset_config_name, split=split_name.strip(), streaming=streaming, **kwargs)
            for split_name in split.split("+")
        ]
        # interleave multiple splits to form one dataset
        return interleave_datasets(dataset_splits)

    # single split: one plain load
    return load_dataset(dataset_name, dataset_config_name, split=split, streaming=streaming, **kwargs)
def main():
    """
    Fine-tune and/or evaluate a sequence-to-sequence speech recognition model.

    Arguments are parsed into `ModelArguments`, `DataTrainingArguments` and `Seq2SeqTrainingArguments`,
    either from the command line or from a single `.json` file path. The function then loads the requested
    dataset splits (streamed or not), the config/feature extractor/tokenizer/model, pre-processes the data,
    runs `Seq2SeqTrainer` training and/or evaluation, and finally pushes to the hub or writes a model card.

    Returns:
        `dict`: the `results` dictionary. Nothing in this function adds entries to it, so it is empty.

    Raises:
        ValueError: if the output directory exists, is non-empty and holds no checkpoint (without
            `--overwrite_output_dir`); if neither `--do_train` nor `--do_eval` is set; if the audio or text
            column is missing from the dataset; or if `config.decoder_start_token_id` is undefined.
    """
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_seq2seq_streaming", model_args, data_args)

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # NOTE: this second setLevel call replaces the level set from `log_level` just above for this logger.
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()

    # 3. Detecting last checkpoint and eventually continue from last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 4. Load dataset
    raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()

    if training_args.do_train:
        raw_datasets["train"] = load_maybe_streaming_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
            streaming=data_args.streaming,
        )

    if training_args.do_eval:
        raw_datasets["eval"] = load_maybe_streaming_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
            streaming=data_args.streaming,
        )

    # Everything below inspects the first loaded split, so fail with a clear message when no split was
    # requested instead of letting `next(iter(...))` raise a bare StopIteration.
    if len(raw_datasets) == 0:
        raise ValueError("No dataset split was loaded. Pass at least one of `--do_train` or `--do_eval`.")

    raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())

    if data_args.audio_column_name not in raw_datasets_features:
        raise ValueError(
            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--audio_column_name` to the correct audio column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    if data_args.text_column_name not in raw_datasets_features:
        raise ValueError(
            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--text_column_name` to the correct text column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    # 5. Load pretrained model, tokenizer, and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})

    if training_args.gradient_checkpointing:
        config.update({"use_cache": False})

    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    if model_args.freeze_encoder:
        model.freeze_encoder()

    if data_args.language is not None:
        # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)

    # 6. Resample speech dataset if necessary
    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
        )

    # 7. Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    # Length bounds are expressed in audio samples (seconds * sampling rate).
    max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    audio_column_name = data_args.audio_column_name
    text_column_name = data_args.text_column_name
    model_input_name = feature_extractor.model_input_names[0]
    do_lower_case = data_args.do_lower_case
    do_remove_punctuation = data_args.do_remove_punctuation
    normalizer = BasicTextNormalizer()  # 'official' text normalizer from OpenAI

    if data_args.max_train_samples is not None:
        raw_datasets["train"] = (
            raw_datasets["train"].take(data_args.max_train_samples)
            if data_args.streaming
            else raw_datasets["train"].select(range(data_args.max_train_samples))
        )

    if data_args.max_eval_samples is not None:
        raw_datasets["eval"] = (
            raw_datasets["eval"].take(data_args.max_eval_samples)
            if data_args.streaming
            else raw_datasets["eval"].select(range(data_args.max_eval_samples))
        )

    def prepare_dataset(batch):
        # process audio
        sample = batch[audio_column_name]
        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
        # process audio length
        batch[model_input_name] = inputs.get(model_input_name)[0]
        batch["input_length"] = len(sample["array"])

        # process targets
        input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
        if do_remove_punctuation:
            input_str = normalizer(input_str).strip()
        batch["labels"] = tokenizer(input_str).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map pre-processing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
        ).with_format("torch")

        if training_args.do_train and data_args.streaming:
            # manually shuffle if streaming (done by the trainer for non-streaming)
            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
                buffer_size=data_args.shuffle_buffer_size,
                seed=training_args.seed,
            )

    # filter training data that is shorter than min_input_length or longer than
    # max_input_length
    def is_audio_in_length_range(length):
        return min_input_length < length < max_input_length

    if training_args.do_train:
        vectorized_datasets["train"] = vectorized_datasets["train"].filter(
            is_audio_in_length_range,
            input_columns=["input_length"],
        )

    # 8. Load Metric
    metric = evaluate.load("wer")
    do_normalize_eval = data_args.do_normalize_eval

    def compute_metrics(pred):
        pred_ids = pred.predictions

        # -100 entries are swapped (in place) for the pad token id so the labels can be decoded
        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

        if do_normalize_eval:
            pred_str = [normalizer(text) for text in pred_str]
            label_str = [normalizer(text) for text in label_str]
            # filtering step to only evaluate the samples that correspond to non-zero references:
            pred_str = [pred_str[i] for i in range(len(pred_str)) if len(label_str[i]) > 0]
            label_str = [label_str[i] for i in range(len(label_str)) if len(label_str[i]) > 0]

        # the metric value is scaled by 100 before being reported
        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

    # 9. Create a single speech processor
    # make sure all processes wait until data is saved, so that no process tries to load the
    # processor from `output_dir` before the main process has written the files
    with training_args.main_process_first():
        # only the main process saves them
        if is_main_process(training_args.local_rank):
            # save feature extractor, tokenizer and config
            feature_extractor.save_pretrained(training_args.output_dir)
            tokenizer.save_pretrained(training_args.output_dir)
            config.save_pretrained(training_args.output_dir)

    processor = AutoProcessor.from_pretrained(training_args.output_dir)

    # 10. Define data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # 11. Configure Trainer
    # Trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
    # Only required for streaming: Trainer automatically shuffles non-streaming datasets
    class ShuffleCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
            if isinstance(train_dataloader.dataset, IterableDatasetShard):
                pass  # set_epoch() is handled by the Trainer
            elif isinstance(train_dataloader.dataset, IterableDataset):
                train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)

    # Initialize Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
        callbacks=[ShuffleCallback()] if data_args.streaming else None,
    )

    # 12. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the feature extractor too for easy upload

        metrics = train_result.metrics
        if data_args.max_train_samples:
            metrics["train_samples"] = data_args.max_train_samples
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # 13. Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="eval",
            max_length=training_args.generation_max_length,
            num_beams=training_args.generation_num_beams,
        )
        if data_args.max_eval_samples:
            metrics["eval_samples"] = data_args.max_eval_samples

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 14. Write Training Stats
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": "whisper-event",
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name
        # The language code is the part of the config name before the first '-'. Skip this when no
        # config name was given, since `None.split` would raise AttributeError.
        if "common_voice" in data_args.dataset_name and data_args.dataset_config_name is not None:
            kwargs["language"] = data_args.dataset_config_name.split("-")[0]
        if model_args.model_index_name is not None:
            kwargs["model_name"] = model_args.model_index_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results
# Script entry point: call main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e2ac8a8a9e1e90d6627f516a852d953dd162e6a2676fddf5939abb3da2ef7e6
size 5161

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e16cb63fb9f0e03fa4fcd079db81ecda37373375b27dcae475d73cbeb1a1ea8f
size 5007

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9102a6f3df4e711d987b5d35c62d3b85384b52d2f706df8d3976edb09f1804dc
size 38336

139
special_tokens_map.json Normal file
View File

@@ -0,0 +1,139 @@
{
"additional_special_tokens": [
"<|startoftranscript|>",
"<|en|>",
"<|zh|>",
"<|de|>",
"<|es|>",
"<|ru|>",
"<|ko|>",
"<|fr|>",
"<|ja|>",
"<|pt|>",
"<|tr|>",
"<|pl|>",
"<|ca|>",
"<|nl|>",
"<|ar|>",
"<|sv|>",
"<|it|>",
"<|id|>",
"<|hi|>",
"<|fi|>",
"<|vi|>",
"<|he|>",
"<|uk|>",
"<|el|>",
"<|ms|>",
"<|cs|>",
"<|ro|>",
"<|da|>",
"<|hu|>",
"<|ta|>",
"<|no|>",
"<|th|>",
"<|ur|>",
"<|hr|>",
"<|bg|>",
"<|lt|>",
"<|la|>",
"<|mi|>",
"<|ml|>",
"<|cy|>",
"<|sk|>",
"<|te|>",
"<|fa|>",
"<|lv|>",
"<|bn|>",
"<|sr|>",
"<|az|>",
"<|sl|>",
"<|kn|>",
"<|et|>",
"<|mk|>",
"<|br|>",
"<|eu|>",
"<|is|>",
"<|hy|>",
"<|ne|>",
"<|mn|>",
"<|bs|>",
"<|kk|>",
"<|sq|>",
"<|sw|>",
"<|gl|>",
"<|mr|>",
"<|pa|>",
"<|si|>",
"<|km|>",
"<|sn|>",
"<|yo|>",
"<|so|>",
"<|af|>",
"<|oc|>",
"<|ka|>",
"<|be|>",
"<|tg|>",
"<|sd|>",
"<|gu|>",
"<|am|>",
"<|yi|>",
"<|lo|>",
"<|uz|>",
"<|fo|>",
"<|ht|>",
"<|ps|>",
"<|tk|>",
"<|nn|>",
"<|mt|>",
"<|sa|>",
"<|lb|>",
"<|my|>",
"<|bo|>",
"<|tl|>",
"<|mg|>",
"<|as|>",
"<|tt|>",
"<|haw|>",
"<|ln|>",
"<|ha|>",
"<|ba|>",
"<|jw|>",
"<|su|>",
"<|yue|>",
"<|translate|>",
"<|transcribe|>",
"<|startoflm|>",
"<|startofprev|>",
"<|nospeech|>",
"<|notimestamps|>"
],
"bos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

114903
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

12996
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1adac70f1f1f82a14f7164910af93cebe7c58a829359d4b895c9772e839d3f49
size 5883

1
vocab.json Normal file

File diff suppressed because one or more lines are too long