初始化项目，由ModelHub XC社区提供模型

Model: projecte-aina/whisper-large-v3-ca-3catparla Source: Original Platform
2026-05-13 17:57:28 +08:00
commit fe20ba2ad8
18 changed files with 119141 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,377 @@
+---
+language: ca
+datasets:
+- projecte-aina/3catparla_asr
+tags:
+- audio
+- automatic-speech-recognition
+- catalan
+- whisper-large-v3
+- projecte-aina
+- barcelona-supercomputing-center
+- bsc
+license: apache-2.0
+model-index:
+- name: whisper-large-v3-ca-3catparla
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: 3CatParla (Test)
+      type: projecte-aina/3catparla_asr
+      split: test
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 0.96
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: 3CatParla (Dev)
+      type: projecte-aina/3catparla_asr
+      split: dev
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 0.92
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Mozilla Common Voice 17.0 (Test)
+      type: mozilla-foundation/common_voice_17_0
+      split: test
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 10.32
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Mozilla Common Voice 17.0 (Dev)
+      type: mozilla-foundation/common_voice_17_0
+      split: validation
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 9.26
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Balearic fem)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Balearic female
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 12.25
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Balearic male)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Balearic male
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 12.18
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Central fem)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Central female
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 8.51
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Central male)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Central male
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 8.73
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Northern fem)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Northern female
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 8.09
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Northern male)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Northern male
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 8.28
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Northwestern fem)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Northwestern female
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 7.88
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Northwestern male)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Northwestern male
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 8.44
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Valencian fem)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Valencian female
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 9.58
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: CV Benchmark Catalan Accents (Valencian male)
+      type: projecte-aina/commonvoice_benchmark_catalan_accents
+      split: Valencian male
+      args:
+        language: ca
+    metrics:
+    - name: WER
+      type: wer
+      value: 9.1
+library_name: transformers
+---
+# whisper-large-v3-ca-3catparla
+
+## Table of Contents
+<details>
+<summary>Click to expand</summary>
+
+- [Paper](#paper)
+- [Model Description](#model-description)
+- [Intended Uses and Limitations](#intended-uses-and-limitations)
+- [How to Get Started with the Model](#how-to-get-started-with-the-model)
+- [Training Details](#training-details)
+- [Citation](#citation)
+- [Additional Information](#additional-information)
+
+</details>
+
+## Paper
+
+**PDF:** [3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition](https://www.isca-archive.org/iberspeech_2024/hernandezmena24_iberspeech.pdf)
+
+## Model Description
+
+The "whisper-large-v3-ca-3catparla" is an acoustic model suitable for Automatic Speech Recognition in Catalan. It is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) with 710 hours of Catalan data released by the [Projecte AINA](https://projecteaina.cat/) from Barcelona, Spain.
+
+## Intended Uses and Limitations
+
+This model can be used for Automatic Speech Recognition (ASR) in Catalan. The model is intended to transcribe audio files in Catalan to plain text without punctuation.
+
+## How to Get Started with the Model
+
+To see an updated and functional version of this code, please see our [Notebook](https://colab.research.google.com/drive/1MHiPrffNTwiyWeUyMQvSdSbfkef_8aJC?usp=sharing)
+
+### Installation
+
+In order to use this model, you may install [datasets](https://huggingface.co/docs/datasets/installation) and [transformers](https://huggingface.co/docs/transformers/installation):
+
+Create a virtual environment:
+```bash
+python -m venv /path/to/venv
+```
+Activate the environment:
+```bash
+source /path/to/venv/bin/activate
+```
+Install the modules:
+```bash
+pip install datasets transformers 
+```
+
+### For Inference
+In order to transcribe audio in Catalan using this model, you can follow this example:
+
+```bash
+#Install Prerequisites
+pip install torch
+pip install datasets
+pip install 'transformers[torch]'
+pip install evaluate
+pip install jiwer
+```
+
+```python
+#This code works with GPU
+
+#Notice that: load_metric is no longer part of datasets.
+#you have to remove it and use evaluate's load instead.
+#(Note from November 2024)
+
+import torch
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+#Load the processor and model.
+MODEL_NAME="projecte-aina/whisper-large-v3-ca-3catparla"
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")
+
+#Load the dataset
+from datasets import load_dataset, load_metric, Audio
+ds=load_dataset("projecte-aina/3catparla_asr",split='test')
+
+#Downsample to 16kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
+
+#Process the dataset
+def map_to_pred(batch):
+	audio = batch["audio"]
+	input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
+	batch["reference"] = processor.tokenizer._normalize(batch['normalized_text'])
+
+	with torch.no_grad():
+		predicted_ids = model.generate(input_features.to("cuda"))[0]
+	
+	transcription = processor.decode(predicted_ids)
+	batch["prediction"] = processor.tokenizer._normalize(transcription)
+	
+	return batch
+	
+#Do the evaluation
+result = ds.map(map_to_pred)
+
+#Compute the overall WER now.
+from evaluate import load
+
+wer = load("wer")
+WER=100 * wer.compute(references=result["reference"], predictions=result["prediction"])
+print(WER)
+```
+**Test Result**: 0.96
+
+## Training Details
+
+### Training data
+
+The specific dataset used to create the model is called ["3CatParla"](https://huggingface.co/datasets/projecte-aina/3catparla_asr).
+
+### Training procedure
+
+This model is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) by following this [tutorial](https://huggingface.co/blog/fine-tune-whisper) provided by Hugging Face.
+
+### Training Hyperparameters
+
+* language: catalan
+* hours of training audio: 710
+* learning rate: 1e-05 (peak, reached after warmup; linearly decayed to 1.95e-07 at the last logged step)
+* sample rate: 16000
+* train batch size: 32 (x4 GPUs)
+  * gradient accumulation steps: 1
+* eval batch size: 32
+* save total limit: 3
+* max steps: 19842
+* warmup steps: 1984
+* eval steps: 3307
+* save steps: 3307
+* shuffle buffer size: 480
+
+## Citation
+If this model contributes to your research, please cite the work:
+```bibtex
+@inproceedings{hernandez20243catparla,
+  title={3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition},
+  author={Hern{\'a}ndez Mena, Carlos Daniel and Armentano Oller, Carme and Solito, Sarah and K{\"u}lebi, Baybars},
+  booktitle={Proc. IberSPEECH 2024},
+  pages={176--180},
+  year={2024}
+}
+```
+
+<!--
+@misc{mena2024whisperlarge3catparla,
+      title={Acoustic Model in Catalan: whisper-large-v3-ca-3catparla.}, 
+      author={Hernandez Mena, Carlos Daniel; Armentano-Oller, Carme; Solito, Sarah; Külebi, Baybars},
+      organization={Barcelona Supercomputing Center},
+      url={https://huggingface.co/projecte-aina/whisper-large-v3-ca-3catparla},
+      year={2024}
+}
+-->
+
+## Additional Information
+
+### Author
+
+The fine-tuning process was performed in July 2024 in the [Language Technologies Unit](https://huggingface.co/BSC-LT) of the [Barcelona Supercomputing Center](https://www.bsc.es/) by [Carlos Daniel Hernández Mena](https://huggingface.co/carlosdanielhernandezmena).
+
+### Contact
+For further information, please send an email to <langtech@bsc.es>.
+
+### Copyright
+Copyright(c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
+
+### License
+
+[Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0)
+
+### Funding
+This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
+
+The training of the model was possible thanks to the compute time provided by [Barcelona Supercomputing Center](https://www.bsc.es/) through MareNostrum 5.
--- a/added_tokens.json
+++ b/added_tokens.json
--- a/all_results.json
+++ b/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 11.083056143533918,
+    "total_flos": 8.628884758428616e+21,
+    "train_loss": 0.04815149868992161,
+    "train_runtime": 124240.955,
+    "train_samples_per_second": 20.442,
+    "train_steps_per_second": 0.16
+}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,52 @@
+{
+  "_name_or_path": "/gpfs/projects/bsc88/speech/ASR/models/whisper-large-v3",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "apply_spec_augment": false,
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "classifier_proj_size": 256,
+  "d_model": 1280,
+  "decoder_attention_heads": 20,
+  "decoder_ffn_dim": 5120,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 32,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 20,
+  "encoder_ffn_dim": 5120,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 32,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "median_filter_width": 7,
+  "model_type": "whisper",
+  "num_hidden_layers": 32,
+  "num_mel_bins": 128,
+  "pad_token_id": 50256,
+  "scale_embedding": false,
+  "suppress_tokens": [],
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "use_cache": false,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 51866
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,266 @@
+{
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "decoder_start_token_id": 50258,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": [
+    [
+      1,
+      null
+    ],
+    [
+      2,
+      50360
+    ]
+  ],
+  "is_multilingual": true,
+  "lang_to_id": {
+    "<|af|>": 50327,
+    "<|am|>": 50334,
+    "<|ar|>": 50272,
+    "<|as|>": 50350,
+    "<|az|>": 50304,
+    "<|ba|>": 50355,
+    "<|be|>": 50330,
+    "<|bg|>": 50292,
+    "<|bn|>": 50302,
+    "<|bo|>": 50347,
+    "<|br|>": 50309,
+    "<|bs|>": 50315,
+    "<|ca|>": 50270,
+    "<|cs|>": 50283,
+    "<|cy|>": 50297,
+    "<|da|>": 50285,
+    "<|de|>": 50261,
+    "<|el|>": 50281,
+    "<|en|>": 50259,
+    "<|es|>": 50262,
+    "<|et|>": 50307,
+    "<|eu|>": 50310,
+    "<|fa|>": 50300,
+    "<|fi|>": 50277,
+    "<|fo|>": 50338,
+    "<|fr|>": 50265,
+    "<|gl|>": 50319,
+    "<|gu|>": 50333,
+    "<|haw|>": 50352,
+    "<|ha|>": 50354,
+    "<|he|>": 50279,
+    "<|hi|>": 50276,
+    "<|hr|>": 50291,
+    "<|ht|>": 50339,
+    "<|hu|>": 50286,
+    "<|hy|>": 50312,
+    "<|id|>": 50275,
+    "<|is|>": 50311,
+    "<|it|>": 50274,
+    "<|ja|>": 50266,
+    "<|jw|>": 50356,
+    "<|ka|>": 50329,
+    "<|kk|>": 50316,
+    "<|km|>": 50323,
+    "<|kn|>": 50306,
+    "<|ko|>": 50264,
+    "<|la|>": 50294,
+    "<|lb|>": 50345,
+    "<|ln|>": 50353,
+    "<|lo|>": 50336,
+    "<|lt|>": 50293,
+    "<|lv|>": 50301,
+    "<|mg|>": 50349,
+    "<|mi|>": 50295,
+    "<|mk|>": 50308,
+    "<|ml|>": 50296,
+    "<|mn|>": 50314,
+    "<|mr|>": 50320,
+    "<|ms|>": 50282,
+    "<|mt|>": 50343,
+    "<|my|>": 50346,
+    "<|ne|>": 50313,
+    "<|nl|>": 50271,
+    "<|nn|>": 50342,
+    "<|no|>": 50288,
+    "<|oc|>": 50328,
+    "<|pa|>": 50321,
+    "<|pl|>": 50269,
+    "<|ps|>": 50340,
+    "<|pt|>": 50267,
+    "<|ro|>": 50284,
+    "<|ru|>": 50263,
+    "<|sa|>": 50344,
+    "<|sd|>": 50332,
+    "<|si|>": 50322,
+    "<|sk|>": 50298,
+    "<|sl|>": 50305,
+    "<|sn|>": 50324,
+    "<|so|>": 50326,
+    "<|sq|>": 50317,
+    "<|sr|>": 50303,
+    "<|su|>": 50357,
+    "<|sv|>": 50273,
+    "<|sw|>": 50318,
+    "<|ta|>": 50287,
+    "<|te|>": 50299,
+    "<|tg|>": 50331,
+    "<|th|>": 50289,
+    "<|tk|>": 50341,
+    "<|tl|>": 50348,
+    "<|tr|>": 50268,
+    "<|tt|>": 50351,
+    "<|uk|>": 50280,
+    "<|ur|>": 50290,
+    "<|uz|>": 50337,
+    "<|vi|>": 50278,
+    "<|yi|>": 50335,
+    "<|yo|>": 50325,
+    "<|yue|>": 50358,
+    "<|zh|>": 50260
+  },
+  "language": "catalan",
+  "max_initial_timestamp_index": 50,
+  "max_length": 448,
+  "no_timestamps_token_id": 50364,
+  "pad_token_id": 50257,
+  "prev_sot_token_id": 50362,
+  "return_timestamps": false,
+  "suppress_tokens": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "task_to_id": {
+    "transcribe": 50360,
+    "translate": 50359
+  },
+  "transformers_version": "4.40.2"
+}
--- a/merges.txt
+++ b/merges.txt
--- a/normalizer.json
+++ b/normalizer.json
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@@ -0,0 +1,14 @@
+{
+  "chunk_length": 30,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}
--- a/pytorch_model-00001-of-00002.bin
+++ b/pytorch_model-00001-of-00002.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5dd9863724f0133ef9e2fc7fe7c438c9d8728d6a31ef999b85a5099285d0cac
+size 4993686017
--- a/pytorch_model-00002-of-00002.bin
+++ b/pytorch_model-00002-of-00002.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac4eb29e216f51ba613a65495e00e036a1ca27c11ea1892548c4b69dafbf87da
+size 1180727888
--- a/pytorch_model.bin.index.json
+++ b/pytorch_model.bin.index.json
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,139 @@
+{
+  "additional_special_tokens": [
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|yue|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nospeech|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
--- a/train_results.json
+++ b/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 11.083056143533918,
+    "total_flos": 8.628884758428616e+21,
+    "train_loss": 0.04815149868992161,
+    "train_runtime": 124240.955,
+    "train_samples_per_second": 20.442,
+    "train_steps_per_second": 0.16
+}
--- a/trainer_state.json
+++ b/trainer_state.json
@@ -0,0 +1,357 @@
+{
+  "best_metric": 0.8985657508208054,
+  "best_model_checkpoint": "CHECKPOINTS/checkpoint-3307",
+  "epoch": 11.083056143533918,
+  "eval_steps": 3307,
+  "global_step": 19842,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02519907267412559,
+      "grad_norm": 0.7393302321434021,
+      "learning_rate": 2.4899193548387098e-06,
+      "loss": 0.3059,
+      "step": 500
+    },
+    {
+      "epoch": 0.05039814534825118,
+      "grad_norm": 0.6863130331039429,
+      "learning_rate": 5.010080645161291e-06,
+      "loss": 0.1492,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07559721802237677,
+      "grad_norm": 0.703405499458313,
+      "learning_rate": 7.5302419354838715e-06,
+      "loss": 0.1428,
+      "step": 1500
+    },
+    {
+      "epoch": 1.017437758290495,
+      "grad_norm": 0.6589027643203735,
+      "learning_rate": 9.994400268787099e-06,
+      "loss": 0.1261,
+      "step": 2000
+    },
+    {
+      "epoch": 1.0426368309646206,
+      "grad_norm": 0.6031370759010315,
+      "learning_rate": 9.71441370814201e-06,
+      "loss": 0.1202,
+      "step": 2500
+    },
+    {
+      "epoch": 1.0678359036387461,
+      "grad_norm": 0.6696850657463074,
+      "learning_rate": 9.434427147496921e-06,
+      "loss": 0.1171,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0833081342606592,
+      "eval_loss": 0.017358383163809776,
+      "eval_runtime": 504.91,
+      "eval_samples_per_second": 0.824,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 0.8985657508208054,
+      "step": 3307
+    },
+    {
+      "epoch": 2.0096764439068644,
+      "grad_norm": 0.591613233089447,
+      "learning_rate": 9.154440586851832e-06,
+      "loss": 0.1042,
+      "step": 3500
+    },
+    {
+      "epoch": 2.03487551658099,
+      "grad_norm": 0.5527406930923462,
+      "learning_rate": 8.874454026206742e-06,
+      "loss": 0.0861,
+      "step": 4000
+    },
+    {
+      "epoch": 2.0600745892551156,
+      "grad_norm": 0.5651601552963257,
+      "learning_rate": 8.594467465561653e-06,
+      "loss": 0.0878,
+      "step": 4500
+    },
+    {
+      "epoch": 3.0019151295232334,
+      "grad_norm": 0.4751633107662201,
+      "learning_rate": 8.314480904916565e-06,
+      "loss": 0.0865,
+      "step": 5000
+    },
+    {
+      "epoch": 3.027114202197359,
+      "grad_norm": 0.5415444374084473,
+      "learning_rate": 8.034494344271475e-06,
+      "loss": 0.0625,
+      "step": 5500
+    },
+    {
+      "epoch": 3.0523132748714845,
+      "grad_norm": 0.5294741988182068,
+      "learning_rate": 7.754507783626388e-06,
+      "loss": 0.0636,
+      "step": 6000
+    },
+    {
+      "epoch": 3.0775123475456105,
+      "grad_norm": 0.538415789604187,
+      "learning_rate": 7.474521222981298e-06,
+      "loss": 0.0652,
+      "step": 6500
+    },
+    {
+      "epoch": 3.083257736115311,
+      "eval_loss": 0.01661744900047779,
+      "eval_runtime": 517.2779,
+      "eval_samples_per_second": 0.804,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 0.9369659965823781,
+      "step": 6614
+    },
+    {
+      "epoch": 4.019352887813729,
+      "grad_norm": 0.6536675691604614,
+      "learning_rate": 7.194534662336209e-06,
+      "loss": 0.0474,
+      "step": 7000
+    },
+    {
+      "epoch": 4.044551960487854,
+      "grad_norm": 0.5821442604064941,
+      "learning_rate": 6.91454810169112e-06,
+      "loss": 0.0441,
+      "step": 7500
+    },
+    {
+      "epoch": 4.06975103316198,
+      "grad_norm": 0.5527841448783875,
+      "learning_rate": 6.6345615410460304e-06,
+      "loss": 0.0449,
+      "step": 8000
+    },
+    {
+      "epoch": 5.011591573430098,
+      "grad_norm": 0.453218549489975,
+      "learning_rate": 6.354574980400942e-06,
+      "loss": 0.0369,
+      "step": 8500
+    },
+    {
+      "epoch": 5.036790646104223,
+      "grad_norm": 0.5013980865478516,
+      "learning_rate": 6.074588419755852e-06,
+      "loss": 0.0273,
+      "step": 9000
+    },
+    {
+      "epoch": 5.061989718778349,
+      "grad_norm": 0.5885359644889832,
+      "learning_rate": 5.7946018591107636e-06,
+      "loss": 0.0288,
+      "step": 9500
+    },
+    {
+      "epoch": 5.083207337969963,
+      "eval_loss": 0.017346344888210297,
+      "eval_runtime": 530.4397,
+      "eval_samples_per_second": 0.784,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 1.0060864389532094,
+      "step": 9921
+    },
+    {
+      "epoch": 6.003830259046467,
+      "grad_norm": 0.4446285665035248,
+      "learning_rate": 5.514615298465674e-06,
+      "loss": 0.0276,
+      "step": 10000
+    },
+    {
+      "epoch": 6.029029331720593,
+      "grad_norm": 0.46283265948295593,
+      "learning_rate": 5.234628737820585e-06,
+      "loss": 0.016,
+      "step": 10500
+    },
+    {
+      "epoch": 6.054228404394718,
+      "grad_norm": 0.42813611030578613,
+      "learning_rate": 4.954642177175496e-06,
+      "loss": 0.0173,
+      "step": 11000
+    },
+    {
+      "epoch": 6.079427477068844,
+      "grad_norm": 0.48960232734680176,
+      "learning_rate": 4.674655616530407e-06,
+      "loss": 0.018,
+      "step": 11500
+    },
+    {
+      "epoch": 7.021268017336962,
+      "grad_norm": 0.3988407254219055,
+      "learning_rate": 4.394669055885318e-06,
+      "loss": 0.0109,
+      "step": 12000
+    },
+    {
+      "epoch": 7.046467090011087,
+      "grad_norm": 0.3698909282684326,
+      "learning_rate": 4.114682495240229e-06,
+      "loss": 0.0101,
+      "step": 12500
+    },
+    {
+      "epoch": 7.071666162685213,
+      "grad_norm": 0.4073663055896759,
+      "learning_rate": 3.8346959345951395e-06,
+      "loss": 0.0109,
+      "step": 13000
+    },
+    {
+      "epoch": 7.083156939824614,
+      "eval_loss": 0.0192726943641901,
+      "eval_runtime": 518.0983,
+      "eval_samples_per_second": 0.803,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 0.9907263406485801,
+      "step": 13228
+    },
+    {
+      "epoch": 8.01350670295333,
+      "grad_norm": 0.33067503571510315,
+      "learning_rate": 3.5547093739500504e-06,
+      "loss": 0.0082,
+      "step": 13500
+    },
+    {
+      "epoch": 8.038705775627458,
+      "grad_norm": 0.3731881380081177,
+      "learning_rate": 3.2747228133049617e-06,
+      "loss": 0.0062,
+      "step": 14000
+    },
+    {
+      "epoch": 8.063904848301583,
+      "grad_norm": 0.2673242688179016,
+      "learning_rate": 2.9947362526598727e-06,
+      "loss": 0.0066,
+      "step": 14500
+    },
+    {
+      "epoch": 9.0057453885697,
+      "grad_norm": 0.18087884783744812,
+      "learning_rate": 2.7147496920147836e-06,
+      "loss": 0.0062,
+      "step": 15000
+    },
+    {
+      "epoch": 9.030944461243827,
+      "grad_norm": 0.250787615776062,
+      "learning_rate": 2.4347631313696945e-06,
+      "loss": 0.0038,
+      "step": 15500
+    },
+    {
+      "epoch": 9.056143533917952,
+      "grad_norm": 0.2255438268184662,
+      "learning_rate": 2.1547765707246054e-06,
+      "loss": 0.0041,
+      "step": 16000
+    },
+    {
+      "epoch": 9.081342606592077,
+      "grad_norm": 0.29366812109947205,
+      "learning_rate": 1.8747900100795163e-06,
+      "loss": 0.0044,
+      "step": 16500
+    },
+    {
+      "epoch": 9.083106541679266,
+      "eval_loss": 0.020848926156759262,
+      "eval_runtime": 513.0393,
+      "eval_samples_per_second": 0.811,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 0.9772862546320297,
+      "step": 16535
+    },
+    {
+      "epoch": 10.023183146860196,
+      "grad_norm": 0.1837795376777649,
+      "learning_rate": 1.5948034494344272e-06,
+      "loss": 0.0026,
+      "step": 17000
+    },
+    {
+      "epoch": 10.048382219534322,
+      "grad_norm": 0.24585728347301483,
+      "learning_rate": 1.314816888789338e-06,
+      "loss": 0.0026,
+      "step": 17500
+    },
+    {
+      "epoch": 10.073581292208447,
+      "grad_norm": 0.15548868477344513,
+      "learning_rate": 1.0348303281442492e-06,
+      "loss": 0.0026,
+      "step": 18000
+    },
+    {
+      "epoch": 11.015421832476564,
+      "grad_norm": 0.08757825195789337,
+      "learning_rate": 7.554037406204502e-07,
+      "loss": 0.002,
+      "step": 18500
+    },
+    {
+      "epoch": 11.040620905150691,
+      "grad_norm": 0.1045205295085907,
+      "learning_rate": 4.7541717997536123e-07,
+      "loss": 0.0016,
+      "step": 19000
+    },
+    {
+      "epoch": 11.065819977824816,
+      "grad_norm": 0.07768367975950241,
+      "learning_rate": 1.9543061933027217e-07,
+      "loss": 0.0016,
+      "step": 19500
+    },
+    {
+      "epoch": 11.083056143533918,
+      "eval_loss": 0.02412882074713707,
+      "eval_runtime": 518.0169,
+      "eval_samples_per_second": 0.803,
+      "eval_steps_per_second": 0.008,
+      "eval_wer": 0.9926463529366589,
+      "step": 19842
+    },
+    {
+      "epoch": 11.083056143533918,
+      "step": 19842,
+      "total_flos": 8.628884758428616e+21,
+      "train_loss": 0.04815149868992161,
+      "train_runtime": 124240.955,
+      "train_samples_per_second": 20.442,
+      "train_steps_per_second": 0.16
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 19842,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 3307,
+  "total_flos": 8.628884758428616e+21,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74768d83badcff2bde356dd7f72dc7ffe861663843fa21b5a97e286a5eae8f3e
+size 5176
--- a/vocab.json
+++ b/vocab.json