初始化项目，由ModelHub XC社区提供模型

Model: projecte-aina/whisper-large-v3-ca-3catparla Source: Original Platform
2026-05-13 17:57:28 +08:00
commit fe20ba2ad8
18 changed files with 119141 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,377 @@
 ---
 language: ca
 datasets:
 - projecte-aina/3catparla_asr
 tags:
 - audio
 - automatic-speech-recognition
 - catalan
 - whisper-large-v3
 - projecte-aina
 - barcelona-supercomputing-center
 - bsc
 license: apache-2.0
 model-index:
 - name: whisper-large-v3-ca-3catparla
  results:
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: 3CatParla (Test)
      type: projecte-aina/3catparla_asr
      split: test
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 0.96
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: 3CatParla (Dev)
      type: projecte-aina/3catparla_asr
      split: dev
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 0.92
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 17.0 (Test)
      type: mozilla-foundation/common_voice_17_0
      split: test
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 10.32
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 17.0 (Dev)
      type: mozilla-foundation/common_voice_17_0
      split: validation
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.26
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Balearic fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Balearic female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 12.25
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Balearic male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Balearic male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 12.18
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Central fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Central female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.51
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Central male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Central male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.73
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northern fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northern female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.09
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northern male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northern male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.28
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northwestern fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northwestern female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 7.88
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northwestern male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northwestern male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.44
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Valencian fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Valencian female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.58
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Valencian male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Valencian male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.1
 library_name: transformers
 ---
 # whisper-large-v3-ca-3catparla
 ## Table of Contents
 <details>
 <summary>Click to expand</summary>
 - [Paper](#paper)
 - [Model Description](#model-description)
 - [Intended Uses and Limitations](#intended-uses-and-limitations)
 - [How to Get Started with the Model](#how-to-get-started-with-the-model)
 - [Training Details](#training-details)
 - [Citation](#citation)
 - [Additional Information](#additional-information)
 </details>
 ## Paper
 **PDF:** [3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition](https://www.isca-archive.org/iberspeech_2024/hernandezmena24_iberspeech.pdf)
 ## Model Description
 The "whisper-large-v3-ca-3catparla" is an acoustic model suitable for Automatic Speech Recognition in Catalan. It is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) with 710 hours of Catalan data released by the [Projecte AINA](https://projecteaina.cat/) from Barcelona, Spain.
 ## Intended Uses and Limitations
 This model can be used for Automatic Speech Recognition (ASR) in Catalan. The model is intended to transcribe audio files in Catalan to plain text without punctuation.
 ## How to Get Started with the Model
 To see an updated and functional version of this code, please see our [Notebook](https://colab.research.google.com/drive/1MHiPrffNTwiyWeUyMQvSdSbfkef_8aJC?usp=sharing)
 ### Installation
 In order to use this model, you may install [datasets](https://huggingface.co/docs/datasets/installation) and [transformers](https://huggingface.co/docs/transformers/installation):
 Create a virtual environment:
 ```bash
 python -m venv /path/to/venv
 ```
 Activate the environment:
 ```bash
 source /path/to/venv/bin/activate
 ```
 Install the modules:
 ```bash
 pip install datasets transformers 
 ```
 ### For Inference
 In order to transcribe audio in Catalan using this model, you can follow this example:
 ```bash
 #Install Prerequisites
 pip install torch
 pip install datasets
 pip install 'transformers[torch]'
 pip install evaluate
 pip install jiwer
 ```
 ```python
 #This code works with GPU
 #Note (November 2024): load_metric is no longer part of datasets,
 #so it is not imported here; the WER metric is loaded with evaluate's load instead.
 import torch
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 #Load the processor and model.
 MODEL_NAME="projecte-aina/whisper-large-v3-ca-3catparla"
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")
 #Load the dataset
 from datasets import load_dataset, Audio
 ds=load_dataset("projecte-aina/3catparla_asr",split='test')
 #Downsample to 16kHz
 ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
 #Process the dataset
 def map_to_pred(batch):
 	#Transcribe one example and store the normalized reference and prediction in it.
 	audio = batch["audio"]
 	input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
 	#The reference text goes through the same normalizer as the prediction below.
 	batch["reference"] = processor.tokenizer._normalize(batch['normalized_text'])
 	#Inference only: no gradients are needed for generation.
 	with torch.no_grad():
 		predicted_ids = model.generate(input_features.to("cuda"))[0]
 	transcription = processor.decode(predicted_ids)
 	batch["prediction"] = processor.tokenizer._normalize(transcription)
 	return batch
 #Do the evaluation
 result = ds.map(map_to_pred)
 #Compute the overall WER now.
 from evaluate import load
 wer = load("wer")
 #wer.compute result is multiplied by 100 to report a percentage.
 WER=100 * wer.compute(references=result["reference"], predictions=result["prediction"])
 print(WER)
 ```
 **Test Result**: 0.96
 ## Training Details
 ### Training data
 The specific dataset used to create the model is called ["3CatParla"](https://huggingface.co/datasets/projecte-aina/3catparla_asr).
 ### Training procedure
 This model is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) by following this [tutorial](https://huggingface.co/blog/fine-tune-whisper) provided by Hugging Face.
 ### Training Hyperparameters
 * language: catalan
 * hours of training audio: 710
 * learning rate: 1.95e-07
 * sample rate: 16000
 * train batch size: 32 (x4 GPUs)
  * gradient accumulation steps: 1
 * eval batch size: 32
 * save total limit: 3
 * max steps: 19842
 * warmup steps: 1984
 * eval steps: 3307
 * save steps: 3307
 * shuffle buffer size: 480
 ## Citation
 If this model contributes to your research, please cite the work:
 ```bibtex
@inproceedings{hernandez20243catparla,
  title={3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition},
  author={Hern{\'a}ndez Mena, Carlos Daniel and Armentano Oller, Carme and Solito, Sarah and K{\"u}lebi, Baybars},
  booktitle={Proc. IberSPEECH 2024},
  pages={176--180},
  year={2024}
 }
 ```
 <!--
@misc{mena2024whisperlarge3catparla,
      title={Acoustic Model in Catalan: whisper-large-v3-ca-3catparla.}, 
      author={Hernandez Mena, Carlos Daniel; Armentano-Oller, Carme; Solito, Sarah; Külebi, Baybars},
      organization={Barcelona Supercomputing Center},
      url={https://huggingface.co/projecte-aina/whisper-large-v3-ca-3catparla},
      year={2024}
 }
 -->
 ## Additional Information
 ### Author
 The fine-tuning process was performed during July 2024 in the [Language Technologies Unit](https://huggingface.co/BSC-LT) of the [Barcelona Supercomputing Center](https://www.bsc.es/) by [Carlos Daniel Hernández Mena](https://huggingface.co/carlosdanielhernandezmena).
 ### Contact
 For further information, please send an email to <langtech@bsc.es>.
 ### Copyright
 Copyright(c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
 ### License
 [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0)
 ### Funding
 This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
 The training of the model was possible thanks to the compute time provided by [Barcelona Supercomputing Center](https://www.bsc.es/) through MareNostrum 5.
--- a/added_tokens.json
+++ b/added_tokens.json
--- a/all_results.json
+++ b/all_results.json
@@ -0,0 +1,8 @@
 {
    "epoch": 11.083056143533918,
    "total_flos": 8.628884758428616e+21,
    "train_loss": 0.04815149868992161,
    "train_runtime": 124240.955,
    "train_samples_per_second": 20.442,
    "train_steps_per_second": 0.16
 }
--- a/config.json
+++ b/config.json
@@ -0,0 +1,52 @@
 {
  "_name_or_path": "/gpfs/projects/bsc88/speech/ASR/models/whisper-large-v3",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 32,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "forced_decoder_ids": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "max_length": 448,
  "max_source_positions": 1500,
  "max_target_positions": 448,
  "median_filter_width": 7,
  "model_type": "whisper",
  "num_hidden_layers": 32,
  "num_mel_bins": 128,
  "pad_token_id": 50256,
  "scale_embedding": false,
  "suppress_tokens": [],
  "torch_dtype": "float32",
  "transformers_version": "4.40.2",
  "use_cache": false,
  "use_weighted_layer_sum": false,
  "vocab_size": 51866
 }
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,266 @@
 {
  "alignment_heads": [
    [
      7,
      0
    ],
    [
      10,
      17
    ],
    [
      12,
      18
    ],
    [
      13,
      12
    ],
    [
      16,
      1
    ],
    [
      17,
      14
    ],
    [
      19,
      11
    ],
    [
      21,
      4
    ],
    [
      24,
      1
    ],
    [
      25,
      6
    ]
  ],
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "decoder_start_token_id": 50258,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      null
    ],
    [
      2,
      50360
    ]
  ],
  "is_multilingual": true,
  "lang_to_id": {
    "<|af|>": 50327,
    "<|am|>": 50334,
    "<|ar|>": 50272,
    "<|as|>": 50350,
    "<|az|>": 50304,
    "<|ba|>": 50355,
    "<|be|>": 50330,
    "<|bg|>": 50292,
    "<|bn|>": 50302,
    "<|bo|>": 50347,
    "<|br|>": 50309,
    "<|bs|>": 50315,
    "<|ca|>": 50270,
    "<|cs|>": 50283,
    "<|cy|>": 50297,
    "<|da|>": 50285,
    "<|de|>": 50261,
    "<|el|>": 50281,
    "<|en|>": 50259,
    "<|es|>": 50262,
    "<|et|>": 50307,
    "<|eu|>": 50310,
    "<|fa|>": 50300,
    "<|fi|>": 50277,
    "<|fo|>": 50338,
    "<|fr|>": 50265,
    "<|gl|>": 50319,
    "<|gu|>": 50333,
    "<|haw|>": 50352,
    "<|ha|>": 50354,
    "<|he|>": 50279,
    "<|hi|>": 50276,
    "<|hr|>": 50291,
    "<|ht|>": 50339,
    "<|hu|>": 50286,
    "<|hy|>": 50312,
    "<|id|>": 50275,
    "<|is|>": 50311,
    "<|it|>": 50274,
    "<|ja|>": 50266,
    "<|jw|>": 50356,
    "<|ka|>": 50329,
    "<|kk|>": 50316,
    "<|km|>": 50323,
    "<|kn|>": 50306,
    "<|ko|>": 50264,
    "<|la|>": 50294,
    "<|lb|>": 50345,
    "<|ln|>": 50353,
    "<|lo|>": 50336,
    "<|lt|>": 50293,
    "<|lv|>": 50301,
    "<|mg|>": 50349,
    "<|mi|>": 50295,
    "<|mk|>": 50308,
    "<|ml|>": 50296,
    "<|mn|>": 50314,
    "<|mr|>": 50320,
    "<|ms|>": 50282,
    "<|mt|>": 50343,
    "<|my|>": 50346,
    "<|ne|>": 50313,
    "<|nl|>": 50271,
    "<|nn|>": 50342,
    "<|no|>": 50288,
    "<|oc|>": 50328,
    "<|pa|>": 50321,
    "<|pl|>": 50269,
    "<|ps|>": 50340,
    "<|pt|>": 50267,
    "<|ro|>": 50284,
    "<|ru|>": 50263,
    "<|sa|>": 50344,
    "<|sd|>": 50332,
    "<|si|>": 50322,
    "<|sk|>": 50298,
    "<|sl|>": 50305,
    "<|sn|>": 50324,
    "<|so|>": 50326,
    "<|sq|>": 50317,
    "<|sr|>": 50303,
    "<|su|>": 50357,
    "<|sv|>": 50273,
    "<|sw|>": 50318,
    "<|ta|>": 50287,
    "<|te|>": 50299,
    "<|tg|>": 50331,
    "<|th|>": 50289,
    "<|tk|>": 50341,
    "<|tl|>": 50348,
    "<|tr|>": 50268,
    "<|tt|>": 50351,
    "<|uk|>": 50280,
    "<|ur|>": 50290,
    "<|uz|>": 50337,
    "<|vi|>": 50278,
    "<|yi|>": 50335,
    "<|yo|>": 50325,
    "<|yue|>": 50358,
    "<|zh|>": 50260
  },
  "language": "catalan",
  "max_initial_timestamp_index": 50,
  "max_length": 448,
  "no_timestamps_token_id": 50364,
  "pad_token_id": 50257,
  "prev_sot_token_id": 50362,
  "return_timestamps": false,
  "suppress_tokens": [
    1,
    2,
    7,
    8,
    9,
    10,
    14,
    25,
    26,
    27,
    28,
    29,
    31,
    58,
    59,
    60,
    61,
    62,
    63,
    90,
    91,
    92,
    93,
    359,
    503,
    522,
    542,
    873,
    893,
    902,
    918,
    922,
    931,
    1350,
    1853,
    1982,
    2460,
    2627,
    3246,
    3253,
    3268,
    3536,
    3846,
    3961,
    4183,
    4667,
    6585,
    6647,
    7273,
    9061,
    9383,
    10428,
    10929,
    11938,
    12033,
    12331,
    12562,
    13793,
    14157,
    14635,
    15265,
    15618,
    16553,
    16604,
    18362,
    18956,
    20075,
    21675,
    22520,
    26130,
    26161,
    26435,
    28279,
    29464,
    31650,
    32302,
    32470,
    36865,
    42863,
    47425,
    49870,
    50254,
    50258,
    50359,
    50360,
    50361,
    50362,
    50363
  ],
  "task_to_id": {
    "transcribe": 50360,
    "translate": 50359
  },
  "transformers_version": "4.40.2"
 }
--- a/merges.txt
+++ b/merges.txt
--- a/normalizer.json
+++ b/normalizer.json
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@@ -0,0 +1,14 @@
 {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 128,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
 }
--- a/pytorch_model-00001-of-00002.bin
+++ b/pytorch_model-00001-of-00002.bin
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:b5dd9863724f0133ef9e2fc7fe7c438c9d8728d6a31ef999b85a5099285d0cac
 size 4993686017
--- a/pytorch_model-00002-of-00002.bin
+++ b/pytorch_model-00002-of-00002.bin
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:ac4eb29e216f51ba613a65495e00e036a1ca27c11ea1892548c4b69dafbf87da
 size 1180727888
--- a/pytorch_model.bin.index.json
+++ b/pytorch_model.bin.index.json
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,139 @@
 {
  "additional_special_tokens": [
    "<|startoftranscript|>",
    "<|en|>",
    "<|zh|>",
    "<|de|>",
    "<|es|>",
    "<|ru|>",
    "<|ko|>",
    "<|fr|>",
    "<|ja|>",
    "<|pt|>",
    "<|tr|>",
    "<|pl|>",
    "<|ca|>",
    "<|nl|>",
    "<|ar|>",
    "<|sv|>",
    "<|it|>",
    "<|id|>",
    "<|hi|>",
    "<|fi|>",
    "<|vi|>",
    "<|he|>",
    "<|uk|>",
    "<|el|>",
    "<|ms|>",
    "<|cs|>",
    "<|ro|>",
    "<|da|>",
    "<|hu|>",
    "<|ta|>",
    "<|no|>",
    "<|th|>",
    "<|ur|>",
    "<|hr|>",
    "<|bg|>",
    "<|lt|>",
    "<|la|>",
    "<|mi|>",
    "<|ml|>",
    "<|cy|>",
    "<|sk|>",
    "<|te|>",
    "<|fa|>",
    "<|lv|>",
    "<|bn|>",
    "<|sr|>",
    "<|az|>",
    "<|sl|>",
    "<|kn|>",
    "<|et|>",
    "<|mk|>",
    "<|br|>",
    "<|eu|>",
    "<|is|>",
    "<|hy|>",
    "<|ne|>",
    "<|mn|>",
    "<|bs|>",
    "<|kk|>",
    "<|sq|>",
    "<|sw|>",
    "<|gl|>",
    "<|mr|>",
    "<|pa|>",
    "<|si|>",
    "<|km|>",
    "<|sn|>",
    "<|yo|>",
    "<|so|>",
    "<|af|>",
    "<|oc|>",
    "<|ka|>",
    "<|be|>",
    "<|tg|>",
    "<|sd|>",
    "<|gu|>",
    "<|am|>",
    "<|yi|>",
    "<|lo|>",
    "<|uz|>",
    "<|fo|>",
    "<|ht|>",
    "<|ps|>",
    "<|tk|>",
    "<|nn|>",
    "<|mt|>",
    "<|sa|>",
    "<|lb|>",
    "<|my|>",
    "<|bo|>",
    "<|tl|>",
    "<|mg|>",
    "<|as|>",
    "<|tt|>",
    "<|haw|>",
    "<|ln|>",
    "<|ha|>",
    "<|ba|>",
    "<|jw|>",
    "<|su|>",
    "<|yue|>",
    "<|translate|>",
    "<|transcribe|>",
    "<|startoflm|>",
    "<|startofprev|>",
    "<|nospeech|>",
    "<|notimestamps|>"
  ],
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
--- a/train_results.json
+++ b/train_results.json
@@ -0,0 +1,8 @@
 {
    "epoch": 11.083056143533918,
    "total_flos": 8.628884758428616e+21,
    "train_loss": 0.04815149868992161,
    "train_runtime": 124240.955,
    "train_samples_per_second": 20.442,
    "train_steps_per_second": 0.16
 }
--- a/trainer_state.json
+++ b/trainer_state.json
@@ -0,0 +1,357 @@
 {
  "best_metric": 0.8985657508208054,
  "best_model_checkpoint": "CHECKPOINTS/checkpoint-3307",
  "epoch": 11.083056143533918,
  "eval_steps": 3307,
  "global_step": 19842,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02519907267412559,
      "grad_norm": 0.7393302321434021,
      "learning_rate": 2.4899193548387098e-06,
      "loss": 0.3059,
      "step": 500
    },
    {
      "epoch": 0.05039814534825118,
      "grad_norm": 0.6863130331039429,
      "learning_rate": 5.010080645161291e-06,
      "loss": 0.1492,
      "step": 1000
    },
    {
      "epoch": 0.07559721802237677,
      "grad_norm": 0.703405499458313,
      "learning_rate": 7.5302419354838715e-06,
      "loss": 0.1428,
      "step": 1500
    },
    {
      "epoch": 1.017437758290495,
      "grad_norm": 0.6589027643203735,
      "learning_rate": 9.994400268787099e-06,
      "loss": 0.1261,
      "step": 2000
    },
    {
      "epoch": 1.0426368309646206,
      "grad_norm": 0.6031370759010315,
      "learning_rate": 9.71441370814201e-06,
      "loss": 0.1202,
      "step": 2500
    },
    {
      "epoch": 1.0678359036387461,
      "grad_norm": 0.6696850657463074,
      "learning_rate": 9.434427147496921e-06,
      "loss": 0.1171,
      "step": 3000
    },
    {
      "epoch": 1.0833081342606592,
      "eval_loss": 0.017358383163809776,
      "eval_runtime": 504.91,
      "eval_samples_per_second": 0.824,
      "eval_steps_per_second": 0.008,
      "eval_wer": 0.8985657508208054,
      "step": 3307
    },
    {
      "epoch": 2.0096764439068644,
      "grad_norm": 0.591613233089447,
      "learning_rate": 9.154440586851832e-06,
      "loss": 0.1042,
      "step": 3500
    },
    {
      "epoch": 2.03487551658099,
      "grad_norm": 0.5527406930923462,
      "learning_rate": 8.874454026206742e-06,
      "loss": 0.0861,
      "step": 4000
    },
    {
      "epoch": 2.0600745892551156,
      "grad_norm": 0.5651601552963257,
      "learning_rate": 8.594467465561653e-06,
      "loss": 0.0878,
      "step": 4500
    },
    {
      "epoch": 3.0019151295232334,
      "grad_norm": 0.4751633107662201,
      "learning_rate": 8.314480904916565e-06,
      "loss": 0.0865,
      "step": 5000
    },
    {
      "epoch": 3.027114202197359,
      "grad_norm": 0.5415444374084473,
      "learning_rate": 8.034494344271475e-06,
      "loss": 0.0625,
      "step": 5500
    },
    {
      "epoch": 3.0523132748714845,
      "grad_norm": 0.5294741988182068,
      "learning_rate": 7.754507783626388e-06,
      "loss": 0.0636,
      "step": 6000
    },
    {
      "epoch": 3.0775123475456105,
      "grad_norm": 0.538415789604187,
      "learning_rate": 7.474521222981298e-06,
      "loss": 0.0652,
      "step": 6500
    },
    {
      "epoch": 3.083257736115311,
      "eval_loss": 0.01661744900047779,
      "eval_runtime": 517.2779,
      "eval_samples_per_second": 0.804,
      "eval_steps_per_second": 0.008,
      "eval_wer": 0.9369659965823781,
      "step": 6614
    },
    {
      "epoch": 4.019352887813729,
      "grad_norm": 0.6536675691604614,
      "learning_rate": 7.194534662336209e-06,
      "loss": 0.0474,
      "step": 7000
    },
    {
      "epoch": 4.044551960487854,
      "grad_norm": 0.5821442604064941,
      "learning_rate": 6.91454810169112e-06,
      "loss": 0.0441,
      "step": 7500
    },
    {
      "epoch": 4.06975103316198,
      "grad_norm": 0.5527841448783875,
      "learning_rate": 6.6345615410460304e-06,
      "loss": 0.0449,
      "step": 8000
    },
    {
      "epoch": 5.011591573430098,
      "grad_norm": 0.453218549489975,
      "learning_rate": 6.354574980400942e-06,
      "loss": 0.0369,
      "step": 8500
    },
    {
      "epoch": 5.036790646104223,
      "grad_norm": 0.5013980865478516,
      "learning_rate": 6.074588419755852e-06,
      "loss": 0.0273,
      "step": 9000
    },
    {
      "epoch": 5.061989718778349,
      "grad_norm": 0.5885359644889832,
      "learning_rate": 5.7946018591107636e-06,
      "loss": 0.0288,
      "step": 9500
    },
    {
      "epoch": 5.083207337969963,
      "eval_loss": 0.017346344888210297,
      "eval_runtime": 530.4397,
      "eval_samples_per_second": 0.784,
      "eval_steps_per_second": 0.008,
      "eval_wer": 1.0060864389532094,
      "step": 9921
    },
    {
      "epoch": 6.003830259046467,
      "grad_norm": 0.4446285665035248,
      "learning_rate": 5.514615298465674e-06,
      "loss": 0.0276,
      "step": 10000
    },
    {
      "epoch": 6.029029331720593,
      "grad_norm": 0.46283265948295593,
      "learning_rate": 5.234628737820585e-06,
      "loss": 0.016,
      "step": 10500
    },
    {
      "epoch": 6.054228404394718,
      "grad_norm": 0.42813611030578613,
      "learning_rate": 4.954642177175496e-06,
      "loss": 0.0173,
      "step": 11000
    },
    {
      "epoch": 6.079427477068844,
      "grad_norm": 0.48960232734680176,
      "learning_rate": 4.674655616530407e-06,
      "loss": 0.018,
      "step": 11500
    },
    {
      "epoch": 7.021268017336962,
      "grad_norm": 0.3988407254219055,
      "learning_rate": 4.394669055885318e-06,
      "loss": 0.0109,
      "step": 12000
    },
    {
      "epoch": 7.046467090011087,
      "grad_norm": 0.3698909282684326,
      "learning_rate": 4.114682495240229e-06,
      "loss": 0.0101,
      "step": 12500
    },
    {
      "epoch": 7.071666162685213,
      "grad_norm": 0.4073663055896759,
      "learning_rate": 3.8346959345951395e-06,
      "loss": 0.0109,
      "step": 13000
    },
    {
      "epoch": 7.083156939824614,
      "eval_loss": 0.0192726943641901,
      "eval_runtime": 518.0983,
      "eval_samples_per_second": 0.803,
      "eval_steps_per_second": 0.008,
      "eval_wer": 0.9907263406485801,
      "step": 13228
    },
    {
      "epoch": 8.01350670295333,
      "grad_norm": 0.33067503571510315,
      "learning_rate": 3.5547093739500504e-06,
      "loss": 0.0082,
      "step": 13500
    },
    {
      "epoch": 8.038705775627458,
      "grad_norm": 0.3731881380081177,
      "learning_rate": 3.2747228133049617e-06,
      "loss": 0.0062,
      "step": 14000
    },
    {
      "epoch": 8.063904848301583,
      "grad_norm": 0.2673242688179016,
      "learning_rate": 2.9947362526598727e-06,
      "loss": 0.0066,
      "step": 14500
    },
    {
      "epoch": 9.0057453885697,
      "grad_norm": 0.18087884783744812,
      "learning_rate": 2.7147496920147836e-06,
      "loss": 0.0062,
      "step": 15000
    },
    {
      "epoch": 9.030944461243827,
      "grad_norm": 0.250787615776062,
      "learning_rate": 2.4347631313696945e-06,
      "loss": 0.0038,
      "step": 15500
    },
    {
      "epoch": 9.056143533917952,
      "grad_norm": 0.2255438268184662,
      "learning_rate": 2.1547765707246054e-06,
      "loss": 0.0041,
      "step": 16000
    },
    {
      "epoch": 9.081342606592077,
      "grad_norm": 0.29366812109947205,
      "learning_rate": 1.8747900100795163e-06,
      "loss": 0.0044,
      "step": 16500
    },
    {
      "epoch": 9.083106541679266,
      "eval_loss": 0.020848926156759262,
      "eval_runtime": 513.0393,
      "eval_samples_per_second": 0.811,
      "eval_steps_per_second": 0.008,
      "eval_wer": 0.9772862546320297,
      "step": 16535
    },
    {
      "epoch": 10.023183146860196,
      "grad_norm": 0.1837795376777649,
      "learning_rate": 1.5948034494344272e-06,
      "loss": 0.0026,
      "step": 17000
    },
    {
      "epoch": 10.048382219534322,
      "grad_norm": 0.24585728347301483,
      "learning_rate": 1.314816888789338e-06,
      "loss": 0.0026,
      "step": 17500
    },
    {
      "epoch": 10.073581292208447,
      "grad_norm": 0.15548868477344513,
      "learning_rate": 1.0348303281442492e-06,
      "loss": 0.0026,
      "step": 18000
    },
    {
      "epoch": 11.015421832476564,
      "grad_norm": 0.08757825195789337,
      "learning_rate": 7.554037406204502e-07,
      "loss": 0.002,
      "step": 18500
    },
    {
      "epoch": 11.040620905150691,
      "grad_norm": 0.1045205295085907,
      "learning_rate": 4.7541717997536123e-07,
      "loss": 0.0016,
      "step": 19000
    },
    {
      "epoch": 11.065819977824816,
      "grad_norm": 0.07768367975950241,
      "learning_rate": 1.9543061933027217e-07,
      "loss": 0.0016,
      "step": 19500
    },
    {
      "epoch": 11.083056143533918,
      "eval_loss": 0.02412882074713707,
      "eval_runtime": 518.0169,
      "eval_samples_per_second": 0.803,
      "eval_steps_per_second": 0.008,
      "eval_wer": 0.9926463529366589,
      "step": 19842
    },
    {
      "epoch": 11.083056143533918,
      "step": 19842,
      "total_flos": 8.628884758428616e+21,
      "train_loss": 0.04815149868992161,
      "train_runtime": 124240.955,
      "train_samples_per_second": 20.442,
      "train_steps_per_second": 0.16
    }
  ],
  "logging_steps": 500,
  "max_steps": 19842,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 3307,
  "total_flos": 8.628884758428616e+21,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
 }
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:74768d83badcff2bde356dd7f72dc7ffe861663843fa21b5a97e286a5eae8f3e
 size 5176
--- a/vocab.json
+++ b/vocab.json