初始化项目,由ModelHub XC社区提供模型

Model: projecte-aina/whisper-large-v3-ca-3catparla
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-13 17:57:28 +08:00
commit fe20ba2ad8
18 changed files with 119141 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

377
README.md Normal file
View File

@@ -0,0 +1,377 @@
---
# Hugging Face model-card metadata for whisper-large-v3-ca-3catparla.
language: ca
datasets:
- projecte-aina/3catparla_asr
tags:
- audio
- automatic-speech-recognition
- catalan
- whisper-large-v3
- projecte-aina
- barcelona-supercomputing-center
- bsc
license: apache-2.0
# One entry per evaluated dataset split; each entry reports a single WER metric.
model-index:
- name: whisper-large-v3-ca-3catparla
  results:
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: 3CatParla (Test)
      type: projecte-aina/3catparla_asr
      split: test
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 0.96
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: 3CatParla (Dev)
      type: projecte-aina/3catparla_asr
      split: dev
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 0.92
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 17.0 (Test)
      type: mozilla-foundation/common_voice_17_0
      split: test
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 10.32
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 17.0 (Dev)
      type: mozilla-foundation/common_voice_17_0
      split: validation
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.26
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Balearic fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Balearic female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 12.25
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Balearic male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Balearic male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 12.18
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Central fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Central female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.51
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Central male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Central male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.73
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northern fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northern female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.09
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northern male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northern male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.28
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northwestern fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northwestern female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 7.88
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Northwestern male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Northwestern male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 8.44
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Valencian fem)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Valencian female
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.58
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: CV Benchmark Catalan Accents (Valencian male)
      type: projecte-aina/commonvoice_benchmark_catalan_accents
      split: Valencian male
      args:
        language: ca
    metrics:
    - name: WER
      type: wer
      value: 9.1
library_name: transformers
---
# whisper-large-v3-ca-3catparla
## Table of Contents
<details>
<summary>Click to expand</summary>
- [Paper](#paper)
- [Model Description](#model-description)
- [Intended Uses and Limitations](#intended-uses-and-limitations)
- [How to Get Started with the Model](#how-to-get-started-with-the-model)
- [Training Details](#training-details)
- [Citation](#citation)
- [Additional Information](#additional-information)
</details>
## Paper
**PDF:** [3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition](https://www.isca-archive.org/iberspeech_2024/hernandezmena24_iberspeech.pdf)
## Model Description
The "whisper-large-v3-ca-3catparla" is an acoustic model suitable for Automatic Speech Recognition in Catalan. It is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) with 710 hours of Catalan data released by the [Projecte AINA](https://projecteaina.cat/) from Barcelona, Spain.
## Intended Uses and Limitations
This model can be used for Automatic Speech Recognition (ASR) in Catalan. The model is intended to transcribe audio files in Catalan to plain text without punctuation.
## How to Get Started with the Model
To see an updated and functional version of this code, please see our [Notebook](https://colab.research.google.com/drive/1MHiPrffNTwiyWeUyMQvSdSbfkef_8aJC?usp=sharing).
### Installation
In order to use this model, you may install [datasets](https://huggingface.co/docs/datasets/installation) and [transformers](https://huggingface.co/docs/transformers/installation):
Create a virtual environment:
```bash
python -m venv /path/to/venv
```
Activate the environment:
```bash
source /path/to/venv/bin/activate
```
Install the modules:
```bash
pip install datasets transformers
```
### For Inference
In order to transcribe audio in Catalan using this model, you can follow this example:
```bash
#Install Prerequisites
pip install torch
pip install datasets
pip install 'transformers[torch]'
pip install evaluate
pip install jiwer
```
```python
#This code requires a CUDA-capable GPU: the model and the inputs are moved to "cuda".
#Notice that: load_metric is no longer part of datasets,
#so the WER metric is loaded with evaluate's load instead.
#(Note from November 2024)
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

#Load the processor and model.
MODEL_NAME="projecte-aina/whisper-large-v3-ca-3catparla"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")

#Load the dataset
from datasets import load_dataset, Audio
ds=load_dataset("projecte-aina/3catparla_asr",split='test')

#Downsample to 16kHz
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

#Process the dataset
def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized reference/prediction.

    Reads batch["audio"] (its "array" and "sampling_rate" entries) and
    batch["normalized_text"]; adds the keys "reference" and "prediction",
    both passed through the tokenizer's normalizer, and returns the batch.
    """
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
    batch["reference"] = processor.tokenizer._normalize(batch['normalized_text'])

    #Inference only: no gradients are needed for generation.
    with torch.no_grad():
        #generate() returns one sequence per input; keep the only one.
        predicted_ids = model.generate(input_features.to("cuda"))[0]
    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch

#Do the evaluation
result = ds.map(map_to_pred)

#Compute the overall WER now (scaled by 100, i.e. reported as a percentage).
from evaluate import load
wer = load("wer")
WER=100 * wer.compute(references=result["reference"], predictions=result["prediction"])
print(WER)
```
**Test Result**: 0.96
## Training Details
### Training data
The specific dataset used to create the model is called ["3CatParla"](https://huggingface.co/datasets/projecte-aina/3catparla_asr).
### Training procedure
This model is the result of finetuning the model ["openai/whisper-large-v3"](https://huggingface.co/openai/whisper-large-v3) by following this [tutorial](https://huggingface.co/blog/fine-tune-whisper) provided by Hugging Face.
### Training Hyperparameters
* language: catalan
* hours of training audio: 710
* learning rate: 1e-05 peak after warmup, decayed to 1.95e-07 by the last logged training step
* sample rate: 16000
* train batch size: 32 (x4 GPUs)
* gradient accumulation steps: 1
* eval batch size: 32
* save total limit: 3
* max steps: 19842
* warmup steps: 1984
* eval steps: 3307
* save steps: 3307
* shuffle buffer size: 480
## Citation
If this model contributes to your research, please cite the work:
```bibtex
@inproceedings{hernandez20243catparla,
title={3CatParla: A New Open-Source Corpus of Broadcast TV in Catalan for Automatic Speech Recognition},
author={Hern{\'a}ndez Mena, Carlos Daniel and Armentano Oller, Carme and Solito, Sarah and K{\"u}lebi, Baybars},
booktitle={Proc. IberSPEECH 2024},
pages={176--180},
year={2024}
}
```
<!--
@misc{mena2024whisperlarge3catparla,
title={Acoustic Model in Catalan: whisper-large-v3-ca-3catparla.},
author={Hernandez Mena, Carlos Daniel; Armentano-Oller, Carme; Solito, Sarah; Külebi, Baybars},
organization={Barcelona Supercomputing Center},
url={https://huggingface.co/projecte-aina/whisper-large-v3-ca-3catparla},
year={2024}
}
-->
## Additional Information
### Author
The fine-tuning process was performed in July 2024 in the [Language Technologies Unit](https://huggingface.co/BSC-LT) of the [Barcelona Supercomputing Center](https://www.bsc.es/) by [Carlos Daniel Hernández Mena](https://huggingface.co/carlosdanielhernandezmena).
### Contact
For further information, please send an email to <langtech@bsc.es>.
### Copyright
Copyright(c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
### License
[Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0)
### Funding
This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
The training of the model was possible thanks to the compute time provided by [Barcelona Supercomputing Center](https://www.bsc.es/) through MareNostrum 5.

1611
added_tokens.json Normal file

File diff suppressed because it is too large Load Diff

8
all_results.json Normal file
View File

@@ -0,0 +1,8 @@
{
"epoch": 11.083056143533918,
"total_flos": 8.628884758428616e+21,
"train_loss": 0.04815149868992161,
"train_runtime": 124240.955,
"train_samples_per_second": 20.442,
"train_steps_per_second": 0.16
}

52
config.json Normal file
View File

@@ -0,0 +1,52 @@
{
"_name_or_path": "/gpfs/projects/bsc88/speech/ASR/models/whisper-large-v3",
"activation_dropout": 0.0,
"activation_function": "gelu",
"apply_spec_augment": false,
"architectures": [
"WhisperForConditionalGeneration"
],
"attention_dropout": 0.0,
"begin_suppress_tokens": [
220,
50257
],
"bos_token_id": 50257,
"classifier_proj_size": 256,
"d_model": 1280,
"decoder_attention_heads": 20,
"decoder_ffn_dim": 5120,
"decoder_layerdrop": 0.0,
"decoder_layers": 32,
"decoder_start_token_id": 50258,
"dropout": 0.0,
"encoder_attention_heads": 20,
"encoder_ffn_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 32,
"eos_token_id": 50257,
"forced_decoder_ids": null,
"init_std": 0.02,
"is_encoder_decoder": true,
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_prob": 0.05,
"max_length": 448,
"max_source_positions": 1500,
"max_target_positions": 448,
"median_filter_width": 7,
"model_type": "whisper",
"num_hidden_layers": 32,
"num_mel_bins": 128,
"pad_token_id": 50256,
"scale_embedding": false,
"suppress_tokens": [],
"torch_dtype": "float32",
"transformers_version": "4.40.2",
"use_cache": false,
"use_weighted_layer_sum": false,
"vocab_size": 51866
}

266
generation_config.json Normal file
View File

@@ -0,0 +1,266 @@
{
"alignment_heads": [
[
7,
0
],
[
10,
17
],
[
12,
18
],
[
13,
12
],
[
16,
1
],
[
17,
14
],
[
19,
11
],
[
21,
4
],
[
24,
1
],
[
25,
6
]
],
"begin_suppress_tokens": [
220,
50257
],
"bos_token_id": 50257,
"decoder_start_token_id": 50258,
"eos_token_id": 50257,
"forced_decoder_ids": [
[
1,
null
],
[
2,
50360
]
],
"is_multilingual": true,
"lang_to_id": {
"<|af|>": 50327,
"<|am|>": 50334,
"<|ar|>": 50272,
"<|as|>": 50350,
"<|az|>": 50304,
"<|ba|>": 50355,
"<|be|>": 50330,
"<|bg|>": 50292,
"<|bn|>": 50302,
"<|bo|>": 50347,
"<|br|>": 50309,
"<|bs|>": 50315,
"<|ca|>": 50270,
"<|cs|>": 50283,
"<|cy|>": 50297,
"<|da|>": 50285,
"<|de|>": 50261,
"<|el|>": 50281,
"<|en|>": 50259,
"<|es|>": 50262,
"<|et|>": 50307,
"<|eu|>": 50310,
"<|fa|>": 50300,
"<|fi|>": 50277,
"<|fo|>": 50338,
"<|fr|>": 50265,
"<|gl|>": 50319,
"<|gu|>": 50333,
"<|haw|>": 50352,
"<|ha|>": 50354,
"<|he|>": 50279,
"<|hi|>": 50276,
"<|hr|>": 50291,
"<|ht|>": 50339,
"<|hu|>": 50286,
"<|hy|>": 50312,
"<|id|>": 50275,
"<|is|>": 50311,
"<|it|>": 50274,
"<|ja|>": 50266,
"<|jw|>": 50356,
"<|ka|>": 50329,
"<|kk|>": 50316,
"<|km|>": 50323,
"<|kn|>": 50306,
"<|ko|>": 50264,
"<|la|>": 50294,
"<|lb|>": 50345,
"<|ln|>": 50353,
"<|lo|>": 50336,
"<|lt|>": 50293,
"<|lv|>": 50301,
"<|mg|>": 50349,
"<|mi|>": 50295,
"<|mk|>": 50308,
"<|ml|>": 50296,
"<|mn|>": 50314,
"<|mr|>": 50320,
"<|ms|>": 50282,
"<|mt|>": 50343,
"<|my|>": 50346,
"<|ne|>": 50313,
"<|nl|>": 50271,
"<|nn|>": 50342,
"<|no|>": 50288,
"<|oc|>": 50328,
"<|pa|>": 50321,
"<|pl|>": 50269,
"<|ps|>": 50340,
"<|pt|>": 50267,
"<|ro|>": 50284,
"<|ru|>": 50263,
"<|sa|>": 50344,
"<|sd|>": 50332,
"<|si|>": 50322,
"<|sk|>": 50298,
"<|sl|>": 50305,
"<|sn|>": 50324,
"<|so|>": 50326,
"<|sq|>": 50317,
"<|sr|>": 50303,
"<|su|>": 50357,
"<|sv|>": 50273,
"<|sw|>": 50318,
"<|ta|>": 50287,
"<|te|>": 50299,
"<|tg|>": 50331,
"<|th|>": 50289,
"<|tk|>": 50341,
"<|tl|>": 50348,
"<|tr|>": 50268,
"<|tt|>": 50351,
"<|uk|>": 50280,
"<|ur|>": 50290,
"<|uz|>": 50337,
"<|vi|>": 50278,
"<|yi|>": 50335,
"<|yo|>": 50325,
"<|yue|>": 50358,
"<|zh|>": 50260
},
"language": "catalan",
"max_initial_timestamp_index": 50,
"max_length": 448,
"no_timestamps_token_id": 50364,
"pad_token_id": 50257,
"prev_sot_token_id": 50362,
"return_timestamps": false,
"suppress_tokens": [
1,
2,
7,
8,
9,
10,
14,
25,
26,
27,
28,
29,
31,
58,
59,
60,
61,
62,
63,
90,
91,
92,
93,
359,
503,
522,
542,
873,
893,
902,
918,
922,
931,
1350,
1853,
1982,
2460,
2627,
3246,
3253,
3268,
3536,
3846,
3961,
4183,
4667,
6585,
6647,
7273,
9061,
9383,
10428,
10929,
11938,
12033,
12331,
12562,
13793,
14157,
14635,
15265,
15618,
16553,
16604,
18362,
18956,
20075,
21675,
22520,
26130,
26161,
26435,
28279,
29464,
31650,
32302,
32470,
36865,
42863,
47425,
49870,
50254,
50258,
50359,
50360,
50361,
50362,
50363
],
"task_to_id": {
"transcribe": 50360,
"translate": 50359
},
"transformers_version": "4.40.2"
}

50001
merges.txt Normal file

File diff suppressed because it is too large Load Diff

1742
normalizer.json Normal file

File diff suppressed because it is too large Load Diff

14
preprocessor_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"chunk_length": 30,
"feature_extractor_type": "WhisperFeatureExtractor",
"feature_size": 128,
"hop_length": 160,
"n_fft": 400,
"n_samples": 480000,
"nb_max_frames": 3000,
"padding_side": "right",
"padding_value": 0.0,
"processor_class": "WhisperProcessor",
"return_attention_mask": false,
"sampling_rate": 16000
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b5dd9863724f0133ef9e2fc7fe7c438c9d8728d6a31ef999b85a5099285d0cac
size 4993686017

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac4eb29e216f51ba613a65495e00e036a1ca27c11ea1892548c4b69dafbf87da
size 1180727888

1267
pytorch_model.bin.index.json Normal file

File diff suppressed because it is too large Load Diff

139
special_tokens_map.json Normal file
View File

@@ -0,0 +1,139 @@
{
"additional_special_tokens": [
"<|startoftranscript|>",
"<|en|>",
"<|zh|>",
"<|de|>",
"<|es|>",
"<|ru|>",
"<|ko|>",
"<|fr|>",
"<|ja|>",
"<|pt|>",
"<|tr|>",
"<|pl|>",
"<|ca|>",
"<|nl|>",
"<|ar|>",
"<|sv|>",
"<|it|>",
"<|id|>",
"<|hi|>",
"<|fi|>",
"<|vi|>",
"<|he|>",
"<|uk|>",
"<|el|>",
"<|ms|>",
"<|cs|>",
"<|ro|>",
"<|da|>",
"<|hu|>",
"<|ta|>",
"<|no|>",
"<|th|>",
"<|ur|>",
"<|hr|>",
"<|bg|>",
"<|lt|>",
"<|la|>",
"<|mi|>",
"<|ml|>",
"<|cy|>",
"<|sk|>",
"<|te|>",
"<|fa|>",
"<|lv|>",
"<|bn|>",
"<|sr|>",
"<|az|>",
"<|sl|>",
"<|kn|>",
"<|et|>",
"<|mk|>",
"<|br|>",
"<|eu|>",
"<|is|>",
"<|hy|>",
"<|ne|>",
"<|mn|>",
"<|bs|>",
"<|kk|>",
"<|sq|>",
"<|sw|>",
"<|gl|>",
"<|mr|>",
"<|pa|>",
"<|si|>",
"<|km|>",
"<|sn|>",
"<|yo|>",
"<|so|>",
"<|af|>",
"<|oc|>",
"<|ka|>",
"<|be|>",
"<|tg|>",
"<|sd|>",
"<|gu|>",
"<|am|>",
"<|yi|>",
"<|lo|>",
"<|uz|>",
"<|fo|>",
"<|ht|>",
"<|ps|>",
"<|tk|>",
"<|nn|>",
"<|mt|>",
"<|sa|>",
"<|lb|>",
"<|my|>",
"<|bo|>",
"<|tl|>",
"<|mg|>",
"<|as|>",
"<|tt|>",
"<|haw|>",
"<|ln|>",
"<|ha|>",
"<|ba|>",
"<|jw|>",
"<|su|>",
"<|yue|>",
"<|translate|>",
"<|transcribe|>",
"<|startoflm|>",
"<|startofprev|>",
"<|nospeech|>",
"<|notimestamps|>"
],
"bos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

12996
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

8
train_results.json Normal file
View File

@@ -0,0 +1,8 @@
{
"epoch": 11.083056143533918,
"total_flos": 8.628884758428616e+21,
"train_loss": 0.04815149868992161,
"train_runtime": 124240.955,
"train_samples_per_second": 20.442,
"train_steps_per_second": 0.16
}

357
trainer_state.json Normal file
View File

@@ -0,0 +1,357 @@
{
"best_metric": 0.8985657508208054,
"best_model_checkpoint": "CHECKPOINTS/checkpoint-3307",
"epoch": 11.083056143533918,
"eval_steps": 3307,
"global_step": 19842,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02519907267412559,
"grad_norm": 0.7393302321434021,
"learning_rate": 2.4899193548387098e-06,
"loss": 0.3059,
"step": 500
},
{
"epoch": 0.05039814534825118,
"grad_norm": 0.6863130331039429,
"learning_rate": 5.010080645161291e-06,
"loss": 0.1492,
"step": 1000
},
{
"epoch": 0.07559721802237677,
"grad_norm": 0.703405499458313,
"learning_rate": 7.5302419354838715e-06,
"loss": 0.1428,
"step": 1500
},
{
"epoch": 1.017437758290495,
"grad_norm": 0.6589027643203735,
"learning_rate": 9.994400268787099e-06,
"loss": 0.1261,
"step": 2000
},
{
"epoch": 1.0426368309646206,
"grad_norm": 0.6031370759010315,
"learning_rate": 9.71441370814201e-06,
"loss": 0.1202,
"step": 2500
},
{
"epoch": 1.0678359036387461,
"grad_norm": 0.6696850657463074,
"learning_rate": 9.434427147496921e-06,
"loss": 0.1171,
"step": 3000
},
{
"epoch": 1.0833081342606592,
"eval_loss": 0.017358383163809776,
"eval_runtime": 504.91,
"eval_samples_per_second": 0.824,
"eval_steps_per_second": 0.008,
"eval_wer": 0.8985657508208054,
"step": 3307
},
{
"epoch": 2.0096764439068644,
"grad_norm": 0.591613233089447,
"learning_rate": 9.154440586851832e-06,
"loss": 0.1042,
"step": 3500
},
{
"epoch": 2.03487551658099,
"grad_norm": 0.5527406930923462,
"learning_rate": 8.874454026206742e-06,
"loss": 0.0861,
"step": 4000
},
{
"epoch": 2.0600745892551156,
"grad_norm": 0.5651601552963257,
"learning_rate": 8.594467465561653e-06,
"loss": 0.0878,
"step": 4500
},
{
"epoch": 3.0019151295232334,
"grad_norm": 0.4751633107662201,
"learning_rate": 8.314480904916565e-06,
"loss": 0.0865,
"step": 5000
},
{
"epoch": 3.027114202197359,
"grad_norm": 0.5415444374084473,
"learning_rate": 8.034494344271475e-06,
"loss": 0.0625,
"step": 5500
},
{
"epoch": 3.0523132748714845,
"grad_norm": 0.5294741988182068,
"learning_rate": 7.754507783626388e-06,
"loss": 0.0636,
"step": 6000
},
{
"epoch": 3.0775123475456105,
"grad_norm": 0.538415789604187,
"learning_rate": 7.474521222981298e-06,
"loss": 0.0652,
"step": 6500
},
{
"epoch": 3.083257736115311,
"eval_loss": 0.01661744900047779,
"eval_runtime": 517.2779,
"eval_samples_per_second": 0.804,
"eval_steps_per_second": 0.008,
"eval_wer": 0.9369659965823781,
"step": 6614
},
{
"epoch": 4.019352887813729,
"grad_norm": 0.6536675691604614,
"learning_rate": 7.194534662336209e-06,
"loss": 0.0474,
"step": 7000
},
{
"epoch": 4.044551960487854,
"grad_norm": 0.5821442604064941,
"learning_rate": 6.91454810169112e-06,
"loss": 0.0441,
"step": 7500
},
{
"epoch": 4.06975103316198,
"grad_norm": 0.5527841448783875,
"learning_rate": 6.6345615410460304e-06,
"loss": 0.0449,
"step": 8000
},
{
"epoch": 5.011591573430098,
"grad_norm": 0.453218549489975,
"learning_rate": 6.354574980400942e-06,
"loss": 0.0369,
"step": 8500
},
{
"epoch": 5.036790646104223,
"grad_norm": 0.5013980865478516,
"learning_rate": 6.074588419755852e-06,
"loss": 0.0273,
"step": 9000
},
{
"epoch": 5.061989718778349,
"grad_norm": 0.5885359644889832,
"learning_rate": 5.7946018591107636e-06,
"loss": 0.0288,
"step": 9500
},
{
"epoch": 5.083207337969963,
"eval_loss": 0.017346344888210297,
"eval_runtime": 530.4397,
"eval_samples_per_second": 0.784,
"eval_steps_per_second": 0.008,
"eval_wer": 1.0060864389532094,
"step": 9921
},
{
"epoch": 6.003830259046467,
"grad_norm": 0.4446285665035248,
"learning_rate": 5.514615298465674e-06,
"loss": 0.0276,
"step": 10000
},
{
"epoch": 6.029029331720593,
"grad_norm": 0.46283265948295593,
"learning_rate": 5.234628737820585e-06,
"loss": 0.016,
"step": 10500
},
{
"epoch": 6.054228404394718,
"grad_norm": 0.42813611030578613,
"learning_rate": 4.954642177175496e-06,
"loss": 0.0173,
"step": 11000
},
{
"epoch": 6.079427477068844,
"grad_norm": 0.48960232734680176,
"learning_rate": 4.674655616530407e-06,
"loss": 0.018,
"step": 11500
},
{
"epoch": 7.021268017336962,
"grad_norm": 0.3988407254219055,
"learning_rate": 4.394669055885318e-06,
"loss": 0.0109,
"step": 12000
},
{
"epoch": 7.046467090011087,
"grad_norm": 0.3698909282684326,
"learning_rate": 4.114682495240229e-06,
"loss": 0.0101,
"step": 12500
},
{
"epoch": 7.071666162685213,
"grad_norm": 0.4073663055896759,
"learning_rate": 3.8346959345951395e-06,
"loss": 0.0109,
"step": 13000
},
{
"epoch": 7.083156939824614,
"eval_loss": 0.0192726943641901,
"eval_runtime": 518.0983,
"eval_samples_per_second": 0.803,
"eval_steps_per_second": 0.008,
"eval_wer": 0.9907263406485801,
"step": 13228
},
{
"epoch": 8.01350670295333,
"grad_norm": 0.33067503571510315,
"learning_rate": 3.5547093739500504e-06,
"loss": 0.0082,
"step": 13500
},
{
"epoch": 8.038705775627458,
"grad_norm": 0.3731881380081177,
"learning_rate": 3.2747228133049617e-06,
"loss": 0.0062,
"step": 14000
},
{
"epoch": 8.063904848301583,
"grad_norm": 0.2673242688179016,
"learning_rate": 2.9947362526598727e-06,
"loss": 0.0066,
"step": 14500
},
{
"epoch": 9.0057453885697,
"grad_norm": 0.18087884783744812,
"learning_rate": 2.7147496920147836e-06,
"loss": 0.0062,
"step": 15000
},
{
"epoch": 9.030944461243827,
"grad_norm": 0.250787615776062,
"learning_rate": 2.4347631313696945e-06,
"loss": 0.0038,
"step": 15500
},
{
"epoch": 9.056143533917952,
"grad_norm": 0.2255438268184662,
"learning_rate": 2.1547765707246054e-06,
"loss": 0.0041,
"step": 16000
},
{
"epoch": 9.081342606592077,
"grad_norm": 0.29366812109947205,
"learning_rate": 1.8747900100795163e-06,
"loss": 0.0044,
"step": 16500
},
{
"epoch": 9.083106541679266,
"eval_loss": 0.020848926156759262,
"eval_runtime": 513.0393,
"eval_samples_per_second": 0.811,
"eval_steps_per_second": 0.008,
"eval_wer": 0.9772862546320297,
"step": 16535
},
{
"epoch": 10.023183146860196,
"grad_norm": 0.1837795376777649,
"learning_rate": 1.5948034494344272e-06,
"loss": 0.0026,
"step": 17000
},
{
"epoch": 10.048382219534322,
"grad_norm": 0.24585728347301483,
"learning_rate": 1.314816888789338e-06,
"loss": 0.0026,
"step": 17500
},
{
"epoch": 10.073581292208447,
"grad_norm": 0.15548868477344513,
"learning_rate": 1.0348303281442492e-06,
"loss": 0.0026,
"step": 18000
},
{
"epoch": 11.015421832476564,
"grad_norm": 0.08757825195789337,
"learning_rate": 7.554037406204502e-07,
"loss": 0.002,
"step": 18500
},
{
"epoch": 11.040620905150691,
"grad_norm": 0.1045205295085907,
"learning_rate": 4.7541717997536123e-07,
"loss": 0.0016,
"step": 19000
},
{
"epoch": 11.065819977824816,
"grad_norm": 0.07768367975950241,
"learning_rate": 1.9543061933027217e-07,
"loss": 0.0016,
"step": 19500
},
{
"epoch": 11.083056143533918,
"eval_loss": 0.02412882074713707,
"eval_runtime": 518.0169,
"eval_samples_per_second": 0.803,
"eval_steps_per_second": 0.008,
"eval_wer": 0.9926463529366589,
"step": 19842
},
{
"epoch": 11.083056143533918,
"step": 19842,
"total_flos": 8.628884758428616e+21,
"train_loss": 0.04815149868992161,
"train_runtime": 124240.955,
"train_samples_per_second": 20.442,
"train_steps_per_second": 0.16
}
],
"logging_steps": 500,
"max_steps": 19842,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 3307,
"total_flos": 8.628884758428616e+21,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74768d83badcff2bde356dd7f72dc7ffe861663843fa21b5a97e286a5eae8f3e
size 5176

50259
vocab.json Normal file

File diff suppressed because it is too large Load Diff