commit 818196607519d11acb3afc933b79517243858be7 Author: ModelHub XC Date: Tue May 12 22:56:36 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: GetmanY1/wav2vec2-large-fi-150k-finetuned Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..ade4065 --- /dev/null +++ 
b/README.md @@ -0,0 +1,170 @@ +--- +license: apache-2.0 +tags: +- automatic-speech-recognition +- fi +- finnish +library_name: transformers +language: fi +base_model: +- GetmanY1/wav2vec2-large-fi-150k +model-index: + - name: wav2vec2-large-fi-150k-finetuned + results: + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: Lahjoita puhetta (Donate Speech) + type: lahjoita-puhetta + args: fi + metrics: + - name: Dev WER + type: wer + value: 15.34 + - name: Dev CER + type: cer + value: 4.14 + - name: Test WER + type: wer + value: 16.86 + - name: Test CER + type: cer + value: 5.07 + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: Finnish Parliament + type: FinParl + args: fi + metrics: + - name: Dev16 WER + type: wer + value: 11.3 + - name: Dev16 CER + type: cer + value: 4.75 + - name: Test16 WER + type: wer + value: 8.29 + - name: Test16 CER + type: cer + value: 3.34 + - name: Test20 WER + type: wer + value: 6.94 + - name: Test20 CER + type: cer + value: 2.15 + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: Common Voice 16.1 + type: mozilla-foundation/common_voice_16_1 + args: fi + metrics: + - name: Dev WER + type: wer + value: 7.17 + - name: Dev CER + type: cer + value: 1.11 + - name: Test WER + type: wer + value: 5.86 + - name: Test CER + type: cer + value: 0.91 + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: FLEURS + type: google/fleurs + args: fi_fi + metrics: + - name: Dev WER + type: wer + value: 9.2 + - name: Dev CER + type: cer + value: 5.23 + - name: Test WER + type: wer + value: 10.69 + - name: Test CER + type: cer + value: 5.79 +--- + +# Finnish Wav2vec2-Large ASR + +[GetmanY1/wav2vec2-large-fi-150k](https://huggingface.co/GetmanY1/wav2vec2-large-fi-150k) fine-tuned on 4600 hours of Finnish speech on 16kHz sampled speech audio: +* 1500 hours of 
[Lahjoita puhetta (Donate Speech)](https://link.springer.com/article/10.1007/s10579-022-09606-3) (colloquial Finnish) +* 3100 hours of the [Finnish Parliament dataset](https://link.springer.com/article/10.1007/s10579-023-09650-7) + +When using the model make sure that your speech input is also sampled at 16Khz. + +## Model description + +The Finnish Wav2Vec2 Large has the same architecture and uses the same training objective as the English and multilingual one described in [Paper](https://arxiv.org/abs/2006.11477). + +[GetmanY1/wav2vec2-large-fi-150k](https://huggingface.co/GetmanY1/wav2vec2-large-fi-150k) is a large-scale, 317-million parameter monolingual model pre-trained on 158k hours of unlabeled Finnish speech, including [KAVI radio and television archive materials](https://kavi.fi/en/radio-ja-televisioarkistointia-vuodesta-2008/), Lahjoita puhetta (Donate Speech), Finnish Parliament, Finnish VoxPopuli. + +You can read more about the pre-trained model from [this paper](https://www.isca-archive.org/interspeech_2025/getman25_interspeech.html). The training scripts are available on [GitHub](https://github.com/aalto-speech/large-scale-monolingual-speech-foundation-models). + +## Intended uses + +You can use this model for Finnish ASR (speech-to-text). 
+ +### How to use + +To transcribe audio files the model can be used as a standalone acoustic model as follows: + +``` +from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC +from datasets import load_dataset +import torch + +# load model and processor +processor = Wav2Vec2Processor.from_pretrained("GetmanY1/wav2vec2-large-fi-150k-finetuned") +model = Wav2Vec2ForCTC.from_pretrained("GetmanY1/wav2vec2-large-fi-150k-finetuned") + +# load dummy dataset and read soundfiles +ds = load_dataset("mozilla-foundation/common_voice_16_1", "fi", split='test') + +# tokenize +input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values # Batch size 1 + +# retrieve logits +logits = model(input_values).logits + +# take argmax and decode +predicted_ids = torch.argmax(logits, dim=-1) +transcription = processor.batch_decode(predicted_ids) +``` + +## Citation + +If you use our models or scripts, please cite our article as: + +```bibtex +@inproceedings{getman25_interspeech, + title = {{Is your model big enough? 
Training and interpreting large-scale monolingual speech foundation models}}, + author = {{Yaroslav Getman and Tamás Grósz and Tommi Lehtonen and Mikko Kurimo}}, + year = {{2025}}, + booktitle = {{Interspeech 2025}}, + pages = {{231--235}}, + doi = {{10.21437/Interspeech.2025-46}}, + issn = {{2958-1796}}, +} +``` + +## Team Members + +- Yaroslav Getman, [Hugging Face profile](https://huggingface.co/GetmanY1), [LinkedIn profile](https://www.linkedin.com/in/yaroslav-getman/) +- Tamas Grosz, [Hugging Face profile](https://huggingface.co/Grosy), [LinkedIn profile](https://www.linkedin.com/in/tam%C3%A1s-gr%C3%B3sz-950a049a/) + +Feel free to contact us for more details 🤗 \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..076e0db --- /dev/null +++ b/config.json @@ -0,0 +1,115 @@ +{ + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + 
"mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.075, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.36.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 34, + "xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..b84375c --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5633b8a5321d4813536e03bdac02bac5b9c188d10d50565dc2f6a903b6c8862e +size 1261946832 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..e7d53bd --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..fdafe48 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "" +} diff --git 
a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..08545c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<pad>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "1": { + "content": "<s>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "2": { + "content": "</s>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "3": { + "content": "<unk>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "</s>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "<unk>", + "word_delimiter_token": "|" +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..efc0a99 --- /dev/null +++ b/vocab.json @@ -0,0 +1,36 @@ +{ + "</s>": 2, + "<pad>": 0, + "<s>": 1, + "<unk>": 3, + "a": 16, + "b": 27, + "c": 28, + "d": 23, + "e": 19, + "f": 26, + "g": 25, + "h": 8, + "i": 9, + "j": 11, + "k": 21, + "l": 20, + "m": 18, + "n": 10, + "o": 7, + "p": 15, + "q": 31, + "r": 22, + "s": 17, + "t": 14, + "u": 6, + "v": 5, + "w": 30, + "x": 32, + "y": 13, + "z": 29, + "|": 4, + "ä": 12, + "å": 33, + "ö": 24 +}