commit b766ca751bd4bc0f385d83e45d7f5fccabcedcbf Author: ModelHub XC Date: Fri May 8 11:35:50 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: bond005/wav2vec2-large-ru-golos Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ac481c8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..9db025d --- /dev/null +++ b/README.md @@ -0,0 +1,227 @@ +--- +datasets: +- SberDevices/Golos +- bond005/sova_rudevices +- bond005/rulibrispeech +language: ru +license: apache-2.0 +metrics: +- wer +- cer +library_name: transformers +pipeline_tag: automatic-speech-recognition +tags: +- audio +- automatic-speech-recognition +- speech +- xlsr-fine-tuning-week +widget: +- example_title: test 
sound with Russian speech "нейросети это хорошо" + src: https://huggingface.co/bond005/wav2vec2-large-ru-golos/resolve/main/test_sound_ru.flac +model-index: +- name: XLSR Wav2Vec2 Russian by Ivan Bondarenko + results: + - task: + type: automatic-speech-recognition + name: Speech Recognition + dataset: + name: Sberdevices Golos (crowd) + type: SberDevices/Golos + args: ru + metrics: + - type: wer + value: 10.144 + name: Test WER + - type: cer + value: 2.168 + name: Test CER + - type: wer + value: 20.353 + name: Test WER + - type: cer + value: 6.03 + name: Test CER + - task: + type: automatic-speech-recognition + name: Automatic Speech Recognition + dataset: + name: Common Voice ru + type: common_voice + args: ru + metrics: + - type: wer + value: 18.548 + name: Test WER + - type: cer + value: 4.0 + name: Test CER + - task: + type: automatic-speech-recognition + name: Automatic Speech Recognition + dataset: + name: Sova RuDevices + type: bond005/sova_rudevices + args: ru + metrics: + - type: wer + value: 25.41 + name: Test WER + - type: cer + value: 7.965 + name: Test CER + - task: + type: automatic-speech-recognition + name: Automatic Speech Recognition + dataset: + name: Russian Librispeech + type: bond005/rulibrispeech + args: ru + metrics: + - type: wer + value: 21.872 + name: Test WER + - type: cer + value: 4.469 + name: Test CER + - task: + type: automatic-speech-recognition + name: Automatic Speech Recognition + dataset: + name: Voxforge Ru + type: dangrebenkin/voxforge-ru-dataset + args: ru + metrics: + - type: wer + value: 27.084 + name: Test WER + - type: cer + value: 6.986 + name: Test CER +--- + +# Wav2Vec2-Large-Ru-Golos + +This model is a component of the **Pisets** speech-to-text system, presented in the paper [Pisets: A Robust Speech Recognition System for Lectures and Interviews](https://huggingface.co/papers/2601.18415). + +The source code for the **Pisets** system is available on GitHub: [bond005/pisets](https://github.com/bond005/pisets). 
+ +The Wav2Vec2 model is based on [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53), fine-tuned for Russian using [Sberdevices Golos](https://huggingface.co/datasets/SberDevices/Golos) with audio augmentations such as pitch shift, acceleration/deceleration of sound, reverberation, etc. + +When using this model, make sure that your speech input is sampled at 16kHz. + +## Usage + +To transcribe audio files the model can be used as a standalone acoustic model as follows: + +```python +from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC +from datasets import load_dataset +import torch + +# load model and tokenizer +processor = Wav2Vec2Processor.from_pretrained("bond005/wav2vec2-large-ru-golos") +model = Wav2Vec2ForCTC.from_pretrained("bond005/wav2vec2-large-ru-golos") + +# load the test part of Golos dataset and read first soundfile +ds = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test") + +# tokenize +processed = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest") # Batch size 1 + +# retrieve logits +logits = model(processed.input_values, attention_mask=processed.attention_mask).logits + +# take argmax and decode +predicted_ids = torch.argmax(logits, dim=-1) +transcription = processor.batch_decode(predicted_ids)[0] +print(transcription) +``` + + ## Evaluation + +This code snippet shows how to evaluate **bond005/wav2vec2-large-ru-golos** on Golos dataset's "crowd" and "farfield" test data. 
+ +```python +from datasets import load_dataset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import torch +from jiwer import wer, cer # we need word error rate (WER) and character error rate (CER) + +# load the test part of Golos Crowd and remove samples with empty "true" transcriptions +golos_crowd_test = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test") +golos_crowd_test = golos_crowd_test.filter( + lambda it1: (it1["transcription"] is not None) and (len(it1["transcription"].strip()) > 0) +) + +# load the test part of Golos Farfield and remove samples with empty "true" transcriptions +golos_farfield_test = load_dataset("bond005/sberdevices_golos_100h_farfield", split="test") +golos_farfield_test = golos_farfield_test.filter( + lambda it2: (it2["transcription"] is not None) and (len(it2["transcription"].strip()) > 0) +) + +# load model and tokenizer +model = Wav2Vec2ForCTC.from_pretrained("bond005/wav2vec2-large-ru-golos").to("cuda") +processor = Wav2Vec2Processor.from_pretrained("bond005/wav2vec2-large-ru-golos") + +# recognize one sound +def map_to_pred(batch): + # tokenize and vectorize + processed = processor( + batch["audio"]["array"], sampling_rate=batch["audio"]["sampling_rate"], + return_tensors="pt", padding="longest" + ) + input_values = processed.input_values.to("cuda") + attention_mask = processed.attention_mask.to("cuda") + + # recognize + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + predicted_ids = torch.argmax(logits, dim=-1) + + # decode + transcription = processor.batch_decode(predicted_ids) + batch["text"] = transcription[0] + return batch + +# calculate WER and CER on the crowd domain +crowd_result = golos_crowd_test.map(map_to_pred, remove_columns=["audio"]) +crowd_wer = wer(crowd_result["transcription"], crowd_result["text"]) +crowd_cer = cer(crowd_result["transcription"], crowd_result["text"]) +print("Word error rate on the Crowd domain:", crowd_wer) +print("Character 
error rate on the Crowd domain:", crowd_cer) + +# calculate WER and CER on the farfield domain +farfield_result = golos_farfield_test.map(map_to_pred, remove_columns=["audio"]) +farfield_wer = wer(farfield_result["transcription"], farfield_result["text"]) +farfield_cer = cer(farfield_result["transcription"], farfield_result["text"]) +print("Word error rate on the Farfield domain:", farfield_wer) +print("Character error rate on the Farfield domain:", farfield_cer) +``` + +*Result (WER, %)*: + +| "crowd" | "farfield" | +|---------|------------| +| 10.144 | 20.353 | + +*Result (CER, %)*: + +| "crowd" | "farfield" | +|---------|------------| +| 2.168 | 6.030 | + +You can see the evaluation script on other datasets, including Russian Librispeech and SOVA RuDevices, on my Kaggle web-page https://www.kaggle.com/code/bond005/wav2vec2-ru-eval + +## Citation +If you want to cite this model you can use this: + +```bibtex +@misc{bondarenko2022wav2vec2-large-ru-golos, + title={XLSR Wav2Vec2 Russian by Ivan Bondarenko}, + author={Bondarenko, Ivan}, + publisher={Hugging Face}, + journal={Hugging Face Hub}, + howpublished={\url{https://huggingface.co/bond005/wav2vec2-large-ru-golos}}, + year={2022} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..21edc54 --- /dev/null +++ b/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "/storage0/bi/models/wav2vec2-large-ru-golos", + "activation_dropout": 0.0, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": 
"mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.26.1", + "use_weighted_layer_sum": false, + "vocab_size": 37, + "xvector_output_dim": 512 +} diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..73caa15 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git 
a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..300c851 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db2a4ee1d1cb6d5b72c1128c0baaf5ab9c95b4931031604b2d5c9787ebc2781d +size 1262053549 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..25bc396 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"} \ No newline at end of file diff --git a/test_sound_ru.flac b/test_sound_ru.flac new file mode 100644 index 0000000..7912da8 Binary files /dev/null and b/test_sound_ru.flac differ diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..43772fe --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"} \ No newline at end of file diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..3ca695f --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "а": 5, "б": 6, "в": 7, "г": 8, "д": 9, "е": 10, "ж": 11, "з": 12, "и": 13, "й": 14, "к": 15, "л": 16, "м": 17, "н": 18, "о": 19, "п": 20, "р": 21, "с": 22, "т": 23, "у": 24, "ф": 25, "х": 26, "ц": 27, "ч": 28, "ш": 29, "щ": 30, "ъ": 31, "ы": 32, "ь": 33, "э": 34, "ю": 35, "я": 36} \ No newline at end of file