commit 9c732d9c0a8714eb9fc32d376b761b47e578a0bb Author: ModelHub XC Date: Fri May 8 11:35:49 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: facebook/wav2vec2-base-10k-voxpopuli-ft-en Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d699711 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..c6411ed --- /dev/null +++ b/README.md @@ -0,0 +1,69 @@ +--- +language: en +tags: +- audio +- automatic-speech-recognition +- voxpopuli +license: cc-by-nc-4.0 +--- + +# Wav2Vec2-Base-VoxPopuli-Finetuned + +[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model pretrained on the 10K unlabeled subset of [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed data in en (refer to Table 1 of paper for more information). 
+ +**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation +Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)* + +**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI* + +See the official website for more information, [here](https://github.com/facebookresearch/voxpopuli/) + + +# Usage for inference + +In the following it is shown how the model can be used in inference on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets) + +```python +#!/usr/bin/env python3 +from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC +from datasets import load_dataset +import torchaudio +import torch + +# resample audio + +# load model & processor +model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-en") +processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-en") + +# load dataset +ds = load_dataset("common_voice", "en", split="validation[:1%]") + +# common voice does not match target sampling rate +common_voice_sample_rate = 48000 +target_sample_rate = 16000 + +resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate) + + +# define mapping fn to read in sound file and resample +def map_to_array(batch): + speech, _ = torchaudio.load(batch["path"]) + speech = resampler(speech) + batch["speech"] = speech[0] + return batch + + +# load all audio files +ds = ds.map(map_to_array) + +# run inference on the first 5 data samples +inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True) + +# inference +logits = model(**inputs).logits +predicted_ids = torch.argmax(logits, axis=-1) + +print(processor.batch_decode(predicted_ids)) +``` + diff --git a/config.json b/config.json new file mode 100644 index 0000000..23ea24e --- /dev/null +++ b/config.json @@ 
-0,0 +1,68 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 0, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "final_dropout": 0.1, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_feature_length": 10, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_prob": 0.05, + "model_type": "wav2vec2", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "pad_token_id": 1, + "transformers_version": "4.6.0.dev0", + "vocab_size": 32 +} diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..8df8da1 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..116e47e --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1334f627b73604c14d3f79c2007c42f3b46b2d66ecb81f35c57f522f6e28d485 +size 377672556 diff --git a/special_tokens_map.json 
b/special_tokens_map.json new file mode 100644 index 0000000..25bc396 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..43772fe --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"} \ No newline at end of file diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..e7205c2 --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"<pad>": 1, "<s>": 0, "</s>": 2, "<unk>": 3, "|": 4, "e": 5, "t": 6, "o": 7, "i": 8, "a": 9, "n": 10, "s": 11, "r": 12, "h": 13, "l": 14, "d": 15, "c": 16, "u": 17, "m": 18, "p": 19, "f": 20, "g": 21, "w": 22, "y": 23, "b": 24, "v": 25, "k": 26, "x": 27, "j": 28, "q": 29, "z": 30, "1": 31} \ No newline at end of file