初始化项目,由ModelHub XC社区提供模型
Model: facebook/wav2vec2-base-10k-voxpopuli-ft-en Source: Original Platform
This commit is contained in:
17
.gitattributes
vendored
Normal file
17
.gitattributes
vendored
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
69
README.md
Normal file
69
README.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
---
|
||||||
|
language: en
|
||||||
|
tags:
|
||||||
|
- audio
|
||||||
|
- automatic-speech-recognition
|
||||||
|
- voxpopuli
|
||||||
|
license: cc-by-nc-4.0
|
||||||
|
---
|
||||||
|
|
||||||
|
# Wav2Vec2-Base-VoxPopuli-Finetuned
|
||||||
|
|
||||||
|
[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model, pretrained on the 10K unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed English (en) data (refer to Table 1 of the paper for more information).
|
||||||
|
|
||||||
|
**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
|
||||||
|
Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*
|
||||||
|
|
||||||
|
**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*
|
||||||
|
|
||||||
|
See the [official repository](https://github.com/facebookresearch/voxpopuli/) for more information.
|
||||||
|
|
||||||
|
|
||||||
|
# Usage for inference
|
||||||
|
|
||||||
|
The following example shows how to use the model for inference on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets):
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
||||||
|
from datasets import load_dataset
|
||||||
|
import torchaudio
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# resample audio
|
||||||
|
|
||||||
|
# load model & processor
|
||||||
|
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-en")
|
||||||
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-en")
|
||||||
|
|
||||||
|
# load dataset
|
||||||
|
ds = load_dataset("common_voice", "en", split="validation[:1%]")
|
||||||
|
|
||||||
|
# common voice does not match target sampling rate
|
||||||
|
common_voice_sample_rate = 48000
|
||||||
|
target_sample_rate = 16000
|
||||||
|
|
||||||
|
resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate)
|
||||||
|
|
||||||
|
|
||||||
|
# define mapping fn to read in sound file and resample
|
||||||
|
def map_to_array(batch):
|
||||||
|
speech, _ = torchaudio.load(batch["path"])
|
||||||
|
speech = resampler(speech)
|
||||||
|
batch["speech"] = speech[0]
|
||||||
|
return batch
|
||||||
|
|
||||||
|
|
||||||
|
# load all audio files
|
||||||
|
ds = ds.map(map_to_array)
|
||||||
|
|
||||||
|
# run inference on the first 5 data samples
|
||||||
|
inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True)
|
||||||
|
|
||||||
|
# inference
|
||||||
|
logits = model(**inputs).logits
|
||||||
|
predicted_ids = torch.argmax(logits, axis=-1)
|
||||||
|
|
||||||
|
print(processor.batch_decode(predicted_ids))
|
||||||
|
```
|
||||||
|
|
||||||
68
config.json
Normal file
68
config.json
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
{
|
||||||
|
"activation_dropout": 0.1,
|
||||||
|
"apply_spec_augment": true,
|
||||||
|
"architectures": [
|
||||||
|
"Wav2Vec2ForCTC"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.1,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"conv_bias": false,
|
||||||
|
"conv_dim": [
|
||||||
|
512,
|
||||||
|
512,
|
||||||
|
512,
|
||||||
|
512,
|
||||||
|
512,
|
||||||
|
512,
|
||||||
|
512
|
||||||
|
],
|
||||||
|
"conv_kernel": [
|
||||||
|
10,
|
||||||
|
3,
|
||||||
|
3,
|
||||||
|
3,
|
||||||
|
3,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"conv_stride": [
|
||||||
|
5,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
],
|
||||||
|
"ctc_loss_reduction": "sum",
|
||||||
|
"ctc_zero_infinity": false,
|
||||||
|
"do_stable_layer_norm": false,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"feat_extract_activation": "gelu",
|
||||||
|
"feat_extract_dropout": 0.0,
|
||||||
|
"feat_extract_norm": "group",
|
||||||
|
"feat_proj_dropout": 0.1,
|
||||||
|
"final_dropout": 0.1,
|
||||||
|
"gradient_checkpointing": false,
|
||||||
|
"hidden_act": "gelu",
|
||||||
|
"hidden_dropout": 0.1,
|
||||||
|
"hidden_dropout_prob": 0.1,
|
||||||
|
"hidden_size": 768,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 3072,
|
||||||
|
"layer_norm_eps": 1e-05,
|
||||||
|
"layerdrop": 0.1,
|
||||||
|
"mask_feature_length": 10,
|
||||||
|
"mask_feature_prob": 0.0,
|
||||||
|
"mask_time_length": 10,
|
||||||
|
"mask_time_prob": 0.05,
|
||||||
|
"model_type": "wav2vec2",
|
||||||
|
"num_attention_heads": 12,
|
||||||
|
"num_conv_pos_embedding_groups": 16,
|
||||||
|
"num_conv_pos_embeddings": 128,
|
||||||
|
"num_feat_extract_layers": 7,
|
||||||
|
"num_hidden_layers": 12,
|
||||||
|
"pad_token_id": 1,
|
||||||
|
"transformers_version": "4.6.0.dev0",
|
||||||
|
"vocab_size": 32
|
||||||
|
}
|
||||||
9
preprocessor_config.json
Normal file
9
preprocessor_config.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"do_normalize": true,
|
||||||
|
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
||||||
|
"feature_size": 1,
|
||||||
|
"padding_side": "right",
|
||||||
|
"padding_value": 0,
|
||||||
|
"return_attention_mask": false,
|
||||||
|
"sampling_rate": 16000
|
||||||
|
}
|
||||||
3
pytorch_model.bin
Normal file
3
pytorch_model.bin
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1334f627b73604c14d3f79c2007c42f3b46b2d66ecb81f35c57f522f6e28d485
|
||||||
|
size 377672556
|
||||||
1
special_tokens_map.json
Normal file
1
special_tokens_map.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
||||||
1
tokenizer_config.json
Normal file
1
tokenizer_config.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
|
||||||
1
vocab.json
Normal file
1
vocab.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"<s>": 1, "<pad>": 0, "</s>": 2, "<unk>": 3, "|": 4, "e": 5, "t": 6, "o": 7, "i": 8, "a": 9, "n": 10, "s": 11, "r": 12, "h": 13, "l": 14, "d": 15, "c": 16, "u": 17, "m": 18, "p": 19, "f": 20, "g": 21, "w": 22, "y": 23, "b": 24, "v": 25, "k": 26, "x": 27, "j": 28, "q": 29, "z": 30, "1": 31}
|
||||||
Reference in New Issue
Block a user