初始化项目,由ModelHub XC社区提供模型

Model: alefiury/wav2vec2-large-xlsr-53-coraa-brazilian-portuguese-gain-normalization
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-08 11:37:37 +08:00
commit 805cf722ec
8 changed files with 171 additions and 0 deletions

27
.gitattributes vendored Normal file
View File

@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

45
README.md Normal file
View File

@@ -0,0 +1,45 @@
---
language: pt
datasets:
- CORAA
- common_voice
- mls
- cetuc
- voxforge
metrics:
- wer
tags:
- audio
- speech
- wav2vec2
- pt
- portuguese-speech-corpus
- automatic-speech-recognition
- speech
- PyTorch
license: apache-2.0
model-index:
- name: Alef Iury XLSR Wav2Vec2 Large 53 Portuguese
results:
- task:
name: Speech Recognition
type: automatic-speech-recognition
metrics:
- name: Test CORAA WER
type: wer
value: 24.89%
---
# Wav2vec 2.0 trained with CORAA Portuguese Dataset and Open Portuguese Datasets
This a the demonstration of a fine-tuned Wav2vec model for Portuguese using the following datasets:
- [CORAA dataset](https://github.com/nilc-nlp/CORAA)
- [CETUC](http://www02.smt.ufrj.br/~igor.quintanilha/alcaim.tar.gz).
- [Multilingual Librispeech (MLS)](http://www.openslr.org/94/).
- [VoxForge](http://www.voxforge.org/).
- [Common Voice 6.1](https://commonvoice.mozilla.org/pt).
## Repository
The repository that implements the model to be trained and tested is avaible [here](https://github.com/alefiury/SE-R_2022_Challenge_Wav2vec2).

84
config.json Normal file
View File

@@ -0,0 +1,84 @@
{
"_name_or_path": "../checkpoints/Wav2Vec2/CORAA-norm/final-version/train/checkpoint-18420",
"activation_dropout": 0.0,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"codevector_dim": 768,
"contrastive_logits_temperature": 0.1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "mean",
"ctc_zero_infinity": true,
"diversity_loss_weight": 0.1,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"feat_quantizer_dropout": 0.0,
"final_dropout": 0.0,
"gradient_checkpointing": true,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.0,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.05,
"mask_time_selection": "static",
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_codevector_groups": 2,
"num_codevectors_per_group": 320,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"num_negatives": 100,
"pad_token_id": 0,
"proj_codevector_dim": 768,
"transformers_version": "4.6.1",
"vocab_size": 45
}

9
preprocessor_config.json Normal file
View File

@@ -0,0 +1,9 @@
{
"do_normalize": true,
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
"feature_size": 1,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": true,
"sampling_rate": 16000
}

3
pytorch_model.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10af01cf75949c26226c85061548c31ad67314c0da183b7e9eee8fec86d5f216
size 1262102257

1
special_tokens_map.json Normal file
View File

@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

1
tokenizer_config.json Normal file
View File

@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}

1
vocab.json Normal file
View File

@@ -0,0 +1 @@
{"<pad>": 0, "|": 1, "<unk>": 2, "a": 3, "b": 4, "c": 5, "d": 6, "e": 7, "f": 8, "g": 9, "h": 10, "i": 11, "j": 12, "k": 13, "l": 14, "m": 15, "n": 16, "o": 17, "p": 18, "q": 19, "r": 20, "s": 21, "t": 22, "u": 23, "v": 24, "w": 25, "x": 26, "y": 27, "z": 28, "ç": 29, "ã": 30, "à": 31, "á": 32, "â": 33, "ê": 34, "é": 35, "í": 36, "ó": 37, "ô": 38, "õ": 39, "ú": 40, "û": 41, "-": 42, "<s>": 43, "</s>": 44}