初始化项目,由ModelHub XC社区提供模型

Model: boumehdi/wav2vec2-large-xlsr-moroccan-darija
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-16 10:56:32 +08:00
commit 4f976a4dc2
14 changed files with 383 additions and 0 deletions

34
.gitattributes vendored Normal file
View File

@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

73
README.md Normal file
View File

@@ -0,0 +1,73 @@
---
language: ary
base_model: facebook/wav2vec2-large-xlsr-53
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Moroccan Arabic dialect by Boumehdi
results:
- task:
name: Speech Recognition
type: automatic-speech-recognition
metrics:
- name: Test WER
type: wer
value: 0.084904
---
# Wav2Vec2-Large-XLSR-53-Moroccan-Darija
**wav2vec2-large-xlsr-53 new model**
- Fine-tuned on 57 hours of labeled Darija audio extracted from MDVC (https://ijeecs.iaescore.com/index.php/IJEECS/article/view/35709), a corpus that contains more than 1000 hours of Moroccan Darija ("ary").
- Fine-tuning is ongoing 24/7 to enhance accuracy.
- We add data to the model every day. (We prefer not to add the whole MDVC corpus at once, as we are still working to standardize the way we write Moroccan Darija.)
<table><thead><tr><th><strong>Training Loss</strong></th> <th><strong>Validation Loss</strong></th> <th><strong>Wer</strong></th></tr></thead> <tbody><tr>
<td>0.121300</td>
<td>0.103430</td>
<td>0.084904</td>
</tr> </tbody></table>
## Usage
The model can be used directly as follows:
```python
import librosa
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Wav2Vec2FeatureExtractor, Trainer
# Build the CTC tokenizer from a local copy of the model's vocab.json
# (the file must be present in the working directory).
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
processor = Wav2Vec2Processor.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija', tokenizer=tokenizer)
model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')
# load the audio data (use your own wav file here!) at the 16 kHz rate the model expects
input_audio, sr = librosa.load('file.wav', sr=16000)
# extract the model input features from the raw waveform; passing the
# sampling rate lets the feature extractor verify it matches its config
input_values = processor(input_audio, sampling_rate=sr, return_tensors="pt", padding=True).input_values
# retrieve logits; this is inference only, so skip gradient tracking
with torch.no_grad():
    logits = model(input_values).logits
# greedy CTC decoding: pick the most likely token for every frame
# (no n-gram language model is involved)
tokens = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(tokens)
# print the output
print(transcription)
```
Output: قالت ليا هاد السيد هادا ما كاينش بحالو
email: souregh@gmail.com
BOUMEHDI Ahmed

116
config.json Normal file
View File

@@ -0,0 +1,116 @@
{
"_name_or_path": "C:\\workspace\\checkpoints\\",
"activation_dropout": 0.0,
"adapter_kernel_size": 3,
"adapter_stride": 2,
"add_adapter": false,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.0,
"bos_token_id": 1,
"classifier_proj_size": 256,
"codevector_dim": 768,
"contrastive_logits_temperature": 0.1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "mean",
"ctc_zero_infinity": false,
"diversity_loss_weight": 0.1,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.0,
"feat_quantizer_dropout": 0.0,
"final_dropout": 0.0,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.0,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.0,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.05,
"mask_time_selection": "static",
"model_type": "wav2vec2",
"num_adapter_layers": 3,
"num_attention_heads": 16,
"num_codevector_groups": 2,
"num_codevectors_per_group": 320,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"num_negatives": 100,
"output_hidden_size": 1024,
"pad_token_id": 64,
"proj_codevector_dim": 768,
"tdnn_dilation": [
1,
2,
3,
1,
1
],
"tdnn_dim": [
512,
512,
512,
512,
1500
],
"tdnn_kernel": [
5,
3,
3,
1,
1
],
"torch_dtype": "float32",
"transformers_version": "4.21.1",
"use_weighted_layer_sum": false,
"vocab_size": 65,
"xvector_output_dim": 512
}

3
optimizer.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea5629f8cd6119e48a95b2cf9f656773966e86bbc44c436f4e29990b166ecd93
size 2490594117

9
preprocessor_config.json Normal file
View File

@@ -0,0 +1,9 @@
{
"do_normalize": true,
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
"feature_size": 1,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": true,
"sampling_rate": 16000
}

3
pytorch_model.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c96c7c8c8cad458f5481f9a3e061ee0bc6110ba45d5595757ffe365b2e5a55e0
size 1262168365

3
rng_state.pth Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0cdaf50a0753d3a6e101916319bd71a8a4deca5c897164903ee046122c35005a
size 14575

3
scaler.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4e827e327ff1e038aa2027dda2c3ed3be5c6a6d7a0f9578e32510d4dcaf26a3
size 557

3
scheduler.pt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:387faccb2a2f1652b597f93e0dfe377728691d7dafb8a403a4470c4646c76cd5
size 627

1
special_tokens_map.json Normal file
View File

@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}

1
tokenizer_config.json Normal file
View File

@@ -0,0 +1 @@
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}

130
trainer_state.json Normal file
View File

@@ -0,0 +1,130 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 13.791738382099828,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.03,
"learning_rate": 9.99965514863094e-07,
"loss": 0.0333,
"step": 150
},
{
"epoch": 2.07,
"learning_rate": 9.99862059452376e-07,
"loss": 0.0332,
"step": 300
},
{
"epoch": 3.1,
"learning_rate": 9.99758604041658e-07,
"loss": 0.0336,
"step": 450
},
{
"epoch": 3.45,
"eval_loss": 0.21229705214500427,
"eval_runtime": 487.1939,
"eval_samples_per_second": 10.649,
"eval_steps_per_second": 1.332,
"eval_wer": 0.1448122092799754,
"step": 500
},
{
"epoch": 4.14,
"learning_rate": 9.9965514863094e-07,
"loss": 0.0329,
"step": 600
},
{
"epoch": 5.17,
"learning_rate": 9.995516932202221e-07,
"loss": 0.0323,
"step": 750
},
{
"epoch": 6.21,
"learning_rate": 9.99448237809504e-07,
"loss": 0.0317,
"step": 900
},
{
"epoch": 6.9,
"eval_loss": 0.20801204442977905,
"eval_runtime": 282.77,
"eval_samples_per_second": 18.347,
"eval_steps_per_second": 2.295,
"eval_wer": 0.14550417099142737,
"step": 1000
},
{
"epoch": 7.24,
"learning_rate": 9.993454721015242e-07,
"loss": 0.0323,
"step": 1050
},
{
"epoch": 8.28,
"learning_rate": 9.992420166908062e-07,
"loss": 0.031,
"step": 1200
},
{
"epoch": 9.31,
"learning_rate": 9.991385612800883e-07,
"loss": 0.0314,
"step": 1350
},
{
"epoch": 10.34,
"learning_rate": 9.990351058693703e-07,
"loss": 0.0317,
"step": 1500
},
{
"epoch": 10.34,
"eval_loss": 0.2071654200553894,
"eval_runtime": 303.1157,
"eval_samples_per_second": 17.116,
"eval_steps_per_second": 2.141,
"eval_wer": 0.14465844001076386,
"step": 1500
},
{
"epoch": 11.38,
"learning_rate": 9.989316504586522e-07,
"loss": 0.031,
"step": 1650
},
{
"epoch": 12.41,
"learning_rate": 9.988281950479342e-07,
"loss": 0.0318,
"step": 1800
},
{
"epoch": 13.45,
"learning_rate": 9.987247396372162e-07,
"loss": 0.0314,
"step": 1950
},
{
"epoch": 13.79,
"eval_loss": 0.20706616342067719,
"eval_runtime": 287.3918,
"eval_samples_per_second": 18.052,
"eval_steps_per_second": 2.258,
"eval_wer": 0.1442355745204321,
"step": 2000
}
],
"max_steps": 1450000,
"num_train_epochs": 10000,
"total_flos": 5.652539665735567e+19,
"trial_name": null,
"trial_params": null
}

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c61dd66333a3a0d68eedaaf6d552d4521e71274edd381e8de2a88579575bba8c
size 3323

1
vocab.json Normal file
View File

@@ -0,0 +1 @@
{"ا": 0, "_": 1, "ك": 3, "ن": 4, "@": 5, "ذ": 6, "ٖ": 7, "ش": 8, "غ": 9, "آ": 10, "ص": 11, "ي": 12, "ث": 13, "n": 14, "ج": 15, "d": 16, "ئ": 17, "ة": 18, "ه": 19, "ز": 20, "ع": 21, "ف": 22, "i": 23, "r": 24, "v": 25, "ڸ": 26, "k": 27, "ِ": 28, "ء": 29, "ر": 30, "s": 31, "t": 32, "ى": 33, "ـ": 34, "": 35, "ؤ": 36, "ق": 37, "ض": 38, "م": 39, "ل": 40, "…": 41, "ط": 42, "ت": 43, "ّ": 44, "c": 45, "ظ": 46, "e": 47, "؟": 48, "h": 49, "ب": 50, "o": 51, "س": 52, "a": 53, "د": 54, "p": 55, "أ": 56, "ْ": 57, "\n": 58, "خ": 59, "ح": 60, "و": 61, "إ": 62, "|": 2, "[UNK]": 63, "[PAD]": 64}