commit 9515e2392826f58e7ce664dac331a8a326ed438a Author: ModelHub XC Date: Thu May 28 11:02:18 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: anas/wav2vec2-large-xlsr-arabic Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d699711 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000..25f7e61 --- /dev/null +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,144 @@ +--- +language: ar +datasets: +- common_voice: Common Voice Corpus 4 +metrics: +- wer +tags: +- audio +- automatic-speech-recognition +- speech +- xlsr-fine-tuning-week +license: apache-2.0 +model-index: +- name: Hasni XLSR Wav2Vec2 Large 53 + results: + - task: + name: Speech Recognition + type: automatic-speech-recognition + dataset: + name: Common Voice ar + type: common_voice + args: ar + metrics: + - name: Test WER + type: wer + value: 52.18 +--- + +# Wav2Vec2-Large-XLSR-53-Arabic + +Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Arabic using the [Common Voice Corpus 4](https://commonvoice.mozilla.org/en/datasets) dataset. 
+When using this model, make sure that your speech input is sampled at 16kHz. + +## Usage + +The model can be used directly (without a language model) as follows: + +```python +import torch +import torchaudio +from datasets import load_dataset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor + +test_dataset = load_dataset("common_voice", "ar", split="test[:2%]") + +processor = Wav2Vec2Processor.from_pretrained("anas/wav2vec2-large-xlsr-arabic") +model = Wav2Vec2ForCTC.from_pretrained("anas/wav2vec2-large-xlsr-arabic") + +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets. +# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) +inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True) + +with torch.no_grad(): + logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits + +predicted_ids = torch.argmax(logits, dim=-1) + +print("Prediction:", processor.batch_decode(predicted_ids)) +print("Reference:", test_dataset["sentence"][:2]) +``` + + +## Evaluation + +The model can be evaluated as follows on the Arabic test data of Common Voice. 
+ + +```python +import torch +import torchaudio +from datasets import load_dataset, load_metric +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import re + +test_dataset = load_dataset("common_voice", "ar", split="test") + +processor = Wav2Vec2Processor.from_pretrained("anas/wav2vec2-large-xlsr-arabic") +model = Wav2Vec2ForCTC.from_pretrained("anas/wav2vec2-large-xlsr-arabic/") +model.to("cuda") + +chars_to_ignore_regex = '[\\\\,\\\\؟\\\\.\\\\!\\\\-\\\\;\\\\\\\\:\\\\'\\\\"\\\\☭\\\\«\\\\»\\\\؛\\\\—\\\\ـ\\\\_\\\\،\\\\“\\\\%\\\\‘\\\\”\\\\�]' + +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets. +# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + batch["sentence"] = re.sub('[a-z]','',batch["sentence"]) + batch["sentence"] = re.sub("[إأٱآا]", "ا", batch["sentence"]) + noise = re.compile(""" ّ | # Tashdid + َ | # Fatha + ً | # Tanwin Fath + ُ | # Damma + ٌ | # Tanwin Damm + ِ | # Kasra + ٍ | # Tanwin Kasr + ْ | # Sukun + ـ # Tatwil/Kashida + """, re.VERBOSE) + batch["sentence"] = re.sub(noise, '', batch["sentence"]) + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) + +# Preprocessing the datasets. 
+# We need to read the audio files as arrays +def evaluate(batch): + inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) + + with torch.no_grad(): + logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits + + pred_ids = torch.argmax(logits, dim=-1) + batch["pred_strings"] = processor.batch_decode(pred_ids) + return batch + +result = test_dataset.map(evaluate, batched=True, batch_size=8) + +print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))) +``` + +**Test Result**: 52.18 % + + +## Training + +The Common Voice Corpus 4 `train` and `validation` datasets were used for training. + +The script used for training can be found [here](...) + +Twitter: [here](https://twitter.com/hasnii_anas) + +Email: anashasni146@gmail.com \ No newline at end of file diff --git a/.ipynb_checkpoints/preprocessor_config-checkpoint.json b/.ipynb_checkpoints/preprocessor_config-checkpoint.json new file mode 100644 index 0000000..0886a48 --- /dev/null +++ b/.ipynb_checkpoints/preprocessor_config-checkpoint.json @@ -0,0 +1,8 @@ +{ + "do_normalize": true, + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/.ipynb_checkpoints/special_tokens_map-checkpoint.json b/.ipynb_checkpoints/special_tokens_map-checkpoint.json new file mode 100644 index 0000000..9abf719 --- /dev/null +++ b/.ipynb_checkpoints/special_tokens_map-checkpoint.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]"} \ No newline at end of file diff --git a/.ipynb_checkpoints/tokenizer_config-checkpoint.json b/.ipynb_checkpoints/tokenizer_config-checkpoint.json new file mode 100644 index 0000000..a2a8340 --- /dev/null +++ b/.ipynb_checkpoints/tokenizer_config-checkpoint.json @@ -0,0 +1 @@ +{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", 
"pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"} \ No newline at end of file diff --git a/.ipynb_checkpoints/vocab-checkpoint.json b/.ipynb_checkpoints/vocab-checkpoint.json new file mode 100644 index 0000000..fa9cd0a --- /dev/null +++ b/.ipynb_checkpoints/vocab-checkpoint.json @@ -0,0 +1 @@ +{"خ": 0, "ة": 1, "د": 2, "ا": 4, "ض": 5, "م": 6, "و": 7, "ك": 8, "ث": 9, "ش": 10, "ع": 11, "ز": 12, "ء": 13, "ی": 14, "ن": 15, "ه": 16, "ق": 17, "ت": 18, "ب": 19, "ف": 20, "ظ": 21, "ح": 22, "ص": 23, "ئ": 24, "ذ": 25, "ى": 26, "غ": 27, "س": 28, "ر": 29, "ط": 30, "ي": 31, "ل": 32, "ؤ": 33, "ج": 34, "|": 3, "[UNK]": 35, "[PAD]": 36} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cec6f06 --- /dev/null +++ b/README.md @@ -0,0 +1,144 @@ +--- +language: ar +datasets: +- common_voice: Common Voice Corpus 4 +metrics: +- wer +tags: +- audio +- automatic-speech-recognition +- speech +- xlsr-fine-tuning-week +license: apache-2.0 +model-index: +- name: Hasni XLSR Wav2Vec2 Large 53 + results: + - task: + name: Speech Recognition + type: automatic-speech-recognition + dataset: + name: Common Voice ar + type: common_voice + args: ar + metrics: + - name: Test WER + type: wer + value: 52.18 +--- + +# Wav2Vec2-Large-XLSR-53-Arabic + +Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Arabic using the [Common Voice Corpus 4](https://commonvoice.mozilla.org/en/datasets) dataset. +When using this model, make sure that your speech input is sampled at 16kHz. 
+ +## Usage + +The model can be used directly (without a language model) as follows: + +```python +import torch +import torchaudio +from datasets import load_dataset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor + +test_dataset = load_dataset("common_voice", "ar", split="test[:2%]") + +processor = Wav2Vec2Processor.from_pretrained("anas/wav2vec2-large-xlsr-arabic") +model = Wav2Vec2ForCTC.from_pretrained("anas/wav2vec2-large-xlsr-arabic") + +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets. +# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) +inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True) + +with torch.no_grad(): + logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits + +predicted_ids = torch.argmax(logits, dim=-1) + +print("Prediction:", processor.batch_decode(predicted_ids)) +print("Reference:", test_dataset["sentence"][:2]) +``` + + +## Evaluation + +The model can be evaluated as follows on the Arabic test data of Common Voice. + + +```python +import torch +import torchaudio +from datasets import load_dataset, load_metric +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import re + +test_dataset = load_dataset("common_voice", "ar", split="test") + +processor = Wav2Vec2Processor.from_pretrained("anas/wav2vec2-large-xlsr-arabic") +model = Wav2Vec2ForCTC.from_pretrained("anas/wav2vec2-large-xlsr-arabic/") +model.to("cuda") + +chars_to_ignore_regex = '[\,\؟\.\!\-\;\\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�]' + +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets. 
+# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + batch["sentence"] = re.sub('[a-z]','',batch["sentence"]) + batch["sentence"] = re.sub("[إأٱآا]", "ا", batch["sentence"]) + noise = re.compile(""" ّ | # Tashdid + َ | # Fatha + ً | # Tanwin Fath + ُ | # Damma + ٌ | # Tanwin Damm + ِ | # Kasra + ٍ | # Tanwin Kasr + ْ | # Sukun + ـ # Tatwil/Kashida + """, re.VERBOSE) + batch["sentence"] = re.sub(noise, '', batch["sentence"]) + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) + +# Running inference on the preprocessed test set. +# We decode the argmax of the logits into predicted strings +def evaluate(batch): + inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) + + with torch.no_grad(): + logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits + + pred_ids = torch.argmax(logits, dim=-1) + batch["pred_strings"] = processor.batch_decode(pred_ids) + return batch + +result = test_dataset.map(evaluate, batched=True, batch_size=8) + +print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))) +``` + +**Test Result**: 52.18 % + + +## Training + +The Common Voice Corpus 4 `train` and `validation` datasets were used for training. + +The script used for training can be found [here](https://github.com/anashas/Fine-Tuning-of-XLSR-Wav2Vec2-on-Arabic) + +Twitter: [here](https://twitter.com/hasnii_anas) + +Email: anashasni146@gmail.com \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..a6292d2 --- /dev/null +++ b/config.json @@ -0,0 +1,76 @@ +{ + "_name_or_path": "facebook/wav2vec2-large-xlsr-53", + "activation_dropout": 0.0, + "apply_spec_augment": true, 
"architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "pad_token_id": 36, + "transformers_version": "4.5.0.dev0", + "vocab_size": 37 +} diff --git a/flax_model.msgpack b/flax_model.msgpack new file mode 100644 index 0000000..0e1a9fe --- /dev/null +++ b/flax_model.msgpack @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28991f821fbe8a965410fed20f36cc750d7c9b874ca371781dea99a5130f6841 +size 1261921972 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..0886a48 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,8 @@ +{ + "do_normalize": true, + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + 
"sampling_rate": 16000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..97e3bfa --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d92c7e4e59488cb3de5cb0336893c24517ea0da99be9f9cd6f77ada2ecbe0b +size 1262085527 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9abf719 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]"} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..a2a8340 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"} \ No newline at end of file diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..fa9cd0a --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"خ": 0, "ة": 1, "د": 2, "ا": 4, "ض": 5, "م": 6, "و": 7, "ك": 8, "ث": 9, "ش": 10, "ع": 11, "ز": 12, "ء": 13, "ی": 14, "ن": 15, "ه": 16, "ق": 17, "ت": 18, "ب": 19, "ف": 20, "ظ": 21, "ح": 22, "ص": 23, "ئ": 24, "ذ": 25, "ى": 26, "غ": 27, "س": 28, "ر": 29, "ط": 30, "ي": 31, "ل": 32, "ؤ": 33, "ج": 34, "|": 3, "[UNK]": 35, "[PAD]": 36} \ No newline at end of file