From 3470c2e74a904b131b836a05ce34a144e3a7cf47 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 8 May 2026 11:40:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53 Source: Original Platform --- .gitattributes | 35 +++++++ README.md | 215 +++++++++++++++++++++++++++++++++++++++ added_tokens.json | 4 + config.json | 109 ++++++++++++++++++++ model.safetensors | 3 + preprocessor_config.json | 10 ++ special_tokens_map.json | 6 ++ tokenizer_config.json | 48 +++++++++ training_args.bin | 3 + vocab.json | 43 ++++++++ 10 files changed, 476 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 model.safetensors create mode 100644 preprocessor_config.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer_config.json create mode 100644 training_args.bin create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs 
-text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..fadbb7b --- /dev/null +++ b/README.md @@ -0,0 +1,215 @@ +--- +license: apache-2.0 +language: +- ar +- arz +library_name: transformers +pipeline_tag: automatic-speech-recognition +datasets: +- YouTube +- rsalshalan/MGB3 +- pain/MASC +- mozilla-foundation/common_voice_15_0 +- halabi2016/arabic_speech_corpus +model-index: +- name: egyptian-arabic-wav2vec2-xlsr-53 + results: + - task: + name: Speech Recognition + type: automatic-speech-recognition + dataset: + name: mozilla-foundation/common_voice_17_0 + type: mozilla-foundation/common_voice_17_0 + args: ar + metrics: + - name: Test WER + type: wer + value: 27.20 +base_model: +- omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian +--- + +# 🐪🇪🇬 Egyptian Arabic ASR — wav2vec2-large-xlsr-53 Fine-tuned + +This model is a fine-tuned version of [omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian](https://huggingface.co/omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian), +enhancing **Egyptian Arabic**, **Modern Standard Arabic (MSA)** and **Gulf / Levantine Arabic** for Automatic 
Speech Recognition. + +--- + +## 📚 Dataset + +It was trained on a diverse combination of publicly available and custom-collected Arabic speech datasets, including: + +- **📺 YouTube Egyptian Arabic Speech** *(custom-curated)* +- **🎧 MASC** *(Media Arabic Speech Corpus)* +- **🌍 Common Voice 15 - Arabic** +- **📻 MGB-3 Broadcast Speech** +- **🗂️ Arabic Speech Corpus** + +--- + +## 🔥 Model Highlights + +- 📌 Focused on real-life Egyptian Arabic speech (YouTube, spontaneous, conversational) +- 🚀 Supports MSA and other Arabic dialects. +- 🔉 Trained on both scripted and natural speech + +--- + +## 💬 Languages & Dialects + +| Dialect | Coverage | +| ---------------------------- | ------------ | +| Egyptian Arabic | ✅ Primary | +| Modern Standard Arabic (MSA) | ✅ Supported | +| Gulf / Levantine | ✅ Supported | + +--- + +## 🚀 Usage + +```python +from transformers import pipeline + +asr = pipeline("automatic-speech-recognition", model="IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53") +asr("path/to/audio.wav") + +# Long-Form Transcription: https://huggingface.co/blog/asr-chunking +asr = pipeline("automatic-speech-recognition", model="IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53", chunk_length_s=30) +asr("path/to/audio.wav") +``` + +```python +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import torch +import torchaudio + +model = Wav2Vec2ForCTC.from_pretrained("IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53") +processor = Wav2Vec2Processor.from_pretrained("IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53") + +# Load audio (must be mono, 16kHz) +waveform, sr = torchaudio.load("path/to/audio.wav") + +# Convert to mono if not already +if waveform.shape[0] > 1: + waveform = torch.mean(waveform, dim=0, keepdim=True) + +# Resample if needed to 16 kHz +if sr != 16000: + resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000) + waveform = resampler(waveform) + +inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt") + +with 
torch.inference_mode(): + logits = model(**inputs).logits + +predicted_ids = torch.argmax(logits, dim=-1) +transcription = processor.batch_decode(predicted_ids) +print(transcription) +``` + +--- + +## 🧪 Evaluation + +```python +import torch +import torchaudio +import re +from datasets import load_dataset +from evaluate import load +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor + +# Device setup +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +# 🔑 Replace with your Hugging Face token and the desired Wav2Vec2-based model ID +HF_TOKEN = "your_hf_token" +MODEL_NAME = "your_model_name_or_path" + +# Load the Common Voice 17.0 Arabic test split +test_dataset = load_dataset( + "mozilla-foundation/common_voice_17_0", + "ar", + split="test", + token=HF_TOKEN +) + +# Load WER metric +wer = load("wer") + +# Load processor and model +processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME, token=HF_TOKEN) +model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, token=HF_TOKEN).to(device) + +# Define regex for cleaning up unwanted characters +CHARS_TO_IGNORE_REGEX = r'[\؛\—\_get\«\»\ـ\,\?\.\!\-\;\:"\“\%\‘\”\�\#\،\☭,\؟]' + +def preprocess(batch): + """Removes unwanted characters and resamples audio to 16kHz.""" + batch["sentence"] = re.sub(CHARS_TO_IGNORE_REGEX, "", batch["sentence"]) + speech_array, sampling_rate = torchaudio.load(batch["path"]) + resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +# Apply preprocessing +test_dataset = test_dataset.map(preprocess) + +def predict(batch): + """Runs inference and decodes predicted text.""" + inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) + + with torch.inference_mode(): + logits = model( + input_values=inputs["input_values"].to(device), + attention_mask=inputs["attention_mask"].to(device) + ).logits + + predicted_ids = torch.argmax(logits, 
dim=-1) + batch["pred_strings"] = processor.batch_decode(predicted_ids) + return batch + +# Run prediction +result = test_dataset.map(predict, batched=True, batch_size=8) + +# Compute and print Word Error Rate +wer_score = wer.compute(predictions=result["pred_strings"], references=result["sentence"]) +print(f"WER: {wer_score * 100:.2f}%") +``` + +--- + +## 🗣️ Model Comparison on Common Voice 17.0 Arabic Subset (Test Set) + +| **Model** | **WER (%)** | +| -------------------------------------------------- | ----------: | +| **`IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53`** | **27.20** | +| `jonatasgrosman/wav2vec2-large-xlsr-53-arabic` | 45.55 | +| `AndrewMcDowell/wav2vec2-xls-r-300m-arabic` | 47.22 | +| `openai/whisper-large-v3`* | 52.36 | +| `Ahmed107/hamsa-v0.6Q`* | 53.27 | +| `nadsoft/hamsa-v0.1-beta`* | 65.60 | +| `openai/whisper-medium`* | 67.75 | +| `openai/whisper-small`* | 74.16 | +| `omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian` | 91.82 | +| `arbml/wav2vec2-large-xlsr-53-arabic-egyptian` | 93.92 | +| `mboushaba/whisper-large-v3-turbo-arabic`* | 96.90 | + +\*: *Whisper models were decoded using beam search (`beam_size = 5`) and evaluated using `BasicTextNormalizer` with `remove_diacritics=False` and `split_letters=False`, applied to both predictions and reference text.* + +--- + +## ✨ Citation + +If you want to cite this model you can use this: + +```bibtex +@misc{amin2025egyptianasr, + title={Egyptian Arabic ASR with wav2vec2 XLSR 53}, + author={Ibrahim Amin}, + year={2025}, + howpublished={\url{https://huggingface.co/IbrahimAmin/egyptian-arabic-wav2vec2-xlsr-53}}, +} +``` \ No newline at end of file diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..dc44d27 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "</s>": 42, + "<s>": 41 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..a55b6a8 --- /dev/null +++ b/config.json @@ -0,0 +1,109 @@ +{ + "_name_or_path": 
"omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.0, + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 40, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.35.0", + "use_weighted_layer_sum": false, + "vocab_size": 43, + 
"xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..1de3ea8 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d694bc610f9850d3ea4a1ef5de8fb27c49c333c015545715972b36f2432824 +size 1261979588 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..67dee51 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1c2a036 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "[PAD]", + "unk_token": "[UNK]" +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..6cbb96a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "39": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "40": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "41": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "42": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "</s>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2Processor", + 
"replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..81c2d1d --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3410c18d918bbb020b7fed2ecde45e0536684b0ff5c712d0c8b289181dc6824c +size 4728 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..f026754 --- /dev/null +++ b/vocab.json @@ -0,0 +1,43 @@ +{ + "[PAD]": 40, + "[UNK]": 39, + "|": 0, + "ء": 1, + "آ": 2, + "أ": 3, + "إ": 4, + "ئ": 5, + "ا": 6, + "ب": 7, + "ة": 8, + "ت": 9, + "ث": 10, + "ج": 11, + "ح": 12, + "خ": 13, + "د": 14, + "ذ": 15, + "ر": 16, + "ز": 17, + "س": 18, + "ش": 19, + "ص": 20, + "ض": 21, + "ط": 22, + "ظ": 23, + "ع": 24, + "غ": 25, + "ف": 26, + "ق": 27, + "ك": 28, + "ل": 29, + "م": 30, + "ن": 31, + "ه": 32, + "و": 33, + "ى": 34, + "ي": 35, + "ڨ": 36, + "ی": 37, + "ﻻ": 38 +}