From 450366c17e6bb891a5a60dd7df1e5e3385698af1 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 28 May 2026 11:04:18 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: arbml/wav2vec2-large-xlsr-53-arabic-egyptian Source: Original Platform --- .gitattributes | 17 ++++++ README.md | 125 ++++++++++++++++++++++++++++++++++++++ config.json | 76 +++++++++++++++++++++++ dialects_speech_corpus.py | 108 ++++++++++++++++++++++++++++++++ flax_model.msgpack | 3 + optimizer.pt | 3 + preprocessor_config.json | 8 +++ pytorch_model.bin | 3 + scheduler.pt | 3 + special_tokens_map.json | 1 + tokenizer_config.json | 1 + trainer_state.json | 100 ++++++++++++++++++++++++++++++ training_args.bin | 3 + vocab.json | 1 + 14 files changed, 452 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 dialects_speech_corpus.py create mode 100644 flax_model.msgpack create mode 100644 optimizer.pt create mode 100644 preprocessor_config.json create mode 100644 pytorch_model.bin create mode 100644 scheduler.pt create mode 100644 special_tokens_map.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d699711 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.arrow 
filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..4cc615f --- /dev/null +++ b/README.md @@ -0,0 +1,125 @@ +--- +language: ??? +datasets: +- common_voice +tags: +- audio +- automatic-speech-recognition +- speech +- xlsr-fine-tuning-week +license: apache-2.0 +model-index: +- name: XLSR Wav2Vec2 Arabic Egyptian by Zaid + results: + - task: + name: Speech Recognition + type: automatic-speech-recognition + dataset: + name: Common Voice ??? + type: common_voice + args: ??? + metrics: + - name: Test WER + type: wer + value: ??? +--- + +# Wav2Vec2-Large-XLSR-53-Arabic-Egyptian + +Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Egyptian Arabic using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset. +When using this model, make sure that your speech input is sampled at 16kHz. + +## Usage + +The model can be used directly (without a language model) as follows: + +```python +import torch +import torchaudio +from datasets import load_dataset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor + +test_dataset = load_dataset("common_voice", "???", split="test[:2%]") + +processor = Wav2Vec2Processor.from_pretrained("Zaid/wav2vec2-large-xlsr-53-arabic-egyptian") +model = Wav2Vec2ForCTC.from_pretrained("Zaid/wav2vec2-large-xlsr-53-arabic-egyptian") + +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets.
+# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) +inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True) + +with torch.no_grad(): + logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits + +predicted_ids = torch.argmax(logits, dim=-1) + +print("Prediction:", processor.batch_decode(predicted_ids)) +print("Reference:", test_dataset["sentence"][:2]) +``` + + +## Evaluation + +The model can be evaluated as follows on the {language} test data of Common Voice. + + +```python +import torch +import torchaudio +from datasets import load_dataset, load_metric +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import re + +test_dataset = load_dataset("common_voice", "???", split="test") +wer = load_metric("wer") + +processor = Wav2Vec2Processor.from_pretrained("Zaid/wav2vec2-large-xlsr-53-arabic-egyptian") +model = Wav2Vec2ForCTC.from_pretrained("Zaid/wav2vec2-large-xlsr-53-arabic-egyptian") +model.to("cuda") + +chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]' +resampler = torchaudio.transforms.Resample(48_000, 16_000) + +# Preprocessing the datasets. +# We need to read the audio files as arrays +def speech_file_to_array_fn(batch): + batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + speech_array, sampling_rate = torchaudio.load(batch["path"]) + batch["speech"] = resampler(speech_array).squeeze().numpy() + return batch + +test_dataset = test_dataset.map(speech_file_to_array_fn) + +# Preprocessing the datasets.
+# We need to read the audio files as arrays +def evaluate(batch): + inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) + + with torch.no_grad(): + logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits + + pred_ids = torch.argmax(logits, dim=-1) + batch["pred_strings"] = processor.batch_decode(pred_ids) + return batch + +result = test_dataset.map(evaluate, batched=True, batch_size=8) + +print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))) +``` + +**Test Result**: ??? % + + +## Training + +The Common Voice `train`, `validation` datasets were used for training. + +The script used for training can be found ??? \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7ca1bdd --- /dev/null +++ b/config.json @@ -0,0 +1,76 @@ +{ + "_name_or_path": "facebook/wav2vec2-large-xlsr-53", + "activation_dropout": 0.0, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static",
"mask_feature_length": 10, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "pad_token_id": 44, + "transformers_version": "4.4.0", + "vocab_size": 45 +} diff --git a/dialects_speech_corpus.py b/dialects_speech_corpus.py new file mode 100644 index 0000000..19cfb71 --- /dev/null +++ b/dialects_speech_corpus.py @@ -0,0 +1,108 @@ +"""Arabic Speech Corpus""" + +from __future__ import absolute_import, division, print_function + +import os + +import datasets + + +_CITATION = """ +""" + +_DESCRIPTION = """\ + + +```python +import soundfile as sf + +def map_to_array(batch): + speech_array, _ = sf.read(batch["file"]) + batch["speech"] = speech_array + return batch + +dataset = dataset.map(map_to_array, remove_columns=["file"]) +``` +""" + +_URL = "mgb3.zip" +corrupt_files = ['familyKids_02_first_12min.wav','sports_04_first_12min.wav', +'cooking_05_first_12min.wav', 'moviesDrama_07_first_12min.wav','science_06_first_12min.wav', +'comedy_09_first_12min.wav','cultural_08_first_12min.wav','familyKids_11_first_12min.wav', +'science_10_first_12min.wav'] +import soundfile as sf + +class EgyptianSpeechCorpusConfig(datasets.BuilderConfig): + """BuilderConfig for EgyptianSpeechCorpus.""" + + def __init__(self, **kwargs): + """ + Args: + data_dir: `string`, the path to the folder containing the files in the + downloaded .tar + citation: `string`, citation for the data set + url: `string`, url for information about the data set + **kwargs: keyword arguments forwarded to super. 
+ """ + super(EgyptianSpeechCorpusConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs) + + +def map_to_array(batch): + start, stop = batch['segment'].split('_') + speech_array, _ = sf.read(batch["file"], start = start, stop = stop) + batch["speech"] = speech_array + return batch + +class EgyptionSpeechCorpus(datasets.GeneratorBasedBuilder): + """EgyptianSpeechCorpus dataset.""" + + BUILDER_CONFIGS = [ + EgyptianSpeechCorpusConfig(name="clean", description="'Clean' speech."), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "file": datasets.Value("string"), + "text": datasets.Value("string"), + "segment": datasets.Value("string") + } + ), + supervised_keys=("file", "text"), + homepage=_URL, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + self.archive_path = '/content/mgb3' + return [ + datasets.SplitGenerator(name="train", gen_kwargs={"archive_path": os.path.join(self.archive_path, "adapt")}), + datasets.SplitGenerator(name="dev", gen_kwargs={"archive_path": os.path.join(self.archive_path, "dev")}), + datasets.SplitGenerator(name="test", gen_kwargs={"archive_path": os.path.join(self.archive_path, "test")}), + ] + + def _generate_examples(self, archive_path): + """Yield (key, example) pairs for each text_noverlap segment whose wav file is usable.""" + text_dir = os.path.join(archive_path, "Alaa") + wav_dir = os.path.join(self.archive_path, "wav") + segments_file = os.path.join(text_dir, "text_noverlap") + # List the wav directory once up front instead of once per transcript line. + available_wavs = set(os.listdir(wav_dir)) + with open(segments_file, "r", encoding="utf-8") as f: + for _id, line in enumerate(f): + segment = line.split(' ')[0] + text = ' '.join(line.split(' ')[1:]) + wav_file = '_'.join(segment.split('_')[:4]) +'.wav' + start, stop = segment.split('_')[4:6] + wav_path = os.path.join(wav_dir, wav_file) + if (wav_file in corrupt_files) or (wav_file not in available_wavs): + continue + example = { + "file": wav_path, + "text": text, + "segment":('_').join([start, stop]) + } + yield
str(_id), example + diff --git a/flax_model.msgpack b/flax_model.msgpack new file mode 100644 index 0000000..17c9802 --- /dev/null +++ b/flax_model.msgpack @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95832f40a311c8ab065115bd0cd977d2537b69a66e798c400a0c1247e49232d9 +size 1261954772 diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000..8478a7b --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35567b797d2ad024959817f25246b0d06ececc4dd794d69da7e9334950b455aa +size 2490446215 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..0886a48 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,8 @@ +{ + "do_normalize": true, + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..4aef489 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a20efe1c5480b033037f9ee3e21e24d9a51cf765fd5b96f0375b02012664b7b0 +size 1262118359 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..53e9d97 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0818666afb82b7b432824bd91f7b1cbfd826c0bbd672aea23cda693e38746b70 +size 623 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9abf719 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]"} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..a2a8340 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", "pad_token": "[PAD]", "do_lower_case": false, 
"word_delimiter_token": "|"} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..1224250 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,100 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.46153846153846, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.08, + "learning_rate": 0.00011999999999999999, + "loss": 5.7855, + "step": 200 + }, + { + "epoch": 3.08, + "eval_loss": 3.094594717025757, + "eval_runtime": 32.9209, + "eval_samples_per_second": 7.017, + "eval_wer": 1.0, + "step": 200 + }, + { + "epoch": 6.15, + "learning_rate": 0.00023999999999999998, + "loss": 3.0293, + "step": 400 + }, + { + "epoch": 6.15, + "eval_loss": 3.005324125289917, + "eval_runtime": 32.0726, + "eval_samples_per_second": 7.202, + "eval_wer": 1.0, + "step": 400 + }, + { + "epoch": 9.23, + "learning_rate": 0.0002625, + "loss": 1.9629, + "step": 600 + }, + { + "epoch": 9.23, + "eval_loss": 1.0649160146713257, + "eval_runtime": 32.4082, + "eval_samples_per_second": 7.128, + "eval_wer": 0.7503426222019187, + "step": 600 + }, + { + "epoch": 12.31, + "learning_rate": 0.00018749999999999998, + "loss": 0.9013, + "step": 800 + }, + { + "epoch": 12.31, + "eval_loss": 0.8863689303398132, + "eval_runtime": 32.197, + "eval_samples_per_second": 7.175, + "eval_wer": 0.6196893558702604, + "step": 800 + }, + { + "epoch": 15.38, + "learning_rate": 0.0001125, + "loss": 0.6283, + "step": 1000 + }, + { + "epoch": 15.38, + "eval_loss": 0.8295900821685791, + "eval_runtime": 32.1739, + "eval_samples_per_second": 7.18, + "eval_wer": 0.5616719963453631, + "step": 1000 + }, + { + "epoch": 18.46, + "learning_rate": 3.75e-05, + "loss": 0.4995, + "step": 1200 + }, + { + "epoch": 18.46, + "eval_loss": 0.8448638319969177, + "eval_runtime": 32.5186, + "eval_samples_per_second": 7.104, + "eval_wer": 0.5415714938328003, + 
"step": 1200 + } + ], + "max_steps": 1300, + "num_train_epochs": 20, + "total_flos": 7.999595747693107e+18, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..a65a377 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f8d81898a4d31d043925ba819cf0ce4cfa60b145af57af0c27995159ac9a6b +size 2287 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..870e499 --- /dev/null +++ b/vocab.json @@ -0,0 +1 @@ +{"ذ": 0, "آ": 1, "ز": 2, "ى": 3, ")": 4, "خ": 5, "ط": 6, "ض": 7, "ج": 8, "م": 9, "ك": 10, "ء": 11, "ح": 12, "ؤ": 13, "ا": 14, "ع": 16, "ه": 17, "ق": 18, "ب": 19, "غ": 20, "ر": 21, "د": 22, "ن": 23, "#": 24, "(": 25, "ة": 26, "س": 27, "ظ": 28, "ي": 29, "ث": 30, "ئ": 31, "ش": 32, "ف": 33, "ت": 34, "ص": 35, "و": 36, "إ": 37, "،": 38, "ّ": 39, "أ": 40, "ل": 41, "؟": 42, "|": 15, "[UNK]": 43, "[PAD]": 44} \ No newline at end of file