From e076a20f8f3fc35713b2f1f44a2f2307b310276c Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 8 May 2026 11:40:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: SiangLao/xlsr-53-lao-asr Source: Original Platform --- .gitattributes | 35 ++++++++++++ README.md | 80 +++++++++++++++++++++++++++ added_tokens.json | 4 ++ config.json | 116 +++++++++++++++++++++++++++++++++++++++ model.safetensors | 3 + preprocessor_config.json | 10 ++++ special_tokens_map.json | 6 ++ tokenizer_config.json | 49 +++++++++++++++++ training_results.json | 57 +++++++++++++++++++ vocab.json | 57 +++++++++++++++++++ 10 files changed, 417 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 model.safetensors create mode 100644 preprocessor_config.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer_config.json create mode 100644 training_results.json create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs 
-text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7cb814 --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +--- +language: lo +license: apache-2.0 +tags: +- automatic-speech-recognition +- speech +- audio +- lao +- wav2vec2 +- xlsr +datasets: +- SiangLao/lao-asr-thesis-dataset +metrics: +- cer +base_model: +- facebook/wav2vec2-large-xlsr-53 +library_name: transformers +--- + +# XLSR-53 Lao ASR + +Fine-tuned XLSR-53 model for Lao automatic speech recognition, achieving 16.22% CER on test data. + +## Model Details + +This model is fine-tuned from [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) using the SiangLao/lao-asr-thesis-dataset. 
+ +### Training Configuration +- **Epochs**: 15 +- **Batch Size**: 16 +- **Learning Rate**: 1e-4 +- **Training Date**: June 3, 2025 +- **Vocabulary Size**: 55 Lao characters + special tokens + +### Performance + +| Split | CER | Loss | +|-------|-----|------| +| Test | 16.22% | 0.419 | +| Validation | 16.52% | 0.487 | + +## Usage + +```python +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +import torch +import librosa + +# Load model and processor +model = Wav2Vec2ForCTC.from_pretrained("SiangLao/xlsr-53-lao-asr") +processor = Wav2Vec2Processor.from_pretrained("SiangLao/xlsr-53-lao-asr") + +# Load audio (must be 16kHz) +audio, sr = librosa.load("audio.wav", sr=16000) + +# Process audio +inputs = processor(audio, sampling_rate=16000, return_tensors="pt") + +# Generate prediction +with torch.no_grad(): + logits = model(**inputs).logits + predicted_ids = torch.argmax(logits, dim=-1) + transcription = processor.batch_decode(predicted_ids)[0] + +# Clean transcription +transcription = transcription.replace("<unk>", " ").strip() + +print(transcription) +``` + +## Citation +```bibtex +@thesis{naovalath2025lao, + title={Lao Automatic Speech Recognition using Transfer Learning}, + author={Souphaxay Naovalath and Sounmy Chanthavong}, + advisor={Dr. 
Somsack Inthasone}, + school={National University of Laos, Faculty of Natural Sciences, Computer Science Department}, + year={2025} +} +``` \ No newline at end of file diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..3118bfe --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "</s>": 56, + "<s>": 55 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..fb1b7ae --- /dev/null +++ b/config.json @@ -0,0 +1,116 @@ +{ + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.075, + "mask_time_selection": "static", + 
"model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.52.4", + "use_weighted_layer_sum": false, + "vocab_size": 57, + "xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..ef7c081 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0e780c6fa0a9d4409d479f379a3d5829a3f32949973c2365234bd6e04909834 +size 1262041180 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..e7d53bd --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..fdafe48 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<pad>", + "unk_token": "<unk>" +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..a8fe78e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,49 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<pad>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "1": { + "content": 
"<unk>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "55": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "56": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "</s>", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "<unk>", + "word_delimiter_token": "|" +} diff --git a/training_results.json b/training_results.json new file mode 100644 index 0000000..b8aabb6 --- /dev/null +++ b/training_results.json @@ -0,0 +1,57 @@ +{ + "experiment_info": { + "model_name": "facebook/wav2vec2-large-xlsr-53", + "model_description": "XLSR-53", + "dataset_name": "h3llohihi/lao-asr-thesis-dataset", + "training_date": "2025-06-03 19:11:56", + "vocab_size": 55, + "primary_metric": "cer" + }, + "training_config": { + "dataset_name": "h3llohihi/lao-asr-thesis-dataset", + "gradient_accumulation": 1, + "weight_decay": 0.01, + "warmup_steps": 400, + "num_epochs": 15, + "max_steps": -1, + "eval_steps": 250, + "save_steps": 500, + "logging_steps": 50, + "early_stopping_patience": 6, + "dataloader_workers": 8, + "use_fp16": false, + "gradient_checkpointing": true, + "label_smoothing": 0.0, + "primary_metric": "cer", + "secondary_metric": "wer", + "logs_dir": "./logs", + "model_name": "facebook/wav2vec2-large-xlsr-53", + "description": "XLSR-53", + "output_dir": "./lao-asr-xlsr-53", + "batch_size": 16, + "learning_rate": 0.0001 + }, + "results": { + "validation": { + "cer": 0.16520911038806443, + "wer": 1.0, + "loss": 0.48708805441856384 + }, + "test": { + "cer": 
0.16219106723220977, + "wer": 0.9958333333333333, + "loss": 0.41934534907341003 + }, + "dev": { + "cer": 0.10665818490245971, + "wer": 0.975, + "loss": 0.1934221088886261 + } + }, + "training_metrics": { + "final_loss": 1.7461039689566584, + "training_time": 12829.0071, + "samples_per_second": 4.499, + "total_steps": 0 + } +} \ No newline at end of file diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..387f553 --- /dev/null +++ b/vocab.json @@ -0,0 +1,57 @@ +{ + " ": 2, + "<pad>": 0, + "<unk>": 1, + "ກ": 3, + "ຂ": 4, + "ຄ": 5, + "ງ": 6, + "ຈ": 7, + "ຊ": 8, + "ຍ": 9, + "ດ": 10, + "ຕ": 11, + "ຖ": 12, + "ທ": 13, + "ນ": 14, + "ບ": 15, + "ປ": 16, + "ຜ": 17, + "ຝ": 18, + "ພ": 19, + "ຟ": 20, + "ມ": 21, + "ຢ": 22, + "ຣ": 23, + "ລ": 24, + "ວ": 25, + "ສ": 26, + "ຫ": 27, + "ອ": 28, + "ຮ": 29, + "ະ": 30, + "ັ": 31, + "າ": 32, + "ຳ": 33, + "ິ": 34, + "ີ": 35, + "ຶ": 36, + "ື": 37, + "ຸ": 38, + "ູ": 39, + "ົ": 40, + "ຼ": 41, + "ຽ": 42, + "ເ": 43, + "ແ": 44, + "ໂ": 45, + "ໃ": 46, + "ໄ": 47, + "ໆ": 48, + "່": 49, + "້": 50, + "໋": 51, + "ໍ": 52, + "ໜ": 53, + "ໝ": 54 +}