From 64421904d622d0505640268e9bed60d7136aaa1d Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 8 May 2026 11:13:05 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Cnam-LMSSC/wav2vec2-italian-phonemizer Source: Original Platform --- .gitattributes | 35 +++++++++ README.md | 149 +++++++++++++++++++++++++++++++++++++++ added_tokens.json | 4 ++ config.json | 117 ++++++++++++++++++++++++++++++ model.safetensors | 3 + preprocessor_config.json | 10 +++ pytorch_model.bin | 3 + special_tokens_map.json | 6 ++ tokenizer_config.json | 49 +++++++++++++ vocab.json | 39 ++++++++++ 10 files changed, 415 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 model.safetensors create mode 100644 preprocessor_config.json create mode 100644 pytorch_model.bin create mode 100644 special_tokens_map.json create mode 100644 tokenizer_config.json create mode 100644 vocab.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs 
diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..6f1c796 --- /dev/null +++ b/README.md @@ -0,0 +1,149 @@ +--- +library_name: transformers +license: mit +language: it +metrics: +- per +tags: +- audio +- automatic-speech-recognition +- speech +- phonemize +- phoneme +datasets: +- facebook/multilingual_librispeech +model-index: +- name: Wav2Vec2-base Italian finetuned for phonemes by LMSSC + results: + - task: + type: automatic-speech-recognition + name: Speech Recognition + dataset: + name: Multilingual Librispeech + type: facebook/multilingual_librispeech + args: it + metrics: + - type: per + value: 4.34 + name: Test PER on Multilingual Librispeech IT | Trained + - type: per + value: 4.25 + name: Val PER on Multilingual Librispeech IT | Trained +--- + +# Fine-tuned Italian Voxpopuli v2 wav2vec2-base model for speech-to-phoneme task in Italian + +Fine-tuned [facebook/wav2vec2-base-it-voxpopuli-v2](https://huggingface.co/facebook/wav2vec2-base-it-voxpopuli-v2) for **Italian speech-to-phoneme** (without language model) using the train and 
validation splits of [Multilingual Librispeech](https://huggingface.co/datasets/facebook/multilingual_librispeech). + +## Audio samplerate for usage + +When using this model, make sure that your speech input is **sampled at 16kHz**. + +## Output + +As this model is specifically trained for a speech-to-phoneme task, the output is a sequence of [IPA-encoded](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) words, without punctuation. +If you don't read the phonetic alphabet fluently, you can use this excellent [IPA reader website](http://ipa-reader.xyz) to convert the transcript back to synthetic speech in order to check the quality of the phonetic transcription. + +## Training procedure + +The model has been finetuned on Multilingual Librispeech (IT) for 30 epochs on a 1xADA_6000 GPU at Cnam/LMSSC using a ddp strategy and gradient-accumulation procedure (256 audios per update, corresponding roughly to 25 minutes of speech per update -> 2k updates per epoch) + +- Learning rate schedule : Double Tri-state schedule + - Warmup from 1e-5 for 7% of total updates + - Constant at 1e-4 for 28% of total updates + - Linear decrease to 1e-6 for 36% of total updates + - Second warmup boost to 3e-5 for 3% of total updates + - Constant at 3e-5 for 12% of total updates + - Linear decrease to 1e-7 for remaining 14% of updates + +- The hyperparameters used for training are the same as those detailed in Annex B and Table 6 of the [wav2vec2 paper](https://arxiv.org/pdf/2006.11477.pdf). + +## Usage (using the online Inference API) + +Just record your voice on the ⚡ Inference API on this webpage, and then click on "Compute", that's all! 
+ +## Usage (with HuggingSound library) + +The model can be used directly using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library: + +```python +import pandas as pd +from huggingsound import SpeechRecognitionModel + +model = SpeechRecognitionModel("Cnam-LMSSC/wav2vec2-italian-phonemizer") +audio_paths = ["./test_rilettura_testo.wav", "./10179_11051_000021.flac"] + +# No need for the Audio files to be sampled at 16 kHz here, +# they are automatically resampled by Huggingsound + +transcriptions = model.transcribe(audio_paths) + +# (Optional) Display results in a table : +## transcriptions is a list of dicts also containing timestamps and probabilities ! + +df = pd.DataFrame(transcriptions) +df['Audio file'] = pd.DataFrame(audio_paths) +df.set_index('Audio file', inplace=True) +df[['transcription']] +``` + +**Output** : + +| **Audio file** | **Phonetic transcription (IPA)** | +|:---------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------| +| ./test_rilettura_testo.wav | prezɪ lɪ kwatːrotʃɛnto fjorinɪ d̪iː ɔro e reze le debite ɡratsje al pretore sɪ parti e messɔzɪ al merkatantare divɛnne wɔmo sadʒːo e dɪ ɡran manedʒːo | +| ./10179_11051_000021.flac | la bʊɔna femina ke ɛra fʊdʒːita il tutːo vedɛva e molto sʊspeza restava e parevale ʊn ora mille annɪ dɪ fʊrarla e dɪ potɛr operare tal effɛtːo | + +## Inference script (if you do not want to use the huggingsound library) : + +```python +import torch +from transformers import AutoModelForCTC, Wav2Vec2Processor +from datasets import load_dataset +import soundfile as sf # Or Librosa if you prefer to ... + +MODEL_ID = "Cnam-LMSSC/wav2vec2-italian-phonemizer" + +model = AutoModelForCTC.from_pretrained(MODEL_ID) +processor = Wav2Vec2Processor.from_pretrained(MODEL_ID) + +audio = sf.read('example.wav') +# Make sure you have a 16 kHz sampled audio file, or resample it ! 
+ +inputs = processor(audio[0],sampling_rate=16_000., return_tensors="pt") + +with torch.no_grad(): + logits = model(**inputs).logits + +predicted_ids = torch.argmax(logits,dim = -1) +transcription = processor.batch_decode(predicted_ids) + +print("Phonetic transcription : ", transcription) +``` + +**Output** : + +'ˈsoːno ˈmolto ˈljɛːto di prezenˈtarvi la ˈnɔstra soluˈttsjone per fonemiˈddzaːre fatʃilˈmente ʎi ˈawdjo funˈtsjoːna davˈveːro ˈmolto ˈbɛːne' + +## Test Results: + +In the table below, we report the Phoneme Error Rate (PER) of the model on Multilingual Librispeech (using the Italian configs for the dataset of course) : + +| Model | Test Set | PER | +| ------------- | ------------- | ------------- | +| Cnam-LMSSC/wav2vec2-italian-phonemizer | Multilingual Librispeech (Italian) | **4.34%** | + + +## Citation +If you use this finetuned model for any publication, please use this to cite our work : + +```bibtex +@misc {lmssc-wav2vec2-base-phonemizer-italian_2026, + author = { Olivier, Malo }, + title = { wav2vec2-italian-phonemizer (Revision 4d8a3a1) }, + year = 2026, + url = { https://huggingface.co/Cnam-LMSSC/wav2vec2-italian-phonemizer }, + doi = { 10.57967/hf/7982 }, + publisher = { Hugging Face } +} +``` \ No newline at end of file diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..83392c3 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "</s>": 38, + "<s>": 37 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..f451d6c --- /dev/null +++ b/config.json @@ -0,0 +1,117 @@ +{ + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, 
+ 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "dtype": "float32", + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 36, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.57.3", + "use_weighted_layer_sum": false, + "vocab_size": 39, + "xvector_output_dim": 512 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..972dd44 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b2eb50a76a2b59adea41a67945a03d615106a8077653acc68a33086fd4a35e52 +size 377632660 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..c626b55 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..04114ac --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f1eb8a6d8ffee497ceb21fb24889ff0b84b2d5137f876eebc2fbb45f956e59 +size 377678199 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1c2a036 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "[PAD]", + "unk_token": "[UNK]" +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..5290245 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,49 @@ +{ + "added_tokens_decoder": { + "35": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "36": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "37": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "38": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "</s>", + "extra_special_tokens": {}, + 
"model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..bfad50c --- /dev/null +++ b/vocab.json @@ -0,0 +1,39 @@ +{ + "[PAD]": 36, + "[UNK]": 35, + "a": 1, + "b": 2, + "d": 3, + "e": 4, + "f": 5, + "h": 6, + "i": 7, + "j": 8, + "k": 9, + "l": 10, + "m": 11, + "n": 12, + "o": 13, + "p": 14, + "r": 15, + "s": 16, + "t": 17, + "u": 18, + "v": 19, + "w": 20, + "z": 21, + "|": 0, + "ŋ": 22, + "ɔ": 23, + "ɛ": 24, + "ɡ": 25, + "ɪ": 26, + "ɲ": 27, + "ɾ": 28, + "ʃ": 29, + "ʊ": 30, + "ʎ": 31, + "ʒ": 32, + "ː": 33, + "̪": 34 +}