初始化项目,由ModelHub XC社区提供模型
Model: saheedniyi/YarnGPT-local Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
audio/YarnGPT-Local.mp4 filter=lfs diff=lfs merge=lfs -text
|
||||||
403
README.md
Normal file
403
README.md
Normal file
@@ -0,0 +1,403 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
language:
|
||||||
|
- yo
|
||||||
|
- ig
|
||||||
|
- ha
|
||||||
|
base_model:
|
||||||
|
- HuggingFaceTB/SmolLM2-360M
|
||||||
|
- saheedniyi/YarnGPT
|
||||||
|
pipeline_tag: text-to-speech
|
||||||
|
license: cc-by-nc-sa-4.0
|
||||||
|
---
|
||||||
|
|
||||||
|
# YarnGPT-local
|
||||||
|

|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Model Summary](#model-summary)
|
||||||
|
2. [Model Description](#model-description)
|
||||||
|
3. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
|
||||||
|
- [Recommendations](#recommendations)
|
||||||
|
4. [Speech Samples](#speech-samples)
|
||||||
|
5. [Training](#training)
|
||||||
|
6. [Future Improvements](#future-improvements)
|
||||||
|
7. [Citation](#citation)
|
||||||
|
8. [Credits & References](#credits--references)
|
||||||
|
|
||||||
|
## Model Summary
|
||||||
|
|
||||||
|
YarnGPT-local is a text-to-speech (TTS) model designed to synthesize Yoruba, Igbo and Hausa leveraging pure language modelling without external adapters or complex architectures, offering high-quality, natural, and culturally relevant speech synthesis for diverse applications.
|
||||||
|
|
||||||
|
<video controls width="600">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/YarnGPT-Local.mp4" type="video/mp4">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
|
||||||
|
#### How to use (on Google Colab)
|
||||||
|
The model can generate audio on its own, but it's better to use a voice to prompt the model; there are about 10 voices supported by default:
|
||||||
|
- hausa_female1
|
||||||
|
- hausa_female2
|
||||||
|
- hausa_male1
|
||||||
|
- hausa_male2
|
||||||
|
- igbo_female1
|
||||||
|
- igbo_female2
|
||||||
|
- igbo_male2
|
||||||
|
- yoruba_female1
|
||||||
|
- yoruba_female2
|
||||||
|
- yoruba_male2
|
||||||
|
|
||||||
|
### Prompt YarnGPT-local
|
||||||
|
```python
|
||||||
|
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
||||||
|
!git clone https://github.com/saheedniyi02/yarngpt.git
|
||||||
|
|
||||||
|
|
||||||
|
# install some necessary libraries
|
||||||
|
!pip install outetts==0.2.3 uroman
|
||||||
|
|
||||||
|
#import some important packages
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import inflect
|
||||||
|
import random
|
||||||
|
import uroman as ur
|
||||||
|
import numpy as np
|
||||||
|
import torchaudio
|
||||||
|
import IPython
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from outetts.wav_tokenizer.decoder import WavTokenizer
|
||||||
|
from yarngpt.audiotokenizer import AudioTokenizerForLocal
|
||||||
|
|
||||||
|
|
||||||
|
# download the wavtokenizer weights and config (to encode and decode the audio)
|
||||||
|
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
||||||
|
!gdown 1-ASeEkrn4HY49yZWHTASgfGFNXdVnLTt
|
||||||
|
|
||||||
|
# model path and wavtokenizer weight path (the paths are assumed based on Google colab, a different environment might save the weights to a different location).
|
||||||
|
hf_path="saheedniyi/YarnGPT-local"
|
||||||
|
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
||||||
|
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
||||||
|
|
||||||
|
# create the AudioTokenizer object
|
||||||
|
audio_tokenizer=AudioTokenizerForLocal(
|
||||||
|
hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
||||||
|
)
|
||||||
|
|
||||||
|
#load the model weights
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)
|
||||||
|
|
||||||
|
# your input text
|
||||||
|
text="Ẹ maa rii pe lati bi ọsẹ meloo kan ni ijiroro ti wa lati ọdọ awọn ileeṣẹ wọnyi wi pe wọn fẹẹ ṣafikun si owo ipe pẹlu ida ọgọrun-un."
|
||||||
|
|
||||||
|
# creating a prompt, when creating a prompt, there is an optional `speaker_name` parameter
|
||||||
|
prompt=audio_tokenizer.create_prompt(text,"yoruba","yoruba_male2")
|
||||||
|
|
||||||
|
# tokenize the prompt
|
||||||
|
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
||||||
|
|
||||||
|
# generate output from the model, you can tune the `.generate` parameters as you wish
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
temperature=0.1,
|
||||||
|
repetition_penalty=1.1,
|
||||||
|
num_beams=4,
|
||||||
|
max_length=4000,
|
||||||
|
)
|
||||||
|
|
||||||
|
# convert the output to "audio codes"
|
||||||
|
codes=audio_tokenizer.get_codes(output)
|
||||||
|
|
||||||
|
# converts the codes to audio
|
||||||
|
audio=audio_tokenizer.get_audio(codes)
|
||||||
|
|
||||||
|
# play the audio
|
||||||
|
IPython.display.Audio(audio,rate=24000)
|
||||||
|
|
||||||
|
# save the audio
|
||||||
|
torchaudio.save(f"audio.wav", audio, sample_rate=24000)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Simple News-Reader for Local languages
|
||||||
|
```python
|
||||||
|
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
||||||
|
!git clone https://github.com/saheedniyi02/yarngpt.git
|
||||||
|
|
||||||
|
|
||||||
|
# install some necessary libraries
|
||||||
|
!pip install outetts uroman trafilatura pydub
|
||||||
|
|
||||||
|
|
||||||
|
#import important packages
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import inflect
|
||||||
|
import random
|
||||||
|
import requests
|
||||||
|
import trafilatura
|
||||||
|
import inflect
|
||||||
|
import uroman as ur
|
||||||
|
import numpy as np
|
||||||
|
import torchaudio
|
||||||
|
import IPython
|
||||||
|
from pydub import AudioSegment
|
||||||
|
from pydub.effects import normalize
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from outetts.wav_tokenizer.decoder import WavTokenizer
|
||||||
|
from yarngpt.audiotokenizer import AudioTokenizer,AudioTokenizerForLocal
|
||||||
|
|
||||||
|
# download the `WavTokenizer` files
|
||||||
|
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
||||||
|
!gdown 1-ASeEkrn4HY49yZWHTASgfGFNXdVnLTt
|
||||||
|
|
||||||
|
tokenizer_path="saheedniyi/YarnGPT-local"
|
||||||
|
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
||||||
|
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
||||||
|
|
||||||
|
|
||||||
|
audio_tokenizer=AudioTokenizerForLocal(
|
||||||
|
tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
||||||
|
)
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
|
||||||
|
|
||||||
|
# Split text into chunks
|
||||||
|
def split_text_into_chunks(text, word_limit=25):
|
||||||
|
sentences=[sentence.strip() for sentence in text.split('.') if sentence.strip()]
|
||||||
|
chunks=[]
|
||||||
|
for sentence in sentences:
|
||||||
|
chunks.append(".")
|
||||||
|
sentence_splitted=sentence.split(" ")
|
||||||
|
num_words=len(sentence_splitted)
|
||||||
|
start_index=0
|
||||||
|
if num_words>word_limit:
|
||||||
|
while start_index<num_words:
|
||||||
|
end_index=min(num_words,start_index+word_limit)
|
||||||
|
chunks.append(" ".join(sentence_splitted[start_index:start_index+word_limit]))
|
||||||
|
start_index=end_index
|
||||||
|
else:
|
||||||
|
chunks.append(sentence)
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
# reduce the speed of the audio, results from the local languages are always fast
|
||||||
|
def speed_change(sound, speed=0.9):
|
||||||
|
# Manually override the frame_rate. This tells the computer how many
|
||||||
|
# samples to play per second
|
||||||
|
sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
|
||||||
|
"frame_rate": int(sound.frame_rate * speed)
|
||||||
|
})
|
||||||
|
# convert the sound with altered frame rate to a standard frame rate
|
||||||
|
# so that regular playback programs will work right. They often only
|
||||||
|
# know how to play audio at standard frame rate (like 44.1k)
|
||||||
|
return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
|
||||||
|
|
||||||
|
|
||||||
|
page=requests.get("https://alaroye.org/a-maa-too-fo-ipinle-ogun-mo-omo-egbe-okunkun-meje-lowo-ti-te-bayii-omolola/")
|
||||||
|
content=trafilatura.extract(page.text)
|
||||||
|
chunks=split_text_into_chunks(content)
|
||||||
|
|
||||||
|
|
||||||
|
all_codes=[]
|
||||||
|
for i,chunk in enumerate(chunks):
|
||||||
|
print(i)
|
||||||
|
print("\n")
|
||||||
|
print(chunk)
|
||||||
|
if chunk==".":
|
||||||
|
#add silence for 0.5 seconds if we encounter a full stop
|
||||||
|
all_codes.extend([453]*38)
|
||||||
|
else:
|
||||||
|
prompt=audio_tokenizer.create_prompt(chunk,lang="yoruba",speaker_name="yoruba_female2")
|
||||||
|
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
temperature=0.1,
|
||||||
|
repetition_penalty=1.1,
|
||||||
|
max_length=4000,
|
||||||
|
num_beams=5,
|
||||||
|
)
|
||||||
|
codes=audio_tokenizer.get_codes(output)
|
||||||
|
all_codes.extend(codes)
|
||||||
|
|
||||||
|
|
||||||
|
audio=audio_tokenizer.get_audio(all_codes)
|
||||||
|
|
||||||
|
#display the output
|
||||||
|
IPython.display.Audio(audio,rate=24000)
|
||||||
|
|
||||||
|
#save audio
|
||||||
|
torchaudio.save(f"news1.wav", audio, sample_rate=24000)
|
||||||
|
|
||||||
|
#convert file to an `AudioSegment` object for further processing
|
||||||
|
audio_dub=AudioSegment.from_file("news1.wav")
|
||||||
|
|
||||||
|
# reduce audio speed: it reduces quality also
|
||||||
|
speed_change(audio_dub,0.9)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Model Description
|
||||||
|
|
||||||
|
- **Developed by:** [Saheedniyi](https://linkedin.com/in/azeez-saheed)
|
||||||
|
- **Model type:** Text-to-Speech
|
||||||
|
- **Language(s) (NLP):** Igbo, Yoruba, Hausa--> Speech
|
||||||
|
- **Finetuned from:** [HuggingFaceTB/SmolLM2-360M](https://huggingface.co/HuggingFaceTB/SmolLM2-360M)
|
||||||
|
- **Repository:** [YarnGPT Github Repository](https://github.com/saheedniyi02/yarngpt)
|
||||||
|
- **Paper:** IN PROGRESS.
|
||||||
|
- **Demo:** 1) [Prompt YarnGPT-local notebook](https://colab.research.google.com/drive/1UWeirECQbjFGib1SqpiDdkzS1Bi_vi9i?usp=sharing)
|
||||||
|
2) [Simple news reader: YarnGPT-local](https://colab.research.google.com/drive/1CMsLVsDaX2u4YUtV01fOvnDCtCC59bNe?usp=sharing)
|
||||||
|
|
||||||
|
#### Uses
|
||||||
|
|
||||||
|
Generate yoruba, igbo and hausa speech for experimental purposes.
|
||||||
|
|
||||||
|
|
||||||
|
#### Out-of-Scope Use
|
||||||
|
|
||||||
|
The model is not suitable for generating speech in languages other than Yoruba, Igbo and Hausa.
|
||||||
|
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
- The model may not capture the full diversity of Nigerian accents and could exhibit biases based on the training dataset.
|
||||||
|
- The audio generated by the model is sometimes very fast and might need some post-processing.
|
||||||
|
- The model doesn't take 'intonations' into account, which sometimes leads to mispronunciation of some words.
|
||||||
|
- The model doesn't respond to some prompts.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. Feedback and diverse training data contributions are encouraged.
|
||||||
|
## Speech Samples
|
||||||
|
|
||||||
|
Listen to samples generated by YarnGPT:
|
||||||
|
|
||||||
|
<div style="margin-top: 20px;">
|
||||||
|
<table style="width: 100%; border-collapse: collapse;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 40%;">Input</th>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 40%;">Audio</th>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 10%;">Notes</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">Ẹ maa rii pe lati bi ọsẹ meloo kan ni ijiroro ti wa lati ọdọ awọn ileeṣẹ wọnyi wi pe wọn fẹẹ ṣafikun si owo ipe pẹlu ida ọgọrun-un</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_yor.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: yoruba_male2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;"> Iwadii fihan pe ọkan lara awọn eeyan meji yii lo ṣee si ja sinu tanki epo disu naa lasiko to n ṣiṣẹ lọwọ.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample2_yor.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: yoruba_female1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;"> Shirun da gwamnati mai ci yanzu ta yi wajen kin bayani a akan halin da ake ciki a game da batun kidayar shi ne ya janyo wannan zargi da jam'iyyar ta Labour ta yi.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_hau.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: hausa_male2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">A lokuta da dama yakan fito a matsayin jarumin da ke taimaka wa babban jarumi, kodayake a wasu fina-finan yakan fito a matsayin babban jarumi.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample2_hau.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: hausa_female1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">Amụma ndị ọzọ o buru gụnyere inweta ihe zuru oke, ịmụta ụmụaka nye ndị na-achọ nwa</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_igb.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: igbo_female1</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
## Training
|
||||||
|
|
||||||
|
#### Data
|
||||||
|
Trained on open source dataset on Yoruba, Igbo and Hausa.
|
||||||
|
|
||||||
|
#### Preprocessing
|
||||||
|
|
||||||
|
Audio files were preprocessed and resampled to 24 kHz and tokenized using [wavtokenizer](https://huggingface.co/novateur/WavTokenizer).
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
- **Number of epochs:** 5
|
||||||
|
- **batch_size:** 4
|
||||||
|
- **Scheduler:** linear schedule with warmup for 4 epochs, then linear decay to zero for the last epoch
|
||||||
|
- **Optimizer:** AdamW (betas=(0.9, 0.95),weight_decay=0.01)
|
||||||
|
- **Learning rate:** 1*10^-3
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
- **GPUs:** 1 A100 (google colab: 30 hours)
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
- **Training Framework:** Pytorch
|
||||||
|
|
||||||
|
## Future Improvements?
|
||||||
|
- Scaling up model size and training data
|
||||||
|
- Wrap the model around an API endpoint
|
||||||
|
- Voice cloning.
|
||||||
|
- Potential expansion into speech-to-speech assistant models
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
#### BibTeX:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@misc{yarngpt2025,
|
||||||
|
author = {Saheed Azeez},
|
||||||
|
title = {YarnGPT: Nigerian-Accented English Text-to-Speech Model},
|
||||||
|
year = {2025},
|
||||||
|
publisher = {Hugging Face},
|
||||||
|
url = {https://huggingface.co/SaheedAzeez/yarngpt}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### APA:
|
||||||
|
|
||||||
|
```python
|
||||||
|
Saheed Azeez. (2025). YarnGPT-local: Nigerian languages Text-to-Speech Model. Hugging Face. Available at: https://huggingface.co/saheedniyi/YarnGPT-local
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Credits & References
|
||||||
|
- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M/)
|
||||||
|
- [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
|
||||||
|
- [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
|
||||||
|
- [Voicera](https://huggingface.co/Lwasinam/voicera)
|
||||||
3036
added_tokens.json
Normal file
3036
added_tokens.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
audio/Sample1_hau.wav
Normal file
BIN
audio/Sample1_hau.wav
Normal file
Binary file not shown.
BIN
audio/Sample1_igb.wav
Normal file
BIN
audio/Sample1_igb.wav
Normal file
Binary file not shown.
BIN
audio/Sample1_yor.wav
Normal file
BIN
audio/Sample1_yor.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_hau.wav
Normal file
BIN
audio/Sample2_hau.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_igb.wav
Normal file
BIN
audio/Sample2_igb.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_yor.wav
Normal file
BIN
audio/Sample2_yor.wav
Normal file
Binary file not shown.
3
audio/YarnGPT-Local.mp4
Normal file
3
audio/YarnGPT-Local.mp4
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:2b4fdd6811b48195d578a108ff7b4e74075027d0e97b26a9e8453fb6d217c618
|
||||||
|
size 10598161
|
||||||
32
config.json
Normal file
32
config.json
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "saheedniyi/yih2",
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"eos_token_id": 0,
|
||||||
|
"head_dim": 64,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 960,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 2560,
|
||||||
|
"is_llama_config": true,
|
||||||
|
"max_position_embeddings": 8192,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 15,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 5,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_interleaved": false,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 100000,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.47.1",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 53248
|
||||||
|
}
|
||||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"eos_token_id": 0,
|
||||||
|
"transformers_version": "4.47.1"
|
||||||
|
}
|
||||||
48901
merges.txt
Normal file
48901
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:7e0b051e69263daa6d1d56d3ec6eab95900535142f61745008a70b9a1ffddeea
|
||||||
|
size 731539240
|
||||||
49
special_tokens_map.json
Normal file
49
special_tokens_map.json
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|endoftext|>",
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<repo_name>",
|
||||||
|
"<reponame>",
|
||||||
|
"<file_sep>",
|
||||||
|
"<filename>",
|
||||||
|
"<gh_stars>",
|
||||||
|
"<issue_start>",
|
||||||
|
"<issue_comment>",
|
||||||
|
"<issue_closed>",
|
||||||
|
"<jupyter_start>",
|
||||||
|
"<jupyter_text>",
|
||||||
|
"<jupyter_code>",
|
||||||
|
"<jupyter_output>",
|
||||||
|
"<jupyter_script>",
|
||||||
|
"<empty_output>"
|
||||||
|
],
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
272255
tokenizer.json
Normal file
272255
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
24441
tokenizer_config.json
Normal file
24441
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user