初始化项目,由ModelHub XC社区提供模型
Model: saheedniyi/YarnGPT-local Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
audio/YarnGPT-Local.mp4 filter=lfs diff=lfs merge=lfs -text
|
||||||
403
README.md
Normal file
403
README.md
Normal file
@@ -0,0 +1,403 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
language:
|
||||||
|
- yo
|
||||||
|
- ig
|
||||||
|
- ha
|
||||||
|
base_model:
|
||||||
|
- HuggingFaceTB/SmolLM2-360M
|
||||||
|
- saheedniyi/YarnGPT
|
||||||
|
pipeline_tag: text-to-speech
|
||||||
|
license: cc-by-nc-sa-4.0
|
||||||
|
---
|
||||||
|
|
||||||
|
# YarnGPT-local
|
||||||
|

|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Model Summary](#model-summary)
|
||||||
|
2. [Model Description](#model-description)
|
||||||
|
3. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
|
||||||
|
- [Recommendations](#recommendations)
|
||||||
|
4. [Speech Samples](#speech-samples)
|
||||||
|
5. [Training](#training)
|
||||||
|
6. [Future Improvements](#future-improvements)
|
||||||
|
7. [Citation](#citation)
|
||||||
|
8. [Credits & References](#credits--references)
|
||||||
|
|
||||||
|
## Model Summary
|
||||||
|
|
||||||
|
YarnGPT-local is a text-to-speech (TTS) model designed to synthesize Yoruba, Igbo and Hausa leveraging pure language modelling without external adapters or complex architectures, offering high-quality, natural, and culturally relevant speech synthesis for diverse applications.
|
||||||
|
|
||||||
|
<video controls width="600">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/YarnGPT-Local.mp4" type="video/mp4">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
|
||||||
|
#### How to use (on Google Colab)
|
||||||
|
The model can generate audio on its own, but it's better to use a voice to prompt the model; there are about 10 voices supported by default:
|
||||||
|
- hausa_female1
|
||||||
|
- hausa_female2
|
||||||
|
- hausa_male1
|
||||||
|
- hausa_male2
|
||||||
|
- igbo_female1
|
||||||
|
- igbo_female2
|
||||||
|
- igbo_male2
|
||||||
|
- yoruba_female1
|
||||||
|
- yoruba_female2
|
||||||
|
- yoruba_male2
|
||||||
|
|
||||||
|
### Prompt YarnGPT-local
|
||||||
|
```python
|
||||||
|
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
||||||
|
!git clone https://github.com/saheedniyi02/yarngpt.git
|
||||||
|
|
||||||
|
|
||||||
|
# install some necessary libraries
|
||||||
|
!pip install outetts==0.2.3 uroman
|
||||||
|
|
||||||
|
#import some important packages
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import inflect
|
||||||
|
import random
|
||||||
|
import uroman as ur
|
||||||
|
import numpy as np
|
||||||
|
import torchaudio
|
||||||
|
import IPython
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from outetts.wav_tokenizer.decoder import WavTokenizer
|
||||||
|
from yarngpt.audiotokenizer import AudioTokenizerForLocal
|
||||||
|
|
||||||
|
|
||||||
|
# download the wavtokenizer weights and config (to encode and decode the audio)
|
||||||
|
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
||||||
|
!gdown 1-ASeEkrn4HY49yZWHTASgfGFNXdVnLTt
|
||||||
|
|
||||||
|
# model path and wavtokenizer weight path (the paths are assumed based on Google colab, a different environment might save the weights to a different location).
|
||||||
|
hf_path="saheedniyi/YarnGPT-local"
|
||||||
|
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
||||||
|
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
||||||
|
|
||||||
|
# create the AudioTokenizer object
|
||||||
|
audio_tokenizer=AudioTokenizerForLocal(
|
||||||
|
hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
||||||
|
)
|
||||||
|
|
||||||
|
#load the model weights
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)
|
||||||
|
|
||||||
|
# your input text
|
||||||
|
text="Ẹ maa rii pe lati bi ọsẹ meloo kan ni ijiroro ti wa lati ọdọ awọn ileeṣẹ wọnyi wi pe wọn fẹẹ ṣafikun si owo ipe pẹlu ida ọgọrun-un."
|
||||||
|
|
||||||
|
# creating a prompt, when creating a prompt, there is an optional `speaker_name` parameter
|
||||||
|
prompt=audio_tokenizer.create_prompt(text,"yoruba","yoruba_male2")
|
||||||
|
|
||||||
|
# tokenize the prompt
|
||||||
|
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
||||||
|
|
||||||
|
# generate output from the model, you can tune the `.generate` parameters as you wish
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
temperature=0.1,
|
||||||
|
repetition_penalty=1.1,
|
||||||
|
num_beams=4,
|
||||||
|
max_length=4000,
|
||||||
|
)
|
||||||
|
|
||||||
|
# convert the output to "audio codes"
|
||||||
|
codes=audio_tokenizer.get_codes(output)
|
||||||
|
|
||||||
|
# converts the codes to audio
|
||||||
|
audio=audio_tokenizer.get_audio(codes)
|
||||||
|
|
||||||
|
# play the audio
|
||||||
|
IPython.display.Audio(audio,rate=24000)
|
||||||
|
|
||||||
|
# save the audio
|
||||||
|
torchaudio.save(f"audio.wav", audio, sample_rate=24000)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Simple News-Reader for Local languages
|
||||||
|
```python
|
||||||
|
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
||||||
|
!git clone https://github.com/saheedniyi02/yarngpt.git
|
||||||
|
|
||||||
|
|
||||||
|
# install some necessary libraries
|
||||||
|
!pip install outetts uroman trafilatura pydub
|
||||||
|
|
||||||
|
|
||||||
|
#import important packages
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import inflect
|
||||||
|
import random
|
||||||
|
import requests
|
||||||
|
import trafilatura
|
||||||
|
import inflect
|
||||||
|
import uroman as ur
|
||||||
|
import numpy as np
|
||||||
|
import torchaudio
|
||||||
|
import IPython
|
||||||
|
from pydub import AudioSegment
|
||||||
|
from pydub.effects import normalize
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
from outetts.wav_tokenizer.decoder import WavTokenizer
|
||||||
|
from yarngpt.audiotokenizer import AudioTokenizer,AudioTokenizerForLocal
|
||||||
|
|
||||||
|
# download the `WavTokenizer` files
|
||||||
|
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
||||||
|
!gdown 1-ASeEkrn4HY49yZWHTASgfGFNXdVnLTt
|
||||||
|
|
||||||
|
tokenizer_path="saheedniyi/YarnGPT-local"
|
||||||
|
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
||||||
|
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
||||||
|
|
||||||
|
|
||||||
|
audio_tokenizer=AudioTokenizerForLocal(
|
||||||
|
tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
||||||
|
)
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
|
||||||
|
|
||||||
|
# Split text into chunks
|
||||||
|
def split_text_into_chunks(text, word_limit=25):
|
||||||
|
sentences=[sentence.strip() for sentence in text.split('.') if sentence.strip()]
|
||||||
|
chunks=[]
|
||||||
|
for sentence in sentences:
|
||||||
|
chunks.append(".")
|
||||||
|
sentence_splitted=sentence.split(" ")
|
||||||
|
num_words=len(sentence_splitted)
|
||||||
|
start_index=0
|
||||||
|
if num_words>word_limit:
|
||||||
|
while start_index<num_words:
|
||||||
|
end_index=min(num_words,start_index+word_limit)
|
||||||
|
chunks.append(" ".join(sentence_splitted[start_index:start_index+word_limit]))
|
||||||
|
start_index=end_index
|
||||||
|
else:
|
||||||
|
chunks.append(sentence)
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
# reduce the speed of the audio, results from the local languages are always fast
|
||||||
|
def speed_change(sound, speed=0.9):
|
||||||
|
# Manually override the frame_rate. This tells the computer how many
|
||||||
|
# samples to play per second
|
||||||
|
sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
|
||||||
|
"frame_rate": int(sound.frame_rate * speed)
|
||||||
|
})
|
||||||
|
# convert the sound with altered frame rate to a standard frame rate
|
||||||
|
# so that regular playback programs will work right. They often only
|
||||||
|
# know how to play audio at standard frame rate (like 44.1k)
|
||||||
|
return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
|
||||||
|
|
||||||
|
|
||||||
|
page=requests.get("https://alaroye.org/a-maa-too-fo-ipinle-ogun-mo-omo-egbe-okunkun-meje-lowo-ti-te-bayii-omolola/")
|
||||||
|
content=trafilatura.extract(page.text)
|
||||||
|
chunks=split_text_into_chunks(content)
|
||||||
|
|
||||||
|
|
||||||
|
all_codes=[]
|
||||||
|
for i,chunk in enumerate(chunks):
|
||||||
|
print(i)
|
||||||
|
print("\n")
|
||||||
|
print(chunk)
|
||||||
|
if chunk==".":
|
||||||
|
#add silence for 0.5 seconds if we encounter a full stop
|
||||||
|
all_codes.extend([453]*38)
|
||||||
|
else:
|
||||||
|
prompt=audio_tokenizer.create_prompt(chunk,lang="yoruba",speaker_name="yoruba_female2")
|
||||||
|
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
temperature=0.1,
|
||||||
|
repetition_penalty=1.1,
|
||||||
|
max_length=4000,
|
||||||
|
num_beams=5,
|
||||||
|
)
|
||||||
|
codes=audio_tokenizer.get_codes(output)
|
||||||
|
all_codes.extend(codes)
|
||||||
|
|
||||||
|
|
||||||
|
audio=audio_tokenizer.get_audio(all_codes)
|
||||||
|
|
||||||
|
#display the output
|
||||||
|
IPython.display.Audio(audio,rate=24000)
|
||||||
|
|
||||||
|
#save audio
|
||||||
|
torchaudio.save(f"news1.wav", audio, sample_rate=24000)
|
||||||
|
|
||||||
|
#convert file to an `AudioSegment` object for further processing
|
||||||
|
audio_dub=AudioSegment.from_file("news1.wav")
|
||||||
|
|
||||||
|
# reduce audio speed: it reduces quality also
|
||||||
|
speed_change(audio_dub,0.9)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Model Description
|
||||||
|
|
||||||
|
- **Developed by:** [Saheedniyi](https://linkedin.com/in/azeez-saheed)
|
||||||
|
- **Model type:** Text-to-Speech
|
||||||
|
- **Language(s) (NLP):** Igbo, Yoruba, Hausa--> Speech
|
||||||
|
- **Finetuned from:** [HuggingFaceTB/SmolLM2-360M](https://huggingface.co/HuggingFaceTB/SmolLM2-360M)
|
||||||
|
- **Repository:** [YarnGPT Github Repository](https://github.com/saheedniyi02/yarngpt)
|
||||||
|
- **Paper:** IN PROGRESS.
|
||||||
|
- **Demo:** 1) [Prompt YarnGPT-local notebook](https://colab.research.google.com/drive/1UWeirECQbjFGib1SqpiDdkzS1Bi_vi9i?usp=sharing)
|
||||||
|
2) [Simple news reader: YarnGPT-local](https://colab.research.google.com/drive/1CMsLVsDaX2u4YUtV01fOvnDCtCC59bNe?usp=sharing)
|
||||||
|
|
||||||
|
#### Uses
|
||||||
|
|
||||||
|
Generate yoruba, igbo and hausa speech for experimental purposes.
|
||||||
|
|
||||||
|
|
||||||
|
#### Out-of-Scope Use
|
||||||
|
|
||||||
|
The model is not suitable for generating speech in languages other than Yoruba, Igbo and Hausa.
|
||||||
|
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
- The model may not capture the full diversity of Nigerian accents and could exhibit biases based on the training dataset.
|
||||||
|
- The audio generated by the model is sometimes very fast and might need some post-processing.
|
||||||
|
- The model doesn't take 'intonations' into account, which sometimes leads to mispronunciation of some words.
|
||||||
|
- The model doesn't respond to some prompts.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. Feedback and diverse training data contributions are encouraged.
|
||||||
|
## Speech Samples
|
||||||
|
|
||||||
|
Listen to samples generated by YarnGPT:
|
||||||
|
|
||||||
|
<div style="margin-top: 20px;">
|
||||||
|
<table style="width: 100%; border-collapse: collapse;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 40%;">Input</th>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 40%;">Audio</th>
|
||||||
|
<th style="border: 1px solid #ddd; padding: 8px; text-align: left; width: 10%;">Notes</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">Ẹ maa rii pe lati bi ọsẹ meloo kan ni ijiroro ti wa lati ọdọ awọn ileeṣẹ wọnyi wi pe wọn fẹẹ ṣafikun si owo ipe pẹlu ida ọgọrun-un</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_yor.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: yoruba_male2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;"> Iwadii fihan pe ọkan lara awọn eeyan meji yii lo ṣee si ja sinu tanki epo disu naa lasiko to n ṣiṣẹ lọwọ.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample2_yor.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: yoruba_female1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;"> Shirun da gwamnati mai ci yanzu ta yi wajen kin bayani a akan halin da ake ciki a game da batun kidayar shi ne ya janyo wannan zargi da jam'iyyar ta Labour ta yi.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_hau.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: hausa_male2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">A lokuta da dama yakan fito a matsayin jarumin da ke taimaka wa babban jarumi, kodayake a wasu fina-finan yakan fito a matsayin babban jarumi.</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample2_hau.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: hausa_female1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">Amụma ndị ọzọ o buru gụnyere inweta ihe zuru oke, ịmụta ụmụaka nye ndị na-achọ nwa</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">
|
||||||
|
<audio controls style="width: 100%;">
|
||||||
|
<source src="https://huggingface.co/saheedniyi/YarnGPT-local/resolve/main/audio/Sample1_igb.wav" type="audio/wav">
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
|
</td>
|
||||||
|
<td style="border: 1px solid #ddd; padding: 8px;">(temperature=0.1, repetition_penalty=1.1,num_beams=4), voice: igbo_female1</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
## Training
|
||||||
|
|
||||||
|
#### Data
|
||||||
|
Trained on open source dataset on Yoruba, Igbo and Hausa.
|
||||||
|
|
||||||
|
#### Preprocessing
|
||||||
|
|
||||||
|
Audio files were preprocessed and resampled to 24 kHz and tokenized using [wavtokenizer](https://huggingface.co/novateur/WavTokenizer).
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
- **Number of epochs:** 5
|
||||||
|
- **batch_size:** 4
|
||||||
|
- **Scheduler:** linear schedule with warmup for 4 epochs, then linear decay to zero for the last epoch
|
||||||
|
- **Optimizer:** AdamW (betas=(0.9, 0.95),weight_decay=0.01)
|
||||||
|
- **Learning rate:** 1*10^-3
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
- **GPUs:** 1 A100 (google colab: 30 hours)
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
- **Training Framework:** Pytorch
|
||||||
|
|
||||||
|
## Future Improvements?
|
||||||
|
- Scaling up model size and training data
|
||||||
|
- Wrap the model around an API endpoint
|
||||||
|
- Voice cloning.
|
||||||
|
- Potential expansion into speech-to-speech assistant models
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
#### BibTeX:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@misc{yarngpt2025,
|
||||||
|
author = {Saheed Azeez},
|
||||||
|
title = {YarnGPT: Nigerian-Accented English Text-to-Speech Model},
|
||||||
|
year = {2025},
|
||||||
|
publisher = {Hugging Face},
|
||||||
|
url = {https://huggingface.co/SaheedAzeez/yarngpt}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### APA:
|
||||||
|
|
||||||
|
```python
|
||||||
|
Saheed Azeez. (2025). YarnGPT-local: Nigerian languages Text-to-Speech Model. Hugging Face. Available at: https://huggingface.co/saheedniyi/YarnGPT-local
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Credits & References
|
||||||
|
- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M/)
|
||||||
|
- [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
|
||||||
|
- [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
|
||||||
|
- [Voicera](https://huggingface.co/Lwasinam/voicera)
|
||||||
3036
added_tokens.json
Normal file
3036
added_tokens.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
audio/Sample1_hau.wav
Normal file
BIN
audio/Sample1_hau.wav
Normal file
Binary file not shown.
BIN
audio/Sample1_igb.wav
Normal file
BIN
audio/Sample1_igb.wav
Normal file
Binary file not shown.
BIN
audio/Sample1_yor.wav
Normal file
BIN
audio/Sample1_yor.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_hau.wav
Normal file
BIN
audio/Sample2_hau.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_igb.wav
Normal file
BIN
audio/Sample2_igb.wav
Normal file
Binary file not shown.
BIN
audio/Sample2_yor.wav
Normal file
BIN
audio/Sample2_yor.wav
Normal file
Binary file not shown.
3
audio/YarnGPT-Local.mp4
Normal file
3
audio/YarnGPT-Local.mp4
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:2b4fdd6811b48195d578a108ff7b4e74075027d0e97b26a9e8453fb6d217c618
|
||||||
|
size 10598161
|
||||||
32
config.json
Normal file
32
config.json
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "saheedniyi/yih2",
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"eos_token_id": 0,
|
||||||
|
"head_dim": 64,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 960,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 2560,
|
||||||
|
"is_llama_config": true,
|
||||||
|
"max_position_embeddings": 8192,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 15,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 5,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_interleaved": false,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 100000,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.47.1",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 53248
|
||||||
|
}
|
||||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"eos_token_id": 0,
|
||||||
|
"transformers_version": "4.47.1"
|
||||||
|
}
|
||||||
48901
merges.txt
Normal file
48901
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:7e0b051e69263daa6d1d56d3ec6eab95900535142f61745008a70b9a1ffddeea
|
||||||
|
size 731539240
|
||||||
49
special_tokens_map.json
Normal file
49
special_tokens_map.json
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|endoftext|>",
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<repo_name>",
|
||||||
|
"<reponame>",
|
||||||
|
"<file_sep>",
|
||||||
|
"<filename>",
|
||||||
|
"<gh_stars>",
|
||||||
|
"<issue_start>",
|
||||||
|
"<issue_comment>",
|
||||||
|
"<issue_closed>",
|
||||||
|
"<jupyter_start>",
|
||||||
|
"<jupyter_text>",
|
||||||
|
"<jupyter_code>",
|
||||||
|
"<jupyter_output>",
|
||||||
|
"<jupyter_script>",
|
||||||
|
"<empty_output>"
|
||||||
|
],
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
272255
tokenizer.json
Normal file
272255
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
24441
tokenizer_config.json
Normal file
24441
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user