2023-12-02 15:35:11 +08:00
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation
"""
This file demonstrates how to use the sherpa-onnx Python API to generate audio
from text, i.e., text-to-speech.

Different from ./offline-tts.py, this file plays back the generated audio
while the model is still generating.

Usage:

Example (1/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
  --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
  --output-filename=./generated.wav \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (2/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
  --vits-model=./vits-icefall-zh-aishell3/model.onnx \
  --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
  --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
  --tts-rule-fsts='./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst' \
  --sid=21 \
  --output-filename=./liubei-21.wav \
  "勿以恶小而为之, 勿以善小而不为。惟贤惟德, 能服于人。122334"

Example (3/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
rm sherpa-onnx-vits-zh-ll.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
  --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \
  --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \
  --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \
  --tts-rule-fsts=./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/date.fst,./sherpa-onnx-vits-zh-ll/number.fst \
  --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \
  --sid=2 \
  --output-filename=./test-2.wav \
  "当夜幕降临, 星光点点, 伴随着微风拂面, 我在静谧中感受着时光的流转, 思念如涟漪荡漾, 梦境如画卷展开, 我与自然融为一体, 沉静在这片宁静的美丽之中, 感受着生命的奇迹与温柔。2024年5月11号, 拨打110或者18920240511。123456块钱。"

Example (4/7)

curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx

python3 ./python-api-examples/offline-tts-play.py \
  --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  --matcha-vocoder=./vocos-22khz-univ.onnx \
  --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  --output-filename=./test-matcha.wav \
  "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号, 拨打110或者18920240511。123456块钱。"

Example (5/7)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx

python3 ./python-api-examples/offline-tts-play.py \
  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  --matcha-vocoder=./vocos-22khz-univ.onnx \
  --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  --output-filename=./test-matcha-ljspeech-en.wav \
  --num-threads=2 \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (6/7)

(This version of kokoro supports only English)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
  --debug=1 \
  --kokoro-model=./kokoro-en-v0_19/model.onnx \
  --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  --num-threads=2 \
  --sid=10 \
  --output-filename="./kokoro-10.wav" \
  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (7/7)

(This version of kokoro supports English, Chinese, etc.)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
tar xf kokoro-multi-lang-v1_0.tar.bz2
rm kokoro-multi-lang-v1_0.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
  --debug=1 \
  --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
  --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
  --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
  --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
  --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
  --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
  --num-threads=2 \
  --sid=18 \
  --output-filename="./kokoro-18-zh-en.wav" \
  "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"

You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models

Please see
https://k2-fsa.github.io/sherpa/onnx/tts/index.html
for details.
"""
import argparse
import logging
import queue
import sys
import threading
import time
import numpy as np
import sherpa_onnx
import soundfile as sf
# sounddevice is an optional third-party dependency that is needed for
# playback. Without it this script cannot do anything useful, so print an
# installation hint and exit instead of failing later with a NameError.
try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print("pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)
2024-12-31 12:44:14 +08:00
def add_vits_args(parser):
    """Register the command-line options that describe a VITS model.

    Every option is a string option whose default is the empty string.

    Args:
      parser:
        The argparse.ArgumentParser to which the --vits-* options are added.
    """
    # (option string, help text), in the order they appear in --help.
    # The option strings must start with "--" (no surrounding whitespace),
    # otherwise argparse registers them as positional arguments.
    vits_options = (
        ("--vits-model", "Path to vits model.onnx"),
        ("--vits-lexicon", "Path to lexicon.txt"),
        ("--vits-tokens", "Path to tokens.txt"),
        (
            "--vits-data-dir",
            "Path to the dict directory of espeak-ng. If it is specified, "
            "--vits-lexicon and --vits-tokens are ignored",
        ),
        (
            "--vits-dict-dir",
            "Path to the dict directory for models using jieba",
        ),
    )

    for flag, help_text in vits_options:
        parser.add_argument(flag, type=str, default="", help=help_text)
2024-12-31 12:44:14 +08:00
def add_matcha_args(parser):
    """Register the command-line options that describe a Matcha model.

    Every option is a string option whose default is the empty string.

    Args:
      parser:
        The argparse.ArgumentParser to which the --matcha-* options are added.
    """
    # (option string, help text), in the order they appear in --help.
    # The option strings must start with "--" (no surrounding whitespace),
    # otherwise argparse registers them as positional arguments.
    matcha_options = (
        ("--matcha-acoustic-model", "Path to model.onnx for matcha"),
        ("--matcha-vocoder", "Path to vocoder for matcha"),
        ("--matcha-lexicon", "Path to lexicon.txt for matcha"),
        ("--matcha-tokens", "Path to tokens.txt for matcha"),
        (
            "--matcha-data-dir",
            "Path to the dict directory of espeak-ng. If it is specified, "
            "--matcha-lexicon and --matcha-tokens are ignored",
        ),
        (
            "--matcha-dict-dir",
            "Path to the dict directory for models using jieba",
        ),
    )

    for flag, help_text in matcha_options:
        parser.add_argument(flag, type=str, default="", help=help_text)
2025-01-16 14:24:51 +08:00
def add_kokoro_args(parser):
    """Register the command-line options that describe a Kokoro model.

    Every option is a string option whose default is the empty string.

    Args:
      parser:
        The argparse.ArgumentParser to which the --kokoro-* options are added.
    """
    # (option string, help text), in the order they appear in --help.
    # The option strings must start with "--" (no surrounding whitespace),
    # otherwise argparse registers them as positional arguments.
    kokoro_options = (
        ("--kokoro-model", "Path to model.onnx for kokoro"),
        ("--kokoro-voices", "Path to voices.bin for kokoro"),
        ("--kokoro-tokens", "Path to tokens.txt for kokoro"),
        ("--kokoro-data-dir", "Path to the dict directory of espeak-ng."),
        (
            "--kokoro-dict-dir",
            "Path to the dict directory for models using jieba. "
            "Needed only by multilingual kokoro",
        ),
        (
            "--kokoro-lexicon",
            "Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
        ),
    )

    for flag, help_text in kokoro_options:
        parser.add_argument(flag, type=str, default="", help=help_text)
2025-01-16 14:24:51 +08:00
2024-12-31 12:44:14 +08:00
def _str2bool(value):
    """Convert a command-line word such as "1", "true" or "no" to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value, including "0" and "False", would become True.

    Args:
      value:
        The raw option value. A bool is returned unchanged.
    Returns:
      Return the parsed boolean.
    Raises:
      argparse.ArgumentTypeError: If the value is not a recognised boolean
      word.
    """
    if isinstance(value, bool):
        return value

    word = value.strip().lower()
    if word in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string is kept as False, which is what bool("") returns.
    if word in ("", "0", "false", "f", "no", "n", "off"):
        return False

    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def get_args():
    """Build the command-line parser and parse sys.argv.

    The model specific options are contributed by add_vits_args(),
    add_matcha_args() and add_kokoro_args(); the options shared by all
    models are added here.

    Returns:
      Return the argparse.Namespace produced by parser.parse_args().
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    add_vits_args(parser)
    add_matcha_args(parser)
    add_kokoro_args(parser)

    parser.add_argument(
        "--tts-rule-fsts",
        type=str,
        default="",
        help="Path to rule.fst",
    )

    parser.add_argument(
        "--output-filename",
        type=str,
        default="./generated.wav",
        help="Path to save generated wave",
    )

    parser.add_argument(
        "--sid",
        type=int,
        default=0,
        help="""Speaker ID. Used only for multi-speaker models, e.g.
        models trained using the VCTK dataset. Not used for single-speaker
        models, e.g., models trained using the LJ speech dataset.
        """,
    )

    parser.add_argument(
        "--debug",
        # Not type=bool: bool("0") and bool("False") are both True.
        type=_str2bool,
        default=False,
        help="True to show debug messages",
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="valid values: cpu, cuda, coreml",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed. Larger->faster; smaller->slower",
    )

    parser.add_argument(
        "text",
        type=str,
        help="The input text to generate audio for",
    )

    return parser.parse_args()
# The variables below are module-level state shared between main(), the
# generation callback and the playback callback.

# buffer saves audio samples to be played. generated_audio_callback() puts
# chunks into it and play_audio_callback() consumes them.
buffer = queue.Queue()

# started is set to True once generated_audio_callback is called.
started = False

# stopped is set to True once all the text has been processed,
# i.e., after tts.generate() has returned in main().
stopped = False

# killed is set to True once ctrl + C is pressed. main() also sets it when
# no audio samples were generated.
killed = False

# Note: When started is True, and stopped is True, and buffer is empty,
# we will exit the program since all audio samples have been played.

# Sample rate of the generated audio. main() assigns it from the loaded
# model before the playback thread is started.
sample_rate = None

# play_audio_callback() sets this event to wake up play_audio(), which then
# closes the output stream.
event = threading.Event()
2024-03-11 11:05:42 +08:00
# Value of time.time() at the first invocation of generated_audio_callback();
# None until then. main() uses it to report the latency of the first chunk.
first_message_time = None
2023-12-02 15:35:11 +08:00
2024-03-28 18:05:40 +08:00
def generated_audio_callback(samples: np.ndarray, progress: float):
    """Queue newly generated audio samples for playback.

    This function is called whenever max_num_sentences sentences
    have been processed.

    Note that it is passed to C++ and is invoked in C++.

    Args:
      samples:
        A 1-D np.float32 array containing audio samples
      progress:
        The progress value passed in by the generator. It is not used here.
    Returns:
      Return 1 to keep generating and 0 to stop generating. 0 is returned
      once the global flag killed has been set.
    """
    global first_message_time, started

    # Remember when the very first chunk arrived so that main() can report
    # the latency of the first message.
    if first_message_time is None:
        first_message_time = time.time()

    buffer.put(samples)

    if not started:
        logging.info("Start playing ...")
        started = True

    # 1 means to keep generating
    # 0 means to stop generating
    return 0 if killed else 1
2023-12-02 15:35:11 +08:00
# see https://python-sounddevice.readthedocs.io/en/0.4.6/api/streams.html#sounddevice.OutputStream
def play_audio_callback(
    outdata: np.ndarray, frames: int, time, status: sd.CallbackFlags
):
    """Fill one output block of the sounddevice stream from the sample queue.

    Args:
      outdata:
        The output array of shape (frames, num_channels). Only channel 0 is
        written; play_audio() opens the stream with a single channel.
      frames:
        Number of frames that have to be written to outdata.
      time:
        Passed by sounddevice; not used. Note that inside this function the
        name shadows the time module.
      status:
        Passed by sounddevice; not used.
    """
    # Wake up play_audio() once ctrl + C was pressed or once everything that
    # was generated has been played.
    if killed or (started and stopped and buffer.empty()):
        event.set()

    # Nothing to play right now: output silence.
    if buffer.empty():
        outdata.fill(0)
        return

    filled = 0
    while filled < frames and not buffer.empty():
        # Peek at the oldest chunk without removing it from the queue.
        head = buffer.queue[0]
        head_len = head.shape[0]
        needed = frames - filled

        if needed <= head_len:
            # The oldest chunk alone completes this block. Keep whatever is
            # left of it at the front of the queue for the next callback.
            outdata[filled:, 0] = head[:needed]
            leftover = head[needed:]
            buffer.queue[0] = leftover
            filled = frames

            if leftover.shape[0] == 0:
                buffer.get()
            break

        # The oldest chunk is shorter than what is still needed: consume it
        # entirely and continue with the next one.
        outdata[filled : filled + head_len, 0] = buffer.get()
        filled += head_len

    # The queue ran dry before the block was complete: pad with silence.
    if filled < frames:
        outdata[filled:, 0] = 0
# Please see
# https://python-sounddevice.readthedocs.io/en/0.4.6/usage.html#device-selection
# for how to select a device
def play_audio():
    """Play the queued samples until the global event is set.

    It opens a single-channel float32 output stream that pulls its data
    through play_audio_callback() and blocks until that callback sets the
    event. main() runs this function in a separate thread.
    """
    if False:
        # This if branch can be safely removed. It is here to show you how to
        # change the default output device in case you need that.
        devices = sd.query_devices()
        print(devices)

        # sd.default.device[1] is the output device, if you want to
        # select a different device, say, 3, as the output device, please
        # use sd.default.device[1] = 3
        default_output_device_idx = sd.default.device[1]
        print(
            f'Use default output device: {devices[default_output_device_idx]["name"]}'
        )

    stream = sd.OutputStream(
        channels=1,
        callback=play_audio_callback,
        dtype="float32",
        samplerate=sample_rate,
        blocksize=1024,
    )
    with stream:
        # The stream is kept open until play_audio_callback() sets the event.
        event.wait()

    logging.info("Exiting ...")
def main():
    """Generate speech for the given text, play it and save it to a file.

    The model described by the command-line arguments is loaded, playback is
    started in a background thread, and the text is synthesized with
    generated_audio_callback() feeding that thread. Afterwards the whole
    audio is written to --output-filename as 16-bit PCM, timing statistics
    are logged, and the function waits for playback to finish.

    Raises:
      ValueError: If the TTS config fails validation.
    """
    # These flags are shared with the callbacks and the playback thread.
    global sample_rate, stopped, killed

    args = get_args()
    print(args)

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=args.vits_model,
                lexicon=args.vits_lexicon,
                data_dir=args.vits_data_dir,
                dict_dir=args.vits_dict_dir,
                tokens=args.vits_tokens,
            ),
            matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
                acoustic_model=args.matcha_acoustic_model,
                vocoder=args.matcha_vocoder,
                lexicon=args.matcha_lexicon,
                tokens=args.matcha_tokens,
                data_dir=args.matcha_data_dir,
                dict_dir=args.matcha_dict_dir,
            ),
            kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
                model=args.kokoro_model,
                voices=args.kokoro_voices,
                tokens=args.kokoro_tokens,
                data_dir=args.kokoro_data_dir,
                dict_dir=args.kokoro_dict_dir,
                lexicon=args.kokoro_lexicon,
            ),
            provider=args.provider,
            debug=args.debug,
            num_threads=args.num_threads,
        ),
        rule_fsts=args.tts_rule_fsts,
        # The callback is invoked after every sentence so that playback can
        # start before the whole text has been processed.
        max_num_sentences=1,
    )

    if not tts_config.validate():
        raise ValueError("Please check your config")

    logging.info("Loading model ...")
    tts = sherpa_onnx.OfflineTts(tts_config)
    logging.info("Loading model done.")

    # play_audio() reads sample_rate, so set it before starting the thread.
    sample_rate = tts.sample_rate

    play_back_thread = threading.Thread(target=play_audio)
    play_back_thread.start()

    logging.info("Start generating ...")
    start_time = time.time()
    try:
        audio = tts.generate(
            args.text,
            sid=args.sid,
            speed=args.speed,
            callback=generated_audio_callback,
        )
    except Exception:
        # The playback thread is not a daemon thread. Without this it would
        # wait on the event forever and the process would never exit.
        killed = True
        play_back_thread.join()
        raise
    end_time = time.time()
    logging.info("Finished generating!")

    stopped = True

    # len() is used on purpose: samples may be an array, whose truth value
    # is ambiguous.
    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        # Nothing will ever be played; tell the playback thread to exit.
        killed = True
        play_back_thread.join()
        return

    elapsed_seconds = end_time - start_time
    audio_duration = len(audio.samples) / audio.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    sf.write(
        args.output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    logging.info(f"The text is '{args.text}'")
    # first_message_time stays None if the callback was never invoked.
    if first_message_time is not None:
        logging.info(
            "Time in seconds to receive the first "
            f"message: {first_message_time - start_time:.3f}"
        )
    logging.info(f"Elapsed seconds: {elapsed_seconds:.3f}")
    logging.info(f"Audio duration in seconds: {audio_duration:.3f}")
    logging.info(
        f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}"
    )
    logging.info(f"*** Saved to {args.output_filename} ***")

    print("\n>>>>>>>>> You can safely press ctrl + C to stop the play <<<<<<<<<<\n")

    play_back_thread.join()
if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
        # Ask the generation callback to stop and the playback callback to
        # release the playback thread, so that the process can exit.
        killed = True
        sys.exit(0)