581 lines
17 KiB
Python
Executable File
581 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation

"""
This file demonstrates how to use sherpa-onnx Python API to generate audio
from text, i.e., text-to-speech.

Different from ./offline-tts.py, this file plays back the generated audio
while the model is still generating.

Usage:

Example (1/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
 --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
 --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
 --output-filename=./generated.wav \
 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (2/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
 --vits-model=./vits-icefall-zh-aishell3/model.onnx \
 --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
 --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
 --tts-rule-fsts='./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst' \
 --sid=21 \
 --output-filename=./liubei-21.wav \
 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"

Example (3/7)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
rm sherpa-onnx-vits-zh-ll.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
 --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \
 --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \
 --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \
 --tts-rule-fsts=./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/date.fst,./sherpa-onnx-vits-zh-ll/number.fst \
 --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \
 --sid=2 \
 --output-filename=./test-2.wav \
 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"

Example (4/7)

curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx

python3 ./python-api-examples/offline-tts-play.py \
 --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
 --matcha-vocoder=./vocos-22khz-univ.onnx \
 --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
 --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
 --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
 --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
 --output-filename=./test-matcha.wav \
 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

Example (5/7)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx

python3 ./python-api-examples/offline-tts-play.py \
 --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
 --matcha-vocoder=./vocos-22khz-univ.onnx \
 --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
 --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
 --output-filename=./test-matcha-ljspeech-en.wav \
 --num-threads=2 \
 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (6/7)

(This version of kokoro supports only English)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
 --debug=1 \
 --kokoro-model=./kokoro-en-v0_19/model.onnx \
 --kokoro-voices=./kokoro-en-v0_19/voices.bin \
 --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
 --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
 --num-threads=2 \
 --sid=10 \
 --output-filename="./kokoro-10.wav" \
 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."

Example (7/7)

(This version of kokoro supports English, Chinese, etc.)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
tar xf kokoro-multi-lang-v1_0.tar.bz2
rm kokoro-multi-lang-v1_0.tar.bz2

python3 ./python-api-examples/offline-tts-play.py \
 --debug=1 \
 --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
 --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
 --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
 --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
 --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
 --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
 --num-threads=2 \
 --sid=18 \
 --output-filename="./kokoro-18-zh-en.wav" \
 "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"

You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models

Please see
https://k2-fsa.github.io/sherpa/onnx/tts/index.html
for details.
"""
|
||
|
||
import argparse
import logging
import queue
import sys
import threading
import time

import numpy as np
import sherpa_onnx
import soundfile as sf

# sounddevice is required for real-time playback; fail early with an
# actionable install hint if it is missing instead of crashing later.
try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print(" pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)
|
||
|
||
|
||
def add_vits_args(parser):
    """Register the command-line flags for VITS models on *parser*."""
    # All VITS options are plain string paths that default to empty.
    str_opt = dict(type=str, default="")

    parser.add_argument("--vits-model", help="Path to vits model.onnx", **str_opt)

    parser.add_argument("--vits-lexicon", help="Path to lexicon.txt", **str_opt)

    parser.add_argument("--vits-tokens", help="Path to tokens.txt", **str_opt)

    parser.add_argument(
        "--vits-data-dir",
        help="""Path to the dict directory of espeak-ng. If it is specified,
        --vits-lexicon and --vits-tokens are ignored""",
        **str_opt,
    )

    parser.add_argument(
        "--vits-dict-dir",
        help="Path to the dict directory for models using jieba",
        **str_opt,
    )
||
|
||
|
||
def add_matcha_args(parser):
    """Register the command-line flags for Matcha models on *parser*."""
    # All Matcha options are plain string paths that default to empty.
    str_opt = dict(type=str, default="")

    parser.add_argument(
        "--matcha-acoustic-model", help="Path to model.onnx for matcha", **str_opt
    )

    parser.add_argument(
        "--matcha-vocoder", help="Path to vocoder for matcha", **str_opt
    )

    parser.add_argument(
        "--matcha-lexicon", help="Path to lexicon.txt for matcha", **str_opt
    )

    parser.add_argument(
        "--matcha-tokens", help="Path to tokens.txt for matcha", **str_opt
    )

    parser.add_argument(
        "--matcha-data-dir",
        help="""Path to the dict directory of espeak-ng. If it is specified,
        --matcha-lexicon and --matcha-tokens are ignored""",
        **str_opt,
    )

    parser.add_argument(
        "--matcha-dict-dir",
        help="Path to the dict directory for models using jieba",
        **str_opt,
    )
|
||
|
||
|
||
def add_kokoro_args(parser):
    """Register the command-line flags for Kokoro models on *parser*."""
    # All Kokoro options are plain string paths that default to empty.
    str_opt = dict(type=str, default="")

    parser.add_argument(
        "--kokoro-model", help="Path to model.onnx for kokoro", **str_opt
    )

    parser.add_argument(
        "--kokoro-voices", help="Path to voices.bin for kokoro", **str_opt
    )

    parser.add_argument(
        "--kokoro-tokens", help="Path to tokens.txt for kokoro", **str_opt
    )

    parser.add_argument(
        "--kokoro-data-dir", help="Path to the dict directory of espeak-ng.", **str_opt
    )

    parser.add_argument(
        "--kokoro-dict-dir",
        help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
        **str_opt,
    )

    parser.add_argument(
        "--kokoro-lexicon",
        help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
        **str_opt,
    )
|
||
|
||
|
||
def get_args():
|
||
parser = argparse.ArgumentParser(
|
||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||
)
|
||
|
||
add_vits_args(parser)
|
||
add_matcha_args(parser)
|
||
add_kokoro_args(parser)
|
||
|
||
parser.add_argument(
|
||
"--tts-rule-fsts",
|
||
type=str,
|
||
default="",
|
||
help="Path to rule.fst",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--output-filename",
|
||
type=str,
|
||
default="./generated.wav",
|
||
help="Path to save generated wave",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--sid",
|
||
type=int,
|
||
default=0,
|
||
help="""Speaker ID. Used only for multi-speaker models, e.g.
|
||
models trained using the VCTK dataset. Not used for single-speaker
|
||
models, e.g., models trained using the LJ speech dataset.
|
||
""",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--debug",
|
||
type=bool,
|
||
default=False,
|
||
help="True to show debug messages",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--provider",
|
||
type=str,
|
||
default="cpu",
|
||
help="valid values: cpu, cuda, coreml",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--num-threads",
|
||
type=int,
|
||
default=1,
|
||
help="Number of threads for neural network computation",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--speed",
|
||
type=float,
|
||
default=1.0,
|
||
help="Speech speed. Larger->faster; smaller->slower",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"text",
|
||
type=str,
|
||
help="The input text to generate audio for",
|
||
)
|
||
|
||
return parser.parse_args()
|
||
|
||
|
||
# Shared state between the generation callback (invoked from C++), the
# sounddevice playback callback, and main().

# buffer saves audio samples to be played
buffer = queue.Queue()

# started is set to True once generated_audio_callback is called.
started = False

# stopped is set to True once all the text has been processed
stopped = False

# killed is set to True once ctrl + C is pressed
killed = False

# Note: When started is True, and stopped is True, and buffer is empty,
# we will exit the program since all audio samples have been played.

# Output sample rate of the TTS model; set by main() after the model is
# loaded and before the playback thread starts.
sample_rate = None

# Signaled by play_audio_callback() when playback should stop;
# play_audio() blocks on it.
event = threading.Event()

# Wall-clock time when the first chunk of generated audio arrived; used
# to report the time-to-first-audio latency.
first_message_time = None
|
||
|
||
|
||
def generated_audio_callback(samples: np.ndarray, progress: float):
    """Queue newly generated audio for playback.

    Called whenever max_num_sentences sentences have been processed.
    Note that it is passed to C++ and is invoked in C++.

    Args:
      samples:
        A 1-D np.float32 array containing audio samples
      progress:
        Generation progress reported by the engine (unused here).

    Returns:
      1 to keep generating, 0 to stop generating.
    """
    global first_message_time, started

    # Record when the very first chunk arrived (time-to-first-audio).
    if first_message_time is None:
        first_message_time = time.time()

    buffer.put(samples)

    if not started:
        logging.info("Start playing ...")
        started = True

    # Returning 0 tells the C++ side to stop generating (e.g. after Ctrl+C).
    return 0 if killed else 1
|
||
|
||
|
||
# see https://python-sounddevice.readthedocs.io/en/0.4.6/api/streams.html#sounddevice.OutputStream
def play_audio_callback(
    outdata: np.ndarray, frames: int, time, status: sd.CallbackFlags
):
    """Fill the next block of the sounddevice output stream from `buffer`.

    Args:
      outdata:
        Output buffer of shape (frames, num_channels) to be filled.
      frames:
        Number of frames requested.
      time:
        Stream timing info (unused).
      status:
        Callback status flags (unused).
    """
    # Tell play_audio() to exit once generation has finished and every
    # queued sample has been played, or immediately on Ctrl+C.
    if killed or (started and buffer.empty() and stopped):
        event.set()

    # outdata is of shape (frames, num_channels)
    if buffer.empty():
        outdata.fill(0)
        return

    # Copy queued chunks into outdata until it is full or the queue is
    # drained; n counts frames written so far.
    n = 0
    while n < frames and not buffer.empty():
        remaining = frames - n
        # Peek at the head chunk without removing it from the queue.
        k = buffer.queue[0].shape[0]

        if remaining <= k:
            # The head chunk can fill the rest of outdata: consume only
            # `remaining` samples and keep the leftover for the next call.
            outdata[n:, 0] = buffer.queue[0][:remaining]
            buffer.queue[0] = buffer.queue[0][remaining:]
            n = frames
            # Drop the head chunk if it was consumed exactly.
            if buffer.queue[0].shape[0] == 0:
                buffer.get()

            break

        # The whole head chunk fits; copy it and advance.
        outdata[n : n + k, 0] = buffer.get()
        n += k

    # Zero-pad if the queue ran dry before outdata was full.
    if n < frames:
        outdata[n:, 0] = 0
|
||
|
||
|
||
# Please see
# https://python-sounddevice.readthedocs.io/en/0.4.6/usage.html#device-selection
# for how to select a device
def play_audio():
    """Open an output stream and play queued samples until `event` is set.

    Runs in a background thread started by main(); play_audio_callback()
    pulls samples from the global `buffer` and sets `event` when playback
    should stop.
    """
    if False:
        # This if branch can be safely removed. It is here to show you how to
        # change the default output device in case you need that.
        devices = sd.query_devices()
        print(devices)

        # sd.default.device[1] is the output device, if you want to
        # select a different device, say, 3, as the output device, please
        # use self.default.device[1] = 3

        default_output_device_idx = sd.default.device[1]
        print(
            f'Use default output device: {devices[default_output_device_idx]["name"]}'
        )

    # sample_rate is filled in by main() before this thread starts; the
    # stream keeps invoking play_audio_callback until event is set.
    with sd.OutputStream(
        channels=1,
        callback=play_audio_callback,
        dtype="float32",
        samplerate=sample_rate,
        blocksize=1024,
    ):
        event.wait()

    logging.info("Exiting ...")
|
||
|
||
|
||
def main():
    """Generate speech from the command-line text, playing it back while
    generation is still in progress, then save the result to a wave file.
    """
    args = get_args()
    print(args)

    # Configure all three supported model families; unused ones keep
    # empty-string paths and are ignored by the library.
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=args.vits_model,
                lexicon=args.vits_lexicon,
                data_dir=args.vits_data_dir,
                dict_dir=args.vits_dict_dir,
                tokens=args.vits_tokens,
            ),
            matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
                acoustic_model=args.matcha_acoustic_model,
                vocoder=args.matcha_vocoder,
                lexicon=args.matcha_lexicon,
                tokens=args.matcha_tokens,
                data_dir=args.matcha_data_dir,
                dict_dir=args.matcha_dict_dir,
            ),
            kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
                model=args.kokoro_model,
                voices=args.kokoro_voices,
                tokens=args.kokoro_tokens,
                data_dir=args.kokoro_data_dir,
                dict_dir=args.kokoro_dict_dir,
                lexicon=args.kokoro_lexicon,
            ),
            provider=args.provider,
            debug=args.debug,
            num_threads=args.num_threads,
        ),
        rule_fsts=args.tts_rule_fsts,
        # Generate one sentence at a time so the callback fires early and
        # playback can start before the whole text is synthesized.
        max_num_sentences=1,
    )

    if not tts_config.validate():
        raise ValueError("Please check your config")

    logging.info("Loading model ...")
    tts = sherpa_onnx.OfflineTts(tts_config)
    logging.info("Loading model done.")

    # Publish the model's sample rate for the playback thread.
    global sample_rate
    sample_rate = tts.sample_rate

    # Start playback before generation; the thread idles until
    # generated_audio_callback queues the first samples.
    play_back_thread = threading.Thread(target=play_audio)
    play_back_thread.start()

    logging.info("Start generating ...")
    start_time = time.time()
    audio = tts.generate(
        args.text,
        sid=args.sid,
        speed=args.speed,
        callback=generated_audio_callback,
    )
    end_time = time.time()
    logging.info("Finished generating!")
    # Tell the playback callback that no more samples will arrive.
    global stopped
    stopped = True

    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        # Abort playback and wait for the thread to exit before returning.
        global killed
        killed = True
        play_back_thread.join()
        return

    elapsed_seconds = end_time - start_time
    audio_duration = len(audio.samples) / audio.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    sf.write(
        args.output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
    logging.info(f"The text is '{args.text}'")
    logging.info(
        "Time in seconds to receive the first "
        f"message: {first_message_time-start_time:.3f}"
    )
    logging.info(f"Elapsed seconds: {elapsed_seconds:.3f}")
    logging.info(f"Audio duration in seconds: {audio_duration:.3f}")
    logging.info(
        f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}"
    )

    logging.info(f"*** Saved to {args.output_filename} ***")

    print("\n >>>>>>>>> You can safely press ctrl + C to stop the play <<<<<<<<<<\n")

    # Block until every queued sample has been played (or Ctrl+C).
    play_back_thread.join()
|
||
|
||
|
||
if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
        # Setting killed makes the generation and playback callbacks stop,
        # letting the playback thread wind down cleanly.
        killed = True
        sys.exit(0)
|