2023-10-14 14:21:53 +08:00
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation
"""
This file demonstrates how to use the sherpa-onnx Python API to generate audio
from text, i.e., text-to-speech.

Different from ./offline-tts-play.py, this file does not play back the
generated audio.
2023-10-14 14:21:53 +08:00
Usage :
2023-12-02 15:35:11 +08:00
Example (1/2)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2

python3 ./python-api-examples/offline-tts.py \
 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
 --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
 --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
 --output-filename=./generated.wav \
 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
2023-10-18 10:19:10 +08:00
2023-12-02 15:35:11 +08:00
Example (2/2)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2

python3 ./python-api-examples/offline-tts.py \
 --vits-model=./vits-aishell3.onnx \
 --vits-lexicon=./lexicon.txt \
 --vits-tokens=./tokens.txt \
 --tts-rule-fsts=./rule.fst \
 --sid=21 \
 --output-filename=./liubei-21.wav \
 "勿以恶小而为之, 勿以善小而不为。惟贤惟德, 能服于人。122334"
2023-12-02 15:35:11 +08:00
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models

Please see
https://k2-fsa.github.io/sherpa/onnx/tts/index.html
for details.
2023-12-02 15:35:11 +08:00
2023-10-14 14:21:53 +08:00
"""
import argparse
2023-10-18 10:19:10 +08:00
import time
2023-10-14 14:21:53 +08:00
import sherpa_onnx
import soundfile as sf
def _str2bool(value):
    """Convert a command-line token such as ``true``/``false`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.

    Args:
      value: The raw string from the command line (a bool is returned as-is).

    Returns:
      The parsed boolean. The empty string maps to ``False``.

    Raises:
      argparse.ArgumentTypeError: If ``value`` is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value

    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False

    raise argparse.ArgumentTypeError(
        f"Expected a boolean value (e.g., true or false), got {value!r}"
    )


def get_args(argv=None):
    """Build the command-line parser for this example and parse arguments.

    Args:
      argv: Optional list of argument strings. If ``None`` (the default),
        argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
      An ``argparse.Namespace`` with the attributes ``vits_model``,
      ``vits_lexicon``, ``vits_tokens``, ``vits_data_dir``, ``tts_rule_fsts``,
      ``max_num_sentences``, ``output_filename``, ``sid``, ``debug``,
      ``provider``, ``num_threads``, ``speed`` and the positional ``text``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--vits-model",
        type=str,
        help="Path to vits model.onnx",
    )

    parser.add_argument(
        "--vits-lexicon",
        type=str,
        default="",
        help="Path to lexicon.txt",
    )

    parser.add_argument(
        "--vits-tokens",
        type=str,
        default="",
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--vits-data-dir",
        type=str,
        default="",
        help="""Path to the dict directory of espeak-ng. If it is specified,
        --vits-lexicon and --vits-tokens are ignored""",
    )

    parser.add_argument(
        "--tts-rule-fsts",
        type=str,
        default="",
        help="Path to rule.fst",
    )

    parser.add_argument(
        "--max-num-sentences",
        type=int,
        default=2,
        help="""Max number of sentences in a batch to avoid OOM if the input
        text is very long. Set it to -1 to process all the sentences in a
        single batch. A smaller value does not mean it is slower compared
        to a larger one on CPU.
        """,
    )

    parser.add_argument(
        "--output-filename",
        type=str,
        default="./generated.wav",
        help="Path to save generated wave",
    )

    parser.add_argument(
        "--sid",
        type=int,
        default=0,
        help="""Speaker ID. Used only for multi-speaker models, e.g.
        models trained using the VCTK dataset. Not used for single-speaker
        models, e.g., models trained using the LJ speech dataset.
        """,
    )

    parser.add_argument(
        "--debug",
        # Not type=bool: that would turn "--debug=False" into True.
        type=_str2bool,
        default=False,
        help="True to show debug messages",
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="valid values: cpu, cuda, coreml",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed. Larger->faster; smaller->slower",
    )

    parser.add_argument(
        "text",
        type=str,
        help="The input text to generate audio for",
    )

    return parser.parse_args(argv)
def main():
    """Generate audio for the text given on the command line and save it.

    Parses the command-line arguments, builds an ``OfflineTtsConfig`` from
    them, runs ``OfflineTts.generate`` and writes the samples to
    ``args.output_filename`` as a 16-bit PCM file. It also prints the elapsed
    time, the audio duration and the real-time factor (RTF).

    Returns:
      None. Returns early, without writing a file, if no samples were
      generated.

    Raises:
      ValueError: If ``tts_config.validate()`` reports an invalid config.
    """
    args = get_args()
    print(args)

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=args.vits_model,
                lexicon=args.vits_lexicon,
                data_dir=args.vits_data_dir,
                tokens=args.vits_tokens,
            ),
            provider=args.provider,
            debug=args.debug,
            num_threads=args.num_threads,
        ),
        rule_fsts=args.tts_rule_fsts,
        max_num_sentences=args.max_num_sentences,
    )
    if not tts_config.validate():
        raise ValueError("Please check your config")

    tts = sherpa_onnx.OfflineTts(tts_config)

    # Use a monotonic, high-resolution clock for the interval; time.time()
    # is wall-clock time and can jump if the system clock is adjusted.
    start = time.perf_counter()
    audio = tts.generate(args.text, sid=args.sid, speed=args.speed)
    end = time.perf_counter()

    # Explicit length check: it also guards the division by the audio
    # duration below against an empty result.
    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        return

    elapsed_seconds = end - start
    audio_duration = len(audio.samples) / audio.sample_rate
    # RTF < 1 means generation was faster than the audio's playback time.
    real_time_factor = elapsed_seconds / audio_duration

    sf.write(
        args.output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
    print(f"Saved to {args.output_filename}")
    print(f"The text is '{args.text}'")
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
2023-10-14 14:21:53 +08:00
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()