add python API and examples for TTS (#364)
This commit is contained in:
20
.github/scripts/test-python.sh
vendored
20
.github/scripts/test-python.sh
vendored
@@ -8,6 +8,24 @@ log() {
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
log "Offline TTS test"
|
||||
|
||||
wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
|
||||
wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
|
||||
wget -qq https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--vits-model=./vits-ljs.onnx \
|
||||
--vits-lexicon=./lexicon.txt \
|
||||
--vits-tokens=./tokens.txt \
|
||||
--output-filename=./tts.wav \
|
||||
'liliana, the most beautiful and lovely assistant of our team!'
|
||||
|
||||
ls -lh ./tts.wav
|
||||
file ./tts.wav
|
||||
|
||||
rm -v vits-ljs.onnx ./lexicon.txt ./tokens.txt
|
||||
|
||||
mkdir -p /tmp/icefall-models
|
||||
dir=/tmp/icefall-models
|
||||
|
||||
@@ -171,3 +189,5 @@ rm -rf $repo
|
||||
git clone https://github.com/pkufool/sherpa-test-data /tmp/sherpa-test-data
|
||||
|
||||
python3 sherpa-onnx/python/tests/test_text2token.py --verbose
|
||||
|
||||
rm -rf /tmp/sherpa-test-data
|
||||
|
||||
9
.github/workflows/run-python-test.yaml
vendored
9
.github/workflows/run-python-test.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
python-version: "3.10"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip numpy sentencepiece==0.1.96
|
||||
python3 -m pip install --upgrade pip numpy sentencepiece==0.1.96 soundfile
|
||||
|
||||
- name: Install sherpa-onnx
|
||||
shell: bash
|
||||
@@ -65,3 +65,8 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
.github/scripts/test-python.sh
|
||||
|
||||
- uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: tts-generated-test-files
|
||||
path: tts.wav
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
project(sherpa-onnx)
|
||||
|
||||
set(SHERPA_ONNX_VERSION "1.7.21")
|
||||
set(SHERPA_ONNX_VERSION "1.8.0")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
|
||||
@@ -137,6 +137,7 @@ class BuildExtension(build_ext):
|
||||
binaries += ["sherpa-onnx-offline-websocket-server"]
|
||||
binaries += ["sherpa-onnx-online-websocket-client"]
|
||||
binaries += ["sherpa-onnx-vad-microphone"]
|
||||
binaries += ["sherpa-onnx-offline-tts"]
|
||||
|
||||
if is_windows():
|
||||
binaries += ["kaldi-native-fbank-core.dll"]
|
||||
@@ -144,6 +145,9 @@ class BuildExtension(build_ext):
|
||||
binaries += ["sherpa-onnx-core.dll"]
|
||||
binaries += ["sherpa-onnx-portaudio.dll"]
|
||||
binaries += ["onnxruntime.dll"]
|
||||
binaries += ["kaldi-decoder-core.dll"]
|
||||
binaries += ["sherpa-onnx-fst.dll"]
|
||||
binaries += ["sherpa-onnx-kaldifst-core.dll"]
|
||||
|
||||
for f in binaries:
|
||||
suffix = "" if "dll" in f else suffix
|
||||
|
||||
120
python-api-examples/offline-tts.py
Executable file
120
python-api-examples/offline-tts.py
Executable file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
"""
|
||||
This file demonstrates how to use the sherpa-onnx Python API to generate audio
|
||||
from text, i.e., text-to-speech.
|
||||
|
||||
Usage:
|
||||
|
||||
1. Download a model
|
||||
|
||||
wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
|
||||
wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
|
||||
wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--vits-model=./vits-ljs.onnx \
|
||||
--vits-lexicon=./lexicon.txt \
|
||||
--vits-tokens=./tokens.txt \
|
||||
--output-filename=./generated.wav \
|
||||
'liliana, the most beautiful and lovely assistant of our team!'
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import sherpa_onnx
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line arguments of the offline TTS example.

    Returns:
      An argparse.Namespace with the attributes vits_model, vits_lexicon,
      vits_tokens, output_filename, debug, provider, num_threads and text.
    """

    def str2bool(value):
        """Convert a command-line string such as 'true' or '0' to a bool."""
        # argparse's type=bool simply calls bool() on the raw string, so any
        # non-empty value -- including "False" and "0" -- would be parsed as
        # True. Parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays False, as it was with type=bool.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            f"Expected a boolean value, got '{value}'"
        )

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--vits-model",
        type=str,
        help="Path to vits model.onnx",
    )

    parser.add_argument(
        "--vits-lexicon",
        type=str,
        help="Path to lexicon.txt",
    )

    parser.add_argument(
        "--vits-tokens",
        type=str,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--output-filename",
        type=str,
        default="./generated.wav",
        help="Path to save generated wave",
    )

    parser.add_argument(
        "--debug",
        type=str2bool,
        default=False,
        help="True to show debug messages",
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="valid values: cpu, cuda, coreml",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "text",
        type=str,
        help="The input text to generate audio for",
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Generate audio for the text given on the command line and save it.

    The configuration is assembled from the parsed arguments, the text is
    passed to sherpa_onnx.OfflineTts.generate(), and the returned samples are
    written to args.output_filename with soundfile.
    """
    args = get_args()
    print(args)

    # Build the configuration from the innermost object outwards.
    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=args.vits_model,
        lexicon=args.vits_lexicon,
        tokens=args.vits_tokens,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider=args.provider,
        debug=args.debug,
        num_threads=args.num_threads,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(model=model_config)

    tts = sherpa_onnx.OfflineTts(tts_config)
    generated = tts.generate(args.text)

    # Save as 16-bit PCM using the sample rate reported with the audio.
    sf.write(
        args.output_filename,
        generated.samples,
        samplerate=generated.sample_rate,
        subtype="PCM_16",
    )

    print(f"Saved to {args.output_filename}")
    print(f"The text is '{args.text}'")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
4
setup.py
4
setup.py
@@ -57,12 +57,16 @@ def get_binaries_to_install():
|
||||
binaries += ["sherpa-onnx-offline-websocket-server"]
|
||||
binaries += ["sherpa-onnx-online-websocket-client"]
|
||||
binaries += ["sherpa-onnx-vad-microphone"]
|
||||
binaries += ["sherpa-onnx-offline-tts"]
|
||||
if is_windows():
|
||||
binaries += ["kaldi-native-fbank-core.dll"]
|
||||
binaries += ["sherpa-onnx-c-api.dll"]
|
||||
binaries += ["sherpa-onnx-core.dll"]
|
||||
binaries += ["sherpa-onnx-portaudio.dll"]
|
||||
binaries += ["onnxruntime.dll"]
|
||||
binaries += ["kaldi-decoder-core.dll"]
|
||||
binaries += ["sherpa-onnx-fst.dll"]
|
||||
binaries += ["sherpa-onnx-kaldifst-core.dll"]
|
||||
|
||||
exe = []
|
||||
for f in binaries:
|
||||
|
||||
@@ -21,9 +21,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
|
||||
: model_(std::make_unique<OfflineTtsVitsModel>(config.model)),
|
||||
lexicon_(config.model.vits.lexicon, config.model.vits.tokens,
|
||||
model_->Punctuations()) {
|
||||
SHERPA_ONNX_LOGE("config: %s\n", config.ToString().c_str());
|
||||
}
|
||||
model_->Punctuations()) {}
|
||||
|
||||
GeneratedAudio Generate(const std::string &text) const override {
|
||||
std::vector<int64_t> x = lexicon_.ConvertTextToTokenIds(text);
|
||||
|
||||
@@ -14,6 +14,9 @@ pybind11_add_module(_sherpa_onnx
|
||||
offline-stream.cc
|
||||
offline-tdnn-model-config.cc
|
||||
offline-transducer-model-config.cc
|
||||
offline-tts-model-config.cc
|
||||
offline-tts-vits-model-config.cc
|
||||
offline-tts.cc
|
||||
offline-whisper-model-config.cc
|
||||
offline-zipformer-ctc-model-config.cc
|
||||
online-lm-config.cc
|
||||
|
||||
32
sherpa-onnx/python/csrc/offline-tts-model-config.cc
Normal file
32
sherpa-onnx/python/csrc/offline-tts-model-config.cc
Normal file
@@ -0,0 +1,32 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-model-config.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Exposes OfflineTtsModelConfig to Python on module *m.
//
// The nested OfflineTtsVitsModelConfig class is registered before this one.
void PybindOfflineTtsModelConfig(py::module *m) {
  PybindOfflineTtsVitsModelConfig(m);

  using Config = OfflineTtsModelConfig;

  py::class_<Config> cls(*m, "OfflineTtsModelConfig");

  // Constructors: a default one and a keyword-friendly one.
  cls.def(py::init<>());
  cls.def(py::init<const OfflineTtsVitsModelConfig &, int32_t, bool,
                   const std::string &>(),
          py::arg("vits"), py::arg("num_threads") = 1,
          py::arg("debug") = false, py::arg("provider") = "cpu");

  // Fields are readable and writable from Python.
  cls.def_readwrite("vits", &Config::vits);
  cls.def_readwrite("num_threads", &Config::num_threads);
  cls.def_readwrite("debug", &Config::debug);
  cls.def_readwrite("provider", &Config::provider);

  cls.def("__str__", &Config::ToString);
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
16
sherpa-onnx/python/csrc/offline-tts-model-config.h
Normal file
16
sherpa-onnx/python/csrc/offline-tts-model-config.h
Normal file
@@ -0,0 +1,16 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-model-config.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_
|
||||
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_
|
||||
|
||||
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the OfflineTtsModelConfig Python class (and the
// OfflineTtsVitsModelConfig class it contains) on module *m.
void PybindOfflineTtsModelConfig(py::module *m);
|
||||
|
||||
}
|
||||
|
||||
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MODEL_CONFIG_H_
|
||||
27
sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc
Normal file
27
sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc
Normal file
@@ -0,0 +1,27 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Exposes OfflineTtsVitsModelConfig to Python on module *m.
void PybindOfflineTtsVitsModelConfig(py::module *m) {
  using Config = OfflineTtsVitsModelConfig;

  py::class_<Config> cls(*m, "OfflineTtsVitsModelConfig");

  // Constructors: a default one and one taking the three paths by keyword.
  cls.def(py::init<>());
  cls.def(py::init<const std::string &, const std::string &,
                   const std::string &>(),
          py::arg("model"), py::arg("lexicon"), py::arg("tokens"));

  // Fields are readable and writable from Python.
  cls.def_readwrite("model", &Config::model);
  cls.def_readwrite("lexicon", &Config::lexicon);
  cls.def_readwrite("tokens", &Config::tokens);

  cls.def("__str__", &Config::ToString);
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
16
sherpa-onnx/python/csrc/offline-tts-vits-model-config.h
Normal file
16
sherpa-onnx/python/csrc/offline-tts-vits-model-config.h
Normal file
@@ -0,0 +1,16 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-vits-model-config.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_
|
||||
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_
|
||||
|
||||
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the OfflineTtsVitsModelConfig Python class on module *m.
void PybindOfflineTtsVitsModelConfig(py::module *m);
|
||||
|
||||
}
|
||||
|
||||
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_VITS_MODEL_CONFIG_H_
|
||||
46
sherpa-onnx/python/csrc/offline-tts.cc
Normal file
46
sherpa-onnx/python/csrc/offline-tts.cc
Normal file
@@ -0,0 +1,46 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts.cc
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
#include "sherpa-onnx/python/csrc/offline-tts.h"

#include <sstream>

#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/python/csrc/offline-tts-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the GeneratedAudio Python class on module *m.
//
// The class exposes the `samples` and `sample_rate` fields and a `__str__`
// that reports the sample rate and the number of samples.
static void PybindGeneratedAudio(py::module *m) {
  using PyClass = GeneratedAudio;
  py::class_<PyClass>(*m, "GeneratedAudio")
      .def(py::init<>())
      .def_readwrite("samples", &PyClass::samples)
      .def_readwrite("sample_rate", &PyClass::sample_rate)
      // __str__ only reads the object, so take it by const reference.
      .def("__str__", [](const PyClass &self) {
        std::ostringstream os;
        os << "GeneratedAudio(sample_rate=" << self.sample_rate << ", ";
        os << "num_samples=" << self.samples.size() << ")";
        return os.str();
      });
}
|
||||
|
||||
// Registers the OfflineTtsConfig Python class on module *m.
//
// OfflineTtsModelConfig is registered before this one.
static void PybindOfflineTtsConfig(py::module *m) {
  PybindOfflineTtsModelConfig(m);

  using Config = OfflineTtsConfig;

  py::class_<Config> cls(*m, "OfflineTtsConfig");
  cls.def(py::init<>());
  cls.def(py::init<const OfflineTtsModelConfig &>(), py::arg("model"));
  cls.def_readwrite("model", &Config::model);
  cls.def("__str__", &Config::ToString);
}
|
||||
|
||||
// Registers the text-to-speech Python API on module *m: the config classes,
// GeneratedAudio, and the OfflineTts class itself.
void PybindOfflineTts(py::module *m) {
  PybindOfflineTtsConfig(m);
  PybindGeneratedAudio(m);

  using PyClass = OfflineTts;
  py::class_<PyClass>(*m, "OfflineTts")
      .def(py::init<const OfflineTtsConfig &>(), py::arg("config"))
      // Name the argument, like every other binding here, so that
      // tts.generate(text=...) works in addition to the positional form.
      .def("generate", &PyClass::Generate, py::arg("text"));
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
16
sherpa-onnx/python/csrc/offline-tts.h
Normal file
16
sherpa-onnx/python/csrc/offline-tts.h
Normal file
@@ -0,0 +1,16 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_
|
||||
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_
|
||||
|
||||
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the OfflineTts Python class, together with OfflineTtsConfig,
// GeneratedAudio and the model config classes, on module *m.
void PybindOfflineTts(py::module *m);
|
||||
|
||||
}
|
||||
|
||||
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_H_
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "sherpa-onnx/python/csrc/offline-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-recognizer.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-stream.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-tts.h"
|
||||
#include "sherpa-onnx/python/csrc/online-lm-config.h"
|
||||
#include "sherpa-onnx/python/csrc/online-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/online-recognizer.h"
|
||||
@@ -45,6 +46,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
|
||||
PybindVadModel(&m);
|
||||
PybindCircularBuffer(&m);
|
||||
PybindVoiceActivityDetector(&m);
|
||||
|
||||
PybindOfflineTts(&m);
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -2,6 +2,10 @@ from _sherpa_onnx import (
|
||||
CircularBuffer,
|
||||
Display,
|
||||
OfflineStream,
|
||||
OfflineTts,
|
||||
OfflineTtsConfig,
|
||||
OfflineTtsModelConfig,
|
||||
OfflineTtsVitsModelConfig,
|
||||
OnlineStream,
|
||||
SileroVadModelConfig,
|
||||
SpeechSegment,
|
||||
|
||||
Reference in New Issue
Block a user