Add MFC TTS example on Windows (#378)

This commit is contained in:
Fangjun Kuang
2023-10-21 00:13:07 +08:00
committed by GitHub
parent a69d0a950e
commit 1937717705
29 changed files with 994 additions and 22 deletions

View File

@@ -568,8 +568,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; }
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid) {
sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid);
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
float speed) {
sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid, speed);
if (audio.samples.empty()) {
return nullptr;

View File

@@ -639,7 +639,8 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
// The user has to use SherpaOnnxDestroyOfflineTtsGeneratedAudio() to free the
// returned pointer to avoid a memory leak.
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid);
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
float speed);
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
const SherpaOnnxGeneratedAudio *p);

View File

@@ -18,8 +18,8 @@ class OfflineTtsImpl {
static std::unique_ptr<OfflineTtsImpl> Create(const OfflineTtsConfig &config);
virtual GeneratedAudio Generate(const std::string &text,
int64_t sid = 0) const = 0;
virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const = 0;
};
} // namespace sherpa_onnx

View File

@@ -24,8 +24,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
model_->Punctuations(), model_->Language(),
config.model.debug) {}
GeneratedAudio Generate(const std::string &text,
int64_t sid = 0) const override {
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const override {
int32_t num_speakers = model_->NumSpeakers();
if (num_speakers == 0 && sid != 0) {
SHERPA_ONNX_LOGE(
@@ -66,7 +66,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
Ort::Value x_tensor = Ort::Value::CreateTensor(
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
Ort::Value audio = model_->Run(std::move(x_tensor), sid);
Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
std::vector<int64_t> audio_shape =
audio.GetTensorTypeAndShapeInfo().GetShape();

View File

@@ -17,7 +17,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
po->Register("vits-noise-scale-w", &noise_scale_w,
"noise_scale_w for VITS models");
po->Register("vits-length-scale", &length_scale,
"length_scale for VITS models");
"Speech speed. Larger->Slower; Smaller->faster.");
}
bool OfflineTtsVitsModelConfig::Validate() const {

View File

@@ -26,7 +26,7 @@ class OfflineTtsVitsModel::Impl {
Init(buf.data(), buf.size());
}
Ort::Value Run(Ort::Value x, int64_t sid) {
Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -48,6 +48,10 @@ class OfflineTtsVitsModel::Impl {
float length_scale = config_.vits.length_scale;
float noise_scale_w = config_.vits.noise_scale_w;
if (speed != 1 && speed > 0) {
length_scale = 1. / speed;
}
Ort::Value noise_scale_tensor =
Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1);
@@ -139,8 +143,9 @@ OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
OfflineTtsVitsModel::~OfflineTtsVitsModel() = default;
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/) {
return impl_->Run(std::move(x), sid);
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
float speed /*= 1.0*/) {
return impl_->Run(std::move(x), sid, speed);
}
int32_t OfflineTtsVitsModel::SampleRate() const { return impl_->SampleRate(); }

View File

@@ -29,7 +29,7 @@ class OfflineTtsVitsModel {
* @return Return a float32 tensor containing audio samples. You can flatten
* it to a 1-D tensor.
*/
Ort::Value Run(Ort::Value x, int64_t sid = 0);
Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0);
// Sample rate of the generated audio
int32_t SampleRate() const;

View File

@@ -28,9 +28,9 @@ OfflineTts::OfflineTts(const OfflineTtsConfig &config)
OfflineTts::~OfflineTts() = default;
GeneratedAudio OfflineTts::Generate(const std::string &text,
int64_t sid /*=0*/) const {
return impl_->Generate(text, sid);
GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/,
float speed /*= 1.0*/) const {
return impl_->Generate(text, sid, speed);
}
} // namespace sherpa_onnx

View File

@@ -43,7 +43,8 @@ class OfflineTts {
// trained using the VCTK dataset. It is not used for
// single-speaker models, e.g., models trained using the ljspeech
// dataset.
GeneratedAudio Generate(const std::string &text, int64_t sid = 0) const;
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const;
private:
std::unique_ptr<OfflineTtsImpl> impl_;

View File

@@ -40,7 +40,8 @@ void PybindOfflineTts(py::module *m) {
using PyClass = OfflineTts;
py::class_<PyClass>(*m, "OfflineTts")
.def(py::init<const OfflineTtsConfig &>(), py::arg("config"))
.def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0);
.def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0,
py::arg("speed") = 1.0);
}
} // namespace sherpa_onnx