Add MFC TTS example on Windows (#378)

This commit is contained in:
Fangjun Kuang
2023-10-21 00:13:07 +08:00
committed by GitHub
parent a69d0a950e
commit 1937717705
29 changed files with 994 additions and 22 deletions

View File

@@ -18,8 +18,8 @@ class OfflineTtsImpl {
static std::unique_ptr<OfflineTtsImpl> Create(const OfflineTtsConfig &config);
virtual GeneratedAudio Generate(const std::string &text,
int64_t sid = 0) const = 0;
// Generates audio for `text` using speaker id `sid` and speed factor `speed`.
// The VITS override in this change passes `speed` on to its model's Run().
// NOTE(review): default arguments on a virtual function are selected from the
// static type of the call expression, not from the override that runs, so
// every override should repeat exactly these defaults.
virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const = 0;
};
} // namespace sherpa_onnx

View File

@@ -24,8 +24,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
model_->Punctuations(), model_->Language(),
config.model.debug) {}
GeneratedAudio Generate(const std::string &text,
int64_t sid = 0) const override {
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const override {
int32_t num_speakers = model_->NumSpeakers();
if (num_speakers == 0 && sid != 0) {
SHERPA_ONNX_LOGE(
@@ -66,7 +66,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
Ort::Value x_tensor = Ort::Value::CreateTensor(
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
Ort::Value audio = model_->Run(std::move(x_tensor), sid);
Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
std::vector<int64_t> audio_shape =
audio.GetTensorTypeAndShapeInfo().GetShape();

View File

@@ -17,7 +17,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
po->Register("vits-noise-scale-w", &noise_scale_w,
"noise_scale_w for VITS models");
po->Register("vits-length-scale", &length_scale,
"length_scale for VITS models");
"Speech speed. Larger->Slower; Smaller->faster.");
}
bool OfflineTtsVitsModelConfig::Validate() const {

View File

@@ -26,7 +26,7 @@ class OfflineTtsVitsModel::Impl {
Init(buf.data(), buf.size());
}
Ort::Value Run(Ort::Value x, int64_t sid) {
Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -48,6 +48,10 @@ class OfflineTtsVitsModel::Impl {
float length_scale = config_.vits.length_scale;
float noise_scale_w = config_.vits.noise_scale_w;
if (speed != 1 && speed > 0) {
length_scale = 1. / speed;
}
Ort::Value noise_scale_tensor =
Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1);
@@ -139,8 +143,9 @@ OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
OfflineTtsVitsModel::~OfflineTtsVitsModel() = default;
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/) {
return impl_->Run(std::move(x), sid);
// Public entry point: hands the token tensor `x`, the speaker id `sid` and the
// speed factor `speed` to the private implementation and returns whatever
// tensor it produces.
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
                                    float speed /*= 1.0*/) {
  // Ort::Value is taken by value, so ownership of `x` is moved into the impl.
  Ort::Value generated = impl_->Run(std::move(x), sid, speed);
  return generated;
}
int32_t OfflineTtsVitsModel::SampleRate() const { return impl_->SampleRate(); }

View File

@@ -29,7 +29,7 @@ class OfflineTtsVitsModel {
* @return Return a float32 tensor containing audio samples. You can flatten
* it to a 1-D tensor.
*/
Ort::Value Run(Ort::Value x, int64_t sid = 0);
// `speed` controls the speech rate. When it is positive and not equal to 1,
// the implementation uses 1 / speed as the length scale; otherwise the
// length scale from the model config is used unchanged.
Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0);
// Sample rate of the generated audio
int32_t SampleRate() const;

View File

@@ -28,9 +28,9 @@ OfflineTts::OfflineTts(const OfflineTtsConfig &config)
OfflineTts::~OfflineTts() = default;
GeneratedAudio OfflineTts::Generate(const std::string &text,
int64_t sid /*=0*/) const {
return impl_->Generate(text, sid);
// Synthesizes `text` by delegating to the backend implementation, passing the
// speaker id `sid` and the speed factor `speed` through unchanged.
GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/,
                                    float speed /*= 1.0*/) const {
  GeneratedAudio generated = impl_->Generate(text, sid, speed);
  return generated;
}
} // namespace sherpa_onnx

View File

@@ -43,7 +43,8 @@ class OfflineTts {
// trained using the VCTK dataset. It is not used for
// single-speaker models, e.g., models trained using the ljspeech
// dataset.
GeneratedAudio Generate(const std::string &text, int64_t sid = 0) const;
// @param speed Speech speed factor, passed through to the implementation.
//              NOTE(review): for the VITS backend a positive value other
//              than 1 replaces the configured length scale with 1 / speed,
//              so larger values should give faster speech -- confirm
//              against the model code.
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const;
private:
std::unique_ptr<OfflineTtsImpl> impl_;