Add MFC TTS example on Windows (#378)
This commit is contained in:
@@ -18,8 +18,8 @@ class OfflineTtsImpl {
|
||||
|
||||
static std::unique_ptr<OfflineTtsImpl> Create(const OfflineTtsConfig &config);
|
||||
|
||||
virtual GeneratedAudio Generate(const std::string &text,
|
||||
int64_t sid = 0) const = 0;
|
||||
virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
|
||||
float speed = 1.0) const = 0;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -24,8 +24,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
model_->Punctuations(), model_->Language(),
|
||||
config.model.debug) {}
|
||||
|
||||
GeneratedAudio Generate(const std::string &text,
|
||||
int64_t sid = 0) const override {
|
||||
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
|
||||
float speed = 1.0) const override {
|
||||
int32_t num_speakers = model_->NumSpeakers();
|
||||
if (num_speakers == 0 && sid != 0) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
@@ -66,7 +66,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
Ort::Value x_tensor = Ort::Value::CreateTensor(
|
||||
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
|
||||
|
||||
Ort::Value audio = model_->Run(std::move(x_tensor), sid);
|
||||
Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
|
||||
|
||||
std::vector<int64_t> audio_shape =
|
||||
audio.GetTensorTypeAndShapeInfo().GetShape();
|
||||
|
||||
@@ -17,7 +17,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
|
||||
po->Register("vits-noise-scale-w", &noise_scale_w,
|
||||
"noise_scale_w for VITS models");
|
||||
po->Register("vits-length-scale", &length_scale,
|
||||
"length_scale for VITS models");
|
||||
"Speech speed. Larger->Slower; Smaller->faster.");
|
||||
}
|
||||
|
||||
bool OfflineTtsVitsModelConfig::Validate() const {
|
||||
|
||||
@@ -26,7 +26,7 @@ class OfflineTtsVitsModel::Impl {
|
||||
Init(buf.data(), buf.size());
|
||||
}
|
||||
|
||||
Ort::Value Run(Ort::Value x, int64_t sid) {
|
||||
Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
|
||||
auto memory_info =
|
||||
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
|
||||
|
||||
@@ -48,6 +48,10 @@ class OfflineTtsVitsModel::Impl {
|
||||
float length_scale = config_.vits.length_scale;
|
||||
float noise_scale_w = config_.vits.noise_scale_w;
|
||||
|
||||
if (speed != 1 && speed > 0) {
|
||||
length_scale = 1. / speed;
|
||||
}
|
||||
|
||||
Ort::Value noise_scale_tensor =
|
||||
Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1);
|
||||
|
||||
@@ -139,8 +143,9 @@ OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
|
||||
|
||||
OfflineTtsVitsModel::~OfflineTtsVitsModel() = default;
|
||||
|
||||
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/) {
|
||||
return impl_->Run(std::move(x), sid);
|
||||
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
|
||||
float speed /*= 1.0*/) {
|
||||
return impl_->Run(std::move(x), sid, speed);
|
||||
}
|
||||
|
||||
int32_t OfflineTtsVitsModel::SampleRate() const { return impl_->SampleRate(); }
|
||||
|
||||
@@ -29,7 +29,7 @@ class OfflineTtsVitsModel {
|
||||
* @return Return a float32 tensor containing audio samples. You can flatten
|
||||
* it to a 1-D tensor.
|
||||
*/
|
||||
Ort::Value Run(Ort::Value x, int64_t sid = 0);
|
||||
Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0);
|
||||
|
||||
// Sample rate of the generated audio
|
||||
int32_t SampleRate() const;
|
||||
|
||||
@@ -28,9 +28,9 @@ OfflineTts::OfflineTts(const OfflineTtsConfig &config)
|
||||
|
||||
OfflineTts::~OfflineTts() = default;
|
||||
|
||||
GeneratedAudio OfflineTts::Generate(const std::string &text,
|
||||
int64_t sid /*=0*/) const {
|
||||
return impl_->Generate(text, sid);
|
||||
GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/,
|
||||
float speed /*= 1.0*/) const {
|
||||
return impl_->Generate(text, sid, speed);
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -43,7 +43,8 @@ class OfflineTts {
|
||||
// trained using the VCTK dataset. It is not used for
|
||||
// single-speaker models, e.g., models trained using the ljspeech
|
||||
// dataset.
|
||||
GeneratedAudio Generate(const std::string &text, int64_t sid = 0) const;
|
||||
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
|
||||
float speed = 1.0) const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<OfflineTtsImpl> impl_;
|
||||
|
||||
Reference in New Issue
Block a user