Support scaling the duration of a pause in TTS. (#1820)
This commit is contained in:
@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
GeneratedAudio ans;
|
||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||
ans.samples = std::vector<float>(p, p + total);
|
||||
|
||||
float silence_scale = config_.silence_scale;
|
||||
if (silence_scale != 1) {
|
||||
ans = ans.ScaleSilence(silence_scale);
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
|
||||
@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
|
||||
GeneratedAudio ans;
|
||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||
ans.samples = std::vector<float>(p, p + total);
|
||||
|
||||
float silence_scale = config_.silence_scale;
|
||||
if (silence_scale != 1) {
|
||||
ans = ans.ScaleSilence(silence_scale);
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
|
||||
@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
GeneratedAudio ans;
|
||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||
ans.samples = std::vector<float>(p, p + total);
|
||||
|
||||
float silence_scale = config_.silence_scale;
|
||||
if (silence_scale != 1) {
|
||||
ans = ans.ScaleSilence(silence_scale);
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
@@ -23,6 +24,72 @@
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
struct SilenceInterval {
|
||||
int32_t start;
|
||||
int32_t end;
|
||||
};
|
||||
|
||||
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
|
||||
if (scale == 1) {
|
||||
return *this;
|
||||
}
|
||||
// if the interval is larger than 0.6 second, then we assume it is a pause
|
||||
int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
|
||||
|
||||
std::vector<SilenceInterval> intervals;
|
||||
int32_t num_samples = static_cast<int32_t>(samples.size());
|
||||
|
||||
int32_t last = -1;
|
||||
int32_t i;
|
||||
for (i = 0; i != num_samples; ++i) {
|
||||
if (fabs(samples[i]) <= 0.01) {
|
||||
if (last == -1) {
|
||||
last = i;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (last != -1 && i - last < threshold) {
|
||||
last = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (last != -1) {
|
||||
intervals.push_back({last, i});
|
||||
last = -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (last != -1 && num_samples - last > threshold) {
|
||||
intervals.push_back({last, num_samples});
|
||||
}
|
||||
|
||||
if (intervals.empty()) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
GeneratedAudio ans;
|
||||
ans.sample_rate = sample_rate;
|
||||
ans.samples.reserve(samples.size());
|
||||
|
||||
i = 0;
|
||||
for (const auto &interval : intervals) {
|
||||
ans.samples.insert(ans.samples.end(), samples.begin() + i,
|
||||
samples.begin() + interval.start);
|
||||
i = interval.end;
|
||||
int32_t n = static_cast<int32_t>((interval.end - interval.start) * scale);
|
||||
|
||||
ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
|
||||
samples.begin() + interval.start + n);
|
||||
}
|
||||
|
||||
if (i < num_samples) {
|
||||
ans.samples.insert(ans.samples.end(), samples.begin() + i, samples.end());
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
void OfflineTtsConfig::Register(ParseOptions *po) {
|
||||
model.Register(po);
|
||||
|
||||
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
|
||||
"Maximum number of sentences that we process at a time. "
|
||||
"This is to avoid OOM for very long input text. "
|
||||
"If you set it to -1, then we process all sentences in a single batch.");
|
||||
|
||||
po->Register("tts-silence-scale", &silence_scale,
|
||||
"Duration of the pause is scaled by this number. So a smaller "
|
||||
"value leads to a shorter pause.");
|
||||
}
|
||||
|
||||
bool OfflineTtsConfig::Validate() const {
|
||||
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
|
||||
}
|
||||
}
|
||||
|
||||
if (silence_scale < 0.001) {
|
||||
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
|
||||
return false;
|
||||
}
|
||||
|
||||
return model.Validate();
|
||||
}
|
||||
|
||||
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
|
||||
os << "model=" << model.ToString() << ", ";
|
||||
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
||||
os << "rule_fars=\"" << rule_fars << "\", ";
|
||||
os << "max_num_sentences=" << max_num_sentences << ")";
|
||||
os << "max_num_sentences=" << max_num_sentences << ", ";
|
||||
os << "silence_scale=" << silence_scale << ")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -32,14 +32,20 @@ struct OfflineTtsConfig {
|
||||
// If you set it to -1, then we process all sentences in a single batch.
|
||||
int32_t max_num_sentences = 1;
|
||||
|
||||
// A silence interval is a run of consecutive audio samples whose values are close to 0.
|
||||
//
|
||||
// the duration of the new interval is old_duration * silence_scale.
|
||||
float silence_scale = 0.2;
|
||||
|
||||
OfflineTtsConfig() = default;
|
||||
OfflineTtsConfig(const OfflineTtsModelConfig &model,
|
||||
const std::string &rule_fsts, const std::string &rule_fars,
|
||||
int32_t max_num_sentences)
|
||||
int32_t max_num_sentences, float silence_scale)
|
||||
: model(model),
|
||||
rule_fsts(rule_fsts),
|
||||
rule_fars(rule_fars),
|
||||
max_num_sentences(max_num_sentences) {}
|
||||
max_num_sentences(max_num_sentences),
|
||||
silence_scale(silence_scale) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
@@ -50,6 +56,11 @@ struct OfflineTtsConfig {
|
||||
// Audio returned by a TTS model.
struct GeneratedAudio {
  // The audio samples.
  std::vector<float> samples;

  // Number of samples per second. It is zero-initialized so that a
  // default-constructed object never exposes an indeterminate value,
  // e.g., when it is copied before the field has been assigned.
  int32_t sample_rate = 0;

  // Return a copy of this audio in which the duration of each pause is
  // multiplied by scale.
  //
  // Silence means pause here.
  // If scale > 1, then it increases the duration of a pause
  // If scale < 1, then it reduces the duration of a pause
  // If scale == 1, then the audio is returned unchanged
  GeneratedAudio ScaleSilence(float scale) const;
};
|
||||
|
||||
class OfflineTtsImpl;
|
||||
|
||||
Reference in New Issue
Block a user