Support scaling the duration of a pause in TTS. (#1820)

This commit is contained in:
Fangjun Kuang
2025-02-08 12:47:26 +08:00
committed by GitHub
parent d38cb81014
commit 69f489f0cd
24 changed files with 171 additions and 19 deletions

View File

@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans;
}

View File

@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans;
}

View File

@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans;
}

View File

@@ -4,6 +4,7 @@
#include "sherpa-onnx/csrc/offline-tts.h"
#include <cmath>
#include <string>
#include <utility>
@@ -23,6 +24,72 @@
namespace sherpa_onnx {
// A run of near-silent samples, stored as the half-open index range
// [start, end) into GeneratedAudio::samples.
struct SilenceInterval {
// Index of the first sample of the run.
int32_t start;
// Index one past the last sample of the run.
int32_t end;
};
// Returns a copy of this audio in which every detected pause is resized to
// `scale` times its original length.
//
// A pause is a run of consecutive samples with |sample| <= 0.01 that is at
// least 0.6 seconds long (strictly longer than 0.6 seconds when the run
// reaches the end of the audio).
//
//  - scale == 1: the audio is returned unchanged.
//  - scale < 1:  only the leading part of each pause is kept.
//  - scale > 1:  each pause is kept in full and then extended with zeros.
//
// @param scale Factor applied to the duration of each pause. Values <= 0
//              remove the pauses completely.
// @return The rescaled audio; it has the same sample rate as this one.
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
  if (scale == 1) {
    return *this;
  }

  // if the interval is larger than 0.6 second, then we assume it is a pause
  int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);

  std::vector<SilenceInterval> intervals;
  int32_t num_samples = static_cast<int32_t>(samples.size());

  // Start index of the silent run we are currently inside, or -1 if the
  // previous sample was not silent.
  int32_t last = -1;
  for (int32_t i = 0; i != num_samples; ++i) {
    if (std::fabs(samples[i]) <= 0.01) {
      if (last == -1) {
        last = i;
      }
      continue;
    }

    // samples[i] is audible, so a silent run (if any) ends right before i.
    if (last == -1) {
      continue;
    }

    // Runs shorter than the threshold are ordinary gaps, not pauses.
    if (i - last >= threshold) {
      intervals.push_back({last, i});
    }
    last = -1;
  }

  // A silent run may extend to the very end of the audio.
  // NOTE: this check is strict (>) while the one inside the loop accepts a
  // run of exactly `threshold` samples; kept as is to preserve behavior.
  if (last != -1 && num_samples - last > threshold) {
    intervals.push_back({last, num_samples});
  }

  if (intervals.empty()) {
    return *this;
  }

  GeneratedAudio ans;
  ans.sample_rate = sample_rate;
  ans.samples.reserve(samples.size());

  // Index of the first input sample that has not been handled yet.
  int32_t pos = 0;
  for (const auto &interval : intervals) {
    // Audible part preceding this pause is copied verbatim.
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.begin() + interval.start);
    pos = interval.end;

    int32_t len = interval.end - interval.start;
    int32_t n = static_cast<int32_t>(len * scale);
    if (n < 0) {
      // A negative scale would otherwise produce a reversed iterator range.
      n = 0;
    }

    if (n <= len) {
      // Shorten the pause: keep only its first n samples.
      ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
                         samples.begin() + interval.start + n);
    } else {
      // Lengthen the pause: keep all of it and pad with silence. Reading n
      // samples from the input here would copy the audio that follows the
      // pause and could run past the end of `samples`.
      ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
                         samples.begin() + interval.end);
      ans.samples.insert(ans.samples.end(),
                         static_cast<std::vector<float>::size_type>(n - len),
                         0.0f);
    }
  }

  // Audible part after the last pause.
  if (pos < num_samples) {
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.end());
  }

  return ans;
}
void OfflineTtsConfig::Register(ParseOptions *po) {
model.Register(po);
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
"Maximum number of sentences that we process at a time. "
"This is to avoid OOM for very long input text. "
"If you set it to -1, then we process all sentences in a single batch.");
po->Register("tts-silence-scale", &silence_scale,
"Duration of the pause is scaled by this number. So a smaller "
"value leads to a shorter pause.");
}
bool OfflineTtsConfig::Validate() const {
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
}
}
if (silence_scale < 0.001) {
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
return false;
}
return model.Validate();
}
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
os << "model=" << model.ToString() << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\", ";
os << "max_num_sentences=" << max_num_sentences << ")";
os << "max_num_sentences=" << max_num_sentences << ", ";
os << "silence_scale=" << silence_scale << ")";
return os.str();
}

View File

@@ -32,14 +32,20 @@ struct OfflineTtsConfig {
// If you set it to -1, then we process all sentences in a single batch.
int32_t max_num_sentences = 1;
// A silence interval is a run of audio samples whose values are close to 0.
//
// The duration of each silence interval in the generated audio is changed
// to old_duration * silence_scale.
float silence_scale = 0.2;
OfflineTtsConfig() = default;
OfflineTtsConfig(const OfflineTtsModelConfig &model,
const std::string &rule_fsts, const std::string &rule_fars,
int32_t max_num_sentences)
int32_t max_num_sentences, float silence_scale)
: model(model),
rule_fsts(rule_fsts),
rule_fars(rule_fars),
max_num_sentences(max_num_sentences) {}
max_num_sentences(max_num_sentences),
silence_scale(silence_scale) {}
void Register(ParseOptions *po);
bool Validate() const;
@@ -50,6 +56,11 @@ struct OfflineTtsConfig {
// Audio samples together with their sample rate.
struct GeneratedAudio {
// Sample values of the audio.
std::vector<float> samples;
// Number of samples per second.
int32_t sample_rate;
// Returns a copy of this audio with the duration of its pauses scaled.
// Silence means pause here.
// If scale > 1, then it increases the duration of a pause.
// If scale < 1, then it reduces the duration of a pause.
// If scale == 1, the audio is returned unchanged.
GeneratedAudio ScaleSilence(float scale) const;
};
class OfflineTtsImpl;