Support scaling the duration of a pause in TTS. (#1820)

This commit is contained in:
Fangjun Kuang
2025-02-08 12:47:26 +08:00
committed by GitHub
parent d38cb81014
commit 69f489f0cd
24 changed files with 171 additions and 19 deletions

View File

@@ -4,6 +4,7 @@
#include "sherpa-onnx/csrc/offline-tts.h"
#include <cmath>
#include <string>
#include <utility>
@@ -23,6 +24,72 @@
namespace sherpa_onnx {
// A run of consecutive samples that is treated as silence, expressed as a
// half-open range [start, end) of indices into the sample buffer.
struct SilenceInterval {
// Index of the first sample of the run (inclusive).
int32_t start;
// Index one past the last sample of the run (exclusive).
int32_t end;
};
// Returns a copy of this audio in which the duration of every detected pause
// is multiplied by `scale`.
//
// A pause is a run of consecutive samples whose absolute value is <= 0.01 and
// whose length is at least 0.6 seconds worth of samples (strictly more than
// 0.6 seconds for a run that extends to the end of the audio). Samples that
// are not part of a pause are copied unchanged.
//
// @param scale Factor applied to the length of each pause.
//              - scale < 1: only the leading part of each pause is kept.
//              - scale > 1: each pause is lengthened by repeating its own
//                samples until the scaled length is reached.
//              - scale == 1, negative, NaN or infinite: nothing is scaled.
// @return The rescaled audio, or an unchanged copy of *this if `scale` is one
//         of the no-op values above or no pause was found.
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
  // A negative scale would produce a reversed iterator range below, and
  // converting NaN/inf to an integer is undefined, so treat them as no-ops.
  if (scale == 1 || !std::isfinite(scale) || scale < 0) {
    return *this;
  }

  // Minimum number of consecutive quiet samples that counts as a pause.
  const int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
  const int32_t num_samples = static_cast<int32_t>(samples.size());

  // Pass 1: collect all pauses as half-open index ranges.
  std::vector<SilenceInterval> intervals;
  int32_t last = -1;  // start of the current quiet run, or -1 if none
  for (int32_t i = 0; i != num_samples; ++i) {
    if (std::fabs(samples[i]) <= 0.01) {
      if (last == -1) {
        last = i;
      }
      continue;
    }

    // A non-quiet sample ends the current run; keep it only if long enough.
    if (last != -1) {
      if (i - last >= threshold) {
        intervals.push_back({last, i});
      }
      last = -1;
    }
  }

  // A quiet run that reaches the end of the audio.
  if (last != -1 && num_samples - last > threshold) {
    intervals.push_back({last, num_samples});
  }

  if (intervals.empty()) {
    return *this;
  }

  // Pass 2: rebuild the audio, copying non-pause samples verbatim and
  // emitting a rescaled version of each pause.
  GeneratedAudio ans;
  ans.sample_rate = sample_rate;
  ans.samples.reserve(samples.size());

  int32_t pos = 0;  // first input sample not yet copied to the output
  for (const auto &interval : intervals) {
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.begin() + interval.start);
    pos = interval.end;

    // len >= 1 always holds: an interval contains at least one quiet sample.
    const int32_t len = interval.end - interval.start;
    int32_t remaining = static_cast<int32_t>(len * scale);

    // Never read beyond the pause itself. When the scaled length exceeds the
    // original length, repeat the pause's samples instead of running into the
    // following speech (or past the end of the buffer).
    while (remaining > 0) {
      const int32_t chunk = remaining < len ? remaining : len;
      ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
                         samples.begin() + interval.start + chunk);
      remaining -= chunk;
    }
  }

  // Whatever follows the last pause.
  if (pos < num_samples) {
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.end());
  }

  return ans;
}
void OfflineTtsConfig::Register(ParseOptions *po) {
model.Register(po);
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
"Maximum number of sentences that we process at a time. "
"This is to avoid OOM for very long input text. "
"If you set it to -1, then we process all sentences in a single batch.");
po->Register("tts-silence-scale", &silence_scale,
"Duration of the pause is scaled by this number. So a smaller "
"value leads to a shorter pause.");
}
bool OfflineTtsConfig::Validate() const {
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
}
}
if (silence_scale < 0.001) {
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
return false;
}
return model.Validate();
}
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
os << "model=" << model.ToString() << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\", ";
os << "max_num_sentences=" << max_num_sentences << ")";
os << "max_num_sentences=" << max_num_sentences << ", ";
os << "silence_scale=" << silence_scale << ")";
return os.str();
}