Support scaling the duration of a pause in TTS. (#1820)

2025-02-08 12:47:26 +08:00
parent d38cb81014
commit 69f489f0cd
24 changed files with 171 additions and 19 deletions
--- a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
    GeneratedAudio ans;
    ans.sample_rate = model_->GetMetaData().sample_rate;
    ans.samples = std::vector<float>(p, p + total);
+
+    float silence_scale = config_.silence_scale;
+    if (silence_scale != 1) {
+      ans = ans.ScaleSilence(silence_scale);
+    }
+
    return ans;
  }

--- a/sherpa-onnx/csrc/offline-tts-matcha-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-matcha-impl.h
@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
    GeneratedAudio ans;
    ans.sample_rate = model_->GetMetaData().sample_rate;
    ans.samples = std::vector<float>(p, p + total);
+
+    float silence_scale = config_.silence_scale;
+    if (silence_scale != 1) {
+      ans = ans.ScaleSilence(silence_scale);
+    }
+
    return ans;
  }

--- a/sherpa-onnx/csrc/offline-tts-vits-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h
@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
    GeneratedAudio ans;
    ans.sample_rate = model_->GetMetaData().sample_rate;
    ans.samples = std::vector<float>(p, p + total);
+
+    float silence_scale = config_.silence_scale;
+    if (silence_scale != 1) {
+      ans = ans.ScaleSilence(silence_scale);
+    }
+
    return ans;
  }

--- a/sherpa-onnx/csrc/offline-tts.cc
+++ b/sherpa-onnx/csrc/offline-tts.cc
@@ -4,6 +4,7 @@

 #include "sherpa-onnx/csrc/offline-tts.h"

+#include <cmath>
 #include <string>
 #include <utility>

@@ -23,6 +24,72 @@

 namespace sherpa_onnx {

+struct SilenceInterval {
+  int32_t start;  // index of the first sample of the silent run
+  int32_t end;    // index one past the last sample of the silent run
+};
+
+GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
+  // Returns a copy of this audio in which every pause is `scale` times as
+  // long. A pause is a run of samples with |value| <= 0.01 that lasts at
+  // least 0.6 second. Pauses are shortened by truncation and lengthened by
+  // appending zeros, so speech after a pause is never duplicated and no
+  // sample is read past the end of the input.
+  if (scale == 1) {
+    return *this;
+  }
+  // if the interval is at least 0.6 second, then we assume it is a pause
+  int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
+
+  std::vector<SilenceInterval> intervals;
+  int32_t num_samples = static_cast<int32_t>(samples.size());
+
+  int32_t last = -1;  // start of the current silent run, or -1 if none
+  for (int32_t i = 0; i != num_samples; ++i) {
+    if (std::fabs(samples[i]) <= 0.01f) {
+      if (last == -1) {
+        last = i;
+      }
+      continue;
+    }
+    // audible sample: close the current run, keep it if it is long enough
+    if (last != -1 && i - last >= threshold) {
+      intervals.push_back({last, i});
+    }
+    last = -1;
+  }
+  // a run that reaches the end of the audio uses the same length test
+  if (last != -1 && num_samples - last >= threshold) {
+    intervals.push_back({last, num_samples});
+  }
+
+  if (intervals.empty()) {
+    return *this;
+  }
+
+  GeneratedAudio ans;
+  ans.sample_rate = sample_rate;
+  ans.samples.reserve(samples.size());
+
+  int32_t pos = 0;  // first input sample not yet copied into ans
+  for (const auto &interval : intervals) {
+    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
+                       samples.begin() + interval.start);
+    pos = interval.end;
+    int32_t len = interval.end - interval.start;
+    // a non-positive (or NaN) scale removes the pause entirely
+    int32_t n = scale > 0 ? static_cast<int32_t>(len * scale) : 0;
+    int32_t kept = n < len ? n : len;
+    ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
+                       samples.begin() + interval.start + kept);
+    // resize() value-initializes, so a lengthened pause is padded with 0
+    ans.samples.resize(ans.samples.size() + (n - kept));
+  }
+
+  ans.samples.insert(ans.samples.end(), samples.begin() + pos, samples.end());
+  return ans;
+}
+
 void OfflineTtsConfig::Register(ParseOptions *po) {
  model.Register(po);

@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
      "Maximum number of sentences that we process at a time. "
      "This is to avoid OOM for very long input text. "
      "If you set it to -1, then we process all sentences in a single batch.");
+
+  po->Register("tts-silence-scale", &silence_scale,
+               "Duration of the pause is scaled by this number. So a smaller "
+               "value leads to a shorter pause.");
 }

 bool OfflineTtsConfig::Validate() const {
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
    }
  }

+  if (silence_scale < 0.001) {
+    SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
+    return false;
+  }
+
  return model.Validate();
 }

@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
  os << "model=" << model.ToString() << ", ";
  os << "rule_fsts=\"" << rule_fsts << "\", ";
  os << "rule_fars=\"" << rule_fars << "\", ";
-  os << "max_num_sentences=" << max_num_sentences << ")";
+  os << "max_num_sentences=" << max_num_sentences << ", ";
+  os << "silence_scale=" << silence_scale << ")";

  return os.str();
 }
--- a/sherpa-onnx/csrc/offline-tts.h
+++ b/sherpa-onnx/csrc/offline-tts.h
@@ -32,14 +32,20 @@ struct OfflineTtsConfig {
  // If you set it to -1, then we process all sentences in a single batch.
  int32_t max_num_sentences = 1;

+  // A silence interval is a run of audio samples with value close to 0.
+  //
+  // The duration of the new interval is old_duration * silence_scale.
+  float silence_scale = 0.2;
+
  OfflineTtsConfig() = default;
  OfflineTtsConfig(const OfflineTtsModelConfig &model,
                   const std::string &rule_fsts, const std::string &rule_fars,
-                   int32_t max_num_sentences)
+                   int32_t max_num_sentences, float silence_scale)
      : model(model),
        rule_fsts(rule_fsts),
        rule_fars(rule_fars),
-        max_num_sentences(max_num_sentences) {}
+        max_num_sentences(max_num_sentences),
+        silence_scale(silence_scale) {}

  void Register(ParseOptions *po);
  bool Validate() const;
@@ -50,6 +56,11 @@ struct OfflineTtsConfig {
 struct GeneratedAudio {
  std::vector<float> samples;
  int32_t sample_rate;
+
+  // Returns a copy of this audio with every pause rescaled. A pause is a
+  // long run of samples whose value is close to 0. scale > 1 lengthens
+  // each pause, scale < 1 shortens it, and scale == 1 changes nothing.
+  GeneratedAudio ScaleSilence(float scale) const;
 };

 class OfflineTtsImpl;