From 69f489f0cdc57b558c8e24b24c87092f14f7ea44 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 8 Feb 2025 12:47:26 +0800 Subject: [PATCH] Support scaling the duration of a pause in TTS. (#1820) --- ...s-spotter-buffered-tokens-keywords-c-api.c | 4 +- .../lib/src/sherpa_onnx_bindings.dart | 3 + flutter/sherpa_onnx/lib/src/tts.dart | 5 +- .../src/main/cpp/non-streaming-tts.cc | 1 + .../main/ets/components/NonStreamingTts.ets | 3 +- scripts/dotnet/OfflineTtsConfig.cs | 6 +- scripts/go/sherpa_onnx.go | 2 + sherpa-onnx/c-api/c-api.cc | 1 + sherpa-onnx/c-api/c-api.h | 1 + sherpa-onnx/c-api/cxx-api.cc | 1 + sherpa-onnx/c-api/cxx-api.h | 1 + sherpa-onnx/csrc/offline-tts-kokoro-impl.h | 6 ++ sherpa-onnx/csrc/offline-tts-matcha-impl.h | 6 ++ sherpa-onnx/csrc/offline-tts-vits-impl.h | 6 ++ sherpa-onnx/csrc/offline-tts.cc | 79 ++++++++++++++++++- sherpa-onnx/csrc/offline-tts.h | 15 +++- .../k2fsa/sherpa/onnx/OfflineTtsConfig.java | 12 +++ sherpa-onnx/jni/offline-tts.cc | 3 + sherpa-onnx/kotlin-api/Tts.kt | 1 + sherpa-onnx/pascal-api/sherpa_onnx.pas | 11 ++- sherpa-onnx/python/csrc/offline-tts.cc | 6 +- swift-api-examples/SherpaOnnx.swift | 6 +- wasm/tts/sherpa-onnx-tts.js | 8 +- wasm/tts/sherpa-onnx-wasm-main-tts.cc | 3 +- 24 files changed, 171 insertions(+), 19 deletions(-) diff --git a/c-api-examples/keywords-spotter-buffered-tokens-keywords-c-api.c b/c-api-examples/keywords-spotter-buffered-tokens-keywords-c-api.c index 45a0bb87..0286452f 100644 --- a/c-api-examples/keywords-spotter-buffered-tokens-keywords-c-api.c +++ b/c-api-examples/keywords-spotter-buffered-tokens-keywords-c-api.c @@ -116,7 +116,7 @@ int32_t main() { keywords_spotter_config.keywords_buf = keywords_buf; keywords_spotter_config.keywords_buf_size = keywords_buf_size; - SherpaOnnxKeywordSpotter *keywords_spotter = + const SherpaOnnxKeywordSpotter *keywords_spotter = SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config); free((void *)tokens_buf); @@ -130,7 +130,7 @@ int32_t main() { return -1; } - 
SherpaOnnxOnlineStream *stream = + const SherpaOnnxOnlineStream *stream = SherpaOnnxCreateKeywordStream(keywords_spotter); const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index c22c2a52..1a069dc5 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct { external int maxNumSenetences; external Pointer ruleFars; + + @Float() + external double silenceScale; } final class SherpaOnnxGeneratedAudio extends Struct { diff --git a/flutter/sherpa_onnx/lib/src/tts.dart b/flutter/sherpa_onnx/lib/src/tts.dart index e03126d0..a74690e7 100644 --- a/flutter/sherpa_onnx/lib/src/tts.dart +++ b/flutter/sherpa_onnx/lib/src/tts.dart @@ -114,17 +114,19 @@ class OfflineTtsConfig { this.ruleFsts = '', this.maxNumSenetences = 1, this.ruleFars = '', + this.silenceScale = 0.2, }); @override String toString() { - return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)'; + return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)'; } final OfflineTtsModelConfig model; final String ruleFsts; final int maxNumSenetences; final String ruleFars; + final double silenceScale; } class GeneratedAudio { @@ -180,6 +182,7 @@ class OfflineTts { c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); c.ref.maxNumSenetences = config.maxNumSenetences; c.ref.ruleFars = config.ruleFars.toNativeUtf8(); + c.ref.silenceScale = config.silenceScale; final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? 
nullptr; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc index b4095526..003bd3b7 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc @@ -146,6 +146,7 @@ static Napi::External CreateOfflineTtsWrapper( SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences); SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(silence_scale, silenceScale); #if __OHOS__ std::unique_ptr { return offlineTtsGenerateAsync(this.handle, input); } -} \ No newline at end of file +} diff --git a/scripts/dotnet/OfflineTtsConfig.cs b/scripts/dotnet/OfflineTtsConfig.cs index 897df0c0..9d02668d 100644 --- a/scripts/dotnet/OfflineTtsConfig.cs +++ b/scripts/dotnet/OfflineTtsConfig.cs @@ -13,6 +13,7 @@ namespace SherpaOnnx RuleFsts = ""; MaxNumSentences = 1; RuleFars = ""; + SilenceScale = 0.2F; } public OfflineTtsModelConfig Model; @@ -23,6 +24,7 @@ namespace SherpaOnnx [MarshalAs(UnmanagedType.LPStr)] public string RuleFars; - } -} \ No newline at end of file + public float SilenceScale; + } +} diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index d374f519..88777a49 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -712,6 +712,7 @@ type OfflineTtsConfig struct { RuleFsts string RuleFars string MaxNumSentences int + SilenceScale float32 } type GeneratedAudio struct { @@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { defer C.free(unsafe.Pointer(c.rule_fars)) c.max_num_sentences = C.int(config.MaxNumSentences) + c.silence_scale = C.float(config.SilenceScale) // vits c.model.vits.model = C.CString(config.Model.Vits.Model) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index b6a2c9a8..106a62c0 100644 --- 
a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); + tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2); if (tts_config.model.debug) { #if __OHOS__ diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index cabfc15c..2c53d52c 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { const char *rule_fsts; int32_t max_num_sentences; const char *rule_fars; + float silence_scale; } SherpaOnnxOfflineTtsConfig; SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index 7ce2b63a..d0bbd8ba 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { c.rule_fsts = config.rule_fsts.c_str(); c.max_num_sentences = config.max_num_sentences; + c.silence_scale = config.silence_scale; c.rule_fars = config.rule_fars.c_str(); auto p = SherpaOnnxCreateOfflineTts(&c); diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h index 66133f6d..dc87bfc3 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -363,6 +363,7 @@ struct OfflineTtsConfig { std::string rule_fsts; std::string rule_fars; int32_t max_num_sentences = 1; + float silence_scale = 0.2; }; struct GeneratedAudio { diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h index 416cadce..fe8f2331 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h +++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h @@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public 
OfflineTtsImpl { GeneratedAudio ans; ans.sample_rate = model_->GetMetaData().sample_rate; ans.samples = std::vector(p, p + total); + + float silence_scale = config_.silence_scale; + if (silence_scale != 1) { + ans = ans.ScaleSilence(silence_scale); + } + return ans; } diff --git a/sherpa-onnx/csrc/offline-tts-matcha-impl.h b/sherpa-onnx/csrc/offline-tts-matcha-impl.h index 7bd45fed..e717e64f 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-impl.h +++ b/sherpa-onnx/csrc/offline-tts-matcha-impl.h @@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { GeneratedAudio ans; ans.sample_rate = model_->GetMetaData().sample_rate; ans.samples = std::vector(p, p + total); + + float silence_scale = config_.silence_scale; + if (silence_scale != 1) { + ans = ans.ScaleSilence(silence_scale); + } + return ans; } diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 72146b02..a8de6200 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { GeneratedAudio ans; ans.sample_rate = model_->GetMetaData().sample_rate; ans.samples = std::vector(p, p + total); + + float silence_scale = config_.silence_scale; + if (silence_scale != 1) { + ans = ans.ScaleSilence(silence_scale); + } + return ans; } diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc index d858b71b..d3fa14b5 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -4,6 +4,7 @@ #include "sherpa-onnx/csrc/offline-tts.h" +#include #include #include @@ -23,6 +24,72 @@ namespace sherpa_onnx { +struct SilenceInterval { + int32_t start; + int32_t end; +}; + +GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const { + if (scale == 1) { + return *this; + } + // if the interval is larger than 0.6 second, then we assume it is a pause + int32_t threshold = static_cast(sample_rate * 0.6); + + 
std::vector intervals; + int32_t num_samples = static_cast(samples.size()); + + int32_t last = -1; + int32_t i; + for (i = 0; i != num_samples; ++i) { + if (fabs(samples[i]) <= 0.01) { + if (last == -1) { + last = i; + } + continue; + } + + if (last != -1 && i - last < threshold) { + last = -1; + continue; + } + + if (last != -1) { + intervals.push_back({last, i}); + last = -1; + } + } + + if (last != -1 && num_samples - last > threshold) { + intervals.push_back({last, num_samples}); + } + + if (intervals.empty()) { + return *this; + } + + GeneratedAudio ans; + ans.sample_rate = sample_rate; + ans.samples.reserve(samples.size()); + + i = 0; + for (const auto &interval : intervals) { + ans.samples.insert(ans.samples.end(), samples.begin() + i, + samples.begin() + interval.start); + i = interval.end; + int32_t len = interval.end - interval.start; + int32_t n = scale > 0 ? static_cast<int32_t>(len * scale) : 0; + ans.samples.insert(ans.samples.end(), samples.begin() + interval.start, samples.begin() + interval.start + (n < len ? n : len)); + if (n > len) ans.samples.insert(ans.samples.end(), n - len, 0.0f); + } + + if (i < num_samples) { + ans.samples.insert(ans.samples.end(), samples.begin() + i, samples.end()); + } + + return ans; +} + void OfflineTtsConfig::Register(ParseOptions *po) { model.Register(po); @@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) { "Maximum number of sentences that we process at a time. " "This is to avoid OOM for very long input text. " "If you set it to -1, then we process all sentences in a single batch."); + + po->Register("tts-silence-scale", &silence_scale, + "Duration of the pause is scaled by this number. 
So a smaller " + "value leads to a shorter pause."); } bool OfflineTtsConfig::Validate() const { @@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const { } } + if (silence_scale < 0.001) { + SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale); + return false; + } + return model.Validate(); } @@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const { os << "model=" << model.ToString() << ", "; os << "rule_fsts=\"" << rule_fsts << "\", "; os << "rule_fars=\"" << rule_fars << "\", "; - os << "max_num_sentences=" << max_num_sentences << ")"; + os << "max_num_sentences=" << max_num_sentences << ", "; + os << "silence_scale=" << silence_scale << ")"; return os.str(); } diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h index 884173e7..d1b0b21d 100644 --- a/sherpa-onnx/csrc/offline-tts.h +++ b/sherpa-onnx/csrc/offline-tts.h @@ -32,14 +32,20 @@ struct OfflineTtsConfig { // If you set it to -1, then we process all sentences in a single batch. int32_t max_num_sentences = 1; + // A silence interval containing audio samples with value close to 0. + // + // the duration of the new interval is old_duration * silence_scale. + float silence_scale = 0.2; + OfflineTtsConfig() = default; OfflineTtsConfig(const OfflineTtsModelConfig &model, const std::string &rule_fsts, const std::string &rule_fars, - int32_t max_num_sentences) + int32_t max_num_sentences, float silence_scale) : model(model), rule_fsts(rule_fsts), rule_fars(rule_fars), - max_num_sentences(max_num_sentences) {} + max_num_sentences(max_num_sentences), + silence_scale(silence_scale) {} void Register(ParseOptions *po); bool Validate() const; @@ -50,6 +56,11 @@ struct OfflineTtsConfig { struct GeneratedAudio { std::vector samples; int32_t sample_rate; + + // Silence means pause here. 
+ // If scale > 1, then it increases the duration of a pause + // If scale < 1, then it reduces the duration of a pause + GeneratedAudio ScaleSilence(float scale) const; }; class OfflineTtsImpl; diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsConfig.java index 738ffadb..53d9ee87 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsConfig.java @@ -7,12 +7,14 @@ public class OfflineTtsConfig { private final String ruleFsts; private final String ruleFars; private final int maxNumSentences; + private final float silenceScale; private OfflineTtsConfig(Builder builder) { this.model = builder.model; this.ruleFsts = builder.ruleFsts; this.ruleFars = builder.ruleFars; this.maxNumSentences = builder.maxNumSentences; + this.silenceScale = builder.silenceScale; } public static Builder builder() { @@ -35,11 +37,16 @@ public class OfflineTtsConfig { return maxNumSentences; } + public float getSilenceScale() { + return silenceScale; + } + public static class Builder { private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build(); private String ruleFsts = ""; private String ruleFars = ""; private int maxNumSentences = 1; + private float silenceScale = 0.2f; public OfflineTtsConfig build() { return new OfflineTtsConfig(this); @@ -64,5 +71,10 @@ public class OfflineTtsConfig { this.maxNumSentences = maxNumSentences; return this; } + + public Builder setSilenceScale(float silenceScale) { + this.silenceScale = silenceScale; + return this; + } } } diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index 8d8f9029..14d8cc4f 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { fid = env->GetFieldID(cls, "maxNumSentences", "I"); 
ans.max_num_sentences = env->GetIntField(config, fid); + fid = env->GetFieldID(cls, "silenceScale", "F"); + ans.silence_scale = env->GetFloatField(config, fid); + return ans; } diff --git a/sherpa-onnx/kotlin-api/Tts.kt b/sherpa-onnx/kotlin-api/Tts.kt index c6861065..b4e07984 100644 --- a/sherpa-onnx/kotlin-api/Tts.kt +++ b/sherpa-onnx/kotlin-api/Tts.kt @@ -49,6 +49,7 @@ data class OfflineTtsConfig( var ruleFsts: String = "", var ruleFars: String = "", var maxNumSentences: Int = 1, + var silenceScale: Float = 0.2f, ) class GeneratedAudio( diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 9163a63c..3d57bf18 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -106,6 +106,7 @@ type RuleFsts: AnsiString; MaxNumSentences: Integer; RuleFars: AnsiString; + SilenceScale: Single; function ToString: AnsiString; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); @@ -777,6 +778,7 @@ type RuleFsts: PAnsiChar; MaxNumSentences: cint32; RuleFars: PAnsiChar; + SilenceScale: cfloat; end; PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; @@ -1976,15 +1978,17 @@ begin 'Model := %s, ' + 'RuleFsts := %s, ' + 'MaxNumSentences := %d, ' + - 'RuleFars := %s' + + 'RuleFars := %s, ' + + 'SilenceScale := %f' + ')', - [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars - ]); + [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars, + Self.SilenceScale]); end; class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); begin Dest.MaxNumSentences := 1; + Dest.SilenceScale := 0.2; end; constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); @@ -2027,6 +2031,7 @@ begin C.RuleFsts := PAnsiChar(Config.RuleFsts); C.MaxNumSentences := Config.MaxNumSentences; C.RuleFars := PAnsiChar(Config.RuleFars); + C.SilenceScale := 
Config.SilenceScale; Self.Handle := SherpaOnnxCreateOfflineTts(@C); diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index 52a51b9d..e811bc4b 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) { py::class_(*m, "OfflineTtsConfig") .def(py::init<>()) .def(py::init(), + const std::string &, int32_t, float>(), py::arg("model"), py::arg("rule_fsts") = "", - py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2) + py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2, + py::arg("silence_scale") = 0.2) .def_readwrite("model", &PyClass::model) .def_readwrite("rule_fsts", &PyClass::rule_fsts) .def_readwrite("rule_fars", &PyClass::rule_fars) .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) + .def_readwrite("silence_scale", &PyClass::silence_scale) .def("validate", &PyClass::Validate) .def("__str__", &PyClass::ToString); } diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index cb64218b..b4efaa1f 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig( model: SherpaOnnxOfflineTtsModelConfig, ruleFsts: String = "", ruleFars: String = "", - maxNumSentences: Int = 1 + maxNumSentences: Int = 1, + silenceScale: Float = 0.2 ) -> SherpaOnnxOfflineTtsConfig { return SherpaOnnxOfflineTtsConfig( model: model, rule_fsts: toCPointer(ruleFsts), max_num_sentences: Int32(maxNumSentences), - rule_fars: toCPointer(ruleFars) + rule_fars: toCPointer(ruleFars), + silence_scale: silenceScale ) } diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index 5c09050c..4716d1fd 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -21,7 +21,7 @@ function freeConfig(config, Module) { // The user should free the returned pointers function 
initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { - const modelLen = Module.lengthBytesUTF8(config.model || '')+ 1; + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; @@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { function initSherpaOnnxOfflineTtsConfig(config, Module) { const modelConfig = initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); - const len = modelConfig.len + 3 * 4; + const len = modelConfig.len + 4 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { offset += 4; Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); + offset += 4; + + Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float'); + offset += 4; return { buffer: buffer, ptr: ptr, len: len, config: modelConfig, diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 07bf4d42..44c45842 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == - sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, + sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4, ""); void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); + fprintf(stdout, "silence scale: %.3f\n", tts_config->silence_scale); } void CopyHeap(const 
char *src, int32_t num_bytes, char *dst) {