Support scaling the duration of a pause in TTS. (#1820)

This commit is contained in:
Fangjun Kuang
2025-02-08 12:47:26 +08:00
committed by GitHub
parent d38cb81014
commit 69f489f0cd
24 changed files with 171 additions and 19 deletions

View File

@@ -116,7 +116,7 @@ int32_t main() {
keywords_spotter_config.keywords_buf = keywords_buf; keywords_spotter_config.keywords_buf = keywords_buf;
keywords_spotter_config.keywords_buf_size = keywords_buf_size; keywords_spotter_config.keywords_buf_size = keywords_buf_size;
SherpaOnnxKeywordSpotter *keywords_spotter = const SherpaOnnxKeywordSpotter *keywords_spotter =
SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config); SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config);
free((void *)tokens_buf); free((void *)tokens_buf);
@@ -130,7 +130,7 @@ int32_t main() {
return -1; return -1;
} }
SherpaOnnxOnlineStream *stream = const SherpaOnnxOnlineStream *stream =
SherpaOnnxCreateKeywordStream(keywords_spotter); SherpaOnnxCreateKeywordStream(keywords_spotter);
const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);

View File

@@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct {
external int maxNumSenetences; external int maxNumSenetences;
external Pointer<Utf8> ruleFars; external Pointer<Utf8> ruleFars;
@Float()
external double silenceScale;
} }
final class SherpaOnnxGeneratedAudio extends Struct { final class SherpaOnnxGeneratedAudio extends Struct {

View File

@@ -114,17 +114,19 @@ class OfflineTtsConfig {
this.ruleFsts = '', this.ruleFsts = '',
this.maxNumSenetences = 1, this.maxNumSenetences = 1,
this.ruleFars = '', this.ruleFars = '',
this.silenceScale = 0.2,
}); });
@override @override
String toString() { String toString() {
return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)'; return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)';
} }
final OfflineTtsModelConfig model; final OfflineTtsModelConfig model;
final String ruleFsts; final String ruleFsts;
final int maxNumSenetences; final int maxNumSenetences;
final String ruleFars; final String ruleFars;
final double silenceScale;
} }
class GeneratedAudio { class GeneratedAudio {
@@ -180,6 +182,7 @@ class OfflineTts {
c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); c.ref.ruleFsts = config.ruleFsts.toNativeUtf8();
c.ref.maxNumSenetences = config.maxNumSenetences; c.ref.maxNumSenetences = config.maxNumSenetences;
c.ref.ruleFars = config.ruleFars.toNativeUtf8(); c.ref.ruleFars = config.ruleFars.toNativeUtf8();
c.ref.silenceScale = config.silenceScale;
final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr; final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;

View File

@@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences); SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences);
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
// silence_scale is a float field of SherpaOnnxOfflineTtsConfig, so it must be
// read from the JS object as a number, not as a string.
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(silence_scale, silenceScale);
#if __OHOS__ #if __OHOS__
std::unique_ptr<NativeResourceManager, std::unique_ptr<NativeResourceManager,

View File

@@ -52,6 +52,7 @@ export class OfflineTtsConfig {
public ruleFsts: string = ''; public ruleFsts: string = '';
public ruleFars: string = ''; public ruleFars: string = '';
public maxNumSentences: number = 1; public maxNumSentences: number = 1;
public silenceScale: number = 0.2;
} }
export class TtsOutput { export class TtsOutput {
@@ -98,4 +99,4 @@ export class OfflineTts {
generateAsync(input: TtsInput): Promise<TtsOutput> { generateAsync(input: TtsInput): Promise<TtsOutput> {
return offlineTtsGenerateAsync(this.handle, input); return offlineTtsGenerateAsync(this.handle, input);
} }
} }

View File

@@ -13,6 +13,7 @@ namespace SherpaOnnx
RuleFsts = ""; RuleFsts = "";
MaxNumSentences = 1; MaxNumSentences = 1;
RuleFars = ""; RuleFars = "";
SilenceScale = 0.2F;
} }
public OfflineTtsModelConfig Model; public OfflineTtsModelConfig Model;
@@ -23,6 +24,7 @@ namespace SherpaOnnx
[MarshalAs(UnmanagedType.LPStr)] [MarshalAs(UnmanagedType.LPStr)]
public string RuleFars; public string RuleFars;
}
} public float SilenceScale;
}
}

View File

@@ -712,6 +712,7 @@ type OfflineTtsConfig struct {
RuleFsts string RuleFsts string
RuleFars string RuleFars string
MaxNumSentences int MaxNumSentences int
SilenceScale float32
} }
type GeneratedAudio struct { type GeneratedAudio struct {
@@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
defer C.free(unsafe.Pointer(c.rule_fars)) defer C.free(unsafe.Pointer(c.rule_fars))
c.max_num_sentences = C.int(config.MaxNumSentences) c.max_num_sentences = C.int(config.MaxNumSentences)
c.silence_scale = C.float(config.SilenceScale)
// vits // vits
c.model.vits.model = C.CString(config.Model.Vits.Model) c.model.vits.model = C.CString(config.Model.Vits.Model)

View File

@@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2);
if (tts_config.model.debug) { if (tts_config.model.debug) {
#if __OHOS__ #if __OHOS__

View File

@@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
const char *rule_fsts; const char *rule_fsts;
int32_t max_num_sentences; int32_t max_num_sentences;
const char *rule_fars; const char *rule_fars;
float silence_scale;
} SherpaOnnxOfflineTtsConfig; } SherpaOnnxOfflineTtsConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {

View File

@@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
c.rule_fsts = config.rule_fsts.c_str(); c.rule_fsts = config.rule_fsts.c_str();
c.max_num_sentences = config.max_num_sentences; c.max_num_sentences = config.max_num_sentences;
c.silence_scale = config.silence_scale;
c.rule_fars = config.rule_fars.c_str(); c.rule_fars = config.rule_fars.c_str();
auto p = SherpaOnnxCreateOfflineTts(&c); auto p = SherpaOnnxCreateOfflineTts(&c);

View File

@@ -363,6 +363,7 @@ struct OfflineTtsConfig {
std::string rule_fsts; std::string rule_fsts;
std::string rule_fars; std::string rule_fars;
int32_t max_num_sentences = 1; int32_t max_num_sentences = 1;
float silence_scale = 0.2;
}; };
struct GeneratedAudio { struct GeneratedAudio {

View File

@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
GeneratedAudio ans; GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate; ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total); ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans; return ans;
} }

View File

@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
GeneratedAudio ans; GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate; ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total); ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans; return ans;
} }

View File

@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
GeneratedAudio ans; GeneratedAudio ans;
ans.sample_rate = model_->GetMetaData().sample_rate; ans.sample_rate = model_->GetMetaData().sample_rate;
ans.samples = std::vector<float>(p, p + total); ans.samples = std::vector<float>(p, p + total);
float silence_scale = config_.silence_scale;
if (silence_scale != 1) {
ans = ans.ScaleSilence(silence_scale);
}
return ans; return ans;
} }

View File

@@ -4,6 +4,7 @@
#include "sherpa-onnx/csrc/offline-tts.h" #include "sherpa-onnx/csrc/offline-tts.h"
#include <cmath>
#include <string> #include <string>
#include <utility> #include <utility>
@@ -23,6 +24,72 @@
namespace sherpa_onnx { namespace sherpa_onnx {
// A half-open range [start, end) of sample indices describing one detected
// pause inside GeneratedAudio::samples.
struct SilenceInterval {
// Index of the first near-zero sample of the pause.
int32_t start;
// Index one past the last near-zero sample of the pause, i.e. the index of
// the first non-silent sample after it, or the total number of samples when
// the pause runs to the end of the audio.
int32_t end;
};
// Return a copy of this audio in which the duration of every pause is
// multiplied by `scale`.
//
// A pause is a run of consecutive samples whose absolute value is <= 0.01
// and that lasts for at least 0.6 seconds. Shorter quiet runs are left
// untouched.
//
// @param scale  Factor applied to the length of each pause.
//               - scale == 1: the audio is returned unchanged.
//               - scale <  1: only the leading part of each pause is kept.
//               - scale >  1: the whole pause is kept and then padded with
//                 zero-valued samples up to the scaled length.
//               - scale <= 0: each pause is removed completely.
// @return A new GeneratedAudio with the same sample rate. If no pause is
//         found, a copy of this object is returned.
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
  if (scale == 1) {
    return *this;
  }

  // A quiet run must last at least 0.6 second to be treated as a pause.
  int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);

  std::vector<SilenceInterval> intervals;
  int32_t num_samples = static_cast<int32_t>(samples.size());

  // Start index of the quiet run currently being scanned, or -1 if the
  // previous sample was not quiet.
  int32_t last = -1;
  for (int32_t i = 0; i != num_samples; ++i) {
    if (std::fabs(samples[i]) <= 0.01) {
      if (last == -1) {
        last = i;
      }
      continue;
    }

    // A non-quiet sample ends the current run; keep it only if long enough.
    if (last != -1 && i - last >= threshold) {
      intervals.push_back({last, i});
    }
    last = -1;
  }

  // The audio may end inside a quiet run. Use the same ">=" criterion as in
  // the loop above so that a trailing pause is treated like any other pause.
  if (last != -1 && num_samples - last >= threshold) {
    intervals.push_back({last, num_samples});
  }

  if (intervals.empty()) {
    return *this;
  }

  GeneratedAudio ans;
  ans.sample_rate = sample_rate;
  ans.samples.reserve(samples.size());

  // Index of the first sample that has not been copied to `ans` yet.
  int32_t pos = 0;
  for (const auto &interval : intervals) {
    // Copy the non-pause audio that precedes this pause verbatim.
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.begin() + interval.start);
    pos = interval.end;

    int32_t len = interval.end - interval.start;

    // Desired length of the pause after scaling. A non-positive scale would
    // otherwise yield a negative count and an invalid iterator range.
    int32_t n = static_cast<int32_t>(len * scale);
    if (n < 0) {
      n = 0;
    }

    // Never copy more than the pause itself: reading past interval.end would
    // duplicate the following speech, or run off the end of `samples` when
    // the pause is the last thing in the audio.
    int32_t kept = n < len ? n : len;
    ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
                       samples.begin() + interval.start + kept);

    // When lengthening a pause (scale > 1), fill the remainder with silence.
    if (n > kept) {
      ans.samples.insert(ans.samples.end(), static_cast<size_t>(n - kept),
                         0.0f);
    }
  }

  // Copy whatever audio follows the last pause.
  if (pos < num_samples) {
    ans.samples.insert(ans.samples.end(), samples.begin() + pos,
                       samples.end());
  }

  return ans;
}
void OfflineTtsConfig::Register(ParseOptions *po) { void OfflineTtsConfig::Register(ParseOptions *po) {
model.Register(po); model.Register(po);
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
"Maximum number of sentences that we process at a time. " "Maximum number of sentences that we process at a time. "
"This is to avoid OOM for very long input text. " "This is to avoid OOM for very long input text. "
"If you set it to -1, then we process all sentences in a single batch."); "If you set it to -1, then we process all sentences in a single batch.");
po->Register("tts-silence-scale", &silence_scale,
"Duration of the pause is scaled by this number. So a smaller "
"value leads to a shorter pause.");
} }
bool OfflineTtsConfig::Validate() const { bool OfflineTtsConfig::Validate() const {
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
} }
} }
if (silence_scale < 0.001) {
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
return false;
}
return model.Validate(); return model.Validate();
} }
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
os << "model=" << model.ToString() << ", "; os << "model=" << model.ToString() << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", "; os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\", "; os << "rule_fars=\"" << rule_fars << "\", ";
os << "max_num_sentences=" << max_num_sentences << ")"; os << "max_num_sentences=" << max_num_sentences << ", ";
os << "silence_scale=" << silence_scale << ")";
return os.str(); return os.str();
} }

View File

@@ -32,14 +32,20 @@ struct OfflineTtsConfig {
// If you set it to -1, then we process all sentences in a single batch. // If you set it to -1, then we process all sentences in a single batch.
int32_t max_num_sentences = 1; int32_t max_num_sentences = 1;
// A silence interval (a pause) is a run of audio samples whose values are
// close to 0.
//
// The duration of each pause is scaled by silence_scale, i.e.,
// new_duration = old_duration * silence_scale.
float silence_scale = 0.2;
OfflineTtsConfig() = default; OfflineTtsConfig() = default;
OfflineTtsConfig(const OfflineTtsModelConfig &model, OfflineTtsConfig(const OfflineTtsModelConfig &model,
const std::string &rule_fsts, const std::string &rule_fars, const std::string &rule_fsts, const std::string &rule_fars,
int32_t max_num_sentences) int32_t max_num_sentences, float silence_scale)
: model(model), : model(model),
rule_fsts(rule_fsts), rule_fsts(rule_fsts),
rule_fars(rule_fars), rule_fars(rule_fars),
max_num_sentences(max_num_sentences) {} max_num_sentences(max_num_sentences),
silence_scale(silence_scale) {}
void Register(ParseOptions *po); void Register(ParseOptions *po);
bool Validate() const; bool Validate() const;
@@ -50,6 +56,11 @@ struct OfflineTtsConfig {
struct GeneratedAudio { struct GeneratedAudio {
std::vector<float> samples; std::vector<float> samples;
int32_t sample_rate; int32_t sample_rate;
// Silence means pause here.
// If scale > 1, then it increases the duration of a pause
// If scale < 1, then it reduces the duration of a pause
GeneratedAudio ScaleSilence(float scale) const;
}; };
class OfflineTtsImpl; class OfflineTtsImpl;

View File

@@ -7,12 +7,14 @@ public class OfflineTtsConfig {
private final String ruleFsts; private final String ruleFsts;
private final String ruleFars; private final String ruleFars;
private final int maxNumSentences; private final int maxNumSentences;
private final float silenceScale;
private OfflineTtsConfig(Builder builder) { private OfflineTtsConfig(Builder builder) {
this.model = builder.model; this.model = builder.model;
this.ruleFsts = builder.ruleFsts; this.ruleFsts = builder.ruleFsts;
this.ruleFars = builder.ruleFars; this.ruleFars = builder.ruleFars;
this.maxNumSentences = builder.maxNumSentences; this.maxNumSentences = builder.maxNumSentences;
this.silenceScale = builder.silenceScale;
} }
public static Builder builder() { public static Builder builder() {
@@ -35,11 +37,16 @@ public class OfflineTtsConfig {
return maxNumSentences; return maxNumSentences;
} }
/**
 * Returns the silence scale stored in this config.
 *
 * @return the value passed to {@code Builder.setSilenceScale}, or the
 *     builder's default of 0.2 if it was never set
 */
public float getSilenceScale() {
return silenceScale;
}
public static class Builder { public static class Builder {
private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build(); private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build();
private String ruleFsts = ""; private String ruleFsts = "";
private String ruleFars = ""; private String ruleFars = "";
private int maxNumSentences = 1; private int maxNumSentences = 1;
private float silenceScale = 0.2f;
public OfflineTtsConfig build() { public OfflineTtsConfig build() {
return new OfflineTtsConfig(this); return new OfflineTtsConfig(this);
@@ -64,5 +71,10 @@ public class OfflineTtsConfig {
this.maxNumSentences = maxNumSentences; this.maxNumSentences = maxNumSentences;
return this; return this;
} }
/**
 * Sets the silence scale to be stored in the built config.
 *
 * @param silenceScale the new silence scale; the builder default is 0.2
 * @return this builder, so that calls can be chained
 */
public Builder setSilenceScale(float silenceScale) {
this.silenceScale = silenceScale;
return this;
}
} }
} }

View File

@@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(cls, "maxNumSentences", "I"); fid = env->GetFieldID(cls, "maxNumSentences", "I");
ans.max_num_sentences = env->GetIntField(config, fid); ans.max_num_sentences = env->GetIntField(config, fid);
fid = env->GetFieldID(cls, "silenceScale", "F");
ans.silence_scale = env->GetFloatField(config, fid);
return ans; return ans;
} }

View File

@@ -49,6 +49,7 @@ data class OfflineTtsConfig(
var ruleFsts: String = "", var ruleFsts: String = "",
var ruleFars: String = "", var ruleFars: String = "",
var maxNumSentences: Int = 1, var maxNumSentences: Int = 1,
var silenceScale: Float = 0.2f,
) )
class GeneratedAudio( class GeneratedAudio(

View File

@@ -106,6 +106,7 @@ type
RuleFsts: AnsiString; RuleFsts: AnsiString;
MaxNumSentences: Integer; MaxNumSentences: Integer;
RuleFars: AnsiString; RuleFars: AnsiString;
SilenceScale: Single;
function ToString: AnsiString; function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
@@ -777,6 +778,7 @@ type
RuleFsts: PAnsiChar; RuleFsts: PAnsiChar;
MaxNumSentences: cint32; MaxNumSentences: cint32;
RuleFars: PAnsiChar; RuleFars: PAnsiChar;
SilenceScale: cfloat;
end; end;
PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
@@ -1976,15 +1978,17 @@ begin
'Model := %s, ' + 'Model := %s, ' +
'RuleFsts := %s, ' + 'RuleFsts := %s, ' +
'MaxNumSentences := %d, ' + 'MaxNumSentences := %d, ' +
'RuleFars := %s' + 'RuleFars := %s, ' +
'SilenceScale := %f' +
')', ')',
[Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars,
]); Self.SilenceScale]);
end; end;
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
begin begin
Dest.MaxNumSentences := 1; Dest.MaxNumSentences := 1;
Dest.SilenceScale := 0.2;
end; end;
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
@@ -2027,6 +2031,7 @@ begin
C.RuleFsts := PAnsiChar(Config.RuleFsts); C.RuleFsts := PAnsiChar(Config.RuleFsts);
C.MaxNumSentences := Config.MaxNumSentences; C.MaxNumSentences := Config.MaxNumSentences;
C.RuleFars := PAnsiChar(Config.RuleFars); C.RuleFars := PAnsiChar(Config.RuleFars);
C.SilenceScale := Config.SilenceScale;
Self.Handle := SherpaOnnxCreateOfflineTts(@C); Self.Handle := SherpaOnnxCreateOfflineTts(@C);

View File

@@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) {
py::class_<PyClass>(*m, "OfflineTtsConfig") py::class_<PyClass>(*m, "OfflineTtsConfig")
.def(py::init<>()) .def(py::init<>())
.def(py::init<const OfflineTtsModelConfig &, const std::string &, .def(py::init<const OfflineTtsModelConfig &, const std::string &,
const std::string &, int32_t>(), const std::string &, int32_t, float>(),
py::arg("model"), py::arg("rule_fsts") = "", py::arg("model"), py::arg("rule_fsts") = "",
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2) py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2,
py::arg("silence_scale") = 0.2)
.def_readwrite("model", &PyClass::model) .def_readwrite("model", &PyClass::model)
.def_readwrite("rule_fsts", &PyClass::rule_fsts) .def_readwrite("rule_fsts", &PyClass::rule_fsts)
.def_readwrite("rule_fars", &PyClass::rule_fars) .def_readwrite("rule_fars", &PyClass::rule_fars)
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences) .def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
.def_readwrite("silence_scale", &PyClass::silence_scale)
.def("validate", &PyClass::Validate) .def("validate", &PyClass::Validate)
.def("__str__", &PyClass::ToString); .def("__str__", &PyClass::ToString);
} }

View File

@@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig(
model: SherpaOnnxOfflineTtsModelConfig, model: SherpaOnnxOfflineTtsModelConfig,
ruleFsts: String = "", ruleFsts: String = "",
ruleFars: String = "", ruleFars: String = "",
maxNumSentences: Int = 1 maxNumSentences: Int = 1,
silenceScale: Float = 0.2
) -> SherpaOnnxOfflineTtsConfig { ) -> SherpaOnnxOfflineTtsConfig {
return SherpaOnnxOfflineTtsConfig( return SherpaOnnxOfflineTtsConfig(
model: model, model: model,
rule_fsts: toCPointer(ruleFsts), rule_fsts: toCPointer(ruleFsts),
max_num_sentences: Int32(maxNumSentences), max_num_sentences: Int32(maxNumSentences),
rule_fars: toCPointer(ruleFars) rule_fars: toCPointer(ruleFars),
silence_scale: silenceScale
) )
} }

View File

@@ -21,7 +21,7 @@ function freeConfig(config, Module) {
// The user should free the returned pointers // The user should free the returned pointers
function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
const modelLen = Module.lengthBytesUTF8(config.model || '')+ 1; const modelLen = Module.lengthBytesUTF8(config.model || '') + 1;
const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
@@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
function initSherpaOnnxOfflineTtsConfig(config, Module) { function initSherpaOnnxOfflineTtsConfig(config, Module) {
const modelConfig = const modelConfig =
initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
const len = modelConfig.len + 3 * 4; const len = modelConfig.len + 4 * 4;
const ptr = Module._malloc(len); const ptr = Module._malloc(len);
let offset = 0; let offset = 0;
@@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) {
offset += 4; offset += 4;
Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
offset += 4;
Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float');
offset += 4;
return { return {
buffer: buffer, ptr: ptr, len: len, config: modelConfig, buffer: buffer, ptr: ptr, len: len, config: modelConfig,

View File

@@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
""); "");
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4,
""); "");
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
@@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
fprintf(stdout, "silence scale: %.3f\n", tts_config->silence_scale);
} }
void CopyHeap(const char *src, int32_t num_bytes, char *dst) { void CopyHeap(const char *src, int32_t num_bytes, char *dst) {