Support scaling the duration of a pause in TTS. (#1820)
This commit is contained in:
@@ -116,7 +116,7 @@ int32_t main() {
|
|||||||
keywords_spotter_config.keywords_buf = keywords_buf;
|
keywords_spotter_config.keywords_buf = keywords_buf;
|
||||||
keywords_spotter_config.keywords_buf_size = keywords_buf_size;
|
keywords_spotter_config.keywords_buf_size = keywords_buf_size;
|
||||||
|
|
||||||
SherpaOnnxKeywordSpotter *keywords_spotter =
|
const SherpaOnnxKeywordSpotter *keywords_spotter =
|
||||||
SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config);
|
SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config);
|
||||||
|
|
||||||
free((void *)tokens_buf);
|
free((void *)tokens_buf);
|
||||||
@@ -130,7 +130,7 @@ int32_t main() {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
SherpaOnnxOnlineStream *stream =
|
const SherpaOnnxOnlineStream *stream =
|
||||||
SherpaOnnxCreateKeywordStream(keywords_spotter);
|
SherpaOnnxCreateKeywordStream(keywords_spotter);
|
||||||
|
|
||||||
const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
|
const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
|
||||||
|
|||||||
@@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct {
|
|||||||
external int maxNumSenetences;
|
external int maxNumSenetences;
|
||||||
|
|
||||||
external Pointer<Utf8> ruleFars;
|
external Pointer<Utf8> ruleFars;
|
||||||
|
|
||||||
|
@Float()
|
||||||
|
external double silenceScale;
|
||||||
}
|
}
|
||||||
|
|
||||||
final class SherpaOnnxGeneratedAudio extends Struct {
|
final class SherpaOnnxGeneratedAudio extends Struct {
|
||||||
|
|||||||
@@ -114,17 +114,19 @@ class OfflineTtsConfig {
|
|||||||
this.ruleFsts = '',
|
this.ruleFsts = '',
|
||||||
this.maxNumSenetences = 1,
|
this.maxNumSenetences = 1,
|
||||||
this.ruleFars = '',
|
this.ruleFars = '',
|
||||||
|
this.silenceScale = 0.2,
|
||||||
});
|
});
|
||||||
|
|
||||||
@override
|
@override
|
||||||
String toString() {
|
String toString() {
|
||||||
return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)';
|
return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)';
|
||||||
}
|
}
|
||||||
|
|
||||||
final OfflineTtsModelConfig model;
|
final OfflineTtsModelConfig model;
|
||||||
final String ruleFsts;
|
final String ruleFsts;
|
||||||
final int maxNumSenetences;
|
final int maxNumSenetences;
|
||||||
final String ruleFars;
|
final String ruleFars;
|
||||||
|
final double silenceScale;
|
||||||
}
|
}
|
||||||
|
|
||||||
class GeneratedAudio {
|
class GeneratedAudio {
|
||||||
@@ -180,6 +182,7 @@ class OfflineTts {
|
|||||||
c.ref.ruleFsts = config.ruleFsts.toNativeUtf8();
|
c.ref.ruleFsts = config.ruleFsts.toNativeUtf8();
|
||||||
c.ref.maxNumSenetences = config.maxNumSenetences;
|
c.ref.maxNumSenetences = config.maxNumSenetences;
|
||||||
c.ref.ruleFars = config.ruleFars.toNativeUtf8();
|
c.ref.ruleFars = config.ruleFars.toNativeUtf8();
|
||||||
|
c.ref.silenceScale = config.silenceScale;
|
||||||
|
|
||||||
final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;
|
final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;
|
||||||
|
|
||||||
|
|||||||
@@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
|
|||||||
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
|
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
|
||||||
SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences);
|
SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences);
|
||||||
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
|
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
|
||||||
|
// silence_scale is a float field of SherpaOnnxOfflineTtsConfig, so it is read
// as a number here; the string macro would try to store a char pointer in it.
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(silence_scale, silenceScale);
|
||||||
|
|
||||||
#if __OHOS__
|
#if __OHOS__
|
||||||
std::unique_ptr<NativeResourceManager,
|
std::unique_ptr<NativeResourceManager,
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ export class OfflineTtsConfig {
|
|||||||
public ruleFsts: string = '';
|
public ruleFsts: string = '';
|
||||||
public ruleFars: string = '';
|
public ruleFars: string = '';
|
||||||
public maxNumSentences: number = 1;
|
public maxNumSentences: number = 1;
|
||||||
|
public silenceScale: number = 0.2;
|
||||||
}
|
}
|
||||||
|
|
||||||
export class TtsOutput {
|
export class TtsOutput {
|
||||||
@@ -98,4 +99,4 @@ export class OfflineTts {
|
|||||||
generateAsync(input: TtsInput): Promise<TtsOutput> {
|
generateAsync(input: TtsInput): Promise<TtsOutput> {
|
||||||
return offlineTtsGenerateAsync(this.handle, input);
|
return offlineTtsGenerateAsync(this.handle, input);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ namespace SherpaOnnx
|
|||||||
RuleFsts = "";
|
RuleFsts = "";
|
||||||
MaxNumSentences = 1;
|
MaxNumSentences = 1;
|
||||||
RuleFars = "";
|
RuleFars = "";
|
||||||
|
SilenceScale = 0.2F;
|
||||||
}
|
}
|
||||||
public OfflineTtsModelConfig Model;
|
public OfflineTtsModelConfig Model;
|
||||||
|
|
||||||
@@ -23,6 +24,7 @@ namespace SherpaOnnx
|
|||||||
|
|
||||||
[MarshalAs(UnmanagedType.LPStr)]
|
[MarshalAs(UnmanagedType.LPStr)]
|
||||||
public string RuleFars;
|
public string RuleFars;
|
||||||
}
|
|
||||||
|
|
||||||
}
|
public float SilenceScale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -712,6 +712,7 @@ type OfflineTtsConfig struct {
|
|||||||
RuleFsts string
|
RuleFsts string
|
||||||
RuleFars string
|
RuleFars string
|
||||||
MaxNumSentences int
|
MaxNumSentences int
|
||||||
|
SilenceScale float32
|
||||||
}
|
}
|
||||||
|
|
||||||
type GeneratedAudio struct {
|
type GeneratedAudio struct {
|
||||||
@@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
|
|||||||
defer C.free(unsafe.Pointer(c.rule_fars))
|
defer C.free(unsafe.Pointer(c.rule_fars))
|
||||||
|
|
||||||
c.max_num_sentences = C.int(config.MaxNumSentences)
|
c.max_num_sentences = C.int(config.MaxNumSentences)
|
||||||
|
c.silence_scale = C.float(config.SilenceScale)
|
||||||
|
|
||||||
// vits
|
// vits
|
||||||
c.model.vits.model = C.CString(config.Model.Vits.Model)
|
c.model.vits.model = C.CString(config.Model.Vits.Model)
|
||||||
|
|||||||
@@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
|
|||||||
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
|
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
|
||||||
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
|
tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
|
||||||
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
|
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
|
||||||
|
tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2);
|
||||||
|
|
||||||
if (tts_config.model.debug) {
|
if (tts_config.model.debug) {
|
||||||
#if __OHOS__
|
#if __OHOS__
|
||||||
|
|||||||
@@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
|
|||||||
const char *rule_fsts;
|
const char *rule_fsts;
|
||||||
int32_t max_num_sentences;
|
int32_t max_num_sentences;
|
||||||
const char *rule_fars;
|
const char *rule_fars;
|
||||||
|
float silence_scale;
|
||||||
} SherpaOnnxOfflineTtsConfig;
|
} SherpaOnnxOfflineTtsConfig;
|
||||||
|
|
||||||
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
|
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
|
||||||
|
|||||||
@@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
|
|||||||
|
|
||||||
c.rule_fsts = config.rule_fsts.c_str();
|
c.rule_fsts = config.rule_fsts.c_str();
|
||||||
c.max_num_sentences = config.max_num_sentences;
|
c.max_num_sentences = config.max_num_sentences;
|
||||||
|
c.silence_scale = config.silence_scale;
|
||||||
c.rule_fars = config.rule_fars.c_str();
|
c.rule_fars = config.rule_fars.c_str();
|
||||||
|
|
||||||
auto p = SherpaOnnxCreateOfflineTts(&c);
|
auto p = SherpaOnnxCreateOfflineTts(&c);
|
||||||
|
|||||||
@@ -363,6 +363,7 @@ struct OfflineTtsConfig {
|
|||||||
std::string rule_fsts;
|
std::string rule_fsts;
|
||||||
std::string rule_fars;
|
std::string rule_fars;
|
||||||
int32_t max_num_sentences = 1;
|
int32_t max_num_sentences = 1;
|
||||||
|
float silence_scale = 0.2;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GeneratedAudio {
|
struct GeneratedAudio {
|
||||||
|
|||||||
@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
|||||||
GeneratedAudio ans;
|
GeneratedAudio ans;
|
||||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||||
ans.samples = std::vector<float>(p, p + total);
|
ans.samples = std::vector<float>(p, p + total);
|
||||||
|
|
||||||
|
float silence_scale = config_.silence_scale;
|
||||||
|
if (silence_scale != 1) {
|
||||||
|
ans = ans.ScaleSilence(silence_scale);
|
||||||
|
}
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
|
|||||||
GeneratedAudio ans;
|
GeneratedAudio ans;
|
||||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||||
ans.samples = std::vector<float>(p, p + total);
|
ans.samples = std::vector<float>(p, p + total);
|
||||||
|
|
||||||
|
float silence_scale = config_.silence_scale;
|
||||||
|
if (silence_scale != 1) {
|
||||||
|
ans = ans.ScaleSilence(silence_scale);
|
||||||
|
}
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
|||||||
GeneratedAudio ans;
|
GeneratedAudio ans;
|
||||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||||
ans.samples = std::vector<float>(p, p + total);
|
ans.samples = std::vector<float>(p, p + total);
|
||||||
|
|
||||||
|
float silence_scale = config_.silence_scale;
|
||||||
|
if (silence_scale != 1) {
|
||||||
|
ans = ans.ScaleSilence(silence_scale);
|
||||||
|
}
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
#include "sherpa-onnx/csrc/offline-tts.h"
|
#include "sherpa-onnx/csrc/offline-tts.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
@@ -23,6 +24,72 @@
|
|||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
struct SilenceInterval {
|
||||||
|
int32_t start;
|
||||||
|
int32_t end;
|
||||||
|
};
|
||||||
|
|
||||||
|
GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
|
||||||
|
if (scale == 1) {
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
// if the interval is larger than 0.6 second, then we assume it is a pause
|
||||||
|
int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
|
||||||
|
|
||||||
|
std::vector<SilenceInterval> intervals;
|
||||||
|
int32_t num_samples = static_cast<int32_t>(samples.size());
|
||||||
|
|
||||||
|
int32_t last = -1;
|
||||||
|
int32_t i;
|
||||||
|
for (i = 0; i != num_samples; ++i) {
|
||||||
|
if (fabs(samples[i]) <= 0.01) {
|
||||||
|
if (last == -1) {
|
||||||
|
last = i;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last != -1 && i - last < threshold) {
|
||||||
|
last = -1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last != -1) {
|
||||||
|
intervals.push_back({last, i});
|
||||||
|
last = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last != -1 && num_samples - last > threshold) {
|
||||||
|
intervals.push_back({last, num_samples});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (intervals.empty()) {
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GeneratedAudio ans;
|
||||||
|
ans.sample_rate = sample_rate;
|
||||||
|
ans.samples.reserve(samples.size());
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
for (const auto &interval : intervals) {
|
||||||
|
ans.samples.insert(ans.samples.end(), samples.begin() + i,
|
||||||
|
samples.begin() + interval.start);
|
||||||
|
i = interval.end;
|
||||||
|
int32_t n = static_cast<int32_t>((interval.end - interval.start) * scale);
|
||||||
|
|
||||||
|
ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
|
||||||
|
samples.begin() + interval.start + n);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < num_samples) {
|
||||||
|
ans.samples.insert(ans.samples.end(), samples.begin() + i, samples.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
void OfflineTtsConfig::Register(ParseOptions *po) {
|
void OfflineTtsConfig::Register(ParseOptions *po) {
|
||||||
model.Register(po);
|
model.Register(po);
|
||||||
|
|
||||||
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
|
|||||||
"Maximum number of sentences that we process at a time. "
|
"Maximum number of sentences that we process at a time. "
|
||||||
"This is to avoid OOM for very long input text. "
|
"This is to avoid OOM for very long input text. "
|
||||||
"If you set it to -1, then we process all sentences in a single batch.");
|
"If you set it to -1, then we process all sentences in a single batch.");
|
||||||
|
|
||||||
|
po->Register("tts-silence-scale", &silence_scale,
|
||||||
|
"Duration of the pause is scaled by this number. So a smaller "
|
||||||
|
"value leads to a shorter pause.");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OfflineTtsConfig::Validate() const {
|
bool OfflineTtsConfig::Validate() const {
|
||||||
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (silence_scale < 0.001) {
|
||||||
|
SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return model.Validate();
|
return model.Validate();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
|
|||||||
os << "model=" << model.ToString() << ", ";
|
os << "model=" << model.ToString() << ", ";
|
||||||
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
||||||
os << "rule_fars=\"" << rule_fars << "\", ";
|
os << "rule_fars=\"" << rule_fars << "\", ";
|
||||||
os << "max_num_sentences=" << max_num_sentences << ")";
|
os << "max_num_sentences=" << max_num_sentences << ", ";
|
||||||
|
os << "silence_scale=" << silence_scale << ")";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,14 +32,20 @@ struct OfflineTtsConfig {
|
|||||||
// If you set it to -1, then we process all sentences in a single batch.
|
// If you set it to -1, then we process all sentences in a single batch.
|
||||||
int32_t max_num_sentences = 1;
|
int32_t max_num_sentences = 1;
|
||||||
|
|
||||||
|
// A silence interval is a run of consecutive audio samples with values close to 0.
|
||||||
|
//
|
||||||
|
// The duration of each silence interval after scaling is old_duration * silence_scale.
|
||||||
|
float silence_scale = 0.2;
|
||||||
|
|
||||||
OfflineTtsConfig() = default;
|
OfflineTtsConfig() = default;
|
||||||
OfflineTtsConfig(const OfflineTtsModelConfig &model,
|
OfflineTtsConfig(const OfflineTtsModelConfig &model,
|
||||||
const std::string &rule_fsts, const std::string &rule_fars,
|
const std::string &rule_fsts, const std::string &rule_fars,
|
||||||
int32_t max_num_sentences)
|
int32_t max_num_sentences, float silence_scale)
|
||||||
: model(model),
|
: model(model),
|
||||||
rule_fsts(rule_fsts),
|
rule_fsts(rule_fsts),
|
||||||
rule_fars(rule_fars),
|
rule_fars(rule_fars),
|
||||||
max_num_sentences(max_num_sentences) {}
|
max_num_sentences(max_num_sentences),
|
||||||
|
silence_scale(silence_scale) {}
|
||||||
|
|
||||||
void Register(ParseOptions *po);
|
void Register(ParseOptions *po);
|
||||||
bool Validate() const;
|
bool Validate() const;
|
||||||
@@ -50,6 +56,11 @@ struct OfflineTtsConfig {
|
|||||||
struct GeneratedAudio {
|
struct GeneratedAudio {
|
||||||
std::vector<float> samples;
|
std::vector<float> samples;
|
||||||
int32_t sample_rate;
|
int32_t sample_rate;
|
||||||
|
|
||||||
|
// Silence means pause here.
|
||||||
|
// If scale > 1, then it increases the duration of a pause
|
||||||
|
// If scale < 1, then it reduces the duration of a pause
|
||||||
|
GeneratedAudio ScaleSilence(float scale) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
class OfflineTtsImpl;
|
class OfflineTtsImpl;
|
||||||
|
|||||||
@@ -7,12 +7,14 @@ public class OfflineTtsConfig {
|
|||||||
private final String ruleFsts;
|
private final String ruleFsts;
|
||||||
private final String ruleFars;
|
private final String ruleFars;
|
||||||
private final int maxNumSentences;
|
private final int maxNumSentences;
|
||||||
|
private final float silenceScale;
|
||||||
|
|
||||||
private OfflineTtsConfig(Builder builder) {
|
private OfflineTtsConfig(Builder builder) {
|
||||||
this.model = builder.model;
|
this.model = builder.model;
|
||||||
this.ruleFsts = builder.ruleFsts;
|
this.ruleFsts = builder.ruleFsts;
|
||||||
this.ruleFars = builder.ruleFars;
|
this.ruleFars = builder.ruleFars;
|
||||||
this.maxNumSentences = builder.maxNumSentences;
|
this.maxNumSentences = builder.maxNumSentences;
|
||||||
|
this.silenceScale = builder.silenceScale;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Builder builder() {
|
public static Builder builder() {
|
||||||
@@ -35,11 +37,16 @@ public class OfflineTtsConfig {
|
|||||||
return maxNumSentences;
|
return maxNumSentences;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public float getSilenceScale() {
|
||||||
|
return silenceScale;
|
||||||
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build();
|
private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build();
|
||||||
private String ruleFsts = "";
|
private String ruleFsts = "";
|
||||||
private String ruleFars = "";
|
private String ruleFars = "";
|
||||||
private int maxNumSentences = 1;
|
private int maxNumSentences = 1;
|
||||||
|
private float silenceScale = 0.2f;
|
||||||
|
|
||||||
public OfflineTtsConfig build() {
|
public OfflineTtsConfig build() {
|
||||||
return new OfflineTtsConfig(this);
|
return new OfflineTtsConfig(this);
|
||||||
@@ -64,5 +71,10 @@ public class OfflineTtsConfig {
|
|||||||
this.maxNumSentences = maxNumSentences;
|
this.maxNumSentences = maxNumSentences;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder setSilenceScale(float silenceScale) {
|
||||||
|
this.silenceScale = silenceScale;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
|
|||||||
fid = env->GetFieldID(cls, "maxNumSentences", "I");
|
fid = env->GetFieldID(cls, "maxNumSentences", "I");
|
||||||
ans.max_num_sentences = env->GetIntField(config, fid);
|
ans.max_num_sentences = env->GetIntField(config, fid);
|
||||||
|
|
||||||
|
fid = env->GetFieldID(cls, "silenceScale", "F");
|
||||||
|
ans.silence_scale = env->GetFloatField(config, fid);
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ data class OfflineTtsConfig(
|
|||||||
var ruleFsts: String = "",
|
var ruleFsts: String = "",
|
||||||
var ruleFars: String = "",
|
var ruleFars: String = "",
|
||||||
var maxNumSentences: Int = 1,
|
var maxNumSentences: Int = 1,
|
||||||
|
var silenceScale: Float = 0.2f,
|
||||||
)
|
)
|
||||||
|
|
||||||
class GeneratedAudio(
|
class GeneratedAudio(
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ type
|
|||||||
RuleFsts: AnsiString;
|
RuleFsts: AnsiString;
|
||||||
MaxNumSentences: Integer;
|
MaxNumSentences: Integer;
|
||||||
RuleFars: AnsiString;
|
RuleFars: AnsiString;
|
||||||
|
SilenceScale: Single;
|
||||||
|
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
|
||||||
@@ -777,6 +778,7 @@ type
|
|||||||
RuleFsts: PAnsiChar;
|
RuleFsts: PAnsiChar;
|
||||||
MaxNumSentences: cint32;
|
MaxNumSentences: cint32;
|
||||||
RuleFars: PAnsiChar;
|
RuleFars: PAnsiChar;
|
||||||
|
SilenceScale: cfloat;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
|
PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
|
||||||
@@ -1976,15 +1978,17 @@ begin
|
|||||||
'Model := %s, ' +
|
'Model := %s, ' +
|
||||||
'RuleFsts := %s, ' +
|
'RuleFsts := %s, ' +
|
||||||
'MaxNumSentences := %d, ' +
|
'MaxNumSentences := %d, ' +
|
||||||
'RuleFars := %s' +
|
'RuleFars := %s, ' +
|
||||||
|
'SilenceScale := %f' +
|
||||||
')',
|
')',
|
||||||
[Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars
|
[Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars,
|
||||||
]);
|
Self.SilenceScale]);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
|
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
|
||||||
begin
|
begin
|
||||||
Dest.MaxNumSentences := 1;
|
Dest.MaxNumSentences := 1;
|
||||||
|
Dest.SilenceScale := 0.2;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
|
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
|
||||||
@@ -2027,6 +2031,7 @@ begin
|
|||||||
C.RuleFsts := PAnsiChar(Config.RuleFsts);
|
C.RuleFsts := PAnsiChar(Config.RuleFsts);
|
||||||
C.MaxNumSentences := Config.MaxNumSentences;
|
C.MaxNumSentences := Config.MaxNumSentences;
|
||||||
C.RuleFars := PAnsiChar(Config.RuleFars);
|
C.RuleFars := PAnsiChar(Config.RuleFars);
|
||||||
|
C.SilenceScale := Config.SilenceScale;
|
||||||
|
|
||||||
Self.Handle := SherpaOnnxCreateOfflineTts(@C);
|
Self.Handle := SherpaOnnxCreateOfflineTts(@C);
|
||||||
|
|
||||||
|
|||||||
@@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) {
|
|||||||
py::class_<PyClass>(*m, "OfflineTtsConfig")
|
py::class_<PyClass>(*m, "OfflineTtsConfig")
|
||||||
.def(py::init<>())
|
.def(py::init<>())
|
||||||
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
||||||
const std::string &, int32_t>(),
|
const std::string &, int32_t, float>(),
|
||||||
py::arg("model"), py::arg("rule_fsts") = "",
|
py::arg("model"), py::arg("rule_fsts") = "",
|
||||||
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2)
|
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2,
|
||||||
|
py::arg("silence_scale") = 0.2)
|
||||||
.def_readwrite("model", &PyClass::model)
|
.def_readwrite("model", &PyClass::model)
|
||||||
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
||||||
.def_readwrite("rule_fars", &PyClass::rule_fars)
|
.def_readwrite("rule_fars", &PyClass::rule_fars)
|
||||||
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
|
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
|
||||||
|
.def_readwrite("silence_scale", &PyClass::silence_scale)
|
||||||
.def("validate", &PyClass::Validate)
|
.def("validate", &PyClass::Validate)
|
||||||
.def("__str__", &PyClass::ToString);
|
.def("__str__", &PyClass::ToString);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig(
|
|||||||
model: SherpaOnnxOfflineTtsModelConfig,
|
model: SherpaOnnxOfflineTtsModelConfig,
|
||||||
ruleFsts: String = "",
|
ruleFsts: String = "",
|
||||||
ruleFars: String = "",
|
ruleFars: String = "",
|
||||||
maxNumSentences: Int = 1
|
maxNumSentences: Int = 1,
|
||||||
|
silenceScale: Float = 0.2
|
||||||
) -> SherpaOnnxOfflineTtsConfig {
|
) -> SherpaOnnxOfflineTtsConfig {
|
||||||
return SherpaOnnxOfflineTtsConfig(
|
return SherpaOnnxOfflineTtsConfig(
|
||||||
model: model,
|
model: model,
|
||||||
rule_fsts: toCPointer(ruleFsts),
|
rule_fsts: toCPointer(ruleFsts),
|
||||||
max_num_sentences: Int32(maxNumSentences),
|
max_num_sentences: Int32(maxNumSentences),
|
||||||
rule_fars: toCPointer(ruleFars)
|
rule_fars: toCPointer(ruleFars),
|
||||||
|
silence_scale: silenceScale
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ function freeConfig(config, Module) {
|
|||||||
|
|
||||||
// The user should free the returned pointers
|
// The user should free the returned pointers
|
||||||
function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
|
function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
|
||||||
const modelLen = Module.lengthBytesUTF8(config.model || '')+ 1;
|
const modelLen = Module.lengthBytesUTF8(config.model || '') + 1;
|
||||||
const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
|
const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
|
||||||
const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
|
const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
|
||||||
const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
|
const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
|
||||||
@@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|||||||
function initSherpaOnnxOfflineTtsConfig(config, Module) {
|
function initSherpaOnnxOfflineTtsConfig(config, Module) {
|
||||||
const modelConfig =
|
const modelConfig =
|
||||||
initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
|
initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
|
||||||
const len = modelConfig.len + 3 * 4;
|
const len = modelConfig.len + 4 * 4;
|
||||||
const ptr = Module._malloc(len);
|
const ptr = Module._malloc(len);
|
||||||
|
|
||||||
let offset = 0;
|
let offset = 0;
|
||||||
@@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) {
|
|||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
|
Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
|
||||||
|
offset += 4;
|
||||||
|
|
||||||
|
Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float');
|
||||||
|
offset += 4;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
buffer: buffer, ptr: ptr, len: len, config: modelConfig,
|
buffer: buffer, ptr: ptr, len: len, config: modelConfig,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
|||||||
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
|
||||||
"");
|
"");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
||||||
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4,
|
||||||
"");
|
"");
|
||||||
|
|
||||||
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
||||||
@@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
|||||||
fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
|
fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
|
||||||
fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
|
fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
|
||||||
fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
|
fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
|
||||||
|
fprintf(stdout, "silence scale: %.3f\n", tts_config->silence_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
|
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
|
||||||
|
|||||||
Reference in New Issue
Block a user