Add APIs about max speech duration in VAD for various programming languages (#1349)
This commit is contained in:
@@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
|
||||
vad_config.silero_vad.window_size =
|
||||
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
|
||||
|
||||
vad_config.silero_vad.max_speech_duration =
|
||||
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
|
||||
|
||||
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
|
||||
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
|
||||
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
|
||||
|
||||
@@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
|
||||
float min_speech_duration;
|
||||
|
||||
int window_size;
|
||||
|
||||
// If a speech segment is longer than this value, then we increase
|
||||
// the threshold to 0.9. Once detection of that segment has finished,
|
||||
// the threshold value is reset to its original value.
|
||||
float max_speech_duration;
|
||||
} SherpaOnnxSileroVadModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
|
||||
|
||||
@@ -8,6 +8,7 @@ public class SileroVadModelConfig {
|
||||
private final float minSilenceDuration;
|
||||
private final float minSpeechDuration;
|
||||
private final int windowSize;
|
||||
private final float maxSpeechDuration;
|
||||
|
||||
private SileroVadModelConfig(Builder builder) {
|
||||
this.model = builder.model;
|
||||
@@ -15,6 +16,7 @@ public class SileroVadModelConfig {
|
||||
this.minSilenceDuration = builder.minSilenceDuration;
|
||||
this.minSpeechDuration = builder.minSpeechDuration;
|
||||
this.windowSize = builder.windowSize;
|
||||
this.maxSpeechDuration = builder.maxSpeechDuration;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
@@ -41,12 +43,17 @@ public class SileroVadModelConfig {
|
||||
return windowSize;
|
||||
}
|
||||
|
||||
public float getMaxSpeechDuration() {
|
||||
return maxSpeechDuration;
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private String model = "";
|
||||
private float threshold = 0.5f;
|
||||
private float minSilenceDuration = 0.25f;
|
||||
private float minSpeechDuration = 0.5f;
|
||||
private int windowSize = 512;
|
||||
private float maxSpeechDuration = 5.0f;
|
||||
|
||||
public SileroVadModelConfig build() {
|
||||
return new SileroVadModelConfig(this);
|
||||
@@ -77,5 +84,10 @@ public class SileroVadModelConfig {
|
||||
this.windowSize = windowSize;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setMaxSpeechDuration(float maxSpeechDuration) {
|
||||
this.maxSpeechDuration = maxSpeechDuration;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
|
||||
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
|
||||
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
|
||||
|
||||
fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
|
||||
ans.silero_vad.max_speech_duration =
|
||||
env->GetFloatField(silero_vad_config, fid);
|
||||
|
||||
fid = env->GetFieldID(cls, "sampleRate", "I");
|
||||
ans.sample_rate = env->GetIntField(config, fid);
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ data class SileroVadModelConfig(
|
||||
var minSilenceDuration: Float = 0.25F,
|
||||
var minSpeechDuration: Float = 0.25F,
|
||||
var windowSize: Int = 512,
|
||||
var maxSpeechDuration: Float = 5.0F,
|
||||
)
|
||||
|
||||
data class VadModelConfig(
|
||||
|
||||
@@ -341,6 +341,7 @@ type
|
||||
MinSilenceDuration: Single;
|
||||
MinSpeechDuration: Single;
|
||||
WindowSize: Integer;
|
||||
MaxSpeechDuration: Single;
|
||||
function ToString: AnsiString;
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
|
||||
end;
|
||||
@@ -594,6 +595,7 @@ type
|
||||
MinSilenceDuration: cfloat;
|
||||
MinSpeechDuration: cfloat;
|
||||
WindowSize: cint32;
|
||||
MaxSpeechDuration: cfloat;
|
||||
end;
|
||||
SherpaOnnxVadModelConfig = record
|
||||
SileroVad: SherpaOnnxSileroVadModelConfig;
|
||||
@@ -1402,10 +1404,11 @@ begin
|
||||
'Threshold := %.2f, ' +
|
||||
'MinSilenceDuration := %.2f, ' +
|
||||
'MinSpeechDuration := %.2f, ' +
|
||||
'WindowSize := %d' +
|
||||
'WindowSize := %d, ' +
|
||||
'MaxSpeechDuration := %.2f' +
|
||||
')',
|
||||
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
|
||||
Self.MinSpeechDuration, Self.WindowSize
|
||||
Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
|
||||
]);
|
||||
end;
|
||||
|
||||
@@ -1415,6 +1418,7 @@ begin
|
||||
Dest.MinSilenceDuration := 0.5;
|
||||
Dest.MinSpeechDuration := 0.25;
|
||||
Dest.WindowSize := 512;
|
||||
Dest.MaxSpeechDuration := 5.0;
|
||||
end;
|
||||
|
||||
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
|
||||
@@ -1569,6 +1573,7 @@ begin
|
||||
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
|
||||
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
|
||||
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
|
||||
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
|
||||
|
||||
C.SampleRate := Config.SampleRate;
|
||||
C.NumThreads := Config.NumThreads;
|
||||
|
||||
Reference in New Issue
Block a user