Add APIs about max speech duration in VAD for various programming languages (#1349)
This commit is contained in:
2
.github/workflows/dot-net.yaml
vendored
2
.github/workflows/dot-net.yaml
vendored
@@ -93,6 +93,8 @@ jobs:
|
|||||||
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
|
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
|
||||||
|
|
||||||
cd huggingface
|
cd huggingface
|
||||||
|
git fetch
|
||||||
|
git pull
|
||||||
mkdir -p windows-for-dotnet
|
mkdir -p windows-for-dotnet
|
||||||
|
|
||||||
cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
|
cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
|
|||||||
model: sileroVad,
|
model: sileroVad,
|
||||||
minSilenceDuration: 0.25,
|
minSilenceDuration: 0.25,
|
||||||
minSpeechDuration: 0.5,
|
minSpeechDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
final vadConfig = sherpa_onnx.VadModelConfig(
|
final vadConfig = sherpa_onnx.VadModelConfig(
|
||||||
|
|||||||
@@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {
|
|||||||
|
|
||||||
@Int32()
|
@Int32()
|
||||||
external int windowSize;
|
external int windowSize;
|
||||||
|
|
||||||
|
@Float()
|
||||||
|
external double maxSpeechDuration;
|
||||||
}
|
}
|
||||||
|
|
||||||
final class SherpaOnnxVadModelConfig extends Struct {
|
final class SherpaOnnxVadModelConfig extends Struct {
|
||||||
|
|||||||
@@ -11,11 +11,12 @@ class SileroVadModelConfig {
|
|||||||
this.threshold = 0.5,
|
this.threshold = 0.5,
|
||||||
this.minSilenceDuration = 0.5,
|
this.minSilenceDuration = 0.5,
|
||||||
this.minSpeechDuration = 0.25,
|
this.minSpeechDuration = 0.25,
|
||||||
this.windowSize = 512});
|
this.windowSize = 512,
|
||||||
|
this.maxSpeechDuration = 5.0});
|
||||||
|
|
||||||
@override
|
@override
|
||||||
String toString() {
|
String toString() {
|
||||||
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)';
|
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
|
||||||
}
|
}
|
||||||
|
|
||||||
final String model;
|
final String model;
|
||||||
@@ -23,6 +24,7 @@ class SileroVadModelConfig {
|
|||||||
final double minSilenceDuration;
|
final double minSilenceDuration;
|
||||||
final double minSpeechDuration;
|
final double minSpeechDuration;
|
||||||
final int windowSize;
|
final int windowSize;
|
||||||
|
final double maxSpeechDuration;
|
||||||
}
|
}
|
||||||
|
|
||||||
class VadModelConfig {
|
class VadModelConfig {
|
||||||
@@ -127,6 +129,7 @@ class VoiceActivityDetector {
|
|||||||
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
|
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
|
||||||
c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
|
c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
|
||||||
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
|
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
|
||||||
|
c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;
|
||||||
|
|
||||||
c.ref.sampleRate = config.sampleRate;
|
c.ref.sampleRate = config.sampleRate;
|
||||||
c.ref.numThreads = config.numThreads;
|
c.ref.numThreads = config.numThreads;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ func main() {
|
|||||||
config.SileroVad.MinSilenceDuration = 0.5
|
config.SileroVad.MinSilenceDuration = 0.5
|
||||||
config.SileroVad.MinSpeechDuration = 0.25
|
config.SileroVad.MinSpeechDuration = 0.25
|
||||||
config.SileroVad.WindowSize = 512
|
config.SileroVad.WindowSize = 512
|
||||||
|
config.SileroVad.MaxSpeechDuration = 5.0
|
||||||
config.SampleRate = 16000
|
config.SampleRate = 16000
|
||||||
config.NumThreads = 1
|
config.NumThreads = 1
|
||||||
config.Provider = "cpu"
|
config.Provider = "cpu"
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ func main() {
|
|||||||
config.SileroVad.MinSilenceDuration = 0.5
|
config.SileroVad.MinSilenceDuration = 0.5
|
||||||
config.SileroVad.MinSpeechDuration = 0.25
|
config.SileroVad.MinSpeechDuration = 0.25
|
||||||
config.SileroVad.WindowSize = 512
|
config.SileroVad.WindowSize = 512
|
||||||
|
config.SileroVad.MaxSpeechDuration = 5.0
|
||||||
config.SampleRate = 16000
|
config.SampleRate = 16000
|
||||||
config.NumThreads = 1
|
config.NumThreads = 1
|
||||||
config.Provider = "cpu"
|
config.Provider = "cpu"
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ public class VadNonStreamingParaformer {
|
|||||||
.setMinSilenceDuration(0.25f)
|
.setMinSilenceDuration(0.25f)
|
||||||
.setMinSpeechDuration(0.5f)
|
.setMinSpeechDuration(0.5f)
|
||||||
.setWindowSize(512)
|
.setWindowSize(512)
|
||||||
|
.setMaxSpeechDuration(5.0f)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
VadModelConfig config =
|
VadModelConfig config =
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ public class VadNonStreamingSenseVoice {
|
|||||||
.setMinSilenceDuration(0.25f)
|
.setMinSilenceDuration(0.25f)
|
||||||
.setMinSpeechDuration(0.5f)
|
.setMinSpeechDuration(0.5f)
|
||||||
.setWindowSize(512)
|
.setWindowSize(512)
|
||||||
|
.setMaxSpeechDuration(5.0f)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
VadModelConfig config =
|
VadModelConfig config =
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ public class VadRemoveSilence {
|
|||||||
.setMinSilenceDuration(0.25f)
|
.setMinSilenceDuration(0.25f)
|
||||||
.setMinSpeechDuration(0.5f)
|
.setMinSpeechDuration(0.5f)
|
||||||
.setWindowSize(512)
|
.setWindowSize(512)
|
||||||
|
.setMaxSpeechDuration(5.0f)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
VadModelConfig config =
|
VadModelConfig config =
|
||||||
|
|||||||
@@ -48,8 +48,9 @@ begin
|
|||||||
WindowSize := 512; {Please don't change it unless you know the details}
|
WindowSize := 512; {Please don't change it unless you know the details}
|
||||||
|
|
||||||
Config.SileroVad.Model := VadFilename;
|
Config.SileroVad.Model := VadFilename;
|
||||||
Config.SileroVad.MinSpeechDuration := 0.5;
|
Config.SileroVad.MinSpeechDuration := 0.25;
|
||||||
Config.SileroVad.MinSilenceDuration := 0.5;
|
Config.SileroVad.MinSilenceDuration := 0.5;
|
||||||
|
Config.SileroVad.MaxSpeechDuration := 5.0;
|
||||||
Config.SileroVad.Threshold := 0.5;
|
Config.SileroVad.Threshold := 0.5;
|
||||||
Config.SileroVad.WindowSize := WindowSize;
|
Config.SileroVad.WindowSize := WindowSize;
|
||||||
Config.NumThreads:= 2;
|
Config.NumThreads:= 2;
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ function createVad() {
|
|||||||
threshold: 0.5,
|
threshold: 0.5,
|
||||||
minSpeechDuration: 0.25,
|
minSpeechDuration: 0.25,
|
||||||
minSilenceDuration: 0.5,
|
minSilenceDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5,
|
||||||
windowSize: 512,
|
windowSize: 512,
|
||||||
},
|
},
|
||||||
sampleRate: 16000,
|
sampleRate: 16000,
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ function createVad() {
|
|||||||
threshold: 0.5,
|
threshold: 0.5,
|
||||||
minSpeechDuration: 0.25,
|
minSpeechDuration: 0.25,
|
||||||
minSilenceDuration: 0.5,
|
minSilenceDuration: 0.5,
|
||||||
|
maxSpeechDuration: 5,
|
||||||
windowSize: 512,
|
windowSize: 512,
|
||||||
},
|
},
|
||||||
sampleRate: 16000,
|
sampleRate: 16000,
|
||||||
|
|||||||
@@ -90,6 +90,15 @@ def main():
|
|||||||
|
|
||||||
config = sherpa_onnx.VadModelConfig()
|
config = sherpa_onnx.VadModelConfig()
|
||||||
config.silero_vad.model = args.silero_vad_model
|
config.silero_vad.model = args.silero_vad_model
|
||||||
|
config.silero_vad.threshold = 0.5
|
||||||
|
config.silero_vad.min_silence_duration = 0.25 # seconds
|
||||||
|
config.silero_vad.min_speech_duration = 0.25 # seconds
|
||||||
|
|
||||||
|
# If the current segment is larger than this value, then it increases
|
||||||
|
# the threshold to 0.9 internally. After detecting this segment,
|
||||||
|
# it resets the threshold to its original value.
|
||||||
|
config.silero_vad.max_speech_duration = 5 # seconds
|
||||||
|
|
||||||
config.sample_rate = sample_rate
|
config.sample_rate = sample_rate
|
||||||
|
|
||||||
window_size = config.silero_vad.window_size
|
window_size = config.silero_vad.window_size
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ namespace SherpaOnnx
|
|||||||
MinSilenceDuration = 0.5F;
|
MinSilenceDuration = 0.5F;
|
||||||
MinSpeechDuration = 0.25F;
|
MinSpeechDuration = 0.25F;
|
||||||
WindowSize = 512;
|
WindowSize = 512;
|
||||||
|
MaxSpeechDuration = 5.0F;
|
||||||
}
|
}
|
||||||
|
|
||||||
[MarshalAs(UnmanagedType.LPStr)]
|
[MarshalAs(UnmanagedType.LPStr)]
|
||||||
@@ -26,5 +27,7 @@ namespace SherpaOnnx
|
|||||||
public float MinSpeechDuration;
|
public float MinSpeechDuration;
|
||||||
|
|
||||||
public int WindowSize;
|
public int WindowSize;
|
||||||
|
|
||||||
|
public float MaxSpeechDuration;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
|
|||||||
MinSilenceDuration float32
|
MinSilenceDuration float32
|
||||||
MinSpeechDuration float32
|
MinSpeechDuration float32
|
||||||
WindowSize int
|
WindowSize int
|
||||||
|
MaxSpeechDuration float32
|
||||||
}
|
}
|
||||||
|
|
||||||
type VadModelConfig struct {
|
type VadModelConfig struct {
|
||||||
@@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
|
|||||||
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
|
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
|
||||||
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
|
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
|
||||||
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
|
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
|
||||||
|
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
|
||||||
|
|
||||||
c.sample_rate = C.int(config.SampleRate)
|
c.sample_rate = C.int(config.SampleRate)
|
||||||
c.num_threads = C.int(config.NumThreads)
|
c.num_threads = C.int(config.NumThreads)
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ config = {
|
|||||||
sileroVad: {
|
sileroVad: {
|
||||||
model: "./silero_vad.onnx",
|
model: "./silero_vad.onnx",
|
||||||
threshold: 0.5,
|
threshold: 0.5,
|
||||||
|
minSilenceDuration: 0.5,
|
||||||
|
minSpeechDuration: 0.25,
|
||||||
|
maxSpeechDuration: 5,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
|
|||||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
|
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
|
||||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
|
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
|
||||||
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
|
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
|
||||||
|
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
|
||||||
|
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
|
|||||||
vad_config.silero_vad.window_size =
|
vad_config.silero_vad.window_size =
|
||||||
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
|
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
|
||||||
|
|
||||||
|
vad_config.silero_vad.max_speech_duration =
|
||||||
|
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
|
||||||
|
|
||||||
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
|
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
|
||||||
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
|
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
|
||||||
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
|
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
|
||||||
|
|||||||
@@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
|
|||||||
float min_speech_duration;
|
float min_speech_duration;
|
||||||
|
|
||||||
int window_size;
|
int window_size;
|
||||||
|
|
||||||
|
// If a speech segment is longer than this value, then we increase
|
||||||
|
// the threshold to 0.9. After finishing detecting the segment,
|
||||||
|
// the threshold value is reset to its original value.
|
||||||
|
float max_speech_duration;
|
||||||
} SherpaOnnxSileroVadModelConfig;
|
} SherpaOnnxSileroVadModelConfig;
|
||||||
|
|
||||||
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
|
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ public class SileroVadModelConfig {
|
|||||||
private final float minSilenceDuration;
|
private final float minSilenceDuration;
|
||||||
private final float minSpeechDuration;
|
private final float minSpeechDuration;
|
||||||
private final int windowSize;
|
private final int windowSize;
|
||||||
|
private final float maxSpeechDuration;
|
||||||
|
|
||||||
private SileroVadModelConfig(Builder builder) {
|
private SileroVadModelConfig(Builder builder) {
|
||||||
this.model = builder.model;
|
this.model = builder.model;
|
||||||
@@ -15,6 +16,7 @@ public class SileroVadModelConfig {
|
|||||||
this.minSilenceDuration = builder.minSilenceDuration;
|
this.minSilenceDuration = builder.minSilenceDuration;
|
||||||
this.minSpeechDuration = builder.minSpeechDuration;
|
this.minSpeechDuration = builder.minSpeechDuration;
|
||||||
this.windowSize = builder.windowSize;
|
this.windowSize = builder.windowSize;
|
||||||
|
this.maxSpeechDuration = builder.maxSpeechDuration;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Builder builder() {
|
public static Builder builder() {
|
||||||
@@ -41,12 +43,17 @@ public class SileroVadModelConfig {
|
|||||||
return windowSize;
|
return windowSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public float getMaxSpeechDuration() {
|
||||||
|
return maxSpeechDuration;
|
||||||
|
}
|
||||||
|
|
||||||
public static class Builder {
|
public static class Builder {
|
||||||
private String model = "";
|
private String model = "";
|
||||||
private float threshold = 0.5f;
|
private float threshold = 0.5f;
|
||||||
private float minSilenceDuration = 0.25f;
|
private float minSilenceDuration = 0.25f;
|
||||||
private float minSpeechDuration = 0.5f;
|
private float minSpeechDuration = 0.5f;
|
||||||
private int windowSize = 512;
|
private int windowSize = 512;
|
||||||
|
private float maxSpeechDuration = 5.0f;
|
||||||
|
|
||||||
public SileroVadModelConfig build() {
|
public SileroVadModelConfig build() {
|
||||||
return new SileroVadModelConfig(this);
|
return new SileroVadModelConfig(this);
|
||||||
@@ -77,5 +84,10 @@ public class SileroVadModelConfig {
|
|||||||
this.windowSize = windowSize;
|
this.windowSize = windowSize;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder setMaxSpeechDuration(float maxSpeechDuration) {
|
||||||
|
this.maxSpeechDuration = maxSpeechDuration;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
|
|||||||
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
|
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
|
||||||
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
|
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
|
||||||
|
|
||||||
|
fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
|
||||||
|
ans.silero_vad.max_speech_duration =
|
||||||
|
env->GetFloatField(silero_vad_config, fid);
|
||||||
|
|
||||||
fid = env->GetFieldID(cls, "sampleRate", "I");
|
fid = env->GetFieldID(cls, "sampleRate", "I");
|
||||||
ans.sample_rate = env->GetIntField(config, fid);
|
ans.sample_rate = env->GetIntField(config, fid);
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ data class SileroVadModelConfig(
|
|||||||
var minSilenceDuration: Float = 0.25F,
|
var minSilenceDuration: Float = 0.25F,
|
||||||
var minSpeechDuration: Float = 0.25F,
|
var minSpeechDuration: Float = 0.25F,
|
||||||
var windowSize: Int = 512,
|
var windowSize: Int = 512,
|
||||||
|
var maxSpeechDuration: Float = 5.0F,
|
||||||
)
|
)
|
||||||
|
|
||||||
data class VadModelConfig(
|
data class VadModelConfig(
|
||||||
|
|||||||
@@ -341,6 +341,7 @@ type
|
|||||||
MinSilenceDuration: Single;
|
MinSilenceDuration: Single;
|
||||||
MinSpeechDuration: Single;
|
MinSpeechDuration: Single;
|
||||||
WindowSize: Integer;
|
WindowSize: Integer;
|
||||||
|
MaxSpeechDuration: Single;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
|
||||||
end;
|
end;
|
||||||
@@ -594,6 +595,7 @@ type
|
|||||||
MinSilenceDuration: cfloat;
|
MinSilenceDuration: cfloat;
|
||||||
MinSpeechDuration: cfloat;
|
MinSpeechDuration: cfloat;
|
||||||
WindowSize: cint32;
|
WindowSize: cint32;
|
||||||
|
MaxSpeechDuration: cfloat;
|
||||||
end;
|
end;
|
||||||
SherpaOnnxVadModelConfig = record
|
SherpaOnnxVadModelConfig = record
|
||||||
SileroVad: SherpaOnnxSileroVadModelConfig;
|
SileroVad: SherpaOnnxSileroVadModelConfig;
|
||||||
@@ -1402,10 +1404,11 @@ begin
|
|||||||
'Threshold := %.2f, ' +
|
'Threshold := %.2f, ' +
|
||||||
'MinSilenceDuration := %.2f, ' +
|
'MinSilenceDuration := %.2f, ' +
|
||||||
'MinSpeechDuration := %.2f, ' +
|
'MinSpeechDuration := %.2f, ' +
|
||||||
'WindowSize := %d' +
|
'WindowSize := %d, ' +
|
||||||
|
'MaxSpeechDuration := %.2f' +
|
||||||
')',
|
')',
|
||||||
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
|
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
|
||||||
Self.MinSpeechDuration, Self.WindowSize
|
Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
|
||||||
]);
|
]);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
@@ -1415,6 +1418,7 @@ begin
|
|||||||
Dest.MinSilenceDuration := 0.5;
|
Dest.MinSilenceDuration := 0.5;
|
||||||
Dest.MinSpeechDuration := 0.25;
|
Dest.MinSpeechDuration := 0.25;
|
||||||
Dest.WindowSize := 512;
|
Dest.WindowSize := 512;
|
||||||
|
Dest.MaxSpeechDuration := 5.0;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
|
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
|
||||||
@@ -1569,6 +1573,7 @@ begin
|
|||||||
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
|
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
|
||||||
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
|
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
|
||||||
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
|
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
|
||||||
|
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
|
||||||
|
|
||||||
C.SampleRate := Config.SampleRate;
|
C.SampleRate := Config.SampleRate;
|
||||||
C.NumThreads := Config.NumThreads;
|
C.NumThreads := Config.NumThreads;
|
||||||
|
|||||||
@@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
|
|||||||
threshold: Float = 0.5,
|
threshold: Float = 0.5,
|
||||||
minSilenceDuration: Float = 0.25,
|
minSilenceDuration: Float = 0.25,
|
||||||
minSpeechDuration: Float = 0.5,
|
minSpeechDuration: Float = 0.5,
|
||||||
windowSize: Int = 512
|
windowSize: Int = 512,
|
||||||
|
maxSpeechDuration: Float = 5.0
|
||||||
) -> SherpaOnnxSileroVadModelConfig {
|
) -> SherpaOnnxSileroVadModelConfig {
|
||||||
return SherpaOnnxSileroVadModelConfig(
|
return SherpaOnnxSileroVadModelConfig(
|
||||||
model: toCPointer(model),
|
model: toCPointer(model),
|
||||||
threshold: threshold,
|
threshold: threshold,
|
||||||
min_silence_duration: minSilenceDuration,
|
min_silence_duration: minSilenceDuration,
|
||||||
min_speech_duration: minSpeechDuration,
|
min_speech_duration: minSpeechDuration,
|
||||||
window_size: Int32(windowSize)
|
window_size: Int32(windowSize),
|
||||||
|
max_speech_duration: maxSpeechDuration
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
|
|||||||
|
|
||||||
const buffer = Module._malloc(n);
|
const buffer = Module._malloc(n);
|
||||||
|
|
||||||
const len = 5 * 4;
|
const len = 6 * 4;
|
||||||
const ptr = Module._malloc(len);
|
const ptr = Module._malloc(len);
|
||||||
|
|
||||||
Module.stringToUTF8(config.model || '', buffer, modelLen);
|
Module.stringToUTF8(config.model || '', buffer, modelLen);
|
||||||
@@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
|
|||||||
Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
|
Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
|
||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
|
Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float');
|
||||||
|
offset += 4;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
buffer: buffer, ptr: ptr, len: len,
|
buffer: buffer, ptr: ptr, len: len,
|
||||||
}
|
}
|
||||||
@@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) {
|
|||||||
minSilenceDuration: 0.50,
|
minSilenceDuration: 0.50,
|
||||||
minSpeechDuration: 0.25,
|
minSpeechDuration: 0.25,
|
||||||
windowSize: 512,
|
windowSize: 512,
|
||||||
|
maxSpeechDuration: 20,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,6 +97,7 @@ function createVad(Module, myConfig) {
|
|||||||
threshold: 0.50,
|
threshold: 0.50,
|
||||||
minSilenceDuration: 0.50,
|
minSilenceDuration: 0.50,
|
||||||
minSpeechDuration: 0.25,
|
minSpeechDuration: 0.25,
|
||||||
|
maxSpeechDuration: 20,
|
||||||
windowSize: 512,
|
windowSize: 512,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, "");
|
static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, "");
|
||||||
|
|
||||||
static_assert(sizeof(SherpaOnnxVadModelConfig) ==
|
static_assert(sizeof(SherpaOnnxVadModelConfig) ==
|
||||||
sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
|
sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
|
||||||
@@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) {
|
|||||||
fprintf(stdout, "min_speech_duration: %.3f\n",
|
fprintf(stdout, "min_speech_duration: %.3f\n",
|
||||||
silero_vad->min_speech_duration);
|
silero_vad->min_speech_duration);
|
||||||
fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
|
fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
|
||||||
|
fprintf(stdout, "max_speech_duration: %.3f\n",
|
||||||
|
silero_vad->max_speech_duration);
|
||||||
|
|
||||||
fprintf(stdout, "----------config----------\n");
|
fprintf(stdout, "----------config----------\n");
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user