116 lines
2.9 KiB
ObjectPascal
116 lines
2.9 KiB
ObjectPascal
{ Copyright (c) 2024 Xiaomi Corporation }

{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file.
}

program main;

{$mode delphi}

uses
  sherpa_onnx,
  SysUtils;

type
  { Growable list of speech segments returned by the detector. }
  TSpeechSegmentArray = array of TSherpaOnnxSpeechSegment;

{
  Moves every segment currently queued in Detector to the end of Segments
  and prints each segment's start and end time in seconds.

  Detector  voice activity detector to drain; it is left empty on return.
  Rate      sample rate used to convert sample counts into seconds.
  Segments  receives the popped segments, in the order they were queued.
}
procedure CollectSegments(Detector: TSherpaOnnxVoiceActivityDetector;
  Rate: Integer; var Segments: TSpeechSegmentArray);
var
  Segment: TSherpaOnnxSpeechSegment;
  StartSec: Single;
  DurationSec: Single;
begin
  while not Detector.IsEmpty do
  begin
    Segment := Detector.Front();
    Detector.Pop();

    SetLength(Segments, Length(Segments) + 1);
    Segments[High(Segments)] := Segment;

    { Segment.Start and the sample count are divided by the sample rate
      to report the segment boundaries in seconds. }
    StartSec := Segment.Start / Rate;
    DurationSec := Length(Segment.Samples) / Rate;
    WriteLn(Format('%.3f -- %.3f', [StartSec, StartSec + DurationSec]));
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;

  SampleRate: Integer;

  AllSpeechSegment: TSpeechSegmentArray;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));

    { The detector has not been created yet, so nothing needs freeing. }
    Exit;
  end;

  WindowSize := 512; {Please don't change it unless you know the details}
  Initialize(Config);

  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.25;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads := 1;
  Config.Debug := True;
  Config.Provider := 'cpu';
  Config.SampleRate := SampleRate;

  { NOTE(review): the second argument is presumably the detector's internal
    buffer size in seconds - confirm against the sherpa_onnx unit. }
  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  try
    AllSpeechSegment := nil;
    AllSamples := nil;

    { Feed the wave to the detector one full window at a time. A trailing
      chunk shorter than WindowSize is not passed to AcceptWaveform. }
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);

      CollectSegments(Vad, SampleRate, AllSpeechSegment);
    end;

    { After the last window, flush the detector and collect whatever
      segments it still reports. }
    Vad.Flush;
    CollectSegments(Vad, SampleRate, AllSpeechSegment);

    { First pass: total number of speech samples, to size the output once. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      Inc(N, Length(SpeechSegment.Samples));

    SetLength(AllSamples, N);

    { Second pass: concatenate the segments back to back, which drops
      everything between them. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
    begin
      for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
      begin
        AllSamples[N] := SpeechSegment.Samples[I];
        Inc(N);
      end;
    end;

    SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
    WriteLn('Saved to ./lei-jun-test-no-silence.wav');
  finally
    { Release the detector even if an exception was raised above. }
    FreeAndNil(Vad);
  end;
end.
|