This repository has been archived on 2025-08-26. You can view files and clone it, but cannot push or open issues or pull requests.
Files
enginex_bi_series-sherpa-onnx/pascal-api-examples/vad/remove_silence.pas
2024-08-13 16:16:51 +08:00

116 lines
2.9 KiB
ObjectPascal

{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file.
}
program main;

{$mode delphi}

uses
  sherpa_onnx,
  SysUtils;

var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;
  SampleRate: Integer;
  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{
  Pops every segment currently queued in Vad, appends it to AllSpeechSegment
  and prints its time range as "start -- end" in seconds.

  Reads the globals Vad and SampleRate and grows the global AllSpeechSegment.
  It is called once after each window is fed to the detector and once more
  after the final Flush, so both places share one implementation.
}
procedure DrainSpeechSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  StartSec: Single;
  DurationSec: Single;
begin
  while not Vad.IsEmpty do
  begin
    { Fetch and remove the segment first so the array only grows once a
      segment has actually been obtained. }
    Segment := Vad.Front();
    Vad.Pop();

    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    { Segment.Start and the sample count are in samples; dividing by the
      sample rate converts them to seconds for display. }
    StartSec := Segment.Start / SampleRate;
    DurationSec := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [StartSec, StartSec + DurationSec]));
  end;
end;

{
  Main program: reads ./lei-jun-test.wav, runs the VAD over it window by
  window, collects the detected speech segments, concatenates their samples
  and writes the result to ./lei-jun-test-no-silence.wav.
}
begin
  SampleRate := 16000; {Please don't change it unless you know the details}
  WindowSize := 512; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));
    Exit;
  end;

  Initialize(Config);
  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.25;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads := 1;
  Config.Debug := True;
  Config.Provider := 'cpu';
  Config.SampleRate := SampleRate;

  AllSpeechSegment := nil;
  AllSamples := nil;

  { NOTE(review): the second argument (20) is presumably the detector's
    internal buffer size in seconds - confirm against the sherpa_onnx unit. }
  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  try
    { Feed the wave in whole windows of WindowSize samples. Samples after the
      last complete window are not passed to the detector. }
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);
      DrainSpeechSegments;
    end;

    { NOTE(review): Flush presumably forces out a segment that is still in
      progress at the end of the input - confirm against the sherpa_onnx unit. }
    Vad.Flush;
    DrainSpeechSegments;

    { First pass: count the total number of speech samples so the output
      buffer is allocated once. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      Inc(N, Length(SpeechSegment.Samples));

    SetLength(AllSamples, N);

    { Second pass: copy the samples of each segment back to back. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
      begin
        AllSamples[N] := SpeechSegment.Samples[I];
        Inc(N);
      end;

    SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
    WriteLn('Saved to ./lei-jun-test-no-silence.wav');
  finally
    { Release the detector even if processing or writing raised. }
    FreeAndNil(Vad);
  end;
end.